From 27c99f190037405327b6d6df635dc41db608b29b Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Thu, 14 Oct 2021 16:53:04 +0100
Subject: [PATCH] Tidying

---
 content.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/content.py b/content.py
index e4c8a828c..eb2cd6f4f 100644
--- a/content.py
+++ b/content.py
@@ -1149,19 +1149,28 @@ def _wordsSimilarityHistogram(words: []) -> {}:
     return histogram
 
 
+def _wordsSimilarityWordsList(content: str) -> []:
+    """Returns a list of words for the given content
+    """
+    removePunctuation = ('.', ',', ';', '-', ':')
+    content = removeHtml(content).lower()
+    for p in removePunctuation:
+        content = content.replace(p, ' ')
+        content = content.replace('  ', ' ')
+    return content.split(' ')
+
+
 def wordsSimilarity(content1: str, content2: str, minWords: int) -> int:
     """Returns percentage similarity
     """
     if content1 == content2:
         return 100
 
-    content1 = removeHtml(content1).lower()
-    words1 = content1.split(' ')
+    words1 = _wordsSimilarityWordsList(content1)
     if len(words1) < minWords:
         return 0
 
-    content2 = removeHtml(content2).lower()
-    words2 = content2.split(' ')
+    words2 = _wordsSimilarityWordsList(content2)
     if len(words2) < minWords:
         return 0
 