mirror of https://gitlab.com/bashrc2/epicyon
Tidying
parent
67bc0d61f4
commit
07f4b3605a
32
content.py
32
content.py
|
@ -1136,35 +1136,37 @@ def getPriceFromString(priceStr: str) -> (str, str):
|
||||||
return "0.00", "EUR"
|
return "0.00", "EUR"
|
||||||
|
|
||||||
|
|
||||||
|
def _wordsSimilarityHistogram(words: []) -> {}:
|
||||||
|
"""Returns a histogram for word combinations
|
||||||
|
"""
|
||||||
|
histogram = {}
|
||||||
|
for index in range(1, len(words)):
|
||||||
|
combinedWords = words[index - 1] + words[index]
|
||||||
|
if histogram.get(combinedWords):
|
||||||
|
histogram[combinedWords] += 1
|
||||||
|
else:
|
||||||
|
histogram[combinedWords] = 1
|
||||||
|
return histogram
|
||||||
|
|
||||||
|
|
||||||
def wordsSimilarity(content1: str, content2: str, minWords: int) -> int:
|
def wordsSimilarity(content1: str, content2: str, minWords: int) -> int:
|
||||||
"""Returns percentage similarity
|
"""Returns percentage similarity
|
||||||
"""
|
"""
|
||||||
if content1 == content2:
|
if content1 == content2:
|
||||||
return 100
|
return 100
|
||||||
|
|
||||||
content1 = removeHtml(content1).lower()
|
content1 = removeHtml(content1).lower()
|
||||||
words1 = content1.split(' ')
|
words1 = content1.split(' ')
|
||||||
if len(words1) < minWords:
|
if len(words1) < minWords:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
content2 = removeHtml(content2).lower()
|
content2 = removeHtml(content2).lower()
|
||||||
words2 = content2.split(' ')
|
words2 = content2.split(' ')
|
||||||
if len(words2) < minWords:
|
if len(words2) < minWords:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
histogram1 = {}
|
histogram1 = _wordsSimilarityHistogram(words1)
|
||||||
for index in range(1, len(words1)):
|
histogram2 = _wordsSimilarityHistogram(words2)
|
||||||
combinedWords = words1[index-1] + words1[index]
|
|
||||||
if histogram1.get(combinedWords):
|
|
||||||
histogram1[combinedWords] += 1
|
|
||||||
else:
|
|
||||||
histogram1[combinedWords] = 1
|
|
||||||
|
|
||||||
histogram2 = {}
|
|
||||||
for index in range(1, len(words2)):
|
|
||||||
combinedWords = words2[index-1] + words2[index]
|
|
||||||
if histogram2.get(combinedWords):
|
|
||||||
histogram2[combinedWords] += 1
|
|
||||||
else:
|
|
||||||
histogram2[combinedWords] = 1
|
|
||||||
|
|
||||||
diff = 0
|
diff = 0
|
||||||
for combinedWords, hits in histogram1.items():
|
for combinedWords, hits in histogram1.items():
|
||||||
|
|
Loading…
Reference in New Issue