Remove colon endings and question marks

main
Bob Mottram 2021-01-11 14:13:17 +00:00
parent 1934a269f1
commit 2d31488d49
1 changed file with 7 additions and 2 deletions

View File

@ -476,13 +476,15 @@ def _updateWordFrequency(content: str, wordFrequency: {}) -> None:
plainText = removeHtml(content) plainText = removeHtml(content)
plainText = plainText.replace('.', ' ') plainText = plainText.replace('.', ' ')
plainText = plainText.replace(';', ' ') plainText = plainText.replace(';', ' ')
plainText = plainText.replace('?', ' ')
wordsList = plainText.split(' ') wordsList = plainText.split(' ')
commonWords = ( commonWords = (
'that', 'some', 'about', 'then', 'they', 'were', 'that', 'some', 'about', 'then', 'they', 'were',
'also', 'from', 'with', 'this', 'have', 'more', 'also', 'from', 'with', 'this', 'have', 'more',
'need', 'here', 'would', 'these', 'into', 'very', 'need', 'here', 'would', 'these', 'into', 'very',
'well', 'when', 'what', 'your', 'there', 'which', 'well', 'when', 'what', 'your', 'there', 'which',
'even', 'there', 'such', 'just', 'those', 'only' 'even', 'there', 'such', 'just', 'those', 'only',
'will', 'much'
) )
for word in wordsList: for word in wordsList:
wordLen = len(word) wordLen = len(word)
@ -493,8 +495,11 @@ def _updateWordFrequency(content: str, wordFrequency: {}) -> None:
continue continue
if '&' in word or \ if '&' in word or \
'"' in word or \ '"' in word or \
'@' in word: '@' in word or \
'://' in word:
continue continue
if word.endswith(':'):
word = word.replace(':', '')
if word in commonWords: if word in commonWords:
continue continue
if wordFrequency.get(word): if wordFrequency.get(word):