From df2a947516f1c9682140be8d48f5b1e0f64868fa Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 23 Jul 2021 12:57:39 +0100 Subject: [PATCH] Remove extra character --- posts.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/posts.py b/posts.py index 7349a8566..076a82336 100644 --- a/posts.py +++ b/posts.py @@ -491,16 +491,10 @@ def _getPosts(session, outboxUrl: str, maxPosts: int, return personPosts -def _updateWordFrequency(content: str, wordFrequency: {}) -> None: - """Creates a dictionary containing words and the number of times - that they appear +def _getCommonWords() -> str: + """Returns a list of common words """ - plainText = removeHtml(content) - removeChars = ('.', ';', '?', '\n') - for ch in removeChars: - plainText = plainText.replace(ch, ' ') - wordsList = plainText.split(' ') - commonWords = ( + return ( 'that', 'some', 'about', 'then', 'they', 'were', 'also', 'from', 'with', 'this', 'have', 'more', 'need', 'here', 'would', 'these', 'into', 'very', @@ -510,8 +504,23 @@ def _updateWordFrequency(content: str, wordFrequency: {}) -> None: 'been', 'over', 'their', 'where', 'could', 'though', 'like', 'think', 'same', 'maybe', 'really', 'thing', 'something', 'possible', 'actual', 'actually', - 'because', 'around', 'having' + 'because', 'around', 'having', 'especially', 'other', + 'making', 'made', 'make', 'makes', 'including', + 'includes', 'know', 'knowing', 'knows', 'things', + 'say', 'says', 'saying', 'many', 'somewhat', + 'problem', 'problems', 'idea', 'ideas' ) + +def _updateWordFrequency(content: str, wordFrequency: {}) -> None: + """Creates a dictionary containing words and the number of times + that they appear + """ + plainText = removeHtml(content) + removeChars = ('.', ';', '?', '\n', ':') + for ch in removeChars: + plainText = plainText.replace(ch, ' ') + wordsList = plainText.split(' ') + commonWords = _getCommonWords() for word in wordsList: wordLen = len(word) if wordLen < 3: @@ -524,8 +533,6 @@ def _updateWordFrequency(content: str, wordFrequency: {}) -> None: '@' in word or \ '://' in word: continue - if word.endswith(':'): - word = word.replace(':', '') if word.lower() in commonWords: continue if wordFrequency.get(word):