From 7511af13d0d123cd0cda66ce956485ecd35b7095 Mon Sep 17 00:00:00 2001
From: Bob Mottram
') content = htmlReplaceEmailQuote(content) return '
' + htmlReplaceQuoteMarks(content) + '
def limitRepeatedWords(text: str, maxRepeats: int) -> str:
    """Limits runs of the same consecutive word to maxRepeats occurrences

    Words are separated by spaces and/or newlines. When the same word
    occurs more than maxRepeats times in a row the surplus occurrences
    are dropped, together with the separator in front of each of them.
    Everything else, including the original whitespace, is kept as it was.

    text: the text to be checked
    maxRepeats: the greatest number of consecutive occurrences to keep.
        Values below 1 don't describe a usable limit, so the text is
        then returned unchanged
    Returns the text with any overlong runs shortened
    """
    # local import, so that this function doesn't depend on the
    # imports at the top of the module
    import re

    if not text or maxRepeats < 1:
        return text

    # The capture group keeps the separators within the result, giving
    # words at even indexes and the whitespace between them at odd ones
    pieces = re.split(r'([ \n]+)', text)

    kept = []
    prevWord = None
    runLength = 0
    separator = ''
    for index, piece in enumerate(pieces):
        if index % 2 == 1:
            # remember the whitespace until we know whether the word
            # following it is going to be kept
            separator = piece
            continue
        # Leading or trailing whitespace produces an empty first or last
        # piece, which is never counted as being part of a run
        if piece and piece == prevWord:
            runLength += 1
        else:
            prevWord = piece
            runLength = 1
        # Working on each run where it occurs, rather than doing a
        # search and replace over the whole text afterwards, means that
        # separate runs of the same word are each limited and that no
        # unrelated text can be altered
        if runLength <= maxRepeats:
            kept.append(separator + piece)
    return ''.join(kept)
def _testLimitRepetedWords() -> None:
    """Checks that limitRepeatedWords shortens a long run of one word,
    both in the middle of some text and at the very end of it
    """
    print('limitRepeatedWords')
    maxRepeats = 6
    shortened = ' '.join(['Same'] * maxRepeats)

    # a run of 70 words which is followed by some more text
    preamble = "This is a preamble.\n\n"
    ending = "\n\nSome other text."
    original = preamble + ' '.join(['Same'] * 70) + ending
    assert limitRepeatedWords(original, maxRepeats) == \
        preamble + shortened + ending

    # a run of 90 words which continues to the end of the text
    preamble = "This is other preamble.\n\n"
    original = preamble + ' '.join(['Same'] * 90)
    assert limitRepeatedWords(original, maxRepeats) == \
        preamble + shortened
--- a/webapp_column_right.py +++ b/webapp_column_right.py @@ -10,6 +10,7 @@ __module_group__ = "Web Interface Columns" import os from datetime import datetime from content import removeLongWords +from content import limitRepeatedWords from utils import removeHtml from utils import locatePost from utils import loadJson @@ -265,6 +266,7 @@ def _htmlNewswire(baseDir: str, newswire: {}, nickname: str, moderator: bool, _votesIndicator(totalVotes, positiveVoting) title = removeLongWords(item[0], 16, []).replace('\n', '