Limit the number of times that the same word can be repeated

merge-requests/30/head
Bob Mottram 2021-07-10 10:38:59 +01:00
parent e8553eb192
commit 7511af13d0
5 changed files with 82 additions and 0 deletions

View File

@ -866,6 +866,7 @@ def addHtmlTags(baseDir: str, httpPrefix: str,
content = addWebLinks(content)
if longWordsList:
content = removeLongWords(content, maxWordLength, longWordsList)
content = limitRepeatedWords(content, 6)
content = content.replace(' --linebreak-- ', '</p><p>')
content = htmlReplaceEmailQuote(content)
return '<p>' + htmlReplaceQuoteMarks(content) + '</p>'
@ -1053,3 +1054,35 @@ def extractTextFieldsInPOST(postBytes, boundary: str, debug: bool,
postValue += postLines[line]
fields[postKey] = urllib.parse.unquote(postValue)
return fields
def limitRepeatedWords(text: str, maxRepeats: int) -> str:
    """Limits the number of times that the same word can be repeated
    consecutively.

    Words are delimited by single spaces or newlines. Any run of the
    same non-empty word which is longer than maxRepeats is cut down to
    its first maxRepeats occurrences. The excess words are removed
    together with the separator in front of them, so the whitespace
    which follows the run is left untouched.

    Empty "words" (produced by consecutive separators) are never
    removed and always end the current run.

    text: the text to be checked
    maxRepeats: the maximum number of consecutive occurrences of a
                word to keep. Values below 1 are treated as 1.
    Returns the text with over-long runs shortened.
    """
    if not text:
        return text

    # a word appearing once is not a repeat, so never drop below one
    maxRepeats = max(maxRepeats, 1)

    words = text.replace('\n', ' ').split(' ')
    kept = []
    prevWord = ''
    runLength = 0
    # index within text at which the current word begins
    pos = 0
    for word in words:
        # the original separator (space or newline) before this word
        separator = text[pos - 1] if pos > 0 else ''
        pos += len(word) + 1

        if word and word == prevWord:
            runLength += 1
        else:
            runLength = 1
            prevWord = word

        if runLength > maxRepeats:
            # excess repetition: drop the word and its separator
            continue
        kept.append(separator + word)
    return ''.join(kept)

View File

@ -61,6 +61,7 @@ from utils import removeHtml
from utils import dangerousMarkup
from media import attachMedia
from media import replaceYouTube
from content import limitRepeatedWords
from content import tagExists
from content import removeLongWords
from content import addHtmlTags
@ -4031,6 +4032,9 @@ def downloadAnnounce(session, baseDir: str, httpPrefix: str,
# remove any long words
contentStr = removeLongWords(contentStr, 40, [])
# Prevent the same word from being repeated many times
contentStr = limitRepeatedWords(contentStr, 6)
# remove text formatting, such as bold/italics
contentStr = removeTextFormatting(contentStr)

View File

@ -94,6 +94,7 @@ from inbox import jsonPostAllowsComments
from inbox import validInbox
from inbox import validInboxFilenames
from categories import guessHashtagCategory
from content import limitRepeatedWords
from content import switchWords
from content import extractTextFieldsInPOST
from content import validHashTag
@ -4154,9 +4155,47 @@ def _testLimitWordLengths() -> None:
assert result == "This is an exceptionally test"
def _testLimitRepetedWords() -> None:
    """Checks that limitRepeatedWords cuts a long run of one word down
    to six occurrences, both in the middle and at the end of the text
    """
    print('limitRepeatedWords')
    trimmedRun = ' '.join(['Same'] * 6)

    # a run of 70 identical words followed by further text
    preamble = "This is a preamble.\n\n"
    ending = "\n\nSome other text."
    text = preamble + ' '.join(['Same'] * 70) + ending
    expected = preamble + trimmedRun + ending
    result = limitRepeatedWords(text, 6)
    assert result == expected

    # a run of 90 identical words at the very end of the text
    preamble = "This is other preamble.\n\n"
    text = preamble + ' '.join(['Same'] * 90)
    expected = preamble + trimmedRun
    result = limitRepeatedWords(text, 6)
    assert result == expected
def runAllTests():
print('Running tests...')
updateDefaultThemesList(os.getcwd())
_testLimitRepetedWords()
_testLimitWordLengths()
_testSwitchWords()
_testFunctions()

View File

@ -10,6 +10,7 @@ __module_group__ = "Web Interface Columns"
import os
from datetime import datetime
from content import removeLongWords
from content import limitRepeatedWords
from utils import removeHtml
from utils import locatePost
from utils import loadJson
@ -265,6 +266,7 @@ def _htmlNewswire(baseDir: str, newswire: {}, nickname: str, moderator: bool,
_votesIndicator(totalVotes, positiveVoting)
title = removeLongWords(item[0], 16, []).replace('\n', '<br>')
title = limitRepeatedWords(title, 6)
htmlStr += '<p class="newswireItemVotedOn">' + \
'<a href="' + url + '" target="_blank" ' + \
'rel="nofollow noopener noreferrer">' + \
@ -293,6 +295,7 @@ def _htmlNewswire(baseDir: str, newswire: {}, nickname: str, moderator: bool,
_votesIndicator(totalVotes, positiveVoting)
title = removeLongWords(item[0], 16, []).replace('\n', '<br>')
title = limitRepeatedWords(title, 6)
if moderator and moderatedItem:
htmlStr += '<p class="newswireItemModerated">' + \
'<a href="' + url + '" target="_blank" ' + \
@ -417,6 +420,7 @@ def htmlCitations(baseDir: str, nickname: str, domain: str,
dateShown = publishedDate.strftime("%Y-%m-%d %H:%M")
title = removeLongWords(item[0], 16, []).replace('\n', '<br>')
title = limitRepeatedWords(title, 6)
link = item[1]
citationValue = \

View File

@ -45,6 +45,7 @@ from utils import removeIdEnding
from utils import getNicknameFromActor
from utils import getDomainFromActor
from utils import isEventPost
from content import limitRepeatedWords
from content import replaceEmojiFromTags
from content import htmlReplaceQuoteMarks
from content import htmlReplaceEmailQuote
@ -1601,6 +1602,7 @@ def individualPostAsHtml(allowDownloads: bool,
objectContent = \
removeLongWords(postJsonObject['object']['content'], 40, [])
objectContent = removeTextFormatting(objectContent)
objectContent = limitRepeatedWords(objectContent, 6)
objectContent = \
switchWords(baseDir, nickname, domain, objectContent)
objectContent = htmlReplaceEmailQuote(objectContent)