Limit the number of times that the same word can be repeated

merge-requests/30/head
Bob Mottram 2021-07-10 10:38:59 +01:00
parent e8553eb192
commit 7511af13d0
5 changed files with 82 additions and 0 deletions

View File

@ -866,6 +866,7 @@ def addHtmlTags(baseDir: str, httpPrefix: str,
content = addWebLinks(content) content = addWebLinks(content)
if longWordsList: if longWordsList:
content = removeLongWords(content, maxWordLength, longWordsList) content = removeLongWords(content, maxWordLength, longWordsList)
content = limitRepeatedWords(content, 6)
content = content.replace(' --linebreak-- ', '</p><p>') content = content.replace(' --linebreak-- ', '</p><p>')
content = htmlReplaceEmailQuote(content) content = htmlReplaceEmailQuote(content)
return '<p>' + htmlReplaceQuoteMarks(content) + '</p>' return '<p>' + htmlReplaceQuoteMarks(content) + '</p>'
@ -1053,3 +1054,35 @@ def extractTextFieldsInPOST(postBytes, boundary: str, debug: bool,
postValue += postLines[line] postValue += postLines[line]
fields[postKey] = urllib.parse.unquote(postValue) fields[postKey] = urllib.parse.unquote(postValue)
return fields return fields
def limitRepeatedWords(text: str, maxRepeats: int) -> str:
    """Limits how many times the same word can appear consecutively

    Words are the pieces of text separated by spaces and/or newlines.
    When the same word occurs more than maxRepeats times in a row, only
    the first maxRepeats occurrences are kept. The surplus occurrences
    are removed together with the whitespace preceding each of them.
    All other whitespace is returned exactly as it was supplied.

    text: the text to be checked
    maxRepeats: the maximum number of consecutive occurrences of a word
                which are kept. Values below 1 are treated as 1, so
                that a word is never removed entirely.

    Returns the text with every over-long run of a word trimmed.
    """
    # imported locally so that this function does not depend on the
    # module level imports
    import re

    if not text:
        return text
    maxRepeats = max(maxRepeats, 1)

    kept = []
    # whitespace seen since the previous word, not yet written out
    pendingSeparator = ''
    prevWord = None
    runLength = 0
    # every character belongs to exactly one token, which is either a
    # run of spaces/newlines or a run of any other characters (a word)
    for token in re.findall(r'[ \n]+|[^ \n]+', text):
        if token[0] in ' \n':
            pendingSeparator = token
            continue

        if token == prevWord:
            runLength += 1
        else:
            prevWord = token
            runLength = 1

        # each run is judged where it occurs, so several runs of the
        # same word with different lengths are all trimmed correctly
        if runLength <= maxRepeats:
            kept.append(pendingSeparator + token)
        # a surplus word is dropped along with the whitespace before it
        pendingSeparator = ''

    # keep any whitespace at the very end of the text
    kept.append(pendingSeparator)
    return ''.join(kept)

View File

@ -61,6 +61,7 @@ from utils import removeHtml
from utils import dangerousMarkup from utils import dangerousMarkup
from media import attachMedia from media import attachMedia
from media import replaceYouTube from media import replaceYouTube
from content import limitRepeatedWords
from content import tagExists from content import tagExists
from content import removeLongWords from content import removeLongWords
from content import addHtmlTags from content import addHtmlTags
@ -4031,6 +4032,9 @@ def downloadAnnounce(session, baseDir: str, httpPrefix: str,
# remove any long words # remove any long words
contentStr = removeLongWords(contentStr, 40, []) contentStr = removeLongWords(contentStr, 40, [])
# Prevent the same word from being repeated many times
contentStr = limitRepeatedWords(contentStr, 6)
# remove text formatting, such as bold/italics # remove text formatting, such as bold/italics
contentStr = removeTextFormatting(contentStr) contentStr = removeTextFormatting(contentStr)

View File

@ -94,6 +94,7 @@ from inbox import jsonPostAllowsComments
from inbox import validInbox from inbox import validInbox
from inbox import validInboxFilenames from inbox import validInboxFilenames
from categories import guessHashtagCategory from categories import guessHashtagCategory
from content import limitRepeatedWords
from content import switchWords from content import switchWords
from content import extractTextFieldsInPOST from content import extractTextFieldsInPOST
from content import validHashTag from content import validHashTag
@ -4154,9 +4155,47 @@ def _testLimitWordLengths() -> None:
assert result == "This is an exceptionally test" assert result == "This is an exceptionally test"
def _testLimitRepetedWords() -> None:
    """Tests that limitRepeatedWords trims a long run of one word down
    to six occurrences, both when more text follows the run and when
    the run reaches the end of the text
    """
    print('limitRepeatedWords')
    maxRepeats = 6
    trimmedRun = ' '.join(['Same'] * maxRepeats)

    # a run of 70 identical words with other text after it
    preamble = "This is a preamble.\n\n"
    ending = "\n\nSome other text."
    longRun = ' '.join(['Same'] * 70)
    result = limitRepeatedWords(preamble + longRun + ending, maxRepeats)
    assert result == preamble + trimmedRun + ending

    # a run of 90 identical words which ends the text
    preamble = "This is other preamble.\n\n"
    longRun = ' '.join(['Same'] * 90)
    result = limitRepeatedWords(preamble + longRun, maxRepeats)
    assert result == preamble + trimmedRun
def runAllTests(): def runAllTests():
print('Running tests...') print('Running tests...')
updateDefaultThemesList(os.getcwd()) updateDefaultThemesList(os.getcwd())
_testLimitRepetedWords()
_testLimitWordLengths() _testLimitWordLengths()
_testSwitchWords() _testSwitchWords()
_testFunctions() _testFunctions()

View File

@ -10,6 +10,7 @@ __module_group__ = "Web Interface Columns"
import os import os
from datetime import datetime from datetime import datetime
from content import removeLongWords from content import removeLongWords
from content import limitRepeatedWords
from utils import removeHtml from utils import removeHtml
from utils import locatePost from utils import locatePost
from utils import loadJson from utils import loadJson
@ -265,6 +266,7 @@ def _htmlNewswire(baseDir: str, newswire: {}, nickname: str, moderator: bool,
_votesIndicator(totalVotes, positiveVoting) _votesIndicator(totalVotes, positiveVoting)
title = removeLongWords(item[0], 16, []).replace('\n', '<br>') title = removeLongWords(item[0], 16, []).replace('\n', '<br>')
title = limitRepeatedWords(title, 6)
htmlStr += '<p class="newswireItemVotedOn">' + \ htmlStr += '<p class="newswireItemVotedOn">' + \
'<a href="' + url + '" target="_blank" ' + \ '<a href="' + url + '" target="_blank" ' + \
'rel="nofollow noopener noreferrer">' + \ 'rel="nofollow noopener noreferrer">' + \
@ -293,6 +295,7 @@ def _htmlNewswire(baseDir: str, newswire: {}, nickname: str, moderator: bool,
_votesIndicator(totalVotes, positiveVoting) _votesIndicator(totalVotes, positiveVoting)
title = removeLongWords(item[0], 16, []).replace('\n', '<br>') title = removeLongWords(item[0], 16, []).replace('\n', '<br>')
title = limitRepeatedWords(title, 6)
if moderator and moderatedItem: if moderator and moderatedItem:
htmlStr += '<p class="newswireItemModerated">' + \ htmlStr += '<p class="newswireItemModerated">' + \
'<a href="' + url + '" target="_blank" ' + \ '<a href="' + url + '" target="_blank" ' + \
@ -417,6 +420,7 @@ def htmlCitations(baseDir: str, nickname: str, domain: str,
dateShown = publishedDate.strftime("%Y-%m-%d %H:%M") dateShown = publishedDate.strftime("%Y-%m-%d %H:%M")
title = removeLongWords(item[0], 16, []).replace('\n', '<br>') title = removeLongWords(item[0], 16, []).replace('\n', '<br>')
title = limitRepeatedWords(title, 6)
link = item[1] link = item[1]
citationValue = \ citationValue = \

View File

@ -45,6 +45,7 @@ from utils import removeIdEnding
from utils import getNicknameFromActor from utils import getNicknameFromActor
from utils import getDomainFromActor from utils import getDomainFromActor
from utils import isEventPost from utils import isEventPost
from content import limitRepeatedWords
from content import replaceEmojiFromTags from content import replaceEmojiFromTags
from content import htmlReplaceQuoteMarks from content import htmlReplaceQuoteMarks
from content import htmlReplaceEmailQuote from content import htmlReplaceEmailQuote
@ -1601,6 +1602,7 @@ def individualPostAsHtml(allowDownloads: bool,
objectContent = \ objectContent = \
removeLongWords(postJsonObject['object']['content'], 40, []) removeLongWords(postJsonObject['object']['content'], 40, [])
objectContent = removeTextFormatting(objectContent) objectContent = removeTextFormatting(objectContent)
objectContent = limitRepeatedWords(objectContent, 6)
objectContent = \ objectContent = \
switchWords(baseDir, nickname, domain, objectContent) switchWords(baseDir, nickname, domain, objectContent)
objectContent = htmlReplaceEmailQuote(objectContent) objectContent = htmlReplaceEmailQuote(objectContent)