From 3fd0723684fec913af1dd9072b2d541d6cce026a Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 25 Oct 2020 12:47:16 +0000 Subject: [PATCH] Remove html before checking for hashtags --- content.py | 19 ------------------- newswire.py | 3 ++- posts.py | 2 +- tests.py | 2 +- utils.py | 19 +++++++++++++++++++ webinterface.py | 2 +- 6 files changed, 24 insertions(+), 23 deletions(-) diff --git a/content.py b/content.py index 6d619509c..5676bdcd0 100644 --- a/content.py +++ b/content.py @@ -561,25 +561,6 @@ def removeTextFormatting(content: str) -> str: return content -def removeHtml(content: str) -> str: - """Removes html links from the given content. - Used to ensure that profile descriptions don't contain dubious content - """ - if '<' not in content: - return content - removing = False - content = content.replace('', '"').replace('', '"') - result = '' - for ch in content: - if ch == '<': - removing = True - elif ch == '>': - removing = False - elif not removing: - result += ch - return result - - def removeLongWords(content: str, maxWordLength: int, longWordsList: []) -> str: """Breaks up long words so that on mobile screens this doesn't diff --git a/newswire.py b/newswire.py index 80815d192..e044f8a12 100644 --- a/newswire.py +++ b/newswire.py @@ -18,6 +18,7 @@ from utils import loadJson from utils import saveJson from utils import isSuspended from utils import containsInvalidChars +from utils import removeHtml from blocking import isBlockedDomain from blocking import isBlockedHashtag from filters import isFiltered @@ -88,7 +89,7 @@ def addNewswireDictEntry(baseDir: str, domain: str, tags=[], maxTags=32) -> None: """Update the newswire dictionary """ - allText = title + ' ' + description + allText = removeHtml(title + ' ' + description) # check that none of the text is filtered against if isFiltered(baseDir, 'news', domain, allText): diff --git a/posts.py b/posts.py index e033957bb..ed93f04e9 100644 --- a/posts.py +++ b/posts.py @@ -49,9 +49,9 @@ from utils import getConfigParam from utils import locateNewsVotes from utils import locateNewsArrival from utils import votesOnNewswireItem +from utils import removeHtml from media import attachMedia from media import replaceYouTube -from content import removeHtml from content import removeLongWords from content import addHtmlTags from content import replaceEmojiFromTags diff --git a/tests.py b/tests.py index deeae1523..2f45f44e2 100644 --- a/tests.py +++ b/tests.py @@ -43,6 +43,7 @@ from utils import loadJson from utils import saveJson from utils import getStatusNumber from utils import getFollowersOfPerson +from utils import removeHtml from follow import followerOfPerson from follow import unfollowPerson from follow import unfollowerOfPerson @@ -71,7 +72,6 @@ from inbox import validInboxFilenames from content import htmlReplaceEmailQuote from content import htmlReplaceQuoteMarks from content import dangerousMarkup -from content import removeHtml from content import addWebLinks from content import replaceEmojiFromTags from content import addHtmlTags diff --git a/utils.py b/utils.py index 77e3162ba..f23b2ed25 100644 --- a/utils.py +++ b/utils.py @@ -19,6 +19,25 @@ from calendar import monthrange from followingCalendar import addPersonToCalendar +def removeHtml(content: str) -> str: + """Removes html links from the given content. + Used to ensure that profile descriptions don't contain dubious content + """ + if '<' not in content: + return content + removing = False + content = content.replace('', '"').replace('', '"') + result = '' + for ch in content: + if ch == '<': + removing = True + elif ch == '>': + removing = False + elif not removing: + result += ch + return result + + def isSystemAccount(nickname: str) -> bool: """Returns true if the given nickname is a system account """ diff --git a/webinterface.py b/webinterface.py index 8b9014a6c..04f96714a 100644 --- a/webinterface.py +++ b/webinterface.py @@ -45,6 +45,7 @@ from utils import getCachedPostFilename from utils import loadJson from utils import getConfigParam from utils import votesOnNewswireItem +from utils import removeHtml from follow import isFollowingActor from webfinger import webfingerHandle from posts import isDM @@ -71,7 +72,6 @@ from content import getMentionsFromHtml from content import addHtmlTags from content import replaceEmojiFromTags from content import removeLongWords -from content import removeHtml from skills import getSkills from cache import getPersonFromCache from cache import storePersonInCache