Remove html before checking for hashtags

2020-10-25 12:47:16 +00:00 · 2020-10-25 12:47:16 +00:00 · 3fd0723684
parent e5c436fa3f
commit 3fd0723684
6 changed files with 24 additions and 23 deletions
--- a/content.py
+++ b/content.py
@ -561,25 +561,6 @@ def removeTextFormatting(content: str) -> str:
    return content


-def removeHtml(content: str) -> str:
-    """Removes html links from the given content.
-    Used to ensure that profile descriptions don't contain dubious content
-    """
-    if '<' not in content:
-        return content
-    removing = False
-    content = content.replace('<q>', '"').replace('</q>', '"')
-    result = ''
-    for ch in content:
-        if ch == '<':
-            removing = True
-        elif ch == '>':
-            removing = False
-        elif not removing:
-            result += ch
-    return result
-
-
 def removeLongWords(content: str, maxWordLength: int,
                    longWordsList: []) -> str:
    """Breaks up long words so that on mobile screens this doesn't
--- a/newswire.py
+++ b/newswire.py
@ -18,6 +18,7 @@ from utils import loadJson
 from utils import saveJson
 from utils import isSuspended
 from utils import containsInvalidChars
+from utils import removeHtml
 from blocking import isBlockedDomain
 from blocking import isBlockedHashtag
 from filters import isFiltered
@ -88,7 +89,7 @@ def addNewswireDictEntry(baseDir: str, domain: str,
                         tags=[], maxTags=32) -> None:
    """Update the newswire dictionary
    """
-    allText = title + ' ' + description
+    allText = removeHtml(title + ' ' + description)

    # check that none of the text is filtered against
    if isFiltered(baseDir, 'news', domain, allText):
--- a/posts.py
+++ b/posts.py
@ -49,9 +49,9 @@ from utils import getConfigParam
 from utils import locateNewsVotes
 from utils import locateNewsArrival
 from utils import votesOnNewswireItem
+from utils import removeHtml
 from media import attachMedia
 from media import replaceYouTube
-from content import removeHtml
 from content import removeLongWords
 from content import addHtmlTags
 from content import replaceEmojiFromTags
--- a/tests.py
+++ b/tests.py
@ -43,6 +43,7 @@ from utils import loadJson
 from utils import saveJson
 from utils import getStatusNumber
 from utils import getFollowersOfPerson
+from utils import removeHtml
 from follow import followerOfPerson
 from follow import unfollowPerson
 from follow import unfollowerOfPerson
@ -71,7 +72,6 @@ from inbox import validInboxFilenames
 from content import htmlReplaceEmailQuote
 from content import htmlReplaceQuoteMarks
 from content import dangerousMarkup
-from content import removeHtml
 from content import addWebLinks
 from content import replaceEmojiFromTags
 from content import addHtmlTags
--- a/utils.py
+++ b/utils.py
@ -19,6 +19,25 @@ from calendar import monthrange
 from followingCalendar import addPersonToCalendar


+def removeHtml(content: str) -> str:
+    """Removes html tags from content, turning <q> and </q> into double
+    quotes. Text with no '<' is returned as is; otherwise any '>' is dropped
+    """
+    if '<' not in content:
+        return content  # nothing to strip
+    removing = False  # True from a '<' until the next '>'
+    content = content.replace('<q>', '"').replace('</q>', '"')
+    result = ''
+    for ch in content:
+        if ch == '<':
+            removing = True
+        elif ch == '>':
+            removing = False
+        elif not removing:  # keep only characters outside '<'...'>'
+            result += ch
+    return result
+
+
 def isSystemAccount(nickname: str) -> bool:
    """Returns true if the given nickname is a system account
    """
--- a/webinterface.py
+++ b/webinterface.py
@ -45,6 +45,7 @@ from utils import getCachedPostFilename
 from utils import loadJson
 from utils import getConfigParam
 from utils import votesOnNewswireItem
+from utils import removeHtml
 from follow import isFollowingActor
 from webfinger import webfingerHandle
 from posts import isDM
@ -71,7 +72,6 @@ from content import getMentionsFromHtml
 from content import addHtmlTags
 from content import replaceEmojiFromTags
 from content import removeLongWords
-from content import removeHtml
 from skills import getSkills
 from cache import getPersonFromCache
 from cache import storePersonInCache