Remove html before checking for hashtags

2020-10-25 12:47:16 +00:00 · 2020-10-25 12:47:16 +00:00 · 3fd0723684
parent e5c436fa3f
commit 3fd0723684
6 changed files with 24 additions and 23 deletions
--- a/content.py
+++ b/content.py
@ -561,25 +561,6 @@ def removeTextFormatting(content: str) -> str:
    return content
 def removeHtml(content: str) -> str:
    """Returns the given content with all html markup tags stripped out.
    Used to ensure that profile descriptions don't contain dubious content

    Behaviour:
    - content without any '<' character is returned unchanged
    - <q> and </q> are converted into double quote characters
    - everything from a '<' up to and including the next '>' is dropped
    - a '>' appearing outside of a tag is also dropped
    - an unterminated '<' causes the remainder of the content to be dropped
    """
    if '<' not in content:
        # nothing which could be a tag, so avoid the scan altogether
        return content
    # keep quotations readable once their tags have been removed
    content = content.replace('<q>', '"').replace('</q>', '"')
    insideTag = False
    # collect the kept characters and join once, rather than repeatedly
    # concatenating strings, which can be quadratic in the content length
    kept = []
    for ch in content:
        if ch == '<':
            insideTag = True
        elif ch == '>':
            insideTag = False
        elif not insideTag:
            kept.append(ch)
    return ''.join(kept)
 def removeLongWords(content: str, maxWordLength: int,
                    longWordsList: []) -> str:
    """Breaks up long words so that on mobile screens this doesn't
--- a/newswire.py
+++ b/newswire.py
@ -18,6 +18,7 @@ from utils import loadJson
 from utils import saveJson
 from utils import isSuspended
 from utils import containsInvalidChars
 from utils import removeHtml
 from blocking import isBlockedDomain
 from blocking import isBlockedHashtag
 from filters import isFiltered
@ -88,7 +89,7 @@ def addNewswireDictEntry(baseDir: str, domain: str,
                         tags=[], maxTags=32) -> None:
    """Update the newswire dictionary
    """
-    allText = title + ' ' + description
+    allText = removeHtml(title + ' ' + description)
    # check that none of the text is filtered against
    if isFiltered(baseDir, 'news', domain, allText):
--- a/posts.py
+++ b/posts.py
@ -49,9 +49,9 @@ from utils import getConfigParam
 from utils import locateNewsVotes
 from utils import locateNewsArrival
 from utils import votesOnNewswireItem
 from utils import removeHtml
 from media import attachMedia
 from media import replaceYouTube
 from content import removeHtml
 from content import removeLongWords
 from content import addHtmlTags
 from content import replaceEmojiFromTags
--- a/tests.py
+++ b/tests.py
@ -43,6 +43,7 @@ from utils import loadJson
 from utils import saveJson
 from utils import getStatusNumber
 from utils import getFollowersOfPerson
 from utils import removeHtml
 from follow import followerOfPerson
 from follow import unfollowPerson
 from follow import unfollowerOfPerson
@ -71,7 +72,6 @@ from inbox import validInboxFilenames
 from content import htmlReplaceEmailQuote
 from content import htmlReplaceQuoteMarks
 from content import dangerousMarkup
 from content import removeHtml
 from content import addWebLinks
 from content import replaceEmojiFromTags
 from content import addHtmlTags
--- a/utils.py
+++ b/utils.py
@ -19,6 +19,25 @@ from calendar import monthrange
 from followingCalendar import addPersonToCalendar
 def removeHtml(content: str) -> str:
    """Returns the given content with all html markup tags stripped out.
    Used to ensure that profile descriptions don't contain dubious content

    Behaviour:
    - content without any '<' character is returned unchanged
    - <q> and </q> are converted into double quote characters
    - everything from a '<' up to and including the next '>' is dropped
    - a '>' appearing outside of a tag is also dropped
    - an unterminated '<' causes the remainder of the content to be dropped
    """
    if '<' not in content:
        # nothing which could be a tag, so avoid the scan altogether
        return content
    # keep quotations readable once their tags have been removed
    content = content.replace('<q>', '"').replace('</q>', '"')
    insideTag = False
    # collect the kept characters and join once, rather than repeatedly
    # concatenating strings, which can be quadratic in the content length
    kept = []
    for ch in content:
        if ch == '<':
            insideTag = True
        elif ch == '>':
            insideTag = False
        elif not insideTag:
            kept.append(ch)
    return ''.join(kept)
 def isSystemAccount(nickname: str) -> bool:
    """Returns true if the given nickname is a system account
    """
--- a/webinterface.py
+++ b/webinterface.py
@ -45,6 +45,7 @@ from utils import getCachedPostFilename
 from utils import loadJson
 from utils import getConfigParam
 from utils import votesOnNewswireItem
 from utils import removeHtml
 from follow import isFollowingActor
 from webfinger import webfingerHandle
 from posts import isDM
@ -71,7 +72,6 @@ from content import getMentionsFromHtml
 from content import addHtmlTags
 from content import replaceEmojiFromTags
 from content import removeLongWords
 from content import removeHtml
 from skills import getSkills
 from cache import getPersonFromCache
 from cache import storePersonInCache