Remove html before checking for hashtags

main
Bob Mottram 2020-10-25 12:47:16 +00:00
parent e5c436fa3f
commit 3fd0723684
6 changed files with 24 additions and 23 deletions

View File

@ -561,25 +561,6 @@ def removeTextFormatting(content: str) -> str:
return content return content
def removeHtml(content: str) -> str:
    """Strips all html markup from the given content, leaving plain text.

    Used to ensure that text such as profile descriptions doesn't
    contain dubious content.

    <q> and </q> are converted into double quote characters rather
    than being discarded. Everything between a '<' and the next '>'
    is removed, together with the angle brackets themselves. Any text
    following an unterminated '<' is dropped.

    Returns the content unchanged if it contains no '<' character.
    """
    if '<' not in content:
        # nothing resembling a tag, so return the text untouched
        return content
    content = content.replace('<q>', '"').replace('</q>', '"')
    insideTag = False
    textChars = []
    for ch in content:
        if ch == '<':
            insideTag = True
        elif ch == '>':
            # a closing bracket is never kept, even without a matching
            # '<', so that no partial markup can survive
            insideTag = False
        elif not insideTag:
            textChars.append(ch)
    # join once at the end rather than repeatedly concatenating strings
    return ''.join(textChars)
def removeLongWords(content: str, maxWordLength: int, def removeLongWords(content: str, maxWordLength: int,
longWordsList: []) -> str: longWordsList: []) -> str:
"""Breaks up long words so that on mobile screens this doesn't """Breaks up long words so that on mobile screens this doesn't

View File

@ -18,6 +18,7 @@ from utils import loadJson
from utils import saveJson from utils import saveJson
from utils import isSuspended from utils import isSuspended
from utils import containsInvalidChars from utils import containsInvalidChars
from utils import removeHtml
from blocking import isBlockedDomain from blocking import isBlockedDomain
from blocking import isBlockedHashtag from blocking import isBlockedHashtag
from filters import isFiltered from filters import isFiltered
@ -88,7 +89,7 @@ def addNewswireDictEntry(baseDir: str, domain: str,
tags=[], maxTags=32) -> None: tags=[], maxTags=32) -> None:
"""Update the newswire dictionary """Update the newswire dictionary
""" """
allText = title + ' ' + description allText = removeHtml(title + ' ' + description)
# check that none of the text is filtered against # check that none of the text is filtered against
if isFiltered(baseDir, 'news', domain, allText): if isFiltered(baseDir, 'news', domain, allText):

View File

@ -49,9 +49,9 @@ from utils import getConfigParam
from utils import locateNewsVotes from utils import locateNewsVotes
from utils import locateNewsArrival from utils import locateNewsArrival
from utils import votesOnNewswireItem from utils import votesOnNewswireItem
from utils import removeHtml
from media import attachMedia from media import attachMedia
from media import replaceYouTube from media import replaceYouTube
from content import removeHtml
from content import removeLongWords from content import removeLongWords
from content import addHtmlTags from content import addHtmlTags
from content import replaceEmojiFromTags from content import replaceEmojiFromTags

View File

@ -43,6 +43,7 @@ from utils import loadJson
from utils import saveJson from utils import saveJson
from utils import getStatusNumber from utils import getStatusNumber
from utils import getFollowersOfPerson from utils import getFollowersOfPerson
from utils import removeHtml
from follow import followerOfPerson from follow import followerOfPerson
from follow import unfollowPerson from follow import unfollowPerson
from follow import unfollowerOfPerson from follow import unfollowerOfPerson
@ -71,7 +72,6 @@ from inbox import validInboxFilenames
from content import htmlReplaceEmailQuote from content import htmlReplaceEmailQuote
from content import htmlReplaceQuoteMarks from content import htmlReplaceQuoteMarks
from content import dangerousMarkup from content import dangerousMarkup
from content import removeHtml
from content import addWebLinks from content import addWebLinks
from content import replaceEmojiFromTags from content import replaceEmojiFromTags
from content import addHtmlTags from content import addHtmlTags

View File

@ -19,6 +19,25 @@ from calendar import monthrange
from followingCalendar import addPersonToCalendar from followingCalendar import addPersonToCalendar
def removeHtml(content: str) -> str:
    """Strips all html markup from the given content, leaving plain text.

    Used to ensure that text such as profile descriptions doesn't
    contain dubious content.

    <q> and </q> are converted into double quote characters rather
    than being discarded. Everything between a '<' and the next '>'
    is removed, together with the angle brackets themselves. Any text
    following an unterminated '<' is dropped.

    Returns the content unchanged if it contains no '<' character.
    """
    if '<' not in content:
        # nothing resembling a tag, so return the text untouched
        return content
    content = content.replace('<q>', '"').replace('</q>', '"')
    insideTag = False
    textChars = []
    for ch in content:
        if ch == '<':
            insideTag = True
        elif ch == '>':
            # a closing bracket is never kept, even without a matching
            # '<', so that no partial markup can survive
            insideTag = False
        elif not insideTag:
            textChars.append(ch)
    # join once at the end rather than repeatedly concatenating strings
    return ''.join(textChars)
def isSystemAccount(nickname: str) -> bool: def isSystemAccount(nickname: str) -> bool:
"""Returns true if the given nickname is a system account """Returns true if the given nickname is a system account
""" """

View File

@ -45,6 +45,7 @@ from utils import getCachedPostFilename
from utils import loadJson from utils import loadJson
from utils import getConfigParam from utils import getConfigParam
from utils import votesOnNewswireItem from utils import votesOnNewswireItem
from utils import removeHtml
from follow import isFollowingActor from follow import isFollowingActor
from webfinger import webfingerHandle from webfinger import webfingerHandle
from posts import isDM from posts import isDM
@ -71,7 +72,6 @@ from content import getMentionsFromHtml
from content import addHtmlTags from content import addHtmlTags
from content import replaceEmojiFromTags from content import replaceEmojiFromTags
from content import removeLongWords from content import removeLongWords
from content import removeHtml
from skills import getSkills from skills import getSkills
from cache import getPersonFromCache from cache import getPersonFromCache
from cache import storePersonInCache from cache import storePersonInCache