From 13c067bfa41388a5c2515596216daf87a58bb2df Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 11 Oct 2020 10:33:31 +0100 Subject: [PATCH] Remove dangerous markup from rss feeds --- content.py | 17 +++++++++++++++++ newsdaemon.py | 27 ++++++++++++++++++++++++--- tests.py | 11 +++++++++++ 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/content.py b/content.py index d1a17019..6de17eb6 100644 --- a/content.py +++ b/content.py @@ -14,6 +14,23 @@ from utils import fileLastModified from utils import getLinkPrefixes +def removeHtmlTag(htmlStr: str, tag: str) -> str: + """Removes a given tag from a html string + """ + tagFound = True + while tagFound: + matchStr = ' ' + tag + '="' + if matchStr not in htmlStr: + tagFound = False + break + sections = htmlStr.split(matchStr, 1) + if '"' not in sections[1]: + tagFound = False + break + htmlStr = sections[0] + sections[1].split('"', 1)[1] + return htmlStr + + def removeQuotesWithinQuotes(content: str) -> str: """Removes any blockquote inside blockquote """ diff --git a/newsdaemon.py b/newsdaemon.py index 610a8477..9162a869 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -12,6 +12,8 @@ import datetime from collections import OrderedDict from newswire import getDictFromNewswire from posts import createNewsPost +from content import removeHtmlTag +from content import dangerousMarkup from utils import loadJson from utils import saveJson from utils import getStatusNumber @@ -51,8 +53,21 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None: def removeControlCharacters(content: str) -> str: - content = content.replace('&8211;', '-').replace('–', '-') - return content.replace('&8230;', '...').replace('…', '...') + """TODO this is hacky and a better solution is needed + the unicode is messing up somehow + """ + lookups = { + "8211": "-", + "8230": "...", + "8216": "'", + "8217": "'", + "8220": '"', + "8221": '"' + } + for code, ch in lookups.items(): + content = content.replace('&' + code + ';', ch) + content = content.replace('&#' + code + ';', ch) + return content def convertRSStoActivityPub(baseDir: str, httpPrefix: str, @@ -96,6 +111,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str, rssTitle = removeControlCharacters(item[0]) url = item[1] + if dangerousMarkup(url) or dangerousMarkup(rssTitle): + continue rssDescription = '' # get the rss description if it exists @@ -106,7 +123,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str, rssDescription = '

' + rssDescription + '

' # add the off-site link to the description - if rssDescription: + if rssDescription and not dangerousMarkup(rssDescription): rssDescription += \ '
' + \ translate['Read more...'] + '' @@ -115,6 +132,10 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str, '' + \ translate['Read more...'] + '' + # remove image dimensions + rssDescription = removeHtmlTag(rssDescription, 'width') + rssDescription = removeHtmlTag(rssDescription, 'height') + followersOnly = False useBlurhash = False # NOTE: the id when the post is created will not be diff --git a/tests.py b/tests.py index ea93839e..af64d550 100644 --- a/tests.py +++ b/tests.py @@ -78,6 +78,7 @@ from content import addHtmlTags from content import removeLongWords from content import replaceContentDuplicates from content import removeTextFormatting +from content import removeHtmlTag from theme import setCSSparam from jsonldsig import testSignJsonld from jsonldsig import jsonldVerify @@ -2162,8 +2163,18 @@ def testReplaceEmailQuote(): assert resultStr == expectedStr +def testRemoveHtmlTag(): + print('testRemoveHtmlTag') + testStr = "

" + resultStr = removeHtmlTag(testStr, 'width') + assert resultStr == "

" + + def runAllTests(): print('Running tests...') + testRemoveHtmlTag() testReplaceEmailQuote() testConstantTimeStringCheck() testTranslations()