Remove dangerous markup from RSS feeds

2020-10-11 10:33:31 +01:00 · 2020-10-11 10:33:31 +01:00 · 13c067bfa4
parent 051b361c79
commit 13c067bfa4
3 changed files with 52 additions and 3 deletions
--- a/content.py
+++ b/content.py
@ -14,6 +14,23 @@ from utils import fileLastModified
 from utils import getLinkPrefixes
 def removeHtmlTag(htmlStr: str, tag: str) -> str:
    """Strips every ' tag="value"' attribute from a html string.
    Only double-quoted values preceded by a single space are matched.
    If a match has no closing quote the string is returned as it
    stands at that point.
    """
    marker = ' ' + tag + '="'
    while True:
        before, found, after = htmlStr.partition(marker)
        if not found:
            # no further occurrences of the attribute
            return htmlStr
        _value, closingQuote, remainder = after.partition('"')
        if not closingQuote:
            # unterminated attribute value, so leave it alone
            return htmlStr
        # splice out the attribute together with its quoted value
        htmlStr = before + remainder
 def removeQuotesWithinQuotes(content: str) -> str:
    """Removes any blockquote inside blockquote
    """
--- a/newsdaemon.py
+++ b/newsdaemon.py
@ -12,6 +12,8 @@ import datetime
 from collections import OrderedDict
 from newswire import getDictFromNewswire
 from posts import createNewsPost
 from content import removeHtmlTag
 from content import dangerousMarkup
 from utils import loadJson
 from utils import saveJson
 from utils import getStatusNumber
@ -51,8 +53,21 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
 def removeControlCharacters(content: str) -> str:
-    content = content.replace('&8211;', '-').replace('&#8211;', '-')
+    """TODO this is hacky and a better solution is needed
-    return content.replace('&8230;', '...').replace('&#8230;', '...')
+    the unicode is messing up somehow
    """
    # decimal character codes mapped to the plain ASCII text
    # which is substituted for the corresponding entity
    lookups = {
        "8211": "-",  # en dash
        "8230": "...",  # horizontal ellipsis
        "8216": "'",  # left single quotation mark
        "8217": "'",  # right single quotation mark
        "8220": '"',  # left double quotation mark
        "8221": '"'  # right double quotation mark
    }
    for code, ch in lookups.items():
        # handle the form without a hash ('&8211;') as well as the
        # standard numeric entity form ('&#8211;')
        content = content.replace('&' + code + ';', ch)
        content = content.replace('&#' + code + ';', ch)
    return content
 def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
@ -96,6 +111,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
        rssTitle = removeControlCharacters(item[0])
        url = item[1]
        if dangerousMarkup(url) or dangerousMarkup(rssTitle):
            continue
        rssDescription = ''
        # get the rss description if it exists
@ -106,7 +123,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
        rssDescription = '<p>' + rssDescription + '<p>'
        # add the off-site link to the description
-        if rssDescription:
+        if rssDescription and not dangerousMarkup(rssDescription):
            rssDescription += \
                '<br><a href="' + url + '">' + \
                translate['Read more...'] + '</a>'
@ -115,6 +132,10 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
                '<a href="' + url + '">' + \
                translate['Read more...'] + '</a>'
        # remove image dimensions
        rssDescription = removeHtmlTag(rssDescription, 'width')
        rssDescription = removeHtmlTag(rssDescription, 'height')
        followersOnly = False
        useBlurhash = False
        # NOTE: the id when the post is created will not be
--- a/tests.py
+++ b/tests.py
@ -78,6 +78,7 @@ from content import addHtmlTags
 from content import removeLongWords
 from content import replaceContentDuplicates
 from content import removeTextFormatting
 from content import removeHtmlTag
 from theme import setCSSparam
 from jsonldsig import testSignJsonld
 from jsonldsig import jsonldVerify
@ -2162,8 +2163,18 @@ def testReplaceEmailQuote():
    assert resultStr == expectedStr
 def testRemoveHtmlTag():
    """Checks that removeHtmlTag takes the width attribute out of an
    img element and leaves the other attributes in place
    """
    print('testRemoveHtmlTag')
    # tail shared by the input markup and the expected result
    imgTail = 'src="https://somesiteorother.com/image.jpg"></p>'
    htmlWithWidth = '<p><img width="864" height="486" ' + imgTail
    expectedHtml = '<p><img height="486" ' + imgTail
    strippedHtml = removeHtmlTag(htmlWithWidth, 'width')
    assert strippedHtml == expectedHtml
 def runAllTests():
    print('Running tests...')
    testRemoveHtmlTag()
    testReplaceEmailQuote()
    testConstantTimeStringCheck()
    testTranslations()