Remove dangerous markup from RSS feeds

2020-10-11 10:33:31 +01:00 · 2020-10-11 10:33:31 +01:00 · 13c067bfa4
parent 051b361c79
commit 13c067bfa4
3 changed files with 52 additions and 3 deletions
--- a/content.py
+++ b/content.py
@@ -14,6 +14,23 @@ from utils import fileLastModified
 from utils import getLinkPrefixes


+def removeHtmlTag(htmlStr: str, tag: str) -> str:
+    """Removes the given attribute (e.g. width="864") from an HTML string
+    """
+    tagFound = True
+    while tagFound:
+        matchStr = ' ' + tag + '="'
+        if matchStr not in htmlStr:
+            tagFound = False
+            break
+        sections = htmlStr.split(matchStr, 1)
+        if '"' not in sections[1]:
+            tagFound = False
+            break
+        htmlStr = sections[0] + sections[1].split('"', 1)[1]
+    return htmlStr
+
+
 def removeQuotesWithinQuotes(content: str) -> str:
    """Removes any blockquote inside blockquote
    """
--- a/newsdaemon.py
+++ b/newsdaemon.py
@@ -12,6 +12,8 @@ import datetime
 from collections import OrderedDict
 from newswire import getDictFromNewswire
 from posts import createNewsPost
+from content import removeHtmlTag
+from content import dangerousMarkup
 from utils import loadJson
 from utils import saveJson
 from utils import getStatusNumber
@@ -51,8 +53,21 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:


 def removeControlCharacters(content: str) -> str:
-    content = content.replace('&8211;', '-').replace('&#8211;', '-')
-    return content.replace('&8230;', '...').replace('&#8230;', '...')
+    """Replaces undecoded character references (e.g. &#8211;) with ASCII.
+    TODO this is hacky, the unicode is mangled somehow; find a better fix
+    """
+    lookups = {
+        "8211": "-",
+        "8230": "...",
+        "8216": "'",
+        "8217": "'",
+        "8220": '"',
+        "8221": '"'
+    }
+    for code, ch in lookups.items():
+        content = content.replace('&' + code + ';', ch)
+        content = content.replace('&#' + code + ';', ch)
+    return content


 def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
@@ -96,6 +111,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,

        rssTitle = removeControlCharacters(item[0])
        url = item[1]
+        if dangerousMarkup(url) or dangerousMarkup(rssTitle):
+            continue
        rssDescription = ''

        # get the rss description if it exists
@@ -106,7 +123,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
        rssDescription = '<p>' + rssDescription + '<p>'

        # add the off-site link to the description
-        if rssDescription:
+        if rssDescription and not dangerousMarkup(rssDescription):
            rssDescription += \
                '<br><a href="' + url + '">' + \
                translate['Read more...'] + '</a>'
@@ -115,6 +132,10 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
                '<a href="' + url + '">' + \
                translate['Read more...'] + '</a>'

+        # remove image dimensions
+        rssDescription = removeHtmlTag(rssDescription, 'width')
+        rssDescription = removeHtmlTag(rssDescription, 'height')
+
        followersOnly = False
        useBlurhash = False
        # NOTE: the id when the post is created will not be
--- a/tests.py
+++ b/tests.py
@@ -78,6 +78,7 @@ from content import addHtmlTags
 from content import removeLongWords
 from content import replaceContentDuplicates
 from content import removeTextFormatting
+from content import removeHtmlTag
 from theme import setCSSparam
 from jsonldsig import testSignJsonld
 from jsonldsig import jsonldVerify
@@ -2162,8 +2163,18 @@ def testReplaceEmailQuote():
    assert resultStr == expectedStr


+def testRemoveHtmlTag():
+    print('testRemoveHtmlTag')
+    testStr = "<p><img width=\"864\" height=\"486\" " + \
+        "src=\"https://somesiteorother.com/image.jpg\"></p>"
+    resultStr = removeHtmlTag(testStr, 'width')
+    assert resultStr == "<p><img height=\"486\" " + \
+        "src=\"https://somesiteorother.com/image.jpg\"></p>"
+
+
 def runAllTests():
    print('Running tests...')
+    testRemoveHtmlTag()
    testReplaceEmailQuote()
    testConstantTimeStringCheck()
    testTranslations()