From 13c067bfa41388a5c2515596216daf87a58bb2df Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@freedombone.net>
Date: Sun, 11 Oct 2020 10:33:31 +0100
Subject: [PATCH] Remove dangerous markup from rss feeds

---
 content.py    | 17 +++++++++++++++++
 newsdaemon.py | 27 ++++++++++++++++++++++++---
 tests.py      | 11 +++++++++++
 3 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/content.py b/content.py
index d1a17019..6de17eb6 100644
--- a/content.py
+++ b/content.py
@@ -14,6 +14,23 @@ from utils import fileLastModified
 from utils import getLinkPrefixes
 
 
+def removeHtmlTag(htmlStr: str, tag: str) -> str:
+    """Strips every occurrence of the attribute named by tag from htmlStr.
+
+    Despite the name, this removes attributes such as width="864" (with
+    the single space before them), not whole elements. Only double-quoted
+    values are matched, and an unterminated value stops the scan.
+    """
+    matchStr = ' ' + tag + '="'
+    while matchStr in htmlStr:
+        before, after = htmlStr.split(matchStr, 1)
+        if '"' not in after:
+            # no closing quote: leave the remainder untouched
+            break
+        htmlStr = before + after.split('"', 1)[1]
+    return htmlStr
+
+
 def removeQuotesWithinQuotes(content: str) -> str:
     """Removes any blockquote inside blockquote
     """
diff --git a/newsdaemon.py b/newsdaemon.py
index 610a8477..9162a869 100644
--- a/newsdaemon.py
+++ b/newsdaemon.py
@@ -12,6 +12,8 @@ import datetime
 from collections import OrderedDict
 from newswire import getDictFromNewswire
 from posts import createNewsPost
+from content import removeHtmlTag
+from content import dangerousMarkup
 from utils import loadJson
 from utils import saveJson
 from utils import getStatusNumber
@@ -51,8 +53,21 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
 
 
 def removeControlCharacters(content: str) -> str:
-    content = content.replace('&8211;', '-').replace('&#8211;', '-')
-    return content.replace('&8230;', '...').replace('&#8230;', '...')
+    """Swaps some numeric character references for plain ASCII text.
+
+    Covers dash (8211), ellipsis (8230) and curly quotes (8216, 8217,
+    8220, 8221), each with or without the '#' after the ampersand.
+    TODO hacky, needs a better solution: unicode is messing up somehow
+    """
+    replacements = (
+        ("8211", "-"), ("8230", "..."),
+        ("8216", "'"), ("8217", "'"),
+        ("8220", '"'), ("8221", '"'),
+    )
+    for code, ch in replacements:
+        for prefix in ('&', '&#'):
+            content = content.replace(prefix + code + ';', ch)
+    return content
 
 
 def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
@@ -96,6 +111,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
 
         rssTitle = removeControlCharacters(item[0])
         url = item[1]
+        if dangerousMarkup(url) or dangerousMarkup(rssTitle):
+            continue
         rssDescription = ''
 
         # get the rss description if it exists
@@ -106,7 +123,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
         rssDescription = '<p>' + rssDescription + '<p>'
 
         # add the off-site link to the description
-        if rssDescription:
+        if rssDescription and not dangerousMarkup(rssDescription):
             rssDescription += \
                 '<br><a href="' + url + '">' + \
                 translate['Read more...'] + '</a>'
@@ -115,6 +132,10 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
                 '<a href="' + url + '">' + \
                 translate['Read more...'] + '</a>'
 
+        # remove image dimensions
+        rssDescription = removeHtmlTag(rssDescription, 'width')
+        rssDescription = removeHtmlTag(rssDescription, 'height')
+
         followersOnly = False
         useBlurhash = False
         # NOTE: the id when the post is created will not be
diff --git a/tests.py b/tests.py
index ea93839e..af64d550 100644
--- a/tests.py
+++ b/tests.py
@@ -78,6 +78,7 @@ from content import addHtmlTags
 from content import removeLongWords
 from content import replaceContentDuplicates
 from content import removeTextFormatting
+from content import removeHtmlTag
 from theme import setCSSparam
 from jsonldsig import testSignJsonld
 from jsonldsig import jsonldVerify
@@ -2162,8 +2163,18 @@ def testReplaceEmailQuote():
     assert resultStr == expectedStr
 
 
+def testRemoveHtmlTag():
+    # only the width attribute goes; height and src must survive
+    print('testRemoveHtmlTag')
+    imgSrc = 'src="https://somesiteorother.com/image.jpg"'
+    testStr = '<p><img width="864" height="486" ' + imgSrc + '></p>'
+    expectedStr = '<p><img height="486" ' + imgSrc + '></p>'
+    assert removeHtmlTag(testStr, 'width') == expectedStr
+
+
 def runAllTests():
     print('Running tests...')
+    testRemoveHtmlTag()
     testReplaceEmailQuote()
     testConstantTimeStringCheck()
     testTranslations()