Remove dangerous markup from rss feeds

merge-requests/30/head
Bob Mottram 2020-10-11 10:33:31 +01:00
parent 051b361c79
commit 13c067bfa4
3 changed files with 52 additions and 3 deletions

View File

@ -14,6 +14,23 @@ from utils import fileLastModified
from utils import getLinkPrefixes from utils import getLinkPrefixes
def removeHtmlTag(htmlStr: str, tag: str) -> str:
    """Removes every occurrence of a given attribute from a html string

    Despite the function name, the tag argument is an attribute name
    such as 'width'. Each ' tag="value"' sequence (a single leading
    space followed by a double quoted value) is cut out of htmlStr,
    including its leading space.

    Occurrences are removed from left to right. If an occurrence has no
    closing double quote then it and everything after it are left as
    they are. Single quoted or unquoted attribute values are not matched.

    Returns the html string with the attribute removed
    """
    # the search string never changes, so build it once outside the loop
    matchStr = ' ' + tag + '="'
    while matchStr in htmlStr:
        before, _, after = htmlStr.partition(matchStr)
        _, closingQuote, remainder = after.partition('"')
        if not closingQuote:
            # unterminated attribute value: stop rather than guess its end
            break
        htmlStr = before + remainder
    return htmlStr
def removeQuotesWithinQuotes(content: str) -> str: def removeQuotesWithinQuotes(content: str) -> str:
"""Removes any blockquote inside blockquote """Removes any blockquote inside blockquote
""" """

View File

@ -12,6 +12,8 @@ import datetime
from collections import OrderedDict from collections import OrderedDict
from newswire import getDictFromNewswire from newswire import getDictFromNewswire
from posts import createNewsPost from posts import createNewsPost
from content import removeHtmlTag
from content import dangerousMarkup
from utils import loadJson from utils import loadJson
from utils import saveJson from utils import saveJson
from utils import getStatusNumber from utils import getStatusNumber
@ -51,8 +53,21 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
def removeControlCharacters(content: str) -> str:
    """TODO this is hacky and a better solution is needed
    the unicode is messing up somehow

    Replaces numeric character references for a fixed set of
    typographic punctuation marks with plain ASCII text. Each code is
    substituted both in its '&NNNN;' form and in its '&#NNNN;' form.

    Returns the content with those references replaced
    """
    # decimal character code -> ASCII replacement
    lookups = {
        "8211": "-",    # en dash
        "8230": "...",  # horizontal ellipsis
        "8216": "'",    # left single quotation mark
        "8217": "'",    # right single quotation mark
        "8220": '"',    # left double quotation mark
        "8221": '"'     # right double quotation mark
    }
    for code, ch in lookups.items():
        # feeds have been seen both with and without the '#'
        for prefix in ('&', '&#'):
            content = content.replace(prefix + code + ';', ch)
    return content
def convertRSStoActivityPub(baseDir: str, httpPrefix: str, def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
@ -96,6 +111,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
rssTitle = removeControlCharacters(item[0]) rssTitle = removeControlCharacters(item[0])
url = item[1] url = item[1]
if dangerousMarkup(url) or dangerousMarkup(rssTitle):
continue
rssDescription = '' rssDescription = ''
# get the rss description if it exists # get the rss description if it exists
@ -106,7 +123,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
rssDescription = '<p>' + rssDescription + '<p>' rssDescription = '<p>' + rssDescription + '<p>'
# add the off-site link to the description # add the off-site link to the description
if rssDescription: if rssDescription and not dangerousMarkup(rssDescription):
rssDescription += \ rssDescription += \
'<br><a href="' + url + '">' + \ '<br><a href="' + url + '">' + \
translate['Read more...'] + '</a>' translate['Read more...'] + '</a>'
@ -115,6 +132,10 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
'<a href="' + url + '">' + \ '<a href="' + url + '">' + \
translate['Read more...'] + '</a>' translate['Read more...'] + '</a>'
# remove image dimensions
rssDescription = removeHtmlTag(rssDescription, 'width')
rssDescription = removeHtmlTag(rssDescription, 'height')
followersOnly = False followersOnly = False
useBlurhash = False useBlurhash = False
# NOTE: the id when the post is created will not be # NOTE: the id when the post is created will not be

View File

@ -78,6 +78,7 @@ from content import addHtmlTags
from content import removeLongWords from content import removeLongWords
from content import replaceContentDuplicates from content import replaceContentDuplicates
from content import removeTextFormatting from content import removeTextFormatting
from content import removeHtmlTag
from theme import setCSSparam from theme import setCSSparam
from jsonldsig import testSignJsonld from jsonldsig import testSignJsonld
from jsonldsig import jsonldVerify from jsonldsig import jsonldVerify
@ -2162,8 +2163,18 @@ def testReplaceEmailQuote():
assert resultStr == expectedStr assert resultStr == expectedStr
def testRemoveHtmlTag():
    """Checks that removeHtmlTag drops the width attribute of an img
    element while keeping its height and src attributes
    """
    print('testRemoveHtmlTag')
    imageUrl = "https://somesiteorother.com/image.jpg"
    htmlWithDimensions = \
        "<p><img width=\"864\" height=\"486\" src=\"" + imageUrl + "\"></p>"
    expectedHtml = \
        "<p><img height=\"486\" src=\"" + imageUrl + "\"></p>"
    assert removeHtmlTag(htmlWithDimensions, 'width') == expectedHtml
def runAllTests(): def runAllTests():
print('Running tests...') print('Running tests...')
testRemoveHtmlTag()
testReplaceEmailQuote() testReplaceEmailQuote()
testConstantTimeStringCheck() testConstantTimeStringCheck()
testTranslations() testTranslations()