Remove dangerous markup from rss feeds

main
Bob Mottram 2020-10-11 10:33:31 +01:00
parent 051b361c79
commit 13c067bfa4
3 changed files with 52 additions and 3 deletions

View File

@ -14,6 +14,23 @@ from utils import fileLastModified
from utils import getLinkPrefixes
def removeHtmlTag(htmlStr: str, tag: str) -> str:
    """Removes every occurrence of a given attribute from a html string

    Despite the function name, 'tag' is an attribute name such as
    'width' or 'height'. Only occurrences written as a space followed by
    tag="value" (double quoted) are removed; the leading space, the
    attribute name and its quoted value are all dropped.

    Returns the html string with those attributes removed. If an
    occurrence has no closing double quote then removal stops and the
    remaining text is returned unaltered.
    """
    # the pattern does not change between passes, so build it once
    matchStr = ' ' + tag + '="'
    while True:
        # always search again from the start of the string, so that an
        # attribute which only appears after an earlier removal has
        # joined two fragments together is also removed
        before, found, after = htmlStr.partition(matchStr)
        if not found:
            break
        _, closingQuote, remainder = after.partition('"')
        if not closingQuote:
            # unterminated attribute value: leave the rest untouched
            break
        htmlStr = before + remainder
    return htmlStr
def removeQuotesWithinQuotes(content: str) -> str:
"""Removes any blockquote inside blockquote
"""

View File

@ -12,6 +12,8 @@ import datetime
from collections import OrderedDict
from newswire import getDictFromNewswire
from posts import createNewsPost
from content import removeHtmlTag
from content import dangerousMarkup
from utils import loadJson
from utils import saveJson
from utils import getStatusNumber
@ -51,8 +53,21 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
def removeControlCharacters(content: str) -> str:
    """Replaces some typographic punctuation with plain ASCII

    Numeric character references for dashes, ellipsis and curly quotes
    are replaced, both in the well formed '&#NNNN;' form and in the
    malformed '&NNNN;' form which has no '#'. Literal en dash and
    ellipsis characters are replaced too.

    TODO this is hacky and a better solution is needed
    the unicode is messing up somehow

    Returns the content with those substitutions applied.
    """
    # decimal code point -> ASCII replacement
    lookups = {
        "8211": "-",
        "8230": "...",
        "8216": "'",
        "8217": "'",
        "8220": '"',
        "8221": '"'
    }
    for code, ch in lookups.items():
        content = content.replace('&' + code + ';', ch)
        content = content.replace('&#' + code + ';', ch)
    # literal en dash (U+2013) and horizontal ellipsis (U+2026)
    content = content.replace('\u2013', '-').replace('\u2026', '...')
    return content
def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
@ -96,6 +111,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
rssTitle = removeControlCharacters(item[0])
url = item[1]
if dangerousMarkup(url) or dangerousMarkup(rssTitle):
continue
rssDescription = ''
# get the rss description if it exists
@ -106,7 +123,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
rssDescription = '<p>' + rssDescription + '<p>'
# add the off-site link to the description
if rssDescription:
if rssDescription and not dangerousMarkup(rssDescription):
rssDescription += \
'<br><a href="' + url + '">' + \
translate['Read more...'] + '</a>'
@ -115,6 +132,10 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
'<a href="' + url + '">' + \
translate['Read more...'] + '</a>'
# remove image dimensions
rssDescription = removeHtmlTag(rssDescription, 'width')
rssDescription = removeHtmlTag(rssDescription, 'height')
followersOnly = False
useBlurhash = False
# NOTE: the id when the post is created will not be

View File

@ -78,6 +78,7 @@ from content import addHtmlTags
from content import removeLongWords
from content import replaceContentDuplicates
from content import removeTextFormatting
from content import removeHtmlTag
from theme import setCSSparam
from jsonldsig import testSignJsonld
from jsonldsig import jsonldVerify
@ -2162,8 +2163,18 @@ def testReplaceEmailQuote():
assert resultStr == expectedStr
def testRemoveHtmlTag():
    """Tests that removeHtmlTag strips a named attribute from html
    and leaves the rest of the markup unchanged
    """
    print('testRemoveHtmlTag')
    testStr = "<p><img width=\"864\" height=\"486\" " + \
        "src=\"https://somesiteorother.com/image.jpg\"></p>"

    # a single attribute is removed and the others are kept
    resultStr = removeHtmlTag(testStr, 'width')
    assert resultStr == "<p><img height=\"486\" " + \
        "src=\"https://somesiteorother.com/image.jpg\"></p>"

    resultStr = removeHtmlTag(testStr, 'height')
    assert resultStr == "<p><img width=\"864\" " + \
        "src=\"https://somesiteorother.com/image.jpg\"></p>"

    # removing both dimensions, as is done for rss descriptions
    resultStr = removeHtmlTag(removeHtmlTag(testStr, 'width'), 'height')
    assert resultStr == "<p><img " + \
        "src=\"https://somesiteorother.com/image.jpg\"></p>"

    # an attribute which isn't present leaves the html unchanged
    assert removeHtmlTag(testStr, 'style') == testStr
def runAllTests():
print('Running tests...')
testRemoveHtmlTag()
testReplaceEmailQuote()
testConstantTimeStringCheck()
testTranslations()