mirror of https://gitlab.com/bashrc2/epicyon
Remove dangerous markup from rss feeds
parent
051b361c79
commit
13c067bfa4
17
content.py
17
content.py
|
@ -14,6 +14,23 @@ from utils import fileLastModified
|
||||||
from utils import getLinkPrefixes
|
from utils import getLinkPrefixes
|
||||||
|
|
||||||
|
|
||||||
|
def removeHtmlTag(htmlStr: str, tag: str) -> str:
    """Removes every occurrence of a given attribute from a html string

    Despite the parameter name, 'tag' is an attribute name such as
    'width' or 'height'. Each ' name="value"' or " name='value'" found
    within htmlStr is deleted, together with its leading space.
    The attribute must be preceded by a single space character and
    have a quoted value. If the closing quote is missing then the
    remaining text is left unaltered rather than being truncated.
    Returns the html string with the attribute removed.
    """
    # html permits both double and single quoted attribute values
    for quote in ('"', "'"):
        matchStr = ' ' + tag + '=' + quote
        while matchStr in htmlStr:
            before, after = htmlStr.split(matchStr, 1)
            if quote not in after:
                # unterminated value: stop, so that the rest of the
                # markup is not swallowed
                break
            # drop the value and its closing quote
            htmlStr = before + after.split(quote, 1)[1]
    return htmlStr
|
||||||
|
|
||||||
|
|
||||||
def removeQuotesWithinQuotes(content: str) -> str:
|
def removeQuotesWithinQuotes(content: str) -> str:
|
||||||
"""Removes any blockquote inside blockquote
|
"""Removes any blockquote inside blockquote
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -12,6 +12,8 @@ import datetime
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from newswire import getDictFromNewswire
|
from newswire import getDictFromNewswire
|
||||||
from posts import createNewsPost
|
from posts import createNewsPost
|
||||||
|
from content import removeHtmlTag
|
||||||
|
from content import dangerousMarkup
|
||||||
from utils import loadJson
|
from utils import loadJson
|
||||||
from utils import saveJson
|
from utils import saveJson
|
||||||
from utils import getStatusNumber
|
from utils import getStatusNumber
|
||||||
|
@ -51,8 +53,21 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
|
||||||
|
|
||||||
|
|
||||||
def removeControlCharacters(content: str) -> str:
    """Replaces typographic punctuation within feed text with ASCII

    For each decimal character code in the lookup table, three
    spellings are replaced by the plain ASCII equivalent:
    '&NNNN;', '&#NNNN;' and the literal unicode character itself.
    The codes handled are en dash (8211), ellipsis (8230), single
    curly quotes (8216, 8217) and double curly quotes (8220, 8221).
    Returns the content with those replacements applied.

    TODO this is hacky and a better solution is needed
    the unicode is messing up somehow
    """
    lookups = {
        "8211": "-",
        "8230": "...",
        "8216": "'",
        "8217": "'",
        "8220": '"',
        "8221": '"'
    }
    for code, ch in lookups.items():
        # entity missing its '#', as seen in some feeds
        content = content.replace('&' + code + ';', ch)
        # well formed numeric character reference
        content = content.replace('&#' + code + ';', ch)
        # the character itself, if the feed was already decoded
        content = content.replace(chr(int(code)), ch)
    return content
|
||||||
|
|
||||||
|
|
||||||
def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
|
def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
|
||||||
|
@ -96,6 +111,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
|
||||||
|
|
||||||
rssTitle = removeControlCharacters(item[0])
|
rssTitle = removeControlCharacters(item[0])
|
||||||
url = item[1]
|
url = item[1]
|
||||||
|
if dangerousMarkup(url) or dangerousMarkup(rssTitle):
|
||||||
|
continue
|
||||||
rssDescription = ''
|
rssDescription = ''
|
||||||
|
|
||||||
# get the rss description if it exists
|
# get the rss description if it exists
|
||||||
|
@ -106,7 +123,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
|
||||||
rssDescription = '<p>' + rssDescription + '<p>'
|
rssDescription = '<p>' + rssDescription + '<p>'
|
||||||
|
|
||||||
# add the off-site link to the description
|
# add the off-site link to the description
|
||||||
if rssDescription:
|
if rssDescription and not dangerousMarkup(rssDescription):
|
||||||
rssDescription += \
|
rssDescription += \
|
||||||
'<br><a href="' + url + '">' + \
|
'<br><a href="' + url + '">' + \
|
||||||
translate['Read more...'] + '</a>'
|
translate['Read more...'] + '</a>'
|
||||||
|
@ -115,6 +132,10 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
|
||||||
'<a href="' + url + '">' + \
|
'<a href="' + url + '">' + \
|
||||||
translate['Read more...'] + '</a>'
|
translate['Read more...'] + '</a>'
|
||||||
|
|
||||||
|
# remove image dimensions
|
||||||
|
rssDescription = removeHtmlTag(rssDescription, 'width')
|
||||||
|
rssDescription = removeHtmlTag(rssDescription, 'height')
|
||||||
|
|
||||||
followersOnly = False
|
followersOnly = False
|
||||||
useBlurhash = False
|
useBlurhash = False
|
||||||
# NOTE: the id when the post is created will not be
|
# NOTE: the id when the post is created will not be
|
||||||
|
|
11
tests.py
11
tests.py
|
@ -78,6 +78,7 @@ from content import addHtmlTags
|
||||||
from content import removeLongWords
|
from content import removeLongWords
|
||||||
from content import replaceContentDuplicates
|
from content import replaceContentDuplicates
|
||||||
from content import removeTextFormatting
|
from content import removeTextFormatting
|
||||||
|
from content import removeHtmlTag
|
||||||
from theme import setCSSparam
|
from theme import setCSSparam
|
||||||
from jsonldsig import testSignJsonld
|
from jsonldsig import testSignJsonld
|
||||||
from jsonldsig import jsonldVerify
|
from jsonldsig import jsonldVerify
|
||||||
|
@ -2162,8 +2163,18 @@ def testReplaceEmailQuote():
|
||||||
assert resultStr == expectedStr
|
assert resultStr == expectedStr
|
||||||
|
|
||||||
|
|
||||||
|
def testRemoveHtmlTag():
    """Checks that removeHtmlTag strips the width attribute from an
    image element while leaving the height and src attributes alone
    """
    print('testRemoveHtmlTag')
    imageSrc = 'src="https://somesiteorother.com/image.jpg"></p>'
    originalHtml = '<p><img width="864" height="486" ' + imageSrc
    expectedHtml = '<p><img height="486" ' + imageSrc
    strippedHtml = removeHtmlTag(originalHtml, 'width')
    assert strippedHtml == expectedHtml
|
||||||
|
|
||||||
|
|
||||||
def runAllTests():
|
def runAllTests():
|
||||||
print('Running tests...')
|
print('Running tests...')
|
||||||
|
testRemoveHtmlTag()
|
||||||
testReplaceEmailQuote()
|
testReplaceEmailQuote()
|
||||||
testConstantTimeStringCheck()
|
testConstantTimeStringCheck()
|
||||||
testTranslations()
|
testTranslations()
|
||||||
|
|
Loading…
Reference in New Issue