diff --git a/content.py b/content.py index d1a170192..6de17eb60 100644 --- a/content.py +++ b/content.py @@ -14,6 +14,23 @@ from utils import fileLastModified from utils import getLinkPrefixes +def removeHtmlTag(htmlStr: str, tag: str) -> str: + """Removes a given tag from a html string + """ + tagFound = True + while tagFound: + matchStr = ' ' + tag + '="' + if matchStr not in htmlStr: + tagFound = False + break + sections = htmlStr.split(matchStr, 1) + if '"' not in sections[1]: + tagFound = False + break + htmlStr = sections[0] + sections[1].split('"', 1)[1] + return htmlStr + + def removeQuotesWithinQuotes(content: str) -> str: """Removes any blockquote inside blockquote """ diff --git a/newsdaemon.py b/newsdaemon.py index 610a84779..9162a869c 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -12,6 +12,8 @@ import datetime from collections import OrderedDict from newswire import getDictFromNewswire from posts import createNewsPost +from content import removeHtmlTag +from content import dangerousMarkup from utils import loadJson from utils import saveJson from utils import getStatusNumber @@ -51,8 +53,21 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None: def removeControlCharacters(content: str) -> str: - content = content.replace('&8211;', '-').replace('&#8211;', '-') - return content.replace('&8230;', '...').replace('&#8230;', '...') + """TODO this is hacky and a better solution is needed + the unicode is messing up somehow + """ + lookups = { + "8211": "-", + "8230": "...", + "8216": "'", + "8217": "'", + "8220": '"', + "8221": '"' + } + for code, ch in lookups.items(): + content = content.replace('&' + code + ';', ch) + content = content.replace('&#' + code + ';', ch) + return content def convertRSStoActivityPub(baseDir: str, httpPrefix: str, @@ -96,6 +111,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str, rssTitle = removeControlCharacters(item[0]) url = item[1] + if dangerousMarkup(url) or dangerousMarkup(rssTitle): + continue 
rssDescription = '' # get the rss description if it exists @@ -106,7 +123,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str, rssDescription = '
' + rssDescription + '
'
# add the off-site link to the description
- if rssDescription:
+ if rssDescription and not dangerousMarkup(rssDescription):
rssDescription += \
'
' + \
translate['Read more...'] + ''
@@ -115,6 +132,10 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
'' + \
translate['Read more...'] + ''
+ # remove image dimensions
+ rssDescription = removeHtmlTag(rssDescription, 'width')
+ rssDescription = removeHtmlTag(rssDescription, 'height')
+
followersOnly = False
useBlurhash = False
# NOTE: the id when the post is created will not be
diff --git a/tests.py b/tests.py
index ea93839e1..af64d550c 100644
--- a/tests.py
+++ b/tests.py
@@ -78,6 +78,7 @@ from content import addHtmlTags
from content import removeLongWords
from content import replaceContentDuplicates
from content import removeTextFormatting
+from content import removeHtmlTag
from theme import setCSSparam
from jsonldsig import testSignJsonld
from jsonldsig import jsonldVerify
@@ -2162,8 +2163,18 @@ def testReplaceEmailQuote():
assert resultStr == expectedStr
+def testRemoveHtmlTag():
+ print('testRemoveHtmlTag')
+ testStr = "