Remove escaped html from feeds

main
Bob Mottram 2020-10-20 14:07:02 +01:00
parent ee76750305
commit 77fd759adf
1 changed file with 7 additions and 13 deletions

View File

@@ -15,6 +15,7 @@ __status__ = "Production"
 import os
 import time
 import datetime
+import html
 from shutil import rmtree
 from subprocess import Popen
 from collections import OrderedDict
@@ -65,20 +66,10 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
 def removeControlCharacters(content: str) -> str:
-    """TODO this is hacky and a better solution is needed
-    the unicode is messing up somehow
+    """Remove escaped html
     """
-    lookups = {
-        "8211": "-",
-        "8230": "...",
-        "8216": "'",
-        "8217": "'",
-        "8220": '"',
-        "8221": '"'
-    }
-    for code, ch in lookups.items():
-        content = content.replace('&' + code + ';', ch)
-        content = content.replace('&#' + code + ';', ch)
+    if '&' in content:
+        return html.unescape(content)
     return content
@ -513,6 +504,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
if rssDescription.startswith('<![CDATA['): if rssDescription.startswith('<![CDATA['):
rssDescription = rssDescription.replace('<![CDATA[', '') rssDescription = rssDescription.replace('<![CDATA[', '')
rssDescription = rssDescription.replace(']]>', '') rssDescription = rssDescription.replace(']]>', '')
if '&' in rssDescription:
rssDescription = html.unescape(rssDescription)
rssDescription = '<p>' + rssDescription + '<p>' rssDescription = '<p>' + rssDescription + '<p>'
mirrored = item[7] mirrored = item[7]
@ -578,6 +571,7 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
blog['object']['url'] = \ blog['object']['url'] = \
httpPrefix + '://' + domain + '/@news/' + statusNumber httpPrefix + '://' + domain + '/@news/' + statusNumber
blog['object']['published'] = dateStr blog['object']['published'] = dateStr
blog['object']['content'] = rssDescription blog['object']['content'] = rssDescription
blog['object']['contentMap']['en'] = rssDescription blog['object']['contentMap']['en'] = rssDescription