From bc77031e6a08b5055079733c312f883e0ea02fd4 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:18:43 +0000 Subject: [PATCH] Ensure that CDATA is removed from titles and descriptions --- newswire.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/newswire.py b/newswire.py index b9b476b8..52b8c931 100644 --- a/newswire.py +++ b/newswire.py @@ -25,6 +25,16 @@ from blocking import isBlockedHashtag from filters import isFiltered +def removeCDATA(text: str) -> str: + """Removes any CDATA from the given text + """ + if 'CDATA[' in text: + text = text.split('CDATA[')[1] + if ']' in text: + text = text.split(']')[0] + return text + + def rss2Header(httpPrefix: str, nickname: str, domainFull: str, title: str, translate: {}) -> str: @@ -154,16 +164,17 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, if '' not in rssItem: continue title = rssItem.split('')[1] - title = title.split('')[0] + title = removeCDATA(title.split('')[0]) description = '' if '' in rssItem and '' in rssItem: description = rssItem.split('')[1] - description = description.split('')[0] + description = removeCDATA(description.split('')[0]) else: if '' in rssItem and \ '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + description = removeCDATA(description) link = rssItem.split('')[1] link = link.split('')[0] if '://' not in link: @@ -243,16 +254,17 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, if '' not in rssItem: continue title = rssItem.split('')[1] - title = title.split('')[0] + title = removeCDATA(title.split('')[0]) description = '' if '' in rssItem and '' in rssItem: description = rssItem.split('')[1] - description = description.split('')[0] + description = removeCDATA(description.split('')[0]) else: if '' in rssItem and \ '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + description = removeCDATA(description) link = rssItem.split('')[1] link = link.split('')[0] if '://' not in link: @@ -333,15 +345,17 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, if '' not in rssItem: continue title = rssItem.split('')[1] - title = title.split('')[0] + title = removeCDATA(title.split('')[0]) description = '' if '' in rssItem and \ '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + description = removeCDATA(description) elif '' in rssItem and '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + description = removeCDATA(description) link = rssItem.split('')[1] link = link.split('')[0] link = 'https://www.youtube.com/watch?v=' + link.strip() @@ -494,7 +508,7 @@ def getRSSfromDict(baseDir: str, newswire: {}, continue rssStr += '\n' rssStr += ' ' + fields[0] + '\n' - description = firstParagraphFromString(fields[4]) + description = removeCDATA(firstParagraphFromString(fields[4])) rssStr += ' ' + description + '\n' url = fields[1] if '://' not in url: @@ -614,6 +628,7 @@ def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str, votes = loadJson(fullPostFilename + '.votes') content = postJsonObject['object']['content'] description = firstParagraphFromString(content) + description = removeCDATA(description) addNewswireDictEntry(baseDir, domain, newswire, published, postJsonObject['object']['summary'],