Remove HTML from RSS feed descriptions

merge-requests/30/head
Bob Mottram 2021-01-11 21:54:25 +00:00
parent ff15cea822
commit 3c1314d4b4
2 changed files with 10 additions and 11 deletions

View File

@@ -23,7 +23,6 @@ from newswire import getDictFromNewswire
# from posts import sendSignedJson # from posts import sendSignedJson
from posts import createNewsPost from posts import createNewsPost
from posts import archivePostsForPerson from posts import archivePostsForPerson
from content import removeHtmlTag
from content import dangerousMarkup from content import dangerousMarkup
from content import validHashTag from content import validHashTag
from utils import removeHtml from utils import removeHtml

View File

@@ -304,13 +304,13 @@ def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
description = '' description = ''
if '<description>' in rssItem and '</description>' in rssItem: if '<description>' in rssItem and '</description>' in rssItem:
description = rssItem.split('<description>')[1] description = rssItem.split('<description>')[1]
description = _removeCDATA(description.split('</description>')[0]) description = removeHtml(description.split('</description>')[0])
else: else:
if '<media:description>' in rssItem and \ if '<media:description>' in rssItem and \
'</media:description>' in rssItem: '</media:description>' in rssItem:
description = rssItem.split('<media:description>')[1] description = rssItem.split('<media:description>')[1]
description = description.split('</media:description>')[0] description = description.split('</media:description>')[0]
description = _removeCDATA(description) description = removeHtml(description)
link = rssItem.split('<link>')[1] link = rssItem.split('<link>')[1]
link = link.split('</link>')[0] link = link.split('</link>')[0]
if '://' not in link: if '://' not in link:
@@ -388,13 +388,13 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
description = '' description = ''
if '<description>' in rssItem and '</description>' in rssItem: if '<description>' in rssItem and '</description>' in rssItem:
description = rssItem.split('<description>')[1] description = rssItem.split('<description>')[1]
description = _removeCDATA(description.split('</description>')[0]) description = removeHtml(description.split('</description>')[0])
else: else:
if '<media:description>' in rssItem and \ if '<media:description>' in rssItem and \
'</media:description>' in rssItem: '</media:description>' in rssItem:
description = rssItem.split('<media:description>')[1] description = rssItem.split('<media:description>')[1]
description = description.split('</media:description>')[0] description = description.split('</media:description>')[0]
description = _removeCDATA(description) description = removeHtml(description)
link = rssItem.split('<link>')[1] link = rssItem.split('<link>')[1]
link = link.split('</link>')[0] link = link.split('</link>')[0]
if '://' not in link: if '://' not in link:
@@ -460,13 +460,13 @@ def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
description = '' description = ''
if '<summary>' in atomItem and '</summary>' in atomItem: if '<summary>' in atomItem and '</summary>' in atomItem:
description = atomItem.split('<summary>')[1] description = atomItem.split('<summary>')[1]
description = _removeCDATA(description.split('</summary>')[0]) description = removeHtml(description.split('</summary>')[0])
else: else:
if '<media:description>' in atomItem and \ if '<media:description>' in atomItem and \
'</media:description>' in atomItem: '</media:description>' in atomItem:
description = atomItem.split('<media:description>')[1] description = atomItem.split('<media:description>')[1]
description = description.split('</media:description>')[0] description = description.split('</media:description>')[0]
description = _removeCDATA(description) description = removeHtml(description)
link = atomItem.split('<link>')[1] link = atomItem.split('<link>')[1]
link = link.split('</link>')[0] link = link.split('</link>')[0]
if '://' not in link: if '://' not in link:
@@ -538,11 +538,11 @@ def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
'</media:description>' in atomItem: '</media:description>' in atomItem:
description = atomItem.split('<media:description>')[1] description = atomItem.split('<media:description>')[1]
description = description.split('</media:description>')[0] description = description.split('</media:description>')[0]
description = _removeCDATA(description) description = removeHtml(description)
elif '<summary>' in atomItem and '</summary>' in atomItem: elif '<summary>' in atomItem and '</summary>' in atomItem:
description = atomItem.split('<summary>')[1] description = atomItem.split('<summary>')[1]
description = description.split('</summary>')[0] description = description.split('</summary>')[0]
description = _removeCDATA(description) description = removeHtml(description)
link = atomItem.split('<yt:videoId>')[1] link = atomItem.split('<yt:videoId>')[1]
link = link.split('</yt:videoId>')[0] link = link.split('</yt:videoId>')[0]
link = 'https://www.youtube.com/watch?v=' + link.strip() link = 'https://www.youtube.com/watch?v=' + link.strip()
@@ -692,7 +692,7 @@ def getRSSfromDict(baseDir: str, newswire: {},
continue continue
rssStr += '<item>\n' rssStr += '<item>\n'
rssStr += ' <title>' + fields[0] + '</title>\n' rssStr += ' <title>' + fields[0] + '</title>\n'
description = _removeCDATA(firstParagraphFromString(fields[4])) description = removeHtml(firstParagraphFromString(fields[4]))
rssStr += ' <description>' + description + '</description>\n' rssStr += ' <description>' + description + '</description>\n'
url = fields[1] url = fields[1]
if '://' not in url: if '://' not in url:
@@ -812,7 +812,7 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
votes = loadJson(fullPostFilename + '.votes') votes = loadJson(fullPostFilename + '.votes')
content = postJsonObject['object']['content'] content = postJsonObject['object']['content']
description = firstParagraphFromString(content) description = firstParagraphFromString(content)
description = _removeCDATA(description) description = removeHtml(description)
tagsFromPost = _getHashtagsFromPost(postJsonObject) tagsFromPost = _getHashtagsFromPost(postJsonObject)
_addNewswireDictEntry(baseDir, domain, _addNewswireDictEntry(baseDir, domain,
newswire, published, newswire, published,