Support for rss 1.0 feeds

alt-html-css
Bob Mottram 2020-12-14 14:22:44 +00:00
parent 8d29ac3cbe
commit 96c139a63e
1 changed files with 86 additions and 1 deletions

View File

@ -258,7 +258,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
maxPostsPerSource: int,
maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}:
"""Converts an xml 2.0 string to a dictionary
"""Converts an xml RSS 2.0 string to a dictionary
"""
if '<item>' not in xmlStr:
return {}
@ -332,6 +332,86 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
return result
def xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}:
"""Converts an xml RSS 1.0 string to a dictionary
https://validator.w3.org/feed/docs/rss1.html
"""
if '<item' not in xmlStr:
return {}
result = {}
# is this an rss feed containing hashtag categories?
if '<title>#categories</title>' in xmlStr:
xml2StrToHashtagCategories(baseDir, xmlStr,
maxCategoriesFeedItemSizeKb)
return {}
rssItems = xmlStr.split('<item')
postCtr = 0
maxBytes = maxFeedItemSizeKb * 1024
for rssItem in rssItems:
if not rssItem:
continue
if len(rssItem) > maxBytes:
print('WARN: rss feed item is too big')
continue
if '<title>' not in rssItem:
continue
if '</title>' not in rssItem:
continue
if '<link>' not in rssItem:
continue
if '</link>' not in rssItem:
continue
if '<dc:date>' not in rssItem:
continue
if '</dc:date>' not in rssItem:
continue
title = rssItem.split('<title>')[1]
title = removeCDATA(title.split('</title>')[0])
description = ''
if '<description>' in rssItem and '</description>' in rssItem:
description = rssItem.split('<description>')[1]
description = removeCDATA(description.split('</description>')[0])
else:
if '<media:description>' in rssItem and \
'</media:description>' in rssItem:
description = rssItem.split('<media:description>')[1]
description = description.split('</media:description>')[0]
description = removeCDATA(description)
link = rssItem.split('<link>')[1]
link = link.split('</link>')[0]
if '://' not in link:
continue
itemDomain = link.split('://')[1]
if '/' in itemDomain:
itemDomain = itemDomain.split('/')[0]
if isBlockedDomain(baseDir, itemDomain):
continue
pubDate = rssItem.split('<dc:date>')[1]
pubDate = pubDate.split('</dc:date>')[0]
pubDateStr = parseFeedDate(pubDate)
if pubDateStr:
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if postCtr > 0:
print('Added ' + str(postCtr) + ' rss 1.0 feed items to newswire')
return result
def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
@ -489,6 +569,11 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb)
elif '<rdf:RDF>' in xmlStr:
return xml1StrToDict(baseDir, domain,
xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb)
elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
return atomFeedToDict(baseDir, domain,
xmlStr, moderated, mirrored,