forked from indymedia/epicyon
Support for rss 1.0 feeds
parent
8d29ac3cbe
commit
96c139a63e
87
newswire.py
87
newswire.py
|
@ -258,7 +258,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
maxPostsPerSource: int,
|
maxPostsPerSource: int,
|
||||||
maxFeedItemSizeKb: int,
|
maxFeedItemSizeKb: int,
|
||||||
maxCategoriesFeedItemSizeKb: int) -> {}:
|
maxCategoriesFeedItemSizeKb: int) -> {}:
|
||||||
"""Converts an xml 2.0 string to a dictionary
|
"""Converts an xml RSS 2.0 string to a dictionary
|
||||||
"""
|
"""
|
||||||
if '<item>' not in xmlStr:
|
if '<item>' not in xmlStr:
|
||||||
return {}
|
return {}
|
||||||
|
@ -332,6 +332,86 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
                  moderated: bool, mirrored: bool,
                  maxPostsPerSource: int,
                  maxFeedItemSizeKb: int,
                  maxCategoriesFeedItemSizeKb: int) -> {}:
    """Converts an xml RSS 1.0 string to a dictionary
    https://validator.w3.org/feed/docs/rss1.html

    Each <item> element which has a <title>, a <link> containing '://'
    and a <dc:date> accepted by parseFeedDate is passed to
    addNewswireDictEntry, which receives the result dictionary.

    baseDir and domain are forwarded to the helper functions.
    xmlStr is the raw feed text.
    moderated and mirrored are forwarded to addNewswireDictEntry.
    maxPostsPerSource is the number of accepted items after which
    parsing stops.
    maxFeedItemSizeKb is the largest item (in units of 1024 characters
    of feed text) which will be considered; bigger items are skipped
    with a warning.
    maxCategoriesFeedItemSizeKb is forwarded to
    xml2StrToHashtagCategories when the feed is a hashtag categories
    feed.

    Returns the result dictionary. An empty dictionary is returned if
    the text contains no '<item', or if it is a hashtag categories feed
    (which is handed to xml2StrToHashtagCategories instead).
    """
    if '<item' not in xmlStr:
        return {}
    result = {}

    # is this an rss feed containing hashtag categories?
    if '<title>#categories</title>' in xmlStr:
        xml2StrToHashtagCategories(baseDir, xmlStr,
                                   maxCategoriesFeedItemSizeKb)
        return {}

    rssItems = xmlStr.split('<item')
    postCtr = 0
    maxBytes = maxFeedItemSizeKb * 1024
    # The first chunk is whatever precedes the first '<item' (the xml
    # header and <channel> metadata). It is never an item, but it can
    # contain its own <title>, <link> and <dc:date>, so it is skipped
    # to avoid adding the channel itself as a post
    for rssItem in rssItems[1:]:
        if not rssItem:
            continue
        # Splitting on '<item' also matches the RSS 1.0 <items> table of
        # contents element. For a real <item> the tag name can only be
        # followed by '>' or by whitespace before its attributes
        if rssItem[0] != '>' and not rssItem[0].isspace():
            continue
        if len(rssItem) > maxBytes:
            print('WARN: rss feed item is too big')
            continue
        # title, link and date are all mandatory
        if '<title>' not in rssItem:
            continue
        if '</title>' not in rssItem:
            continue
        if '<link>' not in rssItem:
            continue
        if '</link>' not in rssItem:
            continue
        if '<dc:date>' not in rssItem:
            continue
        if '</dc:date>' not in rssItem:
            continue
        title = rssItem.split('<title>')[1]
        title = removeCDATA(title.split('</title>')[0])
        # description is optional, with <media:description> as fallback
        description = ''
        if '<description>' in rssItem and '</description>' in rssItem:
            description = rssItem.split('<description>')[1]
            description = removeCDATA(description.split('</description>')[0])
        else:
            if '<media:description>' in rssItem and \
               '</media:description>' in rssItem:
                description = rssItem.split('<media:description>')[1]
                description = description.split('</media:description>')[0]
                description = removeCDATA(description)
        link = rssItem.split('<link>')[1]
        link = link.split('</link>')[0]
        if '://' not in link:
            continue
        # the domain is the part of the link between '://' and the
        # next '/', and items from blocked domains are dropped
        itemDomain = link.split('://')[1]
        if '/' in itemDomain:
            itemDomain = itemDomain.split('/')[0]
        if isBlockedDomain(baseDir, itemDomain):
            continue
        pubDate = rssItem.split('<dc:date>')[1]
        pubDate = pubDate.split('</dc:date>')[0]

        # items whose date could not be parsed are ignored
        pubDateStr = parseFeedDate(pubDate)
        if pubDateStr:
            postFilename = ''
            votesStatus = []
            addNewswireDictEntry(baseDir, domain,
                                 result, pubDateStr,
                                 title, link,
                                 votesStatus, postFilename,
                                 description, moderated, mirrored)
            postCtr += 1
            if postCtr >= maxPostsPerSource:
                break
    if postCtr > 0:
        print('Added ' + str(postCtr) + ' rss 1.0 feed items to newswire')
    return result
|
||||||
|
|
||||||
|
|
||||||
def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
|
def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
moderated: bool, mirrored: bool,
|
moderated: bool, mirrored: bool,
|
||||||
maxPostsPerSource: int,
|
maxPostsPerSource: int,
|
||||||
|
@ -489,6 +569,11 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
xmlStr, moderated, mirrored,
|
xmlStr, moderated, mirrored,
|
||||||
maxPostsPerSource, maxFeedItemSizeKb,
|
maxPostsPerSource, maxFeedItemSizeKb,
|
||||||
maxCategoriesFeedItemSizeKb)
|
maxCategoriesFeedItemSizeKb)
|
||||||
|
elif '<rdf:RDF>' in xmlStr:
|
||||||
|
return xml1StrToDict(baseDir, domain,
|
||||||
|
xmlStr, moderated, mirrored,
|
||||||
|
maxPostsPerSource, maxFeedItemSizeKb,
|
||||||
|
maxCategoriesFeedItemSizeKb)
|
||||||
elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
|
elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
|
||||||
return atomFeedToDict(baseDir, domain,
|
return atomFeedToDict(baseDir, domain,
|
||||||
xmlStr, moderated, mirrored,
|
xmlStr, moderated, mirrored,
|
||||||
|
|
Loading…
Reference in New Issue