Set an upper limit on the number of newswire posts per RSS feed

Helps to avoid having a giant list of items
main
Bob Mottram 2020-10-16 11:13:14 +01:00
parent e715072886
commit 5c5a25e534
1 changed file with 41 additions and 11 deletions

View File

@ -50,13 +50,15 @@ def rss2Footer() -> str:
return rssStr return rssStr
def xml2StrToDict(xmlStr: str, moderated: bool) -> {}: def xml2StrToDict(xmlStr: str, moderated: bool,
maxPostsPerSource: int) -> {}:
"""Converts an xml 2.0 string to a dictionary """Converts an xml 2.0 string to a dictionary
""" """
if '<item>' not in xmlStr: if '<item>' not in xmlStr:
return {} return {}
result = {} result = {}
rssItems = xmlStr.split('<item>') rssItems = xmlStr.split('<item>')
postCtr = 0
for rssItem in rssItems: for rssItem in rssItems:
if '<title>' not in rssItem: if '<title>' not in rssItem:
continue continue
@ -89,6 +91,9 @@ def xml2StrToDict(xmlStr: str, moderated: bool) -> {}:
result[str(publishedDate)] = [title, link, result[str(publishedDate)] = [title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated] description, moderated]
postCtr += 1
if postCtr >= maxPostsPerSource:
break
parsed = True parsed = True
except BaseException: except BaseException:
pass pass
@ -96,7 +101,15 @@ def xml2StrToDict(xmlStr: str, moderated: bool) -> {}:
try: try:
publishedDate = \ publishedDate = \
datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT") datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT")
result[str(publishedDate) + '+00:00'] = [title, link] postFilename = ''
votesStatus = []
result[str(publishedDate) + '+00:00'] = \
[title, link,
votesStatus, postFilename,
description, moderated]
postCtr += 1
if postCtr >= maxPostsPerSource:
break
parsed = True parsed = True
except BaseException: except BaseException:
print('WARN: unrecognized RSS date format: ' + pubDate) print('WARN: unrecognized RSS date format: ' + pubDate)
@ -104,13 +117,15 @@ def xml2StrToDict(xmlStr: str, moderated: bool) -> {}:
return result return result
def atomFeedToDict(xmlStr: str, moderated: bool) -> {}: def atomFeedToDict(xmlStr: str, moderated: bool,
maxPostsPerSource: int) -> {}:
"""Converts an atom feed string to a dictionary """Converts an atom feed string to a dictionary
""" """
if '<entry>' not in xmlStr: if '<entry>' not in xmlStr:
return {} return {}
result = {} result = {}
rssItems = xmlStr.split('<entry>') rssItems = xmlStr.split('<entry>')
postCtr = 0
for rssItem in rssItems: for rssItem in rssItems:
if '<title>' not in rssItem: if '<title>' not in rssItem:
continue continue
@ -143,6 +158,9 @@ def atomFeedToDict(xmlStr: str, moderated: bool) -> {}:
result[str(publishedDate)] = [title, link, result[str(publishedDate)] = [title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated] description, moderated]
postCtr += 1
if postCtr >= maxPostsPerSource:
break
parsed = True parsed = True
except BaseException: except BaseException:
pass pass
@ -150,7 +168,15 @@ def atomFeedToDict(xmlStr: str, moderated: bool) -> {}:
try: try:
publishedDate = \ publishedDate = \
datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT") datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT")
result[str(publishedDate) + '+00:00'] = [title, link] postFilename = ''
votesStatus = []
result[str(publishedDate) + '+00:00'] = \
[title, link,
votesStatus, postFilename,
description, moderated]
postCtr += 1
if postCtr >= maxPostsPerSource:
break
parsed = True parsed = True
except BaseException: except BaseException:
print('WARN: unrecognized atom feed date format: ' + pubDate) print('WARN: unrecognized atom feed date format: ' + pubDate)
@ -158,17 +184,19 @@ def atomFeedToDict(xmlStr: str, moderated: bool) -> {}:
return result return result
def xmlStrToDict(xmlStr: str, moderated: bool) -> {}: def xmlStrToDict(xmlStr: str, moderated: bool,
maxPostsPerSource: int) -> {}:
"""Converts an xml string to a dictionary """Converts an xml string to a dictionary
""" """
if 'rss version="2.0"' in xmlStr: if 'rss version="2.0"' in xmlStr:
return xml2StrToDict(xmlStr, moderated) return xml2StrToDict(xmlStr, moderated, maxPostsPerSource)
elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr: elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
return atomFeedToDict(xmlStr, moderated) return atomFeedToDict(xmlStr, moderated, maxPostsPerSource)
return {} return {}
def getRSS(session, url: str, moderated: bool) -> {}: def getRSS(session, url: str, moderated: bool,
maxPostsPerSource: int) -> {}:
"""Returns an RSS url as a dict """Returns an RSS url as a dict
""" """
if not isinstance(url, str): if not isinstance(url, str):
@ -191,7 +219,7 @@ def getRSS(session, url: str, moderated: bool) -> {}:
print('WARN: no session specified for getRSS') print('WARN: no session specified for getRSS')
try: try:
result = session.get(url, headers=sessionHeaders, params=sessionParams) result = session.get(url, headers=sessionHeaders, params=sessionParams)
return xmlStrToDict(result.text, moderated) return xmlStrToDict(result.text, moderated, maxPostsPerSource)
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' + print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' +
'headers: ' + str(sessionHeaders) + '\n' + 'headers: ' + str(sessionHeaders) + '\n' +
@ -375,6 +403,8 @@ def getDictFromNewswire(session, baseDir: str) -> {}:
if not os.path.isfile(subscriptionsFilename): if not os.path.isfile(subscriptionsFilename):
return {} return {}
maxPostsPerSource = 5
# add rss feeds # add rss feeds
rssFeed = [] rssFeed = []
with open(subscriptionsFilename, 'r') as fp: with open(subscriptionsFilename, 'r') as fp:
@ -397,12 +427,12 @@ def getDictFromNewswire(session, baseDir: str) -> {}:
moderated = True moderated = True
url = url.replace('*', '').strip() url = url.replace('*', '').strip()
itemsList = getRSS(session, url, moderated) itemsList = getRSS(session, url, moderated, maxPostsPerSource)
for dateStr, item in itemsList.items(): for dateStr, item in itemsList.items():
result[dateStr] = item result[dateStr] = item
# add blogs from each user account # add blogs from each user account
addBlogsToNewswire(baseDir, result, 5) addBlogsToNewswire(baseDir, result, maxPostsPerSource)
# sort into chronological order, latest first # sort into chronological order, latest first
sortedResult = OrderedDict(sorted(result.items(), reverse=True)) sortedResult = OrderedDict(sorted(result.items(), reverse=True))