mirror of https://gitlab.com/bashrc2/epicyon
Support for json feeds on newswire
parent
e104b9c5b0
commit
4308317b3d
121
newswire.py
121
newswire.py
|
@ -7,6 +7,7 @@ __email__ = "bob@freedombone.net"
|
||||||
__status__ = "Production"
|
__status__ = "Production"
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import json
|
||||||
import requests
|
import requests
|
||||||
from socket import error as SocketError
|
from socket import error as SocketError
|
||||||
import errno
|
import errno
|
||||||
|
@ -332,12 +333,14 @@ def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
result, pubDateStr,
|
result, pubDateStr,
|
||||||
title, link,
|
title, link,
|
||||||
votesStatus, postFilename,
|
votesStatus, postFilename,
|
||||||
description, moderated, mirrored)
|
description, moderated,
|
||||||
|
mirrored)
|
||||||
postCtr += 1
|
postCtr += 1
|
||||||
if postCtr >= maxPostsPerSource:
|
if postCtr >= maxPostsPerSource:
|
||||||
break
|
break
|
||||||
if postCtr > 0:
|
if postCtr > 0:
|
||||||
print('Added ' + str(postCtr) + ' rss 2.0 feed items to newswire')
|
print('Added ' + str(postCtr) +
|
||||||
|
' rss 2.0 feed items to newswire')
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@ -416,12 +419,14 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
result, pubDateStr,
|
result, pubDateStr,
|
||||||
title, link,
|
title, link,
|
||||||
votesStatus, postFilename,
|
votesStatus, postFilename,
|
||||||
description, moderated, mirrored)
|
description, moderated,
|
||||||
|
mirrored)
|
||||||
postCtr += 1
|
postCtr += 1
|
||||||
if postCtr >= maxPostsPerSource:
|
if postCtr >= maxPostsPerSource:
|
||||||
break
|
break
|
||||||
if postCtr > 0:
|
if postCtr > 0:
|
||||||
print('Added ' + str(postCtr) + ' rss 1.0 feed items to newswire')
|
print('Added ' + str(postCtr) +
|
||||||
|
' rss 1.0 feed items to newswire')
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@ -488,12 +493,112 @@ def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
result, pubDateStr,
|
result, pubDateStr,
|
||||||
title, link,
|
title, link,
|
||||||
votesStatus, postFilename,
|
votesStatus, postFilename,
|
||||||
description, moderated, mirrored)
|
description, moderated,
|
||||||
|
mirrored)
|
||||||
postCtr += 1
|
postCtr += 1
|
||||||
if postCtr >= maxPostsPerSource:
|
if postCtr >= maxPostsPerSource:
|
||||||
break
|
break
|
||||||
if postCtr > 0:
|
if postCtr > 0:
|
||||||
print('Added ' + str(postCtr) + ' atom feed items to newswire')
|
print('Added ' + str(postCtr) +
|
||||||
|
' atom feed items to newswire')
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _jsonFeedToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
|
moderated: bool, mirrored: bool,
|
||||||
|
maxPostsPerSource: int,
|
||||||
|
maxFeedItemSizeKb: int) -> {}:
|
||||||
|
"""Converts a json feed string to a dictionary
|
||||||
|
See https://jsonfeed.org/version/1
|
||||||
|
"""
|
||||||
|
if '<entry>' not in xmlStr:
|
||||||
|
return {}
|
||||||
|
result = {}
|
||||||
|
try:
|
||||||
|
feedJson = json.loads(xmlStr)
|
||||||
|
except BaseException:
|
||||||
|
return {}
|
||||||
|
postCtr = 0
|
||||||
|
maxBytes = maxFeedItemSizeKb * 1024
|
||||||
|
if not feedJson.get('version'):
|
||||||
|
return {}
|
||||||
|
if feedJson['version'] != 'https://jsonfeed.org/version/1':
|
||||||
|
return {}
|
||||||
|
if not feedJson.get('items'):
|
||||||
|
return {}
|
||||||
|
if not isinstance(feedJson['items'], list):
|
||||||
|
return {}
|
||||||
|
for jsonFeedItem in feedJson['items']:
|
||||||
|
if not jsonFeedItem:
|
||||||
|
continue
|
||||||
|
if not isinstance(jsonFeedItem, dict):
|
||||||
|
continue
|
||||||
|
if not jsonFeedItem.get('url'):
|
||||||
|
continue
|
||||||
|
if not isinstance(jsonFeedItem['url'], str):
|
||||||
|
continue
|
||||||
|
if not jsonFeedItem.get('date_published'):
|
||||||
|
if not jsonFeedItem.get('date_modified'):
|
||||||
|
continue
|
||||||
|
if not jsonFeedItem.get('content_text'):
|
||||||
|
if not jsonFeedItem.get('content_html'):
|
||||||
|
continue
|
||||||
|
if jsonFeedItem.get('content_html'):
|
||||||
|
if not isinstance(jsonFeedItem['content_html'], str):
|
||||||
|
continue
|
||||||
|
title = removeHtml(jsonFeedItem['content_html'])
|
||||||
|
else:
|
||||||
|
if not isinstance(jsonFeedItem['content_text'], str):
|
||||||
|
continue
|
||||||
|
title = jsonFeedItem['content_text']
|
||||||
|
if len(title) > maxBytes:
|
||||||
|
print('WARN: json feed title is too long')
|
||||||
|
continue
|
||||||
|
description = ''
|
||||||
|
if jsonFeedItem.get('description'):
|
||||||
|
if not isinstance(jsonFeedItem['description'], str):
|
||||||
|
continue
|
||||||
|
description = jsonFeedItem['description']
|
||||||
|
if len(description) > maxBytes:
|
||||||
|
print('WARN: json feed description is too long')
|
||||||
|
continue
|
||||||
|
link = jsonFeedItem['url']
|
||||||
|
if '://' not in link:
|
||||||
|
continue
|
||||||
|
if len(link) > maxBytes:
|
||||||
|
print('WARN: json feed link is too long')
|
||||||
|
continue
|
||||||
|
itemDomain = link.split('://')[1]
|
||||||
|
if '/' in itemDomain:
|
||||||
|
itemDomain = itemDomain.split('/')[0]
|
||||||
|
if isBlockedDomain(baseDir, itemDomain):
|
||||||
|
continue
|
||||||
|
if jsonFeedItem.get('date_published'):
|
||||||
|
if not isinstance(jsonFeedItem['date_published'], str):
|
||||||
|
continue
|
||||||
|
pubDate = jsonFeedItem['date_published']
|
||||||
|
else:
|
||||||
|
if not isinstance(jsonFeedItem['date_modified'], str):
|
||||||
|
continue
|
||||||
|
pubDate = jsonFeedItem['date_modified']
|
||||||
|
|
||||||
|
pubDateStr = parseFeedDate(pubDate)
|
||||||
|
if pubDateStr:
|
||||||
|
if _validFeedDate(pubDateStr):
|
||||||
|
postFilename = ''
|
||||||
|
votesStatus = []
|
||||||
|
_addNewswireDictEntry(baseDir, domain,
|
||||||
|
result, pubDateStr,
|
||||||
|
title, link,
|
||||||
|
votesStatus, postFilename,
|
||||||
|
description, moderated,
|
||||||
|
mirrored)
|
||||||
|
postCtr += 1
|
||||||
|
if postCtr >= maxPostsPerSource:
|
||||||
|
break
|
||||||
|
if postCtr > 0:
|
||||||
|
print('Added ' + str(postCtr) +
|
||||||
|
' json feed items to newswire')
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@ -593,6 +698,10 @@ def _xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
return _atomFeedToDict(baseDir, domain,
|
return _atomFeedToDict(baseDir, domain,
|
||||||
xmlStr, moderated, mirrored,
|
xmlStr, moderated, mirrored,
|
||||||
maxPostsPerSource, maxFeedItemSizeKb)
|
maxPostsPerSource, maxFeedItemSizeKb)
|
||||||
|
elif 'https://jsonfeed.org/version/1' in xmlStr:
|
||||||
|
return _jsonFeedToDict(baseDir, domain,
|
||||||
|
xmlStr, moderated, mirrored,
|
||||||
|
maxPostsPerSource, maxFeedItemSizeKb)
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue