Save newswire favicons

main
Bob Mottram 2021-12-16 20:57:30 +00:00
parent ea422f0ead
commit 64c41279b4
4 changed files with 112 additions and 28 deletions

View File

@ -1016,7 +1016,7 @@ if args.domain:
if args.rss:
session = createSession(None)
testRSS = getRSS(baseDir, domain, session, args.rss,
False, False, 1000, 1000, 1000, 1000)
False, False, 1000, 1000, 1000, 1000, debug)
pprint(testRSS)
sys.exit()

View File

@ -801,7 +801,8 @@ def runNewswireDaemon(baseDir: str, httpd,
httpd.maxFeedItemSizeKb,
httpd.maxNewswirePosts,
httpd.maxCategoriesFeedItemSizeKb,
httpd.systemLanguage)
httpd.systemLanguage,
httpd.debug)
if not httpd.newswire:
print('Newswire feeds not updated')

View File

@ -34,6 +34,7 @@ from utils import localActorUrl
from blocking import isBlockedDomain
from blocking import isBlockedHashtag
from filters import isFiltered
from session import getImageBinaryFromUrl
def _removeCDATA(text: str) -> str:
@ -126,6 +127,46 @@ def limitWordLengths(text: str, maxWordLength: int) -> str:
return result
def _getNewswireFaviconUrl(url: str) -> str:
"""Returns a favicon url from the given article link
"""
if '://' not in url:
return '/newswire_favicon.ico'
if url.startswith('http://'):
if not (url.endswith('.onion') or url.endswith('.i2p')):
return '/newswire_favicon.ico'
domain = url.split('://')[1]
if '/' not in domain:
return url + '/favicon.ico'
else:
domain = domain.split('/')[0]
return url.split('://')[0] + '://' + domain + '/favicon.ico'
def _downloadNewswireFeedFavicon(session, baseDir: str,
link: str, debug: bool) -> bool:
"""Downloads the favicon for the given feed link
"""
url = _getNewswireFaviconUrl(link)
if '://' not in link:
return False
timeoutSec = 10
imageData = getImageBinaryFromUrl(session, url, timeoutSec, debug)
if not imageData:
return False
if not os.path.isdir(baseDir + '/favicons'):
os.mkdir(baseDir + '/favicons')
linkFilename = url.replace('/', '#')
imageFilename = baseDir + '/favicons/' + linkFilename
try:
with open(imageFilename, 'wb+') as fp:
fp.write(imageData)
except OSError:
print('EX: failed writing favicon ' + url)
return False
return True
def _addNewswireDictEntry(baseDir: str, domain: str,
newswire: {}, dateStr: str,
title: str, link: str,
@ -133,7 +174,7 @@ def _addNewswireDictEntry(baseDir: str, domain: str,
description: str, moderated: bool,
mirrored: bool,
tags: [],
maxTags: int) -> None:
maxTags: int, session, debug: bool) -> None:
"""Update the newswire dictionary
"""
# remove any markup
@ -166,6 +207,8 @@ def _addNewswireDictEntry(baseDir: str, domain: str,
if isBlockedHashtag(baseDir, tag):
return
_downloadNewswireFeedFavicon(session, baseDir, link, debug)
newswire[dateStr] = [
title,
link,
@ -309,7 +352,8 @@ def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}:
maxCategoriesFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an xml RSS 2.0 string to a dictionary
"""
if '<item>' not in xmlStr:
@ -378,7 +422,7 @@ def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
title, link,
votesStatus, postFilename,
description, moderated,
mirrored, [], 32)
mirrored, [], 32, session, debug)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
@ -392,7 +436,8 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}:
maxCategoriesFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an xml RSS 1.0 string to a dictionary
https://validator.w3.org/feed/docs/rss1.html
"""
@ -465,7 +510,7 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
title, link,
votesStatus, postFilename,
description, moderated,
mirrored, [], 32)
mirrored, [], 32, session, debug)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
@ -478,7 +523,8 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
maxFeedItemSizeKb: int) -> {}:
maxFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an atom feed string to a dictionary
"""
if '<entry>' not in xmlStr:
@ -540,7 +586,7 @@ def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
title, link,
votesStatus, postFilename,
description, moderated,
mirrored, [], 32)
mirrored, [], 32, session, debug)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
@ -553,7 +599,8 @@ def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
maxFeedItemSizeKb: int) -> {}:
maxFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts a json feed string to a dictionary
See https://jsonfeed.org/version/1.1
"""
@ -651,7 +698,7 @@ def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str,
title, link,
votesStatus, postFilename,
description, moderated,
mirrored, [], 32)
mirrored, [], 32, session, debug)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
@ -664,7 +711,8 @@ def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str,
def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
maxFeedItemSizeKb: int) -> {}:
maxFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an atom-style YouTube feed string to a dictionary
"""
if '<entry>' not in xmlStr:
@ -723,7 +771,7 @@ def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
title, link,
votesStatus, postFilename,
description, moderated, mirrored,
[], 32)
[], 32, session, debug)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
@ -736,32 +784,38 @@ def _xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}:
maxCategoriesFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an xml string to a dictionary
"""
if '<yt:videoId>' in xmlStr and '<yt:channelId>' in xmlStr:
print('YouTube feed: reading')
return _atomFeedYTToDict(baseDir, domain,
xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb)
maxPostsPerSource, maxFeedItemSizeKb,
session, debug)
elif 'rss version="2.0"' in xmlStr:
return _xml2StrToDict(baseDir, domain,
xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb)
maxCategoriesFeedItemSizeKb,
session, debug)
elif '<?xml version="1.0"' in xmlStr:
return _xml1StrToDict(baseDir, domain,
xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb)
maxCategoriesFeedItemSizeKb,
session, debug)
elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
return _atomFeedToDict(baseDir, domain,
xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb)
maxPostsPerSource, maxFeedItemSizeKb,
session, debug)
elif 'https://jsonfeed.org/version/1' in xmlStr:
return _jsonFeedV1ToDict(baseDir, domain,
xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb)
maxPostsPerSource, maxFeedItemSizeKb,
session, debug)
return {}
@ -781,7 +835,7 @@ def getRSS(baseDir: str, domain: str, session, url: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int, maxFeedSizeKb: int,
maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}:
maxCategoriesFeedItemSizeKb: int, debug: bool) -> {}:
"""Returns an RSS url as a dict
"""
if not isinstance(url, str):
@ -812,7 +866,8 @@ def getRSS(baseDir: str, domain: str, session, url: str,
moderated, mirrored,
maxPostsPerSource,
maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb)
maxCategoriesFeedItemSizeKb,
session, debug)
else:
print('WARN: feed is too large, ' +
'or contains invalid characters: ' + url)
@ -923,7 +978,8 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
newswire: {},
maxBlogsPerAccount: int,
indexFilename: str,
maxTags: int, systemLanguage: str) -> None:
maxTags: int, systemLanguage: str,
session, debug: bool) -> None:
"""Adds blogs for the given account to the newswire
"""
if not os.path.isfile(indexFilename):
@ -987,7 +1043,7 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
votes, fullPostFilename,
description, moderated, False,
tagsFromPost,
maxTags)
maxTags, session, debug)
ctr += 1
if ctr >= maxBlogsPerAccount:
@ -996,7 +1052,8 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
maxBlogsPerAccount: int,
maxTags: int, systemLanguage: str) -> None:
maxTags: int, systemLanguage: str,
session, debug: bool) -> None:
"""Adds blogs from each user account into the newswire
"""
moderationDict = {}
@ -1025,7 +1082,8 @@ def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
_addAccountBlogsToNewswire(baseDir, nickname, domain,
newswire, maxBlogsPerAccount,
blogsIndex, maxTags,
systemLanguage)
systemLanguage, session,
debug)
break
# sort the moderation dict into chronological order, latest first
@ -1050,7 +1108,7 @@ def getDictFromNewswire(session, baseDir: str, domain: str,
maxTags: int, maxFeedItemSizeKb: int,
maxNewswirePosts: int,
maxCategoriesFeedItemSizeKb: int,
systemLanguage: str) -> {}:
systemLanguage: str, debug: bool) -> {}:
"""Gets rss feeds as a dictionary from newswire file
"""
subscriptionsFilename = baseDir + '/accounts/newswire.txt'
@ -1091,14 +1149,15 @@ def getDictFromNewswire(session, baseDir: str, domain: str,
moderated, mirrored,
maxPostsPerSource, maxFeedSizeKb,
maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb)
maxCategoriesFeedItemSizeKb, debug)
if itemsList:
for dateStr, item in itemsList.items():
result[dateStr] = item
# add blogs from each user account
_addBlogsToNewswire(baseDir, domain, result,
maxPostsPerSource, maxTags, systemLanguage)
maxPostsPerSource, maxTags, systemLanguage,
session, debug)
# sort into chronological order, latest first
sortedResult = OrderedDict(sorted(result.items(), reverse=True))

View File

@ -452,3 +452,27 @@ def downloadImage(session, baseDir: str, url: str,
print('EX: Failed to download image: ' +
str(url) + ' ' + str(e))
return False
def getImageBinaryFromUrl(session, url: str, timeoutSec: int, debug: bool):
"""http GET for an image
"""
try:
result = session.get(url, timeout=timeoutSec)
if result.status_code != 200:
print('WARN: getImageFromUrl: ' + url +
' failed with error code ' + str(result.status_code))
return result.content
except requests.exceptions.RequestException as e:
if debug:
print('ERROR: getImageFromUrl failed: ' + str(url) + ', ' +
str(e))
except ValueError as e:
if debug:
print('ERROR: getImageFromUrl failed: ' + str(url) + ', ' +
str(e))
except SocketError as e:
if e.errno == errno.ECONNRESET:
print('WARN: getImageFromUrl failed, ' +
'connection was reset ' + str(e))
return None