Save newswire favicons

main
Bob Mottram 2021-12-16 20:57:30 +00:00
parent ea422f0ead
commit 64c41279b4
4 changed files with 112 additions and 28 deletions

View File

@ -1016,7 +1016,7 @@ if args.domain:
if args.rss: if args.rss:
session = createSession(None) session = createSession(None)
testRSS = getRSS(baseDir, domain, session, args.rss, testRSS = getRSS(baseDir, domain, session, args.rss,
False, False, 1000, 1000, 1000, 1000) False, False, 1000, 1000, 1000, 1000, debug)
pprint(testRSS) pprint(testRSS)
sys.exit() sys.exit()

View File

@ -801,7 +801,8 @@ def runNewswireDaemon(baseDir: str, httpd,
httpd.maxFeedItemSizeKb, httpd.maxFeedItemSizeKb,
httpd.maxNewswirePosts, httpd.maxNewswirePosts,
httpd.maxCategoriesFeedItemSizeKb, httpd.maxCategoriesFeedItemSizeKb,
httpd.systemLanguage) httpd.systemLanguage,
httpd.debug)
if not httpd.newswire: if not httpd.newswire:
print('Newswire feeds not updated') print('Newswire feeds not updated')

View File

@ -34,6 +34,7 @@ from utils import localActorUrl
from blocking import isBlockedDomain from blocking import isBlockedDomain
from blocking import isBlockedHashtag from blocking import isBlockedHashtag
from filters import isFiltered from filters import isFiltered
from session import getImageBinaryFromUrl
def _removeCDATA(text: str) -> str: def _removeCDATA(text: str) -> str:
@ -126,6 +127,46 @@ def limitWordLengths(text: str, maxWordLength: int) -> str:
return result return result
def _getNewswireFaviconUrl(url: str) -> str:
"""Returns a favicon url from the given article link
"""
if '://' not in url:
return '/newswire_favicon.ico'
if url.startswith('http://'):
if not (url.endswith('.onion') or url.endswith('.i2p')):
return '/newswire_favicon.ico'
domain = url.split('://')[1]
if '/' not in domain:
return url + '/favicon.ico'
else:
domain = domain.split('/')[0]
return url.split('://')[0] + '://' + domain + '/favicon.ico'
def _downloadNewswireFeedFavicon(session, baseDir: str,
link: str, debug: bool) -> bool:
"""Downloads the favicon for the given feed link
"""
url = _getNewswireFaviconUrl(link)
if '://' not in link:
return False
timeoutSec = 10
imageData = getImageBinaryFromUrl(session, url, timeoutSec, debug)
if not imageData:
return False
if not os.path.isdir(baseDir + '/favicons'):
os.mkdir(baseDir + '/favicons')
linkFilename = url.replace('/', '#')
imageFilename = baseDir + '/favicons/' + linkFilename
try:
with open(imageFilename, 'wb+') as fp:
fp.write(imageData)
except OSError:
print('EX: failed writing favicon ' + url)
return False
return True
def _addNewswireDictEntry(baseDir: str, domain: str, def _addNewswireDictEntry(baseDir: str, domain: str,
newswire: {}, dateStr: str, newswire: {}, dateStr: str,
title: str, link: str, title: str, link: str,
@ -133,7 +174,7 @@ def _addNewswireDictEntry(baseDir: str, domain: str,
description: str, moderated: bool, description: str, moderated: bool,
mirrored: bool, mirrored: bool,
tags: [], tags: [],
maxTags: int) -> None: maxTags: int, session, debug: bool) -> None:
"""Update the newswire dictionary """Update the newswire dictionary
""" """
# remove any markup # remove any markup
@ -166,6 +207,8 @@ def _addNewswireDictEntry(baseDir: str, domain: str,
if isBlockedHashtag(baseDir, tag): if isBlockedHashtag(baseDir, tag):
return return
_downloadNewswireFeedFavicon(session, baseDir, link, debug)
newswire[dateStr] = [ newswire[dateStr] = [
title, title,
link, link,
@ -309,7 +352,8 @@ def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool, moderated: bool, mirrored: bool,
maxPostsPerSource: int, maxPostsPerSource: int,
maxFeedItemSizeKb: int, maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}: maxCategoriesFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an xml RSS 2.0 string to a dictionary """Converts an xml RSS 2.0 string to a dictionary
""" """
if '<item>' not in xmlStr: if '<item>' not in xmlStr:
@ -378,7 +422,7 @@ def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated, description, moderated,
mirrored, [], 32) mirrored, [], 32, session, debug)
postCtr += 1 postCtr += 1
if postCtr >= maxPostsPerSource: if postCtr >= maxPostsPerSource:
break break
@ -392,7 +436,8 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool, moderated: bool, mirrored: bool,
maxPostsPerSource: int, maxPostsPerSource: int,
maxFeedItemSizeKb: int, maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}: maxCategoriesFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an xml RSS 1.0 string to a dictionary """Converts an xml RSS 1.0 string to a dictionary
https://validator.w3.org/feed/docs/rss1.html https://validator.w3.org/feed/docs/rss1.html
""" """
@ -465,7 +510,7 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated, description, moderated,
mirrored, [], 32) mirrored, [], 32, session, debug)
postCtr += 1 postCtr += 1
if postCtr >= maxPostsPerSource: if postCtr >= maxPostsPerSource:
break break
@ -478,7 +523,8 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str, def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool, moderated: bool, mirrored: bool,
maxPostsPerSource: int, maxPostsPerSource: int,
maxFeedItemSizeKb: int) -> {}: maxFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an atom feed string to a dictionary """Converts an atom feed string to a dictionary
""" """
if '<entry>' not in xmlStr: if '<entry>' not in xmlStr:
@ -540,7 +586,7 @@ def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated, description, moderated,
mirrored, [], 32) mirrored, [], 32, session, debug)
postCtr += 1 postCtr += 1
if postCtr >= maxPostsPerSource: if postCtr >= maxPostsPerSource:
break break
@ -553,7 +599,8 @@ def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str, def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool, moderated: bool, mirrored: bool,
maxPostsPerSource: int, maxPostsPerSource: int,
maxFeedItemSizeKb: int) -> {}: maxFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts a json feed string to a dictionary """Converts a json feed string to a dictionary
See https://jsonfeed.org/version/1.1 See https://jsonfeed.org/version/1.1
""" """
@ -651,7 +698,7 @@ def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str,
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated, description, moderated,
mirrored, [], 32) mirrored, [], 32, session, debug)
postCtr += 1 postCtr += 1
if postCtr >= maxPostsPerSource: if postCtr >= maxPostsPerSource:
break break
@ -664,7 +711,8 @@ def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str,
def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool, moderated: bool, mirrored: bool,
maxPostsPerSource: int, maxPostsPerSource: int,
maxFeedItemSizeKb: int) -> {}: maxFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an atom-style YouTube feed string to a dictionary """Converts an atom-style YouTube feed string to a dictionary
""" """
if '<entry>' not in xmlStr: if '<entry>' not in xmlStr:
@ -723,7 +771,7 @@ def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated, mirrored, description, moderated, mirrored,
[], 32) [], 32, session, debug)
postCtr += 1 postCtr += 1
if postCtr >= maxPostsPerSource: if postCtr >= maxPostsPerSource:
break break
@ -736,32 +784,38 @@ def _xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool, moderated: bool, mirrored: bool,
maxPostsPerSource: int, maxPostsPerSource: int,
maxFeedItemSizeKb: int, maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}: maxCategoriesFeedItemSizeKb: int,
session, debug: bool) -> {}:
"""Converts an xml string to a dictionary """Converts an xml string to a dictionary
""" """
if '<yt:videoId>' in xmlStr and '<yt:channelId>' in xmlStr: if '<yt:videoId>' in xmlStr and '<yt:channelId>' in xmlStr:
print('YouTube feed: reading') print('YouTube feed: reading')
return _atomFeedYTToDict(baseDir, domain, return _atomFeedYTToDict(baseDir, domain,
xmlStr, moderated, mirrored, xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb) maxPostsPerSource, maxFeedItemSizeKb,
session, debug)
elif 'rss version="2.0"' in xmlStr: elif 'rss version="2.0"' in xmlStr:
return _xml2StrToDict(baseDir, domain, return _xml2StrToDict(baseDir, domain,
xmlStr, moderated, mirrored, xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb, maxPostsPerSource, maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb) maxCategoriesFeedItemSizeKb,
session, debug)
elif '<?xml version="1.0"' in xmlStr: elif '<?xml version="1.0"' in xmlStr:
return _xml1StrToDict(baseDir, domain, return _xml1StrToDict(baseDir, domain,
xmlStr, moderated, mirrored, xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb, maxPostsPerSource, maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb) maxCategoriesFeedItemSizeKb,
session, debug)
elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr: elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
return _atomFeedToDict(baseDir, domain, return _atomFeedToDict(baseDir, domain,
xmlStr, moderated, mirrored, xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb) maxPostsPerSource, maxFeedItemSizeKb,
session, debug)
elif 'https://jsonfeed.org/version/1' in xmlStr: elif 'https://jsonfeed.org/version/1' in xmlStr:
return _jsonFeedV1ToDict(baseDir, domain, return _jsonFeedV1ToDict(baseDir, domain,
xmlStr, moderated, mirrored, xmlStr, moderated, mirrored,
maxPostsPerSource, maxFeedItemSizeKb) maxPostsPerSource, maxFeedItemSizeKb,
session, debug)
return {} return {}
@ -781,7 +835,7 @@ def getRSS(baseDir: str, domain: str, session, url: str,
moderated: bool, mirrored: bool, moderated: bool, mirrored: bool,
maxPostsPerSource: int, maxFeedSizeKb: int, maxPostsPerSource: int, maxFeedSizeKb: int,
maxFeedItemSizeKb: int, maxFeedItemSizeKb: int,
maxCategoriesFeedItemSizeKb: int) -> {}: maxCategoriesFeedItemSizeKb: int, debug: bool) -> {}:
"""Returns an RSS url as a dict """Returns an RSS url as a dict
""" """
if not isinstance(url, str): if not isinstance(url, str):
@ -812,7 +866,8 @@ def getRSS(baseDir: str, domain: str, session, url: str,
moderated, mirrored, moderated, mirrored,
maxPostsPerSource, maxPostsPerSource,
maxFeedItemSizeKb, maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb) maxCategoriesFeedItemSizeKb,
session, debug)
else: else:
print('WARN: feed is too large, ' + print('WARN: feed is too large, ' +
'or contains invalid characters: ' + url) 'or contains invalid characters: ' + url)
@ -923,7 +978,8 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
newswire: {}, newswire: {},
maxBlogsPerAccount: int, maxBlogsPerAccount: int,
indexFilename: str, indexFilename: str,
maxTags: int, systemLanguage: str) -> None: maxTags: int, systemLanguage: str,
session, debug: bool) -> None:
"""Adds blogs for the given account to the newswire """Adds blogs for the given account to the newswire
""" """
if not os.path.isfile(indexFilename): if not os.path.isfile(indexFilename):
@ -987,7 +1043,7 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
votes, fullPostFilename, votes, fullPostFilename,
description, moderated, False, description, moderated, False,
tagsFromPost, tagsFromPost,
maxTags) maxTags, session, debug)
ctr += 1 ctr += 1
if ctr >= maxBlogsPerAccount: if ctr >= maxBlogsPerAccount:
@ -996,7 +1052,8 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {}, def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
maxBlogsPerAccount: int, maxBlogsPerAccount: int,
maxTags: int, systemLanguage: str) -> None: maxTags: int, systemLanguage: str,
session, debug: bool) -> None:
"""Adds blogs from each user account into the newswire """Adds blogs from each user account into the newswire
""" """
moderationDict = {} moderationDict = {}
@ -1025,7 +1082,8 @@ def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
_addAccountBlogsToNewswire(baseDir, nickname, domain, _addAccountBlogsToNewswire(baseDir, nickname, domain,
newswire, maxBlogsPerAccount, newswire, maxBlogsPerAccount,
blogsIndex, maxTags, blogsIndex, maxTags,
systemLanguage) systemLanguage, session,
debug)
break break
# sort the moderation dict into chronological order, latest first # sort the moderation dict into chronological order, latest first
@ -1050,7 +1108,7 @@ def getDictFromNewswire(session, baseDir: str, domain: str,
maxTags: int, maxFeedItemSizeKb: int, maxTags: int, maxFeedItemSizeKb: int,
maxNewswirePosts: int, maxNewswirePosts: int,
maxCategoriesFeedItemSizeKb: int, maxCategoriesFeedItemSizeKb: int,
systemLanguage: str) -> {}: systemLanguage: str, debug: bool) -> {}:
"""Gets rss feeds as a dictionary from newswire file """Gets rss feeds as a dictionary from newswire file
""" """
subscriptionsFilename = baseDir + '/accounts/newswire.txt' subscriptionsFilename = baseDir + '/accounts/newswire.txt'
@ -1091,14 +1149,15 @@ def getDictFromNewswire(session, baseDir: str, domain: str,
moderated, mirrored, moderated, mirrored,
maxPostsPerSource, maxFeedSizeKb, maxPostsPerSource, maxFeedSizeKb,
maxFeedItemSizeKb, maxFeedItemSizeKb,
maxCategoriesFeedItemSizeKb) maxCategoriesFeedItemSizeKb, debug)
if itemsList: if itemsList:
for dateStr, item in itemsList.items(): for dateStr, item in itemsList.items():
result[dateStr] = item result[dateStr] = item
# add blogs from each user account # add blogs from each user account
_addBlogsToNewswire(baseDir, domain, result, _addBlogsToNewswire(baseDir, domain, result,
maxPostsPerSource, maxTags, systemLanguage) maxPostsPerSource, maxTags, systemLanguage,
session, debug)
# sort into chronological order, latest first # sort into chronological order, latest first
sortedResult = OrderedDict(sorted(result.items(), reverse=True)) sortedResult = OrderedDict(sorted(result.items(), reverse=True))

View File

@ -452,3 +452,27 @@ def downloadImage(session, baseDir: str, url: str,
print('EX: Failed to download image: ' + print('EX: Failed to download image: ' +
str(url) + ' ' + str(e)) str(url) + ' ' + str(e))
return False return False
def getImageBinaryFromUrl(session, url: str, timeoutSec: int, debug: bool):
    """http GET for an image

    Returns the body of the response as bytes, or None if the request
    failed or the server did not reply with a 200 status code.
    """
    try:
        result = session.get(url, timeout=timeoutSec)
        if result.status_code != 200:
            print('WARN: getImageFromUrl: ' + url +
                  ' failed with error code ' + str(result.status_code))
            # the body of an error response is an error page rather
            # than the requested image, so don't hand it to the caller
            return None
        return result.content
    except requests.exceptions.RequestException as e:
        if debug:
            print('ERROR: getImageFromUrl failed: ' + str(url) + ', ' +
                  str(e))
    except ValueError as e:
        if debug:
            print('ERROR: getImageFromUrl failed: ' + str(url) + ', ' +
                  str(e))
    except SocketError as e:
        if e.errno == errno.ECONNRESET:
            print('WARN: getImageFromUrl failed, ' +
                  'connection was reset ' + str(e))
    return None