diff --git a/daemon.py b/daemon.py index 716c90b0a..33b206b01 100644 --- a/daemon.py +++ b/daemon.py @@ -1562,7 +1562,7 @@ class PubServer(BaseHTTPRequestHandler): self.authorizedNickname = None notAuthPaths = ( - '/icons/', '/avatars/', + '/icons/', '/avatars/', '/favicons/', '/system/accounts/avatars/', '/system/accounts/headers/', '/system/media_attachments/files/', @@ -7407,6 +7407,54 @@ class PubServer(BaseHTTPRequestHandler): return self._404() + def _showCachedFavicon(self, refererDomain: str, path: str, + baseDir: str, GETstartTime) -> None: + """Shows a favicon image obtained from the cache + """ + favFile = path.replace('/favicons/', '') + favFilename = baseDir + urllib.parse.unquote_plus(path) + print('showCachedFavicon: ' + favFilename) + if self.server.faviconsCache.get(favFile): + mediaBinary = self.server.faviconsCache[favFile] + mimeType = mediaFileMimeType(favFilename) + self._set_headers_etag(favFilename, + mimeType, + mediaBinary, None, + refererDomain, + False, None) + self._write(mediaBinary) + fitnessPerformance(GETstartTime, self.server.fitness, + '_GET', '_showCachedFavicon2', + self.server.debug) + return + if not os.path.isfile(favFilename): + self._404() + return + if self._etag_exists(favFilename): + # The file has not changed + self._304() + return + mediaBinary = None + try: + with open(favFilename, 'rb') as avFile: + mediaBinary = avFile.read() + except OSError: + print('EX: unable to read cached favicon ' + favFilename) + if mediaBinary: + mimeType = mediaFileMimeType(favFilename) + self._set_headers_etag(favFilename, + mimeType, + mediaBinary, None, + refererDomain, + False, None) + self._write(mediaBinary) + fitnessPerformance(GETstartTime, self.server.fitness, + '_GET', '_showCachedFavicon', + self.server.debug) + self.server.faviconsCache[favFile] = mediaBinary + return + self._404() + def _showCachedAvatar(self, refererDomain: str, path: str, baseDir: str, GETstartTime) -> None: """Shows an avatar image obtained from the cache @@ -12329,6 +12377,7 @@ class PubServer(BaseHTTPRequestHandler): '/emoji/' not in path and \ '/tags/' not in path and \ '/avatars/' not in path and \ + '/favicons/' not in path and \ '/headers/' not in path and \ '/fonts/' not in path and \ '/icons/' not in path: @@ -13378,18 +13427,19 @@ class PubServer(BaseHTTPRequestHandler): # default newswire favicon, for links to sites which # have no favicon - if 'newswire_favicon.ico' in self.path: - self._getFavicon(callingDomain, self.server.baseDir, - self.server.debug, - 'newswire_favicon.ico') - return + if not self.path.startswith('/favicons/'): + if 'newswire_favicon.ico' in self.path: + self._getFavicon(callingDomain, self.server.baseDir, + self.server.debug, + 'newswire_favicon.ico') + return - # favicon image - if 'favicon.ico' in self.path: - self._getFavicon(callingDomain, self.server.baseDir, - self.server.debug, - 'favicon.ico') - return + # favicon image + if 'favicon.ico' in self.path: + self._getFavicon(callingDomain, self.server.baseDir, + self.server.debug, + 'favicon.ico') + return # check authorization authorized = self._isAuthorized() @@ -13647,6 +13697,20 @@ class PubServer(BaseHTTPRequestHandler): '_GET', 'hasAccept', self.server.debug) + # cached favicon images + # Note that this comes before the busy flag to avoid conflicts + if self.path.startswith('/favicons/'): + if self.server.domainFull in self.path: + # favicon for this instance + self._getFavicon(callingDomain, self.server.baseDir, + self.server.debug, + 'favicon.ico') + return + self._showCachedFavicon(refererDomain, self.path, + self.server.baseDir, + GETstartTime) + return + # get css # Note that this comes before the busy flag to avoid conflicts if self.path.endswith('.css'): @@ -18623,6 +18687,7 @@ def runDaemon(contentLicenseUrl: str, httpd.instanceId = instanceId httpd.personCache = {} httpd.cachedWebfingers = {} + httpd.faviconsCache = {} httpd.proxyType = proxyType httpd.session = None httpd.sessionLastUpdate = 0 diff --git a/epicyon.py b/epicyon.py index 52b0fcca0..391650119 100644 --- a/epicyon.py +++ b/epicyon.py @@ -1016,7 +1016,7 @@ if args.domain: if args.rss: session = createSession(None) testRSS = getRSS(baseDir, domain, session, args.rss, - False, False, 1000, 1000, 1000, 1000) + False, False, 1000, 1000, 1000, 1000, debug) pprint(testRSS) sys.exit() diff --git a/newsdaemon.py b/newsdaemon.py index dc5b35e0b..a4f340b4b 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -801,7 +801,8 @@ def runNewswireDaemon(baseDir: str, httpd, httpd.maxFeedItemSizeKb, httpd.maxNewswirePosts, httpd.maxCategoriesFeedItemSizeKb, - httpd.systemLanguage) + httpd.systemLanguage, + httpd.debug) if not httpd.newswire: print('Newswire feeds not updated') diff --git a/newswire.py b/newswire.py index b069e8ba5..c718569e4 100644 --- a/newswire.py +++ b/newswire.py @@ -18,6 +18,7 @@ from datetime import timezone from collections import OrderedDict from utils import validPostDate from categories import setHashtagCategory +from utils import getFavFilenameFromUrl from utils import getBaseContentFromPost from utils import hasObjectDict from utils import firstParagraphFromString @@ -34,6 +35,7 @@ from utils import localActorUrl from blocking import isBlockedDomain from blocking import isBlockedHashtag from filters import isFiltered +from session import downloadImageAnyMimeType def _removeCDATA(text: str) -> str: @@ -126,6 +128,67 @@ def limitWordLengths(text: str, maxWordLength: int) -> str: return result +def getNewswireFaviconUrl(url: str) -> str: + """Returns a favicon url from the given article link + """ + if '://' not in url: + return '/newswire_favicon.ico' + if url.startswith('http://'): + if not (url.endswith('.onion') or url.endswith('.i2p')): + return '/newswire_favicon.ico' + domain = url.split('://')[1] + if '/' not in domain: + return url + '/favicon.ico' + else: + domain = domain.split('/')[0] + return url.split('://')[0] + '://' + domain + '/favicon.ico' + + +def _downloadNewswireFeedFavicon(session, baseDir: str, + link: str, debug: bool) -> bool: + """Downloads the favicon for the given feed link + """ + favUrl = getNewswireFaviconUrl(link) + if '://' not in link: + return False + timeoutSec = 10 + imageData, mimeType = \ + downloadImageAnyMimeType(session, favUrl, timeoutSec, debug) + if not imageData or not mimeType: + return False + + # update the favicon url + extensionsToMime = { + 'ico': 'x-icon', + 'png': 'png', + 'jpg': 'jpeg', + 'gif': 'gif', + 'avif': 'avif', + 'svg': 'svg+xml', + 'webp': 'webp' + } + for ext, mimeExt in extensionsToMime.items(): + if 'image/' + mimeExt in mimeType: + favUrl = favUrl.replace('.ico', '.' + ext) + break + + # create cached favicons directory if needed + if not os.path.isdir(baseDir + '/favicons'): + os.mkdir(baseDir + '/favicons') + + # save to the cache + favFilename = getFavFilenameFromUrl(baseDir, favUrl) + if os.path.isfile(favFilename): + return True + try: + with open(favFilename, 'wb+') as fp: + fp.write(imageData) + except OSError: + print('EX: failed writing favicon ' + favFilename) + return False + return True + + def _addNewswireDictEntry(baseDir: str, domain: str, newswire: {}, dateStr: str, title: str, link: str, @@ -133,7 +196,7 @@ def _addNewswireDictEntry(baseDir: str, domain: str, description: str, moderated: bool, mirrored: bool, tags: [], - maxTags: int) -> None: + maxTags: int, session, debug: bool) -> None: """Update the newswire dictionary """ # remove any markup @@ -166,6 +229,8 @@ def _addNewswireDictEntry(baseDir: str, domain: str, if isBlockedHashtag(baseDir, tag): return + _downloadNewswireFeedFavicon(session, baseDir, link, debug) + newswire[dateStr] = [ title, link, @@ -314,7 +379,8 @@ def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, maxFeedItemSizeKb: int, - maxCategoriesFeedItemSizeKb: int) -> {}: + maxCategoriesFeedItemSizeKb: int, + session, debug: bool) -> {}: """Converts an xml RSS 2.0 string to a dictionary """ if '' not in xmlStr: @@ -383,7 +449,7 @@ def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str, title, link, votesStatus, postFilename, description, moderated, - mirrored, [], 32) + mirrored, [], 32, session, debug) postCtr += 1 if postCtr >= maxPostsPerSource: break @@ -397,7 +463,8 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, maxFeedItemSizeKb: int, - maxCategoriesFeedItemSizeKb: int) -> {}: + maxCategoriesFeedItemSizeKb: int, + session, debug: bool) -> {}: """Converts an xml RSS 1.0 string to a dictionary https://validator.w3.org/feed/docs/rss1.html """ @@ -470,7 +537,7 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str, title, link, votesStatus, postFilename, description, moderated, - mirrored, [], 32) + mirrored, [], 32, session, debug) postCtr += 1 if postCtr >= maxPostsPerSource: break @@ -483,7 +550,8 @@ def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str, def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, - maxFeedItemSizeKb: int) -> {}: + maxFeedItemSizeKb: int, + session, debug: bool) -> {}: """Converts an atom feed string to a dictionary """ if '' not in xmlStr: @@ -545,7 +613,7 @@ def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str, title, link, votesStatus, postFilename, description, moderated, - mirrored, [], 32) + mirrored, [], 32, session, debug) postCtr += 1 if postCtr >= maxPostsPerSource: break @@ -558,7 +626,8 @@ def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str, def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, - maxFeedItemSizeKb: int) -> {}: + maxFeedItemSizeKb: int, + session, debug: bool) -> {}: """Converts a json feed string to a dictionary See https://jsonfeed.org/version/1.1 """ @@ -656,7 +725,7 @@ def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str, title, link, votesStatus, postFilename, description, moderated, - mirrored, [], 32) + mirrored, [], 32, session, debug) postCtr += 1 if postCtr >= maxPostsPerSource: break @@ -669,7 +738,8 @@ def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str, def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, - maxFeedItemSizeKb: int) -> {}: + maxFeedItemSizeKb: int, + session, debug: bool) -> {}: """Converts an atom-style YouTube feed string to a dictionary """ if '' not in xmlStr: @@ -728,7 +798,7 @@ def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, title, link, votesStatus, postFilename, description, moderated, mirrored, - [], 32) + [], 32, session, debug) postCtr += 1 if postCtr >= maxPostsPerSource: break @@ -741,32 +811,38 @@ def _xmlStrToDict(baseDir: str, domain: str, xmlStr: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, maxFeedItemSizeKb: int, - maxCategoriesFeedItemSizeKb: int) -> {}: + maxCategoriesFeedItemSizeKb: int, + session, debug: bool) -> {}: """Converts an xml string to a dictionary """ if '' in xmlStr and '' in xmlStr: print('YouTube feed: reading') return _atomFeedYTToDict(baseDir, domain, xmlStr, moderated, mirrored, - maxPostsPerSource, maxFeedItemSizeKb) + maxPostsPerSource, maxFeedItemSizeKb, + session, debug) elif 'rss version="2.0"' in xmlStr: return _xml2StrToDict(baseDir, domain, xmlStr, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb, - maxCategoriesFeedItemSizeKb) + maxCategoriesFeedItemSizeKb, + session, debug) elif ' {}: + maxCategoriesFeedItemSizeKb: int, debug: bool) -> {}: """Returns an RSS url as a dict """ if not isinstance(url, str): @@ -817,7 +893,8 @@ def getRSS(baseDir: str, domain: str, session, url: str, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb, - maxCategoriesFeedItemSizeKb) + maxCategoriesFeedItemSizeKb, + session, debug) else: print('WARN: feed is too large, ' + 'or contains invalid characters: ' + url) @@ -928,7 +1005,8 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str, newswire: {}, maxBlogsPerAccount: int, indexFilename: str, - maxTags: int, systemLanguage: str) -> None: + maxTags: int, systemLanguage: str, + session, debug: bool) -> None: """Adds blogs for the given account to the newswire """ if not os.path.isfile(indexFilename): @@ -992,7 +1070,7 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str, votes, fullPostFilename, description, moderated, False, tagsFromPost, - maxTags) + maxTags, session, debug) ctr += 1 if ctr >= maxBlogsPerAccount: @@ -1001,7 +1079,8 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str, def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {}, maxBlogsPerAccount: int, - maxTags: int, systemLanguage: str) -> None: + maxTags: int, systemLanguage: str, + session, debug: bool) -> None: """Adds blogs from each user account into the newswire """ moderationDict = {} @@ -1030,7 +1109,8 @@ def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {}, _addAccountBlogsToNewswire(baseDir, nickname, domain, newswire, maxBlogsPerAccount, blogsIndex, maxTags, - systemLanguage) + systemLanguage, session, + debug) break # sort the moderation dict into chronological order, latest first @@ -1055,7 +1135,7 @@ def getDictFromNewswire(session, baseDir: str, domain: str, maxTags: int, maxFeedItemSizeKb: int, maxNewswirePosts: int, maxCategoriesFeedItemSizeKb: int, - systemLanguage: str) -> {}: + systemLanguage: str, debug: bool) -> {}: """Gets rss feeds as a dictionary from newswire file """ subscriptionsFilename = baseDir + '/accounts/newswire.txt' @@ -1096,14 +1176,15 @@ def getDictFromNewswire(session, baseDir: str, domain: str, moderated, mirrored, maxPostsPerSource, maxFeedSizeKb, maxFeedItemSizeKb, - maxCategoriesFeedItemSizeKb) + maxCategoriesFeedItemSizeKb, debug) if itemsList: for dateStr, item in itemsList.items(): result[dateStr] = item # add blogs from each user account _addBlogsToNewswire(baseDir, domain, result, - maxPostsPerSource, maxTags, systemLanguage) + maxPostsPerSource, maxTags, systemLanguage, + session, debug) # sort into chronological order, latest first sortedResult = OrderedDict(sorted(result.items(), reverse=True)) diff --git a/session.py b/session.py index 4eedae57e..8464bd0ab 100644 --- a/session.py +++ b/session.py @@ -394,7 +394,7 @@ def postImage(session, attachImageFilename: str, federationList: [], def downloadImage(session, baseDir: str, url: str, imageFilename: str, debug: bool, force: bool = False) -> bool: - """Downloads an image + """Downloads an image with an expected mime type """ if not url: return None @@ -407,7 +407,8 @@ def downloadImage(session, baseDir: str, url: str, 'gif': 'gif', 'svg': 'svg+xml', 'webp': 'webp', - 'avif': 'avif' + 'avif': 'avif', + 'ico': 'x-icon' } sessionHeaders = None for imFormat, mimeType in imageFormats.items(): @@ -452,3 +453,62 @@ def downloadImage(session, baseDir: str, url: str, print('EX: Failed to download image: ' + str(url) + ' ' + str(e)) return False + + +def downloadImageAnyMimeType(session, url: str, timeoutSec: int, debug: bool): + """http GET for an image with any mime type + """ + mimeType = None + contentType = None + result = None + sessionHeaders = { + 'Accept': 'image/x-icon, image/png, image/webp, image/jpeg, image/gif' + } + try: + result = session.get(url, headers=sessionHeaders, timeout=timeoutSec) + except requests.exceptions.RequestException as e: + print('ERROR: downloadImageAnyMimeType failed: ' + + str(url) + ', ' + str(e)) + return None, None + except ValueError as e: + print('ERROR: downloadImageAnyMimeType failed: ' + + str(url) + ', ' + str(e)) + return None, None + except SocketError as e: + if e.errno == errno.ECONNRESET: + print('WARN: downloadImageAnyMimeType failed, ' + + 'connection was reset ' + str(e)) + return None, None + + if not result: + return None, None + + if result.status_code != 200: + print('WARN: downloadImageAnyMimeType: ' + url + + ' failed with error code ' + str(result.status_code)) + return None, None + + if result.headers.get('content-type'): + contentType = result.headers['content-type'] + elif result.headers.get('Content-type'): + contentType = result.headers['Content-type'] + elif result.headers.get('Content-Type'): + contentType = result.headers['Content-Type'] + + if not contentType: + return None, None + + imageFormats = { + 'ico': 'x-icon', + 'png': 'png', + 'jpg': 'jpeg', + 'jpeg': 'jpeg', + 'gif': 'gif', + 'svg': 'svg+xml', + 'webp': 'webp', + 'avif': 'avif' + } + for imFormat, mType in imageFormats.items(): + if 'image/' + mType in contentType: + mimeType = 'image/' + mType + return result.content, mimeType diff --git a/utils.py b/utils.py index 127b57a21..14314e7e5 100644 --- a/utils.py +++ b/utils.py @@ -346,7 +346,7 @@ def getAudioExtensions() -> []: def getImageExtensions() -> []: """Returns a list of the possible image file extensions """ - return ('png', 'jpg', 'jpeg', 'gif', 'webp', 'avif', 'svg') + return ('png', 'jpg', 'jpeg', 'gif', 'webp', 'avif', 'svg', 'ico') def getImageMimeType(imageFilename: str) -> str: @@ -358,7 +358,8 @@ def getImageMimeType(imageFilename: str) -> str: 'gif': 'gif', 'avif': 'avif', 'svg': 'svg+xml', - 'webp': 'webp' + 'webp': 'webp', + 'ico': 'x-icon' } for ext, mimeExt in extensionsToMime.items(): if imageFilename.endswith('.' + ext): @@ -375,7 +376,8 @@ def getImageExtensionFromMimeType(contentType: str) -> str: 'gif': 'gif', 'svg+xml': 'svg', 'webp': 'webp', - 'avif': 'avif' + 'avif': 'avif', + 'x-icon': 'ico' } for mimeExt, ext in imageMedia.items(): if contentType.endswith(mimeExt): @@ -2482,6 +2484,7 @@ def mediaFileMimeType(filename: str) -> str: 'svg': 'image/svg+xml', 'webp': 'image/webp', 'avif': 'image/avif', + 'ico': 'image/x-icon', 'mp3': 'audio/mpeg', 'ogg': 'audio/ogg', 'flac': 'audio/flac', @@ -3219,3 +3222,13 @@ def getNewPostEndpoints() -> []: 'newreminder', 'newreport', 'newquestion', 'newshare', 'newwanted', 'editblogpost' ) + + +def getFavFilenameFromUrl(baseDir: str, faviconUrl: str) -> str: + """Returns the cached filename for a favicon based upon its url + """ + if '://' in faviconUrl: + faviconUrl = faviconUrl.split('://')[1] + if '/favicon.' in faviconUrl: + faviconUrl = faviconUrl.replace('/favicon.', '.') + return baseDir + '/favicons/' + faviconUrl.replace('/', '-') diff --git a/webapp_column_right.py b/webapp_column_right.py index 5f3b44d2c..8e476c34e 100644 --- a/webapp_column_right.py +++ b/webapp_column_right.py @@ -11,6 +11,7 @@ import os from datetime import datetime from content import removeLongWords from content import limitRepeatedWords +from utils import getFavFilenameFromUrl from utils import getBaseContentFromPost from utils import removeHtml from utils import locatePost @@ -22,6 +23,7 @@ from utils import getConfigParam from utils import removeDomainPort from utils import acctDir from posts import isModerator +from newswire import getNewswireFaviconUrl from webapp_utils import getRightImageFile from webapp_utils import htmlHeaderWithExternalStyle from webapp_utils import htmlFooter @@ -210,22 +212,6 @@ def _getBrokenFavSubstitute() -> str: return " onerror=\"this.onerror=null; this.src='/newswire_favicon.ico'\"" -def _getNewswireFavicon(url: str) -> str: - """Returns a favicon url from the given article link - """ - if '://' not in url: - return '/newswire_favicon.ico' - if url.startswith('http://'): - if not (url.endswith('.onion') or url.endswith('.i2p')): - return '/newswire_favicon.ico' - domain = url.split('://')[1] - if '/' not in domain: - return url + '/favicon.ico' - else: - domain = domain.split('/')[0] - return url.split('://')[0] + '://' + domain + '/favicon.ico' - - def _htmlNewswire(baseDir: str, newswire: {}, nickname: str, moderator: bool, translate: {}, positiveVoting: bool) -> str: """Converts a newswire dict into html @@ -252,9 +238,24 @@ def _htmlNewswire(baseDir: str, newswire: {}, nickname: str, moderator: bool, dateStrLink = dateStr.replace('T', ' ') dateStrLink = dateStrLink.replace('Z', '') url = item[1] - faviconUrl = _getNewswireFavicon(url) + faviconUrl = getNewswireFaviconUrl(url) faviconLink = '' if faviconUrl: + cachedFaviconFilename = getFavFilenameFromUrl(baseDir, faviconUrl) + if os.path.isfile(cachedFaviconFilename): + faviconUrl = \ + cachedFaviconFilename.replace(baseDir, '') + else: + extensions = ('png', 'jpg', 'gif', 'avif', 'svg', 'webp') + for ext in extensions: + cachedFaviconFilename = \ + getFavFilenameFromUrl(baseDir, faviconUrl) + cachedFaviconFilename = \ + cachedFaviconFilename.replace('.ico', '.' + ext) + if os.path.isfile(cachedFaviconFilename): + faviconUrl = \ + cachedFaviconFilename.replace(baseDir, '') + faviconLink = \ ''