From 71d2e129afcdca67635ea33626873db577547e34 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 09:44:33 +0000 Subject: [PATCH 01/50] Title style --- webapp_column_left.py | 2 +- webapp_column_right.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/webapp_column_left.py b/webapp_column_left.py index b72c282da..928c89df6 100644 --- a/webapp_column_left.py +++ b/webapp_column_left.py @@ -287,7 +287,7 @@ def htmlEditLinks(cssCache: {}, translate: {}, baseDir: str, path: str, editLinksForm += \ '
\n' editLinksForm += \ - '

' + translate['Edit Links'] + '

' + '

' + translate['Edit Links'] + '

' editLinksForm += \ '
\n' editLinksForm += \ diff --git a/webapp_column_right.py b/webapp_column_right.py index dff002dd4..93fed4d3c 100644 --- a/webapp_column_right.py +++ b/webapp_column_right.py @@ -495,7 +495,7 @@ def htmlEditNewswire(cssCache: {}, translate: {}, baseDir: str, path: str, editNewswireForm += \ '
\n' editNewswireForm += \ - '

' + translate['Edit newswire'] + '

' + '

' + translate['Edit newswire'] + '

' editNewswireForm += \ '
\n' # editNewswireForm += \ From 54592b6cb0129164946e0fadb0c0950e3271fc5c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 09:47:02 +0000 Subject: [PATCH 02/50] Header font for columns --- epicyon-links.css | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/epicyon-links.css b/epicyon-links.css index 849c2cf90..aec1af46b 100644 --- a/epicyon-links.css +++ b/epicyon-links.css @@ -66,6 +66,7 @@ --column-right-width: 10vw; --banner-height: 15vh; --banner-height-mobile: 10vh; + --header-font: 'Arial, Helvetica, sans-serif'; } @font-face { @@ -214,6 +215,10 @@ a:focus { transform: translateY(30%) scaleX(-1); } +h1 { + font-family: var(--header-font); +} + .new-post-text { font-size: var(--font-size2); font-family: Arial, Helvetica, sans-serif; From a8ed5224594ea96705a0107d695e2e14d723fed0 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 09:56:45 +0000 Subject: [PATCH 03/50] Headers --- blog.py | 5 ++--- epicyon-links.css | 6 ------ webapp_column_right.py | 2 +- webapp_create_post.py | 18 +++++++++--------- webapp_profile.py | 4 ++-- 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/blog.py b/blog.py index ca6d5a6a2..0778c82c3 100644 --- a/blog.py +++ b/blog.py @@ -724,12 +724,11 @@ def htmlEditBlog(mediaInstance: bool, translate: {}, iconsPath = getIconsWebPath(baseDir) - editBlogText = '

' + \ - translate['Write your post text below.'] + '

' + editBlogText = '' + translate['Write your post text below.'] + '' if os.path.isfile(baseDir + '/accounts/newpost.txt'): with open(baseDir + '/accounts/newpost.txt', 'r') as file: - editBlogText = '

' + file.read() + '

' + editBlogText = '

' + file.read() + '

' cssFilename = baseDir + '/epicyon-profile.css' if os.path.isfile(baseDir + '/epicyon.css'): diff --git a/epicyon-links.css b/epicyon-links.css index aec1af46b..0b616df21 100644 --- a/epicyon-links.css +++ b/epicyon-links.css @@ -219,12 +219,6 @@ h1 { font-family: var(--header-font); } -.new-post-text { - font-size: var(--font-size2); - font-family: Arial, Helvetica, sans-serif; - padding: 4px 0; -} - .new-post-subtext { font-size: var(--font-size-header); font-family: Arial, Helvetica, sans-serif; diff --git a/webapp_column_right.py b/webapp_column_right.py index 93fed4d3c..c23d538fa 100644 --- a/webapp_column_right.py +++ b/webapp_column_right.py @@ -605,7 +605,7 @@ def htmlEditNewsPost(cssCache: {}, translate: {}, baseDir: str, path: str, editNewsPostForm += \ '
\n' editNewsPostForm += \ - '

' + translate['Edit News Post'] + '

' + '

' + translate['Edit News Post'] + '

' editNewsPostForm += \ '
\n' editNewsPostForm += \ diff --git a/webapp_create_post.py b/webapp_create_post.py index 49ee03a5d..4034602c3 100644 --- a/webapp_create_post.py +++ b/webapp_create_post.py @@ -183,8 +183,8 @@ def htmlNewPost(cssCache: {}, mediaInstance: bool, translate: {}, if not path.endswith('/newshare'): if not path.endswith('/newreport'): if not inReplyTo or path.endswith('/newreminder'): - newPostText = '

' + \ - translate['Write your post text below.'] + '

\n' + newPostText = '

' + \ + translate['Write your post text below.'] + '

\n' else: newPostText = \ '

' + \ @@ -208,8 +208,8 @@ def htmlNewPost(cssCache: {}, mediaInstance: bool, translate: {}, showPublicOnDropdown = False else: newPostText = \ - '

' + \ - translate['Write your report below.'] + '

\n' + '

' + \ + translate['Write your report below.'] + '

\n' # custom report header with any additional instructions if os.path.isfile(baseDir + '/accounts/report.txt'): @@ -233,20 +233,20 @@ def htmlNewPost(cssCache: {}, mediaInstance: bool, translate: {}, translate['Terms of Service'] + '

\n' else: newPostText = \ - '

' + \ + '

' + \ translate['Enter the details for your shared item below.'] + \ - '

\n' + '

\n' if path.endswith('/newquestion'): newPostText = \ - '

' + \ + '

' + \ translate['Enter the choices for your question below.'] + \ - '

\n' + '

\n' if os.path.isfile(baseDir + '/accounts/newpost.txt'): with open(baseDir + '/accounts/newpost.txt', 'r') as file: newPostText = \ - '

' + file.read() + '

\n' + '

' + file.read() + '

\n' cssFilename = baseDir + '/epicyon-profile.css' if os.path.isfile(baseDir + '/epicyon.css'): diff --git a/webapp_profile.py b/webapp_profile.py index 8cb4efdfc..3d26b3626 100644 --- a/webapp_profile.py +++ b/webapp_profile.py @@ -1171,8 +1171,8 @@ def htmlEditProfile(cssCache: {}, translate: {}, baseDir: str, path: str, 'accept-charset="UTF-8" action="' + path + '/profiledata">\n' editProfileForm += '
\n' editProfileForm += \ - '

' + translate['Profile for'] + \ - ' ' + nickname + '@' + domainFull + '

' + '

' + translate['Profile for'] + \ + ' ' + nickname + '@' + domainFull + '

' editProfileForm += '
\n' # editProfileForm += \ # ' \n' editProfileForm += \ '
\n' + \ ' str: + """Returns a string of permissable image formats + used when selecting an image for a new post + """ + imageExt = getImageExtensions() + + imageFormats = '' + for ext in imageExt: + if imageFormats: + imageFormats += ', ' + imageFormats += '.' + ext + return imageFormats + + def removeHtml(content: str) -> str: """Removes html links from the given content. Used to ensure that profile descriptions don't contain dubious content diff --git a/webapp_profile.py b/webapp_profile.py index b84169b72..1e5d2acfd 100644 --- a/webapp_profile.py +++ b/webapp_profile.py @@ -14,6 +14,7 @@ from utils import isSystemAccount from utils import removeHtml from utils import loadJson from utils import getConfigParam +from utils import getImageFormats from skills import getSkills from theme import getThemesList from person import personBoxJson @@ -851,7 +852,7 @@ def htmlEditProfile(cssCache: {}, translate: {}, baseDir: str, path: str, defaultTimeline: str) -> str: """Shows the edit profile screen """ - imageFormats = '.png, .jpg, .jpeg, .gif, .webp, .avif' + imageFormats = getImageFormats() path = path.replace('/inbox', '').replace('/outbox', '') path = path.replace('/shares', '') nickname = getNicknameFromActor(path) diff --git a/webapp_utils.py b/webapp_utils.py index 6a18f06c4..9736aaaa2 100644 --- a/webapp_utils.py +++ b/webapp_utils.py @@ -9,6 +9,7 @@ __status__ = "Production" import os from collections import OrderedDict from session import getJson +from utils import getImageExtensions from utils import getProtocolPrefixes from utils import loadJson from utils import getCachedPostFilename @@ -248,12 +249,6 @@ def updateAvatarImageCache(session, baseDir: str, httpPrefix: str, return avatarImageFilename.replace(baseDir + '/cache', '') -def getImageExtensions() -> []: - """Returns a list of the possible image file extensions - """ - return ('png', 'jpg', 'jpeg', 'gif', 'webp', 'avif') - - def getPersonAvatarUrl(baseDir: str, personUrl: str, personCache: {}, allowDownloads: bool) -> str: """Returns the avatar url for the person From 30ebfda697bd984b441f6311d2399762890aaee7 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 11:54:29 +0000 Subject: [PATCH 09/50] Tidy up the handling of media formats --- blog.py | 4 ++-- content.py | 3 ++- daemon.py | 4 +++- media.py | 17 +++++++++-------- shares.py | 9 +++++---- theme.py | 3 ++- utils.py | 34 +++++++++++++++++++++++++++++++++- webapp_create_post.py | 7 ++++--- 8 files changed, 60 insertions(+), 21 deletions(-) diff --git a/blog.py b/blog.py index 0778c82c3..d14c72951 100644 --- a/blog.py +++ b/blog.py @@ -15,6 +15,7 @@ from webapp import htmlHeaderWithExternalStyle from webapp import htmlFooter from webapp_media import addEmbeddedElements from webapp_utils import getPostAttachmentsAsHtml +from utils import getMediaFormats from utils import getNicknameFromActor from utils import getDomainFromActor from utils import locatePost @@ -745,8 +746,7 @@ def htmlEditBlog(mediaInstance: bool, translate: {}, editBlogImageSection += \ ' ' + ' accept="' + getMediaFormats() + '">' editBlogImageSection += '
' placeholderMessage = translate['Write something'] + '...' diff --git a/content.py b/content.py index 44113ac59..efac10f97 100644 --- a/content.py +++ b/content.py @@ -9,6 +9,7 @@ __status__ = "Production" import os import email.parser from shutil import copyfile +from utils import getImageExtensions from utils import loadJson from utils import fileLastModified from utils import getLinkPrefixes @@ -939,7 +940,7 @@ def saveMediaInFormPOST(mediaBytes, debug: bool, break # remove any existing image files with a different format - extensionTypes = ('png', 'jpg', 'jpeg', 'gif', 'webp', 'avif') + extensionTypes = getImageExtensions() for ex in extensionTypes: if ex == detectedExtension: continue diff --git a/daemon.py b/daemon.py index 5a160d9c6..15c97bc98 100644 --- a/daemon.py +++ b/daemon.py @@ -166,6 +166,7 @@ from shares import getSharesFeedForPerson from shares import addShare from shares import removeShare from shares import expireShares +from utils import getImageExtensions from utils import mediaFileMimeType from utils import getCSS from utils import firstParagraphFromString @@ -8412,7 +8413,8 @@ class PubServer(BaseHTTPRequestHandler): GETstartTime, GETtimings: {}) -> bool: """Show a background image """ - for ext in ('webp', 'gif', 'jpg', 'png', 'avif'): + imageExtensions = getImageExtensions() + for ext in imageExtensions: for bg in ('follow', 'options', 'login'): # follow screen background image if path.endswith('/' + bg + '-background.' + ext): diff --git a/media.py b/media.py index a231c7906..25532ce22 100644 --- a/media.py +++ b/media.py @@ -13,6 +13,10 @@ import os import datetime from hashlib import sha1 from auth import createPassword +from utils import getImageExtensions +from utils import getVideoExtensions +from utils import getAudioExtensions +from utils import getMediaExtensions from shutil import copyfile from shutil import rmtree from shutil import move @@ -56,8 +60,7 @@ def getImageHash(imageFilename: str) -> str: def isMedia(imageFilename: str) -> bool: - permittedMedia = ('png', 'jpg', 'gif', 'webp', 'avif', - 'mp4', 'ogv', 'mp3', 'ogg') + permittedMedia = getMediaExtensions() for m in permittedMedia: if imageFilename.endswith('.' + m): return True @@ -83,16 +86,15 @@ def getAttachmentMediaType(filename: str) -> str: image, video or audio """ mediaType = None - imageTypes = ('png', 'jpg', 'jpeg', - 'gif', 'webp', 'avif') + imageTypes = getImageExtensions() for mType in imageTypes: if filename.endswith('.' + mType): return 'image' - videoTypes = ('mp4', 'webm', 'ogv') + videoTypes = getVideoExtensions() for mType in videoTypes: if filename.endswith('.' + mType): return 'video' - audioTypes = ('mp3', 'ogg') + audioTypes = getAudioExtensions() for mType in audioTypes: if filename.endswith('.' + mType): return 'audio' @@ -143,8 +145,7 @@ def attachMedia(baseDir: str, httpPrefix: str, domain: str, port: int, return postJson fileExtension = None - acceptedTypes = ('png', 'jpg', 'gif', 'webp', 'avif', - 'mp4', 'webm', 'ogv', 'mp3', 'ogg') + acceptedTypes = getMediaExtensions() for mType in acceptedTypes: if imageFilename.endswith('.' + mType): if mType == 'jpg': diff --git a/shares.py b/shares.py index 48b01fba2..9c9cba297 100644 --- a/shares.py +++ b/shares.py @@ -16,6 +16,7 @@ from session import postImage from utils import validNickname from utils import loadJson from utils import saveJson +from utils import getImageExtensions from media import removeMetaData @@ -54,7 +55,7 @@ def removeShare(baseDir: str, nickname: str, domain: str, # remove any image for the item itemIDfile = baseDir + '/sharefiles/' + nickname + '/' + itemID if sharesJson[itemID]['imageUrl']: - formats = ('png', 'jpg', 'gif', 'webp', 'avif') + formats = getImageExtensions() for ext in formats: if sharesJson[itemID]['imageUrl'].endswith('.' + ext): if os.path.isfile(itemIDfile + '.' + ext): @@ -108,7 +109,7 @@ def addShare(baseDir: str, if not imageFilename: sharesImageFilename = \ baseDir + '/accounts/' + nickname + '@' + domain + '/upload' - formats = ('png', 'jpg', 'gif', 'webp', 'avif') + formats = getImageExtensions() for ext in formats: if os.path.isfile(sharesImageFilename + '.' + ext): imageFilename = sharesImageFilename + '.' + ext @@ -128,7 +129,7 @@ def addShare(baseDir: str, if not os.path.isdir(baseDir + '/sharefiles/' + nickname): os.mkdir(baseDir + '/sharefiles/' + nickname) itemIDfile = baseDir + '/sharefiles/' + nickname + '/' + itemID - formats = ('png', 'jpg', 'gif', 'webp', 'avif') + formats = getImageExtensions() for ext in formats: if imageFilename.endswith('.' + ext): removeMetaData(imageFilename, itemIDfile + '.' + ext) @@ -202,7 +203,7 @@ def expireSharesForAccount(baseDir: str, nickname: str, domain: str) -> None: # remove any associated images itemIDfile = \ baseDir + '/sharefiles/' + nickname + '/' + itemID - formats = ('png', 'jpg', 'gif', 'webp', 'avif') + formats = getImageExtensions() for ext in formats: if os.path.isfile(itemIDfile + '.' + ext): os.remove(itemIDfile + '.' + ext) diff --git a/theme.py b/theme.py index eec781465..4f85a9d5c 100644 --- a/theme.py +++ b/theme.py @@ -9,6 +9,7 @@ __status__ = "Production" import os from utils import loadJson from utils import saveJson +from utils import getImageExtensions from shutil import copyfile from content import dangerousCSS @@ -473,7 +474,7 @@ def setThemeImages(baseDir: str, name: str) -> None: backgroundNames = ('login', 'shares', 'delete', 'follow', 'options', 'block', 'search', 'calendar') - extensions = ('webp', 'gif', 'jpg', 'png', 'avif') + extensions = getImageExtensions() for subdir, dirs, files in os.walk(baseDir + '/accounts'): for acct in dirs: diff --git a/utils.py b/utils.py index 848959542..3bd40da77 100644 --- a/utils.py +++ b/utils.py @@ -25,6 +25,24 @@ def getImageExtensions() -> []: return ('png', 'jpg', 'jpeg', 'gif', 'webp', 'avif') +def getVideoExtensions() -> []: + """Returns a list of the possible video file extensions + """ + return ('mp4', 'webm', 'ogv') + + +def getAudioExtensions() -> []: + """Returns a list of the possible audio file extensions + """ + return ('mp3', 'ogg') + + +def getMediaExtensions() -> []: + """Returns a list of the possible media file extensions + """ + return getImageExtensions() + getVideoExtensions() + getAudioExtensions() + + def getImageFormats() -> str: """Returns a string of permissable image formats used when selecting an image for a new post @@ -39,6 +57,20 @@ def getImageFormats() -> str: return imageFormats +def getMediaFormats() -> str: + """Returns a string of permissable media formats + used when selecting an attachment for a new post + """ + mediaExt = getMediaExtensions() + + mediaFormats = '' + for ext in mediaExt: + if mediaFormats: + mediaFormats += ', ' + mediaFormats += '.' + ext + return mediaFormats + + def removeHtml(content: str) -> str: """Removes html links from the given content. Used to ensure that profile descriptions don't contain dubious content @@ -213,7 +245,7 @@ def removeAvatarFromCache(baseDir: str, actorStr: str) -> None: """Removes any existing avatar entries from the cache This avoids duplicate entries with differing extensions """ - avatarFilenameExtensions = ('png', 'jpg', 'gif', 'webp', 'avif') + avatarFilenameExtensions = getImageExtensions() for extension in avatarFilenameExtensions: avatarFilename = \ baseDir + '/cache/avatars/' + actorStr + '.' + extension diff --git a/webapp_create_post.py b/webapp_create_post.py index 4034602c3..db15f0f8b 100644 --- a/webapp_create_post.py +++ b/webapp_create_post.py @@ -10,6 +10,8 @@ import os from utils import isPublicPostFromUrl from utils import getNicknameFromActor from utils import getDomainFromActor +from utils import getImageFormats +from utils import getMediaFormats from webapp_utils import getIconsWebPath from webapp_utils import getBannerFile from webapp_utils import htmlHeaderWithExternalStyle @@ -280,13 +282,12 @@ def htmlNewPost(cssCache: {}, mediaInstance: bool, translate: {}, newPostImageSection += \ ' \n' + ' accept="' + getImageFormats() + '">\n' else: newPostImageSection += \ ' \n' + ' accept="' + getMediaFormats() + '">\n' newPostImageSection += '
\n' scopeIcon = 'scope_public.png' From 31fee182f9d734b664d17c5de5727e6e1c8c018b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 13:31:08 +0000 Subject: [PATCH 10/50] Bigger recency window for hashtag swarm --- webapp_hashtagswarm.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/webapp_hashtagswarm.py b/webapp_hashtagswarm.py index 6cf95974f..c34d7eb00 100644 --- a/webapp_hashtagswarm.py +++ b/webapp_hashtagswarm.py @@ -76,6 +76,7 @@ def htmlHashTagSwarm(baseDir: str, actor: str, translate: {}) -> str: currTime = datetime.utcnow() daysSinceEpoch = (currTime - datetime(1970, 1, 1)).days daysSinceEpochStr = str(daysSinceEpoch) + ' ' + recently = daysSinceEpoch - 1 tagSwarm = [] domainHistogram = {} @@ -84,12 +85,15 @@ def htmlHashTagSwarm(baseDir: str, actor: str, translate: {}) -> str: tagsFilename = os.path.join(baseDir + '/tags', f) if not os.path.isfile(tagsFilename): continue + # get last modified datetime modTimesinceEpoc = os.path.getmtime(tagsFilename) lastModifiedDate = datetime.fromtimestamp(modTimesinceEpoc) fileDaysSinceEpoch = (lastModifiedDate - datetime(1970, 1, 1)).days - # check if the file was last modified today - if fileDaysSinceEpoch != daysSinceEpoch: + + # check if the file was last modified within the previous + # two days + if fileDaysSinceEpoch >= recently: continue hashTagName = f.split('.')[0] @@ -111,7 +115,7 @@ def htmlHashTagSwarm(baseDir: str, actor: str, translate: {}) -> str: if not postDaysSinceEpochStr.isdigit(): break postDaysSinceEpoch = int(postDaysSinceEpochStr) - if postDaysSinceEpoch < daysSinceEpoch - 1: + if postDaysSinceEpoch < recently: break else: postUrl = sections[2] From d4536a9dd8bb7c1f672740c3ed4f740b6de24dce Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 13:33:50 +0000 Subject: [PATCH 11/50] Invert logic --- webapp_hashtagswarm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp_hashtagswarm.py b/webapp_hashtagswarm.py index c34d7eb00..552f8045d 100644 --- a/webapp_hashtagswarm.py +++ b/webapp_hashtagswarm.py @@ -93,7 +93,7 @@ def htmlHashTagSwarm(baseDir: str, actor: str, translate: {}) -> str: # check if the file was last modified within the previous # two days - if fileDaysSinceEpoch >= recently: + if fileDaysSinceEpoch < recently: continue hashTagName = f.split('.')[0] From a30c1c1044ab6a22769b41c84cc842e0be860dce Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 13:45:37 +0000 Subject: [PATCH 12/50] More efficient reading of tag index --- webapp_hashtagswarm.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/webapp_hashtagswarm.py b/webapp_hashtagswarm.py index 552f8045d..e26a8bd3f 100644 --- a/webapp_hashtagswarm.py +++ b/webapp_hashtagswarm.py @@ -76,6 +76,7 @@ def htmlHashTagSwarm(baseDir: str, actor: str, translate: {}) -> str: currTime = datetime.utcnow() daysSinceEpoch = (currTime - datetime(1970, 1, 1)).days daysSinceEpochStr = str(daysSinceEpoch) + ' ' + daysSinceEpochStr2 = str(daysSinceEpoch - 1) + ' ' recently = daysSinceEpoch - 1 tagSwarm = [] domainHistogram = {} @@ -99,8 +100,12 @@ def htmlHashTagSwarm(baseDir: str, actor: str, translate: {}) -> str: hashTagName = f.split('.')[0] if isBlockedHashtag(baseDir, hashTagName): continue - if daysSinceEpochStr not in open(tagsFilename).read(): - continue + with open(tagsFilename, 'r') as fp: + # only read one line, which saves time and memory + lastTag = fp.readline() + if not lastTag.startswith(daysSinceEpochStr): + if not lastTag.startswith(daysSinceEpochStr2): + continue with open(tagsFilename, 'r') as tagsFile: while True: line = tagsFile.readline() From a557c4a060c7504be10da11d24620f743a4ccaa4 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 23:18:34 +0000 Subject: [PATCH 13/50] Try media description if description doesn't exist --- newswire.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/newswire.py b/newswire.py index 7874ecb9a..2eb9f34c5 100644 --- a/newswire.py +++ b/newswire.py @@ -159,6 +159,11 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, if '' in rssItem and '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + else: + if '' in rssItem and \ + '' in rssItem: + description = rssItem.split('')[1] + description = description.split('')[0] link = rssItem.split('')[1] link = link.split('')[0] if '://' not in link: From fad926c7ecebb445f0a182d9fbd11e731bd98069 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Nov 2020 23:29:46 +0000 Subject: [PATCH 14/50] Try media description if description doesn't exist --- newswire.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/newswire.py b/newswire.py index 2eb9f34c5..ea4e6397f 100644 --- a/newswire.py +++ b/newswire.py @@ -248,6 +248,11 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, if '' in rssItem and '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + else: + if '' in rssItem and \ + '' in rssItem: + description = rssItem.split('')[1] + description = description.split('')[0] link = rssItem.split('')[1] link = link.split('')[0] if '://' not in link: From 0b9e4c5350ff89df282fb741c29989107fcefa88 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 10:34:42 +0000 Subject: [PATCH 15/50] Support for youtube feeds --- newswire.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/newswire.py b/newswire.py index ea4e6397f..22beae023 100644 --- a/newswire.py +++ b/newswire.py @@ -302,6 +302,89 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, return result +def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, + moderated: bool, mirrored: bool, + maxPostsPerSource: int, + maxFeedItemSizeKb: int) -> {}: + """Converts an atom-style YouTube feed string to a dictionary + """ + if '' not in xmlStr: + return {} + if isBlockedDomain(baseDir, 'www.youtube.com'): + return {} + result = {} + rssItems = xmlStr.split('') + postCtr = 0 + maxBytes = maxFeedItemSizeKb * 1024 + for rssItem in rssItems: + if len(rssItem) > maxBytes: + print('WARN: atom feed item is too big') + continue + if '' not in rssItem: + continue + if '' not in rssItem: + continue + if '' not in rssItem: + continue + if '' not in rssItem: + continue + if '' not in rssItem: + continue + if '' not in rssItem: + continue + title = rssItem.split('')[1] + title = title.split('')[0] + description = '' + if '' in rssItem and \ + '' in rssItem: + description = rssItem.split('')[1] + description = description.split('')[0] + elif '' in rssItem and '' in rssItem: + description = rssItem.split('')[1] + description = description.split('')[0] + link = rssItem.split('')[1] + link = link.split('')[0] + link = 'https://www.youtube.com/watch?v=' + link.strip() + pubDate = rssItem.split('')[1] + pubDate = pubDate.split('')[0] + parsed = False + try: + publishedDate = \ + datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%SZ") + postFilename = '' + votesStatus = [] + addNewswireDictEntry(baseDir, domain, + result, str(publishedDate), + title, link, + votesStatus, postFilename, + description, moderated, mirrored) + postCtr += 1 + if postCtr >= maxPostsPerSource: + break + parsed = True + except BaseException: + pass + if not parsed: + try: + publishedDate = \ + datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT") + postFilename = '' + votesStatus = [] + addNewswireDictEntry(baseDir, domain, result, + str(publishedDate) + '+00:00', + title, link, + votesStatus, postFilename, + description, moderated, mirrored) + postCtr += 1 + if postCtr >= maxPostsPerSource: + break + parsed = True + except BaseException: + print('WARN: unrecognized atom feed date format: ' + pubDate) + pass + return result + + def xmlStrToDict(baseDir: str, domain: str, xmlStr: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, @@ -316,6 +399,10 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str, return atomFeedToDict(baseDir, domain, xmlStr, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb) + elif '' in xmlStr and '' in xmlStr: + return atomFeedYTToDict(baseDir, domain, + xmlStr, moderated, mirrored, + maxPostsPerSource, maxFeedItemSizeKb) return {} From 0e2a5e09652151335dfbc343c70f65867623f421 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 10:46:54 +0000 Subject: [PATCH 16/50] Convert YT channels into atom-ish links --- newswire.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/newswire.py b/newswire.py index 22beae023..cb497c90b 100644 --- a/newswire.py +++ b/newswire.py @@ -406,6 +406,15 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str, return {} +def YTchannelToAtomFeed(url: str) -> str: + """Converts a YouTube channel url into an atom feed url + """ + if 'youtube.com/channel/' not in url: + return url + channelId = url.split('youtube.com/channel/')[1] + return 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channelId + + def getRSS(baseDir: str, domain: str, session, url: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, maxFeedSizeKb: int, @@ -430,6 +439,7 @@ def getRSS(baseDir: str, domain: str, session, url: str, 'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0' if not session: print('WARN: no session specified for getRSS') + url = YTchannelToAtomFeed(url) try: result = session.get(url, headers=sessionHeaders, params=sessionParams) if result: From 7a01f422cf095c096ba88994ba55e2fd63a830a8 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 11:48:53 +0000 Subject: [PATCH 17/50] Set a maximum number of posts to appear in the newswire column --- daemon.py | 6 +++++- epicyon.py | 13 ++++++++++++- newsdaemon.py | 3 ++- newswire.py | 10 +++++++++- tests.py | 9 ++++++--- 5 files changed, 34 insertions(+), 7 deletions(-) diff --git a/daemon.py b/daemon.py index 15c97bc98..7ffaba474 100644 --- a/daemon.py +++ b/daemon.py @@ -12388,7 +12388,8 @@ def loadTokens(baseDir: str, tokensDict: {}, tokensLookup: {}) -> None: tokensLookup[token] = nickname -def runDaemon(allowLocalNetworkAccess: bool, +def runDaemon(maxNewswirePosts: int, + allowLocalNetworkAccess: bool, maxFeedItemSizeKb: int, publishButtonAtTop: bool, rssIconAtTop: bool, @@ -12463,6 +12464,9 @@ def runDaemon(allowLocalNetworkAccess: bool, # newswire storing rss feeds httpd.newswire = {} + # maximum number of posts to appear in the newswire on the right column + httpd.maxNewswirePosts = maxNewswirePosts + # This counter is used to update the list of blocked domains in memory. # It helps to avoid touching the disk and so improves flooding resistance httpd.blocklistUpdateCtr = 0 diff --git a/epicyon.py b/epicyon.py index 605661f1b..ecc051bb6 100644 --- a/epicyon.py +++ b/epicyon.py @@ -116,6 +116,10 @@ parser.add_argument('--postsPerSource', dest='maxNewswirePostsPerSource', type=int, default=4, help='Maximum newswire posts per feed or account') +parser.add_argument('--maxNewswirePosts', + dest='maxNewswirePosts', type=int, + default=20, + help='Maximum newswire posts in the right column') parser.add_argument('--maxFeedSize', dest='maxNewswireFeedSizeKb', type=int, default=10240, @@ -2001,6 +2005,12 @@ maxNewswirePostsPerSource = \ if maxNewswirePostsPerSource: args.maxNewswirePostsPerSource = int(maxNewswirePostsPerSource) +# set the maximum number of newswire posts appearing in the right column +maxNewswirePosts = \ + getConfigParam(baseDir, 'maxNewswirePosts') +if maxNewswirePosts: + args.maxNewswirePosts = int(maxNewswirePosts) + # set the maximum size of a newswire rss/atom feed in Kilobytes maxNewswireFeedSizeKb = \ getConfigParam(baseDir, 'maxNewswireFeedSizeKb') @@ -2075,7 +2085,8 @@ if setTheme(baseDir, themeName, domain, args.allowLocalNetworkAccess): print('Theme set to ' + themeName) if __name__ == "__main__": - runDaemon(args.allowLocalNetworkAccess, + runDaemon(args.maxNewswirePosts, + args.allowLocalNetworkAccess, args.maxFeedItemSizeKb, args.publishButtonAtTop, args.rssIconAtTop, diff --git a/newsdaemon.py b/newsdaemon.py index 899866179..82b4757af 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -718,7 +718,8 @@ def runNewswireDaemon(baseDir: str, httpd, httpd.maxNewswirePostsPerSource, httpd.maxNewswireFeedSizeKb, httpd.maxTags, - httpd.maxFeedItemSizeKb) + httpd.maxFeedItemSizeKb, + httpd.maxNewswirePosts) except Exception as e: print('WARN: unable to update newswire ' + str(e)) time.sleep(120) diff --git a/newswire.py b/newswire.py index cb497c90b..c52415829 100644 --- a/newswire.py +++ b/newswire.py @@ -677,7 +677,8 @@ def addBlogsToNewswire(baseDir: str, domain: str, newswire: {}, def getDictFromNewswire(session, baseDir: str, domain: str, maxPostsPerSource: int, maxFeedSizeKb: int, - maxTags: int, maxFeedItemSizeKb: int) -> {}: + maxTags: int, maxFeedItemSizeKb: int, + maxNewswirePosts: int) -> {}: """Gets rss feeds as a dictionary from newswire file """ subscriptionsFilename = baseDir + '/accounts/newswire.txt' @@ -728,4 +729,11 @@ def getDictFromNewswire(session, baseDir: str, domain: str, # sort into chronological order, latest first sortedResult = OrderedDict(sorted(result.items(), reverse=True)) + + # are there too many posts? If so then remove the oldest ones + noOfPosts = len(sortedResult.items()) + if noOfPosts > maxNewswirePosts: + for n in range(noOfPosts - maxNewswirePosts): + sortedResult.pop() + return sortedResult diff --git a/tests.py b/tests.py index c2de44e8c..41c1a0550 100644 --- a/tests.py +++ b/tests.py @@ -292,8 +292,9 @@ def createServerAlice(path: str, domain: str, port: int, onionDomain = None i2pDomain = None allowLocalNetworkAccess = True + maxNewswirePosts = 20 print('Server running: Alice') - runDaemon(allowLocalNetworkAccess, + runDaemon(maxNewswirePosts, allowLocalNetworkAccess, 2048, False, True, False, False, True, 10, False, 0, 100, 1024, 5, False, 0, False, 1, False, False, False, @@ -359,8 +360,9 @@ def createServerBob(path: str, domain: str, port: int, onionDomain = None i2pDomain = None allowLocalNetworkAccess = True + maxNewswirePosts = 20 print('Server running: Bob') - runDaemon(allowLocalNetworkAccess, + runDaemon(maxNewswirePosts, allowLocalNetworkAccess, 2048, False, True, False, False, True, 10, False, 0, 100, 1024, 5, False, 0, False, 1, False, False, False, @@ -400,8 +402,9 @@ def createServerEve(path: str, domain: str, port: int, federationList: [], onionDomain = None i2pDomain = None allowLocalNetworkAccess = True + maxNewswirePosts = 20 print('Server running: Eve') - runDaemon(allowLocalNetworkAccess, + runDaemon(maxNewswirePosts, allowLocalNetworkAccess, 2048, False, True, False, False, True, 10, False, 0, 100, 1024, 5, False, 0, False, 1, False, False, False, From d1295a94f02fcd29a2a0d6221683cc3f0d2f2778 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:05:15 +0000 Subject: [PATCH 18/50] Remove extra newswire items --- newswire.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/newswire.py b/newswire.py index c52415829..b9b476b8c 100644 --- a/newswire.py +++ b/newswire.py @@ -733,7 +733,13 @@ def getDictFromNewswire(session, baseDir: str, domain: str, # are there too many posts? If so then remove the oldest ones noOfPosts = len(sortedResult.items()) if noOfPosts > maxNewswirePosts: - for n in range(noOfPosts - maxNewswirePosts): - sortedResult.pop() + ctr = 0 + removals = [] + for dateStr, item in sortedResult.items(): + ctr += 1 + if ctr >= maxNewswirePosts: + removals.append(dateStr) + for r in removals: + sortedResult.pop(r) return sortedResult From bc77031e6a08b5055079733c312f883e0ea02fd4 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:18:43 +0000 Subject: [PATCH 19/50] Ensure that CDATA is removed from titles and descriptions --- newswire.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/newswire.py b/newswire.py index b9b476b8c..52b8c931b 100644 --- a/newswire.py +++ b/newswire.py @@ -25,6 +25,16 @@ from blocking import isBlockedHashtag from filters import isFiltered +def removeCDATA(text: str) -> str: + """Removes any CDATA from the given text + """ + if 'CDATA[' in text: + text = text.split('CDATA[')[1] + if ']' in text: + text = text.split(']')[0] + return text + + def rss2Header(httpPrefix: str, nickname: str, domainFull: str, title: str, translate: {}) -> str: @@ -154,16 +164,17 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, if '' not in rssItem: continue title = rssItem.split('')[1] - title = title.split('')[0] + title = removeCDATA(title.split('')[0]) description = '' if '' in rssItem and '' in rssItem: description = rssItem.split('')[1] - description = description.split('')[0] + description = removeCDATA(description.split('')[0]) else: if '' in rssItem and \ '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + description = removeCDATA(description) link = rssItem.split('')[1] link = link.split('')[0] if '://' not in link: @@ -243,16 +254,17 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, if '' not in rssItem: continue title = rssItem.split('')[1] - title = title.split('')[0] + title = removeCDATA(title.split('')[0]) description = '' if '' in rssItem and '' in rssItem: description = rssItem.split('')[1] - description = description.split('')[0] + description = removeCDATA(description.split('')[0]) else: if '' in rssItem and \ '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + description = removeCDATA(description) link = rssItem.split('')[1] link = link.split('')[0] if '://' not in link: @@ -333,15 +345,17 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, if '' not in rssItem: continue title = rssItem.split('')[1] - title = title.split('')[0] + title = removeCDATA(title.split('')[0]) description = '' if '' in rssItem and \ '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + description = removeCDATA(description) elif '' in rssItem and '' in rssItem: description = rssItem.split('')[1] description = description.split('')[0] + description = removeCDATA(description) link = rssItem.split('')[1] link = link.split('')[0] link = 'https://www.youtube.com/watch?v=' + link.strip() @@ -494,7 +508,7 @@ def getRSSfromDict(baseDir: str, newswire: {}, continue rssStr += '\n' rssStr += ' ' + fields[0] + '\n' - description = firstParagraphFromString(fields[4]) + description = removeCDATA(firstParagraphFromString(fields[4])) rssStr += ' ' + description + '\n' url = fields[1] if '://' not in url: @@ -614,6 +628,7 @@ def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str, votes = loadJson(fullPostFilename + '.votes') content = postJsonObject['object']['content'] description = firstParagraphFromString(content) + description = removeCDATA(description) addNewswireDictEntry(baseDir, domain, newswire, published, postJsonObject['object']['summary'], From bf28568bd1417e79318b7197a0f1d1f4c200140f Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:25:53 +0000 Subject: [PATCH 20/50] One more --- newswire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 52b8c931b..e7fa4a935 100644 --- a/newswire.py +++ b/newswire.py @@ -752,7 +752,7 @@ def getDictFromNewswire(session, baseDir: str, domain: str, removals = [] for dateStr, item in sortedResult.items(): ctr += 1 - if ctr >= maxNewswirePosts: + if ctr > maxNewswirePosts: removals.append(dateStr) for r in removals: sortedResult.pop(r) From a41ec5437034375ffa854786436fc981c6ae9a62 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:27:42 +0000 Subject: [PATCH 21/50] Remove any ending --- newswire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index e7fa4a935..b95daac4e 100644 --- a/newswire.py +++ b/newswire.py @@ -425,7 +425,7 @@ def YTchannelToAtomFeed(url: str) -> str: """ if 'youtube.com/channel/' not in url: return url - channelId = url.split('youtube.com/channel/')[1] + channelId = url.split('youtube.com/channel/')[1].strip() return 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channelId From 06337020f8841d0ac61dd82a65ffd039566eb4fd Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:32:48 +0000 Subject: [PATCH 22/50] Debug --- newswire.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index b95daac4e..3fdfa7eab 100644 --- a/newswire.py +++ b/newswire.py @@ -329,6 +329,7 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, postCtr = 0 maxBytes = maxFeedItemSizeKb * 1024 for rssItem in rssItems: + print('YouTube feed item: ' + rssItem) if len(rssItem) > maxBytes: print('WARN: atom feed item is too big') continue @@ -377,6 +378,7 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: + print('YouTube feed: failed to parse published date ' + pubDate) pass if not parsed: try: @@ -394,7 +396,8 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized atom feed date format: ' + pubDate) + print('YouTube feed: failed to parse published date ' + + pubDate) pass return result @@ -414,6 +417,7 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str, xmlStr, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb) elif '' in xmlStr and '' in xmlStr: + print ('YouTube feed: reading') return atomFeedYTToDict(baseDir, domain, xmlStr, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb) From ea30903c7001ebc82cc1e0a38249b6cf6350e208 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:36:21 +0000 Subject: [PATCH 23/50] Debug --- newswire.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 3fdfa7eab..f1d33aee0 100644 --- a/newswire.py +++ b/newswire.py @@ -430,7 +430,10 @@ def YTchannelToAtomFeed(url: str) -> str: if 'youtube.com/channel/' not in url: return url channelId = url.split('youtube.com/channel/')[1].strip() - return 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channelId + channelUrl = \ + 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channelId + print('YouTube feed: ' + channelUrl) + return channelUrl def getRSS(baseDir: str, domain: str, session, url: str, From 22f21ba1f0455873a297157eb6c769583765ed7b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:41:54 +0000 Subject: [PATCH 24/50] Variable name --- newswire.py | 72 ++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/newswire.py b/newswire.py index f1d33aee0..761446f0e 100644 --- a/newswire.py +++ b/newswire.py @@ -234,38 +234,38 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, if '' not in xmlStr: return {} result = {} - rssItems = xmlStr.split('') + atomItems = xmlStr.split('') postCtr = 0 maxBytes = maxFeedItemSizeKb * 1024 - for rssItem in rssItems: - if len(rssItem) > maxBytes: + for atomItem in atomItems: + if len(atomItem) > maxBytes: print('WARN: atom feed item is too big') continue - if '' not in rssItem: + if '<title>' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - title = rssItem.split('')[1] + title = atomItem.split('<title>')[1] title = removeCDATA(title.split('')[0]) description = '' - if '' in rssItem and '' in rssItem: - description = rssItem.split('')[1] + if '' in atomItem and '' in atomItem: + description = atomItem.split('')[1] description = removeCDATA(description.split('')[0]) else: - if '' in rssItem and \ - '' in rssItem: - description = rssItem.split('')[1] + if '' in atomItem and \ + '' in atomItem: + description = atomItem.split('')[1] description = description.split('')[0] description = removeCDATA(description) - link = rssItem.split('')[1] + link = atomItem.split('')[1] link = link.split('')[0] if '://' not in link: continue @@ -274,7 +274,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, itemDomain = itemDomain.split('/')[0] if isBlockedDomain(baseDir, itemDomain): continue - pubDate = rssItem.split('')[1] + pubDate = atomItem.split('')[1] pubDate = pubDate.split('')[0] parsed = False try: @@ -325,42 +325,42 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, if isBlockedDomain(baseDir, 'www.youtube.com'): return {} result = {} - rssItems = xmlStr.split('') + atomItems = xmlStr.split('') postCtr = 0 maxBytes = maxFeedItemSizeKb * 1024 - for rssItem in rssItems: - print('YouTube feed item: ' + rssItem) - if len(rssItem) > maxBytes: + for atomItem in atomItems: + print('YouTube feed item: ' + atomItem) + if len(atomItem) > maxBytes: print('WARN: atom feed item is too big') continue - if '' not in rssItem: + if '<title>' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - if '' not in rssItem: + if '' not in atomItem: continue - title = rssItem.split('')[1] + title = atomItem.split('<title>')[1] title = removeCDATA(title.split('')[0]) description = '' - if '' in rssItem and \ - '' in rssItem: - description = rssItem.split('')[1] + if '' in atomItem and \ + '' in atomItem: + description = atomItem.split('')[1] description = description.split('')[0] description = removeCDATA(description) - elif '' in rssItem and '' in rssItem: - description = rssItem.split('')[1] + elif '' in atomItem and '' in atomItem: + description = atomItem.split('')[1] description = description.split('')[0] description = removeCDATA(description) - link = rssItem.split('')[1] + link = atomItem.split('')[1] link = link.split('')[0] link = 'https://www.youtube.com/watch?v=' + link.strip() - pubDate = rssItem.split('')[1] + pubDate = atomItem.split('')[1] pubDate = pubDate.split('')[0] parsed = False try: @@ -417,7 +417,7 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str, xmlStr, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb) elif '' in xmlStr and '' in xmlStr: - print ('YouTube feed: reading') + print('YouTube feed: reading') return atomFeedYTToDict(baseDir, domain, xmlStr, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb) From 03842778cce2172802bc4c61ea9ace4f511e9c83 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 12:43:22 +0000 Subject: [PATCH 25/50] Warning text --- newswire.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 761446f0e..785adcab4 100644 --- a/newswire.py +++ b/newswire.py @@ -471,7 +471,8 @@ def getRSS(baseDir: str, domain: str, session, url: str, maxPostsPerSource, maxFeedItemSizeKb) else: - print('WARN: feed is too large: ' + url) + print('WARN: feed is too large, ' + + 'or contains invalid characters: ' + url) except requests.exceptions.RequestException as e: print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' + 'headers: ' + str(sessionHeaders) + '\n' + From 64b4e7fbd8cbe9ecd68e6e35b3a0749fd69a82b3 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 13:04:58 +0000 Subject: [PATCH 26/50] Extra warning --- newswire.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/newswire.py b/newswire.py index 785adcab4..6b837b983 100644 --- a/newswire.py +++ b/newswire.py @@ -473,6 +473,8 @@ def getRSS(baseDir: str, domain: str, session, url: str, else: print('WARN: feed is too large, ' + 'or contains invalid characters: ' + url) + else: + print('WARN: no result returned for feed ' + url) except requests.exceptions.RequestException as e: print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' + 'headers: ' + str(sessionHeaders) + '\n' + From 7be81054badcfdd44740445434bf295aaf846058 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 14:08:29 +0000 Subject: [PATCH 27/50] Handling of dates within feeds --- newswire.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/newswire.py b/newswire.py index 6b837b983..199bf59cd 100644 --- a/newswire.py +++ b/newswire.py @@ -203,6 +203,29 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, parsed = True except BaseException: pass + + if not parsed: + try: + publishedDate = \ + datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST") + hoursAdded = datetime.timedelta(hours=5) + publishedDate = publishedDate + hoursAdded + postFilename = '' + votesStatus = [] + addNewswireDictEntry(baseDir, domain, + result, + str(publishedDate) + '00:00', + title, link, + votesStatus, postFilename, + description, moderated, mirrored) + postCtr += 1 + if postCtr >= maxPostsPerSource: + break + parsed = True + except BaseException: + print('WARN: unrecognized RSS date format: ' + pubDate) + pass + if not parsed: try: publishedDate = \ @@ -293,14 +316,15 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, parsed = True except BaseException: pass + if not parsed: try: publishedDate = \ - datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT") + datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%S%z") postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, result, - str(publishedDate) + '+00:00', + str(publishedDate), title, link, votesStatus, postFilename, description, moderated, mirrored) @@ -311,6 +335,28 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, except BaseException: print('WARN: unrecognized atom feed date format: ' + pubDate) pass + + if not parsed: + try: + publishedDate = \ + datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST") + hoursAdded = datetime.timedelta(hours=5) + publishedDate = publishedDate + hoursAdded + postFilename = '' + votesStatus = [] + addNewswireDictEntry(baseDir, domain, + result, + str(publishedDate) + '00:00', + title, link, + votesStatus, postFilename, + description, moderated, mirrored) + postCtr += 1 + if postCtr >= maxPostsPerSource: + break + parsed = True + except BaseException: + print('WARN: unrecognized RSS date format: ' + pubDate) + pass return result @@ -380,6 +426,26 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, except BaseException: print('YouTube feed: failed to parse published date ' + pubDate) pass + + if not parsed: + try: + publishedDate = \ + datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%S%z") + postFilename = '' + votesStatus = [] + addNewswireDictEntry(baseDir, domain, result, + str(publishedDate), + title, link, + votesStatus, postFilename, + description, moderated, mirrored) + postCtr += 1 + if postCtr >= maxPostsPerSource: + break + parsed = True + except BaseException: + print('WARN: unrecognized atom feed date format: ' + pubDate) + pass + if not parsed: try: publishedDate = \ From f42c23d69ff695c73a6a5fb32ea884ec256bca13 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 14:18:25 +0000 Subject: [PATCH 28/50] Make debug messages distinguishable --- newswire.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/newswire.py b/newswire.py index 199bf59cd..f35affd73 100644 --- a/newswire.py +++ b/newswire.py @@ -223,7 +223,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized RSS date format: ' + pubDate) + print('WARN: unrecognized RSS date format 1: ' + pubDate) pass if not parsed: @@ -243,7 +243,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized RSS date format: ' + pubDate) + print('WARN: unrecognized RSS date format 2: ' + pubDate) pass return result @@ -333,7 +333,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized atom feed date format: ' + pubDate) + print('WARN: unrecognized atom feed date format 1: ' + pubDate) pass if not parsed: @@ -355,7 +355,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized RSS date format: ' + pubDate) + print('WARN: unrecognized RSS date format 3: ' + pubDate) pass return result @@ -443,7 +443,7 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized atom feed date format: ' + pubDate) + print('WARN: unrecognized atom feed date format 2: ' + pubDate) pass if not parsed: From 8b7b9d566061c04633417fd1c9a1b907a6949ad3 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 15:08:05 +0000 Subject: [PATCH 29/50] Debug --- newswire.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/newswire.py b/newswire.py index f35affd73..61570fdac 100644 --- a/newswire.py +++ b/newswire.py @@ -223,7 +223,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized RSS date format 1: ' + pubDate) + print('WARN: unrecognized RSS date format EST: ' + pubDate) pass if not parsed: @@ -243,7 +243,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized RSS date format 2: ' + pubDate) + print('WARN: unrecognized RSS date format UT: ' + pubDate) pass return result @@ -315,6 +315,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: + print('WARN: unrecognized atom date format UT: ' + pubDate) pass if not parsed: @@ -333,7 +334,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized atom feed date format 1: ' + pubDate) + print('WARN: unrecognized atom feed date format z: ' + pubDate) pass if not parsed: @@ -355,7 +356,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized RSS date format 3: ' + pubDate) + print('WARN: unrecognized RSS date format EST: ' + pubDate) pass return result @@ -424,7 +425,7 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('YouTube feed: failed to parse published date ' + pubDate) + print('WARN: unrecognized YT atom date format UT: ' + pubDate) pass if not parsed: @@ -443,7 +444,8 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized atom feed date format 2: ' + pubDate) + print('WARN: unrecognized YT atom feed date format z: ' + + pubDate) pass if not parsed: @@ -462,7 +464,7 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('YouTube feed: failed to parse published date ' + + print('WARN: unrecognized YT atom feed date format UT: ' + pubDate) pass return result From b58cdeba8bd884e9373ec96d456f2e358b6ba3be Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 15:25:25 +0000 Subject: [PATCH 30/50] atom --- newswire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 61570fdac..8ccef7a79 100644 --- a/newswire.py +++ b/newswire.py @@ -356,7 +356,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, break parsed = True except BaseException: - print('WARN: unrecognized RSS date format EST: ' + pubDate) + print('WARN: unrecognized atom date format EST: ' + pubDate) pass return result From c937cbdac870ecb55cb66babf51a412132a63e3c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 15:33:11 +0000 Subject: [PATCH 31/50] timedelta import --- newswire.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/newswire.py b/newswire.py index 8ccef7a79..86ff29356 100644 --- a/newswire.py +++ b/newswire.py @@ -11,6 +11,7 @@ import requests from socket import error as SocketError import errno from datetime import datetime +from datetime import timedelta from collections import OrderedDict from utils import firstParagraphFromString from utils import isPublicPost @@ -208,7 +209,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, try: publishedDate = \ datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST") - hoursAdded = datetime.timedelta(hours=5) + hoursAdded = timedelta(hours=5) publishedDate = publishedDate + hoursAdded postFilename = '' votesStatus = [] @@ -341,7 +342,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, try: publishedDate = \ datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST") - hoursAdded = datetime.timedelta(hours=5) + hoursAdded = timedelta(hours=5) publishedDate = publishedDate + hoursAdded postFilename = '' votesStatus = [] From 944fa554e784134042759b5b2b615f8a471125f2 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 15:41:42 +0000 Subject: [PATCH 32/50] Date format check --- webapp_column_right.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/webapp_column_right.py b/webapp_column_right.py index c23d538fa..921a22d41 100644 --- a/webapp_column_right.py +++ b/webapp_column_right.py @@ -214,8 +214,12 @@ def htmlNewswire(baseDir: str, newswire: {}, nickname: str, moderator: bool, item[0] = item[0].split('CDATA[')[1] if ']' in item[0]: item[0] = item[0].split(']')[0] - publishedDate = \ - datetime.strptime(dateStr, "%Y-%m-%d %H:%M:%S%z") + try: + publishedDate = \ + datetime.strptime(dateStr, "%Y-%m-%d %H:%M:%S%z") + except BaseException: + print('WARN: bad date format ' + dateStr) + continue dateShown = publishedDate.strftime("%Y-%m-%d %H:%M") dateStrLink = dateStr.replace('T', ' ') From a7de74bb8145a418153eb3fe0d2e44501e582f48 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 16:02:18 +0000 Subject: [PATCH 33/50] Plus --- newswire.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/newswire.py b/newswire.py index 86ff29356..11f37f6ec 100644 --- a/newswire.py +++ b/newswire.py @@ -213,9 +213,9 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, publishedDate = publishedDate + hoursAdded postFilename = '' votesStatus = [] + pubDateStr = str(publishedDate) + '+00:00' addNewswireDictEntry(baseDir, domain, - result, - str(publishedDate) + '00:00', + result, pubDateStr, title, link, votesStatus, postFilename, description, moderated, mirrored) @@ -346,9 +346,9 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, publishedDate = publishedDate + hoursAdded postFilename = '' votesStatus = [] + pubDateStr = str(publishedDate) + '+00:00' addNewswireDictEntry(baseDir, domain, - result, - str(publishedDate) + '00:00', + result, pubDateStr, title, link, votesStatus, postFilename, description, moderated, mirrored) From e0e31e83303f434ed52e84030b16f22db5fe5ed3 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 16:10:58 +0000 Subject: [PATCH 34/50] Testing sequence for feeds --- newswire.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/newswire.py b/newswire.py index 11f37f6ec..1e5e9998f 100644 --- a/newswire.py +++ b/newswire.py @@ -477,7 +477,12 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str, maxFeedItemSizeKb: int) -> {}: """Converts an xml string to a dictionary """ - if 'rss version="2.0"' in xmlStr: + if '' in xmlStr and '' in xmlStr: + print('YouTube feed: reading') + return atomFeedYTToDict(baseDir, domain, + xmlStr, moderated, mirrored, + maxPostsPerSource, maxFeedItemSizeKb) + elif 'rss version="2.0"' in xmlStr: return xml2StrToDict(baseDir, domain, xmlStr, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb) @@ -485,11 +490,6 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str, return atomFeedToDict(baseDir, domain, xmlStr, moderated, mirrored, maxPostsPerSource, maxFeedItemSizeKb) - elif '' in xmlStr and '' in xmlStr: - print('YouTube feed: reading') - return atomFeedYTToDict(baseDir, domain, - xmlStr, moderated, mirrored, - maxPostsPerSource, maxFeedItemSizeKb) return {} From f892745e8c00e114714cd556b5ee12a8857e314b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 18:14:40 +0000 Subject: [PATCH 35/50] Tidy parsing of feed dates --- newswire.py | 208 ++++++++++++++++------------------------------------ 1 file changed, 62 insertions(+), 146 deletions(-) diff --git a/newswire.py b/newswire.py index 1e5e9998f..2842ba113 100644 --- a/newswire.py +++ b/newswire.py @@ -136,6 +136,59 @@ def addNewswireDictEntry(baseDir: str, domain: str, ] +def parseFeedDate(pubDate: str): + """Returns a date object based on the given date string + This tries a number of formats to see which work + """ + formats = ("%a, %d %b %Y %H:%M:%S %z", + "%a, %d %b %Y %H:%M:%S EST", + "%a, %d %b %Y %H:%M:%S UT", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S%z") + + publishedDate = None + for dateFormat in formats: + if ',' in pubDate and ',' not in dateFormat: + continue + if ',' not in pubDate and ',' in dateFormat: + continue + if '-' in pubDate and '-' not in dateFormat: + continue + if '-' not in pubDate and '-' in dateFormat: + continue + if 'T' in pubDate and 'T' not in dateFormat: + continue + if 'T' not in pubDate and 'T' in dateFormat: + continue + if 'Z' in pubDate and 'Z' not in dateFormat: + continue + if 'Z' not in pubDate and 'Z' in dateFormat: + continue + if 'EST' not in pubDate and 'EST' in dateFormat: + continue + if 'EST' in pubDate and 'EST' not in dateFormat: + continue + if 'UT' not in pubDate and 'UT' in dateFormat: + continue + if 'UT' in pubDate and 'UT' not in dateFormat: + continue + + try: + publishedDate = \ + datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST") + except BaseException: + print('WARN: unrecognized date format: ' + + pubDate + ' ' + dateFormat) + continue + + if publishedDate: + if pubDate.endswith(' EST'): + hoursAdded = timedelta(hours=5) + publishedDate = publishedDate + hoursAdded + break + return publishedDate + + def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, moderated: bool, mirrored: bool, maxPostsPerSource: int, @@ -187,10 +240,9 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, continue pubDate = rssItem.split('')[1] pubDate = pubDate.split('')[0] - parsed = False - try: - publishedDate = \ - datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z") + + publishedDate = parseFeedDate(pubDate) + if publishedDate: postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -201,51 +253,6 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, postCtr += 1 if postCtr >= maxPostsPerSource: break - parsed = True - except BaseException: - pass - - if not parsed: - try: - publishedDate = \ - datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST") - hoursAdded = timedelta(hours=5) - publishedDate = publishedDate + hoursAdded - postFilename = '' - votesStatus = [] - pubDateStr = str(publishedDate) + '+00:00' - addNewswireDictEntry(baseDir, domain, - result, pubDateStr, - title, link, - votesStatus, postFilename, - description, moderated, mirrored) - postCtr += 1 - if postCtr >= maxPostsPerSource: - break - parsed = True - except BaseException: - print('WARN: unrecognized RSS date format EST: ' + pubDate) - pass - - if not parsed: - try: - publishedDate = \ - datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT") - postFilename = '' - votesStatus = [] - addNewswireDictEntry(baseDir, domain, - result, - str(publishedDate) + '+00:00', - title, link, - votesStatus, postFilename, - description, moderated, mirrored) - postCtr += 1 - if postCtr >= maxPostsPerSource: - break - parsed = True - except BaseException: - print('WARN: unrecognized RSS date format UT: ' + pubDate) - pass return result @@ -300,10 +307,9 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, continue pubDate = atomItem.split('')[1] pubDate = pubDate.split('')[0] - parsed = False - try: - publishedDate = \ - datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%SZ") + + publishedDate = parseFeedDate(pubDate) + if publishedDate: postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -314,51 +320,6 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, postCtr += 1 if postCtr >= maxPostsPerSource: break - parsed = True - except BaseException: - print('WARN: unrecognized atom date format UT: ' + pubDate) - pass - - if not parsed: - try: - publishedDate = \ - datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%S%z") - postFilename = '' - votesStatus = [] - addNewswireDictEntry(baseDir, domain, result, - str(publishedDate), - title, link, - votesStatus, postFilename, - description, moderated, mirrored) - postCtr += 1 - if postCtr >= maxPostsPerSource: - break - parsed = True - except BaseException: - print('WARN: unrecognized atom feed date format z: ' + pubDate) - pass - - if not parsed: - try: - publishedDate = \ - datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST") - hoursAdded = timedelta(hours=5) - publishedDate = publishedDate + hoursAdded - postFilename = '' - votesStatus = [] - pubDateStr = str(publishedDate) + '+00:00' - addNewswireDictEntry(baseDir, domain, - result, pubDateStr, - title, link, - votesStatus, postFilename, - description, moderated, mirrored) - postCtr += 1 - if postCtr >= maxPostsPerSource: - break - parsed = True - except BaseException: - print('WARN: unrecognized atom date format EST: ' + pubDate) - pass return result @@ -410,10 +371,9 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, link = 'https://www.youtube.com/watch?v=' + link.strip() pubDate = atomItem.split('')[1] pubDate = pubDate.split('')[0] - parsed = False - try: - publishedDate = \ - datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%SZ") + + publishedDate = parseFeedDate(pubDate) + if publishedDate: postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -424,50 +384,6 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, postCtr += 1 if postCtr >= maxPostsPerSource: break - parsed = True - except BaseException: - print('WARN: unrecognized YT atom date format UT: ' + pubDate) - pass - - if not parsed: - try: - publishedDate = \ - datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%S%z") - postFilename = '' - votesStatus = [] - addNewswireDictEntry(baseDir, domain, result, - str(publishedDate), - title, link, - votesStatus, postFilename, - description, moderated, mirrored) - postCtr += 1 - if postCtr >= maxPostsPerSource: - break - parsed = True - except BaseException: - print('WARN: unrecognized YT atom feed date format z: ' + - pubDate) - pass - - if not parsed: - try: - publishedDate = \ - datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT") - postFilename = '' - votesStatus = [] - addNewswireDictEntry(baseDir, domain, result, - str(publishedDate) + '+00:00', - title, link, - votesStatus, postFilename, - description, moderated, mirrored) - postCtr += 1 - if postCtr >= maxPostsPerSource: - break - parsed = True - except BaseException: - print('WARN: unrecognized YT atom feed date format UT: ' + - pubDate) - pass return result From e58effe89dd8b3e371cefa2d8ec94e51f5a6274b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 18:21:13 +0000 Subject: [PATCH 36/50] Append UTC --- newswire.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 2842ba113..80c83b188 100644 --- a/newswire.py +++ b/newswire.py @@ -243,10 +243,13 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, publishedDate = parseFeedDate(pubDate) if publishedDate: + pubDateStr = str(publishedDate) + '+00:00' + if not pubDateStr.endswith('+00:00'): + pubDateStr += '+00:00' postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, - result, str(publishedDate), + result, pubDateStr, title, link, votesStatus, postFilename, description, moderated, mirrored) From 669a70d0d2fcd334b1c5d920de5369cc3df3558b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 18:25:05 +0000 Subject: [PATCH 37/50] No default append to feed date --- newswire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 80c83b188..5ef11b0c2 100644 --- a/newswire.py +++ b/newswire.py @@ -243,7 +243,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, publishedDate = parseFeedDate(pubDate) if publishedDate: - pubDateStr = str(publishedDate) + '+00:00' + pubDateStr = str(publishedDate) if not pubDateStr.endswith('+00:00'): pubDateStr += '+00:00' postFilename = '' From d505132b179cb637ccd4f66e756b3a5dfb5b63d2 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 18:28:27 +0000 Subject: [PATCH 38/50] Published date --- newswire.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/newswire.py b/newswire.py index 5ef11b0c2..c79a47c8a 100644 --- a/newswire.py +++ b/newswire.py @@ -313,10 +313,13 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, publishedDate = parseFeedDate(pubDate) if publishedDate: + pubDateStr = str(publishedDate) + if not pubDateStr.endswith('+00:00'): + pubDateStr += '+00:00' postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, - result, str(publishedDate), + result, pubDateStr, title, link, votesStatus, postFilename, description, moderated, mirrored) @@ -377,10 +380,13 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, publishedDate = parseFeedDate(pubDate) if publishedDate: + pubDateStr = str(publishedDate) + if not pubDateStr.endswith('+00:00'): + pubDateStr += '+00:00' postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, - result, str(publishedDate), + result, pubDateStr, title, link, votesStatus, postFilename, description, moderated, mirrored) From b90abc689122b618c43ee90891330c7ad461ea90 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 18:30:59 +0000 Subject: [PATCH 39/50] Debug --- newswire.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/newswire.py b/newswire.py index c79a47c8a..ba38af996 100644 --- a/newswire.py +++ b/newswire.py @@ -246,6 +246,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, pubDateStr = str(publishedDate) if not pubDateStr.endswith('+00:00'): pubDateStr += '+00:00' + print('Feed date: ' + pubDateStr) postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -316,6 +317,7 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, pubDateStr = str(publishedDate) if not pubDateStr.endswith('+00:00'): pubDateStr += '+00:00' + print('Feed date: ' + pubDateStr) postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -383,6 +385,7 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, pubDateStr = str(publishedDate) if not pubDateStr.endswith('+00:00'): pubDateStr += '+00:00' + print('Feed date: ' + pubDateStr) postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, From 86997a138000176a48c5598d2138f91f4406384b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 18:36:17 +0000 Subject: [PATCH 40/50] Debug --- newswire.py | 1 + 1 file changed, 1 insertion(+) diff --git a/newswire.py b/newswire.py index ba38af996..29812d72c 100644 --- a/newswire.py +++ b/newswire.py @@ -186,6 +186,7 @@ def parseFeedDate(pubDate: str): hoursAdded = timedelta(hours=5) publishedDate = publishedDate + hoursAdded break + print('Feed date: ' + pubDate + ' ' + str(publishedDate)) return publishedDate From 31b12ae207c4aadcad516f51b71e936fa087985c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 18:43:01 +0000 Subject: [PATCH 41/50] Unit test for date parsing --- newswire.py | 2 +- tests.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 29812d72c..1c5c4a613 100644 --- a/newswire.py +++ b/newswire.py @@ -175,7 +175,7 @@ def parseFeedDate(pubDate: str): try: publishedDate = \ - datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST") + datetime.strptime(pubDate, dateFormat) except BaseException: print('WARN: unrecognized date format: ' + pubDate + ' ' + dateFormat) diff --git a/tests.py b/tests.py index 41c1a0550..59c0259e2 100644 --- a/tests.py +++ b/tests.py @@ -86,6 +86,7 @@ from jsonldsig import jsonldVerify from newsdaemon import hashtagRuleTree from newsdaemon import hashtagRuleResolve from newswire import getNewswireTags +from newswire import parseFeedDate testServerAliceRunning = False testServerBobRunning = False @@ -2385,8 +2386,17 @@ def testFirstParagraphFromString(): assert resultStr == testStr +def testParseFeedDate(): + print('testParseFeedDate') + pubDate = "2020-08-27T16:12:34+00:00" + publishedDate = parseFeedDate(pubDate) + assert publishedDate + print(str(publishedDate)) + + def runAllTests(): print('Running tests...') + testParseFeedDate() testFirstParagraphFromString() testGetNewswireTags() testHashtagRuleTree() From 61beedd86e1ced4bd48a9f43126cd341f38625d2 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 18:46:10 +0000 Subject: [PATCH 42/50] Remove debug --- newswire.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/newswire.py b/newswire.py index 1c5c4a613..b9676af8b 100644 --- a/newswire.py +++ b/newswire.py @@ -186,7 +186,6 @@ def parseFeedDate(pubDate: str): hoursAdded = timedelta(hours=5) publishedDate = publishedDate + hoursAdded break - print('Feed date: ' + pubDate + ' ' + str(publishedDate)) return publishedDate @@ -247,7 +246,6 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, pubDateStr = str(publishedDate) if not pubDateStr.endswith('+00:00'): pubDateStr += '+00:00' - print('Feed date: ' + pubDateStr) postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -318,7 +316,6 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, pubDateStr = str(publishedDate) if not pubDateStr.endswith('+00:00'): pubDateStr += '+00:00' - print('Feed date: ' + pubDateStr) postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -386,7 +383,6 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, pubDateStr = str(publishedDate) if not pubDateStr.endswith('+00:00'): pubDateStr += '+00:00' - print('Feed date: ' + pubDateStr) postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, From fb29da5f7a2379b3391aa4785aa9cb9c05184de9 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 19:01:18 +0000 Subject: [PATCH 43/50] Date parser returns string --- newswire.py | 34 ++++++++++++++++------------------ tests.py | 6 +++++- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/newswire.py b/newswire.py index b9676af8b..825009eb1 100644 --- a/newswire.py +++ b/newswire.py @@ -136,8 +136,8 @@ def addNewswireDictEntry(baseDir: str, domain: str, ] -def parseFeedDate(pubDate: str): - """Returns a date object based on the given date string +def parseFeedDate(pubDate: str) -> str: + """Returns a UTC date string based on the given date string This tries a number of formats to see which work """ formats = ("%a, %d %b %Y %H:%M:%S %z", @@ -186,7 +186,14 @@ def parseFeedDate(pubDate: str): hoursAdded = timedelta(hours=5) publishedDate = publishedDate + hoursAdded break - return publishedDate + + pubDateStr = None + if publishedDate: + pubDateStr = str(publishedDate) + if not pubDateStr.endswith('+00:00'): + pubDateStr += '+00:00' + + return pubDateStr def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, @@ -241,11 +248,8 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, pubDate = rssItem.split('')[1] pubDate = pubDate.split('')[0] - publishedDate = parseFeedDate(pubDate) - if publishedDate: - pubDateStr = str(publishedDate) - if not pubDateStr.endswith('+00:00'): - pubDateStr += '+00:00' + pubDateStr = parseFeedDate(pubDate) + if pubDateStr: postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -311,11 +315,8 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, pubDate = atomItem.split('')[1] pubDate = pubDate.split('')[0] - publishedDate = parseFeedDate(pubDate) - if publishedDate: - pubDateStr = str(publishedDate) - if not pubDateStr.endswith('+00:00'): - pubDateStr += '+00:00' + pubDateStr = parseFeedDate(pubDate) + if pubDateStr: postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, @@ -378,11 +379,8 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, pubDate = atomItem.split('')[1] pubDate = pubDate.split('')[0] - publishedDate = parseFeedDate(pubDate) - if publishedDate: - pubDateStr = str(publishedDate) - if not pubDateStr.endswith('+00:00'): - pubDateStr += '+00:00' + pubDateStr = parseFeedDate(pubDate) + if pubDateStr: postFilename = '' votesStatus = [] addNewswireDictEntry(baseDir, domain, diff --git a/tests.py b/tests.py index 59c0259e2..23efc6c13 100644 --- a/tests.py +++ b/tests.py @@ -2391,7 +2391,11 @@ def testParseFeedDate(): pubDate = "2020-08-27T16:12:34+00:00" publishedDate = parseFeedDate(pubDate) assert publishedDate - print(str(publishedDate)) + + pubDate = "Sun, 22 Nov 2020 19:51:33 +0100" + publishedDate = parseFeedDate(pubDate) + # print(str(publishedDate)) + assert publishedDate def runAllTests(): From 6f5d5f1019b265c6b4802315bb16ab7816a5e839 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 19:09:35 +0000 Subject: [PATCH 44/50] Convert local dates to utc --- newswire.py | 3 +++ tests.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 825009eb1..b6af30cc9 100644 --- a/newswire.py +++ b/newswire.py @@ -12,6 +12,7 @@ from socket import error as SocketError import errno from datetime import datetime from datetime import timedelta +from datetime import timezone from collections import OrderedDict from utils import firstParagraphFromString from utils import isPublicPost @@ -189,6 +190,8 @@ def parseFeedDate(pubDate: str) -> str: pubDateStr = None if publishedDate: + # convert local date to UTC + publishedDate = publishedDate.replace(tzinfo=timezone.utc) pubDateStr = str(publishedDate) if not pubDateStr.endswith('+00:00'): pubDateStr += '+00:00' diff --git a/tests.py b/tests.py index 23efc6c13..7b4a85fd3 100644 --- a/tests.py +++ b/tests.py @@ -2394,13 +2394,14 @@ def testParseFeedDate(): pubDate = "Sun, 22 Nov 2020 19:51:33 +0100" publishedDate = parseFeedDate(pubDate) - # print(str(publishedDate)) + print(str(publishedDate)) assert publishedDate def runAllTests(): print('Running tests...') testParseFeedDate() + return testFirstParagraphFromString() testGetNewswireTags() testHashtagRuleTree() From fb5cb31ba10bea3d1a6acbcb94c19c914b1f0e82 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 19:13:41 +0000 Subject: [PATCH 45/50] Improve date parsing tests --- tests.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests.py b/tests.py index 7b4a85fd3..126b3a727 100644 --- a/tests.py +++ b/tests.py @@ -2390,18 +2390,16 @@ def testParseFeedDate(): print('testParseFeedDate') pubDate = "2020-08-27T16:12:34+00:00" publishedDate = parseFeedDate(pubDate) - assert publishedDate + assert publishedDate == "2020-08-27 16:12:34+00:00" pubDate = "Sun, 22 Nov 2020 19:51:33 +0100" publishedDate = parseFeedDate(pubDate) - print(str(publishedDate)) - assert publishedDate + assert publishedDate == "2020-11-22 19:51:33+00:00" def runAllTests(): print('Running tests...') testParseFeedDate() - return testFirstParagraphFromString() testGetNewswireTags() testHashtagRuleTree() From 7703a002cd25eae4622974f1e7434d3e33be412f Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 20:18:10 +0000 Subject: [PATCH 46/50] utc offset adjust --- newswire.py | 1 + tests.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index b6af30cc9..50beca8b1 100644 --- a/newswire.py +++ b/newswire.py @@ -190,6 +190,7 @@ def parseFeedDate(pubDate: str) -> str: pubDateStr = None if publishedDate: + publishedDate = publishedDate - publishedDate.utcoffset() # convert local date to UTC publishedDate = publishedDate.replace(tzinfo=timezone.utc) pubDateStr = str(publishedDate) diff --git a/tests.py b/tests.py index 126b3a727..14d453c07 100644 --- a/tests.py +++ b/tests.py @@ -2394,12 +2394,13 @@ def testParseFeedDate(): pubDate = "Sun, 22 Nov 2020 19:51:33 +0100" publishedDate = parseFeedDate(pubDate) - assert publishedDate == "2020-11-22 19:51:33+00:00" + assert publishedDate == "2020-11-22 18:51:33+00:00" def runAllTests(): print('Running tests...') testParseFeedDate() + return testFirstParagraphFromString() testGetNewswireTags() testHashtagRuleTree() From 38121317584d3e74359af40a65fa2a7011d7a340 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 20:28:32 +0000 Subject: [PATCH 47/50] Test --- newsdaemon.py | 24 ++++++++++++------------ tests.py | 1 - 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/newsdaemon.py b/newsdaemon.py index 82b4757af..ecdc252f8 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -712,18 +712,18 @@ def runNewswireDaemon(baseDir: str, httpd, # try to update the feeds newNewswire = None - try: - newNewswire = \ - getDictFromNewswire(httpd.session, baseDir, domain, - httpd.maxNewswirePostsPerSource, - httpd.maxNewswireFeedSizeKb, - httpd.maxTags, - httpd.maxFeedItemSizeKb, - httpd.maxNewswirePosts) - except Exception as e: - print('WARN: unable to update newswire ' + str(e)) - time.sleep(120) - continue + # try: + newNewswire = \ + getDictFromNewswire(httpd.session, baseDir, domain, + httpd.maxNewswirePostsPerSource, + httpd.maxNewswireFeedSizeKb, + httpd.maxTags, + httpd.maxFeedItemSizeKb, + httpd.maxNewswirePosts) + # except Exception as e: + # print('WARN: unable to update newswire ' + str(e)) + # time.sleep(120) + # continue if not httpd.newswire: if os.path.isfile(newswireStateFilename): diff --git a/tests.py b/tests.py index 14d453c07..4b86d499d 100644 --- a/tests.py +++ b/tests.py @@ -2400,7 +2400,6 @@ def testParseFeedDate(): def runAllTests(): print('Running tests...') testParseFeedDate() - return testFirstParagraphFromString() testGetNewswireTags() testHashtagRuleTree() From 758380dadece513c7a3eff54ef6a0939d6ac4eed Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 20:33:24 +0000 Subject: [PATCH 48/50] Calculate offset --- newswire.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 50beca8b1..4c86ac597 100644 --- a/newswire.py +++ b/newswire.py @@ -190,7 +190,8 @@ def parseFeedDate(pubDate: str) -> str: pubDateStr = None if publishedDate: - publishedDate = publishedDate - publishedDate.utcoffset() + offset = publishedDate.utcoffset() + publishedDate = publishedDate - offset # convert local date to UTC publishedDate = publishedDate.replace(tzinfo=timezone.utc) pubDateStr = str(publishedDate) From a998c0b3213f3493da365e5212c739569335fbd0 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 20:37:08 +0000 Subject: [PATCH 49/50] Check that offset exists --- newswire.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/newswire.py b/newswire.py index 4c86ac597..2e0ab8aaf 100644 --- a/newswire.py +++ b/newswire.py @@ -191,7 +191,8 @@ def parseFeedDate(pubDate: str) -> str: pubDateStr = None if publishedDate: offset = publishedDate.utcoffset() - publishedDate = publishedDate - offset + if offset: + publishedDate = publishedDate - offset # convert local date to UTC publishedDate = publishedDate.replace(tzinfo=timezone.utc) pubDateStr = str(publishedDate) From 88cc48480f3a1d2c0e6abfc01434339b0a089c6c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 22 Nov 2020 20:41:01 +0000 Subject: [PATCH 50/50] Tidying --- newsdaemon.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/newsdaemon.py b/newsdaemon.py index ecdc252f8..a08f0e200 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -711,8 +711,6 @@ def runNewswireDaemon(baseDir: str, httpd, print('Newswire daemon session established') # try to update the feeds - newNewswire = None - # try: newNewswire = \ getDictFromNewswire(httpd.session, baseDir, domain, httpd.maxNewswirePostsPerSource, @@ -720,10 +718,6 @@ def runNewswireDaemon(baseDir: str, httpd, httpd.maxTags, httpd.maxFeedItemSizeKb, httpd.maxNewswirePosts) - # except Exception as e: - # print('WARN: unable to update newswire ' + str(e)) - # time.sleep(120) - # continue if not httpd.newswire: if os.path.isfile(newswireStateFilename):