From b6c2067c714de5765fb790fded960ecac37b3fb6 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Wed, 30 Mar 2022 19:13:40 +0100 Subject: [PATCH] Handle fractional seconds within post published date --- inbox.py | 11 ++++++++--- newswire.py | 17 +++++++++++++++++ posts.py | 4 ++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/inbox.py b/inbox.py index fe8196713..ff00fecd9 100644 --- a/inbox.py +++ b/inbox.py @@ -2306,11 +2306,16 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str, if not message_json['object'].get('published'): return False - if 'T' not in message_json['object']['published']: + published = message_json['object']['published'] + if 'T' not in published: return False - if 'Z' not in message_json['object']['published']: + if 'Z' not in published: return False - if not valid_post_date(message_json['object']['published'], 90, debug): + if '.' in published: + # converts 2022-03-30T17:37:58.734Z into 2022-03-30T17:37:58Z + published = published.split('.')[0] + 'Z' + message_json['object']['published'] = published + if not valid_post_date(published, 90, debug): return False summary = None diff --git a/newswire.py b/newswire.py index 36097404d..e5f4a1740 100644 --- a/newswire.py +++ b/newswire.py @@ -268,6 +268,14 @@ def _valid_feed_date(pub_date: str, debug: bool = False) -> bool: # convert from YY-MM-DD HH:MM:SS+00:00 to # YY-MM-DDTHH:MM:SSZ post_date = pub_date.replace(' ', 'T').replace('+00:00', 'Z') + if '.' in post_date: + ending = post_date.split('.')[1] + timezone_str = '' + for ending_char in ending: + if not ending_char.isdigit(): + timezone_str += ending_char + if timezone_str: + post_date = post_date.split('.')[0] + timezone_str return valid_post_date(post_date, 90, debug) @@ -320,6 +328,15 @@ def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str: if 'UT' in pub_date and 'UT' not in date_format: continue + # remove any fraction of a second + if '.' in pub_date: + ending = pub_date.split('.')[1] + timezone_str = '' + for ending_char in ending: + if not ending_char.isdigit(): + timezone_str += ending_char + if timezone_str: + pub_date = pub_date.split('.')[0] + timezone_str try: published_date = datetime.strptime(pub_date, date_format) except BaseException: diff --git a/posts.py b/posts.py index f587f5d3d..d53778500 100644 --- a/posts.py +++ b/posts.py @@ -4926,6 +4926,10 @@ def download_announce(session, base_dir: str, http_prefix: str, base_dir, nickname, domain, post_id, recent_posts_cache) return None + if '.' in announced_json['published'] and \ + 'Z' in announced_json['published']: + announced_json['published'] = \ + announced_json['published'].split('.')[0] + 'Z' if not valid_post_date(announced_json['published'], 90, debug): print('WARN: announced post is not recently published ' + str(announced_json))