diff --git a/newswire.py b/newswire.py index 11867ba2d..3d06bea12 100644 --- a/newswire.py +++ b/newswire.py @@ -10,6 +10,7 @@ __module_group__ = "Web Interface Columns" import os import json import requests +import random from socket import error as SocketError import errno from datetime import datetime @@ -268,10 +269,21 @@ def _valid_feed_date(pub_date: str, debug: bool = False) -> bool: return valid_post_date(post_date, 90, debug) -def parse_feed_date(pub_date: str) -> str: +def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str: """Returns a UTC date string based on the given date string This tries a number of formats to see which work """ + + if ':00:00' in pub_date: + # If this was published exactly on the hour then assign a + # random minute and second to make this item relatively unique + randgen = random.Random(unique_string_identifier) + rand_min = randgen.randint(0, 59) + rand_sec = randgen.randint(0, 59) + replace_time_str = \ + ':' + str(rand_min).zfill(2) + ':' + str(rand_sec).zfill(2) + pub_date = pub_date.replace(':00:00', replace_time_str) + formats = ("%a, %d %b %Y %H:%M:%S %z", "%a, %d %b %Y %H:%M:%S Z", "%a, %d %b %Y %H:%M:%S GMT", @@ -668,7 +680,8 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, pub_date = rss_item.split('')[1] pub_date = pub_date.split('')[0] - pub_date_str = parse_feed_date(pub_date) + unique_string_identifier = title + ' ' + link + pub_date_str = parse_feed_date(pub_date, unique_string_identifier) if pub_date_str: if _valid_feed_date(pub_date_str): post_filename = '' @@ -763,7 +776,8 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str, pub_date = rss_item.split('')[1] pub_date = pub_date.split('')[0] - pub_date_str = parse_feed_date(pub_date) + unique_string_identifier = title + ' ' + link + pub_date_str = parse_feed_date(pub_date, unique_string_identifier) if pub_date_str: if _valid_feed_date(pub_date_str): post_filename = '' @@ -846,7 +860,8 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str, pub_date = atom_item.split('')[1] pub_date = pub_date.split('')[0] - pub_date_str = parse_feed_date(pub_date) + unique_string_identifier = title + ' ' + link + pub_date_str = parse_feed_date(pub_date, unique_string_identifier) if pub_date_str: if _valid_feed_date(pub_date_str): post_filename = '' @@ -961,7 +976,8 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str, continue pub_date = json_feed_item['date_modified'] - pub_date_str = parse_feed_date(pub_date) + unique_string_identifier = title + ' ' + link + pub_date_str = parse_feed_date(pub_date, unique_string_identifier) if pub_date_str: if _valid_feed_date(pub_date_str): post_filename = '' @@ -1045,7 +1061,8 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, pub_date = atom_item.split('')[1] pub_date = pub_date.split('')[0] - pub_date_str = parse_feed_date(pub_date) + unique_string_identifier = title + ' ' + link + pub_date_str = parse_feed_date(pub_date, unique_string_identifier) if pub_date_str: if _valid_feed_date(pub_date_str): post_filename = '' diff --git a/tests.py b/tests.py index 635ef73cc..3df4f5c56 100644 --- a/tests.py +++ b/tests.py @@ -4233,25 +4233,32 @@ def _test_first_paragraph_from_string(): assert result_str == test_str -def _test_parse_feed_date(): +def _test_parse_newswire_feed_date(): print('test_parse_feed_date') + unique_string_identifier = 'some string abcd' + pub_date = "2020-12-14T00:08:06+00:00" - published_date = parse_feed_date(pub_date) + published_date = parse_feed_date(pub_date, unique_string_identifier) assert published_date == "2020-12-14 00:08:06+00:00" pub_date = "Tue, 08 Dec 2020 06:24:38 -0600" - published_date = parse_feed_date(pub_date) + published_date = parse_feed_date(pub_date, unique_string_identifier) assert published_date == "2020-12-08 12:24:38+00:00" pub_date = "2020-08-27T16:12:34+00:00" - published_date = parse_feed_date(pub_date) + published_date = parse_feed_date(pub_date, unique_string_identifier) assert published_date == "2020-08-27 16:12:34+00:00" pub_date = "Sun, 22 Nov 2020 19:51:33 +0100" - published_date = parse_feed_date(pub_date) + published_date = parse_feed_date(pub_date, unique_string_identifier) assert published_date == "2020-11-22 18:51:33+00:00" + pub_date = "Sun, 22 Nov 2020 00:00:00 +0000" + published_date = parse_feed_date(pub_date, unique_string_identifier) + assert published_date != "2020-11-22 00:00:00+00:00" + assert "2020-11-22 00:" in published_date + def _test_valid_nick(): print('test_valid_nickname') @@ -6576,7 +6583,7 @@ def run_all_tests(): _test_mentioned_people(base_dir) _test_guess_tag_category() _test_valid_nick() - _test_parse_feed_date() + _test_parse_newswire_feed_date() _test_first_paragraph_from_string() _test_newswire_tags() _test_hashtag_rules()