Avoid collisions between newswire feed items that are published exactly on the hour

merge-requests/30/head
Bob Mottram 2022-01-27 10:42:46 +00:00
parent 401aa0712c
commit e68e2b56c7
2 changed files with 36 additions and 12 deletions

View File

@ -10,6 +10,7 @@ __module_group__ = "Web Interface Columns"
import os import os
import json import json
import requests import requests
import random
from socket import error as SocketError from socket import error as SocketError
import errno import errno
from datetime import datetime from datetime import datetime
@ -268,10 +269,21 @@ def _valid_feed_date(pub_date: str, debug: bool = False) -> bool:
return valid_post_date(post_date, 90, debug) return valid_post_date(post_date, 90, debug)
def parse_feed_date(pub_date: str) -> str: def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str:
"""Returns a UTC date string based on the given date string """Returns a UTC date string based on the given date string
This tries a number of formats to see which work This tries a number of formats to see which work
""" """
if ':00:00' in pub_date:
# If this was published exactly on the hour then assign a
# random minute and second to make this item relatively unique
randgen = random.Random(unique_string_identifier)
rand_min = randgen.randint(0, 59)
rand_sec = randgen.randint(0, 59)
replace_time_str = \
':' + str(rand_min).zfill(2) + ':' + str(rand_sec).zfill(2)
pub_date = pub_date.replace(':00:00', replace_time_str)
formats = ("%a, %d %b %Y %H:%M:%S %z", formats = ("%a, %d %b %Y %H:%M:%S %z",
"%a, %d %b %Y %H:%M:%S Z", "%a, %d %b %Y %H:%M:%S Z",
"%a, %d %b %Y %H:%M:%S GMT", "%a, %d %b %Y %H:%M:%S GMT",
@ -663,7 +675,8 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
pub_date = rss_item.split('<pubDate>')[1] pub_date = rss_item.split('<pubDate>')[1]
pub_date = pub_date.split('</pubDate>')[0] pub_date = pub_date.split('</pubDate>')[0]
pub_date_str = parse_feed_date(pub_date) unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str: if pub_date_str:
if _valid_feed_date(pub_date_str): if _valid_feed_date(pub_date_str):
post_filename = '' post_filename = ''
@ -758,7 +771,8 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
pub_date = rss_item.split('<dc:date>')[1] pub_date = rss_item.split('<dc:date>')[1]
pub_date = pub_date.split('</dc:date>')[0] pub_date = pub_date.split('</dc:date>')[0]
pub_date_str = parse_feed_date(pub_date) unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str: if pub_date_str:
if _valid_feed_date(pub_date_str): if _valid_feed_date(pub_date_str):
post_filename = '' post_filename = ''
@ -841,7 +855,8 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
pub_date = atom_item.split('<updated>')[1] pub_date = atom_item.split('<updated>')[1]
pub_date = pub_date.split('</updated>')[0] pub_date = pub_date.split('</updated>')[0]
pub_date_str = parse_feed_date(pub_date) unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str: if pub_date_str:
if _valid_feed_date(pub_date_str): if _valid_feed_date(pub_date_str):
post_filename = '' post_filename = ''
@ -956,7 +971,8 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
continue continue
pub_date = json_feed_item['date_modified'] pub_date = json_feed_item['date_modified']
pub_date_str = parse_feed_date(pub_date) unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str: if pub_date_str:
if _valid_feed_date(pub_date_str): if _valid_feed_date(pub_date_str):
post_filename = '' post_filename = ''
@ -1040,7 +1056,8 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
pub_date = atom_item.split('<published>')[1] pub_date = atom_item.split('<published>')[1]
pub_date = pub_date.split('</published>')[0] pub_date = pub_date.split('</published>')[0]
pub_date_str = parse_feed_date(pub_date) unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str: if pub_date_str:
if _valid_feed_date(pub_date_str): if _valid_feed_date(pub_date_str):
post_filename = '' post_filename = ''

View File

@ -4233,25 +4233,32 @@ def _test_first_paragraph_from_string():
assert result_str == test_str assert result_str == test_str
def _test_parse_feed_date(): def _test_parse_newswire_feed_date():
print('test_parse_feed_date') print('test_parse_feed_date')
unique_string_identifier = 'some string abcd'
pub_date = "2020-12-14T00:08:06+00:00" pub_date = "2020-12-14T00:08:06+00:00"
published_date = parse_feed_date(pub_date) published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date == "2020-12-14 00:08:06+00:00" assert published_date == "2020-12-14 00:08:06+00:00"
pub_date = "Tue, 08 Dec 2020 06:24:38 -0600" pub_date = "Tue, 08 Dec 2020 06:24:38 -0600"
published_date = parse_feed_date(pub_date) published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date == "2020-12-08 12:24:38+00:00" assert published_date == "2020-12-08 12:24:38+00:00"
pub_date = "2020-08-27T16:12:34+00:00" pub_date = "2020-08-27T16:12:34+00:00"
published_date = parse_feed_date(pub_date) published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date == "2020-08-27 16:12:34+00:00" assert published_date == "2020-08-27 16:12:34+00:00"
pub_date = "Sun, 22 Nov 2020 19:51:33 +0100" pub_date = "Sun, 22 Nov 2020 19:51:33 +0100"
published_date = parse_feed_date(pub_date) published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date == "2020-11-22 18:51:33+00:00" assert published_date == "2020-11-22 18:51:33+00:00"
pub_date = "Sun, 22 Nov 2020 00:00:00 +0000"
published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date != "2020-11-22 00:00:00+00:00"
assert "2020-11-22 00:" in published_date
def _test_valid_nick(): def _test_valid_nick():
print('test_valid_nickname') print('test_valid_nickname')
@ -6576,7 +6583,7 @@ def run_all_tests():
_test_mentioned_people(base_dir) _test_mentioned_people(base_dir)
_test_guess_tag_category() _test_guess_tag_category()
_test_valid_nick() _test_valid_nick()
_test_parse_feed_date() _test_parse_newswire_feed_date()
_test_first_paragraph_from_string() _test_first_paragraph_from_string()
_test_newswire_tags() _test_newswire_tags()
_test_hashtag_rules() _test_hashtag_rules()