Avoid collisions between newswire feed items that are published exactly on the hour

merge-requests/26/head
Bob Mottram 2022-01-27 10:42:46 +00:00
parent 401aa0712c
commit e68e2b56c7
2 changed files with 36 additions and 12 deletions

View File

@ -10,6 +10,7 @@ __module_group__ = "Web Interface Columns"
import os
import json
import requests
import random
from socket import error as SocketError
import errno
from datetime import datetime
@ -268,10 +269,21 @@ def _valid_feed_date(pub_date: str, debug: bool = False) -> bool:
return valid_post_date(post_date, 90, debug)
def parse_feed_date(pub_date: str) -> str:
def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str:
"""Returns a UTC date string based on the given date string
This tries a number of formats to see which work
"""
if ':00:00' in pub_date:
# If this was published exactly on the hour then assign a
# random minute and second to make this item relatively unique
randgen = random.Random(unique_string_identifier)
rand_min = randgen.randint(0, 59)
rand_sec = randgen.randint(0, 59)
replace_time_str = \
':' + str(rand_min).zfill(2) + ':' + str(rand_sec).zfill(2)
pub_date = pub_date.replace(':00:00', replace_time_str)
formats = ("%a, %d %b %Y %H:%M:%S %z",
"%a, %d %b %Y %H:%M:%S Z",
"%a, %d %b %Y %H:%M:%S GMT",
@ -663,7 +675,8 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
pub_date = rss_item.split('<pubDate>')[1]
pub_date = pub_date.split('</pubDate>')[0]
pub_date_str = parse_feed_date(pub_date)
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str:
if _valid_feed_date(pub_date_str):
post_filename = ''
@ -758,7 +771,8 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
pub_date = rss_item.split('<dc:date>')[1]
pub_date = pub_date.split('</dc:date>')[0]
pub_date_str = parse_feed_date(pub_date)
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str:
if _valid_feed_date(pub_date_str):
post_filename = ''
@ -841,7 +855,8 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
pub_date = atom_item.split('<updated>')[1]
pub_date = pub_date.split('</updated>')[0]
pub_date_str = parse_feed_date(pub_date)
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str:
if _valid_feed_date(pub_date_str):
post_filename = ''
@ -956,7 +971,8 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
continue
pub_date = json_feed_item['date_modified']
pub_date_str = parse_feed_date(pub_date)
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str:
if _valid_feed_date(pub_date_str):
post_filename = ''
@ -1040,7 +1056,8 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
pub_date = atom_item.split('<published>')[1]
pub_date = pub_date.split('</published>')[0]
pub_date_str = parse_feed_date(pub_date)
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
if pub_date_str:
if _valid_feed_date(pub_date_str):
post_filename = ''

View File

@ -4233,25 +4233,32 @@ def _test_first_paragraph_from_string():
assert result_str == test_str
def _test_parse_feed_date():
def _test_parse_newswire_feed_date():
print('test_parse_feed_date')
unique_string_identifier = 'some string abcd'
pub_date = "2020-12-14T00:08:06+00:00"
published_date = parse_feed_date(pub_date)
published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date == "2020-12-14 00:08:06+00:00"
pub_date = "Tue, 08 Dec 2020 06:24:38 -0600"
published_date = parse_feed_date(pub_date)
published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date == "2020-12-08 12:24:38+00:00"
pub_date = "2020-08-27T16:12:34+00:00"
published_date = parse_feed_date(pub_date)
published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date == "2020-08-27 16:12:34+00:00"
pub_date = "Sun, 22 Nov 2020 19:51:33 +0100"
published_date = parse_feed_date(pub_date)
published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date == "2020-11-22 18:51:33+00:00"
pub_date = "Sun, 22 Nov 2020 00:00:00 +0000"
published_date = parse_feed_date(pub_date, unique_string_identifier)
assert published_date != "2020-11-22 00:00:00+00:00"
assert "2020-11-22 00:" in published_date
def _test_valid_nick():
print('test_valid_nickname')
@ -6576,7 +6583,7 @@ def run_all_tests():
_test_mentioned_people(base_dir)
_test_guess_tag_category()
_test_valid_nick()
_test_parse_feed_date()
_test_parse_newswire_feed_date()
_test_first_paragraph_from_string()
_test_newswire_tags()
_test_hashtag_rules()