mirror of https://gitlab.com/bashrc2/epicyon
Avoid colliding newswire feed items which are published on the hour
parent
401aa0712c
commit
e68e2b56c7
29
newswire.py
29
newswire.py
|
@ -10,6 +10,7 @@ __module_group__ = "Web Interface Columns"
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import requests
|
import requests
|
||||||
|
import random
|
||||||
from socket import error as SocketError
|
from socket import error as SocketError
|
||||||
import errno
|
import errno
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
@ -268,10 +269,21 @@ def _valid_feed_date(pub_date: str, debug: bool = False) -> bool:
|
||||||
return valid_post_date(post_date, 90, debug)
|
return valid_post_date(post_date, 90, debug)
|
||||||
|
|
||||||
|
|
||||||
def parse_feed_date(pub_date: str) -> str:
|
def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str:
|
||||||
"""Returns a UTC date string based on the given date string
|
"""Returns a UTC date string based on the given date string
|
||||||
This tries a number of formats to see which work
|
This tries a number of formats to see which work
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if ':00:00' in pub_date:
|
||||||
|
# If this was published exactly on the hour then assign a
|
||||||
|
# random minute and second to make this item relatively unique
|
||||||
|
randgen = random.Random(unique_string_identifier)
|
||||||
|
rand_min = randgen.randint(0, 59)
|
||||||
|
rand_sec = randgen.randint(0, 59)
|
||||||
|
replace_time_str = \
|
||||||
|
':' + str(rand_min).zfill(2) + ':' + str(rand_sec).zfill(2)
|
||||||
|
pub_date = pub_date.replace(':00:00', replace_time_str)
|
||||||
|
|
||||||
formats = ("%a, %d %b %Y %H:%M:%S %z",
|
formats = ("%a, %d %b %Y %H:%M:%S %z",
|
||||||
"%a, %d %b %Y %H:%M:%S Z",
|
"%a, %d %b %Y %H:%M:%S Z",
|
||||||
"%a, %d %b %Y %H:%M:%S GMT",
|
"%a, %d %b %Y %H:%M:%S GMT",
|
||||||
|
@ -663,7 +675,8 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
|
||||||
pub_date = rss_item.split('<pubDate>')[1]
|
pub_date = rss_item.split('<pubDate>')[1]
|
||||||
pub_date = pub_date.split('</pubDate>')[0]
|
pub_date = pub_date.split('</pubDate>')[0]
|
||||||
|
|
||||||
pub_date_str = parse_feed_date(pub_date)
|
unique_string_identifier = title + ' ' + link
|
||||||
|
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
if pub_date_str:
|
if pub_date_str:
|
||||||
if _valid_feed_date(pub_date_str):
|
if _valid_feed_date(pub_date_str):
|
||||||
post_filename = ''
|
post_filename = ''
|
||||||
|
@ -758,7 +771,8 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
|
||||||
pub_date = rss_item.split('<dc:date>')[1]
|
pub_date = rss_item.split('<dc:date>')[1]
|
||||||
pub_date = pub_date.split('</dc:date>')[0]
|
pub_date = pub_date.split('</dc:date>')[0]
|
||||||
|
|
||||||
pub_date_str = parse_feed_date(pub_date)
|
unique_string_identifier = title + ' ' + link
|
||||||
|
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
if pub_date_str:
|
if pub_date_str:
|
||||||
if _valid_feed_date(pub_date_str):
|
if _valid_feed_date(pub_date_str):
|
||||||
post_filename = ''
|
post_filename = ''
|
||||||
|
@ -841,7 +855,8 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
|
||||||
pub_date = atom_item.split('<updated>')[1]
|
pub_date = atom_item.split('<updated>')[1]
|
||||||
pub_date = pub_date.split('</updated>')[0]
|
pub_date = pub_date.split('</updated>')[0]
|
||||||
|
|
||||||
pub_date_str = parse_feed_date(pub_date)
|
unique_string_identifier = title + ' ' + link
|
||||||
|
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
if pub_date_str:
|
if pub_date_str:
|
||||||
if _valid_feed_date(pub_date_str):
|
if _valid_feed_date(pub_date_str):
|
||||||
post_filename = ''
|
post_filename = ''
|
||||||
|
@ -956,7 +971,8 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
|
||||||
continue
|
continue
|
||||||
pub_date = json_feed_item['date_modified']
|
pub_date = json_feed_item['date_modified']
|
||||||
|
|
||||||
pub_date_str = parse_feed_date(pub_date)
|
unique_string_identifier = title + ' ' + link
|
||||||
|
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
if pub_date_str:
|
if pub_date_str:
|
||||||
if _valid_feed_date(pub_date_str):
|
if _valid_feed_date(pub_date_str):
|
||||||
post_filename = ''
|
post_filename = ''
|
||||||
|
@ -1040,7 +1056,8 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
|
||||||
pub_date = atom_item.split('<published>')[1]
|
pub_date = atom_item.split('<published>')[1]
|
||||||
pub_date = pub_date.split('</published>')[0]
|
pub_date = pub_date.split('</published>')[0]
|
||||||
|
|
||||||
pub_date_str = parse_feed_date(pub_date)
|
unique_string_identifier = title + ' ' + link
|
||||||
|
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
if pub_date_str:
|
if pub_date_str:
|
||||||
if _valid_feed_date(pub_date_str):
|
if _valid_feed_date(pub_date_str):
|
||||||
post_filename = ''
|
post_filename = ''
|
||||||
|
|
19
tests.py
19
tests.py
|
@ -4233,25 +4233,32 @@ def _test_first_paragraph_from_string():
|
||||||
assert result_str == test_str
|
assert result_str == test_str
|
||||||
|
|
||||||
|
|
||||||
def _test_parse_feed_date():
|
def _test_parse_newswire_feed_date():
|
||||||
print('test_parse_feed_date')
|
print('test_parse_feed_date')
|
||||||
|
|
||||||
|
unique_string_identifier = 'some string abcd'
|
||||||
|
|
||||||
pub_date = "2020-12-14T00:08:06+00:00"
|
pub_date = "2020-12-14T00:08:06+00:00"
|
||||||
published_date = parse_feed_date(pub_date)
|
published_date = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
assert published_date == "2020-12-14 00:08:06+00:00"
|
assert published_date == "2020-12-14 00:08:06+00:00"
|
||||||
|
|
||||||
pub_date = "Tue, 08 Dec 2020 06:24:38 -0600"
|
pub_date = "Tue, 08 Dec 2020 06:24:38 -0600"
|
||||||
published_date = parse_feed_date(pub_date)
|
published_date = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
assert published_date == "2020-12-08 12:24:38+00:00"
|
assert published_date == "2020-12-08 12:24:38+00:00"
|
||||||
|
|
||||||
pub_date = "2020-08-27T16:12:34+00:00"
|
pub_date = "2020-08-27T16:12:34+00:00"
|
||||||
published_date = parse_feed_date(pub_date)
|
published_date = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
assert published_date == "2020-08-27 16:12:34+00:00"
|
assert published_date == "2020-08-27 16:12:34+00:00"
|
||||||
|
|
||||||
pub_date = "Sun, 22 Nov 2020 19:51:33 +0100"
|
pub_date = "Sun, 22 Nov 2020 19:51:33 +0100"
|
||||||
published_date = parse_feed_date(pub_date)
|
published_date = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
assert published_date == "2020-11-22 18:51:33+00:00"
|
assert published_date == "2020-11-22 18:51:33+00:00"
|
||||||
|
|
||||||
|
pub_date = "Sun, 22 Nov 2020 00:00:00 +0000"
|
||||||
|
published_date = parse_feed_date(pub_date, unique_string_identifier)
|
||||||
|
assert published_date != "2020-11-22 00:00:00+00:00"
|
||||||
|
assert "2020-11-22 00:" in published_date
|
||||||
|
|
||||||
|
|
||||||
def _test_valid_nick():
|
def _test_valid_nick():
|
||||||
print('test_valid_nickname')
|
print('test_valid_nickname')
|
||||||
|
@ -6576,7 +6583,7 @@ def run_all_tests():
|
||||||
_test_mentioned_people(base_dir)
|
_test_mentioned_people(base_dir)
|
||||||
_test_guess_tag_category()
|
_test_guess_tag_category()
|
||||||
_test_valid_nick()
|
_test_valid_nick()
|
||||||
_test_parse_feed_date()
|
_test_parse_newswire_feed_date()
|
||||||
_test_first_paragraph_from_string()
|
_test_first_paragraph_from_string()
|
||||||
_test_newswire_tags()
|
_test_newswire_tags()
|
||||||
_test_hashtag_rules()
|
_test_hashtag_rules()
|
||||||
|
|
Loading…
Reference in New Issue