Variable types

main
bashrc 2026-05-03 14:32:48 +01:00
parent e1f1469635
commit eee7c0db62
1 changed file with 47 additions and 45 deletions

View File

@ -745,7 +745,7 @@ def get_link_from_rss_item(rss_item: str,
link: str = None link: str = None
if preferred_mime_types and '<podcast:alternateEnclosure ' in rss_item: if preferred_mime_types and '<podcast:alternateEnclosure ' in rss_item:
enclosures: list = rss_item.split('<podcast:alternateEnclosure ') enclosures: list[str] = rss_item.split('<podcast:alternateEnclosure ')
ctr: int = 0 ctr: int = 0
for enclosure in enclosures: for enclosure in enclosures:
if ctr == 0: if ctr == 0:
@ -869,14 +869,14 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
if '</pubDate>' not in rss_item: if '</pubDate>' not in rss_item:
continue continue
title = rss_item.split('<title>')[1] title: str = rss_item.split('<title>')[1]
title = _remove_cdata(title.split('</title>')[0]) title = _remove_cdata(title.split('</title>')[0])
title = unescaped_text(title) title = unescaped_text(title)
title = remove_script(title, None, None, None) title = remove_script(title, None, None, None)
title = remove_html(title) title = remove_html(title)
title = title.replace('\n', '') title = title.replace('\n', '')
description = '' description: str = ''
if '<description>' in rss_item and '</description>' in rss_item: if '<description>' in rss_item and '</description>' in rss_item:
description = rss_item.split('<description>')[1] description = rss_item.split('<description>')[1]
description = description.split('</description>')[0] description = description.split('</description>')[0]
@ -892,7 +892,7 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
description = remove_script(description, None, None, None) description = remove_script(description, None, None, None)
description = remove_html(description) description = remove_html(description)
proxy_type = None proxy_type: str = None
if domain.endswith('.onion'): if domain.endswith('.onion'):
proxy_type = 'tor' proxy_type = 'tor'
elif domain.endswith('.i2p'): elif domain.endswith('.i2p'):
@ -906,28 +906,28 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
if not link: if not link:
continue continue
item_domain = link.split('://')[1] item_domain: str = link.split('://')[1]
if '/' in item_domain: if '/' in item_domain:
item_domain = item_domain.split('/')[0] item_domain = item_domain.split('/')[0]
if is_blocked_domain(base_dir, item_domain, None, None): if is_blocked_domain(base_dir, item_domain, None, None):
continue continue
pub_date = rss_item.split('<pubDate>')[1] pub_date: str = rss_item.split('<pubDate>')[1]
pub_date = pub_date.split('</pubDate>')[0] pub_date = pub_date.split('</pubDate>')[0]
unique_string_identifier = title + ' ' + link unique_string_identifier: str = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier) pub_date_str: str = parse_feed_date(pub_date, unique_string_identifier)
if not pub_date_str: if not pub_date_str:
continue continue
if not _valid_feed_date(pub_date_str): if not _valid_feed_date(pub_date_str):
continue continue
post_filename = '' post_filename: str = ''
votes_status: list[str] = [] votes_status: list[str] = []
podcast_properties: dict = \ podcast_properties: dict = \
xml_podcast_to_dict(base_dir, rss_item, xml_str) xml_podcast_to_dict(base_dir, rss_item, xml_str)
if podcast_properties: if podcast_properties:
podcast_properties['linkMimeType'] = link_mime_type podcast_properties['linkMimeType'] = link_mime_type
fediverse_handle = '' fediverse_handle: str = ''
extra_links: list[str] = [] extra_links: list[str] = []
_add_newswire_dict_entry(base_dir, _add_newswire_dict_entry(base_dir,
result, pub_date_str, result, pub_date_str,
@ -956,7 +956,7 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
"""Converts an xml RSS 1.0 string to a dictionary """Converts an xml RSS 1.0 string to a dictionary
https://validator.w3.org/feed/docs/rss1.html https://validator.w3.org/feed/docs/rss1.html
""" """
item_str = '<item' item_str: str = '<item'
if item_str not in xml_str: if item_str not in xml_str:
return {} return {}
result: dict = {} result: dict = {}
@ -992,12 +992,12 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
continue continue
if '</dc:date>' not in rss_item: if '</dc:date>' not in rss_item:
continue continue
title = rss_item.split('<title>')[1] title: str = rss_item.split('<title>')[1]
title = _remove_cdata(title.split('</title>')[0]) title = _remove_cdata(title.split('</title>')[0])
title = unescaped_text(title) title = unescaped_text(title)
title = remove_script(title, None, None, None) title = remove_script(title, None, None, None)
title = remove_html(title) title = remove_html(title)
description = '' description: str = ''
if '<description>' in rss_item and '</description>' in rss_item: if '<description>' in rss_item and '</description>' in rss_item:
description = rss_item.split('<description>')[1] description = rss_item.split('<description>')[1]
description = description.split('</description>')[0] description = description.split('</description>')[0]
@ -1013,7 +1013,7 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
description = remove_script(description, None, None, None) description = remove_script(description, None, None, None)
description = remove_html(description) description = remove_html(description)
proxy_type = None proxy_type: str = None
if domain.endswith('.onion'): if domain.endswith('.onion'):
proxy_type = 'tor' proxy_type = 'tor'
elif domain.endswith('.i2p'): elif domain.endswith('.i2p'):
@ -1027,28 +1027,28 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
if not link: if not link:
continue continue
item_domain = link.split('://')[1] item_domain: str = link.split('://')[1]
if '/' in item_domain: if '/' in item_domain:
item_domain = item_domain.split('/')[0] item_domain = item_domain.split('/')[0]
if is_blocked_domain(base_dir, item_domain, None, None): if is_blocked_domain(base_dir, item_domain, None, None):
continue continue
pub_date = rss_item.split('<dc:date>')[1] pub_date: str = rss_item.split('<dc:date>')[1]
pub_date = pub_date.split('</dc:date>')[0] pub_date = pub_date.split('</dc:date>')[0]
unique_string_identifier = title + ' ' + link unique_string_identifier: str = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier) pub_date_str: str = parse_feed_date(pub_date, unique_string_identifier)
if not pub_date_str: if not pub_date_str:
continue continue
if not _valid_feed_date(pub_date_str): if not _valid_feed_date(pub_date_str):
continue continue
post_filename = '' post_filename: str = ''
votes_status: list[str] = [] votes_status: list[str] = []
podcast_properties: dict = \ podcast_properties: dict = \
xml_podcast_to_dict(base_dir, rss_item, xml_str) xml_podcast_to_dict(base_dir, rss_item, xml_str)
if podcast_properties: if podcast_properties:
podcast_properties['linkMimeType'] = link_mime_type podcast_properties['linkMimeType'] = link_mime_type
fediverse_handle = '' fediverse_handle: str = ''
extra_links: list[str] = [] extra_links: list[str] = []
_add_newswire_dict_entry(base_dir, _add_newswire_dict_entry(base_dir,
result, pub_date_str, result, pub_date_str,
@ -1102,12 +1102,12 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
continue continue
if '</updated>' not in atom_item: if '</updated>' not in atom_item:
continue continue
title = atom_item.split('<title>')[1] title: str = atom_item.split('<title>')[1]
title = _remove_cdata(title.split('</title>')[0]) title = _remove_cdata(title.split('</title>')[0])
title = unescaped_text(title) title = unescaped_text(title)
title = remove_script(title, None, None, None) title = remove_script(title, None, None, None)
title = remove_html(title) title = remove_html(title)
description = '' description: str = ''
if '<summary>' in atom_item and '</summary>' in atom_item: if '<summary>' in atom_item and '</summary>' in atom_item:
description = atom_item.split('<summary>')[1] description = atom_item.split('<summary>')[1]
description = unescaped_text(description.split('</summary>')[0]) description = unescaped_text(description.split('</summary>')[0])
@ -1129,18 +1129,18 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
description = remove_html(description) description = remove_html(description)
# is there a fediverse handle # is there a fediverse handle
fediverse_handle = '' fediverse_handle: str = ''
if '<author>' in atom_item and '</author>' in atom_item: if '<author>' in atom_item and '</author>' in atom_item:
actor_str = atom_item.split('<author>')[1] actor_str: str = atom_item.split('<author>')[1]
actor_str = unescaped_text(actor_str.split('</author>')[0]) actor_str = unescaped_text(actor_str.split('</author>')[0])
actor_str = remove_script(actor_str, None, None, None) actor_str = remove_script(actor_str, None, None, None)
if '<activity:object-type>' in actor_str and \ if '<activity:object-type>' in actor_str and \
'</activity:object-type>' in actor_str and \ '</activity:object-type>' in actor_str and \
'<uri>' in actor_str and '</uri>' in actor_str: '<uri>' in actor_str and '</uri>' in actor_str:
obj_type = actor_str.split('<activity:object-type>')[1] obj_type: str = actor_str.split('<activity:object-type>')[1]
obj_type = obj_type.split('</activity:object-type>')[0] obj_type = obj_type.split('</activity:object-type>')[0]
if obj_type == 'Person': if obj_type == 'Person':
actor_uri = actor_str.split('<uri>')[1] actor_uri: str = actor_str.split('<uri>')[1]
actor_uri = actor_uri.split('</uri>')[0] actor_uri = actor_uri.split('</uri>')[0]
if resembles_url(actor_uri) and \ if resembles_url(actor_uri) and \
not is_local_network_address(actor_uri): not is_local_network_address(actor_uri):
@ -1150,18 +1150,18 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
extra_links: list[str] = [] extra_links: list[str] = []
if '<activity:object>' in atom_item and \ if '<activity:object>' in atom_item and \
'</activity:object>' in atom_item: '</activity:object>' in atom_item:
obj_str = atom_item.split('<activity:object>')[1] obj_str: str = atom_item.split('<activity:object>')[1]
obj_str = \ obj_str = \
unescaped_text(obj_str.split('</activity:object>')[0]) unescaped_text(obj_str.split('</activity:object>')[0])
obj_str = remove_script(obj_str, None, None, None) obj_str = remove_script(obj_str, None, None, None)
sections = obj_str.split('<link ') sections: list[str] = obj_str.split('<link ')
ctr: int = 0 ctr: int = 0
for section_str in sections: for section_str in sections:
if ctr == 0: if ctr == 0:
ctr = 1 ctr = 1
continue continue
if '>' in section_str: if '>' in section_str:
link_str = section_str.split('>')[0] link_str: str = section_str.split('>')[0]
if 'href="' in link_str and \ if 'href="' in link_str and \
'rel="preview"' not in link_str: 'rel="preview"' not in link_str:
link_str = link_str.split('href="')[1] link_str = link_str.split('href="')[1]
@ -1173,7 +1173,7 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
if link_str not in extra_links: if link_str not in extra_links:
extra_links.append(link_str) extra_links.append(link_str)
proxy_type = None proxy_type: str = None
if domain.endswith('.onion'): if domain.endswith('.onion'):
proxy_type = 'tor' proxy_type = 'tor'
elif domain.endswith('.i2p'): elif domain.endswith('.i2p'):
@ -1187,24 +1187,24 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
if not link: if not link:
continue continue
item_domain = link.split('://')[1] item_domain: str = link.split('://')[1]
if '/' in item_domain: if '/' in item_domain:
item_domain = item_domain.split('/')[0] item_domain = item_domain.split('/')[0]
if is_blocked_domain(base_dir, item_domain, None, None): if is_blocked_domain(base_dir, item_domain, None, None):
continue continue
pub_date = atom_item.split('<updated>')[1] pub_date: str = atom_item.split('<updated>')[1]
pub_date = pub_date.split('</updated>')[0] pub_date = pub_date.split('</updated>')[0]
unique_string_identifier = title + ' ' + link unique_string_identifier: str = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier) pub_date_str: str = parse_feed_date(pub_date, unique_string_identifier)
if not pub_date_str: if not pub_date_str:
continue continue
if not _valid_feed_date(pub_date_str): if not _valid_feed_date(pub_date_str):
continue continue
post_filename = '' post_filename: str = ''
votes_status: list[str] = [] votes_status: list[str] = []
podcast_properties = \ podcast_properties: dict = \
xml_podcast_to_dict(base_dir, atom_item, xml_str) xml_podcast_to_dict(base_dir, atom_item, xml_str)
if podcast_properties: if podcast_properties:
podcast_properties['linkMimeType'] = link_mime_type podcast_properties['linkMimeType'] = link_mime_type
@ -1859,13 +1859,14 @@ def get_dict_from_newswire(session, base_dir: str, domain: str,
mirrored = True mirrored = True
url = url.replace('!', '').strip() url = url.replace('!', '').strip()
items_list = get_rss(base_dir, domain, session, url, items_list: dict = \
moderated, mirrored, get_rss(base_dir, domain, session, url,
max_posts_per_source, max_feed_size_kb, moderated, mirrored,
max_feed_item_size_kb, max_posts_per_source, max_feed_size_kb,
max_categories_feed_item_size_kb, debug, max_feed_item_size_kb,
preferred_podcast_formats, max_categories_feed_item_size_kb, debug,
timeout_sec, system_language) preferred_podcast_formats,
timeout_sec, system_language)
if items_list: if items_list:
for date_str, item in items_list.items(): for date_str, item in items_list.items():
result[date_str] = item result[date_str] = item
@ -1877,10 +1878,11 @@ def get_dict_from_newswire(session, base_dir: str, domain: str,
session, debug) session, debug)
# sort into chronological order, latest first # sort into chronological order, latest first
sorted_result = OrderedDict(sorted(result.items(), reverse=True)) sorted_result: dict = \
OrderedDict(sorted(result.items(), reverse=True))
# are there too many posts? If so then remove the oldest ones # are there too many posts? If so then remove the oldest ones
no_of_posts = len(sorted_result.items()) no_of_posts: int = len(sorted_result.items())
if no_of_posts > max_newswire_posts: if no_of_posts > max_newswire_posts:
ctr: int = 0 ctr: int = 0
removals: list[str] = [] removals: list[str] = []