2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__filename__ = "newswire.py"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__author__ = "Bob Mottram"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__license__ = "AGPL3+"
							 | 
						
					
						
							
								
									
										
										
										
											2024-01-21 19:01:20 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								__version__ = "1.5.0"
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__maintainer__ = "Bob Mottram"
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-10 16:14:50 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								__email__ = "bob@libreserver.org"
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__status__ = "Production"
							 | 
						
					
						
							
								
									
										
										
										
											2021-06-26 11:27:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								__module_group__ = "Web Interface Columns"
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import os
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import json
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import requests
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import random
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-27 20:02:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import time
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from socket import error as SocketError
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import errno
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 15:33:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from datetime import timedelta
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:09:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from datetime import timezone
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from collections import OrderedDict
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 12:31:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import valid_post_date
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from categories import set_hashtag_category
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-05 15:42:33 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import is_local_network_address
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-12 12:35:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import data_dir
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-10 13:32:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import string_contains
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-05 20:05:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import image_mime_types_dict
							 | 
						
					
						
							
								
									
										
										
										
											2024-01-27 17:04:21 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import resembles_url
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-09 14:18:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import get_url_from_post
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-29 11:37:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import remove_zero_length_strings
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-20 22:27:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import date_from_string_format
							 | 
						
					
						
							
								
									
										
										
										
											2022-12-18 15:29:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import acct_handle_dir
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-21 11:58:50 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import remove_eol
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import get_domain_from_actor
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import valid_hash_tag
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 21:44:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import dangerous_svg
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 16:01:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import get_fav_filename_from_url
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import get_base_content_from_post
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import has_object_dict
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:52:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import first_paragraph_from_string
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 14:41:10 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import is_public_post
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 20:36:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import locate_post
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 15:13:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import load_json
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 14:47:21 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import save_json
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:37:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import is_suspended
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 17:53:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import contains_invalid_chars
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import remove_html
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 18:46:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import is_account_dir
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 12:02:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import acct_dir
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import local_actor_url
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import escape_text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from utils import unescaped_text
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 21:55:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from blocking import is_blocked_domain
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from blocking import is_blocked_hashtag
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from filters import is_filtered
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from session import download_image_any_mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from content import remove_script
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 12:11:05 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _remove_cdata(text: str) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 12:18:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Removes any CDATA from the given text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if 'CDATA[' in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        text = text.split('CDATA[')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ']' in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            text = text.split(']')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def rss2header(http_prefix: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               nickname: str, domain_full: str,
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               title: str, translate: {}) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:22:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Header for an RSS 2.0 feed
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_str = \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "<rss version=\"2.0\">" + \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        '<channel>'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-13 17:14:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if title.startswith('News'):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <title>Newswire</title>' + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <link>' + http_prefix + '://' + domain_full + \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '/newswire.xml' + '</link>'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-13 17:14:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    elif title.startswith('Site'):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <title>' + domain_full + '</title>' + \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            '    <link>' + http_prefix + '://' + domain_full + \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-13 17:17:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '/blog/rss.xml' + '</link>'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title_str = escape_text(translate[title])
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <title>' + title_str + '</title>' + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-08-14 11:13:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <link>' + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            local_actor_url(http_prefix, nickname, domain_full) + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-08-14 11:13:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '/rss.xml' + '</link>'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return rss_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def rss2footer() -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:22:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Footer for an RSS 2.0 feed
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_str = '</channel></rss>'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return rss_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_newswire_tags(text: str, max_tags: int) -> []:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns a list of hashtags found in the given text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:46:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '#' not in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if ' ' not in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    text_simplified = \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        text.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    text_simplified = text_simplified.replace('. ', ' ').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if text_simplified.endswith('.'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        text_simplified = text_simplified[:len(text_simplified)-1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    words = text_simplified.split(' ')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    tags = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for wrd in words:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not wrd.startswith('#'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if len(wrd) <= 1:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if wrd in tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        tags.append(wrd)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(tags) >= max_tags:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return tags
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def limit_word_lengths(text: str, max_word_length: int) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Limits the maximum length of words so that the newswire
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    column cannot become too wide
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if ' ' not in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    words = text.split(' ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for wrd in words:
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(wrd) > max_word_length:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            wrd = wrd[:max_word_length]
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if result:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            result += ' '
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        result += wrd
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_newswire_favicon_url(url: str) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns a favicon url from the given article link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if '://' not in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return '/newswire_favicon.ico'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if url.startswith('http://'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not (url.endswith('.onion') or url.endswith('.i2p')):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return '/newswire_favicon.ico'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    domain = url.split('://')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if '/' not in domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return url + '/favicon.ico'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    domain = domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return url.split('://')[0] + '://' + domain + '/favicon.ico'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _download_newswire_feed_favicon(session, base_dir: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                    link: str, debug: bool) -> bool:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Downloads the favicon for the given feed link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    fav_url = get_newswire_favicon_url(link)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '://' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    timeout_sec = 10
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    image_data, mime_type = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        download_image_any_mime_type(session, fav_url, timeout_sec, debug)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not image_data or not mime_type:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 23:59:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # update the favicon url
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-05 20:05:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    extensions_to_mime = image_mime_types_dict()
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for ext, mime_ext in extensions_to_mime.items():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if 'image/' + mime_ext in mime_type:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            fav_url = fav_url.replace('.ico', '.' + ext)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # create cached favicons directory if needed
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not os.path.isdir(base_dir + '/favicons'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        os.mkdir(base_dir + '/favicons')
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # check svg for dubious scripts
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if fav_url.endswith('.svg'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        image_data_str = str(image_data)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if dangerous_svg(image_data_str, False):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # save to the cache
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    fav_filename = get_fav_filename_from_url(base_dir, fav_url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if os.path.isfile(fav_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 21:14:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return True
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        with open(fav_filename, 'wb+') as fp_fav:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            fp_fav.write(image_data)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except OSError:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('EX: failed writing favicon ' + fav_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-06 19:48:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _add_newswire_dict_entry(base_dir: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             newswire: {}, date_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             title: str, link: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             votes_status: str, post_filename: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             description: str, moderated: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             mirrored: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             tags: [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             max_tags: int, session, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             podcast_properties: {},
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-05 15:42:33 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             system_language: str,
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-05 19:34:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             fediverse_handle: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             extra_links: []) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:25:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Update the newswire dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-12 15:44:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # remove any markup
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    title = remove_html(title)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-12 15:44:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    all_text = title + ' ' + description
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # check that none of the text is filtered against
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if is_filtered(base_dir, None, None, all_text, system_language):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-17 16:08:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    title = limit_word_lengths(title, 13)
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if tags is None:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        tags = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # extract hashtags from the text of the feed post
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    post_tags = get_newswire_tags(all_text, max_tags)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:15:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # Include tags from podcast categories
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if podcast_properties:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 23:06:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if podcast_properties.get('explicit'):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 13:15:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '#nsfw' not in post_tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                post_tags.append('#nsfw')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 23:06:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:15:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_tags += podcast_properties['categories']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # combine the tags into a single list
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 12:57:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for tag in tags:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if tag in post_tags:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(post_tags) < max_tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            post_tags.append(tag)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # check that no tags are blocked
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for tag in post_tags:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 21:55:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_hashtag(base_dir, tag):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:18:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    _download_newswire_feed_favicon(session, base_dir, link, debug)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    newswire[date_str] = [
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        votes_status,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_filename,
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        moderated,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_tags,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        mirrored,
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-05 15:42:33 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        podcast_properties,
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-05 19:34:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        fediverse_handle,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        extra_links
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ]
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:25:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _valid_feed_date(pub_date: str, debug: bool = False) -> bool:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # convert from YY-MM-DD HH:MM:SS+00:00 to
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # YY-MM-DDTHH:MM:SSZ
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    post_date = pub_date.replace(' ', 'T').replace('+00:00', 'Z')
							 | 
						
					
						
							
								
									
										
										
										
											2022-03-30 18:13:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '.' in post_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ending = post_date.split('.')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        timezone_str = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for ending_char in ending:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not ending_char.isdigit():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                timezone_str += ending_char
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if timezone_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            post_date = post_date.split('.')[0] + timezone_str
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return valid_post_date(post_date, 90, debug)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns a UTC date string based on the given date string
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    This tries a number of formats to see which work
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if ':00:00' in pub_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # If this was published exactly on the hour then assign a
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # random minute and second to make this item relatively unique
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        randgen = random.Random(unique_string_identifier)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        rand_min = randgen.randint(0, 59)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        rand_sec = randgen.randint(0, 59)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        replace_time_str = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ':' + str(rand_min).zfill(2) + ':' + str(rand_sec).zfill(2)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.replace(':00:00', replace_time_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    formats = ("%a, %d %b %Y %H:%M:%S %z",
							 | 
						
					
						
							
								
									
										
										
										
											2021-10-17 14:17:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S Z",
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-07 19:09:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S GMT",
							 | 
						
					
						
							
								
									
										
										
										
											2021-10-17 14:24:21 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S EST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S PST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S AST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S CST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S MST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S AKST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S HST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S UT",
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-20 22:27:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               "%Y-%m-%dT%H:%M:%S%z",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%Y-%m-%dT%H:%M:%S%Z")
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    published_date = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for date_format in formats:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ',' in pub_date and ',' not in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if ',' not in pub_date and ',' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'Z' in pub_date and 'Z' not in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'Z' not in pub_date and 'Z' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'EST' not in pub_date and 'EST' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'GMT' not in pub_date and 'GMT' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-07 19:09:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'EST' in pub_date and 'EST' not in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'UT' not in pub_date and 'UT' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'UT' in pub_date and 'UT' not in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-03-30 18:13:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # remove any fraction of a second
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-18 13:05:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date2 = pub_date
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '.' in pub_date2:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ending = pub_date2.split('.')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2022-03-30 18:13:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            timezone_str = ''
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-18 13:05:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '+' in ending:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                timezone_str = '+' + ending.split('+')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            elif '-' in ending:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                timezone_str = '-' + ending.split('-')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pub_date2 = pub_date2.split('.')[0] + timezone_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        try:
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-20 22:27:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            published_date = date_from_string_format(pub_date2, [date_format])
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        except BaseException:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if published_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if pub_date.endswith(' EST'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                hours_added = timedelta(hours=5)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                published_date = published_date + hours_added
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    pub_date_str = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if published_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        offset = published_date.utcoffset()
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 20:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if offset:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            published_date = published_date - offset
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:09:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # convert local date to UTC
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        published_date = published_date.replace(tzinfo=timezone.utc)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = str(published_date)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not pub_date_str.endswith('+00:00'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pub_date_str += '+00:00'
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-07 19:33:27 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-18 13:05:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('WARN: unrecognized date format: ' + pub_date)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return pub_date_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def load_hashtag_categories(base_dir: str, language: str) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Loads an rss file containing hashtag categories
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    hashtag_categories_filename = base_dir + '/categories.xml'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not os.path.isfile(hashtag_categories_filename):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        hashtag_categories_filename = \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            base_dir + '/defaultcategories/' + language + '.xml'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not os.path.isfile(hashtag_categories_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-13 14:38:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        with open(hashtag_categories_filename, 'r',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                  encoding='utf-8') as fp_cat:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            xml_str = fp_cat.read()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            _xml2str_to_hashtag_categories(base_dir, xml_str, 1024, True)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    except OSError:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('EX: load_hashtag_categories unable to read ' +
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								              hashtag_categories_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                   max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                   force: bool = False) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Updates hashtag categories based upon an rss feed
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_items = xml_str.split('<item>')
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_categories_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for rss_item in rss_items:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(rss_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: rss categories feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<description>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</description>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        category_str = rss_item.split('<title>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        category_str = category_str.split('</title>')[0].strip()
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        category_str = unescaped_text(category_str)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not category_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'CDATA' in category_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-03 10:12:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        hashtag_list_str = rss_item.split('<description>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        hashtag_list_str = hashtag_list_str.split('</description>')[0].strip()
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        hashtag_list_str = unescaped_text(hashtag_list_str)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not hashtag_list_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'CDATA' in hashtag_list_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-03 10:12:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        hashtag_list = hashtag_list_str.split(' ')
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-15 19:34:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_hashtag(base_dir, category_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for hashtag in hashtag_list:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            set_hashtag_category(base_dir, hashtag, category_str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 False, force)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _get_podcast_categories(xml_item: str, xml_str: str) -> str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """ get podcast categories if they exist. These can be turned into hashtags
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-05 17:07:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    See https://podcast-standard.org/itunes_tags
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    podcast_categories = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-05 17:07:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # convert keywords to hashtags
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if '<itunes:keywords' in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        keywords_str = xml_item.split('<itunes:keywords')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '>' in keywords_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            keywords_str = keywords_str.split('>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '<' in keywords_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                keywords_str = keywords_str.split('<')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                keywords_str = remove_html(keywords_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                keywords_list = keywords_str.split(',')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                for keyword in keywords_list:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    keyword_hashtag = '#' + keyword.strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if keyword_hashtag not in podcast_categories:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if valid_hash_tag(keyword):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            podcast_categories.append(keyword_hashtag)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    episode_category_tags = ['<itunes:category', '<category']
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for category_tag in episode_category_tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        item_str = xml_item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if category_tag not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if category_tag not in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_str = xml_str
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        category_list = item_str.split(category_tag)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        first_category = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:12:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        for episode_category in category_list:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if first_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                first_category = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if 'text="' in episode_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_category = episode_category.split('text="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' in episode_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = episode_category.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        episode_category.lower().replace(' ', '')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = episode_category.replace('#', '')
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-05 17:13:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    episode_category_hashtag = '#' + episode_category
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if episode_category_hashtag not in podcast_categories:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if valid_hash_tag(episode_category):
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-05 17:13:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            podcast_categories.append(episode_category_hashtag)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '>' in episode_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_category = episode_category.split('>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '<' in episode_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = episode_category.split('<')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        episode_category.lower().replace(' ', '')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = episode_category.replace('#', '')
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-05 17:13:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    episode_category_hashtag = '#' + episode_category
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if episode_category_hashtag not in podcast_categories:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if valid_hash_tag(episode_category):
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-05 17:13:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            podcast_categories.append(episode_category_hashtag)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return podcast_categories
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-05 11:32:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _get_podcast_author(xml_item: str, xml_str: str) -> str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """ get podcast author if specified.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    author = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    episode_author_tags = ['<itunes:author', '<author']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for author_tag in episode_author_tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        item_str = xml_item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if author_tag not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if author_tag not in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_str = xml_str
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        author_str = item_str.split(author_tag)[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '>' not in author_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        author_str = author_str.split('>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '<' not in author_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        author = item_str.split('>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return remove_html(author).strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return author
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _valid_podcast_entry(base_dir: str, key: str, entry: {}) -> bool:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """Is the given podcast namespace entry valid?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    https://github.com/Podcastindex-org/podcast-namespace/
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    blob/main/proposal-docs/social/social.md#socialinteract-element
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-05-30 15:15:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if key in ('socialInteract', 'discussion'):
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not entry.get('protocol'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-21 09:21:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not entry.get('uri'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not entry.get('text'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if not entry.get('url'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if entry['protocol'].tolower() != 'activitypub':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-21 09:21:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if entry.get('uri'):
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-12 11:08:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            post_url = remove_html(entry['uri'])
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-21 09:21:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif entry.get('url'):
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-12 11:08:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            post_url = remove_html(entry['uri'])
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-21 09:21:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            post_url = entry['text']
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '://' not in post_url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_domain, _ = get_domain_from_actor(post_url)
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not post_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-11 13:42:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, post_domain, None, None):
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def xml_podcast_to_dict(base_dir: str, xml_item: str, xml_str: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """podcasting extensions for RSS feeds
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    See https://github.com/Podcastindex-org/podcast-namespace/
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    blob/main/docs/1.0.md
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 15:40:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    https://github.com/Podcastindex-org/podcast-namespace/
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    blob/main/proposal-docs/social/social.md#socialinteract-element
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<podcast:' not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '<itunes:' not in xml_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 18:05:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '<media:thumbnail' not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    podcast_properties = {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "locations": [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "persons": [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "soundbites": [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "transcripts": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "valueRecipients": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 15:38:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "trailers": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-05-03 16:38:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "chapters": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "discussion": [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "episode": '',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "socialInteract": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    pod_lines = xml_item.split('<podcast:')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for pod_line in pod_lines:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ctr == 0 or '>' not in pod_line:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ' ' not in pod_line.split('>')[0]:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_key = pod_line.split('>')[0].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_val = pod_line.split('>', 1)[1].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '<' in pod_val:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                pod_val = pod_val.split('<')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if pod_key in podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                podcast_properties[pod_key] = pod_val
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pod_key = pod_line.split(' ')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pod_fields = (
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'url', 'geo', 'osm', 'type', 'method', 'group',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'owner', 'srcset', 'img', 'role', 'address', 'suggested',
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            'startTime', 'duration', 'href', 'name', 'pubdate',
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 15:38:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            'length', 'season', 'email', 'platform', 'protocol',
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-16 13:27:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            'accountId', 'priority', 'podcastAccountId',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'podcastAccountUrl'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pod_entry = {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for pod_field in pod_fields:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if pod_field + '="' not in pod_line:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_str = pod_line.split(pod_field + '="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '"' not in pod_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_val = pod_str.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_entry[pod_field] = pod_val
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pod_text = pod_line.split('>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '<' in pod_text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_text = pod_text.split('<')[0].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if pod_text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                pod_entry['text'] = pod_text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 16:00:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        appended = False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if pod_key + 's' in podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if isinstance(podcast_properties[pod_key + 's'], list):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                podcast_properties[pod_key + 's'].append(pod_entry)
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 16:00:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                appended = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not appended:
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 16:05:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            # if there are repeated keys then only use the first one
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not podcast_properties.get(pod_key):
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if _valid_podcast_entry(base_dir, pod_key, pod_entry):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    podcast_properties[pod_key] = pod_entry
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # get the image for the podcast, if it exists
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    podcast_episode_image = None
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    episode_image_tags = ['<itunes:image', '<media:thumbnail']
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for image_tag in episode_image_tags:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_str = xml_item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if image_tag not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if image_tag not in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_str = xml_str
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        episode_image = item_str.split(image_tag)[1]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:55:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if image_tag + ' ' in item_str and '>' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            episode_image = episode_image.split('>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'href="' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            episode_image = episode_image.split('href="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '"' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_image = episode_image.split('"')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                podcast_episode_image = episode_image
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif 'url="' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            episode_image = episode_image.split('url="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '"' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_image = episode_image.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                podcast_episode_image = episode_image
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 18:05:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif '>' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            episode_image = episode_image.split('>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '<' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_image = episode_image.split('<')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2024-01-27 17:04:21 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if resembles_url(episode_image):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 18:05:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    podcast_episode_image = episode_image
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # get categories if they exist. These can be turned into hashtags
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    podcast_categories = _get_podcast_categories(xml_item, xml_str)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-05 11:32:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # get the author name
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    podcast_author = _get_podcast_author(xml_item, xml_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if podcast_author:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        podcast_properties['author'] = podcast_author
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if podcast_episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        podcast_properties['image'] = podcast_episode_image
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        podcast_properties['categories'] = podcast_categories
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-10 13:32:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if string_contains(xml_item,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           ('<itunes:explicit>Y', '<itunes:explicit>T',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            '<itunes:explicit>1')):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            podcast_properties['explicit'] = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            podcast_properties['explicit'] = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<podcast:' not in xml_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return podcast_properties
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 12:53:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_link_from_rss_item(rss_item: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           preferred_mime_types: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           proxy_type: str) -> (str, str):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Extracts rss link from rss item string
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    mime_type = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 12:53:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if preferred_mime_types and '<podcast:alternateEnclosure ' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        enclosures = rss_item.split('<podcast:alternateEnclosure ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for enclosure in enclosures:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if ctr == 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '</podcast:alternateEnclosure' not in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            enclosure = enclosure.split('</podcast:alternateEnclosure')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if 'type="' not in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            mime_type = enclosure.split('type="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '"' in mime_type:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                mime_type = mime_type.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if mime_type not in preferred_mime_types:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if 'uri="' not in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            uris = enclosure.split('uri="')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr2 = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            for uri in uris:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if ctr2 == 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    ctr2 += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                ctr2 += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' not in uri:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                link = uri.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '://' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if proxy_type:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if proxy_type == 'tor' and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       '.onion/' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if proxy_type == 'onion' and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       '.onion/' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if proxy_type == 'i2p' and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       '.i2p/' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    return link, mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2022-05-30 15:15:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if '.onion/' not in link and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                   '.i2p/' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    return link, mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 12:53:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<enclosure ' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # get link from audio or video enclosure
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        enclosure = rss_item.split('<enclosure ')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '>' in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            enclosure = enclosure.split('>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if ' type="' in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                mime_type = enclosure.split(' type="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' in mime_type:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    mime_type = mime_type.split('"')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if 'url="' in enclosure and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               ('"audio/' in enclosure or '"video/' in enclosure):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                link_str = enclosure.split('url="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' in link_str:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 16:18:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    link = link_str.split('"')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2024-01-27 17:04:21 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    if resembles_url(link):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        return link, mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 16:18:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<link>' in rss_item and '</link>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link = rss_item.split('<link>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link = link.split('</link>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '://' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return None, None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    elif '<link ' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link_str = rss_item.split('<link ')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '>' in link_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            link_str = link_str.split('>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if 'href="' in link_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                link_str = link_str.split('href="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' in link_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    link = link_str.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return link, mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     session, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     preferred_podcast_formats: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                     system_language: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts an xml RSS 2.0 string to a dictionary
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<item>' not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-09 10:38:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # is this an rss feed containing hashtag categories?
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<title>#categories</title>' in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _xml2str_to_hashtag_categories(base_dir, xml_str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                       max_categories_feed_item_size_kb)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-09 10:38:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_items = xml_str.split('<item>')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    first_item = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for rss_item in rss_items:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if first_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            first_item = False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(rss_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: rss feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<link' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<pubDate>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</pubDate>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = rss_item.split('<title>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = _remove_cdata(title.split('</title>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = unescaped_text(title)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_script(title, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_html(title)
							 | 
						
					
						
							
								
									
										
										
										
											2023-08-17 14:04:20 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = title.replace('\n', '')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<description>' in rss_item and '</description>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = rss_item.split('<description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 11:46:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = description.split('</description>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = unescaped_text(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 11:01:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-21 23:18:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '<media:description>' in rss_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               '</media:description>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                description = rss_item.split('<media:description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-21 23:18:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = description.split('</media:description>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = unescaped_text(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        proxy_type = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if domain.endswith('.onion'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'tor'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        elif domain.endswith('.i2p'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'i2p'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link, link_mime_type = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            get_link_from_rss_item(rss_item, preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   proxy_type)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_domain = link.split('://')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '/' in item_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_domain = item_domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-11 13:42:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, item_domain, None, None):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date = rss_item.split('<pubDate>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.split('</pubDate>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-15 19:38:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not _valid_feed_date(pub_date_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_filename = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        votes_status = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        podcast_properties = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            xml_podcast_to_dict(base_dir, rss_item, xml_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            podcast_properties['linkMimeType'] = link_mime_type
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        fediverse_handle = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        extra_links = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _add_newswire_dict_entry(base_dir,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 result, pub_date_str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 title, link,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 votes_status, post_filename,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 description, moderated,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 mirrored, [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 podcast_properties, system_language,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 fediverse_handle, extra_links)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) + ' rss 2.0 feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     session, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     preferred_podcast_formats: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                     system_language: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts an xml RSS 1.0 string to a dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    https://validator.w3.org/feed/docs/rss1.html
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    item_str = '<item'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if item_str not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # is this an rss feed containing hashtag categories?
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<title>#categories</title>' in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _xml2str_to_hashtag_categories(base_dir, xml_str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                       max_categories_feed_item_size_kb)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_items = xml_str.split(item_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    first_item = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for rss_item in rss_items:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if first_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            first_item = False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(rss_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 17:18:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: rss 1.0 feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if rss_item.startswith('s>'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<link' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<dc:date>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</dc:date>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = rss_item.split('<title>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = _remove_cdata(title.split('</title>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = unescaped_text(title)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_script(title, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_html(title)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<description>' in rss_item and '</description>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = rss_item.split('<description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 11:46:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = description.split('</description>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = unescaped_text(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 11:01:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '<media:description>' in rss_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               '</media:description>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                description = rss_item.split('<media:description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = description.split('</media:description>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = unescaped_text(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        proxy_type = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if domain.endswith('.onion'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'tor'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        elif domain.endswith('.i2p'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'i2p'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link, link_mime_type = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            get_link_from_rss_item(rss_item, preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   proxy_type)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_domain = link.split('://')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '/' in item_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_domain = item_domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-11 13:42:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, item_domain, None, None):
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date = rss_item.split('<dc:date>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.split('</dc:date>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-15 19:44:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not _valid_feed_date(pub_date_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_filename = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        votes_status = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        podcast_properties = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            xml_podcast_to_dict(base_dir, rss_item, xml_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            podcast_properties['linkMimeType'] = link_mime_type
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        fediverse_handle = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        extra_links = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _add_newswire_dict_entry(base_dir,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 result, pub_date_str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 title, link,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 votes_status, post_filename,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 description, moderated,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 mirrored, [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 podcast_properties, system_language,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 fediverse_handle, extra_links)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) + ' rss 1.0 feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       session, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       preferred_podcast_formats: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       system_language: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts an atom feed string to a dictionary
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-05 15:12:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    Also see https://activitystrea.ms/specs/atom/1.0/
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<entry>' not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    atom_items = xml_str.split('<entry>')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    first_item = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for atom_item in atom_items:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if first_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            first_item = False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(atom_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: atom feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<link' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<updated>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</updated>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = atom_item.split('<title>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = _remove_cdata(title.split('</title>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = unescaped_text(title)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_script(title, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_html(title)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<summary>' in atom_item and '</summary>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = atom_item.split('<summary>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 11:01:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = unescaped_text(description.split('</summary>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 11:01:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-27 21:27:51 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif '<content' in atom_item and '</content>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = atom_item.split('<content', 1)[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = description.split('>', 1)[1]
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 11:01:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = unescaped_text(description.split('</content>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 11:01:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-21 23:29:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '<media:description>' in atom_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               '</media:description>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                description = atom_item.split('<media:description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-21 23:29:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = description.split('</media:description>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = unescaped_text(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-05 15:42:33 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # is there a fediverse handle
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        fediverse_handle = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '<author>' in atom_item and '</author>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            actor_str = atom_item.split('<author>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            actor_str = unescaped_text(actor_str.split('</author>')[0])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            actor_str = remove_script(actor_str, None, None, None)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '<activity:object-type>' in actor_str and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               '</activity:object-type>' in actor_str and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               '<uri>' in actor_str and '</uri>' in actor_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                obj_type = actor_str.split('<activity:object-type>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                obj_type = obj_type.split('</activity:object-type>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if obj_type == 'Person':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    actor_uri = actor_str.split('<uri>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    actor_uri = actor_uri.split('</uri>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if resembles_url(actor_uri) and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       not is_local_network_address(actor_uri):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        fediverse_handle = actor_uri
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-05 19:34:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # are there any extra links?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        extra_links = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '<activity:object>' in atom_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								           '</activity:object>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            obj_str = atom_item.split('<activity:object>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            obj_str = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                unescaped_text(obj_str.split('</activity:object>')[0])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            obj_str = remove_script(obj_str, None, None, None)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            sections = obj_str.split('<link ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            for section_str in sections:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if ctr == 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    ctr = 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '>' in section_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    link_str = section_str.split('>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if 'href="' in link_str and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       'rel="preview"' not in link_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        link_str = link_str.split('href="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if '"' in link_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            link_str = link_str.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            link_str = remove_html(link_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            if resembles_url(link_str) and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                               not is_local_network_address(link_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                if link_str not in extra_links:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                    extra_links.append(link_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        proxy_type = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if domain.endswith('.onion'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'tor'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        elif domain.endswith('.i2p'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'i2p'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link, link_mime_type = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            get_link_from_rss_item(atom_item, preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   proxy_type)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_domain = link.split('://')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '/' in item_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_domain = item_domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-11 13:42:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, item_domain, None, None):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date = atom_item.split('<updated>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.split('</updated>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-15 19:44:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not _valid_feed_date(pub_date_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_filename = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        votes_status = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        podcast_properties = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            xml_podcast_to_dict(base_dir, atom_item, xml_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            podcast_properties['linkMimeType'] = link_mime_type
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _add_newswire_dict_entry(base_dir,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 result, pub_date_str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 title, link,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 votes_status, post_filename,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 description, moderated,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 mirrored, [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 podcast_properties, system_language,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 fediverse_handle, extra_links)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) + ' atom feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-19 14:26:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _json_feed_v1to_dict(base_dir: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                         moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                         max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                         max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                         session, debug: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                         system_language: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts a json feed string to a dictionary
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:46:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    See https://jsonfeed.org/version/1.1
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '"items"' not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        feed_json = json.loads(xml_str)
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except BaseException:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('EX: _json_feed_v1to_dict unable to load json ' + str(xml_str))
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not feed_json.get('version'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not feed_json['version'].startswith('https://jsonfeed.org/version/1'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not feed_json.get('items'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not isinstance(feed_json['items'], list):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:47:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for json_feed_item in feed_json['items']:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not json_feed_item:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not isinstance(json_feed_item, dict):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not json_feed_item.get('url'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-09 14:18:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        url_str = get_url_from_post(json_feed_item['url'])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not url_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not json_feed_item.get('date_published'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not json_feed_item.get('date_modified'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not json_feed_item.get('content_text'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not json_feed_item.get('content_html'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if json_feed_item.get('content_html'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['content_html'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            title = remove_html(json_feed_item['content_html'])
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['content_text'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            title = remove_html(json_feed_item['content_text'])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if len(title) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: json feed title is too long')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if json_feed_item.get('description'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['description'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(json_feed_item['description'])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if len(description) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                print('WARN: json feed description is too long')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if json_feed_item.get('tags'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if isinstance(json_feed_item['tags'], list):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    for tag_name in json_feed_item['tags']:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if not isinstance(tag_name, str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if ' ' in tag_name:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if not tag_name.startswith('#'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            tag_name = '#' + tag_name
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if tag_name not in description:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            description += ' ' + tag_name
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-09 14:18:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        link = remove_html(url_str)
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '://' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(link) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: json feed link is too long')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_domain = link.split('://')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '/' in item_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_domain = item_domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-11 13:42:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, item_domain, None, None):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if json_feed_item.get('date_published'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['date_published'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            pub_date = json_feed_item['date_published']
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['date_modified'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            pub_date = json_feed_item['date_modified']
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-15 19:44:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not _valid_feed_date(pub_date_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_filename = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        votes_status = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        fediverse_handle = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        extra_links = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _add_newswire_dict_entry(base_dir,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 result, pub_date_str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 title, link,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 votes_status, post_filename,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 description, moderated,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 mirrored, [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 None, system_language,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 fediverse_handle, extra_links)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) +
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								              ' json feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-06 19:48:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _atom_feed_yt_to_dict(base_dir: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                          moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                          max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                          max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                          session, debug: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                          system_language: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts an atom-style YouTube feed string to a dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<entry>' not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-11 13:42:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if is_blocked_domain(base_dir, 'www.youtube.com', None, None):
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    atom_items = xml_str.split('<entry>')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:57:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    first_entry = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for atom_item in atom_items:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:57:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if first_entry:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            first_entry = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not atom_item.strip():
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(atom_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: atom feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<published>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</published>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<yt:videoId>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</yt:videoId>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = atom_item.split('<title>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = _remove_cdata(title.split('</title>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_script(title, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = unescaped_text(title)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<media:description>' in atom_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								           '</media:description>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = atom_item.split('<media:description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = description.split('</media:description>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = unescaped_text(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif '<summary>' in atom_item and '</summary>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = atom_item.split('<summary>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = description.split('</summary>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = unescaped_text(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-27 21:27:51 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif '<content' in atom_item and '</content>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = atom_item.split('<content', 1)[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = description.split('>', 1)[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = description.split('</content>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = unescaped_text(description)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-28 12:00:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_script(description, None, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-27 21:27:51 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        link, _ = get_link_from_rss_item(atom_item, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            link = atom_item.split('<yt:videoId>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            link = link.split('</yt:videoId>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            link = 'https://www.youtube.com/watch?v=' + link.strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date = atom_item.split('<published>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.split('</published>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-15 19:44:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not _valid_feed_date(pub_date_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_filename = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        votes_status = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        podcast_properties = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            xml_podcast_to_dict(base_dir, atom_item, xml_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            podcast_properties['linkMimeType'] = 'video/youtube'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        fediverse_handle = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        extra_links = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _add_newswire_dict_entry(base_dir,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 result, pub_date_str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 title, link,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 votes_status, post_filename,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 description, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 podcast_properties, system_language,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                 fediverse_handle, extra_links)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) + ' YouTube feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _xml_str_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     session, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     preferred_podcast_formats: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                     system_language: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """Converts an xml string to a dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<yt:videoId>' in xml_str and '<yt:channelId>' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 16:10:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('YouTube feed: reading')
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-06 19:48:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _atom_feed_yt_to_dict(base_dir,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                     xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                     max_posts_per_source,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                     max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                     session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                     system_language)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if 'rss version="2.0"' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _xml2str_to_dict(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                max_posts_per_source, max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                max_categories_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                session, debug,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                system_language)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<?xml version="1.0"' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _xml1str_to_dict(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                max_posts_per_source, max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                max_categories_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                session, debug, preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                system_language)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if 'xmlns="http://www.w3.org/2005/Atom"' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _atom_feed_to_dict(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                  xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                  max_posts_per_source, max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                  session, debug, preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                  system_language)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if 'https://jsonfeed.org/version/1' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-19 14:26:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _json_feed_v1to_dict(base_dir,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                    xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                    max_posts_per_source,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                    max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                    session, debug, system_language)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _yt_channel_to_atom_feed(url: str) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:46:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts a YouTube channel url into an atom feed url
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if 'youtube.com/channel/' not in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return url
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    channel_id = url.split('youtube.com/channel/')[1].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    channel_url = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    print('YouTube feed: ' + channel_url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return channel_url
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:46:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_rss(base_dir: str, domain: str, session, url: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            max_posts_per_source: int, max_feed_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            max_categories_feed_item_size_kb: int, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            preferred_podcast_formats: [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            timeout_sec: int, system_language: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """Returns an RSS url as a dict
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not isinstance(url, str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('url: ' + str(url))
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('ERROR: get_rss url should be a string')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    headers = {
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 20:22:05 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        'Accept': 'text/xml, application/xml; charset=UTF-8'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    params = None
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    session_params = {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    session_headers = {}
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if headers:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        session_headers = headers
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if params:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        session_params = params
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    session_headers['User-Agent'] = \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not session:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('WARN: no session specified for get_rss')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    url = _yt_channel_to_atom_feed(url)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        result = \
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            session.get(url, headers=session_headers,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        params=session_params,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 20:33:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        timeout=timeout_sec,
							 | 
						
					
						
							
								
									
										
										
										
											2023-08-17 16:46:59 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        allow_redirects=True)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 11:40:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if result:
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-29 12:50:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            result_str = remove_zero_length_strings(result.text)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if int(len(result_str) / 1024) >= max_feed_size_kb:
							 | 
						
					
						
							
								
									
										
										
										
											2023-08-05 10:36:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                print('WARN: feed is too large: ' + url)
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-29 12:50:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            elif not contains_invalid_chars(result_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                return _xml_str_to_dict(base_dir, domain, result_str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        moderated, mirrored,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        max_posts_per_source,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        max_categories_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        session, debug,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                        system_language)
							 | 
						
					
						
							
								
									
										
										
										
											2023-08-05 10:36:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: feed contains invalid characters: ' + url)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 13:04:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            print('WARN: no result returned for feed ' + url)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except requests.exceptions.RequestException as ex:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('WARN: get_rss failed\nurl: ' + str(url) + ', ' +
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								              'headers: ' + str(session_headers) + ', ' +
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								              'params: ' + str(session_params) + ', ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except ValueError as ex:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('WARN: get_rss failed\nurl: ' + str(url) + ', ' +
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								              'headers: ' + str(session_headers) + ', ' +
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								              'params: ' + str(session_params) + ', ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except SocketError as ex:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ex.errno == errno.ECONNRESET:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: connection was reset during get_rss ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-20 12:52:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: get_rss, ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-03-02 21:54:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_rss_from_dict(newswire: {},
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                      http_prefix: str, domain_full: str,
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-06 19:48:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                      translate: {}) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns an rss feed from the current newswire dict.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    This allows other instances to subscribe to the same newswire
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_str = rss2header(http_prefix,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                         None, domain_full,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                         'Newswire', translate)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-03 14:41:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not newswire:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return ''
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for published, fields in newswire.items():
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-20 12:22:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '+00:00' in published:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            published = published.replace('+00:00', 'Z').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            published = published.replace(' ', 'T')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            published_with_offset = \
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-20 22:27:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                date_from_string_format(published, ["%Y-%m-%d %H:%M:%S%z"])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            published = published_with_offset.strftime("%Y-%m-%dT%H:%M:%S%z")
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 22:08:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        try:
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-20 22:27:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            pub_date = date_from_string_format(published,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                               ["%Y-%m-%dT%H:%M:%S%z"])
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-03 10:39:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        except BaseException as ex:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: Unable to convert date ' + published + ' ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 22:08:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 11:02:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '<item>\n' + \
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '  <title>' + escape_text(fields[0]) + '</title>\n'
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:52:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = remove_html(first_paragraph_from_string(fields[4]))
							 | 
						
					
						
							
								
									
										
										
										
											2023-01-02 10:24:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            '  <description>' + escape_text(description) + '</description>\n'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-08 15:07:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        url = fields[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-08 11:04:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '://' not in url:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if domain_full not in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                url = http_prefix + '://' + domain_full + url
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += '  <link>' + url + '</link>\n'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 22:12:27 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_date_str = pub_date.strftime("%a, %d %b %Y %H:%M:%S UT")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            '  <pubDate>' + rss_date_str + '</pubDate>\n' + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 11:02:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '</item>\n'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_str += rss2footer()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return rss_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _is_newswire_blog_post(post_json_object: {}) -> bool:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Is the given object a blog post?
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:47:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    There isn't any difference between a blog post and a newswire blog post
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    but we may here need to check for different properties than
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 13:49:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    is_blog_post does
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not post_json_object:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not has_object_dict(post_json_object):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_json_object['object'].get('summary') and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								       post_json_object['object'].get('url') and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								       post_json_object['object'].get('content') and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								       post_json_object['object'].get('published'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 14:41:10 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return is_public_post(post_json_object)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _get_hashtags_from_post(post_json_object: {}) -> []:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns a list of any hashtags within a post
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not has_object_dict(post_json_object):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not post_json_object['object'].get('tag'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not isinstance(post_json_object['object']['tag'], list):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    tags = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for tgname in post_json_object['object']['tag']:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not isinstance(tgname, dict):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not tgname.get('name'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not tgname.get('type'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if tgname['type'] != 'Hashtag':
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if tgname['name'] not in tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            tags.append(tgname['name'])
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return tags
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   newswire: {},
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                   max_blogs_per_account: int,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   index_filename: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   max_tags: int, system_language: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                   session, debug: bool) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Adds blogs for the given account to the newswire
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not os.path.isfile(index_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # local blog entries are unmoderated by default
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    moderated = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # local blogs can potentially be moderated
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    moderated_filename = \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 12:02:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        acct_dir(base_dir, nickname, domain) + '/.newswiremoderated'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if os.path.isfile(moderated_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        moderated = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-13 14:38:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-14 13:01:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        with open(index_filename, 'r', encoding='utf-8') as fp_index:
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-13 14:38:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            post_filename = 'start'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            while post_filename:
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-14 13:01:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_filename = fp_index.readline()
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-15 19:46:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if not post_filename:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if ctr >= max_blogs_per_account:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        break
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # if this is a full path then remove the directories
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '/' in post_filename:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    post_filename = post_filename.split('/')[-1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # filename of the post without any extension or path
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # This should also correspond to any index entry in
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # the posts cache
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                post_url = remove_eol(post_filename)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                post_url = post_url.replace('.json', '').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # read the post from file
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                full_post_filename = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    locate_post(base_dir, nickname,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                domain, post_url, False)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if not full_post_filename:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    print('Unable to locate post for newswire ' + post_url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if ctr >= max_blogs_per_account:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        break
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-15 19:46:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_json_object = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if full_post_filename:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    post_json_object = load_json(full_post_filename)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if _is_newswire_blog_post(post_json_object):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    published = post_json_object['object']['published']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    published = published.replace('T', ' ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    published = published.replace('Z', '+00:00')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    votes = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if os.path.isfile(full_post_filename + '.votes'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        votes = load_json(full_post_filename + '.votes')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    content = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        get_base_content_from_post(post_json_object,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                                   system_language)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    description = first_paragraph_from_string(content)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    description = remove_html(description)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    tags_from_post = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        _get_hashtags_from_post(post_json_object)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    summary = post_json_object['object']['summary']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    url2 = post_json_object['object']['url']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    url_str = get_url_from_post(url2)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    url3 = remove_html(url_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    fediverse_handle = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    extra_links = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    _add_newswire_dict_entry(base_dir,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             newswire, published,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             summary, url3,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             votes, full_post_filename,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             description, moderated, False,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             tags_from_post,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             max_tags, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             None, system_language,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             fediverse_handle, extra_links)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-13 14:38:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if ctr >= max_blogs_per_account:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    break
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    except OSError as exc:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('EX: _add_account_blogs_to_newswire unable to read ' +
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								              index_filename + ' ' + str(exc))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _add_blogs_to_newswire(base_dir: str, domain: str, newswire: {},
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_blogs_per_account: int,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           max_tags: int, system_language: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           session, debug: bool) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:47:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Adds blogs from each user account into the newswire
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:37:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    moderation_dict = {}
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 10:34:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # go through each account
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-12 12:35:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    dir_str = data_dir(base_dir)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for _, dirs, _ in os.walk(dir_str):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        for handle in dirs:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 18:46:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if not is_account_dir(handle):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 10:34:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:37:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            nickname = handle.split('@')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:30:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            # has this account been suspended?
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:37:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if is_suspended(base_dir, nickname):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 08:58:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:30:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-12-18 15:29:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            handle_dir = acct_handle_dir(base_dir, handle)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if os.path.isfile(handle_dir + '/.nonewswire'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 21:28:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            # is there a blogs timeline for this account?
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-12 12:35:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            account_dir = os.path.join(dir_str, handle)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            blogs_index = account_dir + '/tlblogs.index'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if os.path.isfile(blogs_index):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                domain = handle.split('@')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                _add_account_blogs_to_newswire(base_dir, nickname, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                               newswire, max_blogs_per_account,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                               blogs_index, max_tags,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                               system_language, session,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                               debug)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-13 22:13:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        break
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # sort the moderation dict into chronological order, latest first
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    sorted_moderation_dict = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        OrderedDict(sorted(moderation_dict.items(), reverse=True))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 12:15:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # save the moderation queue details for later display
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    newswire_moderation_filename = \
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-12 12:35:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        data_dir(base_dir) + '/newswiremoderation.txt'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if sorted_moderation_dict:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        save_json(sorted_moderation_dict, newswire_moderation_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 14:32:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # remove the file if there is nothing to moderate
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if os.path.isfile(newswire_moderation_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-05 10:17:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                os.remove(newswire_moderation_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            except OSError:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                print('EX: _add_blogs_to_newswire unable to delete ' +
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                      str(newswire_moderation_filename))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_dict_from_newswire(session, base_dir: str, domain: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_posts_per_source: int, max_feed_size_kb: int,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           max_tags: int, max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_newswire_posts: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           system_language: str, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           preferred_podcast_formats: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           timeout_sec: int) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:59:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Gets rss feeds as a dictionary from newswire file
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-12 12:35:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    subscriptions_filename = data_dir(base_dir) + '/newswire.txt'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not os.path.isfile(subscriptions_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    max_posts_per_source = 5
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 10:13:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # add rss feeds
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_feed = []
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-13 14:38:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        with open(subscriptions_filename, 'r', encoding='utf-8') as fp_sub:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            rss_feed = fp_sub.readlines()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    except OSError:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('EX: get_dict_from_newswire unable to read ' +
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								              subscriptions_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for url in rss_feed:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        url = url.strip()
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # Does this contain a url?
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '://' not in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # is this a comment?
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if url.startswith('#'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # should this feed be moderated?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        moderated = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '*' in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            moderated = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            url = url.replace('*', '').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-19 14:37:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # should this feed content be mirrored?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        mirrored = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '!' in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            mirrored = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            url = url.replace('!', '').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        items_list = get_rss(base_dir, domain, session, url,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             max_posts_per_source, max_feed_size_kb,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             max_categories_feed_item_size_kb, debug,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             preferred_podcast_formats,
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-25 17:26:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             timeout_sec, system_language)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if items_list:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            for date_str, item in items_list.items():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                result[date_str] = item
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-27 17:12:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        time.sleep(4)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:47:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # add blogs from each user account
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    _add_blogs_to_newswire(base_dir, domain, result,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_posts_per_source, max_tags, system_language,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           session, debug)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # sort into chronological order, latest first
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    sorted_result = OrderedDict(sorted(result.items(), reverse=True))
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # are there too many posts? If so then remove the oldest ones
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    no_of_posts = len(sorted_result.items())
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if no_of_posts > max_newswire_posts:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 12:05:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        removals = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        for date_str, item in sorted_result.items():
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 12:05:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 18:49:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if ctr > max_newswire_posts:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                removals.append(date_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for remov in removals:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            sorted_result.pop(remov)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return sorted_result
							 |