2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__filename__ = "newswire.py"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__author__ = "Bob Mottram"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__license__ = "AGPL3+"
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-03 13:58:20 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								__version__ = "1.3.0"
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__maintainer__ = "Bob Mottram"
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-10 16:14:50 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								__email__ = "bob@libreserver.org"
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								__status__ = "Production"
							 | 
						
					
						
							
								
									
										
										
										
											2021-06-26 11:27:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								__module_group__ = "Web Interface Columns"
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import os
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import json
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import requests
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import random
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-27 20:02:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import time
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from socket import error as SocketError
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import errno
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from datetime import datetime
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 15:33:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from datetime import timedelta
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:09:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from datetime import timezone
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from collections import OrderedDict
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 12:31:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import valid_post_date
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from categories import set_hashtag_category
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-21 11:58:50 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import remove_eol
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import get_domain_from_actor
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import valid_hash_tag
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 21:44:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import dangerous_svg
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 16:01:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import get_fav_filename_from_url
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import get_base_content_from_post
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import has_object_dict
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:52:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import first_paragraph_from_string
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 14:41:10 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import is_public_post
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 20:36:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import locate_post
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 15:13:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import load_json
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 14:47:21 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import save_json
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:37:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import is_suspended
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 17:53:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import contains_invalid_chars
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import remove_html
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 18:46:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import is_account_dir
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 12:02:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import acct_dir
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from utils import local_actor_url
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 21:55:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from blocking import is_blocked_domain
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from blocking import is_blocked_hashtag
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from filters import is_filtered
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from session import download_image_any_mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 12:11:05 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _remove_cdata(text: str) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 12:18:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Removes any CDATA from the given text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if 'CDATA[' in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        text = text.split('CDATA[')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ']' in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            text = text.split(']')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def rss2header(http_prefix: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               nickname: str, domain_full: str,
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               title: str, translate: {}) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:22:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Header for an RSS 2.0 feed
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_str = \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "<rss version=\"2.0\">" + \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        '<channel>'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-13 17:14:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if title.startswith('News'):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <title>Newswire</title>' + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <link>' + http_prefix + '://' + domain_full + \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '/newswire.xml' + '</link>'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-13 17:14:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    elif title.startswith('Site'):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <title>' + domain_full + '</title>' + \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            '    <link>' + http_prefix + '://' + domain_full + \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-13 17:17:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '/blog/rss.xml' + '</link>'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <title>' + translate[title] + '</title>' + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-08-14 11:13:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '    <link>' + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            local_actor_url(http_prefix, nickname, domain_full) + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-08-14 11:13:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '/rss.xml' + '</link>'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return rss_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def rss2footer() -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:22:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Footer for an RSS 2.0 feed
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_str = '</channel></rss>'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return rss_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_newswire_tags(text: str, max_tags: int) -> []:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns a list of hashtags found in the given text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:46:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '#' not in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if ' ' not in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    text_simplified = \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        text.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    text_simplified = text_simplified.replace('. ', ' ').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if text_simplified.endswith('.'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        text_simplified = text_simplified[:len(text_simplified)-1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    words = text_simplified.split(' ')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    tags = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for wrd in words:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not wrd.startswith('#'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if len(wrd) <= 1:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if wrd in tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        tags.append(wrd)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(tags) >= max_tags:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return tags
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def limit_word_lengths(text: str, max_word_length: int) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Limits the maximum length of words so that the newswire
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    column cannot become too wide
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if ' ' not in text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    words = text.split(' ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for wrd in words:
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(wrd) > max_word_length:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            wrd = wrd[:max_word_length]
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if result:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            result += ' '
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        result += wrd
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_newswire_favicon_url(url: str) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns a favicon url from the given article link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if '://' not in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return '/newswire_favicon.ico'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if url.startswith('http://'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not (url.endswith('.onion') or url.endswith('.i2p')):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return '/newswire_favicon.ico'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    domain = url.split('://')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if '/' not in domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return url + '/favicon.ico'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    domain = domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return url.split('://')[0] + '://' + domain + '/favicon.ico'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _download_newswire_feed_favicon(session, base_dir: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                    link: str, debug: bool) -> bool:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Downloads the favicon for the given feed link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    fav_url = get_newswire_favicon_url(link)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '://' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    timeout_sec = 10
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    image_data, mime_type = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        download_image_any_mime_type(session, fav_url, timeout_sec, debug)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not image_data or not mime_type:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 23:59:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # update the favicon url
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    extensions_to_mime = {
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        'ico': 'x-icon',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'png': 'png',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'jpg': 'jpeg',
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-06 11:04:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        'jxl': 'jxl',
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        'gif': 'gif',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'avif': 'avif',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'svg': 'svg+xml',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'webp': 'webp'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for ext, mime_ext in extensions_to_mime.items():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if 'image/' + mime_ext in mime_type:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            fav_url = fav_url.replace('.ico', '.' + ext)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # create cached favicons directory if needed
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not os.path.isdir(base_dir + '/favicons'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        os.mkdir(base_dir + '/favicons')
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # check svg for dubious scripts
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if fav_url.endswith('.svg'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        image_data_str = str(image_data)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if dangerous_svg(image_data_str, False):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # save to the cache
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    fav_filename = get_fav_filename_from_url(base_dir, fav_url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if os.path.isfile(fav_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 21:14:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return True
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        with open(fav_filename, 'wb+') as fp_fav:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            fp_fav.write(image_data)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except OSError:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('EX: failed writing favicon ' + fav_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _add_newswire_dict_entry(base_dir: str, domain: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             newswire: {}, date_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             title: str, link: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             votes_status: str, post_filename: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             description: str, moderated: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             mirrored: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             tags: [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             max_tags: int, session, debug: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             podcast_properties: {}) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:25:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Update the newswire dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-12 15:44:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # remove any markup
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    title = remove_html(title)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-12 15:44:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    all_text = title + ' ' + description
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # check that none of the text is filtered against
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if is_filtered(base_dir, None, None, all_text):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-17 16:08:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    title = limit_word_lengths(title, 13)
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if tags is None:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        tags = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # extract hashtags from the text of the feed post
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    post_tags = get_newswire_tags(all_text, max_tags)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:15:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # Include tags from podcast categories
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if podcast_properties:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 23:06:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if podcast_properties.get('explicit'):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 13:15:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '#nsfw' not in post_tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                post_tags.append('#nsfw')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 23:06:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:15:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_tags += podcast_properties['categories']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # combine the tags into a single list
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 12:57:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for tag in tags:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if tag in post_tags:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(post_tags) < max_tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            post_tags.append(tag)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # check that no tags are blocked
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for tag in post_tags:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 21:55:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_hashtag(base_dir, tag):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:18:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    _download_newswire_feed_favicon(session, base_dir, link, debug)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    newswire[date_str] = [
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        votes_status,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_filename,
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        moderated,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_tags,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        podcast_properties
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ]
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 19:25:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _valid_feed_date(pub_date: str, debug: bool = False) -> bool:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # convert from YY-MM-DD HH:MM:SS+00:00 to
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # YY-MM-DDTHH:MM:SSZ
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    post_date = pub_date.replace(' ', 'T').replace('+00:00', 'Z')
							 | 
						
					
						
							
								
									
										
										
										
											2022-03-30 18:13:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '.' in post_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ending = post_date.split('.')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        timezone_str = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for ending_char in ending:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not ending_char.isdigit():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                timezone_str += ending_char
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if timezone_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            post_date = post_date.split('.')[0] + timezone_str
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return valid_post_date(post_date, 90, debug)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns a UTC date string based on the given date string
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    This tries a number of formats to see which work
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if ':00:00' in pub_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # If this was published exactly on the hour then assign a
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # random minute and second to make this item relatively unique
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        randgen = random.Random(unique_string_identifier)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        rand_min = randgen.randint(0, 59)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        rand_sec = randgen.randint(0, 59)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        replace_time_str = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ':' + str(rand_min).zfill(2) + ':' + str(rand_sec).zfill(2)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.replace(':00:00', replace_time_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    formats = ("%a, %d %b %Y %H:%M:%S %z",
							 | 
						
					
						
							
								
									
										
										
										
											2021-10-17 14:17:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S Z",
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-07 19:09:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S GMT",
							 | 
						
					
						
							
								
									
										
										
										
											2021-10-17 14:24:21 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S EST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S PST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S AST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S CST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S MST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S AKST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S HST",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%a, %d %b %Y %H:%M:%S UT",
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               "%Y-%m-%dT%H:%M:%SZ",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               "%Y-%m-%dT%H:%M:%S%z")
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    published_date = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for date_format in formats:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ',' in pub_date and ',' not in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if ',' not in pub_date and ',' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'Z' in pub_date and 'Z' not in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'Z' not in pub_date and 'Z' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'EST' not in pub_date and 'EST' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'GMT' not in pub_date and 'GMT' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-07 19:09:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'EST' in pub_date and 'EST' not in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'UT' not in pub_date and 'UT' in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'UT' in pub_date and 'UT' not in date_format:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-03-30 18:13:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # remove any fraction of a second
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '.' in pub_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ending = pub_date.split('.')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            timezone_str = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            for ending_char in ending:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if not ending_char.isdigit():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    timezone_str += ending_char
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if timezone_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                pub_date = pub_date.split('.')[0] + timezone_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            published_date = datetime.strptime(pub_date, date_format)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        except BaseException:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if published_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if pub_date.endswith(' EST'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                hours_added = timedelta(hours=5)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                published_date = published_date + hours_added
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    pub_date_str = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if published_date:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        offset = published_date.utcoffset()
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 20:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if offset:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            published_date = published_date - offset
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:09:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # convert local date to UTC
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        published_date = published_date.replace(tzinfo=timezone.utc)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = str(published_date)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not pub_date_str.endswith('+00:00'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pub_date_str += '+00:00'
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-07 19:33:27 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('WARN: unrecognized date format: ' + pub_date)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return pub_date_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def load_hashtag_categories(base_dir: str, language: str) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Loads an rss file containing hashtag categories
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    hashtag_categories_filename = base_dir + '/categories.xml'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not os.path.isfile(hashtag_categories_filename):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        hashtag_categories_filename = \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            base_dir + '/defaultcategories/' + language + '.xml'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not os.path.isfile(hashtag_categories_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    with open(hashtag_categories_filename, 'r', encoding='utf-8') as fp_cat:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        xml_str = fp_cat.read()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _xml2str_to_hashtag_categories(base_dir, xml_str, 1024, True)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                   max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                   force: bool = False) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Updates hashtag categories based upon an rss feed
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_items = xml_str.split('<item>')
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_categories_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for rss_item in rss_items:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(rss_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: rss categories feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<description>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</description>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        category_str = rss_item.split('<title>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        category_str = category_str.split('</title>')[0].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not category_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'CDATA' in category_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-03 10:12:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        hashtag_list_str = rss_item.split('<description>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        hashtag_list_str = hashtag_list_str.split('</description>')[0].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not hashtag_list_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'CDATA' in hashtag_list_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-03 10:12:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        hashtag_list = hashtag_list_str.split(' ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not is_blocked_hashtag(base_dir, category_str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            for hashtag in hashtag_list:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                set_hashtag_category(base_dir, hashtag, category_str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                     False, force)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _get_podcast_categories(xml_item: str, xml_str: str) -> str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """ get podcast categories if they exist. These can be turned into hashtags
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    podcast_categories = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    episode_category_tags = ['<itunes:category', '<category']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for category_tag in episode_category_tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        item_str = xml_item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if category_tag not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if category_tag not in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_str = xml_str
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        category_list = item_str.split(category_tag)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        first_category = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:12:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        for episode_category in category_list:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if first_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                first_category = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if 'text="' in episode_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_category = episode_category.split('text="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' in episode_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = episode_category.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        episode_category.lower().replace(' ', '')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = episode_category.replace('#', '')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if episode_category not in podcast_categories:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if valid_hash_tag(episode_category):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            podcast_categories.append('#' + episode_category)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '>' in episode_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_category = episode_category.split('>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '<' in episode_category:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = episode_category.split('<')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        episode_category.lower().replace(' ', '')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    episode_category = episode_category.replace('#', '')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if episode_category not in podcast_categories:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if valid_hash_tag(episode_category):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            podcast_categories.append('#' + episode_category)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return podcast_categories
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _valid_podcast_entry(base_dir: str, key: str, entry: {}) -> bool:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """Is the given podcast namespace entry valid?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    https://github.com/Podcastindex-org/podcast-namespace/
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    blob/main/proposal-docs/social/social.md#socialinteract-element
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-05-30 15:15:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if key in ('socialInteract', 'discussion'):
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not entry.get('protocol'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-21 09:21:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not entry.get('uri'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not entry.get('text'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if not entry.get('url'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if entry['protocol'].tolower() != 'activitypub':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-21 09:21:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if entry.get('uri'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            post_url = entry['uri']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        elif entry.get('url'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            post_url = entry['uri']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            post_url = entry['text']
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '://' not in post_url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_domain, _ = get_domain_from_actor(post_url)
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not post_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, post_domain):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def xml_podcast_to_dict(base_dir: str, xml_item: str, xml_str: str) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """podcasting extensions for RSS feeds
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    See https://github.com/Podcastindex-org/podcast-namespace/
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    blob/main/docs/1.0.md
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 15:40:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    https://github.com/Podcastindex-org/podcast-namespace/
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    blob/main/proposal-docs/social/social.md#socialinteract-element
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<podcast:' not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '<itunes:' not in xml_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 18:05:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '<media:thumbnail' not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    podcast_properties = {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "locations": [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "persons": [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "soundbites": [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "transcripts": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "valueRecipients": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 15:38:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "trailers": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-05-03 16:38:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "chapters": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "discussion": [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "episode": '',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "socialInteract": [],
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    pod_lines = xml_item.split('<podcast:')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for pod_line in pod_lines:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ctr == 0 or '>' not in pod_line:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ' ' not in pod_line.split('>')[0]:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_key = pod_line.split('>')[0].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_val = pod_line.split('>', 1)[1].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '<' in pod_val:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                pod_val = pod_val.split('<')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if pod_key in podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                podcast_properties[pod_key] = pod_val
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pod_key = pod_line.split(' ')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pod_fields = (
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'url', 'geo', 'osm', 'type', 'method', 'group',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'owner', 'srcset', 'img', 'role', 'address', 'suggested',
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            'startTime', 'duration', 'href', 'name', 'pubdate',
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 15:38:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            'length', 'season', 'email', 'platform', 'protocol',
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-16 13:27:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            'accountId', 'priority', 'podcastAccountId',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'podcastAccountUrl'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pod_entry = {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for pod_field in pod_fields:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if pod_field + '="' not in pod_line:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_str = pod_line.split(pod_field + '="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '"' not in pod_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_val = pod_str.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_entry[pod_field] = pod_val
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pod_text = pod_line.split('>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '<' in pod_text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            pod_text = pod_text.split('<')[0].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if pod_text:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                pod_entry['text'] = pod_text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 16:00:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        appended = False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if pod_key + 's' in podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if isinstance(podcast_properties[pod_key + 's'], list):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                podcast_properties[pod_key + 's'].append(pod_entry)
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 16:00:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                appended = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not appended:
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 16:05:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            # if there are repeated keys then only use the first one
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not podcast_properties.get(pod_key):
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if _valid_podcast_entry(base_dir, pod_key, pod_entry):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    podcast_properties[pod_key] = pod_entry
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # get the image for the podcast, if it exists
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    podcast_episode_image = None
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    episode_image_tags = ['<itunes:image', '<media:thumbnail']
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for image_tag in episode_image_tags:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_str = xml_item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if image_tag not in xml_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if image_tag not in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_str = xml_str
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        episode_image = item_str.split(image_tag)[1]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:55:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if image_tag + ' ' in item_str and '>' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            episode_image = episode_image.split('>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if 'href="' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            episode_image = episode_image.split('href="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '"' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_image = episode_image.split('"')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                podcast_episode_image = episode_image
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif 'url="' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            episode_image = episode_image.split('url="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '"' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_image = episode_image.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                podcast_episode_image = episode_image
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 18:05:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif '>' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            episode_image = episode_image.split('>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '<' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                episode_image = episode_image.split('<')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '://' in episode_image and '.' in episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    podcast_episode_image = episode_image
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # get categories if they exist. These can be turned into hashtags
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    podcast_categories = _get_podcast_categories(xml_item, xml_str)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if podcast_episode_image:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        podcast_properties['image'] = podcast_episode_image
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        podcast_properties['categories'] = podcast_categories
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<itunes:explicit>Y' in xml_item or \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								           '<itunes:explicit>T' in xml_item or \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								           '<itunes:explicit>1' in xml_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            podcast_properties['explicit'] = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            podcast_properties['explicit'] = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<podcast:' not in xml_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return podcast_properties
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 12:53:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_link_from_rss_item(rss_item: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           preferred_mime_types: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           proxy_type: str) -> (str, str):
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Extracts rss link from rss item string
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    mime_type = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 12:53:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if preferred_mime_types and '<podcast:alternateEnclosure ' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        enclosures = rss_item.split('<podcast:alternateEnclosure ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for enclosure in enclosures:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if ctr == 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '</podcast:alternateEnclosure' not in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            enclosure = enclosure.split('</podcast:alternateEnclosure')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if 'type="' not in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            mime_type = enclosure.split('type="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if '"' in mime_type:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                mime_type = mime_type.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if mime_type not in preferred_mime_types:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if 'uri="' not in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            uris = enclosure.split('uri="')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr2 = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            for uri in uris:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if ctr2 == 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    ctr2 += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                ctr2 += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' not in uri:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                link = uri.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '://' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if proxy_type:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if proxy_type == 'tor' and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       '.onion/' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if proxy_type == 'onion' and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       '.onion/' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if proxy_type == 'i2p' and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       '.i2p/' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    return link, mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2022-05-30 15:15:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if '.onion/' not in link and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                   '.i2p/' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    return link, mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 12:53:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<enclosure ' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # get link from audio or video enclosure
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        enclosure = rss_item.split('<enclosure ')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '>' in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            enclosure = enclosure.split('>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if ' type="' in enclosure:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                mime_type = enclosure.split(' type="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' in mime_type:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    mime_type = mime_type.split('"')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if 'url="' in enclosure and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               ('"audio/' in enclosure or '"video/' in enclosure):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                link_str = enclosure.split('url="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' in link_str:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 16:18:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    link = link_str.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if '://' in link:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        return link, mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 16:18:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<link>' in rss_item and '</link>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link = rss_item.split('<link>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link = link.split('</link>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '://' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return None, None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    elif '<link ' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link_str = rss_item.split('<link ')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '>' in link_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            link_str = link_str.split('>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if 'href="' in link_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                link_str = link_str.split('href="')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if '"' in link_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    link = link_str.split('"')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return link, mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     session, debug: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                     preferred_podcast_formats: []) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts an xml RSS 2.0 string to a dictionary
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<item>' not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-09 10:38:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # is this an rss feed containing hashtag categories?
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<title>#categories</title>' in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _xml2str_to_hashtag_categories(base_dir, xml_str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                       max_categories_feed_item_size_kb)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-09 10:38:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_items = xml_str.split('<item>')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    first_item = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for rss_item in rss_items:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if first_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            first_item = False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(rss_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: rss feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<link' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<pubDate>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</pubDate>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = rss_item.split('<title>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = _remove_cdata(title.split('</title>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_html(title)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<description>' in rss_item and '</description>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = rss_item.split('<description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description.split('</description>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-21 23:18:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '<media:description>' in rss_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               '</media:description>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                description = rss_item.split('<media:description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-21 23:18:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = description.split('</media:description>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        proxy_type = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if domain.endswith('.onion'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'tor'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        elif domain.endswith('.i2p'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'i2p'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link, link_mime_type = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            get_link_from_rss_item(rss_item, preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   proxy_type)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_domain = link.split('://')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '/' in item_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_domain = item_domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, item_domain):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date = rss_item.split('<pubDate>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.split('</pubDate>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if _valid_feed_date(pub_date_str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_filename = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                votes_status = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                podcast_properties = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    xml_podcast_to_dict(base_dir, rss_item, xml_str)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    podcast_properties['linkMimeType'] = link_mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                _add_newswire_dict_entry(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         result, pub_date_str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         title, link,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         votes_status, post_filename,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         description, moderated,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         mirrored, [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                         podcast_properties)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) + ' rss 2.0 feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     session, debug: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                     preferred_podcast_formats: []) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts an xml RSS 1.0 string to a dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    https://validator.w3.org/feed/docs/rss1.html
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    item_str = '<item'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if item_str not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # is this an rss feed containing hashtag categories?
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<title>#categories</title>' in xml_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        _xml2str_to_hashtag_categories(base_dir, xml_str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                       max_categories_feed_item_size_kb)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_items = xml_str.split(item_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    first_item = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for rss_item in rss_items:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if first_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            first_item = False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(rss_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 17:18:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: rss 1.0 feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if rss_item.startswith('s>'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<link' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<dc:date>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</dc:date>' not in rss_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = rss_item.split('<title>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = _remove_cdata(title.split('</title>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_html(title)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<description>' in rss_item and '</description>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = rss_item.split('<description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description.split('</description>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '<media:description>' in rss_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               '</media:description>' in rss_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                description = rss_item.split('<media:description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = description.split('</media:description>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        proxy_type = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if domain.endswith('.onion'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'tor'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        elif domain.endswith('.i2p'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'i2p'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link, link_mime_type = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            get_link_from_rss_item(rss_item, preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   proxy_type)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_domain = link.split('://')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '/' in item_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_domain = item_domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, item_domain):
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date = rss_item.split('<dc:date>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.split('</dc:date>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if _valid_feed_date(pub_date_str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_filename = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                votes_status = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                podcast_properties = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    xml_podcast_to_dict(base_dir, rss_item, xml_str)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    podcast_properties['linkMimeType'] = link_mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                _add_newswire_dict_entry(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         result, pub_date_str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         title, link,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         votes_status, post_filename,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         description, moderated,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         mirrored, [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                         podcast_properties)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) + ' rss 1.0 feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       session, debug: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                       preferred_podcast_formats: []) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts an atom feed string to a dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<entry>' not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    atom_items = xml_str.split('<entry>')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    first_item = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for atom_item in atom_items:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if first_item:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            first_item = False
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(atom_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: atom feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<link' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<updated>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</updated>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = atom_item.split('<title>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = _remove_cdata(title.split('</title>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = remove_html(title)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<summary>' in atom_item and '</summary>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = atom_item.split('<summary>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description.split('</summary>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-21 23:29:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if '<media:description>' in atom_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								               '</media:description>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                description = atom_item.split('<media:description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-21 23:29:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = description.split('</media:description>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        proxy_type = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if domain.endswith('.onion'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'tor'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        elif domain.endswith('.i2p'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            proxy_type = 'i2p'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        link, link_mime_type = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            get_link_from_rss_item(atom_item, preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   proxy_type)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_domain = link.split('://')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '/' in item_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_domain = item_domain.split('/')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, item_domain):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date = atom_item.split('<updated>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.split('</updated>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if _valid_feed_date(pub_date_str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_filename = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                votes_status = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                podcast_properties = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    xml_podcast_to_dict(base_dir, atom_item, xml_str)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if podcast_properties:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    podcast_properties['linkMimeType'] = link_mime_type
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                _add_newswire_dict_entry(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         result, pub_date_str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         title, link,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         votes_status, post_filename,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         description, moderated,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         mirrored, [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                         podcast_properties)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) + ' atom feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                         moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                         max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                         max_feed_item_size_kb: int,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                         session, debug: bool) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts a json feed string to a dictionary
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:46:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    See https://jsonfeed.org/version/1.1
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '"items"' not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        feed_json = json.loads(xml_str)
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except BaseException:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('EX: _json_feed_v1to_dict unable to load json ' + str(xml_str))
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not feed_json.get('version'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not feed_json['version'].startswith('https://jsonfeed.org/version/1'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not feed_json.get('items'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not isinstance(feed_json['items'], list):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:47:49 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for json_feed_item in feed_json['items']:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not json_feed_item:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not isinstance(json_feed_item, dict):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not json_feed_item.get('url'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not isinstance(json_feed_item['url'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not json_feed_item.get('date_published'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not json_feed_item.get('date_modified'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not json_feed_item.get('content_text'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not json_feed_item.get('content_html'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if json_feed_item.get('content_html'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['content_html'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            title = remove_html(json_feed_item['content_html'])
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['content_text'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            title = remove_html(json_feed_item['content_text'])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if len(title) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: json feed title is too long')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if json_feed_item.get('description'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['description'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(json_feed_item['description'])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if len(description) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                print('WARN: json feed description is too long')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if json_feed_item.get('tags'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if isinstance(json_feed_item['tags'], list):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    for tag_name in json_feed_item['tags']:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if not isinstance(tag_name, str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if ' ' in tag_name:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if not tag_name.startswith('#'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            tag_name = '#' + tag_name
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if tag_name not in description:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            description += ' ' + tag_name
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        link = json_feed_item['url']
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '://' not in link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(link) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: json feed link is too long')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        item_domain = link.split('://')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '/' in item_domain:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            item_domain = item_domain.split('/')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if is_blocked_domain(base_dir, item_domain):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if json_feed_item.get('date_published'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['date_published'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            pub_date = json_feed_item['date_published']
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if not isinstance(json_feed_item['date_modified'], str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            pub_date = json_feed_item['date_modified']
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if _valid_feed_date(pub_date_str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_filename = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                votes_status = []
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                _add_newswire_dict_entry(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         result, pub_date_str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         title, link,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         votes_status, post_filename,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         description, moderated,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         mirrored, [], 32, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                         None)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) +
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								              ' json feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                          moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                          max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                          max_feed_item_size_kb: int,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                          session, debug: bool) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts an atom-style YouTube feed string to a dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<entry>' not in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 21:55:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if is_blocked_domain(base_dir, 'www.youtube.com'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    atom_items = xml_str.split('<entry>')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    post_ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    max_bytes = max_feed_item_size_kb * 1024
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:57:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    first_entry = True
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for atom_item in atom_items:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-13 22:57:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if first_entry:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            first_entry = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not atom_item.strip():
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if len(atom_item) > max_bytes:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: atom feed item is too big')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<title>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</title>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<published>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</published>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<yt:videoId>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '</yt:videoId>' not in atom_item:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = atom_item.split('<title>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        title = _remove_cdata(title.split('</title>')[0])
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '<media:description>' in atom_item and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								           '</media:description>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = atom_item.split('<media:description>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = description.split('</media:description>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        elif '<summary>' in atom_item and '</summary>' in atom_item:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            description = atom_item.split('<summary>')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = description.split('</summary>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        link, _ = get_link_from_rss_item(atom_item, None, None)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            link = atom_item.split('<yt:videoId>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            link = link.split('</yt:videoId>')[0]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            link = 'https://www.youtube.com/watch?v=' + link.strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not link:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pub_date = atom_item.split('<published>')[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date = pub_date.split('</published>')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        unique_string_identifier = title + ' ' + link
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if pub_date_str:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if _valid_feed_date(pub_date_str):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_filename = ''
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                votes_status = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                podcast_properties = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    xml_podcast_to_dict(base_dir, atom_item, xml_str)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if podcast_properties:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 18:48:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    podcast_properties['linkMimeType'] = 'video/youtube'
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                _add_newswire_dict_entry(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         result, pub_date_str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         title, link,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         votes_status, post_filename,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         description, moderated, mirrored,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         [], 32, session, debug,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                         podcast_properties)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_ctr += 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if post_ctr >= max_posts_per_source:
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    break
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_ctr > 0:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('Added ' + str(post_ctr) + ' YouTube feed items to newswire')
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _xml_str_to_dict(base_dir: str, domain: str, xml_str: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_posts_per_source: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                     session, debug: bool,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                     preferred_podcast_formats: []) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """Converts an xml string to a dictionary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<yt:videoId>' in xml_str and '<yt:channelId>' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 16:10:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('YouTube feed: reading')
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _atom_feed_yt_to_dict(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                     xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                     max_posts_per_source,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                     max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                     session, debug)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if 'rss version="2.0"' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _xml2str_to_dict(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                max_posts_per_source, max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                max_categories_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                preferred_podcast_formats)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if '<?xml version="1.0"' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _xml1str_to_dict(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                max_posts_per_source, max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                max_categories_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                session, debug, preferred_podcast_formats)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if 'xmlns="http://www.w3.org/2005/Atom"' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _atom_feed_to_dict(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                  xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                  max_posts_per_source, max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                  session, debug, preferred_podcast_formats)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if 'https://jsonfeed.org/version/1' in xml_str:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return _json_feed_v1to_dict(base_dir, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                    xml_str, moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                    max_posts_per_source,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                    max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                    session, debug)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _yt_channel_to_atom_feed(url: str) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:46:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Converts a YouTube channel url into an atom feed url
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if 'youtube.com/channel/' not in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return url
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    channel_id = url.split('youtube.com/channel/')[1].strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    channel_url = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    print('YouTube feed: ' + channel_url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return channel_url
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 10:46:54 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_rss(base_dir: str, domain: str, session, url: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            moderated: bool, mirrored: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            max_posts_per_source: int, max_feed_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            max_categories_feed_item_size_kb: int, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            preferred_podcast_formats: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            timeout_sec: int) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """Returns an RSS url as a dict
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not isinstance(url, str):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print('url: ' + str(url))
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('ERROR: get_rss url should be a string')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    headers = {
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-14 20:22:05 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        'Accept': 'text/xml, application/xml; charset=UTF-8'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    params = None
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    session_params = {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    session_headers = {}
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if headers:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        session_headers = headers
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if params:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        session_params = params
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    session_headers['User-Agent'] = \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not session:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('WARN: no session specified for get_rss')
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    url = _yt_channel_to_atom_feed(url)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        result = \
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            session.get(url, headers=session_headers,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        params=session_params,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 20:33:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        timeout=timeout_sec,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        allow_redirects=False)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 11:40:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if result:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if int(len(result.text) / 1024) < max_feed_size_kb and \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 17:53:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								               not contains_invalid_chars(result.text):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                return _xml_str_to_dict(base_dir, domain, result.text,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                        moderated, mirrored,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        max_posts_per_source,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        max_categories_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                        session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                        preferred_podcast_formats)
							 | 
						
					
						
							
								
									
										
										
										
											2022-05-30 15:15:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: feed is too large, ' +
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                  'or contains invalid characters: ' + url)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 13:04:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            print('WARN: no result returned for feed ' + url)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except requests.exceptions.RequestException as ex:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('WARN: get_rss failed\nurl: ' + str(url) + ', ' +
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								              'headers: ' + str(session_headers) + ', ' +
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								              'params: ' + str(session_params) + ', ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except ValueError as ex:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print('WARN: get_rss failed\nurl: ' + str(url) + ', ' +
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								              'headers: ' + str(session_headers) + ', ' +
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								              'params: ' + str(session_params) + ', ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    except SocketError as ex:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if ex.errno == errno.ECONNRESET:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: connection was reset during get_rss ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-20 12:52:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: get_rss, ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_rs_sfrom_dict(base_dir: str, newswire: {},
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                      http_prefix: str, domain_full: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                      title: str, translate: {}) -> str:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns an rss feed from the current newswire dict.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    This allows other instances to subscribe to the same newswire
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_str = rss2header(http_prefix,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                         None, domain_full,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                         'Newswire', translate)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-03 14:41:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not newswire:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return ''
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for published, fields in newswire.items():
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-20 12:22:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '+00:00' in published:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            published = published.replace('+00:00', 'Z').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            published = published.replace(' ', 'T')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            published_with_offset = \
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-20 12:37:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                datetime.strptime(published, "%Y-%m-%d %H:%M:%S%z")
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            published = published_with_offset.strftime("%Y-%m-%dT%H:%M:%SZ")
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 22:08:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            pub_date = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
							 | 
						
					
						
							
								
									
										
										
										
											2022-02-03 10:39:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        except BaseException as ex:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            print('WARN: Unable to convert date ' + published + ' ' + str(ex))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 22:08:13 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 11:02:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '<item>\n' + \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            '  <title>' + fields[0] + '</title>\n'
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:52:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        description = remove_html(first_paragraph_from_string(fields[4]))
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += '  <description>' + description + '</description>\n'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-08 15:07:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        url = fields[1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-08 11:04:52 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if '://' not in url:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if domain_full not in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                url = http_prefix + '://' + domain_full + url
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_str += '  <link>' + url + '</link>\n'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 22:12:27 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_date_str = pub_date.strftime("%a, %d %b %Y %H:%M:%S UT")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        rss_str += \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            '  <pubDate>' + rss_date_str + '</pubDate>\n' + \
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-04 11:02:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            '</item>\n'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_str += rss2footer()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return rss_str
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _is_newswire_blog_post(post_json_object: {}) -> bool:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Is the given object a blog post?
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-25 10:47:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    There isn't any difference between a blog post and a newswire blog post
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    but we may here need to check for different properties than
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 13:49:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    is_blog_post does
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not post_json_object:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not has_object_dict(post_json_object):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return False
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if post_json_object['object'].get('summary') and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								       post_json_object['object'].get('url') and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								       post_json_object['object'].get('content') and \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								       post_json_object['object'].get('published'):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-28 14:41:10 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return is_public_post(post_json_object)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _get_hashtags_from_post(post_json_object: {}) -> []:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Returns a list of any hashtags within a post
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not has_object_dict(post_json_object):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not post_json_object['object'].get('tag'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not isinstance(post_json_object['object']['tag'], list):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    tags = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for tgname in post_json_object['object']['tag']:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not isinstance(tgname, dict):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not tgname.get('name'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not tgname.get('type'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if tgname['type'] != 'Hashtag':
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if tgname['name'] not in tags:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            tags.append(tgname['name'])
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return tags
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   newswire: {},
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                   max_blogs_per_account: int,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   index_filename: str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                   max_tags: int, system_language: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                   session, debug: bool) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Adds blogs for the given account to the newswire
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not os.path.isfile(index_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # local blog entries are unmoderated by default
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    moderated = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # local blogs can potentially be moderated
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    moderated_filename = \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 12:02:29 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        acct_dir(base_dir, nickname, domain) + '/.newswiremoderated'
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if os.path.isfile(moderated_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        moderated = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    with open(index_filename, 'r', encoding='utf-8') as index_file:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        post_filename = 'start'
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ctr = 0
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        while post_filename:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            post_filename = index_file.readline()
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if post_filename:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                # if this is a full path then remove the directories
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if '/' in post_filename:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    post_filename = post_filename.split('/')[-1]
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # filename of the post without any extension or path
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # This should also correspond to any index entry in
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # the posts cache
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-21 11:58:50 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_url = remove_eol(post_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_url = post_url.replace('.json', '').strip()
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                # read the post from file
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                full_post_filename = \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 20:36:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    locate_post(base_dir, nickname,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                domain, post_url, False)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if not full_post_filename:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    print('Unable to locate post for newswire ' + post_url)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    ctr += 1
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    if ctr >= max_blogs_per_account:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        break
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 13:34:04 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                post_json_object = None
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if full_post_filename:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    post_json_object = load_json(full_post_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if _is_newswire_blog_post(post_json_object):
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    published = post_json_object['object']['published']
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    published = published.replace('T', ' ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    published = published.replace('Z', '+00:00')
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 20:17:34 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    votes = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    if os.path.isfile(full_post_filename + '.votes'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        votes = load_json(full_post_filename + '.votes')
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-18 14:15:16 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    content = \
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        get_base_content_from_post(post_json_object,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                                   system_language)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:52:08 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    description = first_paragraph_from_string(content)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    description = remove_html(description)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    tags_from_post = _get_hashtags_from_post(post_json_object)
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    summary = post_json_object['object']['summary']
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    _add_newswire_dict_entry(base_dir, domain,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             newswire, published,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             summary,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             post_json_object['object']['url'],
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                             votes, full_post_filename,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                             description, moderated, False,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                             tags_from_post,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                             max_tags, session, debug,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                             None)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if ctr >= max_blogs_per_account:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                break
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _add_blogs_to_newswire(base_dir: str, domain: str, newswire: {},
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_blogs_per_account: int,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           max_tags: int, system_language: str,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           session, debug: bool) -> None:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:47:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Adds blogs from each user account into the newswire
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:37:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    moderation_dict = {}
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 10:34:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # go through each account
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for _, dirs, _ in os.walk(base_dir + '/accounts'):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        for handle in dirs:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-26 18:46:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if not is_account_dir(handle):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 10:34:56 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:37:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            nickname = handle.split('@')[0]
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:30:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            # has this account been suspended?
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-27 15:37:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if is_suspended(base_dir, nickname):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 08:58:44 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:30:11 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if os.path.isfile(base_dir + '/accounts/' + handle +
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 21:28:40 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                              '/.nonewswire'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            # is there a blogs timeline for this account?
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            account_dir = os.path.join(base_dir + '/accounts', handle)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            blogs_index = account_dir + '/tlblogs.index'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if os.path.isfile(blogs_index):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                domain = handle.split('@')[1]
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                _add_account_blogs_to_newswire(base_dir, nickname, domain,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                               newswire, max_blogs_per_account,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                               blogs_index, max_tags,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                               system_language, session,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                               debug)
							 | 
						
					
						
							
								
									
										
										
										
											2020-12-13 22:13:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        break
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # sort the moderation dict into chronological order, latest first
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    sorted_moderation_dict = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        OrderedDict(sorted(moderation_dict.items(), reverse=True))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 12:15:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # save the moderation queue details for later display
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    newswire_moderation_filename = \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        base_dir + '/accounts/newswiremoderation.txt'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if sorted_moderation_dict:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        save_json(sorted_moderation_dict, newswire_moderation_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 14:32:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # remove the file if there is nothing to moderate
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if os.path.isfile(newswire_moderation_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-05 10:17:43 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            try:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                os.remove(newswire_moderation_filename)
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            except OSError:
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                print('EX: _add_blogs_to_newswire unable to delete ' +
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                      str(newswire_moderation_filename))
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def get_dict_from_newswire(session, base_dir: str, domain: str,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_posts_per_source: int, max_feed_size_kb: int,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           max_tags: int, max_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_newswire_posts: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_categories_feed_item_size_kb: int,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           system_language: str, debug: bool,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           preferred_podcast_formats: [],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                           timeout_sec: int) -> {}:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:59:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    """Gets rss feeds as a dictionary from newswire file
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    """
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    subscriptions_filename = base_dir + '/accounts/newswire.txt'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not os.path.isfile(subscriptions_filename):
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    max_posts_per_source = 5
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-16 10:13:14 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # add rss feeds
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rss_feed = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    with open(subscriptions_filename, 'r', encoding='utf-8') as fp_sub:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        rss_feed = fp_sub.readlines()
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = {}
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    for url in rss_feed:
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        url = url.strip()
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # Does this contain a url?
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '://' not in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # is this a comment?
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if url.startswith('#'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # should this feed be moderated?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        moderated = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '*' in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            moderated = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            url = url.replace('*', '').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-19 14:37:17 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # should this feed content be mirrored?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        mirrored = False
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if '!' in url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            mirrored = True
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            url = url.replace('!', '').strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        items_list = get_rss(base_dir, domain, session, url,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             moderated, mirrored,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             max_posts_per_source, max_feed_size_kb,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             max_feed_item_size_kb,
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-09 14:58:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             max_categories_feed_item_size_kb, debug,
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                             preferred_podcast_formats,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                             timeout_sec)
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if items_list:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            for date_str, item in items_list.items():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                result[date_str] = item
							 | 
						
					
						
							
								
									
										
										
										
											2022-04-27 17:12:25 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        time.sleep(4)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-06 09:47:58 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # add blogs from each user account
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    _add_blogs_to_newswire(base_dir, domain, result,
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           max_posts_per_source, max_tags, system_language,
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                           session, debug)
							 | 
						
					
						
							
								
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # sort into chronological order, latest first
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    sorted_result = OrderedDict(sorted(result.items(), reverse=True))
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # are there too many posts? If so then remove the oldest ones
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    no_of_posts = len(sorted_result.items())
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if no_of_posts > max_newswire_posts:
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 12:05:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ctr = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        removals = []
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        for date_str, item in sorted_result.items():
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 12:05:15 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            ctr += 1
							 | 
						
					
						
							
								
									
										
										
										
											2021-12-25 18:49:19 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if ctr > max_newswire_posts:
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                removals.append(date_str)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for remov in removals:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            sorted_result.pop(remov)
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return sorted_result
							 |