| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | __filename__ = "newswire.py" | 
					
						
							|  |  |  | __author__ = "Bob Mottram" | 
					
						
							|  |  |  | __license__ = "AGPL3+" | 
					
						
							| 
									
										
										
										
											2022-02-03 13:58:20 +00:00
										 |  |  | __version__ = "1.3.0" | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | __maintainer__ = "Bob Mottram" | 
					
						
							| 
									
										
										
										
											2021-09-10 16:14:50 +00:00
										 |  |  | __email__ = "bob@libreserver.org" | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | __status__ = "Production" | 
					
						
							| 
									
										
										
										
											2021-06-26 11:27:14 +00:00
										 |  |  | __module_group__ = "Web Interface Columns" | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | import os | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  | import json | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | import requests | 
					
						
							| 
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 |  |  | import random | 
					
						
							| 
									
										
										
										
											2022-04-27 20:02:56 +00:00
										 |  |  | import time | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | from socket import error as SocketError | 
					
						
							|  |  |  | import errno | 
					
						
							|  |  |  | from datetime import datetime | 
					
						
							| 
									
										
										
										
											2020-11-22 15:33:11 +00:00
										 |  |  | from datetime import timedelta | 
					
						
							| 
									
										
										
										
											2020-11-22 19:09:35 +00:00
										 |  |  | from datetime import timezone | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | from collections import OrderedDict | 
					
						
							| 
									
										
										
										
											2021-12-26 12:31:47 +00:00
										 |  |  | from utils import valid_post_date | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | from categories import set_hashtag_category | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  | from utils import get_domain_from_actor | 
					
						
							| 
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 |  |  | from utils import valid_hash_tag | 
					
						
							| 
									
										
										
										
											2021-12-27 21:44:48 +00:00
										 |  |  | from utils import dangerous_svg | 
					
						
							| 
									
										
										
										
											2021-12-26 16:01:32 +00:00
										 |  |  | from utils import get_fav_filename_from_url | 
					
						
							| 
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 |  |  | from utils import get_base_content_from_post | 
					
						
							| 
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 |  |  | from utils import has_object_dict | 
					
						
							| 
									
										
										
										
											2021-12-27 15:52:08 +00:00
										 |  |  | from utils import first_paragraph_from_string | 
					
						
							| 
									
										
										
										
											2021-12-28 14:41:10 +00:00
										 |  |  | from utils import is_public_post | 
					
						
							| 
									
										
										
										
											2021-12-26 20:36:08 +00:00
										 |  |  | from utils import locate_post | 
					
						
							| 
									
										
										
										
											2021-12-26 15:13:34 +00:00
										 |  |  | from utils import load_json | 
					
						
							| 
									
										
										
										
											2021-12-26 14:47:21 +00:00
										 |  |  | from utils import save_json | 
					
						
							| 
									
										
										
										
											2021-12-27 15:37:31 +00:00
										 |  |  | from utils import is_suspended | 
					
						
							| 
									
										
										
										
											2021-12-27 17:53:41 +00:00
										 |  |  | from utils import contains_invalid_chars | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  | from utils import remove_html | 
					
						
							| 
									
										
										
										
											2021-12-26 18:46:43 +00:00
										 |  |  | from utils import is_account_dir | 
					
						
							| 
									
										
										
										
											2021-12-26 12:02:29 +00:00
										 |  |  | from utils import acct_dir | 
					
						
							| 
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 |  |  | from utils import local_actor_url | 
					
						
							| 
									
										
										
										
											2021-12-28 21:55:38 +00:00
										 |  |  | from blocking import is_blocked_domain | 
					
						
							|  |  |  | from blocking import is_blocked_hashtag | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | from filters import is_filtered | 
					
						
							|  |  |  | from session import download_image_any_mime_type | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-16 12:11:05 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _remove_cdata(text: str) -> str: | 
					
						
							| 
									
										
										
										
											2020-11-22 12:18:43 +00:00
										 |  |  |     """Removes any CDATA from the given text
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if 'CDATA[' in text: | 
					
						
							|  |  |  |         text = text.split('CDATA[')[1] | 
					
						
							|  |  |  |         if ']' in text: | 
					
						
							|  |  |  |             text = text.split(']')[0] | 
					
						
							|  |  |  |     return text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def rss2header(http_prefix: str, | 
					
						
							| 
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 |  |  |                nickname: str, domain_full: str, | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |                title: str, translate: {}) -> str: | 
					
						
							| 
									
										
										
										
											2020-10-06 09:22:23 +00:00
										 |  |  |     """Header for an RSS 2.0 feed
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     rss_str = \ | 
					
						
							| 
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 |  |  |         "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + \ | 
					
						
							|  |  |  |         "<rss version=\"2.0\">" + \ | 
					
						
							|  |  |  |         '<channel>' | 
					
						
							| 
									
										
										
										
											2020-10-13 17:14:57 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     if title.startswith('News'): | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         rss_str += \ | 
					
						
							| 
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 |  |  |             '    <title>Newswire</title>' + \ | 
					
						
							| 
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 |  |  |             '    <link>' + http_prefix + '://' + domain_full + \ | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |             '/newswire.xml' + '</link>' | 
					
						
							| 
									
										
										
										
											2020-10-13 17:14:57 +00:00
										 |  |  |     elif title.startswith('Site'): | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         rss_str += \ | 
					
						
							| 
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 |  |  |             '    <title>' + domain_full + '</title>' + \ | 
					
						
							|  |  |  |             '    <link>' + http_prefix + '://' + domain_full + \ | 
					
						
							| 
									
										
										
										
											2020-10-13 17:17:17 +00:00
										 |  |  |             '/blog/rss.xml' + '</link>' | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         rss_str += \ | 
					
						
							| 
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 |  |  |             '    <title>' + translate[title] + '</title>' + \ | 
					
						
							| 
									
										
										
										
											2021-08-14 11:13:39 +00:00
										 |  |  |             '    <link>' + \ | 
					
						
							| 
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 |  |  |             local_actor_url(http_prefix, nickname, domain_full) + \ | 
					
						
							| 
									
										
										
										
											2021-08-14 11:13:39 +00:00
										 |  |  |             '/rss.xml' + '</link>' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     return rss_str | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def rss2footer() -> str: | 
					
						
							| 
									
										
										
										
											2020-10-06 09:22:23 +00:00
										 |  |  |     """Footer for an RSS 2.0 feed
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     rss_str = '</channel></rss>' | 
					
						
							|  |  |  |     return rss_str | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def get_newswire_tags(text: str, max_tags: int) -> []: | 
					
						
							| 
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 |  |  |     """Returns a list of hashtags found in the given text
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-10-16 20:46:34 +00:00
										 |  |  |     if '#' not in text: | 
					
						
							|  |  |  |         return [] | 
					
						
							| 
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 |  |  |     if ' ' not in text: | 
					
						
							|  |  |  |         return [] | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     text_simplified = \ | 
					
						
							| 
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 |  |  |         text.replace(',', ' ').replace(';', ' ').replace('- ', ' ') | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     text_simplified = text_simplified.replace('. ', ' ').strip() | 
					
						
							|  |  |  |     if text_simplified.endswith('.'): | 
					
						
							|  |  |  |         text_simplified = text_simplified[:len(text_simplified)-1] | 
					
						
							|  |  |  |     words = text_simplified.split(' ') | 
					
						
							| 
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 |  |  |     tags = [] | 
					
						
							|  |  |  |     for wrd in words: | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  |         if not wrd.startswith('#'): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if len(wrd) <= 1: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if wrd in tags: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         tags.append(wrd) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if len(tags) >= max_tags: | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  |             break | 
					
						
							| 
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 |  |  |     return tags | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def limit_word_lengths(text: str, maxWordLength: int) -> str: | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  |     """Limits the maximum length of words so that the newswire
 | 
					
						
							|  |  |  |     column cannot become too wide | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if ' ' not in text: | 
					
						
							|  |  |  |         return text | 
					
						
							|  |  |  |     words = text.split(' ') | 
					
						
							|  |  |  |     result = '' | 
					
						
							|  |  |  |     for wrd in words: | 
					
						
							|  |  |  |         if len(wrd) > maxWordLength: | 
					
						
							|  |  |  |             wrd = wrd[:maxWordLength] | 
					
						
							|  |  |  |         if result: | 
					
						
							|  |  |  |             result += ' ' | 
					
						
							|  |  |  |         result += wrd | 
					
						
							|  |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def get_newswire_favicon_url(url: str) -> str: | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  |     """Returns a favicon url from the given article link
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if '://' not in url: | 
					
						
							|  |  |  |         return '/newswire_favicon.ico' | 
					
						
							|  |  |  |     if url.startswith('http://'): | 
					
						
							|  |  |  |         if not (url.endswith('.onion') or url.endswith('.i2p')): | 
					
						
							|  |  |  |             return '/newswire_favicon.ico' | 
					
						
							|  |  |  |     domain = url.split('://')[1] | 
					
						
							|  |  |  |     if '/' not in domain: | 
					
						
							|  |  |  |         return url + '/favicon.ico' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     domain = domain.split('/')[0] | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  |     return url.split('://')[0] + '://' + domain + '/favicon.ico' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _download_newswire_feed_favicon(session, base_dir: str, | 
					
						
							|  |  |  |                                     link: str, debug: bool) -> bool: | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  |     """Downloads the favicon for the given feed link
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     fav_url = get_newswire_favicon_url(link) | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  |     if '://' not in link: | 
					
						
							|  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     timeout_sec = 10 | 
					
						
							|  |  |  |     image_data, mime_type = \ | 
					
						
							|  |  |  |         download_image_any_mime_type(session, fav_url, timeout_sec, debug) | 
					
						
							|  |  |  |     if not image_data or not mime_type: | 
					
						
							| 
									
										
										
										
											2021-12-16 23:59:53 +00:00
										 |  |  |         return False | 
					
						
							| 
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # update the favicon url | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     extensions_to_mime = { | 
					
						
							| 
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 |  |  |         'ico': 'x-icon', | 
					
						
							|  |  |  |         'png': 'png', | 
					
						
							|  |  |  |         'jpg': 'jpeg', | 
					
						
							| 
									
										
										
										
											2022-02-06 11:04:49 +00:00
										 |  |  |         'jxl': 'jxl', | 
					
						
							| 
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 |  |  |         'gif': 'gif', | 
					
						
							|  |  |  |         'avif': 'avif', | 
					
						
							|  |  |  |         'svg': 'svg+xml', | 
					
						
							|  |  |  |         'webp': 'webp' | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for ext, mime_ext in extensions_to_mime.items(): | 
					
						
							|  |  |  |         if 'image/' + mime_ext in mime_type: | 
					
						
							|  |  |  |             fav_url = fav_url.replace('.ico', '.' + ext) | 
					
						
							| 
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 |  |  |             break | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # create cached favicons directory if needed | 
					
						
							| 
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 |  |  |     if not os.path.isdir(base_dir + '/favicons'): | 
					
						
							|  |  |  |         os.mkdir(base_dir + '/favicons') | 
					
						
							| 
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 |  |  |     # check svg for dubious scripts | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if fav_url.endswith('.svg'): | 
					
						
							|  |  |  |         image_data_str = str(image_data) | 
					
						
							|  |  |  |         if dangerous_svg(image_data_str, False): | 
					
						
							| 
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 |  |  |             return False | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-17 12:01:54 +00:00
										 |  |  |     # save to the cache | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     fav_filename = get_fav_filename_from_url(base_dir, fav_url) | 
					
						
							|  |  |  |     if os.path.isfile(fav_filename): | 
					
						
							| 
									
										
										
										
											2021-12-16 21:14:24 +00:00
										 |  |  |         return True | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         with open(fav_filename, 'wb+') as fp_fav: | 
					
						
							|  |  |  |             fp_fav.write(image_data) | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  |     except OSError: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         print('EX: failed writing favicon ' + fav_filename) | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  |         return False | 
					
						
							| 
									
										
										
										
											2021-12-19 12:32:01 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  |     return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _add_newswire_dict_entry(base_dir: str, domain: str, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                              newswire: {}, date_str: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                              title: str, link: str, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                              votes_status: str, post_filename: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                              description: str, moderated: bool, | 
					
						
							|  |  |  |                              mirrored: bool, | 
					
						
							|  |  |  |                              tags: [], | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |                              max_tags: int, session, debug: bool, | 
					
						
							|  |  |  |                              podcast_properties: {}) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-16 19:25:55 +00:00
										 |  |  |     """Update the newswire dictionary
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-12-12 15:44:43 +00:00
										 |  |  |     # remove any markup | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |     title = remove_html(title) | 
					
						
							|  |  |  |     description = remove_html(description) | 
					
						
							| 
									
										
										
										
											2020-12-12 15:44:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     all_text = title + ' ' + description | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # check that none of the text is filtered against | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if is_filtered(base_dir, None, None, all_text): | 
					
						
							| 
									
										
										
										
											2020-10-17 16:08:07 +00:00
										 |  |  |         return | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |     title = limit_word_lengths(title, 13) | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  |     if tags is None: | 
					
						
							|  |  |  |         tags = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # extract hashtags from the text of the feed post | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     post_tags = get_newswire_tags(all_text, max_tags) | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 15:15:47 +00:00
										 |  |  |     # Include tags from podcast categories | 
					
						
							|  |  |  |     if podcast_properties: | 
					
						
							| 
									
										
										
										
											2022-01-13 23:06:04 +00:00
										 |  |  |         if podcast_properties.get('explicit'): | 
					
						
							| 
									
										
										
										
											2022-01-14 13:15:43 +00:00
										 |  |  |             if '#nsfw' not in post_tags: | 
					
						
							|  |  |  |                 post_tags.append('#nsfw') | 
					
						
							| 
									
										
										
										
											2022-01-13 23:06:04 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 15:15:47 +00:00
										 |  |  |         post_tags += podcast_properties['categories'] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  |     # combine the tags into a single list | 
					
						
							| 
									
										
										
										
											2020-10-25 12:57:14 +00:00
										 |  |  |     for tag in tags: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if tag in post_tags: | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if len(post_tags) < max_tags: | 
					
						
							|  |  |  |             post_tags.append(tag) | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # check that no tags are blocked | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for tag in post_tags: | 
					
						
							| 
									
										
										
										
											2021-12-28 21:55:38 +00:00
										 |  |  |         if is_blocked_hashtag(base_dir, tag): | 
					
						
							| 
									
										
										
										
											2020-10-25 10:18:07 +00:00
										 |  |  |             return | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |     _download_newswire_feed_favicon(session, base_dir, link, debug) | 
					
						
							| 
									
										
										
										
											2021-12-16 20:57:30 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     newswire[date_str] = [ | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  |         title, | 
					
						
							|  |  |  |         link, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         votes_status, | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |         post_filename, | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  |         description, | 
					
						
							|  |  |  |         moderated, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         post_tags, | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |         mirrored, | 
					
						
							|  |  |  |         podcast_properties | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  |     ] | 
					
						
							| 
									
										
										
										
											2020-10-16 19:25:55 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _valid_feed_date(pub_date: str, debug: bool = False) -> bool: | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |     # convert from YY-MM-DD HH:MM:SS+00:00 to | 
					
						
							|  |  |  |     # YY-MM-DDTHH:MM:SSZ | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     post_date = pub_date.replace(' ', 'T').replace('+00:00', 'Z') | 
					
						
							| 
									
										
										
										
											2022-03-30 18:13:40 +00:00
										 |  |  |     if '.' in post_date: | 
					
						
							|  |  |  |         ending = post_date.split('.')[1] | 
					
						
							|  |  |  |         timezone_str = '' | 
					
						
							|  |  |  |         for ending_char in ending: | 
					
						
							|  |  |  |             if not ending_char.isdigit(): | 
					
						
							|  |  |  |                 timezone_str += ending_char | 
					
						
							|  |  |  |         if timezone_str: | 
					
						
							|  |  |  |             post_date = post_date.split('.')[0] + timezone_str | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     return valid_post_date(post_date, 90, debug) | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 |  |  | def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str: | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  |     """Returns a UTC date string based on the given date string
 | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |     This tries a number of formats to see which work | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if ':00:00' in pub_date: | 
					
						
							|  |  |  |         # If this was published exactly on the hour then assign a | 
					
						
							|  |  |  |         # random minute and second to make this item relatively unique | 
					
						
							|  |  |  |         randgen = random.Random(unique_string_identifier) | 
					
						
							|  |  |  |         rand_min = randgen.randint(0, 59) | 
					
						
							|  |  |  |         rand_sec = randgen.randint(0, 59) | 
					
						
							|  |  |  |         replace_time_str = \ | 
					
						
							|  |  |  |             ':' + str(rand_min).zfill(2) + ':' + str(rand_sec).zfill(2) | 
					
						
							|  |  |  |         pub_date = pub_date.replace(':00:00', replace_time_str) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |     formats = ("%a, %d %b %Y %H:%M:%S %z", | 
					
						
							| 
									
										
										
										
											2021-10-17 14:17:42 +00:00
										 |  |  |                "%a, %d %b %Y %H:%M:%S Z", | 
					
						
							| 
									
										
										
										
											2021-09-07 19:09:41 +00:00
										 |  |  |                "%a, %d %b %Y %H:%M:%S GMT", | 
					
						
							| 
									
										
										
										
											2021-10-17 14:24:21 +00:00
										 |  |  |                "%a, %d %b %Y %H:%M:%S EST", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S PST", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S AST", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S CST", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S MST", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S AKST", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S HST", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S UT", | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |                "%Y-%m-%dT%H:%M:%SZ", | 
					
						
							|  |  |  |                "%Y-%m-%dT%H:%M:%S%z") | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     published_date = None | 
					
						
							|  |  |  |     for date_format in formats: | 
					
						
							|  |  |  |         if ',' in pub_date and ',' not in date_format: | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if ',' not in pub_date and ',' in date_format: | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'Z' in pub_date and 'Z' not in date_format: | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'Z' not in pub_date and 'Z' in date_format: | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'EST' not in pub_date and 'EST' in date_format: | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'GMT' not in pub_date and 'GMT' in date_format: | 
					
						
							| 
									
										
										
										
											2021-09-07 19:09:41 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'EST' in pub_date and 'EST' not in date_format: | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'UT' not in pub_date and 'UT' in date_format: | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'UT' in pub_date and 'UT' not in date_format: | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-30 18:13:40 +00:00
										 |  |  |         # remove any fraction of a second | 
					
						
							|  |  |  |         if '.' in pub_date: | 
					
						
							|  |  |  |             ending = pub_date.split('.')[1] | 
					
						
							|  |  |  |             timezone_str = '' | 
					
						
							|  |  |  |             for ending_char in ending: | 
					
						
							|  |  |  |                 if not ending_char.isdigit(): | 
					
						
							|  |  |  |                     timezone_str += ending_char | 
					
						
							|  |  |  |             if timezone_str: | 
					
						
							|  |  |  |                 pub_date = pub_date.split('.')[0] + timezone_str | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             published_date = datetime.strptime(pub_date, date_format) | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |         except BaseException: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if published_date: | 
					
						
							|  |  |  |             if pub_date.endswith(' EST'): | 
					
						
							|  |  |  |                 hours_added = timedelta(hours=5) | 
					
						
							|  |  |  |                 published_date = published_date + hours_added | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |             break | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     pub_date_str = None | 
					
						
							|  |  |  |     if published_date: | 
					
						
							|  |  |  |         offset = published_date.utcoffset() | 
					
						
							| 
									
										
										
										
											2020-11-22 20:37:08 +00:00
										 |  |  |         if offset: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             published_date = published_date - offset | 
					
						
							| 
									
										
										
										
											2020-11-22 19:09:35 +00:00
										 |  |  |         # convert local date to UTC | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         published_date = published_date.replace(tzinfo=timezone.utc) | 
					
						
							|  |  |  |         pub_date_str = str(published_date) | 
					
						
							|  |  |  |         if not pub_date_str.endswith('+00:00'): | 
					
						
							|  |  |  |             pub_date_str += '+00:00' | 
					
						
							| 
									
										
										
										
											2021-09-07 19:33:27 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         print('WARN: unrecognized date format: ' + pub_date) | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     return pub_date_str | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def load_hashtag_categories(base_dir: str, language: str) -> None: | 
					
						
							| 
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 |  |  |     """Loads an rss file containing hashtag categories
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     hashtag_categories_filename = base_dir + '/categories.xml' | 
					
						
							|  |  |  |     if not os.path.isfile(hashtag_categories_filename): | 
					
						
							|  |  |  |         hashtag_categories_filename = \ | 
					
						
							| 
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 |  |  |             base_dir + '/defaultcategories/' + language + '.xml' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not os.path.isfile(hashtag_categories_filename): | 
					
						
							| 
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 |  |  |             return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     with open(hashtag_categories_filename, 'r') as fp_cat: | 
					
						
							|  |  |  |         xml_str = fp_cat.read() | 
					
						
							|  |  |  |         _xml2str_to_hashtag_categories(base_dir, xml_str, 1024, True) | 
					
						
							| 
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str, | 
					
						
							|  |  |  |                                    max_categories_feedItem_size_kb: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                    force: bool = False) -> None: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |     """Updates hashtag categories based upon an rss feed
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     rss_items = xml_str.split('<item>') | 
					
						
							|  |  |  |     max_bytes = max_categories_feedItem_size_kb * 1024 | 
					
						
							|  |  |  |     for rss_item in rss_items: | 
					
						
							|  |  |  |         if not rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if len(rss_item) > max_bytes: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |             print('WARN: rss categories feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<title>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</title>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<description>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</description>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         category_str = rss_item.split('<title>')[1] | 
					
						
							|  |  |  |         category_str = category_str.split('</title>')[0].strip() | 
					
						
							|  |  |  |         if not category_str: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'CDATA' in category_str: | 
					
						
							| 
									
										
										
										
											2020-12-03 10:12:09 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         hashtag_list_str = rss_item.split('<description>')[1] | 
					
						
							|  |  |  |         hashtag_list_str = hashtag_list_str.split('</description>')[0].strip() | 
					
						
							|  |  |  |         if not hashtag_list_str: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if 'CDATA' in hashtag_list_str: | 
					
						
							| 
									
										
										
										
											2020-12-03 10:12:09 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         hashtag_list = hashtag_list_str.split(' ') | 
					
						
							|  |  |  |         if not is_blocked_hashtag(base_dir, category_str): | 
					
						
							|  |  |  |             for hashtag in hashtag_list: | 
					
						
							|  |  |  |                 set_hashtag_category(base_dir, hashtag, category_str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                      False, force) | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 |  |  | def _get_podcast_categories(xml_item: str, xml_str: str) -> str: | 
					
						
							|  |  |  |     """ get podcast categories if they exist. These can be turned into hashtags
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     podcast_categories = [] | 
					
						
							|  |  |  |     episode_category_tags = ['<itunes:category', '<category'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for category_tag in episode_category_tags: | 
					
						
							|  |  |  |         item_str = xml_item | 
					
						
							|  |  |  |         if category_tag not in xml_item: | 
					
						
							|  |  |  |             if category_tag not in xml_str: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             item_str = xml_str | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 |  |  |         category_list = item_str.split(category_tag) | 
					
						
							|  |  |  |         first_category = True | 
					
						
							| 
									
										
										
										
											2022-01-13 16:12:55 +00:00
										 |  |  |         for episode_category in category_list: | 
					
						
							| 
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 |  |  |             if first_category: | 
					
						
							|  |  |  |                 first_category = False | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if 'text="' in episode_category: | 
					
						
							|  |  |  |                 episode_category = episode_category.split('text="')[1] | 
					
						
							|  |  |  |                 if '"' in episode_category: | 
					
						
							|  |  |  |                     episode_category = episode_category.split('"')[0] | 
					
						
							|  |  |  |                     episode_category = \ | 
					
						
							|  |  |  |                         episode_category.lower().replace(' ', '') | 
					
						
							|  |  |  |                     episode_category = episode_category.replace('#', '') | 
					
						
							|  |  |  |                     if episode_category not in podcast_categories: | 
					
						
							|  |  |  |                         if valid_hash_tag(episode_category): | 
					
						
							|  |  |  |                             podcast_categories.append('#' + episode_category) | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 16:04:14 +00:00
										 |  |  |             if '>' in episode_category: | 
					
						
							|  |  |  |                 episode_category = episode_category.split('>')[1] | 
					
						
							|  |  |  |                 if '<' in episode_category: | 
					
						
							|  |  |  |                     episode_category = episode_category.split('<')[0] | 
					
						
							|  |  |  |                     episode_category = \ | 
					
						
							|  |  |  |                         episode_category.lower().replace(' ', '') | 
					
						
							|  |  |  |                     episode_category = episode_category.replace('#', '') | 
					
						
							|  |  |  |                     if episode_category not in podcast_categories: | 
					
						
							|  |  |  |                         if valid_hash_tag(episode_category): | 
					
						
							|  |  |  |                             podcast_categories.append('#' + episode_category) | 
					
						
							| 
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return podcast_categories | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  | def _valid_podcast_entry(base_dir: str, key: str, entry: {}) -> bool: | 
					
						
							|  |  |  |     """Is the given podcast namespace entry valid?
 | 
					
						
							|  |  |  |     https://github.com/Podcastindex-org/podcast-namespace/ | 
					
						
							|  |  |  |     blob/main/proposal-docs/social/social.md#socialinteract-element | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  |     if key == 'socialInteract' or key == 'discussion': | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  |         if not entry.get('protocol'): | 
					
						
							|  |  |  |             return False | 
					
						
							| 
									
										
										
										
											2022-04-21 09:21:25 +00:00
										 |  |  |         if not entry.get('uri'): | 
					
						
							|  |  |  |             if not entry.get('text'): | 
					
						
							|  |  |  |                 if not entry.get('url'): | 
					
						
							|  |  |  |                     return False | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  |         if entry['protocol'].tolower() != 'activitypub': | 
					
						
							|  |  |  |             return False | 
					
						
							| 
									
										
										
										
											2022-04-21 09:21:25 +00:00
										 |  |  |         if entry.get('uri'): | 
					
						
							|  |  |  |             post_url = entry['uri'] | 
					
						
							|  |  |  |         elif entry.get('url'): | 
					
						
							|  |  |  |             post_url = entry['uri'] | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             post_url = entry['text'] | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  |         if '://' not in post_url: | 
					
						
							|  |  |  |             return False | 
					
						
							|  |  |  |         post_domain, post_port = get_domain_from_actor(post_url) | 
					
						
							|  |  |  |         if not post_domain: | 
					
						
							|  |  |  |             return False | 
					
						
							|  |  |  |         if is_blocked_domain(base_dir, post_domain): | 
					
						
							|  |  |  |             return False | 
					
						
							|  |  |  |     return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def xml_podcast_to_dict(base_dir: str, xml_item: str, xml_str: str) -> {}: | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |     """podcasting extensions for RSS feeds
 | 
					
						
							| 
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 |  |  |     See https://github.com/Podcastindex-org/podcast-namespace/ | 
					
						
							|  |  |  |     blob/main/docs/1.0.md | 
					
						
							| 
									
										
										
										
											2022-02-12 15:40:55 +00:00
										 |  |  |     https://github.com/Podcastindex-org/podcast-namespace/ | 
					
						
							|  |  |  |     blob/main/proposal-docs/social/social.md#socialinteract-element | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 |  |  |     if '<podcast:' not in xml_item: | 
					
						
							|  |  |  |         if '<itunes:' not in xml_item: | 
					
						
							| 
									
										
										
										
											2022-01-14 18:05:29 +00:00
										 |  |  |             if '<media:thumbnail' not in xml_item: | 
					
						
							|  |  |  |                 return {} | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     podcast_properties = { | 
					
						
							|  |  |  |         "locations": [], | 
					
						
							|  |  |  |         "persons": [], | 
					
						
							|  |  |  |         "soundbites": [], | 
					
						
							|  |  |  |         "transcripts": [], | 
					
						
							| 
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 |  |  |         "valueRecipients": [], | 
					
						
							| 
									
										
										
										
											2022-02-12 15:38:35 +00:00
										 |  |  |         "trailers": [], | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  |         "discussion": [], | 
					
						
							|  |  |  |         "episode": '', | 
					
						
							|  |  |  |         "socialInteract": [], | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 |  |  |     pod_lines = xml_item.split('<podcast:') | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |     ctr = 0 | 
					
						
							|  |  |  |     for pod_line in pod_lines: | 
					
						
							|  |  |  |         if ctr == 0 or '>' not in pod_line: | 
					
						
							|  |  |  |             ctr += 1 | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if ' ' not in pod_line.split('>')[0]: | 
					
						
							|  |  |  |             pod_key = pod_line.split('>')[0].strip() | 
					
						
							|  |  |  |             pod_val = pod_line.split('>', 1)[1].strip() | 
					
						
							|  |  |  |             if '<' in pod_val: | 
					
						
							|  |  |  |                 pod_val = pod_val.split('<')[0] | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  |             if pod_key in podcast_properties: | 
					
						
							|  |  |  |                 podcast_properties[pod_key] = pod_val | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |             ctr += 1 | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         pod_key = pod_line.split(' ')[0] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         pod_fields = ( | 
					
						
							|  |  |  |             'url', 'geo', 'osm', 'type', 'method', 'group', | 
					
						
							|  |  |  |             'owner', 'srcset', 'img', 'role', 'address', 'suggested', | 
					
						
							| 
									
										
										
										
											2022-01-10 19:07:16 +00:00
										 |  |  |             'startTime', 'duration', 'href', 'name', 'pubdate', | 
					
						
							| 
									
										
										
										
											2022-02-12 15:38:35 +00:00
										 |  |  |             'length', 'season', 'email', 'platform', 'protocol', | 
					
						
							| 
									
										
										
										
											2022-02-16 13:27:11 +00:00
										 |  |  |             'accountId', 'priority', 'podcastAccountId', | 
					
						
							|  |  |  |             'podcastAccountUrl' | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |         ) | 
					
						
							|  |  |  |         pod_entry = {} | 
					
						
							|  |  |  |         for pod_field in pod_fields: | 
					
						
							|  |  |  |             if pod_field + '="' not in pod_line: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             pod_str = pod_line.split(pod_field + '="')[1] | 
					
						
							|  |  |  |             if '"' not in pod_str: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             pod_val = pod_str.split('"')[0] | 
					
						
							|  |  |  |             pod_entry[pod_field] = pod_val | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         pod_text = pod_line.split('>')[1] | 
					
						
							|  |  |  |         if '<' in pod_text: | 
					
						
							|  |  |  |             pod_text = pod_text.split('<')[0].strip() | 
					
						
							|  |  |  |             if pod_text: | 
					
						
							|  |  |  |                 pod_entry['text'] = pod_text | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-12 16:00:45 +00:00
										 |  |  |         appended = False | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |         if pod_key + 's' in podcast_properties: | 
					
						
							|  |  |  |             if isinstance(podcast_properties[pod_key + 's'], list): | 
					
						
							|  |  |  |                 podcast_properties[pod_key + 's'].append(pod_entry) | 
					
						
							| 
									
										
										
										
											2022-02-12 16:00:45 +00:00
										 |  |  |                 appended = True | 
					
						
							|  |  |  |         if not appended: | 
					
						
							| 
									
										
										
										
											2022-02-12 16:05:44 +00:00
										 |  |  |             # if there are repeated keys then only use the first one | 
					
						
							|  |  |  |             if not podcast_properties.get(pod_key): | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  |                 if _valid_podcast_entry(base_dir, pod_key, pod_entry): | 
					
						
							|  |  |  |                     podcast_properties[pod_key] = pod_entry | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |         ctr += 1 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 |  |  |     # get the image for the podcast, if it exists | 
					
						
							| 
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 |  |  |     podcast_episode_image = None | 
					
						
							| 
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 |  |  |     episode_image_tags = ['<itunes:image', '<media:thumbnail'] | 
					
						
							| 
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 |  |  |     for image_tag in episode_image_tags: | 
					
						
							| 
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 |  |  |         item_str = xml_item | 
					
						
							|  |  |  |         if image_tag not in xml_item: | 
					
						
							|  |  |  |             if image_tag not in xml_str: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             item_str = xml_str | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         episode_image = item_str.split(image_tag)[1] | 
					
						
							| 
									
										
										
										
											2022-01-14 17:55:56 +00:00
										 |  |  |         if image_tag + ' ' in item_str and '>' in episode_image: | 
					
						
							|  |  |  |             episode_image = episode_image.split('>')[0] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 |  |  |         if 'href="' in episode_image: | 
					
						
							|  |  |  |             episode_image = episode_image.split('href="')[1] | 
					
						
							|  |  |  |             if '"' in episode_image: | 
					
						
							|  |  |  |                 episode_image = episode_image.split('"')[0] | 
					
						
							| 
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 |  |  |                 podcast_episode_image = episode_image | 
					
						
							| 
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 |  |  |                 break | 
					
						
							| 
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 |  |  |         elif 'url="' in episode_image: | 
					
						
							|  |  |  |             episode_image = episode_image.split('url="')[1] | 
					
						
							|  |  |  |             if '"' in episode_image: | 
					
						
							|  |  |  |                 episode_image = episode_image.split('"')[0] | 
					
						
							|  |  |  |                 podcast_episode_image = episode_image | 
					
						
							|  |  |  |                 break | 
					
						
							| 
									
										
										
										
											2022-01-14 18:05:29 +00:00
										 |  |  |         elif '>' in episode_image: | 
					
						
							|  |  |  |             episode_image = episode_image.split('>')[1] | 
					
						
							|  |  |  |             if '<' in episode_image: | 
					
						
							|  |  |  |                 episode_image = episode_image.split('<')[0] | 
					
						
							|  |  |  |                 if '://' in episode_image and '.' in episode_image: | 
					
						
							|  |  |  |                     podcast_episode_image = episode_image | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 |  |  |     # get categories if they exist. These can be turned into hashtags | 
					
						
							| 
									
										
										
										
											2022-01-13 15:30:55 +00:00
										 |  |  |     podcast_categories = _get_podcast_categories(xml_item, xml_str) | 
					
						
							| 
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 |  |  |     if podcast_episode_image: | 
					
						
							|  |  |  |         podcast_properties['image'] = podcast_episode_image | 
					
						
							| 
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 |  |  |         podcast_properties['categories'] = podcast_categories | 
					
						
							| 
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 |  |  |         if '<itunes:explicit>Y' in xml_item or \ | 
					
						
							|  |  |  |            '<itunes:explicit>T' in xml_item or \ | 
					
						
							|  |  |  |            '<itunes:explicit>1' in xml_item: | 
					
						
							| 
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 |  |  |             podcast_properties['explicit'] = True | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             podcast_properties['explicit'] = False | 
					
						
							|  |  |  |     else: | 
					
						
							| 
									
										
										
										
											2022-01-13 12:37:08 +00:00
										 |  |  |         if '<podcast:' not in xml_item: | 
					
						
							| 
									
										
										
										
											2022-01-12 17:44:49 +00:00
										 |  |  |             return {} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |     return podcast_properties | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-22 12:53:34 +00:00
										 |  |  | def get_link_from_rss_item(rss_item: str, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                            preferred_mime_types: [], | 
					
						
							|  |  |  |                            proxy_type: str) -> (str, str): | 
					
						
							| 
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 |  |  |     """Extracts rss link from rss item string
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 |  |  |     mime_type = None | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-22 12:53:34 +00:00
										 |  |  |     if preferred_mime_types and '<podcast:alternateEnclosure ' in rss_item: | 
					
						
							|  |  |  |         enclosures = rss_item.split('<podcast:alternateEnclosure ') | 
					
						
							|  |  |  |         ctr = 0 | 
					
						
							|  |  |  |         for enclosure in enclosures: | 
					
						
							|  |  |  |             if ctr == 0: | 
					
						
							|  |  |  |                 ctr += 1 | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             ctr += 1 | 
					
						
							|  |  |  |             if '</podcast:alternateEnclosure' not in enclosure: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             enclosure = enclosure.split('</podcast:alternateEnclosure')[0] | 
					
						
							|  |  |  |             if 'type="' not in enclosure: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             mime_type = enclosure.split('type="')[1] | 
					
						
							|  |  |  |             if '"' in mime_type: | 
					
						
							|  |  |  |                 mime_type = mime_type.split('"')[0] | 
					
						
							|  |  |  |             if mime_type not in preferred_mime_types: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             if 'uri="' not in enclosure: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             uris = enclosure.split('uri="') | 
					
						
							|  |  |  |             ctr2 = 0 | 
					
						
							|  |  |  |             for uri in uris: | 
					
						
							|  |  |  |                 if ctr2 == 0: | 
					
						
							|  |  |  |                     ctr2 += 1 | 
					
						
							|  |  |  |                     continue | 
					
						
							|  |  |  |                 ctr2 += 1 | 
					
						
							|  |  |  |                 if '"' not in uri: | 
					
						
							|  |  |  |                     continue | 
					
						
							|  |  |  |                 link = uri.split('"')[0] | 
					
						
							|  |  |  |                 if '://' not in link: | 
					
						
							|  |  |  |                     continue | 
					
						
							|  |  |  |                 if proxy_type: | 
					
						
							|  |  |  |                     if proxy_type == 'tor' and \ | 
					
						
							|  |  |  |                        '.onion/' not in link: | 
					
						
							|  |  |  |                         continue | 
					
						
							|  |  |  |                     if proxy_type == 'onion' and \ | 
					
						
							|  |  |  |                        '.onion/' not in link: | 
					
						
							|  |  |  |                         continue | 
					
						
							|  |  |  |                     if proxy_type == 'i2p' and \ | 
					
						
							|  |  |  |                        '.i2p/' not in link: | 
					
						
							|  |  |  |                         continue | 
					
						
							|  |  |  |                     return link, mime_type | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     if '.onion/' not in link and \ | 
					
						
							|  |  |  |                        '.i2p/' not in link: | 
					
						
							|  |  |  |                         return link, mime_type | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 |  |  |     if '<enclosure ' in rss_item: | 
					
						
							|  |  |  |         # get link from audio or video enclosure | 
					
						
							|  |  |  |         enclosure = rss_item.split('<enclosure ')[1] | 
					
						
							|  |  |  |         if '>' in enclosure: | 
					
						
							|  |  |  |             enclosure = enclosure.split('>')[0] | 
					
						
							| 
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 |  |  |             if ' type="' in enclosure: | 
					
						
							|  |  |  |                 mime_type = enclosure.split(' type="')[1] | 
					
						
							|  |  |  |                 if '"' in mime_type: | 
					
						
							|  |  |  |                     mime_type = mime_type.split('"')[0] | 
					
						
							| 
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 |  |  |             if 'url="' in enclosure and \ | 
					
						
							|  |  |  |                ('"audio/' in enclosure or '"video/' in enclosure): | 
					
						
							|  |  |  |                 link_str = enclosure.split('url="')[1] | 
					
						
							|  |  |  |                 if '"' in link_str: | 
					
						
							| 
									
										
										
										
											2022-01-12 16:18:54 +00:00
										 |  |  |                     link = link_str.split('"')[0] | 
					
						
							|  |  |  |                     if '://' in link: | 
					
						
							| 
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 |  |  |                         return link, mime_type | 
					
						
							| 
									
										
										
										
											2022-01-12 16:18:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 |  |  |     if '<link>' in rss_item and '</link>' in rss_item: | 
					
						
							|  |  |  |         link = rss_item.split('<link>')[1] | 
					
						
							|  |  |  |         link = link.split('</link>')[0] | 
					
						
							|  |  |  |         if '://' not in link: | 
					
						
							|  |  |  |             return None, None | 
					
						
							|  |  |  |     elif '<link ' in rss_item: | 
					
						
							|  |  |  |         link_str = rss_item.split('<link ')[1] | 
					
						
							|  |  |  |         if '>' in link_str: | 
					
						
							|  |  |  |             link_str = link_str.split('>')[0] | 
					
						
							|  |  |  |             if 'href="' in link_str: | 
					
						
							|  |  |  |                 link_str = link_str.split('href="')[1] | 
					
						
							|  |  |  |                 if '"' in link_str: | 
					
						
							|  |  |  |                     link = link_str.split('"')[0] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 |  |  |     return link, mime_type | 
					
						
							| 
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                      moderated: bool, mirrored: bool, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                      max_posts_per_source: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                      max_feed_item_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                      max_categories_feedItem_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                      session, debug: bool, | 
					
						
							|  |  |  |                      preferred_podcast_formats: []) -> {}: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |     """Converts an xml RSS 2.0 string to a dictionary
 | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if '<item>' not in xml_str: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2020-12-09 10:38:09 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # is this an rss feed containing hashtag categories? | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if '<title>#categories</title>' in xml_str: | 
					
						
							|  |  |  |         _xml2str_to_hashtag_categories(base_dir, xml_str, | 
					
						
							|  |  |  |                                        max_categories_feedItem_size_kb) | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2020-12-09 10:38:09 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     rss_items = xml_str.split('<item>') | 
					
						
							|  |  |  |     post_ctr = 0 | 
					
						
							|  |  |  |     max_bytes = max_feed_item_size_kb * 1024 | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |     first_item = True | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for rss_item in rss_items: | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |         if first_item: | 
					
						
							| 
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 |  |  |             first_item = False | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not rss_item: | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if len(rss_item) > max_bytes: | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |             print('WARN: rss feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<title>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</title>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 |  |  |         if '<link' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<pubDate>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</pubDate>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         title = rss_item.split('<title>')[1] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         title = _remove_cdata(title.split('</title>')[0]) | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |         title = remove_html(title) | 
					
						
							| 
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  |         description = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<description>' in rss_item and '</description>' in rss_item: | 
					
						
							|  |  |  |             description = rss_item.split('<description>')[1] | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |             description = remove_html(description.split('</description>')[0]) | 
					
						
							| 
									
										
										
										
											2020-11-21 23:18:34 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             if '<media:description>' in rss_item and \ | 
					
						
							|  |  |  |                '</media:description>' in rss_item: | 
					
						
							|  |  |  |                 description = rss_item.split('<media:description>')[1] | 
					
						
							| 
									
										
										
										
											2020-11-21 23:18:34 +00:00
										 |  |  |                 description = description.split('</media:description>')[0] | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |                 description = remove_html(description) | 
					
						
							| 
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |         proxy_type = None | 
					
						
							|  |  |  |         if domain.endswith('.onion'): | 
					
						
							|  |  |  |             proxy_type = 'tor' | 
					
						
							|  |  |  |         elif domain.endswith('.i2p'): | 
					
						
							|  |  |  |             proxy_type = 'i2p' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         link, link_mime_type = \ | 
					
						
							|  |  |  |             get_link_from_rss_item(rss_item, preferred_podcast_formats, | 
					
						
							|  |  |  |                                    proxy_type) | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  |         if not link: | 
					
						
							| 
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-11 18:25:13 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  |         item_domain = link.split('://')[1] | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '/' in item_domain: | 
					
						
							|  |  |  |             item_domain = item_domain.split('/')[0] | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if is_blocked_domain(base_dir, item_domain): | 
					
						
							| 
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         pub_date = rss_item.split('<pubDate>')[1] | 
					
						
							|  |  |  |         pub_date = pub_date.split('</pubDate>')[0] | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 |  |  |         unique_string_identifier = title + ' ' + link | 
					
						
							|  |  |  |         pub_date_str = parse_feed_date(pub_date, unique_string_identifier) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if pub_date_str: | 
					
						
							|  |  |  |             if _valid_feed_date(pub_date_str): | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |                 post_filename = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 votes_status = [] | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  |                 podcast_properties = \ | 
					
						
							|  |  |  |                     xml_podcast_to_dict(base_dir, rss_item, xml_str) | 
					
						
							| 
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 |  |  |                 if podcast_properties: | 
					
						
							|  |  |  |                     podcast_properties['linkMimeType'] = link_mime_type | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 _add_newswire_dict_entry(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          result, pub_date_str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          title, link, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          votes_status, post_filename, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          description, moderated, | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |                                          mirrored, [], 32, session, debug, | 
					
						
							|  |  |  |                                          podcast_properties) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 post_ctr += 1 | 
					
						
							|  |  |  |                 if post_ctr >= max_posts_per_source: | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                     break | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if post_ctr > 0: | 
					
						
							| 
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 |  |  |         print('Added ' + str(post_ctr) + ' rss 2.0 feed items to newswire') | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                      moderated: bool, mirrored: bool, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                      max_posts_per_source: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                      max_feed_item_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                      max_categories_feedItem_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                      session, debug: bool, | 
					
						
							|  |  |  |                      preferred_podcast_formats: []) -> {}: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |     """Converts an xml RSS 1.0 string to a dictionary
 | 
					
						
							|  |  |  |     https://validator.w3.org/feed/docs/rss1.html | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     item_str = '<item' | 
					
						
							|  |  |  |     if item_str not in xml_str: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # is this an rss feed containing hashtag categories? | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if '<title>#categories</title>' in xml_str: | 
					
						
							|  |  |  |         _xml2str_to_hashtag_categories(base_dir, xml_str, | 
					
						
							|  |  |  |                                        max_categories_feedItem_size_kb) | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     rss_items = xml_str.split(item_str) | 
					
						
							|  |  |  |     post_ctr = 0 | 
					
						
							|  |  |  |     max_bytes = max_feed_item_size_kb * 1024 | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |     first_item = True | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for rss_item in rss_items: | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |         if first_item: | 
					
						
							| 
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 |  |  |             first_item = False | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if len(rss_item) > max_bytes: | 
					
						
							| 
									
										
										
										
											2020-12-14 17:18:16 +00:00
										 |  |  |             print('WARN: rss 1.0 feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if rss_item.startswith('s>'): | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<title>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</title>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 |  |  |         if '<link' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<dc:date>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</dc:date>' not in rss_item: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         title = rss_item.split('<title>')[1] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         title = _remove_cdata(title.split('</title>')[0]) | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |         title = remove_html(title) | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         description = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<description>' in rss_item and '</description>' in rss_item: | 
					
						
							|  |  |  |             description = rss_item.split('<description>')[1] | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |             description = remove_html(description.split('</description>')[0]) | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             if '<media:description>' in rss_item and \ | 
					
						
							|  |  |  |                '</media:description>' in rss_item: | 
					
						
							|  |  |  |                 description = rss_item.split('<media:description>')[1] | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |                 description = description.split('</media:description>')[0] | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |                 description = remove_html(description) | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |         proxy_type = None | 
					
						
							|  |  |  |         if domain.endswith('.onion'): | 
					
						
							|  |  |  |             proxy_type = 'tor' | 
					
						
							|  |  |  |         elif domain.endswith('.i2p'): | 
					
						
							|  |  |  |             proxy_type = 'i2p' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         link, link_mime_type = \ | 
					
						
							|  |  |  |             get_link_from_rss_item(rss_item, preferred_podcast_formats, | 
					
						
							|  |  |  |                                    proxy_type) | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  |         if not link: | 
					
						
							| 
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         item_domain = link.split('://')[1] | 
					
						
							|  |  |  |         if '/' in item_domain: | 
					
						
							|  |  |  |             item_domain = item_domain.split('/')[0] | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if is_blocked_domain(base_dir, item_domain): | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         pub_date = rss_item.split('<dc:date>')[1] | 
					
						
							|  |  |  |         pub_date = pub_date.split('</dc:date>')[0] | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 |  |  |         unique_string_identifier = title + ' ' + link | 
					
						
							|  |  |  |         pub_date_str = parse_feed_date(pub_date, unique_string_identifier) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if pub_date_str: | 
					
						
							|  |  |  |             if _valid_feed_date(pub_date_str): | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |                 post_filename = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 votes_status = [] | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  |                 podcast_properties = \ | 
					
						
							|  |  |  |                     xml_podcast_to_dict(base_dir, rss_item, xml_str) | 
					
						
							| 
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 |  |  |                 if podcast_properties: | 
					
						
							|  |  |  |                     podcast_properties['linkMimeType'] = link_mime_type | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 _add_newswire_dict_entry(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          result, pub_date_str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          title, link, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          votes_status, post_filename, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          description, moderated, | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |                                          mirrored, [], 32, session, debug, | 
					
						
							|  |  |  |                                          podcast_properties) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 post_ctr += 1 | 
					
						
							|  |  |  |                 if post_ctr >= max_posts_per_source: | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                     break | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if post_ctr > 0: | 
					
						
							| 
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 |  |  |         print('Added ' + str(post_ctr) + ' rss 1.0 feed items to newswire') | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                        moderated: bool, mirrored: bool, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                        max_posts_per_source: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                        max_feed_item_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                        session, debug: bool, | 
					
						
							|  |  |  |                        preferred_podcast_formats: []) -> {}: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |     """Converts an atom feed string to a dictionary
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if '<entry>' not in xml_str: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     atom_items = xml_str.split('<entry>') | 
					
						
							|  |  |  |     post_ctr = 0 | 
					
						
							|  |  |  |     max_bytes = max_feed_item_size_kb * 1024 | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |     first_item = True | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for atom_item in atom_items: | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |         if first_item: | 
					
						
							| 
									
										
										
										
											2022-01-13 12:19:35 +00:00
										 |  |  |             first_item = False | 
					
						
							| 
									
										
										
										
											2022-01-13 12:16:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not atom_item: | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if len(atom_item) > max_bytes: | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |             print('WARN: atom feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<title>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</title>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-13 22:26:01 +00:00
										 |  |  |         if '<link' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<updated>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</updated>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         title = atom_item.split('<title>')[1] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         title = _remove_cdata(title.split('</title>')[0]) | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |         title = remove_html(title) | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |         description = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<summary>' in atom_item and '</summary>' in atom_item: | 
					
						
							|  |  |  |             description = atom_item.split('<summary>')[1] | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |             description = remove_html(description.split('</summary>')[0]) | 
					
						
							| 
									
										
										
										
											2020-11-21 23:29:46 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             if '<media:description>' in atom_item and \ | 
					
						
							|  |  |  |                '</media:description>' in atom_item: | 
					
						
							|  |  |  |                 description = atom_item.split('<media:description>')[1] | 
					
						
							| 
									
										
										
										
											2020-11-21 23:29:46 +00:00
										 |  |  |                 description = description.split('</media:description>')[0] | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |                 description = remove_html(description) | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |         proxy_type = None | 
					
						
							|  |  |  |         if domain.endswith('.onion'): | 
					
						
							|  |  |  |             proxy_type = 'tor' | 
					
						
							|  |  |  |         elif domain.endswith('.i2p'): | 
					
						
							|  |  |  |             proxy_type = 'i2p' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         link, link_mime_type = \ | 
					
						
							|  |  |  |             get_link_from_rss_item(atom_item, preferred_podcast_formats, | 
					
						
							|  |  |  |                                    proxy_type) | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  |         if not link: | 
					
						
							| 
									
										
										
										
											2022-01-12 14:23:07 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         item_domain = link.split('://')[1] | 
					
						
							|  |  |  |         if '/' in item_domain: | 
					
						
							|  |  |  |             item_domain = item_domain.split('/')[0] | 
					
						
							| 
									
										
										
										
											2022-01-12 14:02:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if is_blocked_domain(base_dir, item_domain): | 
					
						
							| 
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         pub_date = atom_item.split('<updated>')[1] | 
					
						
							|  |  |  |         pub_date = pub_date.split('</updated>')[0] | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 |  |  |         unique_string_identifier = title + ' ' + link | 
					
						
							|  |  |  |         pub_date_str = parse_feed_date(pub_date, unique_string_identifier) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if pub_date_str: | 
					
						
							|  |  |  |             if _valid_feed_date(pub_date_str): | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |                 post_filename = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 votes_status = [] | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  |                 podcast_properties = \ | 
					
						
							|  |  |  |                     xml_podcast_to_dict(base_dir, atom_item, xml_str) | 
					
						
							| 
									
										
										
										
											2022-01-12 18:35:15 +00:00
										 |  |  |                 if podcast_properties: | 
					
						
							|  |  |  |                     podcast_properties['linkMimeType'] = link_mime_type | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 _add_newswire_dict_entry(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          result, pub_date_str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          title, link, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          votes_status, post_filename, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          description, moderated, | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |                                          mirrored, [], 32, session, debug, | 
					
						
							|  |  |  |                                          podcast_properties) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 post_ctr += 1 | 
					
						
							|  |  |  |                 if post_ctr >= max_posts_per_source: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                     break | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if post_ctr > 0: | 
					
						
							| 
									
										
										
										
											2022-01-12 14:31:04 +00:00
										 |  |  |         print('Added ' + str(post_ctr) + ' atom feed items to newswire') | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                          moderated: bool, mirrored: bool, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                          max_posts_per_source: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                          max_feed_item_size_kb: int, | 
					
						
							|  |  |  |                          session, debug: bool) -> {}: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |     """Converts a json feed string to a dictionary
 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:46:26 +00:00
										 |  |  |     See https://jsonfeed.org/version/1.1 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if '"items"' not in xml_str: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         feed_json = json.loads(xml_str) | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |     except BaseException: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         print('EX: _json_feed_v1to_dict unable to load json ' + str(xml_str)) | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     max_bytes = max_feed_item_size_kb * 1024 | 
					
						
							|  |  |  |     if not feed_json.get('version'): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if not feed_json['version'].startswith('https://jsonfeed.org/version/1'): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if not feed_json.get('items'): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if not isinstance(feed_json['items'], list): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     post_ctr = 0 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:47:49 +00:00
										 |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for json_feed_item in feed_json['items']: | 
					
						
							|  |  |  |         if not json_feed_item: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not isinstance(json_feed_item, dict): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not json_feed_item.get('url'): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not isinstance(json_feed_item['url'], str): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not json_feed_item.get('date_published'): | 
					
						
							|  |  |  |             if not json_feed_item.get('date_modified'): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not json_feed_item.get('content_text'): | 
					
						
							|  |  |  |             if not json_feed_item.get('content_html'): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if json_feed_item.get('content_html'): | 
					
						
							|  |  |  |             if not isinstance(json_feed_item['content_html'], str): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             title = remove_html(json_feed_item['content_html']) | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             if not isinstance(json_feed_item['content_text'], str): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             title = remove_html(json_feed_item['content_text']) | 
					
						
							|  |  |  |         if len(title) > max_bytes: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |             print('WARN: json feed title is too long') | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         description = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if json_feed_item.get('description'): | 
					
						
							|  |  |  |             if not isinstance(json_feed_item['description'], str): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             description = remove_html(json_feed_item['description']) | 
					
						
							|  |  |  |             if len(description) > max_bytes: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                 print('WARN: json feed description is too long') | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             if json_feed_item.get('tags'): | 
					
						
							|  |  |  |                 if isinstance(json_feed_item['tags'], list): | 
					
						
							|  |  |  |                     for tag_name in json_feed_item['tags']: | 
					
						
							|  |  |  |                         if not isinstance(tag_name, str): | 
					
						
							| 
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 |  |  |                             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                         if ' ' in tag_name: | 
					
						
							| 
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 |  |  |                             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                         if not tag_name.startswith('#'): | 
					
						
							|  |  |  |                             tag_name = '#' + tag_name | 
					
						
							|  |  |  |                         if tag_name not in description: | 
					
						
							|  |  |  |                             description += ' ' + tag_name | 
					
						
							| 
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         link = json_feed_item['url'] | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         if '://' not in link: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if len(link) > max_bytes: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |             print('WARN: json feed link is too long') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         item_domain = link.split('://')[1] | 
					
						
							|  |  |  |         if '/' in item_domain: | 
					
						
							|  |  |  |             item_domain = item_domain.split('/')[0] | 
					
						
							|  |  |  |         if is_blocked_domain(base_dir, item_domain): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if json_feed_item.get('date_published'): | 
					
						
							|  |  |  |             if not isinstance(json_feed_item['date_published'], str): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             pub_date = json_feed_item['date_published'] | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             if not isinstance(json_feed_item['date_modified'], str): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             pub_date = json_feed_item['date_modified'] | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 |  |  |         unique_string_identifier = title + ' ' + link | 
					
						
							|  |  |  |         pub_date_str = parse_feed_date(pub_date, unique_string_identifier) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if pub_date_str: | 
					
						
							|  |  |  |             if _valid_feed_date(pub_date_str): | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |                 post_filename = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 votes_status = [] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 _add_newswire_dict_entry(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          result, pub_date_str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          title, link, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          votes_status, post_filename, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          description, moderated, | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |                                          mirrored, [], 32, session, debug, | 
					
						
							|  |  |  |                                          None) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 post_ctr += 1 | 
					
						
							|  |  |  |                 if post_ctr >= max_posts_per_source: | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                     break | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if post_ctr > 0: | 
					
						
							|  |  |  |         print('Added ' + str(post_ctr) + | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |               ' json feed items to newswire') | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                           moderated: bool, mirrored: bool, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                           max_posts_per_source: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                           max_feed_item_size_kb: int, | 
					
						
							|  |  |  |                           session, debug: bool) -> {}: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |     """Converts an atom-style YouTube feed string to a dictionary
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if '<entry>' not in xml_str: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2021-12-28 21:55:38 +00:00
										 |  |  |     if is_blocked_domain(base_dir, 'www.youtube.com'): | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     atom_items = xml_str.split('<entry>') | 
					
						
							|  |  |  |     post_ctr = 0 | 
					
						
							|  |  |  |     max_bytes = max_feed_item_size_kb * 1024 | 
					
						
							| 
									
										
										
										
											2022-01-13 22:57:16 +00:00
										 |  |  |     first_entry = True | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for atom_item in atom_items: | 
					
						
							| 
									
										
										
										
											2022-01-13 22:57:16 +00:00
										 |  |  |         if first_entry: | 
					
						
							|  |  |  |             first_entry = False | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not atom_item: | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not atom_item.strip(): | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if len(atom_item) > max_bytes: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             print('WARN: atom feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<title>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</title>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<published>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</published>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<yt:videoId>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '</yt:videoId>' not in atom_item: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         title = atom_item.split('<title>')[1] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         title = _remove_cdata(title.split('</title>')[0]) | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |         description = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if '<media:description>' in atom_item and \ | 
					
						
							|  |  |  |            '</media:description>' in atom_item: | 
					
						
							|  |  |  |             description = atom_item.split('<media:description>')[1] | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             description = description.split('</media:description>')[0] | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |             description = remove_html(description) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         elif '<summary>' in atom_item and '</summary>' in atom_item: | 
					
						
							|  |  |  |             description = atom_item.split('<summary>')[1] | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             description = description.split('</summary>')[0] | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |             description = remove_html(description) | 
					
						
							| 
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |         link, _ = get_link_from_rss_item(atom_item, None, None) | 
					
						
							| 
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 |  |  |         if not link: | 
					
						
							|  |  |  |             link = atom_item.split('<yt:videoId>')[1] | 
					
						
							|  |  |  |             link = link.split('</yt:videoId>')[0] | 
					
						
							|  |  |  |             link = 'https://www.youtube.com/watch?v=' + link.strip() | 
					
						
							|  |  |  |         if not link: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         pub_date = atom_item.split('<published>')[1] | 
					
						
							|  |  |  |         pub_date = pub_date.split('</published>')[0] | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-27 10:42:46 +00:00
										 |  |  |         unique_string_identifier = title + ' ' + link | 
					
						
							|  |  |  |         pub_date_str = parse_feed_date(pub_date, unique_string_identifier) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if pub_date_str: | 
					
						
							|  |  |  |             if _valid_feed_date(pub_date_str): | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |                 post_filename = '' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 votes_status = [] | 
					
						
							| 
									
										
										
										
											2022-02-12 20:37:15 +00:00
										 |  |  |                 podcast_properties = \ | 
					
						
							|  |  |  |                     xml_podcast_to_dict(base_dir, atom_item, xml_str) | 
					
						
							| 
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 |  |  |                 if podcast_properties: | 
					
						
							| 
									
										
										
										
											2022-01-14 18:48:43 +00:00
										 |  |  |                     podcast_properties['linkMimeType'] = 'video/youtube' | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 _add_newswire_dict_entry(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          result, pub_date_str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          title, link, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                          votes_status, post_filename, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          description, moderated, mirrored, | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |                                          [], 32, session, debug, | 
					
						
							| 
									
										
										
										
											2022-01-14 17:40:42 +00:00
										 |  |  |                                          podcast_properties) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 post_ctr += 1 | 
					
						
							|  |  |  |                 if post_ctr >= max_posts_per_source: | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                     break | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if post_ctr > 0: | 
					
						
							|  |  |  |         print('Added ' + str(post_ctr) + ' YouTube feed items to newswire') | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _xml_str_to_dict(base_dir: str, domain: str, xml_str: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                      moderated: bool, mirrored: bool, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                      max_posts_per_source: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                      max_feed_item_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                      max_categories_feedItem_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                      session, debug: bool, | 
					
						
							|  |  |  |                      preferred_podcast_formats: []) -> {}: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     """Converts an xml string to a dictionary
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if '<yt:videoId>' in xml_str and '<yt:channelId>' in xml_str: | 
					
						
							| 
									
										
										
										
											2020-11-22 16:10:58 +00:00
										 |  |  |         print('YouTube feed: reading') | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _atom_feed_yt_to_dict(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                      xml_str, moderated, mirrored, | 
					
						
							|  |  |  |                                      max_posts_per_source, | 
					
						
							|  |  |  |                                      max_feed_item_size_kb, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                      session, debug) | 
					
						
							| 
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 |  |  |     if 'rss version="2.0"' in xml_str: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _xml2str_to_dict(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                 xml_str, moderated, mirrored, | 
					
						
							|  |  |  |                                 max_posts_per_source, max_feed_item_size_kb, | 
					
						
							|  |  |  |                                 max_categories_feedItem_size_kb, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                                 session, debug, | 
					
						
							|  |  |  |                                 preferred_podcast_formats) | 
					
						
							| 
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 |  |  |     if '<?xml version="1.0"' in xml_str: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _xml1str_to_dict(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                 xml_str, moderated, mirrored, | 
					
						
							|  |  |  |                                 max_posts_per_source, max_feed_item_size_kb, | 
					
						
							|  |  |  |                                 max_categories_feedItem_size_kb, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                                 session, debug, preferred_podcast_formats) | 
					
						
							| 
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 |  |  |     if 'xmlns="http://www.w3.org/2005/Atom"' in xml_str: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _atom_feed_to_dict(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                   xml_str, moderated, mirrored, | 
					
						
							|  |  |  |                                   max_posts_per_source, max_feed_item_size_kb, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                                   session, debug, preferred_podcast_formats) | 
					
						
							| 
									
										
										
										
											2022-01-10 22:30:06 +00:00
										 |  |  |     if 'https://jsonfeed.org/version/1' in xml_str: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _json_feed_v1to_dict(base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                     xml_str, moderated, mirrored, | 
					
						
							|  |  |  |                                     max_posts_per_source, | 
					
						
							|  |  |  |                                     max_feed_item_size_kb, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                     session, debug) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     return {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  | def _yt_channel_to_atom_feed(url: str) -> str: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:46:54 +00:00
										 |  |  |     """Converts a YouTube channel url into an atom feed url
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if 'youtube.com/channel/' not in url: | 
					
						
							|  |  |  |         return url | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     channel_id = url.split('youtube.com/channel/')[1].strip() | 
					
						
							|  |  |  |     channel_url = \ | 
					
						
							|  |  |  |         'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id | 
					
						
							|  |  |  |     print('YouTube feed: ' + channel_url) | 
					
						
							|  |  |  |     return channel_url | 
					
						
							| 
									
										
										
										
											2020-11-22 10:46:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def get_rss(base_dir: str, domain: str, session, url: str, | 
					
						
							|  |  |  |             moderated: bool, mirrored: bool, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             max_posts_per_source: int, max_feed_size_kb: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             max_feed_item_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |             max_categories_feedItem_size_kb: int, debug: bool, | 
					
						
							| 
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 |  |  |             preferred_podcast_formats: [], | 
					
						
							|  |  |  |             timeout_sec: int) -> {}: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     """Returns an RSS url as a dict
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not isinstance(url, str): | 
					
						
							|  |  |  |         print('url: ' + str(url)) | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         print('ERROR: get_rss url should be a string') | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         return None | 
					
						
							|  |  |  |     headers = { | 
					
						
							| 
									
										
										
										
											2020-12-14 20:22:05 +00:00
										 |  |  |         'Accept': 'text/xml, application/xml; charset=UTF-8' | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     } | 
					
						
							|  |  |  |     params = None | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     session_params = {} | 
					
						
							|  |  |  |     session_headers = {} | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     if headers: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         session_headers = headers | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     if params: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         session_params = params | 
					
						
							|  |  |  |     session_headers['User-Agent'] = \ | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0' | 
					
						
							|  |  |  |     if not session: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         print('WARN: no session specified for get_rss') | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     url = _yt_channel_to_atom_feed(url) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         result = \ | 
					
						
							| 
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 |  |  |             session.get(url, headers=session_headers, | 
					
						
							|  |  |  |                         params=session_params, | 
					
						
							| 
									
										
										
										
											2022-04-24 20:33:07 +00:00
										 |  |  |                         timeout=timeout_sec, | 
					
						
							|  |  |  |                         allow_redirects=False) | 
					
						
							| 
									
										
										
										
											2020-10-16 11:40:01 +00:00
										 |  |  |         if result: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             if int(len(result.text) / 1024) < max_feed_size_kb and \ | 
					
						
							| 
									
										
										
										
											2021-12-27 17:53:41 +00:00
										 |  |  |                not contains_invalid_chars(result.text): | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 return _xml_str_to_dict(base_dir, domain, result.text, | 
					
						
							|  |  |  |                                         moderated, mirrored, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                         max_posts_per_source, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                         max_feed_item_size_kb, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                         max_categories_feedItem_size_kb, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                                         session, debug, | 
					
						
							|  |  |  |                                         preferred_podcast_formats) | 
					
						
							| 
									
										
										
										
											2020-10-16 11:40:01 +00:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2020-11-22 12:43:22 +00:00
										 |  |  |                 print('WARN: feed is too large, ' + | 
					
						
							|  |  |  |                       'or contains invalid characters: ' + url) | 
					
						
							| 
									
										
										
										
											2020-11-22 13:04:58 +00:00
										 |  |  |         else: | 
					
						
							|  |  |  |             print('WARN: no result returned for feed ' + url) | 
					
						
							| 
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 |  |  |     except requests.exceptions.RequestException as ex: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         print('WARN: get_rss failed\nurl: ' + str(url) + ', ' + | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |               'headers: ' + str(session_headers) + ', ' + | 
					
						
							|  |  |  |               'params: ' + str(session_params) + ', ' + str(ex)) | 
					
						
							| 
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 |  |  |     except ValueError as ex: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         print('WARN: get_rss failed\nurl: ' + str(url) + ', ' + | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |               'headers: ' + str(session_headers) + ', ' + | 
					
						
							|  |  |  |               'params: ' + str(session_params) + ', ' + str(ex)) | 
					
						
							| 
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 |  |  |     except SocketError as ex: | 
					
						
							|  |  |  |         if ex.errno == errno.ECONNRESET: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             print('WARN: connection was reset during get_rss ' + str(ex)) | 
					
						
							| 
									
										
										
										
											2021-05-20 12:52:13 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             print('WARN: get_rss, ' + str(ex)) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def get_rs_sfrom_dict(base_dir: str, newswire: {}, | 
					
						
							|  |  |  |                       http_prefix: str, domain_full: str, | 
					
						
							|  |  |  |                       title: str, translate: {}) -> str: | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     """Returns an rss feed from the current newswire dict.
 | 
					
						
							|  |  |  |     This allows other instances to subscribe to the same newswire | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     rss_str = rss2header(http_prefix, | 
					
						
							|  |  |  |                          None, domain_full, | 
					
						
							|  |  |  |                          'Newswire', translate) | 
					
						
							| 
									
										
										
										
											2020-11-03 14:41:28 +00:00
										 |  |  |     if not newswire: | 
					
						
							|  |  |  |         return '' | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     for published, fields in newswire.items(): | 
					
						
							| 
									
										
										
										
											2020-10-20 12:22:52 +00:00
										 |  |  |         if '+00:00' in published: | 
					
						
							|  |  |  |             published = published.replace('+00:00', 'Z').strip() | 
					
						
							|  |  |  |             published = published.replace(' ', 'T') | 
					
						
							|  |  |  |         else: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             published_with_offset = \ | 
					
						
							| 
									
										
										
										
											2020-10-20 12:37:32 +00:00
										 |  |  |                 datetime.strptime(published, "%Y-%m-%d %H:%M:%S%z") | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             published = published_with_offset.strftime("%Y-%m-%dT%H:%M:%SZ") | 
					
						
							| 
									
										
										
										
											2020-10-04 22:08:13 +00:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             pub_date = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ") | 
					
						
							| 
									
										
										
										
											2022-02-03 10:39:52 +00:00
										 |  |  |         except BaseException as ex: | 
					
						
							| 
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 |  |  |             print('WARN: Unable to convert date ' + published + ' ' + str(ex)) | 
					
						
							| 
									
										
										
										
											2020-10-04 22:08:13 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         rss_str += \ | 
					
						
							| 
									
										
										
										
											2021-07-04 11:02:08 +00:00
										 |  |  |             '<item>\n' + \ | 
					
						
							|  |  |  |             '  <title>' + fields[0] + '</title>\n' | 
					
						
							| 
									
										
										
										
											2021-12-27 15:52:08 +00:00
										 |  |  |         description = remove_html(first_paragraph_from_string(fields[4])) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         rss_str += '  <description>' + description + '</description>\n' | 
					
						
							| 
									
										
										
										
											2020-10-08 15:07:06 +00:00
										 |  |  |         url = fields[1] | 
					
						
							| 
									
										
										
										
											2020-11-08 11:04:52 +00:00
										 |  |  |         if '://' not in url: | 
					
						
							| 
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 |  |  |             if domain_full not in url: | 
					
						
							|  |  |  |                 url = http_prefix + '://' + domain_full + url | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         rss_str += '  <link>' + url + '</link>\n' | 
					
						
							| 
									
										
										
										
											2020-10-04 22:12:27 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         rss_date_str = pub_date.strftime("%a, %d %b %Y %H:%M:%S UT") | 
					
						
							|  |  |  |         rss_str += \ | 
					
						
							|  |  |  |             '  <pubDate>' + rss_date_str + '</pubDate>\n' + \ | 
					
						
							| 
									
										
										
										
											2021-07-04 11:02:08 +00:00
										 |  |  |             '</item>\n' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     rss_str += rss2footer() | 
					
						
							|  |  |  |     return rss_str | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _is_newswire_blog_post(post_json_object: {}) -> bool: | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     """Is the given object a blog post?
 | 
					
						
							| 
									
										
										
										
											2020-10-25 10:47:39 +00:00
										 |  |  |     There isn't any difference between a blog post and a newswire blog post | 
					
						
							|  |  |  |     but we may here need to check for different properties than | 
					
						
							| 
									
										
										
										
											2021-12-28 13:49:44 +00:00
										 |  |  |     is_blog_post does | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |     if not post_json_object: | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |         return False | 
					
						
							| 
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 |  |  |     if not has_object_dict(post_json_object): | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |         return False | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |     if post_json_object['object'].get('summary') and \ | 
					
						
							|  |  |  |        post_json_object['object'].get('url') and \ | 
					
						
							|  |  |  |        post_json_object['object'].get('content') and \ | 
					
						
							|  |  |  |        post_json_object['object'].get('published'): | 
					
						
							| 
									
										
										
										
											2021-12-28 14:41:10 +00:00
										 |  |  |         return is_public_post(post_json_object) | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _get_hashtags_from_post(post_json_object: {}) -> []: | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |     """Returns a list of any hashtags within a post
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-12-26 10:57:03 +00:00
										 |  |  |     if not has_object_dict(post_json_object): | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |         return [] | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |     if not post_json_object['object'].get('tag'): | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |         return [] | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |     if not isinstance(post_json_object['object']['tag'], list): | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |         return [] | 
					
						
							|  |  |  |     tags = [] | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for tgname in post_json_object['object']['tag']: | 
					
						
							|  |  |  |         if not isinstance(tgname, dict): | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not tgname.get('name'): | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if not tgname.get('type'): | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if tgname['type'] != 'Hashtag': | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if tgname['name'] not in tags: | 
					
						
							|  |  |  |             tags.append(tgname['name']) | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |     return tags | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str, | 
					
						
							|  |  |  |                                    newswire: {}, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                    max_blogs_per_account: int, | 
					
						
							|  |  |  |                                    index_filename: str, | 
					
						
							|  |  |  |                                    max_tags: int, system_language: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                    session, debug: bool) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |     """Adds blogs for the given account to the newswire
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if not os.path.isfile(index_filename): | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |         return | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  |     # local blog entries are unmoderated by default | 
					
						
							|  |  |  |     moderated = False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # local blogs can potentially be moderated | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     moderated_filename = \ | 
					
						
							| 
									
										
										
										
											2021-12-26 12:02:29 +00:00
										 |  |  |         acct_dir(base_dir, nickname, domain) + '/.newswiremoderated' | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     if os.path.isfile(moderated_filename): | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  |         moderated = True | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     with open(index_filename, 'r') as index_file: | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |         post_filename = 'start' | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |         ctr = 0 | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |         while post_filename: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             post_filename = index_file.readline() | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |             if post_filename: | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |                 # if this is a full path then remove the directories | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |                 if '/' in post_filename: | 
					
						
							|  |  |  |                     post_filename = post_filename.split('/')[-1] | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |                 # filename of the post without any extension or path | 
					
						
							|  |  |  |                 # This should also correspond to any index entry in | 
					
						
							|  |  |  |                 # the posts cache | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 post_url = \ | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |                     post_filename.replace('\n', '').replace('\r', '') | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 post_url = post_url.replace('.json', '').strip() | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |                 # read the post from file | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 full_post_filename = \ | 
					
						
							| 
									
										
										
										
											2021-12-26 20:36:08 +00:00
										 |  |  |                     locate_post(base_dir, nickname, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                 domain, post_url, False) | 
					
						
							|  |  |  |                 if not full_post_filename: | 
					
						
							|  |  |  |                     print('Unable to locate post for newswire ' + post_url) | 
					
						
							| 
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 |  |  |                     ctr += 1 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                     if ctr >= max_blogs_per_account: | 
					
						
							| 
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 |  |  |                         break | 
					
						
							| 
									
										
										
										
											2020-10-06 13:34:04 +00:00
										 |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |                 post_json_object = None | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 if full_post_filename: | 
					
						
							|  |  |  |                     post_json_object = load_json(full_post_filename) | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 if _is_newswire_blog_post(post_json_object): | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |                     published = post_json_object['object']['published'] | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |                     published = published.replace('T', ' ') | 
					
						
							|  |  |  |                     published = published.replace('Z', '+00:00') | 
					
						
							| 
									
										
										
										
											2020-10-06 20:17:34 +00:00
										 |  |  |                     votes = [] | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                     if os.path.isfile(full_post_filename + '.votes'): | 
					
						
							|  |  |  |                         votes = load_json(full_post_filename + '.votes') | 
					
						
							| 
									
										
										
										
											2021-07-18 14:15:16 +00:00
										 |  |  |                     content = \ | 
					
						
							| 
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 |  |  |                         get_base_content_from_post(post_json_object, | 
					
						
							|  |  |  |                                                    system_language) | 
					
						
							| 
									
										
										
										
											2021-12-27 15:52:08 +00:00
										 |  |  |                     description = first_paragraph_from_string(content) | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  |                     description = remove_html(description) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                     tags_from_post = _get_hashtags_from_post(post_json_object) | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |                     summary = post_json_object['object']['summary'] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                     _add_newswire_dict_entry(base_dir, domain, | 
					
						
							|  |  |  |                                              newswire, published, | 
					
						
							|  |  |  |                                              summary, | 
					
						
							|  |  |  |                                              post_json_object['object']['url'], | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                              votes, full_post_filename, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                              description, moderated, False, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                              tags_from_post, | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  |                                              max_tags, session, debug, | 
					
						
							|  |  |  |                                              None) | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |             ctr += 1 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             if ctr >= max_blogs_per_account: | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |                 break | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _add_blogs_to_newswire(base_dir: str, domain: str, newswire: {}, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                            max_blogs_per_account: int, | 
					
						
							|  |  |  |                            max_tags: int, system_language: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                            session, debug: bool) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-06 09:47:58 +00:00
										 |  |  |     """Adds blogs from each user account into the newswire
 | 
					
						
							| 
									
										
										
										
											2020-10-06 09:37:22 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     moderation_dict = {} | 
					
						
							| 
									
										
										
										
											2020-10-06 10:34:56 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |     # go through each account | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for _, dirs, _ in os.walk(base_dir + '/accounts'): | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |         for handle in dirs: | 
					
						
							| 
									
										
										
										
											2021-12-26 18:46:43 +00:00
										 |  |  |             if not is_account_dir(handle): | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-10-06 10:34:56 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-06 09:37:22 +00:00
										 |  |  |             nickname = handle.split('@')[0] | 
					
						
							| 
									
										
										
										
											2020-10-05 11:30:11 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |             # has this account been suspended? | 
					
						
							| 
									
										
										
										
											2021-12-27 15:37:31 +00:00
										 |  |  |             if is_suspended(base_dir, nickname): | 
					
						
							| 
									
										
										
										
											2020-10-06 08:58:44 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-10-05 11:30:11 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 |  |  |             if os.path.isfile(base_dir + '/accounts/' + handle + | 
					
						
							| 
									
										
										
										
											2020-10-06 21:28:40 +00:00
										 |  |  |                               '/.nonewswire'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |             # is there a blogs timeline for this account? | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |             account_dir = os.path.join(base_dir + '/accounts', handle) | 
					
						
							|  |  |  |             blogs_index = account_dir + '/tlblogs.index' | 
					
						
							|  |  |  |             if os.path.isfile(blogs_index): | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |                 domain = handle.split('@')[1] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 _add_account_blogs_to_newswire(base_dir, nickname, domain, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                                                newswire, max_blogs_per_account, | 
					
						
							|  |  |  |                                                blogs_index, max_tags, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                                system_language, session, | 
					
						
							|  |  |  |                                                debug) | 
					
						
							| 
									
										
										
										
											2020-12-13 22:13:45 +00:00
										 |  |  |         break | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     # sort the moderation dict into chronological order, latest first | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     sorted_moderation_dict = \ | 
					
						
							|  |  |  |         OrderedDict(sorted(moderation_dict.items(), reverse=True)) | 
					
						
							| 
									
										
										
										
											2020-10-06 12:15:35 +00:00
										 |  |  |     # save the moderation queue details for later display | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     newswire_moderation_filename = \ | 
					
						
							|  |  |  |         base_dir + '/accounts/newswiremoderation.txt' | 
					
						
							|  |  |  |     if sorted_moderation_dict: | 
					
						
							|  |  |  |         save_json(sorted_moderation_dict, newswire_moderation_filename) | 
					
						
							| 
									
										
										
										
											2020-10-06 14:32:53 +00:00
										 |  |  |     else: | 
					
						
							|  |  |  |         # remove the file if there is nothing to moderate | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if os.path.isfile(newswire_moderation_filename): | 
					
						
							| 
									
										
										
										
											2021-09-05 10:17:43 +00:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 os.remove(newswire_moderation_filename) | 
					
						
							| 
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 |  |  |             except OSError: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 print('EX: _add_blogs_to_newswire unable to delete ' + | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                       str(newswire_moderation_filename)) | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def get_dict_from_newswire(session, base_dir: str, domain: str, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                            max_posts_per_source: int, max_feed_size_kb: int, | 
					
						
							|  |  |  |                            max_tags: int, max_feed_item_size_kb: int, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                            max_newswire_posts: int, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                            max_categories_feedItem_size_kb: int, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                            system_language: str, debug: bool, | 
					
						
							| 
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 |  |  |                            preferred_podcast_formats: [], | 
					
						
							|  |  |  |                            timeout_sec: int) -> {}: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:59:55 +00:00
										 |  |  |     """Gets rss feeds as a dictionary from newswire file
 | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     subscriptions_filename = base_dir + '/accounts/newswire.txt' | 
					
						
							|  |  |  |     if not os.path.isfile(subscriptions_filename): | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     max_posts_per_source = 5 | 
					
						
							| 
									
										
										
										
											2020-10-16 10:13:14 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |     # add rss feeds | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     rss_feed = [] | 
					
						
							|  |  |  |     with open(subscriptions_filename, 'r') as fp_sub: | 
					
						
							|  |  |  |         rss_feed = fp_sub.readlines() | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     for url in rss_feed: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         url = url.strip() | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # Does this contain a url? | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         if '://' not in url: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # is this a comment? | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         if url.startswith('#'): | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # should this feed be moderated? | 
					
						
							|  |  |  |         moderated = False | 
					
						
							|  |  |  |         if '*' in url: | 
					
						
							|  |  |  |             moderated = True | 
					
						
							|  |  |  |             url = url.replace('*', '').strip() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-19 14:37:17 +00:00
										 |  |  |         # should this feed content be mirrored? | 
					
						
							|  |  |  |         mirrored = False | 
					
						
							|  |  |  |         if '!' in url: | 
					
						
							|  |  |  |             mirrored = True | 
					
						
							|  |  |  |             url = url.replace('!', '').strip() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         items_list = get_rss(base_dir, domain, session, url, | 
					
						
							|  |  |  |                              moderated, mirrored, | 
					
						
							|  |  |  |                              max_posts_per_source, max_feed_size_kb, | 
					
						
							|  |  |  |                              max_feed_item_size_kb, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                              max_categories_feedItem_size_kb, debug, | 
					
						
							| 
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 |  |  |                              preferred_podcast_formats, | 
					
						
							|  |  |  |                              timeout_sec) | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         if items_list: | 
					
						
							|  |  |  |             for date_str, item in items_list.items(): | 
					
						
							|  |  |  |                 result[date_str] = item | 
					
						
							| 
									
										
										
										
											2022-04-27 17:12:25 +00:00
										 |  |  |         time.sleep(4) | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-06 09:47:58 +00:00
										 |  |  |     # add blogs from each user account | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |     _add_blogs_to_newswire(base_dir, domain, result, | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                            max_posts_per_source, max_tags, system_language, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                            session, debug) | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # sort into chronological order, latest first | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     sorted_result = OrderedDict(sorted(result.items(), reverse=True)) | 
					
						
							| 
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # are there too many posts? If so then remove the oldest ones | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     no_of_posts = len(sorted_result.items()) | 
					
						
							|  |  |  |     if no_of_posts > max_newswire_posts: | 
					
						
							| 
									
										
										
										
											2020-11-22 12:05:15 +00:00
										 |  |  |         ctr = 0 | 
					
						
							|  |  |  |         removals = [] | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |         for date_str, item in sorted_result.items(): | 
					
						
							| 
									
										
										
										
											2020-11-22 12:05:15 +00:00
										 |  |  |             ctr += 1 | 
					
						
							| 
									
										
										
										
											2021-12-25 18:49:19 +00:00
										 |  |  |             if ctr > max_newswire_posts: | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |                 removals.append(date_str) | 
					
						
							|  |  |  |         for remov in removals: | 
					
						
							|  |  |  |             sorted_result.pop(remov) | 
					
						
							| 
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:37:09 +00:00
										 |  |  |     return sorted_result |