| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | __filename__ = "newsdaemon.py" | 
					
						
							|  |  |  | __author__ = "Bob Mottram" | 
					
						
							|  |  |  | __license__ = "AGPL3+" | 
					
						
							| 
									
										
										
										
											2022-02-03 13:58:20 +00:00
										 |  |  | __version__ = "1.3.0" | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | __maintainer__ = "Bob Mottram" | 
					
						
							| 
									
										
										
										
											2021-09-10 16:14:50 +00:00
										 |  |  | __email__ = "bob@libreserver.org" | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | __status__ = "Production" | 
					
						
							| 
									
										
										
										
											2021-06-26 11:27:14 +00:00
										 |  |  | __module_group__ = "Web Interface Columns" | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-17 18:53:08 +00:00
										 |  |  | # Example hashtag logic: | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # if moderated and not #imcoxford then block | 
					
						
							|  |  |  | # if #pol and contains "westminster" then add #britpol | 
					
						
							| 
									
										
										
										
											2020-10-17 19:06:56 +00:00
										 |  |  | # if #unwantedtag then block | 
					
						
							| 
									
										
										
										
											2020-10-17 18:53:08 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | import time | 
					
						
							| 
									
										
										
										
											2020-10-09 10:05:01 +00:00
										 |  |  | import datetime | 
					
						
							| 
									
										
										
										
											2020-10-20 13:07:02 +00:00
										 |  |  | import html | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  | from shutil import rmtree | 
					
						
							|  |  |  | from subprocess import Popen | 
					
						
							| 
									
										
										
										
											2020-10-07 18:46:42 +00:00
										 |  |  | from collections import OrderedDict | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | from newswire import get_dict_from_newswire | 
					
						
							|  |  |  | # from posts import send_signed_json | 
					
						
							|  |  |  | from posts import create_news_post | 
					
						
							|  |  |  | from posts import archive_posts_for_person | 
					
						
							| 
									
										
										
										
											2022-01-13 15:10:41 +00:00
										 |  |  | from utils import valid_hash_tag | 
					
						
							| 
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 |  |  | from utils import get_base_content_from_post | 
					
						
							| 
									
										
										
										
											2021-12-27 15:43:22 +00:00
										 |  |  | from utils import remove_html | 
					
						
							| 
									
										
										
										
											2021-12-26 12:45:03 +00:00
										 |  |  | from utils import get_full_domain | 
					
						
							| 
									
										
										
										
											2021-12-26 15:13:34 +00:00
										 |  |  | from utils import load_json | 
					
						
							| 
									
										
										
										
											2021-12-26 14:47:21 +00:00
										 |  |  | from utils import save_json | 
					
						
							| 
									
										
										
										
											2021-12-27 17:42:35 +00:00
										 |  |  | from utils import get_status_number | 
					
						
							| 
									
										
										
										
											2021-12-28 10:17:58 +00:00
										 |  |  | from utils import clear_from_post_caches | 
					
						
							| 
									
										
										
										
											2021-12-27 21:42:08 +00:00
										 |  |  | from utils import dangerous_markup | 
					
						
							| 
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 |  |  | from utils import local_actor_url | 
					
						
							| 
									
										
										
										
											2022-06-10 11:43:33 +00:00
										 |  |  | from utils import text_in_file | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | from inbox import store_hash_tags | 
					
						
							| 
									
										
										
										
											2021-12-28 16:56:57 +00:00
										 |  |  | from session import create_session | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-08 12:29:40 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _update_feeds_outbox_index(base_dir: str, domain: str, | 
					
						
							|  |  |  |                                post_id: str) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  |     """Updates the index used for imported RSS feeds
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     base_path = base_dir + '/accounts/news@' + domain | 
					
						
							|  |  |  |     index_filename = base_path + '/outbox.index' | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if os.path.isfile(index_filename): | 
					
						
							| 
									
										
										
										
											2022-06-10 11:43:33 +00:00
										 |  |  |         if not text_in_file(post_id, index_filename): | 
					
						
							| 
									
										
										
										
											2020-10-07 18:46:42 +00:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 with open(index_filename, 'r+') as feeds_file: | 
					
						
							|  |  |  |                     content = feeds_file.read() | 
					
						
							| 
									
										
										
										
											2021-12-26 19:47:06 +00:00
										 |  |  |                     if post_id + '\n' not in content: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                         feeds_file.seek(0, 0) | 
					
						
							|  |  |  |                         feeds_file.write(post_id + '\n' + content) | 
					
						
							| 
									
										
										
										
											2020-12-29 20:22:28 +00:00
										 |  |  |                         print('DEBUG: feeds post added to index') | 
					
						
							| 
									
										
										
										
											2022-05-30 15:15:17 +00:00
										 |  |  |             except OSError as ex: | 
					
						
							| 
									
										
										
										
											2022-02-03 10:39:52 +00:00
										 |  |  |                 print('EX: Failed to write entry to feeds posts index ' + | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                       index_filename + ' ' + str(ex)) | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 |  |  |             with open(index_filename, 'w+', encoding='utf-8') as feeds_file: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 feeds_file.write(post_id + '\n') | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |         except OSError: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             print('EX: unable to write ' + index_filename) | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  | def _save_arrived_time(post_filename: str, arrived: str) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-09 12:15:20 +00:00
										 |  |  |     """Saves the time when an rss post arrived to a file
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 |  |  |         with open(post_filename + '.arrived', 'w+', | 
					
						
							|  |  |  |                   encoding='utf-8') as arrived_file: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             arrived_file.write(arrived) | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |     except OSError: | 
					
						
							| 
									
										
										
										
											2021-12-26 23:41:34 +00:00
										 |  |  |         print('EX: unable to write ' + post_filename + '.arrived') | 
					
						
							| 
									
										
										
										
											2020-10-09 12:15:20 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _remove_control_characters(content: str) -> str: | 
					
						
							| 
									
										
										
										
											2020-10-20 13:07:02 +00:00
										 |  |  |     """Remove escaped html
 | 
					
						
							| 
									
										
										
										
											2020-10-11 09:33:31 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-10-20 13:07:02 +00:00
										 |  |  |     if '&' in content: | 
					
						
							|  |  |  |         return html.unescape(content) | 
					
						
							| 
									
										
										
										
											2020-10-11 09:33:31 +00:00
										 |  |  |     return content | 
					
						
							| 
									
										
										
										
											2020-10-10 09:36:23 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-10 08:54:13 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _hashtag_logical_not(tree: [], hashtags: [], moderated: bool, | 
					
						
							|  |  |  |                          content: str, url: str) -> bool: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     """ NOT
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if len(tree) != 2: | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  |     if isinstance(tree[1], str): | 
					
						
							|  |  |  |         return tree[1] not in hashtags | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if isinstance(tree[1], list): | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return not hashtag_rule_resolve(tree[1], hashtags, | 
					
						
							|  |  |  |                                         moderated, content, url) | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  | def _hashtag_logical_contains(tree: [], content: str) -> bool: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     """ Contains
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if len(tree) != 2: | 
					
						
							|  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     match_str = None | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     if isinstance(tree[1], str): | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         match_str = tree[1] | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     elif isinstance(tree[1], list): | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         match_str = tree[1][0] | 
					
						
							|  |  |  |     if match_str: | 
					
						
							|  |  |  |         if match_str.startswith('"') and match_str.endswith('"'): | 
					
						
							|  |  |  |             match_str = match_str[1:] | 
					
						
							|  |  |  |             match_str = match_str[:len(match_str) - 1] | 
					
						
							|  |  |  |         match_str_lower = match_str.lower() | 
					
						
							|  |  |  |         content_without_tags = content.replace('#' + match_str_lower, '') | 
					
						
							|  |  |  |         return match_str_lower in content_without_tags | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  | def _hashtag_logical_from(tree: [], url: str) -> bool: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     """ FROM
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if len(tree) != 2: | 
					
						
							|  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     match_str = None | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     if isinstance(tree[1], str): | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         match_str = tree[1] | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     elif isinstance(tree[1], list): | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         match_str = tree[1][0] | 
					
						
							|  |  |  |     if match_str: | 
					
						
							|  |  |  |         if match_str.startswith('"') and match_str.endswith('"'): | 
					
						
							|  |  |  |             match_str = match_str[1:] | 
					
						
							|  |  |  |             match_str = match_str[:len(match_str) - 1] | 
					
						
							|  |  |  |         return match_str.lower() in url | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _hashtag_logical_and(tree: [], hashtags: [], moderated: bool, | 
					
						
							|  |  |  |                          content: str, url: str) -> bool: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     """ AND
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if len(tree) < 3: | 
					
						
							|  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     for arg_index in range(1, len(tree)): | 
					
						
							|  |  |  |         arg_value = False | 
					
						
							|  |  |  |         if isinstance(tree[arg_index], str): | 
					
						
							|  |  |  |             arg_value = (tree[arg_index] in hashtags) | 
					
						
							|  |  |  |         elif isinstance(tree[arg_index], list): | 
					
						
							|  |  |  |             arg_value = hashtag_rule_resolve(tree[arg_index], | 
					
						
							|  |  |  |                                              hashtags, moderated, | 
					
						
							|  |  |  |                                              content, url) | 
					
						
							|  |  |  |         if not arg_value: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |             return False | 
					
						
							|  |  |  |     return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _hashtag_logical_or(tree: [], hashtags: [], moderated: bool, | 
					
						
							|  |  |  |                         content: str, url: str) -> bool: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     """ OR
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if len(tree) < 3: | 
					
						
							|  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     for arg_index in range(1, len(tree)): | 
					
						
							|  |  |  |         arg_value = False | 
					
						
							|  |  |  |         if isinstance(tree[arg_index], str): | 
					
						
							|  |  |  |             arg_value = (tree[arg_index] in hashtags) | 
					
						
							|  |  |  |         elif isinstance(tree[arg_index], list): | 
					
						
							|  |  |  |             arg_value = hashtag_rule_resolve(tree[arg_index], | 
					
						
							|  |  |  |                                              hashtags, moderated, | 
					
						
							|  |  |  |                                              content, url) | 
					
						
							|  |  |  |         if arg_value: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |             return True | 
					
						
							|  |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _hashtag_logical_xor(tree: [], hashtags: [], moderated: bool, | 
					
						
							|  |  |  |                          content: str, url: str) -> bool: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |     """ XOR
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if len(tree) < 3: | 
					
						
							|  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     true_ctr = 0 | 
					
						
							|  |  |  |     for arg_index in range(1, len(tree)): | 
					
						
							|  |  |  |         arg_value = False | 
					
						
							|  |  |  |         if isinstance(tree[arg_index], str): | 
					
						
							|  |  |  |             arg_value = (tree[arg_index] in hashtags) | 
					
						
							|  |  |  |         elif isinstance(tree[arg_index], list): | 
					
						
							|  |  |  |             arg_value = hashtag_rule_resolve(tree[arg_index], | 
					
						
							|  |  |  |                                              hashtags, moderated, | 
					
						
							|  |  |  |                                              content, url) | 
					
						
							|  |  |  |         if arg_value: | 
					
						
							|  |  |  |             true_ctr += 1 | 
					
						
							|  |  |  |     if true_ctr == 1: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:24:35 +00:00
										 |  |  |         return True | 
					
						
							|  |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def hashtag_rule_resolve(tree: [], hashtags: [], moderated: bool, | 
					
						
							|  |  |  |                          content: str, url: str) -> bool: | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |     """Returns whether the tree for a hashtag rule evaluates to true or false
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not tree: | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if tree[0] == 'not': | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _hashtag_logical_not(tree, hashtags, moderated, content, url) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if tree[0] == 'contains': | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  |         return _hashtag_logical_contains(tree, content) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if tree[0] == 'from': | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  |         return _hashtag_logical_from(tree, url) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if tree[0] == 'and': | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _hashtag_logical_and(tree, hashtags, moderated, content, url) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if tree[0] == 'or': | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _hashtag_logical_or(tree, hashtags, moderated, content, url) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if tree[0] == 'xor': | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         return _hashtag_logical_xor(tree, hashtags, moderated, content, url) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if tree[0].startswith('#') and len(tree) == 1: | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |         return tree[0] in hashtags | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if tree[0].startswith('moderated'): | 
					
						
							| 
									
										
										
										
											2020-10-17 17:36:10 +00:00
										 |  |  |         return moderated | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if tree[0].startswith('"') and tree[0].endswith('"'): | 
					
						
							| 
									
										
										
										
											2020-10-17 18:49:43 +00:00
										 |  |  |         return True | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def hashtag_rule_tree(operators: [], | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                       conditions_str: str, | 
					
						
							|  |  |  |                       tags_in_conditions: [], | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                       moderated: bool) -> []: | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |     """Walks the tree
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if not operators and conditions_str: | 
					
						
							|  |  |  |         conditions_str = conditions_str.strip() | 
					
						
							|  |  |  |         is_str = \ | 
					
						
							|  |  |  |             conditions_str.startswith('"') and conditions_str.endswith('"') | 
					
						
							|  |  |  |         if conditions_str.startswith('#') or is_str or \ | 
					
						
							|  |  |  |            conditions_str in operators or \ | 
					
						
							|  |  |  |            conditions_str == 'moderated' or \ | 
					
						
							|  |  |  |            conditions_str == 'contains': | 
					
						
							|  |  |  |             if conditions_str.startswith('#'): | 
					
						
							|  |  |  |                 if conditions_str not in tags_in_conditions: | 
					
						
							|  |  |  |                     if ' ' not in conditions_str or \ | 
					
						
							|  |  |  |                        conditions_str.startswith('"'): | 
					
						
							|  |  |  |                         tags_in_conditions.append(conditions_str) | 
					
						
							|  |  |  |             return [conditions_str.strip()] | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  |     if not operators or not conditions_str: | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |         return None | 
					
						
							|  |  |  |     tree = None | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     conditions_str = conditions_str.strip() | 
					
						
							|  |  |  |     is_str = conditions_str.startswith('"') and conditions_str.endswith('"') | 
					
						
							|  |  |  |     if conditions_str.startswith('#') or is_str or \ | 
					
						
							|  |  |  |        conditions_str in operators or \ | 
					
						
							|  |  |  |        conditions_str == 'moderated' or \ | 
					
						
							|  |  |  |        conditions_str == 'contains': | 
					
						
							|  |  |  |         if conditions_str.startswith('#'): | 
					
						
							|  |  |  |             if conditions_str not in tags_in_conditions: | 
					
						
							|  |  |  |                 if ' ' not in conditions_str or \ | 
					
						
							|  |  |  |                    conditions_str.startswith('"'): | 
					
						
							|  |  |  |                     tags_in_conditions.append(conditions_str) | 
					
						
							|  |  |  |         tree = [conditions_str.strip()] | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |     ctr = 0 | 
					
						
							|  |  |  |     while ctr < len(operators): | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         oper = operators[ctr] | 
					
						
							|  |  |  |         opmatch = ' ' + oper + ' ' | 
					
						
							|  |  |  |         if opmatch not in conditions_str and \ | 
					
						
							|  |  |  |            not conditions_str.startswith(oper + ' '): | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |             ctr += 1 | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         tree = [oper] | 
					
						
							|  |  |  |         if opmatch in conditions_str: | 
					
						
							|  |  |  |             sections = conditions_str.split(opmatch) | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             sections = conditions_str.split(oper + ' ', 1) | 
					
						
							|  |  |  |         for sub_condition_str in sections: | 
					
						
							|  |  |  |             result = hashtag_rule_tree(operators[ctr + 1:], | 
					
						
							|  |  |  |                                        sub_condition_str, | 
					
						
							|  |  |  |                                        tags_in_conditions, moderated) | 
					
						
							|  |  |  |             if result: | 
					
						
							|  |  |  |                 tree.append(result) | 
					
						
							|  |  |  |         break | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |     return tree | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _hashtag_add(base_dir: str, http_prefix: str, domain_full: str, | 
					
						
							|  |  |  |                  post_json_object: {}, | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                  action_str: str, hashtags: [], system_language: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                  translate: {}) -> None: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |     """Adds a hashtag via a hashtag rule
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     add_hashtag = action_str.split('add ', 1)[1].strip() | 
					
						
							|  |  |  |     if not add_hashtag.startswith('#'): | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if add_hashtag not in hashtags: | 
					
						
							|  |  |  |         hashtags.append(add_hashtag) | 
					
						
							|  |  |  |     ht_id = add_hashtag.replace('#', '') | 
					
						
							|  |  |  |     if not valid_hash_tag(ht_id): | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     hashtag_url = http_prefix + "://" + domain_full + "/tags/" + ht_id | 
					
						
							|  |  |  |     new_tag = { | 
					
						
							|  |  |  |         'href': hashtag_url, | 
					
						
							|  |  |  |         'name': add_hashtag, | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |         'type': 'Hashtag' | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     # does the tag already exist? | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     add_tag_object = None | 
					
						
							|  |  |  |     for htag in post_json_object['object']['tag']: | 
					
						
							|  |  |  |         if htag.get('type') and htag.get('name'): | 
					
						
							|  |  |  |             if htag['type'] == 'Hashtag' and \ | 
					
						
							|  |  |  |                htag['name'] == add_hashtag: | 
					
						
							|  |  |  |                 add_tag_object = htag | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |                 break | 
					
						
							|  |  |  |     # append the tag if it wasn't found | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if not add_tag_object: | 
					
						
							|  |  |  |         post_json_object['object']['tag'].append(new_tag) | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |     # add corresponding html to the post content | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     hashtag_html = \ | 
					
						
							|  |  |  |         " <a href=\"" + hashtag_url + "\" class=\"addedHashtag\" " + \ | 
					
						
							|  |  |  |         "rel=\"tag\">#<span>" + ht_id + "</span></a>" | 
					
						
							| 
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 |  |  |     content = get_base_content_from_post(post_json_object, system_language) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if hashtag_html in content: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if content.endswith('</p>'): | 
					
						
							|  |  |  |         content = \ | 
					
						
							|  |  |  |             content[:len(content) - len('</p>')] + \ | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             hashtag_html + '</p>' | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         content += hashtag_html | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |     post_json_object['object']['content'] = content | 
					
						
							| 
									
										
										
										
											2021-12-26 10:00:46 +00:00
										 |  |  |     domain = domain_full | 
					
						
							| 
									
										
										
										
											2021-10-20 13:33:34 +00:00
										 |  |  |     if ':' in domain: | 
					
						
							|  |  |  |         domain = domain.split(':')[0] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |     store_hash_tags(base_dir, 'news', domain, | 
					
						
							|  |  |  |                     http_prefix, domain_full, | 
					
						
							|  |  |  |                     post_json_object, translate) | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _hashtag_remove(http_prefix: str, domain_full: str, post_json_object: {}, | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                     action_str: str, hashtags: [], | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                     system_language: str) -> None: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |     """Removes a hashtag via a hashtag rule
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     rm_hashtag = action_str.split('remove ', 1)[1].strip() | 
					
						
							|  |  |  |     if not rm_hashtag.startswith('#'): | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if rm_hashtag in hashtags: | 
					
						
							|  |  |  |         hashtags.remove(rm_hashtag) | 
					
						
							|  |  |  |     ht_id = rm_hashtag.replace('#', '') | 
					
						
							|  |  |  |     hashtag_url = http_prefix + "://" + domain_full + "/tags/" + ht_id | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |     # remove tag html from the post content | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     hashtag_html = \ | 
					
						
							|  |  |  |         "<a href=\"" + hashtag_url + "\" class=\"addedHashtag\" " + \ | 
					
						
							|  |  |  |         "rel=\"tag\">#<span>" + ht_id + "</span></a>" | 
					
						
							| 
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 |  |  |     content = get_base_content_from_post(post_json_object, system_language) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if hashtag_html in content: | 
					
						
							|  |  |  |         content = content.replace(hashtag_html, '').replace('  ', ' ') | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |         post_json_object['object']['content'] = content | 
					
						
							| 
									
										
										
										
											2021-12-25 23:03:28 +00:00
										 |  |  |         post_json_object['object']['contentMap'][system_language] = content | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     rm_tag_object = None | 
					
						
							|  |  |  |     for htag in post_json_object['object']['tag']: | 
					
						
							|  |  |  |         if htag.get('type') and htag.get('name'): | 
					
						
							|  |  |  |             if htag['type'] == 'Hashtag' and \ | 
					
						
							|  |  |  |                htag['name'] == rm_hashtag: | 
					
						
							|  |  |  |                 rm_tag_object = htag | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |                 break | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if rm_tag_object: | 
					
						
							|  |  |  |         post_json_object['object']['tag'].remove(rm_tag_object) | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  | def _newswire_hashtag_processing(base_dir: str, post_json_object: {}, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                  hashtags: [], http_prefix: str, | 
					
						
							|  |  |  |                                  domain: str, port: int, | 
					
						
							|  |  |  |                                  moderated: bool, url: str, | 
					
						
							|  |  |  |                                  system_language: str, | 
					
						
							|  |  |  |                                  translate: {}) -> bool: | 
					
						
							| 
									
										
										
										
											2020-10-16 21:33:18 +00:00
										 |  |  |     """Applies hashtag rules to a news post.
 | 
					
						
							|  |  |  |     Returns true if the post should be saved to the news timeline | 
					
						
							|  |  |  |     of this instance | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     rules_filename = base_dir + '/accounts/hashtagrules.txt' | 
					
						
							|  |  |  |     if not os.path.isfile(rules_filename): | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |         return True | 
					
						
							|  |  |  |     rules = [] | 
					
						
							| 
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 |  |  |     with open(rules_filename, 'r', encoding='utf-8') as fp_rules: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         rules = fp_rules.readlines() | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-26 12:45:03 +00:00
										 |  |  |     domain_full = get_full_domain(domain, port) | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-17 18:49:43 +00:00
										 |  |  |     # get the full text content of the post | 
					
						
							|  |  |  |     content = '' | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |     if post_json_object['object'].get('content'): | 
					
						
							| 
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 |  |  |         content += get_base_content_from_post(post_json_object, | 
					
						
							|  |  |  |                                               system_language) | 
					
						
							| 
									
										
										
										
											2021-12-25 22:09:19 +00:00
										 |  |  |     if post_json_object['object'].get('summary'): | 
					
						
							|  |  |  |         content += ' ' + post_json_object['object']['summary'] | 
					
						
							| 
									
										
										
										
											2020-10-17 19:04:39 +00:00
										 |  |  |     content = content.lower() | 
					
						
							| 
									
										
										
										
											2020-10-17 18:49:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-17 13:41:20 +00:00
										 |  |  |     # actionOccurred = False | 
					
						
							| 
									
										
										
										
											2020-10-20 17:37:15 +00:00
										 |  |  |     operators = ('not', 'and', 'or', 'xor', 'from', 'contains') | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     for rule_str in rules: | 
					
						
							|  |  |  |         if not rule_str: | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         if not rule_str.startswith('if '): | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         if ' then ' not in rule_str: | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         conditions_str = rule_str.split('if ', 1)[1] | 
					
						
							|  |  |  |         conditions_str = conditions_str.split(' then ')[0] | 
					
						
							|  |  |  |         tags_in_conditions = [] | 
					
						
							|  |  |  |         tree = hashtag_rule_tree(operators, conditions_str, | 
					
						
							|  |  |  |                                  tags_in_conditions, moderated) | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         if not hashtag_rule_resolve(tree, hashtags, moderated, content, url): | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  |             continue | 
					
						
							|  |  |  |         # the condition matches, so do something | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         action_str = rule_str.split(' then ')[1].strip() | 
					
						
							| 
									
										
										
										
											2020-10-17 12:05:41 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         if action_str.startswith('add '): | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |             # add a hashtag | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             _hashtag_add(base_dir, http_prefix, domain_full, | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                          post_json_object, action_str, hashtags, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                          system_language, translate) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         elif action_str.startswith('remove '): | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |             # remove a hashtag | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             _hashtag_remove(http_prefix, domain_full, post_json_object, | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                             action_str, hashtags, system_language) | 
					
						
							|  |  |  |         elif action_str.startswith('block') or action_str.startswith('drop'): | 
					
						
							| 
									
										
										
										
											2021-07-04 09:46:48 +00:00
										 |  |  |             # Block this item | 
					
						
							| 
									
										
										
										
											2020-10-17 16:24:47 +00:00
										 |  |  |             return False | 
					
						
							| 
									
										
										
										
											2020-10-16 21:33:18 +00:00
										 |  |  |     return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _create_news_mirror(base_dir: str, domain: str, | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                         post_id_number: str, url: str, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                         max_mirrored_articles: int) -> bool: | 
					
						
							| 
									
										
										
										
											2020-10-19 16:33:58 +00:00
										 |  |  |     """Creates a local mirror of a news article
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |     if '|' in url or '>' in url: | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     mirror_dir = base_dir + '/accounts/newsmirror' | 
					
						
							|  |  |  |     if not os.path.isdir(mirror_dir): | 
					
						
							|  |  |  |         os.mkdir(mirror_dir) | 
					
						
							| 
									
										
										
										
											2020-10-19 16:33:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |     # count the directories | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     no_of_dirs = 0 | 
					
						
							|  |  |  |     for _, dirs, _ in os.walk(mirror_dir): | 
					
						
							|  |  |  |         no_of_dirs = len(dirs) | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     mirror_index_filename = base_dir + '/accounts/newsmirror.txt' | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if max_mirrored_articles > 0 and no_of_dirs > max_mirrored_articles: | 
					
						
							|  |  |  |         if not os.path.isfile(mirror_index_filename): | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |             # no index for mirrors found | 
					
						
							|  |  |  |             return True | 
					
						
							|  |  |  |         removals = [] | 
					
						
							| 
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 |  |  |         with open(mirror_index_filename, 'r', encoding='utf-8') as index_file: | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |             # remove the oldest directories | 
					
						
							|  |  |  |             ctr = 0 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             while no_of_dirs > max_mirrored_articles: | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |                 ctr += 1 | 
					
						
							|  |  |  |                 if ctr > 5000: | 
					
						
							|  |  |  |                     # escape valve | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 post_id = index_file.readline() | 
					
						
							| 
									
										
										
										
											2021-12-26 19:47:06 +00:00
										 |  |  |                 if not post_id: | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2021-12-26 19:47:06 +00:00
										 |  |  |                 post_id = post_id.strip() | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 mirror_article_dir = mirror_dir + '/' + post_id | 
					
						
							|  |  |  |                 if os.path.isdir(mirror_article_dir): | 
					
						
							|  |  |  |                     rmtree(mirror_article_dir, | 
					
						
							|  |  |  |                            ignore_errors=False, onerror=None) | 
					
						
							| 
									
										
										
										
											2021-12-26 19:47:06 +00:00
										 |  |  |                     removals.append(post_id) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                     no_of_dirs -= 1 | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # remove the corresponding index entries | 
					
						
							|  |  |  |         if removals: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             index_content = '' | 
					
						
							| 
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 |  |  |             with open(mirror_index_filename, 'r', | 
					
						
							|  |  |  |                       encoding='utf-8') as index_file: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 index_content = index_file.read() | 
					
						
							|  |  |  |                 for remove_post_id in removals: | 
					
						
							|  |  |  |                     index_content = \ | 
					
						
							|  |  |  |                         index_content.replace(remove_post_id + '\n', '') | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 |  |  |                 with open(mirror_index_filename, 'w+', | 
					
						
							|  |  |  |                           encoding='utf-8') as index_file: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                     index_file.write(index_content) | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |             except OSError: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 print('EX: unable to write ' + mirror_index_filename) | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     mirror_article_dir = mirror_dir + '/' + post_id_number | 
					
						
							|  |  |  |     if os.path.isdir(mirror_article_dir): | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |         # already mirrored | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-20 09:27:58 +00:00
										 |  |  |     # for onion instances mirror via tor | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     prefix_str = '' | 
					
						
							| 
									
										
										
										
											2020-10-20 09:27:58 +00:00
										 |  |  |     if domain.endswith('.onion'): | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         prefix_str = '/usr/bin/torsocks ' | 
					
						
							| 
									
										
										
										
											2020-10-20 09:27:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |     # download the files | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     command_str = \ | 
					
						
							|  |  |  |         prefix_str + '/usr/bin/wget -mkEpnp -e robots=off ' + url + \ | 
					
						
							|  |  |  |         ' -P ' + mirror_article_dir | 
					
						
							|  |  |  |     proc = Popen(command_str, shell=True) | 
					
						
							|  |  |  |     os.waitpid(proc.pid, 0) | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if not os.path.isdir(mirror_article_dir): | 
					
						
							| 
									
										
										
										
											2020-10-20 09:27:58 +00:00
										 |  |  |         print('WARN: failed to mirror ' + url) | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |         return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # append the post Id number to the index file | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     if os.path.isfile(mirror_index_filename): | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 |  |  |             with open(mirror_index_filename, 'a+', | 
					
						
							|  |  |  |                       encoding='utf-8') as index_file: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 index_file.write(post_id_number + '\n') | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |         except OSError: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             print('EX: unable to append ' + mirror_index_filename) | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2022-06-09 14:46:30 +00:00
										 |  |  |             with open(mirror_index_filename, 'w+', | 
					
						
							|  |  |  |                       encoding='utf-8') as index_file: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 index_file.write(post_id_number + '\n') | 
					
						
							| 
									
										
										
										
											2021-11-25 21:18:53 +00:00
										 |  |  |         except OSError: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             print('EX: unable to write ' + mirror_index_filename) | 
					
						
							| 
									
										
										
										
											2020-10-19 19:26:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-19 16:33:58 +00:00
										 |  |  |     return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 19:40:12 +00:00
										 |  |  | def _convert_rss_to_activitypub(base_dir: str, http_prefix: str, | 
					
						
							|  |  |  |                                 domain: str, port: int, | 
					
						
							|  |  |  |                                 newswire: {}, | 
					
						
							|  |  |  |                                 translate: {}, | 
					
						
							|  |  |  |                                 recent_posts_cache: {}, | 
					
						
							|  |  |  |                                 max_mirrored_articles: int, | 
					
						
							|  |  |  |                                 allow_local_network_access: bool, | 
					
						
							|  |  |  |                                 system_language: str, | 
					
						
							|  |  |  |                                 low_bandwidth: bool, | 
					
						
							|  |  |  |                                 content_license_url: str) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  |     """Converts rss items in a newswire into posts
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-11-03 14:41:28 +00:00
										 |  |  |     if not newswire: | 
					
						
							| 
									
										
										
										
											2021-09-15 17:43:06 +00:00
										 |  |  |         print('No newswire to convert') | 
					
						
							| 
									
										
										
										
											2020-11-03 14:41:28 +00:00
										 |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     base_path = base_dir + '/accounts/news@' + domain + '/outbox' | 
					
						
							|  |  |  |     if not os.path.isdir(base_path): | 
					
						
							|  |  |  |         os.mkdir(base_path) | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-09 10:05:01 +00:00
										 |  |  |     # oldest items first | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     newswire_reverse = OrderedDict(sorted(newswire.items(), reverse=False)) | 
					
						
							| 
									
										
										
										
											2020-10-07 18:46:42 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     for date_str, item in newswire_reverse.items(): | 
					
						
							|  |  |  |         original_date_str = date_str | 
					
						
							| 
									
										
										
										
											2020-10-07 14:10:06 +00:00
										 |  |  |         # convert the date to the format used by ActivityPub | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         if '+00:00' in date_str: | 
					
						
							|  |  |  |             date_str = date_str.replace(' ', 'T') | 
					
						
							|  |  |  |             date_str = date_str.replace('+00:00', 'Z') | 
					
						
							| 
									
										
										
										
											2020-10-20 12:37:32 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2021-09-15 17:43:06 +00:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 date_str_with_offset = \ | 
					
						
							|  |  |  |                     datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S%z") | 
					
						
							| 
									
										
										
										
											2021-09-15 17:43:06 +00:00
										 |  |  |             except BaseException: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 print('EX: Newswire strptime failed ' + str(date_str)) | 
					
						
							| 
									
										
										
										
											2021-09-15 17:43:06 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2021-09-15 19:04:29 +00:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 date_str = date_str_with_offset.strftime("%Y-%m-%dT%H:%M:%SZ") | 
					
						
							| 
									
										
										
										
											2021-09-15 19:04:29 +00:00
										 |  |  |             except BaseException: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 print('EX: Newswire date_str_with_offset failed ' + | 
					
						
							|  |  |  |                       str(date_str_with_offset)) | 
					
						
							| 
									
										
										
										
											2021-09-15 19:04:29 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         status_number, _ = get_status_number(date_str) | 
					
						
							|  |  |  |         new_post_id = \ | 
					
						
							| 
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 |  |  |             local_actor_url(http_prefix, 'news', domain) + \ | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             '/statuses/' + status_number | 
					
						
							| 
									
										
										
										
											2020-10-07 16:55:15 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-07 14:10:06 +00:00
										 |  |  |         # file where the post is stored | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         filename = base_path + '/' + new_post_id.replace('/', '#') + '.json' | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  |         if os.path.isfile(filename): | 
					
						
							| 
									
										
										
										
											2020-10-08 12:52:15 +00:00
										 |  |  |             # don't create the post if it already exists | 
					
						
							| 
									
										
										
										
											2020-10-08 14:35:26 +00:00
										 |  |  |             # set the url | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             # newswire[original_date_str][1] = \ | 
					
						
							|  |  |  |             #     '/users/news/statuses/' + status_number | 
					
						
							| 
									
										
										
										
											2020-10-08 14:35:26 +00:00
										 |  |  |             # set the filename | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             newswire[original_date_str][3] = filename | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         rss_title = _remove_control_characters(item[0]) | 
					
						
							| 
									
										
										
										
											2020-10-10 09:53:56 +00:00
										 |  |  |         url = item[1] | 
					
						
							| 
									
										
										
										
											2021-12-27 21:42:08 +00:00
										 |  |  |         if dangerous_markup(url, allow_local_network_access) or \ | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |            dangerous_markup(rss_title, allow_local_network_access): | 
					
						
							| 
									
										
										
										
											2020-10-11 09:33:31 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         rss_description = '' | 
					
						
							| 
									
										
										
										
											2020-10-07 14:10:06 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # get the rss description if it exists | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         rss_description = '<p>' + remove_html(item[4]) + '<p>' | 
					
						
							| 
									
										
										
										
											2020-10-07 14:10:06 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-19 20:43:27 +00:00
										 |  |  |         mirrored = item[7] | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         post_url = url | 
					
						
							| 
									
										
										
										
											2020-10-19 20:43:27 +00:00
										 |  |  |         if mirrored and '://' in url: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             post_url = '/newsmirror/' + status_number + '/' + \ | 
					
						
							| 
									
										
										
										
											2020-10-19 22:21:30 +00:00
										 |  |  |                 url.split('://')[1] | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             if post_url.endswith('/'): | 
					
						
							|  |  |  |                 post_url += 'index.html' | 
					
						
							| 
									
										
										
										
											2020-10-19 22:21:30 +00:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 post_url += '/index.html' | 
					
						
							| 
									
										
										
										
											2020-10-19 20:43:27 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-07 14:10:06 +00:00
										 |  |  |         # add the off-site link to the description | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         rss_description += \ | 
					
						
							|  |  |  |             '<br><a href="' + post_url + '">' + \ | 
					
						
							| 
									
										
										
										
											2021-01-11 21:38:31 +00:00
										 |  |  |             translate['Read more...'] + '</a>' | 
					
						
							| 
									
										
										
										
											2020-10-11 09:33:31 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-10 18:48:57 +00:00
										 |  |  | #        podcast_properties = None | 
					
						
							|  |  |  | #        if len(item) > 8: | 
					
						
							|  |  |  | #            podcast_properties = item[8] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-09 10:08:01 +00:00
										 |  |  |         # NOTE: the id when the post is created will not be | 
					
						
							|  |  |  |         # consistent (it's based on the current time, not the | 
					
						
							|  |  |  |         # published time), so we change that later | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         save_to_file = False | 
					
						
							|  |  |  |         attach_image_filename = None | 
					
						
							|  |  |  |         media_type = None | 
					
						
							|  |  |  |         image_description = None | 
					
						
							| 
									
										
										
										
											2021-05-09 19:29:53 +00:00
										 |  |  |         city = 'London, England' | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         conversation_id = None | 
					
						
							| 
									
										
										
										
											2022-01-28 10:54:53 +00:00
										 |  |  |         languages_understood = [system_language] | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         blog = create_news_post(base_dir, | 
					
						
							|  |  |  |                                 domain, port, http_prefix, | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                                 rss_description, | 
					
						
							| 
									
										
										
										
											2022-05-31 16:51:56 +00:00
										 |  |  |                                 save_to_file, | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                                 attach_image_filename, media_type, | 
					
						
							|  |  |  |                                 image_description, city, | 
					
						
							|  |  |  |                                 rss_title, system_language, | 
					
						
							|  |  |  |                                 conversation_id, low_bandwidth, | 
					
						
							| 
									
										
										
										
											2022-01-28 10:54:53 +00:00
										 |  |  |                                 content_license_url, | 
					
						
							|  |  |  |                                 languages_understood) | 
					
						
							| 
									
										
										
										
											2020-10-07 16:55:15 +00:00
										 |  |  |         if not blog: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-19 16:33:58 +00:00
										 |  |  |         if mirrored: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             if not _create_news_mirror(base_dir, domain, status_number, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                        url, max_mirrored_articles): | 
					
						
							| 
									
										
										
										
											2020-10-19 16:33:58 +00:00
										 |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         id_str = \ | 
					
						
							| 
									
										
										
										
											2021-12-26 10:19:59 +00:00
										 |  |  |             local_actor_url(http_prefix, 'news', domain) + \ | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             '/statuses/' + status_number + '/replies' | 
					
						
							| 
									
										
										
										
											2020-10-08 09:07:45 +00:00
										 |  |  |         blog['news'] = True | 
					
						
							| 
									
										
										
										
											2020-10-09 10:05:01 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # note the time of arrival | 
					
						
							| 
									
										
										
										
											2021-12-26 13:17:46 +00:00
										 |  |  |         curr_time = datetime.datetime.utcnow() | 
					
						
							|  |  |  |         blog['object']['arrived'] = curr_time.strftime("%Y-%m-%dT%H:%M:%SZ") | 
					
						
							| 
									
										
										
										
											2020-10-09 10:05:01 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-09 10:08:01 +00:00
										 |  |  |         # change the id, based upon the published time | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         blog['object']['replies']['id'] = id_str | 
					
						
							|  |  |  |         blog['object']['replies']['first']['partOf'] = id_str | 
					
						
							| 
									
										
										
										
											2020-10-07 16:55:15 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         blog['id'] = new_post_id + '/activity' | 
					
						
							|  |  |  |         blog['object']['id'] = new_post_id | 
					
						
							|  |  |  |         blog['object']['atomUri'] = new_post_id | 
					
						
							| 
									
										
										
										
											2020-10-07 16:55:15 +00:00
										 |  |  |         blog['object']['url'] = \ | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             http_prefix + '://' + domain + '/@news/' + status_number | 
					
						
							|  |  |  |         blog['object']['published'] = date_str | 
					
						
							| 
									
										
										
										
											2020-10-20 13:07:02 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         blog['object']['content'] = rss_description | 
					
						
							|  |  |  |         blog['object']['contentMap'][system_language] = rss_description | 
					
						
							| 
									
										
										
										
											2020-10-07 16:55:15 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-26 12:45:03 +00:00
										 |  |  |         domain_full = get_full_domain(domain, port) | 
					
						
							| 
									
										
										
										
											2020-10-17 13:59:47 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         hashtags = item[6] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         post_id = new_post_id.replace('/', '#') | 
					
						
							| 
									
										
										
										
											2020-10-07 14:10:06 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-09 12:15:20 +00:00
										 |  |  |         moderated = item[5] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         save_post = \ | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  |             _newswire_hashtag_processing(base_dir, blog, hashtags, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                          http_prefix, domain, port, | 
					
						
							|  |  |  |                                          moderated, url, system_language, | 
					
						
							|  |  |  |                                          translate) | 
					
						
							| 
									
										
										
										
											2020-10-09 12:15:20 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-16 21:33:18 +00:00
										 |  |  |         # save the post and update the index | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         if save_post: | 
					
						
							| 
									
										
										
										
											2020-10-25 12:00:55 +00:00
										 |  |  |             # ensure that all hashtags are stored in the json | 
					
						
							|  |  |  |             # and appended to the content | 
					
						
							|  |  |  |             blog['object']['tag'] = [] | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             for tag_name in hashtags: | 
					
						
							|  |  |  |                 ht_id = tag_name.replace('#', '') | 
					
						
							|  |  |  |                 hashtag_url = \ | 
					
						
							|  |  |  |                     http_prefix + "://" + domain_full + "/tags/" + ht_id | 
					
						
							|  |  |  |                 new_tag = { | 
					
						
							|  |  |  |                     'href': hashtag_url, | 
					
						
							|  |  |  |                     'name': tag_name, | 
					
						
							| 
									
										
										
										
											2020-10-25 11:22:52 +00:00
										 |  |  |                     'type': 'Hashtag' | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 blog['object']['tag'].append(new_tag) | 
					
						
							|  |  |  |                 hashtag_html = \ | 
					
						
							|  |  |  |                     " <a href=\"" + hashtag_url + \ | 
					
						
							| 
									
										
										
										
											2020-10-25 12:00:55 +00:00
										 |  |  |                     "\" class=\"addedHashtag\" " + \ | 
					
						
							|  |  |  |                     "rel=\"tag\">#<span>" + \ | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                     ht_id + "</span></a>" | 
					
						
							| 
									
										
										
										
											2021-12-26 11:29:40 +00:00
										 |  |  |                 content = get_base_content_from_post(blog, system_language) | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 if hashtag_html not in content: | 
					
						
							| 
									
										
										
										
											2020-10-25 14:37:51 +00:00
										 |  |  |                     if content.endswith('</p>'): | 
					
						
							|  |  |  |                         content = \ | 
					
						
							|  |  |  |                             content[:len(content) - len('</p>')] + \ | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                             hashtag_html + '</p>' | 
					
						
							| 
									
										
										
										
											2020-10-25 14:37:51 +00:00
										 |  |  |                     else: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                         content += hashtag_html | 
					
						
							| 
									
										
										
										
											2020-10-25 14:37:51 +00:00
										 |  |  |                     blog['object']['content'] = content | 
					
						
							| 
									
										
										
										
											2021-12-25 23:03:28 +00:00
										 |  |  |                     blog['object']['contentMap'][system_language] = content | 
					
						
							| 
									
										
										
										
											2020-10-25 11:22:52 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-25 14:21:29 +00:00
										 |  |  |             # update the newswire tags if new ones have been found by | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             # _newswire_hashtag_processing | 
					
						
							| 
									
										
										
										
											2020-10-25 14:21:29 +00:00
										 |  |  |             for tag in hashtags: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 if tag not in newswire[original_date_str][6]: | 
					
						
							|  |  |  |                     newswire[original_date_str][6].append(tag) | 
					
						
							| 
									
										
										
										
											2020-10-17 13:39:04 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             store_hash_tags(base_dir, 'news', domain, | 
					
						
							|  |  |  |                             http_prefix, domain_full, | 
					
						
							|  |  |  |                             blog, translate) | 
					
						
							| 
									
										
										
										
											2020-10-17 13:39:04 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-28 10:17:58 +00:00
										 |  |  |             clear_from_post_caches(base_dir, recent_posts_cache, post_id) | 
					
						
							| 
									
										
										
										
											2021-12-26 14:47:21 +00:00
										 |  |  |             if save_json(blog, filename): | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 _update_feeds_outbox_index(base_dir, domain, post_id + '.json') | 
					
						
							| 
									
										
										
										
											2020-10-16 21:33:18 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |                 # Save a file containing the time when the post arrived | 
					
						
							|  |  |  |                 # this can then later be used to construct the news timeline | 
					
						
							|  |  |  |                 # excluding items during the voting period | 
					
						
							|  |  |  |                 if moderated: | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  |                     _save_arrived_time(filename, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                        blog['object']['arrived']) | 
					
						
							| 
									
										
										
										
											2020-10-16 21:33:18 +00:00
										 |  |  |                 else: | 
					
						
							|  |  |  |                     if os.path.isfile(filename + '.arrived'): | 
					
						
							| 
									
										
										
										
											2021-09-05 10:17:43 +00:00
										 |  |  |                         try: | 
					
						
							|  |  |  |                             os.remove(filename + '.arrived') | 
					
						
							| 
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 |  |  |                         except OSError: | 
					
						
							| 
									
										
										
										
											2022-01-12 19:40:12 +00:00
										 |  |  |                             print('EX: _convert_rss_to_activitypub ' + | 
					
						
							| 
									
										
										
										
											2021-10-29 18:48:15 +00:00
										 |  |  |                                   'unable to delete ' + filename + '.arrived') | 
					
						
							| 
									
										
										
										
											2020-10-16 21:33:18 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-08 16:52:57 +00:00
										 |  |  |                 # setting the url here links to the activitypub object | 
					
						
							|  |  |  |                 # stored locally | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 # newswire[original_date_str][1] = \ | 
					
						
							|  |  |  |                 #     '/users/news/statuses/' + status_number | 
					
						
							| 
									
										
										
										
											2020-11-08 16:52:57 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-16 21:33:18 +00:00
										 |  |  |                 # set the filename | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 newswire[original_date_str][3] = filename | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  | def _merge_with_previous_newswire(oldNewswire: {}, new_newswire: {}) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-09 09:02:01 +00:00
										 |  |  |     """Preserve any votes or generated activitypub post filename
 | 
					
						
							|  |  |  |     as rss feeds are updated | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-11-03 14:41:28 +00:00
										 |  |  |     if not oldNewswire: | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-09 09:02:01 +00:00
										 |  |  |     for published, fields in oldNewswire.items(): | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         if not new_newswire.get(published): | 
					
						
							| 
									
										
										
										
											2020-10-09 09:02:01 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-13 08:53:59 +00:00
										 |  |  |         for i in range(1, 5): | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             new_newswire[published][i] = fields[i] | 
					
						
							| 
									
										
										
										
											2020-10-09 09:02:01 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def run_newswire_daemon(base_dir: str, httpd, | 
					
						
							|  |  |  |                         http_prefix: str, domain: str, port: int, | 
					
						
							|  |  |  |                         translate: {}) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  |     """Periodically updates RSS feeds
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     newswire_state_filename = base_dir + '/accounts/.newswirestate.json' | 
					
						
							|  |  |  |     refresh_filename = base_dir + '/accounts/.refresh_newswire' | 
					
						
							| 
									
										
										
										
											2020-10-09 09:02:01 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-30 10:05:43 +00:00
										 |  |  |     print('Starting newswire daemon') | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  |     # initial sleep to allow the system to start up | 
					
						
							|  |  |  |     time.sleep(50) | 
					
						
							|  |  |  |     while True: | 
					
						
							|  |  |  |         # has the session been created yet? | 
					
						
							|  |  |  |         if not httpd.session: | 
					
						
							| 
									
										
										
										
											2020-11-03 16:10:54 +00:00
										 |  |  |             print('Newswire daemon waiting for session') | 
					
						
							| 
									
										
										
										
											2021-12-28 16:56:57 +00:00
										 |  |  |             httpd.session = create_session(httpd.proxy_type) | 
					
						
							| 
									
										
										
										
											2020-11-03 16:08:31 +00:00
										 |  |  |             if not httpd.session: | 
					
						
							| 
									
										
										
										
											2020-11-03 16:10:54 +00:00
										 |  |  |                 print('Newswire daemon has no session') | 
					
						
							| 
									
										
										
										
											2020-11-03 16:08:31 +00:00
										 |  |  |                 time.sleep(60) | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-05-30 15:15:17 +00:00
										 |  |  |             print('Newswire daemon session established') | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # try to update the feeds | 
					
						
							| 
									
										
										
										
											2021-09-15 17:03:20 +00:00
										 |  |  |         print('Updating newswire feeds') | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         new_newswire = \ | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             get_dict_from_newswire(httpd.session, base_dir, domain, | 
					
						
							| 
									
										
										
										
											2022-06-10 13:47:10 +00:00
										 |  |  |                                    httpd.max_newswire_posts_per_source, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                    httpd.max_newswire_feed_size_kb, | 
					
						
							|  |  |  |                                    httpd.maxTags, | 
					
						
							|  |  |  |                                    httpd.max_feed_item_size_kb, | 
					
						
							|  |  |  |                                    httpd.max_newswire_posts, | 
					
						
							|  |  |  |                                    httpd.maxCategoriesFeedItemSizeKb, | 
					
						
							|  |  |  |                                    httpd.system_language, | 
					
						
							| 
									
										
										
										
											2022-04-22 13:46:42 +00:00
										 |  |  |                                    httpd.debug, | 
					
						
							| 
									
										
										
										
											2022-04-24 19:03:02 +00:00
										 |  |  |                                    httpd.preferred_podcast_formats, | 
					
						
							|  |  |  |                                    httpd.rss_timeout_sec) | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-09 09:02:01 +00:00
										 |  |  |         if not httpd.newswire: | 
					
						
							| 
									
										
										
										
											2021-09-15 17:03:20 +00:00
										 |  |  |             print('Newswire feeds not updated') | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             if os.path.isfile(newswire_state_filename): | 
					
						
							| 
									
										
										
										
											2021-09-15 17:03:20 +00:00
										 |  |  |                 print('Loading newswire from file') | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                 httpd.newswire = load_json(newswire_state_filename) | 
					
						
							| 
									
										
										
										
											2020-10-09 09:02:01 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-15 17:03:20 +00:00
										 |  |  |         print('Merging with previous newswire') | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         _merge_with_previous_newswire(httpd.newswire, new_newswire) | 
					
						
							| 
									
										
										
										
											2020-10-09 09:02:01 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |         httpd.newswire = new_newswire | 
					
						
							|  |  |  |         if new_newswire: | 
					
						
							|  |  |  |             save_json(httpd.newswire, newswire_state_filename) | 
					
						
							| 
									
										
										
										
											2020-11-03 21:53:29 +00:00
										 |  |  |             print('Newswire updated') | 
					
						
							| 
									
										
										
										
											2021-09-15 17:03:20 +00:00
										 |  |  |         else: | 
					
						
							|  |  |  |             print('No new newswire') | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-15 17:03:20 +00:00
										 |  |  |         print('Converting newswire to activitypub format') | 
					
						
							| 
									
										
										
										
											2022-06-12 20:31:56 +00:00
										 |  |  |         _convert_rss_to_activitypub(base_dir, http_prefix, domain, port, | 
					
						
							| 
									
										
										
										
											2022-01-12 19:40:12 +00:00
										 |  |  |                                     new_newswire, translate, | 
					
						
							|  |  |  |                                     httpd.recent_posts_cache, | 
					
						
							|  |  |  |                                     httpd.max_mirrored_articles, | 
					
						
							|  |  |  |                                     httpd.allow_local_network_access, | 
					
						
							|  |  |  |                                     httpd.system_language, | 
					
						
							|  |  |  |                                     httpd.low_bandwidth, | 
					
						
							|  |  |  |                                     httpd.content_license_url) | 
					
						
							| 
									
										
										
										
											2020-10-07 13:51:29 +00:00
										 |  |  |         print('Newswire feed converted to ActivityPub') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-25 19:39:45 +00:00
										 |  |  |         if httpd.max_news_posts > 0: | 
					
						
							| 
									
										
										
										
											2021-12-25 23:41:17 +00:00
										 |  |  |             archive_dir = base_dir + '/archive' | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             archive_subdir = \ | 
					
						
							| 
									
										
										
										
											2021-12-25 23:41:17 +00:00
										 |  |  |                 archive_dir + '/accounts/news@' + domain + '/outbox' | 
					
						
							| 
									
										
										
										
											2021-09-15 17:03:20 +00:00
										 |  |  |             print('Archiving news posts') | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             archive_posts_for_person(http_prefix, 'news', | 
					
						
							|  |  |  |                                      domain, base_dir, 'outbox', | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                                      archive_subdir, | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                                      httpd.recent_posts_cache, | 
					
						
							|  |  |  |                                      httpd.max_news_posts) | 
					
						
							| 
									
										
										
										
											2020-10-21 10:39:09 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  |         # wait a while before the next feeds update | 
					
						
							| 
									
										
										
										
											2022-04-24 20:19:16 +00:00
										 |  |  |         for _ in range(360): | 
					
						
							| 
									
										
										
										
											2021-02-10 13:31:19 +00:00
										 |  |  |             time.sleep(10) | 
					
						
							|  |  |  |             # if a new blog post has been created then stop | 
					
						
							|  |  |  |             # waiting and recalculate the newswire | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             if os.path.isfile(refresh_filename): | 
					
						
							| 
									
										
										
										
											2021-02-10 13:31:19 +00:00
										 |  |  |                 try: | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                     os.remove(refresh_filename) | 
					
						
							| 
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 |  |  |                 except OSError: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                     print('EX: run_newswire_daemon unable to delete ' + | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |                           str(refresh_filename)) | 
					
						
							| 
									
										
										
										
											2021-02-10 13:31:19 +00:00
										 |  |  |                 break | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def run_newswire_watchdog(project_version: str, httpd) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  |     """This tries to keep the newswire update thread running even if it dies
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-03-13 11:01:07 +00:00
										 |  |  |     print('THREAD: Starting newswire watchdog') | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |     newswire_original = \ | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         httpd.thrPostSchedule.clone(run_newswire_daemon) | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  |     httpd.thrNewswireDaemon.start() | 
					
						
							|  |  |  |     while True: | 
					
						
							|  |  |  |         time.sleep(50) | 
					
						
							| 
									
										
										
										
											2021-06-05 12:43:57 +00:00
										 |  |  |         if httpd.thrNewswireDaemon.is_alive(): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         httpd.thrNewswireDaemon.kill() | 
					
						
							| 
									
										
										
										
											2022-03-13 11:01:07 +00:00
										 |  |  |         print('THREAD: restarting newswire watchdog') | 
					
						
							| 
									
										
										
										
											2021-06-05 12:43:57 +00:00
										 |  |  |         httpd.thrNewswireDaemon = \ | 
					
						
							| 
									
										
										
										
											2022-01-03 11:33:46 +00:00
										 |  |  |             newswire_original.clone(run_newswire_daemon) | 
					
						
							| 
									
										
										
										
											2021-06-05 12:43:57 +00:00
										 |  |  |         httpd.thrNewswireDaemon.start() | 
					
						
							|  |  |  |         print('Restarting newswire daemon...') |