__filename__ = "newsdaemon.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.5.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Web Interface Columns"

# Example hashtag logic:
#
# if moderated and not #imcoxford then block
# if #pol and contains "westminster" then add #britpol
# if #unwantedtag then block

import os
import time
import html
from shutil import rmtree
from subprocess import Popen
from collections import OrderedDict
from newswire import get_dict_from_newswire
# from posts import send_signed_json
from posts import create_news_post
from posts import archive_posts_for_person
from utils import date_from_string_format
from utils import date_utcnow
from utils import valid_hash_tag
from utils import get_base_content_from_post
from utils import remove_html
from utils import get_full_domain
from utils import load_json
from utils import save_json
from utils import get_status_number
from utils import clear_from_post_caches
from utils import dangerous_markup
from utils import local_actor_url
from utils import text_in_file
from utils import data_dir
from session import create_session
from threads import begin_thread
from webapp_hashtagswarm import store_hash_tags


def _update_feeds_outbox_index(base_dir: str, domain: str,
                               post_id: str) -> None:
    """Updates the index used for imported RSS feeds
    """
    base_path = data_dir(base_dir) + '/news@' + domain
    index_filename = base_path + '/outbox.index'

    if os.path.isfile(index_filename):
        if not text_in_file(post_id, index_filename):
            try:
                with open(index_filename, 'r+',
                          encoding='utf-8') as fp_feeds:
                    content = fp_feeds.read()
                    if post_id + '\n' not in content:
                        fp_feeds.seek(0, 0)
                        fp_feeds.write(post_id + '\n' + content)
                        print('DEBUG: feeds post added to index')
            except OSError as ex:
                print('EX: Failed to write entry to feeds posts index ' +
                      index_filename + ' ' + str(ex))
        return

    try:
        with open(index_filename, 'w+', encoding='utf-8') as fp_feeds:
            fp_feeds.write(post_id + '\n')
    except OSError:
        print('EX: _update_feeds_outbox_index unable to write ' +
              index_filename)
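
# The outbox index is a plain text file of post ids, one per line with
# the newest first. A hypothetical entry would look like
# 'https:##example.net#users#news#statuses#123456789012345.json',
# i.e. the post filename with '/' replaced by '#'.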


def _save_arrived_time(post_filename: str, arrived: str) -> None:
    """Saves the time when an rss post arrived to a file
    """
    try:
        with open(post_filename + '.arrived', 'w+',
                  encoding='utf-8') as fp_arrived:
            fp_arrived.write(arrived)
    except OSError:
        print('EX: _save_arrived_time unable to write ' +
              post_filename + '.arrived')


def _remove_control_characters(content: str) -> str:
    """Removes escaped html from the given content
    """
    if '&' in content:
        return html.unescape(content)
    return content
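
# html.unescape leaves ordinary text alone and decodes entities, so a
# hypothetical feed title 'Fish &amp; Chips' becomes 'Fish & Chips'.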


def _hashtag_logical_not(tree: [], hashtags: [], moderated: bool,
                         content: str, url: str) -> bool:
    """ NOT
    """
    if len(tree) != 2:
        return False
    if isinstance(tree[1], str):
        return tree[1] not in hashtags
    if isinstance(tree[1], list):
        return not hashtag_rule_resolve(tree[1], hashtags,
                                        moderated, content, url)
    return False


def _hashtag_logical_contains(tree: [], content: str) -> bool:
    """ Contains
    """
    if len(tree) != 2:
        return False
    match_str = None
    if isinstance(tree[1], str):
        match_str = tree[1]
    elif isinstance(tree[1], list):
        match_str = tree[1][0]
    if match_str:
        if match_str.startswith('"') and match_str.endswith('"'):
            match_str = match_str[1:]
            match_str = match_str[:len(match_str) - 1]
        match_str_lower = match_str.lower()
        content_without_tags = content.replace('#' + match_str_lower, '')
        return match_str_lower in content_without_tags
    return False


def _hashtag_logical_from(tree: [], url: str) -> bool:
    """ FROM
    """
    if len(tree) != 2:
        return False
    match_str = None
    if isinstance(tree[1], str):
        match_str = tree[1]
    elif isinstance(tree[1], list):
        match_str = tree[1][0]
    if match_str:
        if match_str.startswith('"') and match_str.endswith('"'):
            match_str = match_str[1:]
            match_str = match_str[:len(match_str) - 1]
        return match_str.lower() in url
    return False


def _hashtag_logical_and(tree: [], hashtags: [], moderated: bool,
                         content: str, url: str) -> bool:
    """ AND
    """
    if len(tree) < 3:
        return False
    for arg_index in range(1, len(tree)):
        arg_value = False
        if isinstance(tree[arg_index], str):
            arg_value = tree[arg_index] in hashtags
        elif isinstance(tree[arg_index], list):
            arg_value = hashtag_rule_resolve(tree[arg_index],
                                             hashtags, moderated,
                                             content, url)
        if not arg_value:
            return False
    return True


def _hashtag_logical_or(tree: [], hashtags: [], moderated: bool,
                        content: str, url: str) -> bool:
    """ OR
    """
    if len(tree) < 3:
        return False
    for arg_index in range(1, len(tree)):
        arg_value = False
        if isinstance(tree[arg_index], str):
            arg_value = tree[arg_index] in hashtags
        elif isinstance(tree[arg_index], list):
            arg_value = hashtag_rule_resolve(tree[arg_index],
                                             hashtags, moderated,
                                             content, url)
        if arg_value:
            return True
    return False


def _hashtag_logical_xor(tree: [], hashtags: [], moderated: bool,
                         content: str, url: str) -> bool:
    """ XOR
    """
    if len(tree) < 3:
        return False
    true_ctr = 0
    for arg_index in range(1, len(tree)):
        arg_value = False
        if isinstance(tree[arg_index], str):
            arg_value = tree[arg_index] in hashtags
        elif isinstance(tree[arg_index], list):
            arg_value = hashtag_rule_resolve(tree[arg_index],
                                             hashtags, moderated,
                                             content, url)
        if arg_value:
            true_ctr += 1
    if true_ctr == 1:
        return True
    return False


def hashtag_rule_resolve(tree: [], hashtags: [], moderated: bool,
                         content: str, url: str) -> bool:
    """Returns whether the tree for a hashtag rule evaluates to true or false
    """
    if not tree:
        return False

    if tree[0] == 'not':
        return _hashtag_logical_not(tree, hashtags, moderated, content, url)
    if tree[0] == 'contains':
        return _hashtag_logical_contains(tree, content)
    if tree[0] == 'from':
        return _hashtag_logical_from(tree, url)
    if tree[0] == 'and':
        return _hashtag_logical_and(tree, hashtags, moderated, content, url)
    if tree[0] == 'or':
        return _hashtag_logical_or(tree, hashtags, moderated, content, url)
    if tree[0] == 'xor':
        return _hashtag_logical_xor(tree, hashtags, moderated, content, url)
    if tree[0].startswith('#') and len(tree) == 1:
        return tree[0] in hashtags
    if tree[0].startswith('moderated'):
        return moderated
    if tree[0].startswith('"') and tree[0].endswith('"'):
        return True

    return False
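
# A hypothetical evaluation: the condition
#   #pol and contains "westminster"
# parses to ['and', ['#pol'], ['contains', ['"westminster"']]]
# (see hashtag_rule_tree below) and resolves to True only when '#pol'
# is among the post's hashtags and 'westminster' occurs in the
# lower-cased post content.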


def hashtag_rule_tree(operators: [],
                      conditions_str: str,
                      tags_in_conditions: [],
                      moderated: bool) -> []:
    """Builds a parse tree for the given hashtag rule conditions
    """
    if not operators and conditions_str:
        conditions_str = conditions_str.strip()
        is_str = \
            conditions_str.startswith('"') and conditions_str.endswith('"')
        if conditions_str.startswith('#') or is_str or \
           conditions_str in operators or \
           conditions_str == 'moderated' or \
           conditions_str == 'contains':
            if conditions_str.startswith('#'):
                if conditions_str not in tags_in_conditions:
                    if ' ' not in conditions_str or \
                       conditions_str.startswith('"'):
                        tags_in_conditions.append(conditions_str)
            return [conditions_str.strip()]
        return None
    if not operators or not conditions_str:
        return None
    tree = None
    conditions_str = conditions_str.strip()
    is_str = conditions_str.startswith('"') and conditions_str.endswith('"')
    if conditions_str.startswith('#') or is_str or \
       conditions_str in operators or \
       conditions_str == 'moderated' or \
       conditions_str == 'contains':
        if conditions_str.startswith('#'):
            if conditions_str not in tags_in_conditions:
                if ' ' not in conditions_str or \
                   conditions_str.startswith('"'):
                    tags_in_conditions.append(conditions_str)
        tree = [conditions_str.strip()]
    ctr = 0
    while ctr < len(operators):
        oper = operators[ctr]
        opmatch = ' ' + oper + ' '
        if opmatch not in conditions_str and \
           not conditions_str.startswith(oper + ' '):
            ctr += 1
            continue
        tree = [oper]
        if opmatch in conditions_str:
            sections = conditions_str.split(opmatch)
        else:
            sections = conditions_str.split(oper + ' ', 1)
        for sub_condition_str in sections:
            result = hashtag_rule_tree(operators[ctr + 1:],
                                       sub_condition_str,
                                       tags_in_conditions, moderated)
            if result:
                tree.append(result)
        break
    return tree
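
# A hypothetical parse: operators are tried in precedence order
# ('not', 'and', 'or', 'xor', 'from', 'contains'), so the condition
#   #pol and contains "westminster"
# first splits on ' and ', then its right-hand side on 'contains ',
# producing ['and', ['#pol'], ['contains', ['"westminster"']]].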


def _hashtag_add(base_dir: str, http_prefix: str, domain_full: str,
                 post_json_object: {},
                 action_str: str, hashtags: [], system_language: str,
                 translate: {}) -> None:
    """Adds a hashtag via a hashtag rule
    """
    add_hashtag = action_str.split('add ', 1)[1].strip()
    if not add_hashtag.startswith('#'):
        return

    if add_hashtag not in hashtags:
        hashtags.append(add_hashtag)
    ht_id = add_hashtag.replace('#', '')
    if not valid_hash_tag(ht_id):
        return

    hashtag_url = http_prefix + "://" + domain_full + "/tags/" + ht_id
    new_tag = {
        'href': hashtag_url,
        'name': add_hashtag,
        'type': 'Hashtag'
    }
    # does the tag already exist?
    add_tag_object = None
    for htag in post_json_object['object']['tag']:
        if htag.get('type') and htag.get('name'):
            if htag['type'] == 'Hashtag' and \
               htag['name'] == add_hashtag:
                add_tag_object = htag
                break
    # append the tag if it wasn't found
    if not add_tag_object:
        post_json_object['object']['tag'].append(new_tag)
    # add corresponding html to the post content
    hashtag_html = \
        " <a href=\"" + hashtag_url + "\" class=\"addedHashtag\" " + \
        "rel=\"tag\">#<span>" + ht_id + "</span></a>"
    content = get_base_content_from_post(post_json_object, system_language)
    if hashtag_html in content:
        return

    if content.endswith('</p>'):
        content = \
            content[:len(content) - len('</p>')] + \
            hashtag_html + '</p>'
    else:
        content += hashtag_html
    post_json_object['object']['content'] = content
    domain = domain_full
    if ':' in domain:
        domain = domain.split(':')[0]
    store_hash_tags(base_dir, 'news', domain,
                    http_prefix, domain_full,
                    post_json_object, translate)


def _hashtag_remove(http_prefix: str, domain_full: str, post_json_object: {},
                    action_str: str, hashtags: [],
                    system_language: str) -> None:
    """Removes a hashtag via a hashtag rule
    """
    rm_hashtag = action_str.split('remove ', 1)[1].strip()
    if not rm_hashtag.startswith('#'):
        return

    if rm_hashtag in hashtags:
        hashtags.remove(rm_hashtag)
    ht_id = rm_hashtag.replace('#', '')
    hashtag_url = http_prefix + "://" + domain_full + "/tags/" + ht_id
    # remove tag html from the post content
    hashtag_html = \
        "<a href=\"" + hashtag_url + "\" class=\"addedHashtag\" " + \
        "rel=\"tag\">#<span>" + ht_id + "</span></a>"
    content = get_base_content_from_post(post_json_object, system_language)
    if hashtag_html in content:
        content = content.replace(hashtag_html, '').replace('  ', ' ')
        post_json_object['object']['content'] = content
        post_json_object['object']['contentMap'][system_language] = content
    rm_tag_object = None
    for htag in post_json_object['object']['tag']:
        if htag.get('type') and htag.get('name'):
            if htag['type'] == 'Hashtag' and \
               htag['name'] == rm_hashtag:
                rm_tag_object = htag
                break
    if rm_tag_object:
        post_json_object['object']['tag'].remove(rm_tag_object)


def _newswire_hashtag_processing(base_dir: str, post_json_object: {},
                                 hashtags: [], http_prefix: str,
                                 domain: str, port: int,
                                 moderated: bool, url: str,
                                 system_language: str,
                                 translate: {}) -> bool:
    """Applies hashtag rules to a news post.
    Returns true if the post should be saved to the news timeline
    of this instance
    """
    rules_filename = data_dir(base_dir) + '/hashtagrules.txt'
    if not os.path.isfile(rules_filename):
        return True
    rules = []
    try:
        with open(rules_filename, 'r', encoding='utf-8') as fp_rules:
            rules = fp_rules.readlines()
    except OSError:
        print('EX: _newswire_hashtag_processing unable to read ' +
              rules_filename)

    domain_full = get_full_domain(domain, port)

    # get the full text content of the post
    content = ''
    if post_json_object['object'].get('content'):
        content += get_base_content_from_post(post_json_object,
                                              system_language)
    if post_json_object['object'].get('summary'):
        content += ' ' + post_json_object['object']['summary']
    content = content.lower()

    # actionOccurred = False
    operators = ('not', 'and', 'or', 'xor', 'from', 'contains')
    for rule_str in rules:
        if not rule_str:
            continue
        if not rule_str.startswith('if '):
            continue
        if ' then ' not in rule_str:
            continue
        conditions_str = rule_str.split('if ', 1)[1]
        conditions_str = conditions_str.split(' then ')[0]
        tags_in_conditions = []
        tree = hashtag_rule_tree(operators, conditions_str,
                                 tags_in_conditions, moderated)
        if not hashtag_rule_resolve(tree, hashtags, moderated, content, url):
            continue
        # the condition matches, so do something
        action_str = rule_str.split(' then ')[1].strip()

        if action_str.startswith('add '):
            # add a hashtag
            _hashtag_add(base_dir, http_prefix, domain_full,
                         post_json_object, action_str, hashtags,
                         system_language, translate)
        elif action_str.startswith('remove '):
            # remove a hashtag
            _hashtag_remove(http_prefix, domain_full, post_json_object,
                            action_str, hashtags, system_language)
        elif action_str.startswith('block') or action_str.startswith('drop'):
            # Block this item
            return False
    return True
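
# Rules live in hashtagrules.txt, one per line in the form
# 'if <condition> then <action>', where the action is 'add <#tag>',
# 'remove <#tag>', 'block' or 'drop'. For example, as in the module
# header comment:
#   if #pol and contains "westminster" then add #britpol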


def _create_news_mirror(base_dir: str, domain: str,
                        post_id_number: str, url: str,
                        max_mirrored_articles: int) -> bool:
    """Creates a local mirror of a news article
    """
    if '|' in url or '>' in url:
        return True

    mirror_dir = data_dir(base_dir) + '/newsmirror'
    if not os.path.isdir(mirror_dir):
        os.mkdir(mirror_dir)

    # count the directories
    no_of_dirs = 0
    for _, dirs, _ in os.walk(mirror_dir):
        no_of_dirs = len(dirs)
        break

    mirror_index_filename = data_dir(base_dir) + '/newsmirror.txt'

    if max_mirrored_articles > 0 and no_of_dirs > max_mirrored_articles:
        if not os.path.isfile(mirror_index_filename):
            # no index for mirrors found
            return True
        removals = []
        try:
            with open(mirror_index_filename, 'r',
                      encoding='utf-8') as fp_index:
                # remove the oldest directories
                ctr = 0
                while no_of_dirs > max_mirrored_articles:
                    ctr += 1
                    if ctr > 5000:
                        # escape valve
                        break

                    post_id = fp_index.readline()
                    if not post_id:
                        continue
                    post_id = post_id.strip()
                    mirror_article_dir = mirror_dir + '/' + post_id
                    if os.path.isdir(mirror_article_dir):
                        rmtree(mirror_article_dir,
                               ignore_errors=False, onexc=None)
                        removals.append(post_id)
                        no_of_dirs -= 1
        except OSError as exc:
            print('EX: _create_news_mirror unable to read ' +
                  mirror_index_filename + ' ' + str(exc))

        # remove the corresponding index entries
        if removals:
            index_content = ''
            try:
                with open(mirror_index_filename, 'r',
                          encoding='utf-8') as fp_index:
                    index_content = fp_index.read()
                    for remove_post_id in removals:
                        index_content = \
                            index_content.replace(remove_post_id + '\n', '')
            except OSError:
                print('EX: _create_news_mirror unable to read ' +
                      mirror_index_filename)
            try:
                with open(mirror_index_filename, 'w+',
                          encoding='utf-8') as fp_index:
                    fp_index.write(index_content)
            except OSError:
                print('EX: _create_news_mirror unable to write ' +
                      mirror_index_filename)

    mirror_article_dir = mirror_dir + '/' + post_id_number
    if os.path.isdir(mirror_article_dir):
        # already mirrored
        return True

    # for onion instances mirror via tor
    prefix_str = ''
    if domain.endswith('.onion'):
        prefix_str = '/usr/bin/torsocks '

    # download the files
    command_str = \
        prefix_str + '/usr/bin/wget -mkEpnp -e robots=off ' + url + \
        ' -P ' + mirror_article_dir
    proc = Popen(command_str, shell=True)
    os.waitpid(proc.pid, 0)

    if not os.path.isdir(mirror_article_dir):
        print('WARN: failed to mirror ' + url)
        return True

    # append the post Id number to the index file
    if os.path.isfile(mirror_index_filename):
        try:
            with open(mirror_index_filename, 'a+',
                      encoding='utf-8') as fp_index:
                fp_index.write(post_id_number + '\n')
        except OSError:
            print('EX: _create_news_mirror unable to append ' +
                  mirror_index_filename)
    else:
        try:
            with open(mirror_index_filename, 'w+',
                      encoding='utf-8') as fp_index:
                fp_index.write(post_id_number + '\n')
        except OSError:
            print('EX: _create_news_mirror unable to write ' +
                  mirror_index_filename)

    return True
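
# A hypothetical command built above for a mirrored article:
#   /usr/bin/wget -mkEpnp -e robots=off https://example.net/article \
#       -P <data_dir>/newsmirror/<status_number>
# which mirrors recursively (-m), converts links for local browsing
# (-k), adds .html extensions (-E), fetches page requisites (-p) and
# does not ascend to the parent directory (-np).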


def _convert_rss_to_activitypub(base_dir: str, http_prefix: str,
                                domain: str, port: int,
                                newswire: {},
                                translate: {},
                                recent_posts_cache: {},
                                max_mirrored_articles: int,
                                allow_local_network_access: bool,
                                system_language: str,
                                low_bandwidth: bool,
                                content_license_url: str,
                                media_license_url: str,
                                media_creator: str) -> None:
    """Converts rss items in a newswire into posts
    """
    if not newswire:
        print('No newswire to convert')
        return

    base_path = data_dir(base_dir) + '/news@' + domain + '/outbox'
    if not os.path.isdir(base_path):
        os.mkdir(base_path)

    # oldest items first
    newswire_reverse = OrderedDict(sorted(newswire.items(), reverse=False))

    for date_str, item in newswire_reverse.items():
        original_date_str = date_str
        # convert the date to the format used by ActivityPub
        if '+00:00' in date_str:
            date_str = date_str.replace(' ', 'T')
            date_str = date_str.replace('+00:00', 'Z')
        else:
            try:
                date_str_with_offset = \
                    date_from_string_format(date_str, ["%Y-%m-%d %H:%M:%S%z"])
            except BaseException:
                print('EX: Newswire strptime failed ' + str(date_str))
                continue
            try:
                date_str = date_str_with_offset.strftime("%Y-%m-%dT%H:%M:%SZ")
            except BaseException:
                print('EX: Newswire date_str_with_offset failed ' +
                      str(date_str_with_offset))
                continue

        status_number, _ = get_status_number(date_str)
        new_post_id = \
            local_actor_url(http_prefix, 'news', domain) + \
            '/statuses/' + status_number

        # file where the post is stored
        filename = base_path + '/' + new_post_id.replace('/', '#') + '.json'
        if os.path.isfile(filename):
            # don't create the post if it already exists
            # set the url
            # newswire[original_date_str][1] = \
            #     '/users/news/statuses/' + status_number
            # set the filename
            newswire[original_date_str][3] = filename
            continue

        rss_title = _remove_control_characters(item[0])
        url = item[1]
        if dangerous_markup(url, allow_local_network_access, []) or \
           dangerous_markup(rss_title, allow_local_network_access, []):
            continue
        rss_description = ''

        # get the rss description if it exists
        rss_description = '<p>' + remove_html(item[4]) + '</p>'

        mirrored = item[7]
        post_url = url
        if mirrored and '://' in url:
            post_url = '/newsmirror/' + status_number + '/' + \
                url.split('://')[1]
            if post_url.endswith('/'):
                post_url += 'index.html'
            else:
                post_url += '/index.html'

        # add the off-site link to the description
        rss_description += \
            '<br><a href="' + post_url + '">' + \
            translate['Read more...'] + '</a>'

        # podcast_properties = None
        # if len(item) > 8:
        #     podcast_properties = item[8]

        # NOTE: the id when the post is created will not be
        # consistent (it's based on the current time, not the
        # published time), so we change that later
        save_to_file = False
        attach_image_filename = None
        media_type = None
        image_description = None
        video_transcript = None
        city = 'London, England'
        conversation_id = None
        languages_understood = [system_language]
        buy_url = ''
        chat_url = ''
        blog = create_news_post(base_dir,
                                domain, port, http_prefix,
                                rss_description,
                                save_to_file,
                                attach_image_filename, media_type,
                                image_description, video_transcript,
                                city, rss_title, system_language,
                                conversation_id, low_bandwidth,
                                content_license_url,
                                media_license_url, media_creator,
                                languages_understood, translate,
                                buy_url, chat_url)
        if not blog:
            continue

        if mirrored:
            if not _create_news_mirror(base_dir, domain, status_number,
                                       url, max_mirrored_articles):
                continue

        id_str = \
            local_actor_url(http_prefix, 'news', domain) + \
            '/statuses/' + status_number + '/replies'
        blog['news'] = True

        # note the time of arrival
        curr_time = date_utcnow()
        blog['object']['arrived'] = curr_time.strftime("%Y-%m-%dT%H:%M:%SZ")

        # change the id, based upon the published time
        blog['object']['replies']['id'] = id_str
        blog['object']['replies']['first']['partOf'] = id_str

        blog['id'] = new_post_id + '/activity'
        blog['object']['id'] = new_post_id
        blog['object']['atomUri'] = new_post_id
        blog['object']['url'] = \
            http_prefix + '://' + domain + '/@news/' + status_number
        blog['object']['published'] = date_str

        blog['object']['content'] = rss_description
        blog['object']['contentMap'][system_language] = rss_description

        domain_full = get_full_domain(domain, port)

        hashtags = item[6]

        post_id = new_post_id.replace('/', '#')

        moderated = item[5]

        save_post = \
            _newswire_hashtag_processing(base_dir, blog, hashtags,
                                         http_prefix, domain, port,
                                         moderated, url, system_language,
                                         translate)

        # save the post and update the index
        if save_post:
            # ensure that all hashtags are stored in the json
            # and appended to the content
            blog['object']['tag'] = []
            for tag_name in hashtags:
                ht_id = tag_name.replace('#', '')
                hashtag_url = \
                    http_prefix + "://" + domain_full + "/tags/" + ht_id
                new_tag = {
                    'href': hashtag_url,
                    'name': tag_name,
                    'type': 'Hashtag'
                }
                blog['object']['tag'].append(new_tag)
                hashtag_html = \
                    " <a href=\"" + hashtag_url + \
                    "\" class=\"addedHashtag\" " + \
                    "rel=\"tag\">#<span>" + \
                    ht_id + "</span></a>"
                content = get_base_content_from_post(blog, system_language)
                if hashtag_html not in content:
                    if content.endswith('</p>'):
                        content = \
                            content[:len(content) - len('</p>')] + \
                            hashtag_html + '</p>'
                    else:
                        content += hashtag_html
                    blog['object']['content'] = content
                    blog['object']['contentMap'][system_language] = content

            # update the newswire tags if new ones have been found by
            # _newswire_hashtag_processing
            for tag in hashtags:
                if tag not in newswire[original_date_str][6]:
                    newswire[original_date_str][6].append(tag)

            store_hash_tags(base_dir, 'news', domain,
                            http_prefix, domain_full,
                            blog, translate)

            clear_from_post_caches(base_dir, recent_posts_cache, post_id)
            if save_json(blog, filename):
                _update_feeds_outbox_index(base_dir, domain,
                                           post_id + '.json')

                # Save a file containing the time when the post arrived
                # this can then later be used to construct the news timeline
                # excluding items during the voting period
                if moderated:
                    _save_arrived_time(filename,
                                       blog['object']['arrived'])
                else:
                    if os.path.isfile(filename + '.arrived'):
                        try:
                            os.remove(filename + '.arrived')
                        except OSError:
                            print('EX: _convert_rss_to_activitypub ' +
                                  'unable to delete ' + filename +
                                  '.arrived')

                # setting the url here links to the activitypub object
                # stored locally
                # newswire[original_date_str][1] = \
                #     '/users/news/statuses/' + status_number

                # set the filename
                newswire[original_date_str][3] = filename
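
# Newswire items are lists indexed by position, as used in this module:
# item[0] is the title, item[1] the link url, item[3] the generated
# post filename, item[4] the description, item[5] the moderated flag,
# item[6] the hashtags and item[7] the mirrored flag. The commented-out
# code above suggests item[8] holds podcast properties, and index 2
# appears to carry the vote status preserved by
# _merge_with_previous_newswire.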


def _merge_with_previous_newswire(old_newswire: {}, new_newswire: {}) -> None:
    """Preserve any votes or generated activitypub post filename
    as rss feeds are updated
    """
    if not old_newswire:
        return

    for published, fields in old_newswire.items():
        if not new_newswire.get(published):
            continue
        for i in range(1, 5):
            new_newswire[published][i] = fields[i]


def run_newswire_daemon(base_dir: str, httpd,
                        http_prefix: str, domain: str, port: int,
                        translate: {}) -> None:
    """Periodically updates RSS feeds
    """
    newswire_state_filename = data_dir(base_dir) + '/.newswirestate.json'
    refresh_filename = data_dir(base_dir) + '/.refresh_newswire'

    print('Starting newswire daemon')
    # initial sleep to allow the system to start up
    time.sleep(50)
    while True:
        # has the session been created yet?
        if not httpd.session:
            print('Newswire daemon waiting for session')
            httpd.session = create_session(httpd.proxy_type)
            if not httpd.session:
                print('Newswire daemon has no session')
                time.sleep(60)
                continue
            print('Newswire daemon session established')

        # try to update the feeds
        print('Updating newswire feeds')
        new_newswire = \
            get_dict_from_newswire(httpd.session, base_dir, domain,
                                   httpd.max_newswire_posts_per_source,
                                   httpd.max_newswire_feed_size_kb,
                                   httpd.maxTags,
                                   httpd.max_feed_item_size_kb,
                                   httpd.max_newswire_posts,
                                   httpd.maxCategoriesFeedItemSizeKb,
                                   httpd.system_language,
                                   httpd.debug,
                                   httpd.preferred_podcast_formats,
                                   httpd.rss_timeout_sec)

        if not httpd.newswire:
            print('Newswire feeds not updated')
            if os.path.isfile(newswire_state_filename):
                print('Loading newswire from file')
                httpd.newswire = load_json(newswire_state_filename)

        print('Merging with previous newswire')
        _merge_with_previous_newswire(httpd.newswire, new_newswire)

        httpd.newswire = new_newswire
        if new_newswire:
            save_json(httpd.newswire, newswire_state_filename)
            print('Newswire updated')
        else:
            print('No new newswire')

        print('Converting newswire to activitypub format')
        _convert_rss_to_activitypub(base_dir, http_prefix, domain, port,
                                    new_newswire, translate,
                                    httpd.recent_posts_cache,
                                    httpd.max_mirrored_articles,
                                    httpd.allow_local_network_access,
                                    httpd.system_language,
                                    httpd.low_bandwidth,
                                    httpd.content_license_url,
                                    httpd.content_license_url, '')
        print('Newswire feed converted to ActivityPub')

        if httpd.max_news_posts > 0:
            archive_dir = base_dir + '/archive'
            archive_subdir = \
                archive_dir + '/accounts/news@' + domain + '/outbox'
            print('Archiving news posts')
            archive_posts_for_person(http_prefix, 'news',
                                     domain, base_dir, 'outbox',
                                     archive_subdir,
                                     httpd.recent_posts_cache,
                                     httpd.max_news_posts)

        # wait a while before the next feeds update
        for _ in range(360):
            time.sleep(10)
            # if a new blog post has been created then stop
            # waiting and recalculate the newswire
            if not os.path.isfile(refresh_filename):
                continue
            try:
                os.remove(refresh_filename)
            except OSError:
                print('EX: run_newswire_daemon unable to delete ' +
                      str(refresh_filename))
            break
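
# Creating the marker file forces an early refresh; a hypothetical
# manual trigger from the shell:
#   touch <data_dir>/.refresh_newswire
# where <data_dir> is whatever data_dir(base_dir) returns.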


def run_newswire_watchdog(project_version: str, httpd) -> None:
    """This tries to keep the newswire update thread running even if it dies
    """
    print('THREAD: Starting newswire watchdog')
    newswire_original = \
        httpd.thrNewswireDaemon.clone(run_newswire_daemon)
    begin_thread(httpd.thrNewswireDaemon, 'run_newswire_watchdog')
    while True:
        time.sleep(50)
        if httpd.thrNewswireDaemon.is_alive():
            continue
        httpd.thrNewswireDaemon.kill()
        print('THREAD: restarting newswire daemon')
        httpd.thrNewswireDaemon = \
            newswire_original.clone(run_newswire_daemon)
        begin_thread(httpd.thrNewswireDaemon, 'run_newswire_watchdog 2')
        print('Restarting newswire daemon...')
|