epicyon/newsdaemon.py

916 lines
34 KiB
Python
Raw Normal View History

2020-10-07 12:05:49 +00:00
__filename__ = "newsdaemon.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2024-12-22 23:37:30 +00:00
__version__ = "1.6.0"
2020-10-07 12:05:49 +00:00
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
2020-10-07 12:05:49 +00:00
__status__ = "Production"
2021-06-26 11:27:14 +00:00
__module_group__ = "Web Interface Columns"
2020-10-07 12:05:49 +00:00
2020-10-17 18:53:08 +00:00
# Example hashtag logic:
#
# if moderated and not #imcoxford then block
# if #pol and contains "westminster" then add #britpol
# if #unwantedtag then block
2020-10-17 18:53:08 +00:00
import os
2020-10-07 12:05:49 +00:00
import time
2020-10-20 13:07:02 +00:00
import html
2020-10-19 19:26:58 +00:00
from shutil import rmtree
from subprocess import Popen
2020-10-07 18:46:42 +00:00
from collections import OrderedDict
2021-12-29 21:55:09 +00:00
from newswire import get_dict_from_newswire
# from posts import send_signed_json
from posts import create_news_post
from posts import archive_posts_for_person
2023-11-20 22:27:58 +00:00
from utils import date_from_string_format
from utils import date_utcnow
2022-01-13 15:10:41 +00:00
from utils import valid_hash_tag
2021-12-26 11:29:40 +00:00
from utils import get_base_content_from_post
2021-12-27 15:43:22 +00:00
from utils import remove_html
2021-12-26 12:45:03 +00:00
from utils import get_full_domain
2021-12-26 15:13:34 +00:00
from utils import load_json
2021-12-26 14:47:21 +00:00
from utils import save_json
2021-12-27 17:42:35 +00:00
from utils import get_status_number
2021-12-28 10:17:58 +00:00
from utils import clear_from_post_caches
2021-12-27 21:42:08 +00:00
from utils import dangerous_markup
2021-12-26 10:19:59 +00:00
from utils import local_actor_url
2022-06-10 11:43:33 +00:00
from utils import text_in_file
2024-05-12 12:35:26 +00:00
from utils import data_dir
2021-12-28 16:56:57 +00:00
from session import create_session
2022-07-28 09:59:18 +00:00
from threads import begin_thread
from webapp_hashtagswarm import store_hash_tags
2020-10-07 12:05:49 +00:00
2020-10-08 12:29:40 +00:00
2021-12-29 21:55:09 +00:00
def _update_feeds_outbox_index(base_dir: str, domain: str,
                               post_id: str) -> None:
    """Adds a post to the outbox index of the news account, which is
    the index used for imported RSS feeds.
    The new entry is placed at the top of the index, and the index file
    is created if it does not yet exist. File errors are printed rather
    than raised.
    """
    index_filename = \
        data_dir(base_dir) + '/news@' + domain + '/outbox.index'

    if not os.path.isfile(index_filename):
        # no index exists yet, so begin one containing only this post
        try:
            with open(index_filename, 'w+', encoding='utf-8') as fp_feeds:
                fp_feeds.write(post_id + '\n')
        except OSError:
            print('EX: _update_feeds_outbox_index unable to write ' +
                  index_filename)
        return

    if text_in_file(post_id, index_filename):
        # already indexed
        return

    try:
        with open(index_filename, 'r+',
                  encoding='utf-8') as fp_feeds:
            existing_entries = fp_feeds.read()
            if post_id + '\n' not in existing_entries:
                # rewind and rewrite with the new post first
                fp_feeds.seek(0, 0)
                fp_feeds.write(post_id + '\n' + existing_entries)
                print('DEBUG: feeds post added to index')
    except OSError as ex:
        print('EX: Failed to write entry to feeds posts index ' +
              index_filename + ' ' + str(ex))
2022-06-12 20:31:56 +00:00
def _save_arrived_time(post_filename: str, arrived: str) -> None:
2020-10-09 12:15:20 +00:00
"""Saves the time when an rss post arrived to a file
"""
2021-11-25 21:18:53 +00:00
try:
2022-06-09 14:46:30 +00:00
with open(post_filename + '.arrived', 'w+',
2024-07-14 13:01:46 +00:00
encoding='utf-8') as fp_arrived:
fp_arrived.write(arrived)
2021-11-25 21:18:53 +00:00
except OSError:
2024-07-02 22:16:13 +00:00
print('EX: _save_arrived_time unable to write ' +
post_filename + '.arrived')
2020-10-09 12:15:20 +00:00
2021-12-29 21:55:09 +00:00
def _remove_control_characters(content: str) -> str:
2020-10-20 13:07:02 +00:00
"""Remove escaped html
2020-10-11 09:33:31 +00:00
"""
2020-10-20 13:07:02 +00:00
if '&' in content:
return html.unescape(content)
2020-10-11 09:33:31 +00:00
return content
2020-10-10 09:36:23 +00:00
2020-10-10 08:54:13 +00:00
2021-12-29 21:55:09 +00:00
def _hashtag_logical_not(tree: [], hashtags: [], moderated: bool,
content: str, url: str) -> bool:
2021-07-04 09:24:35 +00:00
""" NOT
"""
if len(tree) != 2:
return False
if isinstance(tree[1], str):
return tree[1] not in hashtags
2022-01-03 11:33:46 +00:00
if isinstance(tree[1], list):
2021-12-29 21:55:09 +00:00
return not hashtag_rule_resolve(tree[1], hashtags,
moderated, content, url)
2021-07-04 09:24:35 +00:00
return False
2022-06-12 20:31:56 +00:00
def _hashtag_logical_contains(tree: [], content: str) -> bool:
2021-07-04 09:24:35 +00:00
""" Contains
"""
if len(tree) != 2:
return False
2022-01-03 11:33:46 +00:00
match_str = None
2021-07-04 09:24:35 +00:00
if isinstance(tree[1], str):
2022-01-03 11:33:46 +00:00
match_str = tree[1]
2021-07-04 09:24:35 +00:00
elif isinstance(tree[1], list):
2022-01-03 11:33:46 +00:00
match_str = tree[1][0]
if match_str:
if match_str.startswith('"') and match_str.endswith('"'):
match_str = match_str[1:]
match_str = match_str[:len(match_str) - 1]
match_str_lower = match_str.lower()
content_without_tags = content.replace('#' + match_str_lower, '')
return match_str_lower in content_without_tags
2021-07-04 09:24:35 +00:00
return False
2022-06-12 20:31:56 +00:00
def _hashtag_logical_from(tree: [], url: str) -> bool:
2021-07-04 09:24:35 +00:00
""" FROM
"""
if len(tree) != 2:
return False
2022-01-03 11:33:46 +00:00
match_str = None
2021-07-04 09:24:35 +00:00
if isinstance(tree[1], str):
2022-01-03 11:33:46 +00:00
match_str = tree[1]
2021-07-04 09:24:35 +00:00
elif isinstance(tree[1], list):
2022-01-03 11:33:46 +00:00
match_str = tree[1][0]
if match_str:
if match_str.startswith('"') and match_str.endswith('"'):
match_str = match_str[1:]
match_str = match_str[:len(match_str) - 1]
return match_str.lower() in url
2021-07-04 09:24:35 +00:00
return False
2021-12-29 21:55:09 +00:00
def _hashtag_logical_and(tree: [], hashtags: [], moderated: bool,
content: str, url: str) -> bool:
2021-07-04 09:24:35 +00:00
""" AND
"""
if len(tree) < 3:
return False
2022-01-03 11:33:46 +00:00
for arg_index in range(1, len(tree)):
arg_value = False
if isinstance(tree[arg_index], str):
2024-02-19 13:46:43 +00:00
arg_value = tree[arg_index] in hashtags
2022-01-03 11:33:46 +00:00
elif isinstance(tree[arg_index], list):
arg_value = hashtag_rule_resolve(tree[arg_index],
hashtags, moderated,
content, url)
if not arg_value:
2021-07-04 09:24:35 +00:00
return False
return True
2021-12-29 21:55:09 +00:00
def _hashtag_logical_or(tree: [], hashtags: [], moderated: bool,
content: str, url: str) -> bool:
2021-07-04 09:24:35 +00:00
""" OR
"""
if len(tree) < 3:
return False
2022-01-03 11:33:46 +00:00
for arg_index in range(1, len(tree)):
arg_value = False
if isinstance(tree[arg_index], str):
2024-02-19 13:46:43 +00:00
arg_value = tree[arg_index] in hashtags
2022-01-03 11:33:46 +00:00
elif isinstance(tree[arg_index], list):
arg_value = hashtag_rule_resolve(tree[arg_index],
hashtags, moderated,
content, url)
if arg_value:
2021-07-04 09:24:35 +00:00
return True
return False
2021-12-29 21:55:09 +00:00
def _hashtag_logical_xor(tree: [], hashtags: [], moderated: bool,
content: str, url: str) -> bool:
2021-07-04 09:24:35 +00:00
""" XOR
"""
if len(tree) < 3:
return False
2022-01-03 11:33:46 +00:00
true_ctr = 0
for arg_index in range(1, len(tree)):
arg_value = False
if isinstance(tree[arg_index], str):
2024-02-19 13:46:43 +00:00
arg_value = tree[arg_index] in hashtags
2022-01-03 11:33:46 +00:00
elif isinstance(tree[arg_index], list):
arg_value = hashtag_rule_resolve(tree[arg_index],
hashtags, moderated,
content, url)
if arg_value:
true_ctr += 1
if true_ctr == 1:
2021-07-04 09:24:35 +00:00
return True
return False
2021-12-29 21:55:09 +00:00
def hashtag_rule_resolve(tree: [], hashtags: [], moderated: bool,
                         content: str, url: str) -> bool:
    """Evaluates the tree for a hashtag rule, returning its truth value.
    The first item of the tree is either a logical operator, which is
    handed to the matching evaluation function, or a single term:
    a hashtag (true if within hashtags), 'moderated' (the moderated
    flag) or a double quoted string (always true).
    An empty or unrecognised tree evaluates to False
    """
    if not tree:
        return False
    head = tree[0]

    # operators which only look at the content or the url
    if head == 'contains':
        return _hashtag_logical_contains(tree, content)
    if head == 'from':
        return _hashtag_logical_from(tree, url)

    # operators which may contain sub-rules
    if head == 'not':
        return _hashtag_logical_not(tree, hashtags, moderated, content, url)
    if head == 'and':
        return _hashtag_logical_and(tree, hashtags, moderated, content, url)
    if head == 'or':
        return _hashtag_logical_or(tree, hashtags, moderated, content, url)
    if head == 'xor':
        return _hashtag_logical_xor(tree, hashtags, moderated, content, url)

    # single terms
    if head.startswith('#') and len(tree) == 1:
        return head in hashtags
    if head.startswith('moderated'):
        return moderated
    return head.startswith('"') and head.endswith('"')
2021-12-29 21:55:09 +00:00
def hashtag_rule_tree(operators: [],
                      conditions_str: str,
                      tags_in_conditions: [],
                      moderated: bool) -> []:
    """Converts the conditions of a hashtag rule into a nested list.
    The conditions are split on the first of the operators which occurs
    within them, and each resulting section is then parsed using only
    the operators which come later in the sequence, so that earlier
    operators end up nearer to the root of the tree.
    Hashtags found as single terms are appended to tags_in_conditions.
    Returns None if nothing could be parsed. The moderated argument
    is only passed down to the recursive calls
    """
    if not conditions_str:
        return None
    conditions_str = conditions_str.strip()

    # is this a single term, rather than an expression?
    is_hashtag = conditions_str.startswith('#')
    is_quoted = \
        conditions_str.startswith('"') and conditions_str.endswith('"')
    leaf = None
    if is_hashtag or is_quoted or \
       conditions_str in operators or \
       conditions_str == 'moderated' or \
       conditions_str == 'contains':
        if is_hashtag and conditions_str not in tags_in_conditions:
            if ' ' not in conditions_str or \
               conditions_str.startswith('"'):
                tags_in_conditions.append(conditions_str)
        leaf = [conditions_str.strip()]

    if not operators:
        # with no operators remaining only a single term is possible
        return leaf

    for oper_index, oper in enumerate(operators):
        infix = ' ' + oper + ' '
        if infix in conditions_str:
            sections = conditions_str.split(infix)
        elif conditions_str.startswith(oper + ' '):
            # operator at the beginning, such as: not #tag
            sections = conditions_str.split(oper + ' ', 1)
        else:
            continue
        # only the first operator found is applied at this level
        tree = [oper]
        for sub_condition_str in sections:
            sub_tree = hashtag_rule_tree(operators[oper_index + 1:],
                                         sub_condition_str,
                                         tags_in_conditions, moderated)
            if sub_tree:
                tree.append(sub_tree)
        return tree
    return leaf
2021-12-29 21:55:09 +00:00
def _hashtag_add(base_dir: str, http_prefix: str, domain_full: str,
post_json_object: {},
2022-01-03 11:33:46 +00:00
action_str: str, hashtags: [], system_language: str,
2024-10-16 09:47:04 +00:00
translate: {}, session) -> None:
2021-07-04 09:46:48 +00:00
"""Adds a hashtag via a hashtag rule
"""
2022-01-03 11:33:46 +00:00
add_hashtag = action_str.split('add ', 1)[1].strip()
if not add_hashtag.startswith('#'):
2021-07-04 09:46:48 +00:00
return
2022-01-03 11:33:46 +00:00
if add_hashtag not in hashtags:
hashtags.append(add_hashtag)
ht_id = add_hashtag.replace('#', '')
if not valid_hash_tag(ht_id):
2021-07-04 09:46:48 +00:00
return
2022-01-03 11:33:46 +00:00
hashtag_url = http_prefix + "://" + domain_full + "/tags/" + ht_id
new_tag = {
'href': hashtag_url,
'name': add_hashtag,
2021-07-04 09:46:48 +00:00
'type': 'Hashtag'
}
# does the tag already exist?
2022-01-03 11:33:46 +00:00
add_tag_object = None
for htag in post_json_object['object']['tag']:
if htag.get('type') and htag.get('name'):
if htag['type'] == 'Hashtag' and \
htag['name'] == add_hashtag:
add_tag_object = htag
2021-07-04 09:46:48 +00:00
break
# append the tag if it wasn't found
2022-01-03 11:33:46 +00:00
if not add_tag_object:
post_json_object['object']['tag'].append(new_tag)
2021-07-04 09:46:48 +00:00
# add corresponding html to the post content
2022-01-03 11:33:46 +00:00
hashtag_html = \
" <a href=\"" + hashtag_url + "\" class=\"addedHashtag\" " + \
"rel=\"tag\">#<span>" + ht_id + "</span></a>"
2021-12-26 11:29:40 +00:00
content = get_base_content_from_post(post_json_object, system_language)
2022-01-03 11:33:46 +00:00
if hashtag_html in content:
2021-07-04 09:46:48 +00:00
return
if content.endswith('</p>'):
content = \
content[:len(content) - len('</p>')] + \
2022-01-03 11:33:46 +00:00
hashtag_html + '</p>'
2021-07-04 09:46:48 +00:00
else:
2022-01-03 11:33:46 +00:00
content += hashtag_html
2021-12-25 22:09:19 +00:00
post_json_object['object']['content'] = content
2021-12-26 10:00:46 +00:00
domain = domain_full
2021-10-20 13:33:34 +00:00
if ':' in domain:
domain = domain.split(':')[0]
2021-12-29 21:55:09 +00:00
store_hash_tags(base_dir, 'news', domain,
http_prefix, domain_full,
2024-10-15 22:46:47 +00:00
post_json_object, translate, session)
2021-07-04 09:46:48 +00:00
2021-12-29 21:55:09 +00:00
def _hashtag_remove(http_prefix: str, domain_full: str, post_json_object: {},
2022-01-03 11:33:46 +00:00
action_str: str, hashtags: [],
2021-12-29 21:55:09 +00:00
system_language: str) -> None:
2021-07-04 09:46:48 +00:00
"""Removes a hashtag via a hashtag rule
"""
2022-01-03 11:33:46 +00:00
rm_hashtag = action_str.split('remove ', 1)[1].strip()
if not rm_hashtag.startswith('#'):
2021-07-04 09:46:48 +00:00
return
2022-01-03 11:33:46 +00:00
if rm_hashtag in hashtags:
hashtags.remove(rm_hashtag)
ht_id = rm_hashtag.replace('#', '')
hashtag_url = http_prefix + "://" + domain_full + "/tags/" + ht_id
2021-07-04 09:46:48 +00:00
# remove tag html from the post content
2022-01-03 11:33:46 +00:00
hashtag_html = \
"<a href=\"" + hashtag_url + "\" class=\"addedHashtag\" " + \
"rel=\"tag\">#<span>" + ht_id + "</span></a>"
2021-12-26 11:29:40 +00:00
content = get_base_content_from_post(post_json_object, system_language)
2022-01-03 11:33:46 +00:00
if hashtag_html in content:
content = content.replace(hashtag_html, '').replace(' ', ' ')
2021-12-25 22:09:19 +00:00
post_json_object['object']['content'] = content
2021-12-25 23:03:28 +00:00
post_json_object['object']['contentMap'][system_language] = content
2022-01-03 11:33:46 +00:00
rm_tag_object = None
for htag in post_json_object['object']['tag']:
if htag.get('type') and htag.get('name'):
if htag['type'] == 'Hashtag' and \
htag['name'] == rm_hashtag:
rm_tag_object = htag
2021-07-04 09:46:48 +00:00
break
2022-01-03 11:33:46 +00:00
if rm_tag_object:
post_json_object['object']['tag'].remove(rm_tag_object)
2021-07-04 09:46:48 +00:00
2022-06-12 20:31:56 +00:00
def _newswire_hashtag_processing(base_dir: str, post_json_object: {},
                                 hashtags: [], http_prefix: str,
                                 domain: str, port: int,
                                 moderated: bool, url: str,
                                 system_language: str,
                                 translate: {}, session) -> bool:
    """Runs the rules within hashtagrules.txt against a news post.
    Each rule has the form: if <conditions> then <action>
    where the action adds a hashtag, removes a hashtag, or blocks/drops
    the post. Rules are applied in the order in which they appear.
    Returns True if the post should be saved to the news timeline
    of this instance, or False if a rule blocked it
    """
    rules_filename = data_dir(base_dir) + '/hashtagrules.txt'
    if not os.path.isfile(rules_filename):
        # without any rules every post is accepted
        return True
    rules: list[str] = []
    try:
        with open(rules_filename, 'r', encoding='utf-8') as fp_rules:
            rules = fp_rules.readlines()
    except OSError:
        print('EX: _newswire_hashtag_processing unable to read ' +
              rules_filename)

    domain_full = get_full_domain(domain, port)

    # the lower case text of the post, including its summary,
    # which is what 'contains' conditions get matched against
    post_object = post_json_object['object']
    content = ''
    if post_object.get('content'):
        content += get_base_content_from_post(post_json_object,
                                              system_language)
    if post_object.get('summary'):
        content += ' ' + post_object['summary']
    content = content.lower()

    operators = ('not', 'and', 'or', 'xor', 'from', 'contains')
    for rule_str in rules:
        # ignore anything which isn't of the form: if ... then ...
        if not rule_str.startswith('if ') or ' then ' not in rule_str:
            continue
        conditions_str = rule_str.split('if ', 1)[1].split(' then ')[0]
        tags_in_conditions: list[str] = []
        tree = hashtag_rule_tree(operators, conditions_str,
                                 tags_in_conditions, moderated)
        if not hashtag_rule_resolve(tree, hashtags, moderated, content, url):
            continue

        # the conditions match, so carry out the action
        action_str = rule_str.split(' then ')[1].strip()
        if action_str.startswith('add '):
            _hashtag_add(base_dir, http_prefix, domain_full,
                         post_json_object, action_str, hashtags,
                         system_language, translate, session)
        elif action_str.startswith('remove '):
            _hashtag_remove(http_prefix, domain_full, post_json_object,
                            action_str, hashtags, system_language)
        elif action_str.startswith(('block', 'drop')):
            # this item should not be saved
            return False
    return True
2021-12-29 21:55:09 +00:00
def _create_news_mirror(base_dir: str, domain: str,
2022-01-03 11:33:46 +00:00
post_id_number: str, url: str,
2021-12-29 21:55:09 +00:00
max_mirrored_articles: int) -> bool:
2020-10-19 16:33:58 +00:00
"""Creates a local mirror of a news article
"""
2020-10-19 19:26:58 +00:00
if '|' in url or '>' in url:
return True
2024-05-12 12:35:26 +00:00
mirror_dir = data_dir(base_dir) + '/newsmirror'
2022-01-03 11:33:46 +00:00
if not os.path.isdir(mirror_dir):
os.mkdir(mirror_dir)
2020-10-19 16:33:58 +00:00
2020-10-19 19:26:58 +00:00
# count the directories
2022-01-03 11:33:46 +00:00
no_of_dirs = 0
for _, dirs, _ in os.walk(mirror_dir):
no_of_dirs = len(dirs)
2022-07-22 09:58:42 +00:00
break
2020-10-19 19:26:58 +00:00
2024-05-12 12:35:26 +00:00
mirror_index_filename = data_dir(base_dir) + '/newsmirror.txt'
2020-10-19 19:26:58 +00:00
2022-01-03 11:33:46 +00:00
if max_mirrored_articles > 0 and no_of_dirs > max_mirrored_articles:
if not os.path.isfile(mirror_index_filename):
2020-10-19 19:26:58 +00:00
# no index for mirrors found
return True
2024-12-23 17:45:20 +00:00
removals: list[str] = []
try:
with open(mirror_index_filename, 'r',
encoding='utf-8') as fp_index:
# remove the oldest directories
ctr = 0
while no_of_dirs > max_mirrored_articles:
ctr += 1
if ctr > 5000:
# escape valve
break
post_id = fp_index.readline()
if not post_id:
continue
post_id = post_id.strip()
mirror_article_dir = mirror_dir + '/' + post_id
if os.path.isdir(mirror_article_dir):
rmtree(mirror_article_dir,
ignore_errors=False, onexc=None)
removals.append(post_id)
no_of_dirs -= 1
except OSError as exc:
print('EX: _create_news_mirror unable to read ' +
mirror_index_filename + ' ' + str(exc))
2020-10-19 19:26:58 +00:00
# remove the corresponding index entries
if removals:
2022-01-03 11:33:46 +00:00
index_content = ''
try:
with open(mirror_index_filename, 'r',
2024-07-14 13:01:46 +00:00
encoding='utf-8') as fp_index:
index_content = fp_index.read()
for remove_post_id in removals:
index_content = \
index_content.replace(remove_post_id + '\n', '')
except OSError:
print('EX: _create_news_mirror unable to read ' +
mirror_index_filename)
2021-11-25 21:18:53 +00:00
try:
2022-06-09 14:46:30 +00:00
with open(mirror_index_filename, 'w+',
2024-07-14 13:01:46 +00:00
encoding='utf-8') as fp_index:
fp_index.write(index_content)
2021-11-25 21:18:53 +00:00
except OSError:
2024-07-02 22:16:13 +00:00
print('EX: _create_news_mirror unable to write ' +
mirror_index_filename)
2020-10-19 19:26:58 +00:00
2022-01-03 11:33:46 +00:00
mirror_article_dir = mirror_dir + '/' + post_id_number
if os.path.isdir(mirror_article_dir):
2020-10-19 19:26:58 +00:00
# already mirrored
return True
2020-10-20 09:27:58 +00:00
# for onion instances mirror via tor
2022-01-03 11:33:46 +00:00
prefix_str = ''
2020-10-20 09:27:58 +00:00
if domain.endswith('.onion'):
2022-01-03 11:33:46 +00:00
prefix_str = '/usr/bin/torsocks '
2020-10-20 09:27:58 +00:00
2020-10-19 19:26:58 +00:00
# download the files
2022-01-03 11:33:46 +00:00
command_str = \
prefix_str + '/usr/bin/wget -mkEpnp -e robots=off ' + url + \
' -P ' + mirror_article_dir
proc = Popen(command_str, shell=True)
os.waitpid(proc.pid, 0)
2020-10-19 19:26:58 +00:00
2022-01-03 11:33:46 +00:00
if not os.path.isdir(mirror_article_dir):
2020-10-20 09:27:58 +00:00
print('WARN: failed to mirror ' + url)
2020-10-19 19:26:58 +00:00
return True
# append the post Id number to the index file
2022-01-03 11:33:46 +00:00
if os.path.isfile(mirror_index_filename):
2021-11-25 21:18:53 +00:00
try:
2022-06-09 14:46:30 +00:00
with open(mirror_index_filename, 'a+',
2024-07-14 13:01:46 +00:00
encoding='utf-8') as fp_index:
fp_index.write(post_id_number + '\n')
2021-11-25 21:18:53 +00:00
except OSError:
2024-07-02 22:16:13 +00:00
print('EX: _create_news_mirror unable to append ' +
mirror_index_filename)
2020-10-19 19:26:58 +00:00
else:
2021-11-25 21:18:53 +00:00
try:
2022-06-09 14:46:30 +00:00
with open(mirror_index_filename, 'w+',
2024-07-14 13:01:46 +00:00
encoding='utf-8') as fp_index:
fp_index.write(post_id_number + '\n')
2021-11-25 21:18:53 +00:00
except OSError:
2024-07-02 22:16:13 +00:00
print('EX: _create_news_mirror unable to write ' +
mirror_index_filename)
2020-10-19 19:26:58 +00:00
2020-10-19 16:33:58 +00:00
return True
2022-01-12 19:40:12 +00:00
def _convert_rss_to_activitypub(base_dir: str, http_prefix: str,
domain: str, port: int,
newswire: {},
translate: {},
recent_posts_cache: {},
max_mirrored_articles: int,
allow_local_network_access: bool,
system_language: str,
low_bandwidth: bool,
2023-01-22 23:47:13 +00:00
content_license_url: str,
2023-01-23 11:33:07 +00:00
media_license_url: str,
2024-10-15 22:46:47 +00:00
media_creator: str,
session) -> None:
"""Converts rss items in a newswire into posts
"""
2020-11-03 14:41:28 +00:00
if not newswire:
2021-09-15 17:43:06 +00:00
print('No newswire to convert')
2020-11-03 14:41:28 +00:00
return
2024-05-12 12:35:26 +00:00
base_path = data_dir(base_dir) + '/news@' + domain + '/outbox'
2022-01-03 11:33:46 +00:00
if not os.path.isdir(base_path):
os.mkdir(base_path)
2020-10-09 10:05:01 +00:00
# oldest items first
2022-01-03 11:33:46 +00:00
newswire_reverse = OrderedDict(sorted(newswire.items(), reverse=False))
2020-10-07 18:46:42 +00:00
2022-01-03 11:33:46 +00:00
for date_str, item in newswire_reverse.items():
original_date_str = date_str
# convert the date to the format used by ActivityPub
2022-01-03 11:33:46 +00:00
if '+00:00' in date_str:
date_str = date_str.replace(' ', 'T')
date_str = date_str.replace('+00:00', 'Z')
2020-10-20 12:37:32 +00:00
else:
2021-09-15 17:43:06 +00:00
try:
2022-01-03 11:33:46 +00:00
date_str_with_offset = \
2023-11-20 22:27:58 +00:00
date_from_string_format(date_str, ["%Y-%m-%d %H:%M:%S%z"])
2021-09-15 17:43:06 +00:00
except BaseException:
2022-01-03 11:33:46 +00:00
print('EX: Newswire strptime failed ' + str(date_str))
2021-09-15 17:43:06 +00:00
continue
2021-09-15 19:04:29 +00:00
try:
2022-01-03 11:33:46 +00:00
date_str = date_str_with_offset.strftime("%Y-%m-%dT%H:%M:%SZ")
2021-09-15 19:04:29 +00:00
except BaseException:
2022-01-03 11:33:46 +00:00
print('EX: Newswire date_str_with_offset failed ' +
str(date_str_with_offset))
2021-09-15 19:04:29 +00:00
continue
2022-01-03 11:33:46 +00:00
status_number, _ = get_status_number(date_str)
new_post_id = \
2021-12-26 10:19:59 +00:00
local_actor_url(http_prefix, 'news', domain) + \
2022-01-03 11:33:46 +00:00
'/statuses/' + status_number
2020-10-07 16:55:15 +00:00
# file where the post is stored
2022-01-03 11:33:46 +00:00
filename = base_path + '/' + new_post_id.replace('/', '#') + '.json'
if os.path.isfile(filename):
2020-10-08 12:52:15 +00:00
# don't create the post if it already exists
2020-10-08 14:35:26 +00:00
# set the url
2022-01-03 11:33:46 +00:00
# newswire[original_date_str][1] = \
# '/users/news/statuses/' + status_number
2020-10-08 14:35:26 +00:00
# set the filename
2022-01-03 11:33:46 +00:00
newswire[original_date_str][3] = filename
continue
2022-01-03 11:33:46 +00:00
rss_title = _remove_control_characters(item[0])
url = item[1]
if dangerous_markup(url, allow_local_network_access, []) or \
dangerous_markup(rss_title, allow_local_network_access, []):
2020-10-11 09:33:31 +00:00
continue
2022-01-03 11:33:46 +00:00
rss_description = ''
# get the rss description if it exists
2022-01-03 11:33:46 +00:00
rss_description = '<p>' + remove_html(item[4]) + '<p>'
2020-10-19 20:43:27 +00:00
mirrored = item[7]
2022-01-03 11:33:46 +00:00
post_url = url
2020-10-19 20:43:27 +00:00
if mirrored and '://' in url:
2022-01-03 11:33:46 +00:00
post_url = '/newsmirror/' + status_number + '/' + \
2020-10-19 22:21:30 +00:00
url.split('://')[1]
2022-01-03 11:33:46 +00:00
if post_url.endswith('/'):
post_url += 'index.html'
2020-10-19 22:21:30 +00:00
else:
2022-01-03 11:33:46 +00:00
post_url += '/index.html'
2020-10-19 20:43:27 +00:00
# add the off-site link to the description
2022-01-03 11:33:46 +00:00
rss_description += \
'<br><a href="' + post_url + '">' + \
translate['Read more...'] + '</a>'
2020-10-11 09:33:31 +00:00
# podcast_properties = None
# if len(item) > 8:
# podcast_properties = item[8]
2020-10-09 10:08:01 +00:00
# NOTE: the id when the post is created will not be
# consistent (it's based on the current time, not the
# published time), so we change that later
2022-01-03 11:33:46 +00:00
save_to_file = False
attach_image_filename = None
media_type = None
image_description = None
2023-02-18 22:10:15 +00:00
video_transcript = None
2021-05-09 19:29:53 +00:00
city = 'London, England'
2022-01-03 11:33:46 +00:00
conversation_id = None
2024-10-06 16:22:13 +00:00
convthread_id = None
languages_understood = [system_language]
2023-01-13 19:19:57 +00:00
buy_url = ''
2023-07-10 17:53:56 +00:00
chat_url = ''
2021-12-29 21:55:09 +00:00
blog = create_news_post(base_dir,
domain, port, http_prefix,
2022-01-03 11:33:46 +00:00
rss_description,
2022-05-31 16:51:56 +00:00
save_to_file,
2022-01-03 11:33:46 +00:00
attach_image_filename, media_type,
2023-02-18 22:10:15 +00:00
image_description, video_transcript,
city, rss_title, system_language,
2024-10-06 16:22:13 +00:00
conversation_id, convthread_id, low_bandwidth,
content_license_url,
2023-01-23 11:33:07 +00:00
media_license_url, media_creator,
2023-01-13 19:19:57 +00:00
languages_understood, translate,
2024-10-15 22:46:47 +00:00
buy_url, chat_url, session)
2020-10-07 16:55:15 +00:00
if not blog:
continue
2020-10-19 16:33:58 +00:00
if mirrored:
2022-01-03 11:33:46 +00:00
if not _create_news_mirror(base_dir, domain, status_number,
2021-12-29 21:55:09 +00:00
url, max_mirrored_articles):
2020-10-19 16:33:58 +00:00
continue
2022-01-03 11:33:46 +00:00
id_str = \
2021-12-26 10:19:59 +00:00
local_actor_url(http_prefix, 'news', domain) + \
2022-01-03 11:33:46 +00:00
'/statuses/' + status_number + '/replies'
blog['news'] = True
2020-10-09 10:05:01 +00:00
# note the time of arrival
2023-11-20 22:27:58 +00:00
curr_time = date_utcnow()
2021-12-26 13:17:46 +00:00
blog['object']['arrived'] = curr_time.strftime("%Y-%m-%dT%H:%M:%SZ")
2020-10-09 10:05:01 +00:00
2020-10-09 10:08:01 +00:00
# change the id, based upon the published time
2022-01-03 11:33:46 +00:00
blog['object']['replies']['id'] = id_str
blog['object']['replies']['first']['partOf'] = id_str
2020-10-07 16:55:15 +00:00
2022-01-03 11:33:46 +00:00
blog['id'] = new_post_id + '/activity'
blog['object']['id'] = new_post_id
blog['object']['atomUri'] = new_post_id
2020-10-07 16:55:15 +00:00
blog['object']['url'] = \
2022-01-03 11:33:46 +00:00
http_prefix + '://' + domain + '/@news/' + status_number
blog['object']['published'] = date_str
2020-10-20 13:07:02 +00:00
2022-01-03 11:33:46 +00:00
blog['object']['content'] = rss_description
blog['object']['contentMap'][system_language] = rss_description
2020-10-07 16:55:15 +00:00
2021-12-26 12:45:03 +00:00
domain_full = get_full_domain(domain, port)
2020-10-17 13:59:47 +00:00
hashtags = item[6]
2022-01-03 11:33:46 +00:00
post_id = new_post_id.replace('/', '#')
2020-10-09 12:15:20 +00:00
moderated = item[5]
2022-01-03 11:33:46 +00:00
save_post = \
2022-06-12 20:31:56 +00:00
_newswire_hashtag_processing(base_dir, blog, hashtags,
2021-12-29 21:55:09 +00:00
http_prefix, domain, port,
moderated, url, system_language,
2024-10-16 09:47:04 +00:00
translate, session)
2020-10-09 12:15:20 +00:00
2020-10-16 21:33:18 +00:00
# save the post and update the index
2022-01-03 11:33:46 +00:00
if save_post:
# ensure that all hashtags are stored in the json
# and appended to the content
2024-12-23 17:45:20 +00:00
blog['object']['tag']: list[dict] = []
2022-01-03 11:33:46 +00:00
for tag_name in hashtags:
ht_id = tag_name.replace('#', '')
hashtag_url = \
http_prefix + "://" + domain_full + "/tags/" + ht_id
new_tag = {
'href': hashtag_url,
'name': tag_name,
2020-10-25 11:22:52 +00:00
'type': 'Hashtag'
}
2022-01-03 11:33:46 +00:00
blog['object']['tag'].append(new_tag)
hashtag_html = \
" <a href=\"" + hashtag_url + \
"\" class=\"addedHashtag\" " + \
"rel=\"tag\">#<span>" + \
2022-01-03 11:33:46 +00:00
ht_id + "</span></a>"
2021-12-26 11:29:40 +00:00
content = get_base_content_from_post(blog, system_language)
2022-01-03 11:33:46 +00:00
if hashtag_html not in content:
2020-10-25 14:37:51 +00:00
if content.endswith('</p>'):
content = \
content[:len(content) - len('</p>')] + \
2022-01-03 11:33:46 +00:00
hashtag_html + '</p>'
2020-10-25 14:37:51 +00:00
else:
2022-01-03 11:33:46 +00:00
content += hashtag_html
2020-10-25 14:37:51 +00:00
blog['object']['content'] = content
2021-12-25 23:03:28 +00:00
blog['object']['contentMap'][system_language] = content
2020-10-25 11:22:52 +00:00
2020-10-25 14:21:29 +00:00
# update the newswire tags if new ones have been found by
2021-12-29 21:55:09 +00:00
# _newswire_hashtag_processing
2020-10-25 14:21:29 +00:00
for tag in hashtags:
2022-01-03 11:33:46 +00:00
if tag not in newswire[original_date_str][6]:
newswire[original_date_str][6].append(tag)
2020-10-17 13:39:04 +00:00
2021-12-29 21:55:09 +00:00
store_hash_tags(base_dir, 'news', domain,
http_prefix, domain_full,
2024-10-15 22:46:47 +00:00
blog, translate, session)
2020-10-17 13:39:04 +00:00
2021-12-28 10:17:58 +00:00
clear_from_post_caches(base_dir, recent_posts_cache, post_id)
2021-12-26 14:47:21 +00:00
if save_json(blog, filename):
2021-12-29 21:55:09 +00:00
_update_feeds_outbox_index(base_dir, domain, post_id + '.json')
2020-10-16 21:33:18 +00:00
# Save a file containing the time when the post arrived
# this can then later be used to construct the news timeline
# excluding items during the voting period
if moderated:
2022-06-12 20:31:56 +00:00
_save_arrived_time(filename,
2021-12-29 21:55:09 +00:00
blog['object']['arrived'])
2020-10-16 21:33:18 +00:00
else:
if os.path.isfile(filename + '.arrived'):
try:
os.remove(filename + '.arrived')
2021-11-25 18:42:38 +00:00
except OSError:
2022-01-12 19:40:12 +00:00
print('EX: _convert_rss_to_activitypub ' +
2021-10-29 18:48:15 +00:00
'unable to delete ' + filename + '.arrived')
2020-10-16 21:33:18 +00:00
2020-11-08 16:52:57 +00:00
# setting the url here links to the activitypub object
# stored locally
2022-01-03 11:33:46 +00:00
# newswire[original_date_str][1] = \
# '/users/news/statuses/' + status_number
2020-11-08 16:52:57 +00:00
2020-10-16 21:33:18 +00:00
# set the filename
2022-01-03 11:33:46 +00:00
newswire[original_date_str][3] = filename
2024-06-05 15:48:57 +00:00
def _merge_with_previous_newswire(old_newswire: {}, new_newswire: {}) -> None:
2020-10-09 09:02:01 +00:00
"""Preserve any votes or generated activitypub post filename
as rss feeds are updated
"""
2024-06-05 15:48:57 +00:00
if not old_newswire:
2020-11-03 14:41:28 +00:00
return
2024-06-05 15:48:57 +00:00
for published, fields in old_newswire.items():
2022-01-03 11:33:46 +00:00
if not new_newswire.get(published):
2020-10-09 09:02:01 +00:00
continue
2020-10-13 08:53:59 +00:00
for i in range(1, 5):
2022-01-03 11:33:46 +00:00
new_newswire[published][i] = fields[i]
2020-10-09 09:02:01 +00:00
2021-12-29 21:55:09 +00:00
def run_newswire_daemon(base_dir: str, httpd,
                        http_prefix: str, domain: str, port: int,
                        translate: {}) -> None:
    """Periodically updates RSS feeds

    Runs forever. On each cycle it makes sure that httpd has a session,
    fetches the newswire feeds, merges them with the previous newswire
    (held in httpd.newswire or loaded from the saved state file),
    converts the result to ActivityPub posts and, if httpd.max_news_posts
    is above zero, archives the outbox of the news account.

    Between cycles it waits for up to 360 x 10 seconds, finishing the
    wait early if the refresh file appears within the data directory.

    base_dir: base directory of the instance
    httpd: the server object, from which the session, the newswire and
        the various limits and settings are read
    http_prefix: passed through to the conversion and archiving calls
    domain: domain of the instance
    port: passed through to the ActivityPub conversion
    translate: translations dictionary passed through to the
        ActivityPub conversion
    """
    newswire_state_filename = data_dir(base_dir) + '/.newswirestate.json'
    refresh_filename = data_dir(base_dir) + '/.refresh_newswire'

    print('Starting newswire daemon')
    # initial sleep to allow the system to start up
    time.sleep(50)
    while True:
        # has the session been created yet?
        if not httpd.session:
            print('Newswire daemon waiting for session')
            httpd.session = create_session(httpd.proxy_type)
            if not httpd.session:
                print('Newswire daemon has no session')
                time.sleep(60)
                continue
            print('Newswire daemon session established')

        # try to update the feeds
        print('Updating newswire feeds')
        new_newswire = \
            get_dict_from_newswire(httpd.session, base_dir, domain,
                                   httpd.max_newswire_posts_per_source,
                                   httpd.max_newswire_feed_size_kb,
                                   httpd.maxTags,
                                   httpd.max_feed_item_size_kb,
                                   httpd.max_newswire_posts,
                                   httpd.maxCategoriesFeedItemSizeKb,
                                   httpd.system_language,
                                   httpd.debug,
                                   httpd.preferred_podcast_formats,
                                   httpd.rss_timeout_sec)

        # if there is no newswire in memory then fall back to the one
        # which was previously saved to file, so that there is
        # something to merge with
        if not httpd.newswire:
            print('Newswire feeds not updated')
            if os.path.isfile(newswire_state_filename):
                print('Loading newswire from file')
                httpd.newswire = load_json(newswire_state_filename)

        print('Merging with previous newswire')
        _merge_with_previous_newswire(httpd.newswire, new_newswire)

        # the merged result replaces the in-memory newswire, but the
        # state file is only rewritten when there is something to save
        httpd.newswire = new_newswire
        if new_newswire:
            save_json(httpd.newswire, newswire_state_filename)
            print('Newswire updated')
        else:
            print('No new newswire')

        print('Converting newswire to activitypub format')
        # NOTE(review): httpd.content_license_url is passed twice in a
        # row here - confirm against the parameters of
        # _convert_rss_to_activitypub that this is intended
        _convert_rss_to_activitypub(base_dir, http_prefix, domain, port,
                                    new_newswire, translate,
                                    httpd.recent_posts_cache,
                                    httpd.max_mirrored_articles,
                                    httpd.allow_local_network_access,
                                    httpd.system_language,
                                    httpd.low_bandwidth,
                                    httpd.content_license_url,
                                    httpd.content_license_url, '',
                                    httpd.session)
        print('Newswire feed converted to ActivityPub')

        if httpd.max_news_posts > 0:
            archive_dir = base_dir + '/archive'
            archive_subdir = \
                archive_dir + '/accounts/news@' + domain + '/outbox'
            print('Archiving news posts')
            archive_posts_for_person(http_prefix, 'news',
                                     domain, base_dir, 'outbox',
                                     archive_subdir,
                                     httpd.recent_posts_cache,
                                     httpd.max_news_posts)

        # wait a while before the next feeds update
        for _ in range(360):
            time.sleep(10)
            # if a new blog post has been created then stop
            # waiting and recalculate the newswire
            if not os.path.isfile(refresh_filename):
                continue
            try:
                os.remove(refresh_filename)
            except OSError:
                print('EX: run_newswire_daemon unable to delete ' +
                      str(refresh_filename))
            # stop waiting even if the refresh file could not be removed
            break
2020-10-07 12:05:49 +00:00
2021-12-29 21:55:09 +00:00
def run_newswire_watchdog(project_version: str, httpd) -> None:
    """This tries to keep the newswire update thread running even if it dies

    Starts httpd.thrNewswireDaemon and then checks it every 50 seconds.
    If it is no longer alive then it is killed, replaced by a new clone
    targeting run_newswire_daemon and started again. Runs forever.

    project_version: not used within this function
    httpd: the server object which holds the thrNewswireDaemon and
        thrPostSchedule threads
    """
    print('THREAD: Starting newswire watchdog')
    # NOTE(review): the template used for restarts is cloned from
    # thrPostSchedule rather than from thrNewswireDaemon. If clone()
    # carries over the arguments of the thread being cloned then a
    # restarted daemon would receive the wrong arguments - confirm
    # against the thread class before changing
    newswire_original = \
        httpd.thrPostSchedule.clone(run_newswire_daemon)
    begin_thread(httpd.thrNewswireDaemon, 'run_newswire_watchdog')
    while True:
        time.sleep(50)
        if httpd.thrNewswireDaemon.is_alive():
            continue
        # the daemon thread has died, so replace it with a new one
        httpd.thrNewswireDaemon.kill()
        print('THREAD: restarting newswire watchdog')
        httpd.thrNewswireDaemon = \
            newswire_original.clone(run_newswire_daemon)
        begin_thread(httpd.thrNewswireDaemon, 'run_newswire_watchdog 2')
        print('Restarting newswire daemon...')