2020-10-07 12:05:49 +00:00
|
|
|
__filename__ = "newsdaemon.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
2021-01-26 10:07:42 +00:00
|
|
|
__version__ = "1.2.0"
|
2020-10-07 12:05:49 +00:00
|
|
|
__maintainer__ = "Bob Mottram"
|
2021-09-10 16:14:50 +00:00
|
|
|
__email__ = "bob@libreserver.org"
|
2020-10-07 12:05:49 +00:00
|
|
|
__status__ = "Production"
|
2021-06-26 11:27:14 +00:00
|
|
|
__module_group__ = "Web Interface Columns"
|
2020-10-07 12:05:49 +00:00
|
|
|
|
2020-10-17 18:53:08 +00:00
|
|
|
# Example hashtag logic:
|
|
|
|
#
|
|
|
|
# if moderated and not #imcoxford then block
|
|
|
|
# if #pol and contains "westminster" then add #britpol
|
2020-10-17 19:06:56 +00:00
|
|
|
# if #unwantedtag then block
|
2020-10-17 18:53:08 +00:00
|
|
|
|
2020-10-07 13:51:29 +00:00
|
|
|
import os
|
2020-10-07 12:05:49 +00:00
|
|
|
import time
|
2020-10-09 10:05:01 +00:00
|
|
|
import datetime
|
2020-10-20 13:07:02 +00:00
|
|
|
import html
|
2020-10-19 19:26:58 +00:00
|
|
|
from shutil import rmtree
|
|
|
|
from subprocess import Popen
|
2020-10-07 18:46:42 +00:00
|
|
|
from collections import OrderedDict
|
2020-10-07 12:05:49 +00:00
|
|
|
from newswire import getDictFromNewswire
|
2020-10-16 21:33:18 +00:00
|
|
|
# from posts import sendSignedJson
|
2020-10-07 21:26:03 +00:00
|
|
|
from posts import createNewsPost
|
2020-10-21 10:39:09 +00:00
|
|
|
from posts import archivePostsForPerson
|
2020-10-17 12:05:41 +00:00
|
|
|
from content import validHashTag
|
2021-12-26 11:29:40 +00:00
|
|
|
from utils import get_base_content_from_post
|
2021-12-27 15:43:22 +00:00
|
|
|
from utils import remove_html
|
2021-12-26 12:45:03 +00:00
|
|
|
from utils import get_full_domain
|
2021-12-26 15:13:34 +00:00
|
|
|
from utils import load_json
|
2021-12-26 14:47:21 +00:00
|
|
|
from utils import save_json
|
2021-12-27 17:42:35 +00:00
|
|
|
from utils import get_status_number
|
2020-10-18 16:19:28 +00:00
|
|
|
from utils import clearFromPostCaches
|
2021-01-31 11:05:17 +00:00
|
|
|
from utils import dangerousMarkup
|
2021-12-26 10:19:59 +00:00
|
|
|
from utils import local_actor_url
|
2020-10-17 13:39:04 +00:00
|
|
|
from inbox import storeHashTags
|
2020-11-03 16:08:31 +00:00
|
|
|
from session import createSession
|
2020-10-07 12:05:49 +00:00
|
|
|
|
2020-10-08 12:29:40 +00:00
|
|
|
|
2021-12-26 19:47:06 +00:00
|
|
|
def _updateFeedsOutboxIndex(base_dir: str, domain: str, post_id: str) -> None:
|
2020-10-07 13:51:29 +00:00
|
|
|
"""Updates the index used for imported RSS feeds
|
|
|
|
"""
|
2021-12-25 16:17:53 +00:00
|
|
|
basePath = base_dir + '/accounts/news@' + domain
|
2020-10-07 16:55:15 +00:00
|
|
|
indexFilename = basePath + '/outbox.index'
|
2020-10-07 13:51:29 +00:00
|
|
|
|
|
|
|
if os.path.isfile(indexFilename):
|
2021-12-26 19:47:06 +00:00
|
|
|
if post_id not in open(indexFilename).read():
|
2020-10-07 18:46:42 +00:00
|
|
|
try:
|
|
|
|
with open(indexFilename, 'r+') as feedsFile:
|
|
|
|
content = feedsFile.read()
|
2021-12-26 19:47:06 +00:00
|
|
|
if post_id + '\n' not in content:
|
2020-12-29 20:22:28 +00:00
|
|
|
feedsFile.seek(0, 0)
|
2021-12-26 19:47:06 +00:00
|
|
|
feedsFile.write(post_id + '\n' + content)
|
2020-12-29 20:22:28 +00:00
|
|
|
print('DEBUG: feeds post added to index')
|
2021-12-25 15:28:52 +00:00
|
|
|
except Exception as ex:
|
2020-10-07 18:46:42 +00:00
|
|
|
print('WARN: Failed to write entry to feeds posts index ' +
|
2021-12-25 15:28:52 +00:00
|
|
|
indexFilename + ' ' + str(ex))
|
2020-10-07 13:51:29 +00:00
|
|
|
else:
|
2021-11-25 21:18:53 +00:00
|
|
|
try:
|
|
|
|
with open(indexFilename, 'w+') as feedsFile:
|
2021-12-26 19:47:06 +00:00
|
|
|
feedsFile.write(post_id + '\n')
|
2021-11-25 21:18:53 +00:00
|
|
|
except OSError:
|
2021-11-25 22:22:54 +00:00
|
|
|
print('EX: unable to write ' + indexFilename)
|
2020-10-07 13:51:29 +00:00
|
|
|
|
|
|
|
|
2021-12-26 23:41:34 +00:00
|
|
|
def _saveArrivedTime(base_dir: str, post_filename: str, arrived: str) -> None:
|
2020-10-09 12:15:20 +00:00
|
|
|
"""Saves the time when an rss post arrived to a file
|
|
|
|
"""
|
2021-11-25 21:18:53 +00:00
|
|
|
try:
|
2021-12-26 23:41:34 +00:00
|
|
|
with open(post_filename + '.arrived', 'w+') as arrivedFile:
|
2021-11-25 21:18:53 +00:00
|
|
|
arrivedFile.write(arrived)
|
|
|
|
except OSError:
|
2021-12-26 23:41:34 +00:00
|
|
|
print('EX: unable to write ' + post_filename + '.arrived')
|
2020-10-09 12:15:20 +00:00
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _removeControlCharacters(content: str) -> str:
|
2020-10-20 13:07:02 +00:00
|
|
|
"""Remove escaped html
|
2020-10-11 09:33:31 +00:00
|
|
|
"""
|
2020-10-20 13:07:02 +00:00
|
|
|
if '&' in content:
|
|
|
|
return html.unescape(content)
|
2020-10-11 09:33:31 +00:00
|
|
|
return content
|
2020-10-10 09:36:23 +00:00
|
|
|
|
2020-10-10 08:54:13 +00:00
|
|
|
|
2021-07-04 09:24:35 +00:00
|
|
|
def _hashtagLogicalNot(tree: [], hashtags: [], moderated: bool,
|
|
|
|
content: str, url: str) -> bool:
|
|
|
|
""" NOT
|
|
|
|
"""
|
|
|
|
if len(tree) != 2:
|
|
|
|
return False
|
|
|
|
if isinstance(tree[1], str):
|
|
|
|
return tree[1] not in hashtags
|
|
|
|
elif isinstance(tree[1], list):
|
|
|
|
return not hashtagRuleResolve(tree[1], hashtags,
|
|
|
|
moderated, content, url)
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
def _hashtagLogicalContains(tree: [], hashtags: [], moderated: bool,
|
|
|
|
content: str, url: str) -> bool:
|
|
|
|
""" Contains
|
|
|
|
"""
|
|
|
|
if len(tree) != 2:
|
|
|
|
return False
|
|
|
|
matchStr = None
|
|
|
|
if isinstance(tree[1], str):
|
|
|
|
matchStr = tree[1]
|
|
|
|
elif isinstance(tree[1], list):
|
|
|
|
matchStr = tree[1][0]
|
|
|
|
if matchStr:
|
|
|
|
if matchStr.startswith('"') and matchStr.endswith('"'):
|
|
|
|
matchStr = matchStr[1:]
|
|
|
|
matchStr = matchStr[:len(matchStr) - 1]
|
|
|
|
matchStrLower = matchStr.lower()
|
|
|
|
contentWithoutTags = content.replace('#' + matchStrLower, '')
|
|
|
|
return matchStrLower in contentWithoutTags
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
def _hashtagLogicalFrom(tree: [], hashtags: [], moderated: bool,
|
|
|
|
content: str, url: str) -> bool:
|
|
|
|
""" FROM
|
|
|
|
"""
|
|
|
|
if len(tree) != 2:
|
|
|
|
return False
|
|
|
|
matchStr = None
|
|
|
|
if isinstance(tree[1], str):
|
|
|
|
matchStr = tree[1]
|
|
|
|
elif isinstance(tree[1], list):
|
|
|
|
matchStr = tree[1][0]
|
|
|
|
if matchStr:
|
|
|
|
if matchStr.startswith('"') and matchStr.endswith('"'):
|
|
|
|
matchStr = matchStr[1:]
|
|
|
|
matchStr = matchStr[:len(matchStr) - 1]
|
|
|
|
return matchStr.lower() in url
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
def _hashtagLogicalAnd(tree: [], hashtags: [], moderated: bool,
|
|
|
|
content: str, url: str) -> bool:
|
|
|
|
""" AND
|
|
|
|
"""
|
|
|
|
if len(tree) < 3:
|
|
|
|
return False
|
|
|
|
for argIndex in range(1, len(tree)):
|
|
|
|
argValue = False
|
|
|
|
if isinstance(tree[argIndex], str):
|
|
|
|
argValue = (tree[argIndex] in hashtags)
|
|
|
|
elif isinstance(tree[argIndex], list):
|
|
|
|
argValue = hashtagRuleResolve(tree[argIndex],
|
|
|
|
hashtags, moderated,
|
|
|
|
content, url)
|
|
|
|
if not argValue:
|
|
|
|
return False
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
def _hashtagLogicalOr(tree: [], hashtags: [], moderated: bool,
|
|
|
|
content: str, url: str) -> bool:
|
|
|
|
""" OR
|
|
|
|
"""
|
|
|
|
if len(tree) < 3:
|
|
|
|
return False
|
|
|
|
for argIndex in range(1, len(tree)):
|
|
|
|
argValue = False
|
|
|
|
if isinstance(tree[argIndex], str):
|
|
|
|
argValue = (tree[argIndex] in hashtags)
|
|
|
|
elif isinstance(tree[argIndex], list):
|
|
|
|
argValue = hashtagRuleResolve(tree[argIndex],
|
|
|
|
hashtags, moderated,
|
|
|
|
content, url)
|
|
|
|
if argValue:
|
|
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
def _hashtagLogicalXor(tree: [], hashtags: [], moderated: bool,
|
|
|
|
content: str, url: str) -> bool:
|
|
|
|
""" XOR
|
|
|
|
"""
|
|
|
|
if len(tree) < 3:
|
|
|
|
return False
|
|
|
|
trueCtr = 0
|
|
|
|
for argIndex in range(1, len(tree)):
|
|
|
|
argValue = False
|
|
|
|
if isinstance(tree[argIndex], str):
|
|
|
|
argValue = (tree[argIndex] in hashtags)
|
|
|
|
elif isinstance(tree[argIndex], list):
|
|
|
|
argValue = hashtagRuleResolve(tree[argIndex],
|
|
|
|
hashtags, moderated,
|
|
|
|
content, url)
|
|
|
|
if argValue:
|
|
|
|
trueCtr += 1
|
|
|
|
if trueCtr == 1:
|
|
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
2020-10-17 18:49:43 +00:00
|
|
|
def hashtagRuleResolve(tree: [], hashtags: [], moderated: bool,
                       content: str, url: str) -> bool:
    """Returns whether the tree for a hashtag rule evaluates to true or false

    The head of the tree selects an operator; remaining elements are its
    operands. Leaves are a bare hashtag (membership test), the keyword
    'moderated' (the feed's moderation flag), or a quoted string (always
    true at this level — matching happens inside 'contains'/'from').
    """
    if not tree:
        return False
    rootOp = tree[0]
    if rootOp == 'not':
        return _hashtagLogicalNot(tree, hashtags, moderated, content, url)
    if rootOp == 'contains':
        return _hashtagLogicalContains(tree, hashtags, moderated,
                                       content, url)
    if rootOp == 'from':
        return _hashtagLogicalFrom(tree, hashtags, moderated, content, url)
    if rootOp == 'and':
        return _hashtagLogicalAnd(tree, hashtags, moderated, content, url)
    if rootOp == 'or':
        return _hashtagLogicalOr(tree, hashtags, moderated, content, url)
    if rootOp == 'xor':
        return _hashtagLogicalXor(tree, hashtags, moderated, content, url)
    if rootOp.startswith('#') and len(tree) == 1:
        return rootOp in hashtags
    if rootOp.startswith('moderated'):
        return moderated
    if rootOp.startswith('"') and rootOp.endswith('"'):
        return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
def hashtagRuleTree(operators: [],
                    conditionsStr: str,
                    tagsInConditions: [],
                    moderated: bool) -> []:
    """Walks the tree

    Recursively parses a rule condition string into a nested list tree
    for hashtagRuleResolve. `operators` is ordered by precedence; each
    recursion level consumes the remaining operators. `tagsInConditions`
    is mutated in place to accumulate every hashtag seen in the
    conditions. Returns the parsed tree, or None if nothing parsed.
    """
    # Base case: no operators left — accept only a single terminal
    # token (hashtag, quoted string, operator word, 'moderated' or
    # 'contains'); anything else is unparseable at this level.
    if not operators and conditionsStr:
        conditionsStr = conditionsStr.strip()
        isStr = conditionsStr.startswith('"') and conditionsStr.endswith('"')
        if conditionsStr.startswith('#') or isStr or \
           conditionsStr in operators or \
           conditionsStr == 'moderated' or \
           conditionsStr == 'contains':
            if conditionsStr.startswith('#'):
                # record the hashtag (single-word tags only)
                if conditionsStr not in tagsInConditions:
                    if ' ' not in conditionsStr or \
                       conditionsStr.startswith('"'):
                        tagsInConditions.append(conditionsStr)
            return [conditionsStr.strip()]
        else:
            return None
    if not operators or not conditionsStr:
        return None
    tree = None
    conditionsStr = conditionsStr.strip()
    isStr = conditionsStr.startswith('"') and conditionsStr.endswith('"')
    # If the whole remaining string is itself a terminal token, seed the
    # tree with it (it may still be replaced by an operator node below).
    if conditionsStr.startswith('#') or isStr or \
       conditionsStr in operators or \
       conditionsStr == 'moderated' or \
       conditionsStr == 'contains':
        if conditionsStr.startswith('#'):
            if conditionsStr not in tagsInConditions:
                if ' ' not in conditionsStr or \
                   conditionsStr.startswith('"'):
                    tagsInConditions.append(conditionsStr)
        tree = [conditionsStr.strip()]
    # Scan operators in precedence order; the first one present in the
    # string becomes the root of this subtree.
    ctr = 0
    while ctr < len(operators):
        op = operators[ctr]
        opMatch = ' ' + op + ' '
        # the operator may appear infix (' op ') or as a prefix ('op ')
        if opMatch not in conditionsStr and \
           not conditionsStr.startswith(op + ' '):
            ctr += 1
            continue
        else:
            tree = [op]
            if opMatch in conditionsStr:
                sections = conditionsStr.split(opMatch)
            else:
                sections = conditionsStr.split(op + ' ', 1)
            # parse each operand with only the lower-precedence operators
            for subConditionStr in sections:
                result = hashtagRuleTree(operators[ctr + 1:],
                                         subConditionStr,
                                         tagsInConditions, moderated)
                if result:
                    tree.append(result)
            break
    return tree
|
|
|
|
|
|
|
|
|
2021-12-26 10:00:46 +00:00
|
|
|
def _hashtagAdd(base_dir: str, http_prefix: str, domain_full: str,
                post_json_object: {},
                actionStr: str, hashtags: [], system_language: str,
                translate: {}) -> None:
    """Adds a hashtag via a hashtag rule

    Appends the tag to the hashtags list and to the post's tag
    collection, and appends the corresponding link html to the
    post content.
    """
    addHashtag = actionStr.split('add ', 1)[1].strip()
    if not addHashtag.startswith('#'):
        return

    if addHashtag not in hashtags:
        hashtags.append(addHashtag)
    htId = addHashtag.replace('#', '')
    if not validHashTag(htId):
        return

    hashtagUrl = http_prefix + "://" + domain_full + "/tags/" + htId
    newTag = {
        'href': hashtagUrl,
        'name': addHashtag,
        'type': 'Hashtag'
    }
    # does the tag already exist?
    addTagObject = None
    for t in post_json_object['object']['tag']:
        if t.get('type') and t.get('name'):
            if t['type'] == 'Hashtag' and \
               t['name'] == addHashtag:
                addTagObject = t
                break
    # append the tag if it wasn't found
    if not addTagObject:
        post_json_object['object']['tag'].append(newTag)
    # add corresponding html to the post content
    hashtagHtml = \
        " <a href=\"" + hashtagUrl + "\" class=\"addedHashtag\" " + \
        "rel=\"tag\">#<span>" + htId + "</span></a>"
    content = get_base_content_from_post(post_json_object, system_language)
    if hashtagHtml in content:
        return

    if content.endswith('</p>'):
        # insert before the trailing paragraph close
        content = \
            content[:len(content) - len('</p>')] + \
            hashtagHtml + '</p>'
    else:
        content += hashtagHtml
    post_json_object['object']['content'] = content
    # keep the per-language content map consistent with the plain
    # content field, matching what _hashtagRemove does on removal
    # (previously only 'content' was updated, leaving the map stale)
    if post_json_object['object'].get('contentMap'):
        post_json_object['object']['contentMap'][system_language] = content
    domain = domain_full
    if ':' in domain:
        domain = domain.split(':')[0]
    storeHashTags(base_dir, 'news', domain,
                  http_prefix, domain_full,
                  post_json_object, translate)
|
2021-07-04 09:46:48 +00:00
|
|
|
|
|
|
|
|
2021-12-26 10:00:46 +00:00
|
|
|
def _hashtagRemove(http_prefix: str, domain_full: str, post_json_object: {},
                   actionStr: str, hashtags: [], system_language: str) -> None:
    """Removes a hashtag via a hashtag rule

    Removes the tag from the hashtags list, strips its link html
    from the post content, and removes it from the post's tag
    collection.
    """
    rmHashtag = actionStr.split('remove ', 1)[1].strip()
    if not rmHashtag.startswith('#'):
        return

    if rmHashtag in hashtags:
        hashtags.remove(rmHashtag)
    htId = rmHashtag.replace('#', '')
    hashtagUrl = http_prefix + "://" + domain_full + "/tags/" + htId
    # remove tag html from the post content
    hashtagHtml = \
        "<a href=\"" + hashtagUrl + "\" class=\"addedHashtag\" " + \
        "rel=\"tag\">#<span>" + htId + "</span></a>"
    content = get_base_content_from_post(post_json_object, system_language)
    if hashtagHtml in content:
        # collapse the double space left behind after removing the
        # link html (the previous single-space-to-single-space replace
        # was a no-op)
        content = content.replace(hashtagHtml, '').replace('  ', ' ')
        post_json_object['object']['content'] = content
        post_json_object['object']['contentMap'][system_language] = content
    rmTagObject = None
    for t in post_json_object['object']['tag']:
        if t.get('type') and t.get('name'):
            if t['type'] == 'Hashtag' and \
               t['name'] == rmHashtag:
                rmTagObject = t
                break
    if rmTagObject:
        post_json_object['object']['tag'].remove(rmTagObject)
|
2021-07-04 09:46:48 +00:00
|
|
|
|
|
|
|
|
2021-12-25 22:09:19 +00:00
|
|
|
def _newswireHashtagProcessing(session, base_dir: str, post_json_object: {},
|
2021-12-25 17:09:22 +00:00
|
|
|
hashtags: [], http_prefix: str,
|
2020-12-22 18:06:23 +00:00
|
|
|
domain: str, port: int,
|
2021-12-25 22:17:49 +00:00
|
|
|
person_cache: {},
|
2021-12-25 22:28:18 +00:00
|
|
|
cached_webfingers: {},
|
2021-12-25 23:45:30 +00:00
|
|
|
federation_list: [],
|
2021-12-25 21:37:41 +00:00
|
|
|
send_threads: [], postLog: [],
|
2021-07-18 14:15:16 +00:00
|
|
|
moderated: bool, url: str,
|
2021-12-25 23:03:28 +00:00
|
|
|
system_language: str,
|
2021-10-20 13:33:34 +00:00
|
|
|
translate: {}) -> bool:
|
2020-10-16 21:33:18 +00:00
|
|
|
"""Applies hashtag rules to a news post.
|
|
|
|
Returns true if the post should be saved to the news timeline
|
|
|
|
of this instance
|
|
|
|
"""
|
2021-12-25 16:17:53 +00:00
|
|
|
rulesFilename = base_dir + '/accounts/hashtagrules.txt'
|
2020-10-17 12:05:41 +00:00
|
|
|
if not os.path.isfile(rulesFilename):
|
|
|
|
return True
|
|
|
|
rules = []
|
2021-07-13 14:40:49 +00:00
|
|
|
with open(rulesFilename, 'r') as f:
|
2020-10-17 12:05:41 +00:00
|
|
|
rules = f.readlines()
|
|
|
|
|
2021-12-26 12:45:03 +00:00
|
|
|
domain_full = get_full_domain(domain, port)
|
2020-10-17 12:05:41 +00:00
|
|
|
|
2020-10-17 18:49:43 +00:00
|
|
|
# get the full text content of the post
|
|
|
|
content = ''
|
2021-12-25 22:09:19 +00:00
|
|
|
if post_json_object['object'].get('content'):
|
2021-12-26 11:29:40 +00:00
|
|
|
content += get_base_content_from_post(post_json_object,
|
|
|
|
system_language)
|
2021-12-25 22:09:19 +00:00
|
|
|
if post_json_object['object'].get('summary'):
|
|
|
|
content += ' ' + post_json_object['object']['summary']
|
2020-10-17 19:04:39 +00:00
|
|
|
content = content.lower()
|
2020-10-17 18:49:43 +00:00
|
|
|
|
2020-10-17 13:41:20 +00:00
|
|
|
# actionOccurred = False
|
2020-10-20 17:37:15 +00:00
|
|
|
operators = ('not', 'and', 'or', 'xor', 'from', 'contains')
|
2020-10-17 12:05:41 +00:00
|
|
|
for ruleStr in rules:
|
|
|
|
if not ruleStr:
|
|
|
|
continue
|
|
|
|
if not ruleStr.startswith('if '):
|
|
|
|
continue
|
|
|
|
if ' then ' not in ruleStr:
|
|
|
|
continue
|
|
|
|
conditionsStr = ruleStr.split('if ', 1)[1]
|
|
|
|
conditionsStr = conditionsStr.split(' then ')[0]
|
|
|
|
tagsInConditions = []
|
2020-10-17 17:36:10 +00:00
|
|
|
tree = hashtagRuleTree(operators, conditionsStr,
|
|
|
|
tagsInConditions, moderated)
|
2020-10-20 17:37:15 +00:00
|
|
|
if not hashtagRuleResolve(tree, hashtags, moderated, content, url):
|
2020-10-17 12:05:41 +00:00
|
|
|
continue
|
|
|
|
# the condition matches, so do something
|
|
|
|
actionStr = ruleStr.split(' then ')[1].strip()
|
|
|
|
|
|
|
|
if actionStr.startswith('add '):
|
2021-07-04 09:46:48 +00:00
|
|
|
# add a hashtag
|
2021-12-26 10:00:46 +00:00
|
|
|
_hashtagAdd(base_dir, http_prefix, domain_full,
|
2021-12-25 23:03:28 +00:00
|
|
|
post_json_object, actionStr, hashtags, system_language,
|
2021-10-20 13:33:34 +00:00
|
|
|
translate)
|
2021-07-04 09:46:48 +00:00
|
|
|
elif actionStr.startswith('remove '):
|
|
|
|
# remove a hashtag
|
2021-12-26 10:00:46 +00:00
|
|
|
_hashtagRemove(http_prefix, domain_full, post_json_object,
|
2021-12-25 23:03:28 +00:00
|
|
|
actionStr, hashtags, system_language)
|
2021-07-04 09:46:48 +00:00
|
|
|
elif actionStr.startswith('block') or actionStr.startswith('drop'):
|
|
|
|
# Block this item
|
2020-10-17 16:24:47 +00:00
|
|
|
return False
|
2020-10-16 21:33:18 +00:00
|
|
|
return True
|
|
|
|
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
def _createNewsMirror(base_dir: str, domain: str,
                      post_idNumber: str, url: str,
                      max_mirrored_articles: int) -> bool:
    """Creates a local mirror of a news article

    Downloads the article with wget into accounts/newsmirror and
    records it in accounts/newsmirror.txt, evicting the oldest mirrors
    when max_mirrored_articles is exceeded. Returns True (mirroring is
    best-effort and never blocks the post).
    """
    if '|' in url or '>' in url:
        return True

    mirrorDir = base_dir + '/accounts/newsmirror'
    if not os.path.isdir(mirrorDir):
        os.mkdir(mirrorDir)

    # count the directories
    noOfDirs = 0
    for subdir, dirs, files in os.walk(mirrorDir):
        noOfDirs = len(dirs)

    mirrorIndexFilename = base_dir + '/accounts/newsmirror.txt'

    if max_mirrored_articles > 0 and noOfDirs > max_mirrored_articles:
        if not os.path.isfile(mirrorIndexFilename):
            # no index for mirrors found
            return True
        removals = []
        with open(mirrorIndexFilename, 'r') as indexFile:
            # remove the oldest directories
            ctr = 0
            while noOfDirs > max_mirrored_articles:
                ctr += 1
                if ctr > 5000:
                    # escape valve
                    break

                post_id = indexFile.readline()
                if not post_id:
                    continue
                post_id = post_id.strip()
                mirrorArticleDir = mirrorDir + '/' + post_id
                if os.path.isdir(mirrorArticleDir):
                    rmtree(mirrorArticleDir, ignore_errors=False,
                           onerror=None)
                    removals.append(post_id)
                    noOfDirs -= 1

        # remove the corresponding index entries
        if removals:
            indexContent = ''
            with open(mirrorIndexFilename, 'r') as indexFile:
                indexContent = indexFile.read()
            for removePostId in removals:
                indexContent = \
                    indexContent.replace(removePostId + '\n', '')
            try:
                with open(mirrorIndexFilename, 'w+') as indexFile:
                    indexFile.write(indexContent)
            except OSError:
                print('EX: unable to write ' + mirrorIndexFilename)

    mirrorArticleDir = mirrorDir + '/' + post_idNumber
    if os.path.isdir(mirrorArticleDir):
        # already mirrored
        return True

    # download the files
    # Build an argv list with shell=False so that the feed-supplied
    # (untrusted) url cannot inject shell commands; the previous
    # shell=True string only filtered '|' and '>', which does not
    # block ';', '$(...)' or backticks.
    commandArgs = []
    if domain.endswith('.onion'):
        # for onion instances mirror via tor
        commandArgs.append('/usr/bin/torsocks')
    commandArgs += ['/usr/bin/wget', '-mkEpnp', '-e', 'robots=off',
                    url, '-P', mirrorArticleDir]
    p = Popen(commandArgs)
    os.waitpid(p.pid, 0)

    if not os.path.isdir(mirrorArticleDir):
        print('WARN: failed to mirror ' + url)
        return True

    # append the post Id number to the index file
    if os.path.isfile(mirrorIndexFilename):
        try:
            with open(mirrorIndexFilename, 'a+') as indexFile:
                indexFile.write(post_idNumber + '\n')
        except OSError:
            print('EX: unable to append ' + mirrorIndexFilename)
    else:
        try:
            with open(mirrorIndexFilename, 'w+') as indexFile:
                indexFile.write(post_idNumber + '\n')
        except OSError:
            print('EX: unable to write ' + mirrorIndexFilename)

    return True
|
|
|
|
|
|
|
|
|
2021-12-25 17:09:22 +00:00
|
|
|
def _convertRSStoActivityPub(base_dir: str, http_prefix: str,
|
2020-12-22 18:06:23 +00:00
|
|
|
domain: str, port: int,
|
|
|
|
newswire: {},
|
|
|
|
translate: {},
|
2021-12-26 20:01:37 +00:00
|
|
|
recent_posts_cache: {}, max_recent_posts: int,
|
2021-12-25 22:28:18 +00:00
|
|
|
session, cached_webfingers: {},
|
2021-12-25 22:17:49 +00:00
|
|
|
person_cache: {},
|
2021-12-25 23:45:30 +00:00
|
|
|
federation_list: [],
|
2021-12-25 21:37:41 +00:00
|
|
|
send_threads: [], postLog: [],
|
2021-12-25 19:42:14 +00:00
|
|
|
max_mirrored_articles: int,
|
2021-12-25 18:54:50 +00:00
|
|
|
allow_local_network_access: bool,
|
2021-12-25 23:03:28 +00:00
|
|
|
system_language: str,
|
2021-12-25 18:20:56 +00:00
|
|
|
low_bandwidth: bool,
|
2021-12-25 17:13:38 +00:00
|
|
|
content_license_url: str) -> None:
|
2020-10-07 13:51:29 +00:00
|
|
|
"""Converts rss items in a newswire into posts
|
|
|
|
"""
|
2020-11-03 14:41:28 +00:00
|
|
|
if not newswire:
|
2021-09-15 17:43:06 +00:00
|
|
|
print('No newswire to convert')
|
2020-11-03 14:41:28 +00:00
|
|
|
return
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
basePath = base_dir + '/accounts/news@' + domain + '/outbox'
|
2020-10-07 13:51:29 +00:00
|
|
|
if not os.path.isdir(basePath):
|
|
|
|
os.mkdir(basePath)
|
|
|
|
|
2020-10-09 10:05:01 +00:00
|
|
|
# oldest items first
|
2021-07-13 21:59:53 +00:00
|
|
|
newswireReverse = OrderedDict(sorted(newswire.items(), reverse=False))
|
2020-10-07 18:46:42 +00:00
|
|
|
|
|
|
|
for dateStr, item in newswireReverse.items():
|
2020-10-07 20:03:39 +00:00
|
|
|
originalDateStr = dateStr
|
2020-10-07 14:10:06 +00:00
|
|
|
# convert the date to the format used by ActivityPub
|
2020-10-20 12:37:32 +00:00
|
|
|
if '+00:00' in dateStr:
|
|
|
|
dateStr = dateStr.replace(' ', 'T')
|
|
|
|
dateStr = dateStr.replace('+00:00', 'Z')
|
|
|
|
else:
|
2021-09-15 17:43:06 +00:00
|
|
|
try:
|
|
|
|
dateStrWithOffset = \
|
|
|
|
datetime.datetime.strptime(dateStr, "%Y-%m-%d %H:%M:%S%z")
|
|
|
|
except BaseException:
|
2021-10-29 18:48:15 +00:00
|
|
|
print('EX: Newswire strptime failed ' + str(dateStr))
|
2021-09-15 17:43:06 +00:00
|
|
|
continue
|
2021-09-15 19:04:29 +00:00
|
|
|
try:
|
|
|
|
dateStr = dateStrWithOffset.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
except BaseException:
|
2021-10-29 18:48:15 +00:00
|
|
|
print('EX: Newswire dateStrWithOffset failed ' +
|
2021-09-15 19:04:29 +00:00
|
|
|
str(dateStrWithOffset))
|
|
|
|
continue
|
2020-10-07 13:51:29 +00:00
|
|
|
|
2021-12-27 17:42:35 +00:00
|
|
|
statusNumber, published = get_status_number(dateStr)
|
2020-10-07 16:55:15 +00:00
|
|
|
newPostId = \
|
2021-12-26 10:19:59 +00:00
|
|
|
local_actor_url(http_prefix, 'news', domain) + \
|
2021-08-14 11:13:39 +00:00
|
|
|
'/statuses/' + statusNumber
|
2020-10-07 16:55:15 +00:00
|
|
|
|
2020-10-07 14:10:06 +00:00
|
|
|
# file where the post is stored
|
2020-10-07 16:55:15 +00:00
|
|
|
filename = basePath + '/' + newPostId.replace('/', '#') + '.json'
|
2020-10-07 13:51:29 +00:00
|
|
|
if os.path.isfile(filename):
|
2020-10-08 12:52:15 +00:00
|
|
|
# don't create the post if it already exists
|
2020-10-08 14:35:26 +00:00
|
|
|
# set the url
|
2020-11-08 18:29:01 +00:00
|
|
|
# newswire[originalDateStr][1] = \
|
|
|
|
# '/users/news/statuses/' + statusNumber
|
2020-10-08 14:35:26 +00:00
|
|
|
# set the filename
|
|
|
|
newswire[originalDateStr][3] = filename
|
2020-10-07 13:51:29 +00:00
|
|
|
continue
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
rssTitle = _removeControlCharacters(item[0])
|
2020-10-10 09:53:56 +00:00
|
|
|
url = item[1]
|
2021-12-25 18:54:50 +00:00
|
|
|
if dangerousMarkup(url, allow_local_network_access) or \
|
|
|
|
dangerousMarkup(rssTitle, allow_local_network_access):
|
2020-10-11 09:33:31 +00:00
|
|
|
continue
|
2020-10-07 13:55:27 +00:00
|
|
|
rssDescription = ''
|
2020-10-07 14:10:06 +00:00
|
|
|
|
|
|
|
# get the rss description if it exists
|
2021-12-27 15:43:22 +00:00
|
|
|
rssDescription = '<p>' + remove_html(item[4]) + '<p>'
|
2020-10-07 14:10:06 +00:00
|
|
|
|
2020-10-19 20:43:27 +00:00
|
|
|
mirrored = item[7]
|
|
|
|
postUrl = url
|
|
|
|
if mirrored and '://' in url:
|
2020-10-19 22:17:06 +00:00
|
|
|
postUrl = '/newsmirror/' + statusNumber + '/' + \
|
2020-10-19 22:21:30 +00:00
|
|
|
url.split('://')[1]
|
|
|
|
if postUrl.endswith('/'):
|
|
|
|
postUrl += 'index.html'
|
|
|
|
else:
|
|
|
|
postUrl += '/index.html'
|
2020-10-19 20:43:27 +00:00
|
|
|
|
2020-10-07 14:10:06 +00:00
|
|
|
# add the off-site link to the description
|
2021-01-11 21:38:31 +00:00
|
|
|
rssDescription += \
|
|
|
|
'<br><a href="' + postUrl + '">' + \
|
|
|
|
translate['Read more...'] + '</a>'
|
2020-10-11 09:33:31 +00:00
|
|
|
|
2020-10-07 16:55:15 +00:00
|
|
|
followersOnly = False
|
2020-10-09 10:08:01 +00:00
|
|
|
# NOTE: the id when the post is created will not be
|
|
|
|
# consistent (it's based on the current time, not the
|
|
|
|
# published time), so we change that later
|
2021-05-09 19:11:05 +00:00
|
|
|
saveToFile = False
|
|
|
|
attachImageFilename = None
|
|
|
|
mediaType = None
|
|
|
|
imageDescription = None
|
2021-05-09 19:29:53 +00:00
|
|
|
city = 'London, England'
|
2021-08-08 16:52:32 +00:00
|
|
|
conversationId = None
|
2021-12-25 16:17:53 +00:00
|
|
|
blog = createNewsPost(base_dir,
|
2021-12-25 17:09:22 +00:00
|
|
|
domain, port, http_prefix,
|
2020-10-11 11:00:28 +00:00
|
|
|
rssDescription,
|
2021-05-09 19:11:05 +00:00
|
|
|
followersOnly, saveToFile,
|
|
|
|
attachImageFilename, mediaType,
|
|
|
|
imageDescription, city,
|
2021-12-25 23:03:28 +00:00
|
|
|
rssTitle, system_language,
|
2021-12-25 18:20:56 +00:00
|
|
|
conversationId, low_bandwidth,
|
2021-12-25 17:13:38 +00:00
|
|
|
content_license_url)
|
2020-10-07 16:55:15 +00:00
|
|
|
if not blog:
|
|
|
|
continue
|
|
|
|
|
2020-10-19 16:33:58 +00:00
|
|
|
if mirrored:
|
2021-12-25 16:17:53 +00:00
|
|
|
if not _createNewsMirror(base_dir, domain, statusNumber,
|
2021-12-25 19:42:14 +00:00
|
|
|
url, max_mirrored_articles):
|
2020-10-19 16:33:58 +00:00
|
|
|
continue
|
|
|
|
|
2020-10-07 16:55:15 +00:00
|
|
|
idStr = \
|
2021-12-26 10:19:59 +00:00
|
|
|
local_actor_url(http_prefix, 'news', domain) + \
|
2020-10-07 16:55:15 +00:00
|
|
|
'/statuses/' + statusNumber + '/replies'
|
2020-10-08 09:07:45 +00:00
|
|
|
blog['news'] = True
|
2020-10-09 10:05:01 +00:00
|
|
|
|
|
|
|
# note the time of arrival
|
2021-12-26 13:17:46 +00:00
|
|
|
curr_time = datetime.datetime.utcnow()
|
|
|
|
blog['object']['arrived'] = curr_time.strftime("%Y-%m-%dT%H:%M:%SZ")
|
2020-10-09 10:05:01 +00:00
|
|
|
|
2020-10-09 10:08:01 +00:00
|
|
|
# change the id, based upon the published time
|
2020-10-07 16:55:15 +00:00
|
|
|
blog['object']['replies']['id'] = idStr
|
|
|
|
blog['object']['replies']['first']['partOf'] = idStr
|
|
|
|
|
|
|
|
blog['id'] = newPostId + '/activity'
|
|
|
|
blog['object']['id'] = newPostId
|
|
|
|
blog['object']['atomUri'] = newPostId
|
|
|
|
blog['object']['url'] = \
|
2021-12-25 17:09:22 +00:00
|
|
|
http_prefix + '://' + domain + '/@news/' + statusNumber
|
2020-10-07 16:55:15 +00:00
|
|
|
blog['object']['published'] = dateStr
|
2020-10-20 13:07:02 +00:00
|
|
|
|
2020-10-20 12:49:12 +00:00
|
|
|
blog['object']['content'] = rssDescription
|
2021-12-25 23:03:28 +00:00
|
|
|
blog['object']['contentMap'][system_language] = rssDescription
|
2020-10-07 16:55:15 +00:00
|
|
|
|
2021-12-26 12:45:03 +00:00
|
|
|
domain_full = get_full_domain(domain, port)
|
2020-10-17 13:59:47 +00:00
|
|
|
|
|
|
|
hashtags = item[6]
|
|
|
|
|
2021-12-26 19:47:06 +00:00
|
|
|
post_id = newPostId.replace('/', '#')
|
2020-10-07 14:10:06 +00:00
|
|
|
|
2020-10-09 12:15:20 +00:00
|
|
|
moderated = item[5]
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
savePost = _newswireHashtagProcessing(session, base_dir,
|
|
|
|
blog, hashtags,
|
2021-12-25 17:09:22 +00:00
|
|
|
http_prefix, domain, port,
|
2021-12-25 22:28:18 +00:00
|
|
|
person_cache, cached_webfingers,
|
2021-12-25 23:45:30 +00:00
|
|
|
federation_list,
|
2021-12-25 21:37:41 +00:00
|
|
|
send_threads, postLog,
|
2021-12-25 23:03:28 +00:00
|
|
|
moderated, url, system_language,
|
2021-10-20 13:33:34 +00:00
|
|
|
translate)
|
2020-10-09 12:15:20 +00:00
|
|
|
|
2020-10-16 21:33:18 +00:00
|
|
|
# save the post and update the index
|
|
|
|
if savePost:
|
2020-10-25 12:00:55 +00:00
|
|
|
# ensure that all hashtags are stored in the json
|
|
|
|
# and appended to the content
|
|
|
|
blog['object']['tag'] = []
|
2020-10-25 11:22:52 +00:00
|
|
|
for tagName in hashtags:
|
|
|
|
htId = tagName.replace('#', '')
|
|
|
|
hashtagUrl = \
|
2021-12-26 10:00:46 +00:00
|
|
|
http_prefix + "://" + domain_full + "/tags/" + htId
|
2020-10-25 11:22:52 +00:00
|
|
|
newTag = {
|
|
|
|
'href': hashtagUrl,
|
|
|
|
'name': tagName,
|
|
|
|
'type': 'Hashtag'
|
|
|
|
}
|
|
|
|
blog['object']['tag'].append(newTag)
|
2020-10-25 12:00:55 +00:00
|
|
|
hashtagHtml = \
|
2020-10-25 12:57:14 +00:00
|
|
|
" <a href=\"" + hashtagUrl + \
|
2020-10-25 12:00:55 +00:00
|
|
|
"\" class=\"addedHashtag\" " + \
|
|
|
|
"rel=\"tag\">#<span>" + \
|
|
|
|
htId + "</span></a>"
|
2021-12-26 11:29:40 +00:00
|
|
|
content = get_base_content_from_post(blog, system_language)
|
2020-10-25 14:37:51 +00:00
|
|
|
if hashtagHtml not in content:
|
|
|
|
if content.endswith('</p>'):
|
|
|
|
content = \
|
|
|
|
content[:len(content) - len('</p>')] + \
|
|
|
|
hashtagHtml + '</p>'
|
|
|
|
else:
|
|
|
|
content += hashtagHtml
|
|
|
|
blog['object']['content'] = content
|
2021-12-25 23:03:28 +00:00
|
|
|
blog['object']['contentMap'][system_language] = content
|
2020-10-25 11:22:52 +00:00
|
|
|
|
2020-10-25 14:21:29 +00:00
|
|
|
# update the newswire tags if new ones have been found by
|
2020-12-22 18:06:23 +00:00
|
|
|
# _newswireHashtagProcessing
|
2020-10-25 14:21:29 +00:00
|
|
|
for tag in hashtags:
|
|
|
|
if tag not in newswire[originalDateStr][6]:
|
|
|
|
newswire[originalDateStr][6].append(tag)
|
2020-10-17 13:39:04 +00:00
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
storeHashTags(base_dir, 'news', domain,
|
2021-12-26 10:00:46 +00:00
|
|
|
http_prefix, domain_full,
|
2021-10-20 13:33:34 +00:00
|
|
|
blog, translate)
|
2020-10-17 13:39:04 +00:00
|
|
|
|
2021-12-26 20:01:37 +00:00
|
|
|
clearFromPostCaches(base_dir, recent_posts_cache, post_id)
|
2021-12-26 14:47:21 +00:00
|
|
|
if save_json(blog, filename):
|
2021-12-26 19:47:06 +00:00
|
|
|
_updateFeedsOutboxIndex(base_dir, domain, post_id + '.json')
|
2020-10-16 21:33:18 +00:00
|
|
|
|
|
|
|
# Save a file containing the time when the post arrived
|
|
|
|
# this can then later be used to construct the news timeline
|
|
|
|
# excluding items during the voting period
|
|
|
|
if moderated:
|
2021-12-25 16:17:53 +00:00
|
|
|
_saveArrivedTime(base_dir, filename,
|
2020-12-22 18:06:23 +00:00
|
|
|
blog['object']['arrived'])
|
2020-10-16 21:33:18 +00:00
|
|
|
else:
|
|
|
|
if os.path.isfile(filename + '.arrived'):
|
2021-09-05 10:17:43 +00:00
|
|
|
try:
|
|
|
|
os.remove(filename + '.arrived')
|
2021-11-25 18:42:38 +00:00
|
|
|
except OSError:
|
2021-10-29 18:48:15 +00:00
|
|
|
print('EX: _convertRSStoActivityPub ' +
|
|
|
|
'unable to delete ' + filename + '.arrived')
|
2020-10-16 21:33:18 +00:00
|
|
|
|
2020-11-08 16:52:57 +00:00
|
|
|
# setting the url here links to the activitypub object
|
|
|
|
# stored locally
|
2020-11-08 16:50:50 +00:00
|
|
|
# newswire[originalDateStr][1] = \
|
|
|
|
# '/users/news/statuses/' + statusNumber
|
2020-11-08 16:52:57 +00:00
|
|
|
|
2020-10-16 21:33:18 +00:00
|
|
|
# set the filename
|
|
|
|
newswire[originalDateStr][3] = filename
|
2020-10-07 13:51:29 +00:00
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _mergeWithPreviousNewswire(oldNewswire: {}, newNewswire: {}) -> None:
|
2020-10-09 09:02:01 +00:00
|
|
|
"""Preserve any votes or generated activitypub post filename
|
|
|
|
as rss feeds are updated
|
|
|
|
"""
|
2020-11-03 14:41:28 +00:00
|
|
|
if not oldNewswire:
|
|
|
|
return
|
|
|
|
|
2020-10-09 09:02:01 +00:00
|
|
|
for published, fields in oldNewswire.items():
|
|
|
|
if not newNewswire.get(published):
|
|
|
|
continue
|
2020-10-13 08:53:59 +00:00
|
|
|
for i in range(1, 5):
|
|
|
|
newNewswire[published][i] = fields[i]
|
2020-10-09 09:02:01 +00:00
|
|
|
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
def runNewswireDaemon(base_dir: str, httpd,
                      http_prefix: str, domain: str, port: int,
                      translate: {}) -> None:
    """Periodically updates RSS feeds

    base_dir: base directory of the instance data
    httpd: the daemon object; its session, newswire dict and the
        various max/limit settings are read and written here
    http_prefix: 'http' or 'https'
    domain: the instance domain
    port: the instance port
    translate: translations dictionary passed through to post creation

    This function never returns: it loops forever, refreshing the
    newswire roughly every 20 minutes (120 ticks of 10 seconds), or
    sooner if a refresh trigger file appears.
    """
    # persisted newswire state, reloaded if httpd.newswire is empty
    newswireStateFilename = base_dir + '/accounts/.newswirestate.json'
    # presence of this file forces an early refresh of the feeds
    refreshFilename = base_dir + '/accounts/.refresh_newswire'

    # initial sleep to allow the system to start up
    time.sleep(50)
    while True:
        # has the session been created yet?
        if not httpd.session:
            print('Newswire daemon waiting for session')
            httpd.session = createSession(httpd.proxy_type)
            if not httpd.session:
                # no session could be established; retry in a minute
                print('Newswire daemon has no session')
                time.sleep(60)
                continue
            else:
                print('Newswire daemon session established')

        # try to update the feeds
        print('Updating newswire feeds')
        newNewswire = \
            getDictFromNewswire(httpd.session, base_dir, domain,
                                httpd.max_newswire_postsPerSource,
                                httpd.max_newswire_feed_size_kb,
                                httpd.maxTags,
                                httpd.max_feed_item_size_kb,
                                httpd.max_newswire_posts,
                                httpd.maxCategoriesFeedItemSizeKb,
                                httpd.system_language,
                                httpd.debug)

        # if there is no in-memory newswire yet (e.g. just after a
        # restart) then try to restore it from the saved state file
        if not httpd.newswire:
            print('Newswire feeds not updated')
            if os.path.isfile(newswireStateFilename):
                print('Loading newswire from file')
                httpd.newswire = load_json(newswireStateFilename)

        # carry over locally-accumulated fields (votes, filenames)
        # from the previous newswire into the freshly fetched one
        print('Merging with previous newswire')
        _mergeWithPreviousNewswire(httpd.newswire, newNewswire)

        # NOTE(review): this assignment happens even when newNewswire
        # is empty, replacing any previously loaded newswire — confirm
        # that is intended
        httpd.newswire = newNewswire
        if newNewswire:
            # persist so the state survives a restart
            save_json(httpd.newswire, newswireStateFilename)
            print('Newswire updated')
        else:
            print('No new newswire')

        # turn the fetched feed items into ActivityPub posts for the
        # 'news' account
        print('Converting newswire to activitypub format')
        _convertRSStoActivityPub(base_dir,
                                 http_prefix, domain, port,
                                 newNewswire, translate,
                                 httpd.recent_posts_cache,
                                 httpd.max_recent_posts,
                                 httpd.session,
                                 httpd.cached_webfingers,
                                 httpd.person_cache,
                                 httpd.federation_list,
                                 httpd.send_threads,
                                 httpd.postLog,
                                 httpd.max_mirrored_articles,
                                 httpd.allow_local_network_access,
                                 httpd.system_language,
                                 httpd.low_bandwidth,
                                 httpd.content_license_url)
        print('Newswire feed converted to ActivityPub')

        # prune old news posts so the outbox does not grow unboundedly
        if httpd.max_news_posts > 0:
            archive_dir = base_dir + '/archive'
            archiveSubdir = \
                archive_dir + '/accounts/news@' + domain + '/outbox'
            print('Archiving news posts')
            archivePostsForPerson(http_prefix, 'news',
                                  domain, base_dir, 'outbox',
                                  archiveSubdir,
                                  httpd.recent_posts_cache,
                                  httpd.max_news_posts)

        # wait a while before the next feeds update
        for tick in range(120):
            time.sleep(10)
            # if a new blog post has been created then stop
            # waiting and recalculate the newswire
            if os.path.isfile(refreshFilename):
                try:
                    os.remove(refreshFilename)
                except OSError:
                    # best-effort removal; report but keep running
                    print('EX: runNewswireDaemon unable to delete ' +
                          str(refreshFilename))
                break
|
|
|
|
2021-12-25 20:34:38 +00:00
|
|
|
def runNewswireWatchdog(project_version: str, httpd) -> None:
    """This tries to keep the newswire update thread running even if it dies

    project_version: version string (currently unused in this body)
    httpd: the daemon object holding the thread handles

    Polls the newswire daemon thread every 50 seconds and restarts it
    from a cloned template whenever it is found dead. Never returns.
    """
    print('Starting newswire watchdog')
    # keep a template thread that can be cloned to restart the daemon.
    # NOTE(review): this clones from httpd.thrPostSchedule rather than
    # httpd.thrNewswireDaemon, which looks like a copy/paste from
    # another watchdog — confirm that clone() only reuses construction
    # parameters so that passing runNewswireDaemon as the target makes
    # this benign
    newswireOriginal = \
        httpd.thrPostSchedule.clone(runNewswireDaemon)
    httpd.thrNewswireDaemon.start()
    while True:
        time.sleep(50)
        # daemon thread still alive: nothing to do this cycle
        if httpd.thrNewswireDaemon.is_alive():
            continue
        # the daemon thread has died: kill it and start a fresh clone
        httpd.thrNewswireDaemon.kill()
        httpd.thrNewswireDaemon = \
            newswireOriginal.clone(runNewswireDaemon)
        httpd.thrNewswireDaemon.start()
        print('Restarting newswire daemon...')