epicyon/newsdaemon.py

778 lines
30 KiB
Python
Raw Normal View History

2020-10-07 12:05:49 +00:00
__filename__ = "newsdaemon.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.1.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
2020-10-17 18:53:08 +00:00
# Example hashtag logic:
#
# if moderated and not #imcoxford then block
# if #pol and contains "westminster" then add #britpol
2020-10-17 19:06:56 +00:00
# if #unwantedtag then block
2020-10-17 18:53:08 +00:00
import os
2020-10-07 12:05:49 +00:00
import time
2020-10-09 10:05:01 +00:00
import datetime
2020-10-20 13:07:02 +00:00
import html
2020-10-19 19:26:58 +00:00
from shutil import rmtree
from subprocess import Popen
2020-10-07 18:46:42 +00:00
from collections import OrderedDict
2020-10-07 12:05:49 +00:00
from newswire import getDictFromNewswire
2020-10-16 21:33:18 +00:00
# from posts import sendSignedJson
2020-10-07 21:26:03 +00:00
from posts import createNewsPost
2020-10-21 10:39:09 +00:00
from posts import archivePostsForPerson
2020-10-11 09:33:31 +00:00
from content import removeHtmlTag
from content import dangerousMarkup
2020-10-17 12:05:41 +00:00
from content import validHashTag
2020-10-09 09:02:01 +00:00
from utils import loadJson
from utils import saveJson
2020-10-07 16:55:15 +00:00
from utils import getStatusNumber
2020-10-18 16:19:28 +00:00
from utils import clearFromPostCaches
2020-10-17 13:39:04 +00:00
from inbox import storeHashTags
from session import createSession
2020-10-07 12:05:49 +00:00
2020-10-08 12:29:40 +00:00
2020-10-09 09:43:34 +00:00
def updateFeedsOutboxIndex(baseDir: str, domain: str, postId: str) -> None:
    """Updates the index used for imported RSS feeds

    The index is the file <baseDir>/accounts/news@<domain>/outbox.index.
    If it does not exist it is created containing only postId. Otherwise
    postId is written as a new first line, unless the text of postId
    already appears somewhere within the index, in which case nothing is
    changed.

    Failures to read or write the index are reported as a warning
    rather than raised.
    """
    basePath = baseDir + '/accounts/news@' + domain
    indexFilename = basePath + '/outbox.index'

    if not os.path.isfile(indexFilename):
        # no index yet, so begin one with this post
        try:
            with open(indexFilename, 'w+') as feedsFile:
                feedsFile.write(postId + '\n')
        except OSError as e:
            print('WARN: Failed to write entry to feeds posts index ' +
                  indexFilename + ' ' + str(e))
        return

    try:
        # one handle serves both the duplicate check and the rewrite,
        # so the file is read once and is always closed
        with open(indexFilename, 'r+') as feedsFile:
            content = feedsFile.read()
            if postId in content:
                return
            # prepend, so the most recently added post is listed first
            feedsFile.seek(0, 0)
            feedsFile.write(postId + '\n' + content)
            print('DEBUG: feeds post added to index')
    except (OSError, UnicodeError) as e:
        print('WARN: Failed to write entry to feeds posts index ' +
              indexFilename + ' ' + str(e))
2020-10-09 12:15:20 +00:00
def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
    """Saves the time when an rss post arrived to a file

    The arrival time text is written to a file having the same name as
    the post but with '.arrived' appended, replacing any previous
    contents. baseDir is accepted for consistency with other functions
    but is not used here.
    """
    # the context manager guarantees that the file is closed even if
    # the write fails
    with open(postFilename + '.arrived', 'w+') as arrivedFile:
        arrivedFile.write(arrived)
2020-10-10 09:36:23 +00:00
def removeControlCharacters(content: str) -> str:
    """Converts html escape sequences within the given text back
    into the characters which they represent.
    Text containing no ampersand is returned as it was given
    """
    if '&' not in content:
        return content
    return html.unescape(content)
2020-10-10 09:36:23 +00:00
2020-10-10 08:54:13 +00:00
def _hashtagRuleMatchText(arg) -> str:
    """Returns the text argument of a 'contains' or 'from' condition
    with any enclosing double quotes removed, or None if there isn't one
    """
    matchStr = None
    if isinstance(arg, str):
        matchStr = arg
    elif isinstance(arg, list) and arg:
        # a subtree leaf, such as ['"some text"']
        matchStr = arg[0]
    if not matchStr or not isinstance(matchStr, str):
        return None
    if matchStr.startswith('"') and matchStr.endswith('"'):
        matchStr = matchStr[1:-1]
    return matchStr


def _hashtagRuleOperand(arg, hashtags: [], moderated: bool,
                        content: str, url: str) -> bool:
    """Evaluates a single operand of a rule. A string operand is true
    if it is one of the hashtags, a list operand is resolved as a subtree
    """
    if isinstance(arg, str):
        return arg in hashtags
    if isinstance(arg, list):
        return hashtagRuleResolve(arg, hashtags, moderated, content, url)
    return False


def hashtagRuleResolve(tree: [], hashtags: [], moderated: bool,
                       content: str, url: str) -> bool:
    """Returns whether the tree for a hashtag rule evaluates to true or false

    tree is a nested list whose first entry is either an operator or
    a single term:
      ['not', x]        true if x is false
      ['contains', x]   true if the text x appears within content,
                        ignoring occurrences of it as a hashtag (#x)
      ['from', x]       true if the text x appears within url
      ['and', x, y...]  true if every operand is true
      ['or', x, y...]   true if any operand is true
      ['xor', x, y...]  true if exactly one operand is true
      ['#tag']          true if #tag is within hashtags
      ['moderated']     the value of the moderated argument
      ['"text"']        always true
    Anything else, including an operator with the wrong number of
    operands, evaluates to false.

    The text for 'contains' is lowercased before matching, so content
    is expected to already be lowercase. The 'from' match ignores case.
    """
    if not tree:
        return False
    op = tree[0]
    if not isinstance(op, str):
        return False

    if op == 'not':
        if len(tree) == 2 and isinstance(tree[1], (str, list)):
            return not _hashtagRuleOperand(tree[1], hashtags, moderated,
                                           content, url)
        return False

    if op == 'contains':
        if len(tree) != 2:
            return False
        matchStr = _hashtagRuleMatchText(tree[1])
        if matchStr is None:
            return False
        matchStrLower = matchStr.lower()
        # the text appearing only as a hashtag doesn't count as a match
        contentWithoutTags = content.replace('#' + matchStrLower, '')
        return matchStrLower in contentWithoutTags

    if op == 'from':
        if len(tree) != 2:
            return False
        matchStr = _hashtagRuleMatchText(tree[1])
        if matchStr is None:
            return False
        # both sides are lowercased, otherwise a url containing any
        # uppercase characters could never match
        return matchStr.lower() in url.lower()

    if op in ('and', 'or', 'xor'):
        if len(tree) < 3:
            return False
        operandValues = (
            _hashtagRuleOperand(arg, hashtags, moderated, content, url)
            for arg in tree[1:]
        )
        if op == 'and':
            return all(operandValues)
        if op == 'or':
            return any(operandValues)
        trueCtr = 0
        for argValue in operandValues:
            if argValue:
                trueCtr += 1
        return trueCtr == 1

    if op.startswith('#') and len(tree) == 1:
        return op in hashtags
    if op.startswith('moderated'):
        return moderated
    if op.startswith('"') and op.endswith('"'):
        return True
    return False
def _isHashtagRuleTerm(operators: [], conditionsStr: str) -> bool:
    """Returns true if the given text is a single term of a rule:
    a hashtag, double quoted text, an operator name or 'moderated'
    """
    if conditionsStr.startswith('#'):
        return True
    if conditionsStr.startswith('"') and conditionsStr.endswith('"'):
        return True
    if conditionsStr in operators:
        return True
    return conditionsStr in ('moderated', 'contains')


def hashtagRuleTree(operators: [],
                    conditionsStr: str,
                    tagsInConditions: [],
                    moderated: bool) -> []:
    """Walks the tree

    Converts the conditions text of a hashtag rule into a nested list,
    such as ['and', ['#pol'], ['contains', ['"westminster"']]].

    operators are the operator names to look for. They are tried in
    the order given and the first one found becomes the root of the
    tree. The text either side of it is then parsed using only the
    operators which come later in the sequence.
    Any hashtags encountered which contain no spaces are appended to
    tagsInConditions if they're not already there.
    moderated is passed down to recursive calls but otherwise unused.

    Returns None if nothing could be parsed from the text.

    NOTE(review): because the first operator found anywhere within
    the text is used as the root, precedence depends only upon the
    ordering of operators. With 'not' listed before 'and' the text
    'moderated and not #tag' becomes ['not', ['#tag']] and the
    'moderated and' section is dropped - confirm whether intended.
    """
    if not conditionsStr:
        return None
    operators = operators or ()
    conditionsStr = conditionsStr.strip()

    tree = None
    if _isHashtagRuleTerm(operators, conditionsStr):
        if conditionsStr.startswith('#') and ' ' not in conditionsStr:
            if conditionsStr not in tagsInConditions:
                tagsInConditions.append(conditionsStr)
        tree = [conditionsStr]

    for opIndex, op in enumerate(operators):
        opMatch = ' ' + op + ' '
        if opMatch in conditionsStr:
            # infix usage, which may occur more than once
            sections = conditionsStr.split(opMatch)
        elif conditionsStr.startswith(op + ' '):
            # prefix usage, such as 'not #tag'
            sections = conditionsStr.split(op + ' ', 1)
        else:
            continue
        tree = [op]
        for subConditionStr in sections:
            result = hashtagRuleTree(operators[opIndex + 1:],
                                     subConditionStr,
                                     tagsInConditions, moderated)
            if result:
                tree.append(result)
        # only the first matching operator is used
        break
    return tree
2020-10-16 21:33:18 +00:00
def newswireHashtagProcessing(session, baseDir: str, postJsonObject: {},
                              hashtags: [], httpPrefix: str,
                              domain: str, port: int,
                              personCache: {},
                              cachedWebfingers: {},
                              federationList: [],
                              sendThreads: [], postLog: [],
                              moderated: bool, url: str) -> bool:
    """Applies hashtag rules to a news post.
    Returns true if the post should be saved to the news timeline
    of this instance

    Rules are read from <baseDir>/accounts/hashtagrules.txt, one per
    line, in the form:
        if <conditions> then <action>
    where the action is one of:
        add #tag     adds the tag to hashtags and to the post
        remove #tag  removes the tag from hashtags and from the post
        block        (or drop) returns false straight away
    Lines not having that form are ignored. If there is no rules file
    then true is returned.

    Both hashtags and postJsonObject are altered in place.
    session, personCache, cachedWebfingers, federationList, sendThreads
    and postLog are not used at present. They're reserved for routing
    posts to other instances (see the TODO below).
    """
    rulesFilename = baseDir + '/accounts/hashtagrules.txt'
    if not os.path.isfile(rulesFilename):
        return True
    rules = []
    with open(rulesFilename, "r") as f:
        rules = f.readlines()

    domainFull = domain
    if port:
        if port != 80 and port != 443:
            domainFull = domain + ':' + str(port)

    postObject = postJsonObject['object']

    # get the full text content of the post, in lower case.
    # This is what 'contains' conditions get matched against and it
    # is kept separate from the html which actions modify, so that
    # an earlier add or remove can't alter how later rules match
    content = ''
    if postObject.get('content'):
        content += postObject['content']
    if postObject.get('summary'):
        content += ' ' + postObject['summary']
    content = content.lower()

    operators = ('not', 'and', 'or', 'xor', 'from', 'contains')
    for ruleStr in rules:
        if not ruleStr:
            continue
        if not ruleStr.startswith('if '):
            continue
        if ' then ' not in ruleStr:
            continue
        conditionsStr = ruleStr.split('if ', 1)[1]
        conditionsStr = conditionsStr.split(' then ')[0]
        tagsInConditions = []
        tree = hashtagRuleTree(operators, conditionsStr,
                               tagsInConditions, moderated)
        if not hashtagRuleResolve(tree, hashtags, moderated, content, url):
            continue
        # the condition matches, so do something
        actionStr = ruleStr.split(' then ')[1].strip()

        # add a hashtag
        if actionStr.startswith('add '):
            addHashtag = actionStr.split('add ', 1)[1].strip()
            if addHashtag.startswith('#'):
                if addHashtag not in hashtags:
                    hashtags.append(addHashtag)
                htId = addHashtag.replace('#', '')
                if validHashTag(htId):
                    hashtagUrl = \
                        httpPrefix + "://" + domainFull + "/tags/" + htId
                    newTag = {
                        'href': hashtagUrl,
                        'name': addHashtag,
                        'type': 'Hashtag'
                    }
                    # does the tag already exist?
                    postTags = postObject.setdefault('tag', [])
                    addTagObject = None
                    for t in postTags:
                        if t.get('type') and t.get('name'):
                            if t['type'] == 'Hashtag' and \
                               t['name'] == addHashtag:
                                addTagObject = t
                                break
                    # append the tag if it wasn't found
                    if not addTagObject:
                        postTags.append(newTag)
                    # add corresponding html to the post content
                    hashtagHtml = \
                        " <a href=\"" + hashtagUrl + \
                        "\" class=\"addedHashtag\" " + \
                        "rel=\"tag\">#<span>" + \
                        htId + "</span></a>"
                    postContent = postObject.get('content') or ''
                    if hashtagHtml not in postContent:
                        if postContent.endswith('</p>'):
                            # keep the link within the final paragraph
                            postContent = \
                                postContent[:len(postContent) -
                                            len('</p>')] + \
                                hashtagHtml + '</p>'
                        else:
                            postContent += hashtagHtml
                        postObject['content'] = postContent
                        storeHashTags(baseDir, 'news', postJsonObject)

        # remove a hashtag
        if actionStr.startswith('remove '):
            rmHashtag = actionStr.split('remove ', 1)[1].strip()
            if rmHashtag.startswith('#'):
                if rmHashtag in hashtags:
                    hashtags.remove(rmHashtag)
                htId = rmHashtag.replace('#', '')
                hashtagUrl = \
                    httpPrefix + "://" + domainFull + "/tags/" + htId
                # remove tag html from the post content
                hashtagHtml = \
                    "<a href=\"" + hashtagUrl + \
                    "\" class=\"addedHashtag\" " + \
                    "rel=\"tag\">#<span>" + \
                    htId + "</span></a>"
                postContent = postObject.get('content') or ''
                if hashtagHtml in postContent:
                    # the link was added with a leading space, so
                    # collapse any double space left behind
                    postContent = \
                        postContent.replace(hashtagHtml,
                                            '').replace('  ', ' ')
                    postObject['content'] = postContent
                postTags = postObject.get('tag') or []
                rmTagObject = None
                for t in postTags:
                    if t.get('type') and t.get('name'):
                        if t['type'] == 'Hashtag' and \
                           t['name'] == rmHashtag:
                            rmTagObject = t
                            break
                if rmTagObject:
                    postTags.remove(rmTagObject)

        # Block this item
        if actionStr.startswith('block') or actionStr.startswith('drop'):
            return False

        # TODO
        # If routing to another instance
        # sendSignedJson(postJsonObject: {}, session, baseDir: str,
        #                nickname: str, domain: str, port: int,
        #                toNickname: str, toDomain: str, toPort: int,
        #                cc: str, httpPrefix: str, False, False,
        #                federationList: [],
        #                sendThreads: [], postLog: [],
        #                cachedWebfingers: {},
        #                personCache: {}, False, __version__) -> int:
    return True
2020-10-20 09:27:58 +00:00
def createNewsMirror(baseDir: str, domain: str,
                     postIdNumber: str, url: str,
                     maxMirroredArticles: int) -> bool:
    """Creates a local mirror of a news article

    The article at url is downloaded with wget into the directory
    <baseDir>/accounts/newsmirror/<postIdNumber> and postIdNumber is
    then appended to the index file <baseDir>/accounts/newsmirror.txt.
    If the domain of this instance ends with .onion then the download
    is run via torsocks.

    Before downloading, if maxMirroredArticles is greater than zero
    and there are more mirrored articles than that, then mirrors are
    removed in the order in which they appear within the index until
    the limit is reached.

    Always returns true, including when the url is rejected or the
    download fails.
    """
    # the url comes from a feed, so don't accept anything which looks
    # like shell syntax or like a command line option
    if '|' in url or '>' in url or url.startswith('-'):
        return True

    mirrorDir = baseDir + '/accounts/newsmirror'
    if not os.path.isdir(mirrorDir):
        os.mkdir(mirrorDir)

    # count the directories. Only the top level is wanted, with one
    # directory per article, so stop after the first step of the walk
    noOfDirs = 0
    for _, dirs, _ in os.walk(mirrorDir):
        noOfDirs = len(dirs)
        break

    mirrorIndexFilename = baseDir + '/accounts/newsmirror.txt'

    if maxMirroredArticles > 0 and noOfDirs > maxMirroredArticles:
        if not os.path.isfile(mirrorIndexFilename):
            # no index for mirrors found
            return True
        removals = set()
        with open(mirrorIndexFilename, 'r') as indexFile:
            # remove the oldest directories
            for line in indexFile:
                if noOfDirs <= maxMirroredArticles:
                    break
                postId = line.strip()
                if not postId:
                    # a blank line would otherwise resolve to the
                    # mirror directory itself
                    continue
                mirrorArticleDir = mirrorDir + '/' + postId
                if os.path.isdir(mirrorArticleDir):
                    rmtree(mirrorArticleDir)
                    removals.add(postId)
                    noOfDirs -= 1

        # remove the corresponding index entries
        if removals:
            with open(mirrorIndexFilename, 'r') as indexFile:
                indexLines = indexFile.readlines()
            # compare whole lines, so that an id which happens to be
            # the ending of a longer id doesn't damage that entry
            remaining = [line for line in indexLines
                         if line.strip() not in removals]
            with open(mirrorIndexFilename, "w+") as indexFile:
                indexFile.write(''.join(remaining))

    mirrorArticleDir = mirrorDir + '/' + postIdNumber
    if os.path.isdir(mirrorArticleDir):
        # already mirrored
        return True

    # Download the files. The command is given as a list of arguments
    # and run without a shell, so that nothing within the url can be
    # interpreted as shell syntax
    commandArgs = []
    if domain.endswith('.onion'):
        # for onion instances mirror via tor
        commandArgs.append('/usr/bin/torsocks')
    commandArgs += ['/usr/bin/wget', '-mkEpnp', '-e', 'robots=off',
                    '-P', mirrorArticleDir, url]
    try:
        Popen(commandArgs).wait()
    except OSError as e:
        print('WARN: unable to run mirror command ' + str(e))
        return True

    if not os.path.isdir(mirrorArticleDir):
        print('WARN: failed to mirror ' + url)
        return True

    # append the post Id number to the index file,
    # which is created if it doesn't already exist
    with open(mirrorIndexFilename, "a+") as indexFile:
        indexFile.write(postIdNumber + '\n')

    return True
def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
                            domain: str, port: int,
                            newswire: {},
                            translate: {},
                            recentPostsCache: {}, maxRecentPosts: int,
                            session, cachedWebfingers: {},
                            personCache: {},
                            federationList: [],
                            sendThreads: [], postLog: [],
                            maxMirroredArticles: int,
                            allowLocalNetworkAccess: bool) -> None:
    """Converts rss items in a newswire into posts

    newswire is a dict of feed items indexed by published date
    string. The fields of each item which are used here are:
        [0] title, [1] url, [3] filename of the stored post,
        [4] description, [5] moderated, [6] hashtags, [7] mirrored
    Items are handled oldest first. For each item not already stored
    within the outbox of the news account a post is created, the
    hashtag rules are applied to it and, unless a rule blocks it, it
    is saved and added to the outbox index. The filename of the post
    is written back into field [3] of the newswire item.
    """
    if not newswire:
        return

    basePath = baseDir + '/accounts/news@' + domain + '/outbox'
    if not os.path.isdir(basePath):
        os.mkdir(basePath)

    # this doesn't vary between items, so only calculate it once
    domainFull = domain
    if port:
        if port != 80 and port != 443:
            domainFull = domain + ':' + str(port)

    # oldest items first
    newswireReverse = \
        OrderedDict(sorted(newswire.items(), reverse=False))

    for dateStr, item in newswireReverse.items():
        originalDateStr = dateStr
        # convert the date to the format used by ActivityPub
        if '+00:00' in dateStr:
            dateStr = dateStr.replace(' ', 'T')
            dateStr = dateStr.replace('+00:00', 'Z')
        else:
            try:
                dateStrWithOffset = \
                    datetime.datetime.strptime(dateStr,
                                               "%Y-%m-%d %H:%M:%S%z")
            except ValueError:
                # one bad date shouldn't stop the other items
                print('WARN: newswire date not recognised ' + dateStr)
                continue
            # NOTE(review): this keeps the clock time of the original
            # timezone and labels it Z, without converting to UTC.
            # Changing that would alter the ids of posts which were
            # previously stored, so it has been left as it was
            dateStr = dateStrWithOffset.strftime("%Y-%m-%dT%H:%M:%SZ")

        statusNumber, published = getStatusNumber(dateStr)
        newPostId = \
            httpPrefix + '://' + domain + \
            '/users/news/statuses/' + statusNumber

        # file where the post is stored
        filename = basePath + '/' + newPostId.replace('/', '#') + '.json'
        if os.path.isfile(filename):
            # don't create the post if it already exists
            # set the url
            # newswire[originalDateStr][1] = \
            #     '/users/news/statuses/' + statusNumber
            # set the filename
            newswire[originalDateStr][3] = filename
            continue

        rssTitle = removeControlCharacters(item[0])
        url = item[1]
        if dangerousMarkup(url, allowLocalNetworkAccess) or \
           dangerousMarkup(rssTitle, allowLocalNetworkAccess):
            continue

        # get the rss description if it exists
        rssDescription = removeControlCharacters(item[4])
        if rssDescription.startswith('<![CDATA['):
            rssDescription = rssDescription.replace('<![CDATA[', '')
            rssDescription = rssDescription.replace(']]>', '')
            rssDescription = rssDescription.replace(']]', '')
        if '&' in rssDescription:
            rssDescription = html.unescape(rssDescription)
        # the paragraph needs to be closed, not opened a second time
        rssDescription = '<p>' + rssDescription + '</p>'

        mirrored = item[7]
        postUrl = url
        if mirrored and '://' in url:
            # link to the local mirror rather than the original site
            postUrl = '/newsmirror/' + statusNumber + '/' + \
                url.split('://')[1]
            if postUrl.endswith('/'):
                postUrl += 'index.html'
            else:
                postUrl += '/index.html'

        # add the off-site link to the description
        if rssDescription and \
           not dangerousMarkup(rssDescription, allowLocalNetworkAccess):
            rssDescription += \
                '<br><a href="' + postUrl + '">' + \
                translate['Read more...'] + '</a>'
        else:
            # the description can't be trusted, so only show the link
            rssDescription = \
                '<a href="' + postUrl + '">' + \
                translate['Read more...'] + '</a>'

        # remove image dimensions
        if '<img' in rssDescription:
            rssDescription = removeHtmlTag(rssDescription, 'width')
            rssDescription = removeHtmlTag(rssDescription, 'height')

        followersOnly = False
        useBlurhash = False
        # NOTE: the id when the post is created will not be
        # consistent (it's based on the current time, not the
        # published time), so we change that later
        blog = createNewsPost(baseDir,
                              domain, port, httpPrefix,
                              rssDescription,
                              followersOnly, False,
                              None, None, None, useBlurhash,
                              rssTitle)
        if not blog:
            continue

        if mirrored:
            if not createNewsMirror(baseDir, domain, statusNumber,
                                    url, maxMirroredArticles):
                continue

        idStr = \
            httpPrefix + '://' + domain + '/users/news' + \
            '/statuses/' + statusNumber + '/replies'
        blog['news'] = True

        # note the time of arrival
        currTime = datetime.datetime.utcnow()
        blog['object']['arrived'] = currTime.strftime("%Y-%m-%dT%H:%M:%SZ")

        # change the id, based upon the published time
        blog['object']['replies']['id'] = idStr
        blog['object']['replies']['first']['partOf'] = idStr

        blog['id'] = newPostId + '/activity'
        blog['object']['id'] = newPostId
        blog['object']['atomUri'] = newPostId
        blog['object']['url'] = \
            httpPrefix + '://' + domain + '/@news/' + statusNumber
        blog['object']['published'] = dateStr

        blog['object']['content'] = rssDescription
        blog['object']['contentMap']['en'] = rssDescription

        hashtags = item[6]

        postId = newPostId.replace('/', '#')

        moderated = item[5]

        savePost = newswireHashtagProcessing(session, baseDir, blog, hashtags,
                                             httpPrefix, domain, port,
                                             personCache, cachedWebfingers,
                                             federationList,
                                             sendThreads, postLog,
                                             moderated, url)

        # save the post and update the index
        if savePost:
            # ensure that all hashtags are stored in the json
            # and appended to the content
            blog['object']['tag'] = []
            for tagName in hashtags:
                htId = tagName.replace('#', '')
                hashtagUrl = \
                    httpPrefix + "://" + domainFull + "/tags/" + htId
                newTag = {
                    'href': hashtagUrl,
                    'name': tagName,
                    'type': 'Hashtag'
                }
                blog['object']['tag'].append(newTag)
                hashtagHtml = \
                    " <a href=\"" + hashtagUrl + \
                    "\" class=\"addedHashtag\" " + \
                    "rel=\"tag\">#<span>" + \
                    htId + "</span></a>"
                content = blog['object']['content']
                if hashtagHtml not in content:
                    if content.endswith('</p>'):
                        content = \
                            content[:len(content) - len('</p>')] + \
                            hashtagHtml + '</p>'
                    else:
                        content += hashtagHtml
                    blog['object']['content'] = content

            # update the newswire tags if new ones have been found by
            # newswireHashtagProcessing
            for tag in hashtags:
                if tag not in newswire[originalDateStr][6]:
                    newswire[originalDateStr][6].append(tag)

            storeHashTags(baseDir, 'news', blog)

            clearFromPostCaches(baseDir, recentPostsCache, postId)
            if saveJson(blog, filename):
                updateFeedsOutboxIndex(baseDir, domain, postId + '.json')

                # Save a file containing the time when the post arrived
                # this can then later be used to construct the news timeline
                # excluding items during the voting period
                if moderated:
                    saveArrivedTime(baseDir, filename,
                                    blog['object']['arrived'])
                else:
                    if os.path.isfile(filename + '.arrived'):
                        os.remove(filename + '.arrived')

                # setting the url here links to the activitypub object
                # stored locally
                # newswire[originalDateStr][1] = \
                #     '/users/news/statuses/' + statusNumber

                # set the filename
                newswire[originalDateStr][3] = filename
2020-10-09 09:02:01 +00:00
def mergeWithPreviousNewswire(oldNewswire: {}, newNewswire: {}) -> None:
    """Preserve any votes or generated activitypub post filename
    as rss feeds are updated

    For every published date which appears within both newswires,
    fields 1 to 4 of the old item are copied over the same fields of
    the new item. newNewswire is altered in place.
    Nothing happens if either newswire is empty or None.
    """
    if not oldNewswire or not newNewswire:
        # a failed feed update can leave nothing to merge into
        return

    for published, fields in oldNewswire.items():
        if not newNewswire.get(published):
            continue
        for i in range(1, 5):
            newNewswire[published][i] = fields[i]
2020-10-09 09:02:01 +00:00
def runNewswireDaemon(baseDir: str, httpd,
                      httpPrefix: str, domain: str, port: int,
                      translate: {}) -> None:
    """Runs forever, fetching the newswire feeds at intervals,
    converting their items into posts from the news account and
    archiving the older ones
    """
    stateFilename = baseDir + '/accounts/.newswirestate.json'

    # give the rest of the system a chance to start first
    time.sleep(50)
    while True:
        # a session is needed in order to fetch the feeds
        if not httpd.session:
            print('Newswire daemon waiting for session')
            httpd.session = createSession(httpd.proxyType)
            if httpd.session:
                print('Newswire daemon session established')
            else:
                print('Newswire daemon has no session')
                time.sleep(60)
                continue

        # fetch the current state of the feeds
        latestNewswire = \
            getDictFromNewswire(httpd.session, baseDir, domain,
                                httpd.maxNewswirePostsPerSource,
                                httpd.maxNewswireFeedSizeKb,
                                httpd.maxTags,
                                httpd.maxFeedItemSizeKb,
                                httpd.maxNewswirePosts,
                                httpd.maxCategoriesFeedItemSizeKb)

        # with nothing in memory fall back to the state saved on disk
        if not httpd.newswire and os.path.isfile(stateFilename):
            httpd.newswire = loadJson(stateFilename)

        # carry over details from the previous newswire
        mergeWithPreviousNewswire(httpd.newswire, latestNewswire)

        httpd.newswire = latestNewswire
        if latestNewswire:
            saveJson(httpd.newswire, stateFilename)
            print('Newswire updated')

        convertRSStoActivityPub(baseDir,
                                httpPrefix, domain, port,
                                latestNewswire, translate,
                                httpd.recentPostsCache,
                                httpd.maxRecentPosts,
                                httpd.session,
                                httpd.cachedWebfingers,
                                httpd.personCache,
                                httpd.federationList,
                                httpd.sendThreads,
                                httpd.postLog,
                                httpd.maxMirroredArticles,
                                httpd.allowLocalNetworkAccess)
        print('Newswire feed converted to ActivityPub')

        # limit the number of posts kept within the news outbox
        if httpd.maxNewsPosts > 0:
            archiveSubdir = \
                baseDir + '/archive' + \
                '/accounts/news@' + domain + '/outbox'
            archivePostsForPerson(httpPrefix, 'news',
                                  domain, baseDir, 'outbox',
                                  archiveSubdir,
                                  httpd.recentPostsCache,
                                  httpd.maxNewsPosts)

        # wait a while before the next feeds update
        time.sleep(1200)
def runNewswireWatchdog(projectVersion: str, httpd) -> None:
    """This tries to keep the newswire update thread running even if it dies

    Starts httpd.thrNewswireDaemon and then checks it every 50 seconds.
    If it is no longer alive then it is replaced by a new thread which
    runs runNewswireDaemon, and that is started.
    projectVersion is not used. This function never returns.
    """
    print('Starting newswire watchdog')
    # Keep a template from which replacement threads get made.
    # It is taken from the newswire thread itself rather than from
    # the post scheduler thread
    # NOTE(review): presumably clone() reuses the arguments of the
    # thread it is called on - confirm against the thread class
    newswireOriginal = \
        httpd.thrNewswireDaemon.clone(runNewswireDaemon)
    httpd.thrNewswireDaemon.start()
    while True:
        time.sleep(50)
        # is_alive is used because Thread.isAlive was removed
        # in Python 3.9
        if httpd.thrNewswireDaemon.is_alive():
            continue
        httpd.thrNewswireDaemon.kill()
        httpd.thrNewswireDaemon = \
            newswireOriginal.clone(runNewswireDaemon)
        httpd.thrNewswireDaemon.start()
        print('Restarting newswire daemon...')