epicyon/newsdaemon.py

276 lines
9.7 KiB
Python
Raw Normal View History

2020-10-07 12:05:49 +00:00
# Module metadata: file name, authorship, license, version and status
__filename__ = "newsdaemon.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.1.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
import os
2020-10-07 12:05:49 +00:00
import time
2020-10-09 10:05:01 +00:00
import datetime
2020-10-07 18:46:42 +00:00
from collections import OrderedDict
2020-10-07 12:05:49 +00:00
from newswire import getDictFromNewswire
2020-10-07 21:26:03 +00:00
from posts import createNewsPost
2020-10-11 09:33:31 +00:00
from content import removeHtmlTag
from content import dangerousMarkup
2020-10-09 09:02:01 +00:00
from utils import loadJson
from utils import saveJson
2020-10-07 16:55:15 +00:00
from utils import getStatusNumber
2020-10-07 12:05:49 +00:00
2020-10-08 12:29:40 +00:00
2020-10-09 09:43:34 +00:00
def updateFeedsOutboxIndex(baseDir: str, domain: str, postId: str) -> None:
    """Updates the index used for imported RSS feeds

    The index is the file accounts/news@<domain>/outbox.index beneath
    baseDir. postId is written as a new first line of the index unless
    the index already contains it. If the index does not exist yet then
    it is created with postId as its only line.

    Failures while reading or updating an existing index are reported
    with a printed warning rather than raised.
    """
    basePath = baseDir + '/accounts/news@' + domain
    indexFilename = basePath + '/outbox.index'

    if not os.path.isfile(indexFilename):
        # first entry: create the index
        with open(indexFilename, 'w+') as feedsFile:
            feedsFile.write(postId + '\n')
        return

    try:
        # read the index once, via a handle which is always closed
        with open(indexFilename, 'r+') as feedsFile:
            content = feedsFile.read()
            if postId in content:
                # already indexed
                return
            # newest entries go at the top of the index
            feedsFile.seek(0, 0)
            feedsFile.write(postId + '\n' + content)
            print('DEBUG: feeds post added to index')
    except OSError as e:
        print('WARN: Failed to write entry to feeds posts index ' +
              indexFilename + ' ' + str(e))
2020-10-09 12:15:20 +00:00
def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
    """Saves the time when an rss post arrived to a file

    The arrived string is written to postFilename with '.arrived'
    appended, replacing any previous contents.
    baseDir is not used by this function.
    """
    # the context manager closes the file even if the write fails
    with open(postFilename + '.arrived', 'w+') as arrivedFile:
        arrivedFile.write(arrived)
2020-10-10 09:36:23 +00:00
def removeControlCharacters(content: str) -> str:
    """Swaps a fixed set of numeric character codes for plain text
    Both the '&NNNN;' and '&#NNNN;' spellings are replaced.
    TODO this is hacky and a better solution is needed,
    the unicode is messing up somehow
    """
    replacements = (
        ("8211", "-"),
        ("8230", "..."),
        ("8216", "'"),
        ("8217", "'"),
        ("8220", '"'),
        ("8221", '"')
    )
    for entityCode, plainText in replacements:
        for prefix in ('&', '&#'):
            content = content.replace(prefix + entityCode + ';', plainText)
    return content
2020-10-10 09:36:23 +00:00
2020-10-10 08:54:13 +00:00
def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
                            domain: str, port: int,
                            newswire: {},
                            translate: {},
                            recentPostsCache: {}, maxRecentPosts: int,
                            session, cachedWebfingers: {},
                            personCache: {}) -> None:
    """Converts rss items in a newswire into posts

    newswire maps a published date string to a list of fields. The
    fields read here are [0] title, [1] url, [4] description and
    [5] moderated flag. For every item which has, or gets, a post file
    in the news account outbox, field [1] is replaced by the local
    status path and field [3] is set to the post filename.

    Items are handled oldest first. Items whose url or title contain
    dangerous markup are skipped. translate must contain the key
    'Read more...'.

    recentPostsCache, maxRecentPosts, session, cachedWebfingers and
    personCache are not used by this function.
    """
    basePath = baseDir + '/accounts/news@' + domain + '/outbox'
    if not os.path.isdir(basePath):
        os.mkdir(basePath)

    # oldest items first
    newswireReverse = \
        OrderedDict(sorted(newswire.items(), reverse=False))

    for dateStr, item in newswireReverse.items():
        originalDateStr = dateStr
        # convert the date to the format used by ActivityPub
        dateStr = dateStr.replace(' ', 'T')
        dateStr = dateStr.replace('+00:00', 'Z')

        statusNumber, _ = getStatusNumber(dateStr)
        localUrl = '/users/news/statuses/' + statusNumber
        newPostId = httpPrefix + '://' + domain + localUrl

        # file where the post is stored
        filename = basePath + '/' + newPostId.replace('/', '#') + '.json'
        if os.path.isfile(filename):
            # don't create the post if it already exists
            # set the url
            newswire[originalDateStr][1] = localUrl
            # set the filename
            newswire[originalDateStr][3] = filename
            continue

        rssTitle = removeControlCharacters(item[0])
        url = item[1]
        if dangerousMarkup(url) or dangerousMarkup(rssTitle):
            continue

        # get the rss description if it exists
        rssDescription = removeControlCharacters(item[4])
        if rssDescription.startswith('<![CDATA['):
            rssDescription = rssDescription.replace('<![CDATA[', '')
            rssDescription = rssDescription.replace(']]>', '')
        # Only wrap a non-empty description, so that an empty one falls
        # through to the link-only form below, and close the paragraph
        # with a proper end tag
        if rssDescription:
            rssDescription = '<p>' + rssDescription + '</p>'

        # add the off-site link to the description
        # NOTE(review): url is placed in the href unescaped; it has only
        # been screened by dangerousMarkup above
        readMoreLink = \
            '<a href="' + url + '">' + \
            translate['Read more...'] + '</a>'
        if rssDescription and not dangerousMarkup(rssDescription):
            rssDescription += '<br>' + readMoreLink
        else:
            rssDescription = readMoreLink

        # remove image dimensions
        if '<img' in rssDescription:
            rssDescription = removeHtmlTag(rssDescription, 'width')
            rssDescription = removeHtmlTag(rssDescription, 'height')

        followersOnly = False
        useBlurhash = False
        # NOTE: the id when the post is created will not be
        # consistent (it's based on the current time, not the
        # published time), so we change that later
        blog = createNewsPost(baseDir,
                              domain, port, httpPrefix,
                              rssDescription,
                              followersOnly, False,
                              None, None, None, useBlurhash,
                              rssTitle)
        if not blog:
            continue

        idStr = \
            httpPrefix + '://' + domain + '/users/news' + \
            '/statuses/' + statusNumber + '/replies'
        blog['news'] = True

        # note the time of arrival
        currTime = datetime.datetime.utcnow()
        blog['object']['arrived'] = currTime.strftime("%Y-%m-%dT%H:%M:%SZ")

        # change the id, based upon the published time
        blog['object']['replies']['id'] = idStr
        blog['object']['replies']['first']['partOf'] = idStr

        blog['id'] = newPostId + '/activity'
        blog['object']['id'] = newPostId
        blog['object']['atomUri'] = newPostId
        blog['object']['url'] = \
            httpPrefix + '://' + domain + '/@news/' + statusNumber
        blog['object']['published'] = dateStr

        postId = newPostId.replace('/', '#')

        moderated = item[5]

        # save the post and update the index
        if saveJson(blog, filename):
            updateFeedsOutboxIndex(baseDir, domain, postId + '.json')

            # Save a file containing the time when the post arrived
            # this can then later be used to construct the news timeline
            # excluding items during the voting period
            if moderated:
                saveArrivedTime(baseDir, filename,
                                blog['object']['arrived'])
            elif os.path.isfile(filename + '.arrived'):
                os.remove(filename + '.arrived')

            # set the url
            newswire[originalDateStr][1] = localUrl
            # set the filename
            newswire[originalDateStr][3] = filename
2020-10-09 09:02:01 +00:00
def mergeWithPreviousNewswire(oldNewswire: {}, newNewswire: {}) -> None:
    """Preserve any votes or generated activitypub post filename
    as rss feeds are updated

    For every published date present in both dictionaries, fields 1 to 4
    of the old entry are copied over the same fields of the new entry.
    newNewswire is modified in place. Nothing is done if either
    dictionary is missing or empty.
    """
    if not oldNewswire or not newNewswire:
        # nothing to carry over, eg. no previous state could be loaded
        return

    for published, fields in oldNewswire.items():
        newFields = newNewswire.get(published)
        if not newFields:
            continue
        # copy fields 1-4 without running past the end of a short entry
        endIndex = min(5, len(fields), len(newFields))
        for i in range(1, endIndex):
            newFields[i] = fields[i]
2020-10-09 09:02:01 +00:00
def runNewswireDaemon(baseDir: str, httpd,
                      httpPrefix: str, domain: str, port: int,
                      translate: {}) -> None:
    """Loops forever, refreshing the newswire from its RSS feeds,
    saving its state and converting the items into ActivityPub posts
    """
    stateFilename = baseDir + '/accounts/.newswirestate.json'

    # give the rest of the system time to come up first
    time.sleep(50)
    while True:
        # nothing can be fetched until a session exists
        if not httpd.session:
            print('Newswire daemon waiting for session')
            time.sleep(60)
            continue

        # fetch the latest feeds, backing off for a while on failure
        try:
            latestNewswire = \
                getDictFromNewswire(httpd.session, baseDir,
                                    httpd.maxNewswirePostsPerSource,
                                    httpd.maxNewswireFeedSizeKb)
        except Exception as e:
            print('WARN: unable to update newswire ' + str(e))
            time.sleep(120)
            continue

        # with no newswire in memory fall back to the saved state
        if not httpd.newswire and os.path.isfile(stateFilename):
            httpd.newswire = loadJson(stateFilename)

        # carry previous fields over, then switch to the new newswire
        mergeWithPreviousNewswire(httpd.newswire, latestNewswire)
        httpd.newswire = latestNewswire
        saveJson(httpd.newswire, stateFilename)
        print('Newswire updated')

        convertRSStoActivityPub(baseDir,
                                httpPrefix, domain, port,
                                latestNewswire, translate,
                                httpd.recentPostsCache,
                                httpd.maxRecentPosts,
                                httpd.session,
                                httpd.cachedWebfingers,
                                httpd.personCache)
        print('Newswire feed converted to ActivityPub')

        # pause before the next round of feed updates
        time.sleep(1200)
def runNewswireWatchdog(projectVersion: str, httpd) -> None:
    """This tries to keep the newswire update thread running even if it dies

    Starts httpd.thrNewswireDaemon and then checks it every 50 seconds.
    If the thread is no longer alive it is killed, replaced by a fresh
    clone running runNewswireDaemon, and started again.
    projectVersion is not used by this function. Never returns.
    """
    print('Starting newswire watchdog')
    # NOTE(review): the template is cloned from thrPostSchedule rather
    # than thrNewswireDaemon - confirm that this is intended
    newswireOriginal = \
        httpd.thrPostSchedule.clone(runNewswireDaemon)
    httpd.thrNewswireDaemon.start()
    while True:
        time.sleep(50)
        daemonThread = httpd.thrNewswireDaemon
        # Thread.isAlive() was removed in Python 3.9, so prefer
        # is_alive() and only fall back to the old name if needed
        checkAlive = getattr(daemonThread, 'is_alive', None)
        if checkAlive is None:
            checkAlive = daemonThread.isAlive
        if checkAlive():
            continue
        daemonThread.kill()
        httpd.thrNewswireDaemon = \
            newswireOriginal.clone(runNewswireDaemon)
        httpd.thrNewswireDaemon.start()
        print('Restarting newswire daemon...')