epicyon/newsdaemon.py

__filename__ = "newsdaemon.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.1.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"

import os
import time
import datetime
from collections import OrderedDict
from newswire import getDictFromNewswire
from posts import createNewsPost
from content import removeHtmlTag
from content import dangerousMarkup
from utils import loadJson
from utils import saveJson
from utils import getStatusNumber


def updateFeedsOutboxIndex(baseDir: str, domain: str, postId: str) -> None:
    """Updates the index used for imported RSS feeds
    """
    basePath = baseDir + '/accounts/news@' + domain
    indexFilename = basePath + '/outbox.index'

    if os.path.isfile(indexFilename):
        if postId not in open(indexFilename).read():
            try:
                with open(indexFilename, 'r+') as feedsFile:
                    content = feedsFile.read()
                    feedsFile.seek(0, 0)
                    feedsFile.write(postId + '\n' + content)
                    print('DEBUG: feeds post added to index')
            except Exception as e:
                print('WARN: Failed to write entry to feeds posts index ' +
                      indexFilename + ' ' + str(e))
    else:
        feedsFile = open(indexFilename, 'w+')
        if feedsFile:
            feedsFile.write(postId + '\n')
            feedsFile.close()


def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
    """Saves the time when an rss post arrived to a file
    """
    arrivedFile = open(postFilename + '.arrived', 'w+')
    if arrivedFile:
        arrivedFile.write(arrived)
        arrivedFile.close()


def removeControlCharacters(content: str) -> str:
    """TODO this is hacky and a better solution is needed
    the unicode is messing up somehow
    """
    lookups = {
        "8211": "-",
        "8230": "...",
        "8216": "'",
        "8217": "'",
        "8220": '"',
        "8221": '"'
    }
    for code, ch in lookups.items():
        content = content.replace('&' + code + ';', ch)
        content = content.replace('&#' + code + ';', ch)
    return content


def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
                            domain: str, port: int,
                            newswire: {},
                            translate: {},
                            recentPostsCache: {}, maxRecentPosts: int,
                            session, cachedWebfingers: {},
                            personCache: {}) -> None:
    """Converts rss items in a newswire into posts
    """
    basePath = baseDir + '/accounts/news@' + domain + '/outbox'
    if not os.path.isdir(basePath):
        os.mkdir(basePath)

    # oldest items first
    newswireReverse = \
        OrderedDict(sorted(newswire.items(), reverse=False))

    for dateStr, item in newswireReverse.items():
        originalDateStr = dateStr
        # convert the date to the format used by ActivityPub
        dateStr = dateStr.replace(' ', 'T')
        dateStr = dateStr.replace('+00:00', 'Z')

        statusNumber, published = getStatusNumber(dateStr)
        newPostId = \
            httpPrefix + '://' + domain + \
            '/users/news/statuses/' + statusNumber

        # file where the post is stored
        filename = basePath + '/' + newPostId.replace('/', '#') + '.json'
        if os.path.isfile(filename):
            # don't create the post if it already exists
            # set the url
            newswire[originalDateStr][1] = \
                '/users/news/statuses/' + statusNumber
            # set the filename
            newswire[originalDateStr][3] = filename
            continue

        rssTitle = removeControlCharacters(item[0])
        url = item[1]
        if dangerousMarkup(url) or dangerousMarkup(rssTitle):
            continue
        rssDescription = ''

        # get the rss description if it exists
        rssDescription = removeControlCharacters(item[4])
        if rssDescription.startswith('<![CDATA['):
            rssDescription = rssDescription.replace('<![CDATA[', '')
            rssDescription = rssDescription.replace(']]>', '')
        rssDescription = '<p>' + rssDescription + '<p>'

        # add the off-site link to the description
        if rssDescription and not dangerousMarkup(rssDescription):
            rssDescription += \
                '<br><a href="' + url + '">' + \
                translate['Read more...'] + '</a>'
        else:
            rssDescription = \
                '<a href="' + url + '">' + \
                translate['Read more...'] + '</a>'

        # remove image dimensions
        if '<img' in rssDescription:
            rssDescription = removeHtmlTag(rssDescription, 'width')
            rssDescription = removeHtmlTag(rssDescription, 'height')

        followersOnly = False
        useBlurhash = False
        # NOTE: the id when the post is created will not be
        # consistent (it's based on the current time, not the
        # published time), so we change that later
        blog = createNewsPost(baseDir,
                              domain, port, httpPrefix,
                              rssDescription,
                              followersOnly, False,
                              None, None, None, useBlurhash,
                              rssTitle)
        if not blog:
            continue

        idStr = \
            httpPrefix + '://' + domain + '/users/news' + \
            '/statuses/' + statusNumber + '/replies'
        blog['news'] = True

        # note the time of arrival
        currTime = datetime.datetime.utcnow()
        blog['object']['arrived'] = currTime.strftime("%Y-%m-%dT%H:%M:%SZ")

        # change the id, based upon the published time
        blog['object']['replies']['id'] = idStr
        blog['object']['replies']['first']['partOf'] = idStr

        blog['id'] = newPostId + '/activity'
        blog['object']['id'] = newPostId
        blog['object']['atomUri'] = newPostId
        blog['object']['url'] = \
            httpPrefix + '://' + domain + '/@news/' + statusNumber
        blog['object']['published'] = dateStr

        postId = newPostId.replace('/', '#')

        moderated = item[5]

        # save the post and update the index
        if saveJson(blog, filename):
            updateFeedsOutboxIndex(baseDir, domain, postId + '.json')

            # Save a file containing the time when the post arrived
            # this can then later be used to construct the news timeline
            # excluding items during the voting period
            if moderated:
                saveArrivedTime(baseDir, filename, blog['object']['arrived'])
            else:
                if os.path.isfile(filename + '.arrived'):
                    os.remove(filename + '.arrived')

            # set the url
            newswire[originalDateStr][1] = \
                '/users/news/statuses/' + statusNumber
            # set the filename
            newswire[originalDateStr][3] = filename


def mergeWithPreviousNewswire(oldNewswire: {}, newNewswire: {}) -> None:
    """Preserve any votes or generated activitypub post filename
    as rss feeds are updated
    """
    for published, fields in oldNewswire.items():
        if not newNewswire.get(published):
            continue
        for i in range(1, 5):
            newNewswire[published][i] = fields[i]


def runNewswireDaemon(baseDir: str, httpd,
                      httpPrefix: str, domain: str, port: int,
                      translate: {}) -> None:
    """Periodically updates RSS feeds
    """
    newswireStateFilename = baseDir + '/accounts/.newswirestate.json'

    # initial sleep to allow the system to start up
    time.sleep(50)
    while True:
        # has the session been created yet?
        if not httpd.session:
            print('Newswire daemon waiting for session')
            time.sleep(60)
            continue

        # try to update the feeds
        newNewswire = None
        try:
            newNewswire = \
                getDictFromNewswire(httpd.session, baseDir,
                                    httpd.maxNewswirePostsPerSource,
                                    httpd.maxNewswireFeedSizeKb)
        except Exception as e:
            print('WARN: unable to update newswire ' + str(e))
            time.sleep(120)
            continue

        if not httpd.newswire:
            if os.path.isfile(newswireStateFilename):
                httpd.newswire = loadJson(newswireStateFilename)

        mergeWithPreviousNewswire(httpd.newswire, newNewswire)

        httpd.newswire = newNewswire
        saveJson(httpd.newswire, newswireStateFilename)
        print('Newswire updated')

        convertRSStoActivityPub(baseDir,
                                httpPrefix, domain, port,
                                newNewswire, translate,
                                httpd.recentPostsCache,
                                httpd.maxRecentPosts,
                                httpd.session,
                                httpd.cachedWebfingers,
                                httpd.personCache)
        print('Newswire feed converted to ActivityPub')

        # wait a while before the next feeds update
        time.sleep(1200)


def runNewswireWatchdog(projectVersion: str, httpd) -> None:
    """This tries to keep the newswire update thread running even if it dies
    """
    print('Starting newswire watchdog')
    newswireOriginal = \
        httpd.thrPostSchedule.clone(runNewswireDaemon)
    httpd.thrNewswireDaemon.start()
    while True:
        time.sleep(50)
        if not httpd.thrNewswireDaemon.isAlive():
            httpd.thrNewswireDaemon.kill()
            httpd.thrNewswireDaemon = \
                newswireOriginal.clone(runNewswireDaemon)
            httpd.thrNewswireDaemon.start()
            print('Restarting newswire daemon...')