epicyon/newswire.py

__filename__ = "newswire.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.1.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"

import os
import requests
from socket import error as SocketError
import errno
from datetime import datetime
from datetime import timedelta
from collections import OrderedDict
from utils import firstParagraphFromString
from utils import isPublicPost
from utils import locatePost
from utils import loadJson
from utils import saveJson
from utils import isSuspended
from utils import containsInvalidChars
from utils import removeHtml
from blocking import isBlockedDomain
from blocking import isBlockedHashtag
from filters import isFiltered


def removeCDATA(text: str) -> str:
    """Removes any CDATA wrapper from the given text

    Only the content of the first CDATA section is returned. Anything
    before the 'CDATA[' marker and anything after the end of the section
    is discarded. Text without a CDATA marker is returned unchanged.
    """
    if 'CDATA[' not in text:
        return text
    text = text.split('CDATA[')[1]
    if ']]>' in text:
        # a CDATA section is terminated by ']]>', so single brackets
        # within the content are kept intact
        return text.split(']]>')[0]
    if ']' in text:
        # malformed or truncated terminator: cut at the first bracket
        return text.split(']')[0]
    return text


def rss2Header(httpPrefix: str,
               nickname: str, domainFull: str,
               title: str, translate: {}) -> str:
    """Returns the opening part of an RSS 2.0 feed

    The channel title and link depend on the kind of feed requested.
    A title starting with 'News' gives the newswire feed, a title
    starting with 'Site' gives the site-wide blog feed, and any other
    title is looked up in translate and gives the blog feed for the
    given nickname. The channel element is left open.
    """
    if title.startswith('News'):
        channelTitle = 'Newswire'
        feedPath = '/newswire.xml'
    elif title.startswith('Site'):
        channelTitle = domainFull
        feedPath = '/blog/rss.xml'
    else:
        channelTitle = translate[title]
        feedPath = '/users/' + nickname + '/rss.xml'

    feedLink = httpPrefix + '://' + domainFull + feedPath
    headerParts = (
        "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>",
        "<rss version=\"2.0\">",
        '<channel>',
        '    <title>', channelTitle, '</title>',
        '    <link>', feedLink, '</link>'
    )
    return ''.join(headerParts)


def rss2Footer() -> str:
    """Returns the closing tags of an RSS 2.0 feed
    """
    closingTags = ('</channel>', '</rss>')
    return ''.join(closingTags)


def getNewswireTags(text: str, maxTags: int) -> []:
    """Extracts the unique hashtags from the given text, in the order
    in which they appear. Collection stops once maxTags have been found.
    Text without any space is not searched
    """
    if '#' not in text or ' ' not in text:
        return []

    # turn common punctuation into word separators
    cleaned = text
    for separator in (',', ';', '- ', '. '):
        cleaned = cleaned.replace(separator, ' ')
    cleaned = cleaned.strip()
    if cleaned.endswith('.'):
        cleaned = cleaned[:-1]

    found = []
    for token in cleaned.split(' '):
        # a lone '#' is not a hashtag
        if len(token) < 2 or not token.startswith('#'):
            continue
        if token in found:
            continue
        found.append(token)
        if len(found) >= maxTags:
            break
    return found


def addNewswireDictEntry(baseDir: str, domain: str,
                         newswire: {}, dateStr: str,
                         title: str, link: str,
                         votesStatus: str, postFilename: str,
                         description: str, moderated: bool,
                         mirrored: bool,
                         tags=None, maxTags=32) -> None:
    """Update the newswire dictionary

    Adds an entry keyed by dateStr, replacing any existing entry with
    the same key. Nothing is added if the title and description match
    the 'news' filters, or if any of the hashtags is blocked.

    The stored value is a list with the fields, in order: title, link,
    votesStatus, postFilename, description, moderated, the combined
    list of hashtags, and mirrored.

    tags is an optional list of hashtags already known for the post.
    These are appended to the hashtags found within the text, up to a
    total of maxTags.
    """
    allText = removeHtml(title + ' ' + description)

    # check that none of the text is filtered against
    if isFiltered(baseDir, 'news', domain, allText):
        return

    # None rather than a list literal is the default, so that no
    # list object is shared between calls
    if tags is None:
        tags = []

    # extract hashtags from the text of the feed post
    postTags = getNewswireTags(allText, maxTags)

    # combine the tags into a single list
    for tag in tags:
        if tag not in postTags:
            if len(postTags) < maxTags:
                postTags.append(tag)

    # check that no tags are blocked
    for tag in postTags:
        if isBlockedHashtag(baseDir, tag.replace('#', '')):
            return

    newswire[dateStr] = [
        title,
        link,
        votesStatus,
        postFilename,
        description,
        moderated,
        postTags,
        mirrored
    ]


def parseFeedDate(pubDate: str):
    """Returns a date object based on the given date string
    This tries a number of formats to see which work

    The returned datetime is in UTC. Dates with a numeric offset are
    converted to UTC and keep their timezone information, EST dates
    have five hours added, and 'UT' or 'Z' dates are returned without
    timezone information. None is returned if no format matches.
    """
    # local import, because only datetime and timedelta are
    # imported at the top of the file
    from datetime import timezone

    formats = ("%a, %d %b %Y %H:%M:%S %z",
               "%a, %d %b %Y %H:%M:%S EST",
               "%a, %d %b %Y %H:%M:%S UT",
               "%Y-%m-%dT%H:%M:%SZ",
               "%Y-%m-%dT%H:%M:%S%z")

    dateStr = pubDate.strip()
    publishedDate = None
    for dateFormat in formats:
        # strptime itself decides whether the format matches. Guessing
        # from substrings is unreliable: 'Tue' and 'Thu' contain a 'T'
        # and negative offsets contain a '-'
        try:
            publishedDate = datetime.strptime(dateStr, dateFormat)
        except ValueError:
            continue

        if dateFormat.endswith(' EST'):
            # EST is five hours behind UTC
            publishedDate = publishedDate + timedelta(hours=5)
        elif publishedDate.tzinfo is not None:
            # callers append +00:00 to anything not already ending
            # with it, so other offsets have to be normalised to UTC
            publishedDate = publishedDate.astimezone(timezone.utc)
        break

    if publishedDate is None:
        print('WARN: unrecognized date format: ' + pubDate)
    print('Feed date: ' + pubDate + ' ' + str(publishedDate))
    return publishedDate


def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
                  moderated: bool, mirrored: bool,
                  maxPostsPerSource: int,
                  maxFeedItemSizeKb: int) -> {}:
    """Converts an xml 2.0 string to a dictionary

    Each <item> having a title, a link and a pubDate which can be
    parsed is passed to addNewswireDictEntry, keyed by its published
    date. Items which are larger than maxFeedItemSizeKb, or whose link
    is on a blocked domain, are skipped. At most maxPostsPerSource
    items are processed.
    """
    if '<item>' not in xmlStr:
        return {}
    result = {}
    # The text before the first <item> is the channel header rather than
    # an item. It can have its own title, link and pubDate, so it is
    # skipped to avoid the channel itself being added as a post
    rssItems = xmlStr.split('<item>')[1:]
    postCtr = 0
    maxBytes = maxFeedItemSizeKb * 1024
    requiredTags = ('<title>', '</title>',
                    '<link>', '</link>',
                    '<pubDate>', '</pubDate>')
    for rssItem in rssItems:
        if len(rssItem) > maxBytes:
            print('WARN: rss feed item is too big')
            continue
        if not all(tag in rssItem for tag in requiredTags):
            continue
        title = rssItem.split('<title>')[1]
        title = removeCDATA(title.split('</title>')[0])
        # the description is optional
        description = ''
        if '<description>' in rssItem and '</description>' in rssItem:
            description = rssItem.split('<description>')[1]
            description = removeCDATA(description.split('</description>')[0])
        else:
            if '<media:description>' in rssItem and \
               '</media:description>' in rssItem:
                description = rssItem.split('<media:description>')[1]
                description = description.split('</media:description>')[0]
                description = removeCDATA(description)
        link = rssItem.split('<link>')[1]
        link = link.split('</link>')[0]
        if '://' not in link:
            continue
        # check the domain of the link against the blocklist
        itemDomain = link.split('://')[1]
        if '/' in itemDomain:
            itemDomain = itemDomain.split('/')[0]
        if isBlockedDomain(baseDir, itemDomain):
            continue
        pubDate = rssItem.split('<pubDate>')[1]
        pubDate = pubDate.split('</pubDate>')[0]

        publishedDate = parseFeedDate(pubDate)
        if publishedDate:
            # dictionary keys always carry an explicit UTC offset
            pubDateStr = str(publishedDate)
            if not pubDateStr.endswith('+00:00'):
                pubDateStr += '+00:00'
            print('Feed date: ' + pubDateStr)
            postFilename = ''
            votesStatus = []
            addNewswireDictEntry(baseDir, domain,
                                 result, pubDateStr,
                                 title, link,
                                 votesStatus, postFilename,
                                 description, moderated, mirrored)
            postCtr += 1
            if postCtr >= maxPostsPerSource:
                break
    return result


def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
                   moderated: bool, mirrored: bool,
                   maxPostsPerSource: int,
                   maxFeedItemSizeKb: int) -> {}:
    """Converts an atom feed string to a dictionary

    Each <entry> having a title, a <link>...</link> element and an
    updated date which can be parsed is passed to addNewswireDictEntry,
    keyed by its updated date. Entries which are larger than
    maxFeedItemSizeKb, or whose link is on a blocked domain, are
    skipped. At most maxPostsPerSource entries are processed.
    """
    if '<entry>' not in xmlStr:
        return {}
    result = {}
    # The text before the first <entry> is the feed header rather than
    # an entry. It can have its own title, link and updated date, so it
    # is skipped to avoid the feed itself being added as a post
    atomItems = xmlStr.split('<entry>')[1:]
    postCtr = 0
    maxBytes = maxFeedItemSizeKb * 1024
    requiredTags = ('<title>', '</title>',
                    '<link>', '</link>',
                    '<updated>', '</updated>')
    for atomItem in atomItems:
        if len(atomItem) > maxBytes:
            print('WARN: atom feed item is too big')
            continue
        if not all(tag in atomItem for tag in requiredTags):
            continue
        title = atomItem.split('<title>')[1]
        title = removeCDATA(title.split('</title>')[0])
        # the description is optional
        description = ''
        if '<summary>' in atomItem and '</summary>' in atomItem:
            description = atomItem.split('<summary>')[1]
            description = removeCDATA(description.split('</summary>')[0])
        else:
            if '<media:description>' in atomItem and \
               '</media:description>' in atomItem:
                description = atomItem.split('<media:description>')[1]
                description = description.split('</media:description>')[0]
                description = removeCDATA(description)
        link = atomItem.split('<link>')[1]
        link = link.split('</link>')[0]
        if '://' not in link:
            continue
        # check the domain of the link against the blocklist
        itemDomain = link.split('://')[1]
        if '/' in itemDomain:
            itemDomain = itemDomain.split('/')[0]
        if isBlockedDomain(baseDir, itemDomain):
            continue
        pubDate = atomItem.split('<updated>')[1]
        pubDate = pubDate.split('</updated>')[0]

        publishedDate = parseFeedDate(pubDate)
        if publishedDate:
            # dictionary keys always carry an explicit UTC offset
            pubDateStr = str(publishedDate)
            if not pubDateStr.endswith('+00:00'):
                pubDateStr += '+00:00'
            print('Feed date: ' + pubDateStr)
            postFilename = ''
            votesStatus = []
            addNewswireDictEntry(baseDir, domain,
                                 result, pubDateStr,
                                 title, link,
                                 votesStatus, postFilename,
                                 description, moderated, mirrored)
            postCtr += 1
            if postCtr >= maxPostsPerSource:
                break
    return result


def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
                     moderated: bool, mirrored: bool,
                     maxPostsPerSource: int,
                     maxFeedItemSizeKb: int) -> {}:
    """Converts an atom-style YouTube feed string to a dictionary

    Each <entry> having a title, a yt:videoId and an updated date which
    can be parsed is passed to addNewswireDictEntry, keyed by its
    updated date. The link is a www.youtube.com watch url made from
    the video id. An empty dictionary is returned if www.youtube.com
    is a blocked domain. Entries larger than maxFeedItemSizeKb are
    skipped and at most maxPostsPerSource entries are processed.
    """
    if '<entry>' not in xmlStr:
        return {}
    if isBlockedDomain(baseDir, 'www.youtube.com'):
        return {}
    result = {}
    # The text before the first <entry> is the feed header rather than
    # an entry, so it is skipped
    atomItems = xmlStr.split('<entry>')[1:]
    postCtr = 0
    maxBytes = maxFeedItemSizeKb * 1024
    requiredTags = ('<title>', '</title>',
                    '<updated>', '</updated>',
                    '<yt:videoId>', '</yt:videoId>')
    for atomItem in atomItems:
        print('YouTube feed item: ' + atomItem)
        if len(atomItem) > maxBytes:
            print('WARN: atom feed item is too big')
            continue
        if not all(tag in atomItem for tag in requiredTags):
            continue
        title = atomItem.split('<title>')[1]
        title = removeCDATA(title.split('</title>')[0])
        # the description is optional
        description = ''
        if '<media:description>' in atomItem and \
           '</media:description>' in atomItem:
            description = atomItem.split('<media:description>')[1]
            description = description.split('</media:description>')[0]
            description = removeCDATA(description)
        elif '<summary>' in atomItem and '</summary>' in atomItem:
            description = atomItem.split('<summary>')[1]
            description = description.split('</summary>')[0]
            description = removeCDATA(description)
        link = atomItem.split('<yt:videoId>')[1]
        link = link.split('</yt:videoId>')[0]
        link = 'https://www.youtube.com/watch?v=' + link.strip()
        pubDate = atomItem.split('<updated>')[1]
        pubDate = pubDate.split('</updated>')[0]

        publishedDate = parseFeedDate(pubDate)
        if publishedDate:
            # dictionary keys always carry an explicit UTC offset
            pubDateStr = str(publishedDate)
            if not pubDateStr.endswith('+00:00'):
                pubDateStr += '+00:00'
            print('Feed date: ' + pubDateStr)
            postFilename = ''
            votesStatus = []
            addNewswireDictEntry(baseDir, domain,
                                 result, pubDateStr,
                                 title, link,
                                 votesStatus, postFilename,
                                 description, moderated, mirrored)
            postCtr += 1
            if postCtr >= maxPostsPerSource:
                break
    return result


def xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
                 moderated: bool, mirrored: bool,
                 maxPostsPerSource: int,
                 maxFeedItemSizeKb: int) -> {}:
    """Converts a feed in xml format to a dictionary, by detecting the
    type of feed and handing it to the matching converter.
    YouTube feeds are checked for first, then RSS 2.0, then Atom.
    An empty dictionary is returned for any other kind of text
    """
    isYouTubeFeed = \
        '<yt:videoId>' in xmlStr and '<yt:channelId>' in xmlStr
    if isYouTubeFeed:
        print('YouTube feed: reading')
        converter = atomFeedYTToDict
    elif 'rss version="2.0"' in xmlStr:
        converter = xml2StrToDict
    elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
        converter = atomFeedToDict
    else:
        return {}
    return converter(baseDir, domain,
                     xmlStr, moderated, mirrored,
                     maxPostsPerSource, maxFeedItemSizeKb)


def YTchannelToAtomFeed(url: str) -> str:
    """Converts a YouTube channel url into an atom feed url

    Urls which do not contain youtube.com/channel/ are returned
    unchanged. Any path, query string or fragment following the
    channel id is dropped.
    """
    if 'youtube.com/channel/' not in url:
        return url
    channelId = url.split('youtube.com/channel/')[1].strip()
    # only the channel id itself belongs in the feed url, not a
    # trailing slash, sub-page such as /videos, or url parameters
    for separator in ('/', '?', '#', '&'):
        channelId = channelId.split(separator)[0]
    channelUrl = \
        'https://www.youtube.com/feeds/videos.xml?channel_id=' + channelId
    print('YouTube feed: ' + channelUrl)
    return channelUrl


def getRSS(baseDir: str, domain: str, session, url: str,
           moderated: bool, mirrored: bool,
           maxPostsPerSource: int, maxFeedSizeKb: int,
           maxFeedItemSizeKb: int) -> {}:
    """Returns an RSS url as a dict

    The feed is downloaded using the given session and converted with
    xmlStrToDict. YouTube channel urls are converted into their atom
    feed url beforehand. None is returned if the url is not a string,
    if there is no session, if the download fails, or if the feed is
    too large or contains invalid characters.
    """
    if not isinstance(url, str):
        print('url: ' + str(url))
        print('ERROR: getRSS url should be a string')
        return None
    if not session:
        # without a session nothing can be downloaded
        print('WARN: no session specified for getRSS')
        return None
    sessionHeaders = {
        'Accept': 'text/xml; charset=UTF-8',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'
    }
    sessionParams = {}
    url = YTchannelToAtomFeed(url)
    try:
        # NOTE(review): no timeout is given here, so an unresponsive
        # feed server may presumably stall the caller - confirm whether
        # the session applies its own timeout
        result = session.get(url, headers=sessionHeaders, params=sessionParams)
        if result:
            if int(len(result.text) / 1024) < maxFeedSizeKb and \
               not containsInvalidChars(result.text):
                return xmlStrToDict(baseDir, domain, result.text,
                                    moderated, mirrored,
                                    maxPostsPerSource,
                                    maxFeedItemSizeKb)
            else:
                print('WARN: feed is too large, ' +
                      'or contains invalid characters: ' + url)
        else:
            print('WARN: no result returned for feed ' + url)
    except requests.exceptions.RequestException as e:
        print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' +
              'headers: ' + str(sessionHeaders) + '\n' +
              'params: ' + str(sessionParams) + '\n')
        print(e)
    except ValueError as e:
        print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' +
              'headers: ' + str(sessionHeaders) + '\n' +
              'params: ' + str(sessionParams) + '\n')
        print(e)
    except SocketError as e:
        if e.errno == errno.ECONNRESET:
            print('WARN: connection was reset during getRSS')
        print(e)
    return None


def getRSSfromDict(baseDir: str, newswire: {},
                   httpPrefix: str, domainFull: str,
                   title: str, translate: {}) -> str:
    """Returns an rss feed from the current newswire dict.
    This allows other instances to subscribe to the same newswire

    The keys of newswire are published dates, in the format
    'YYYY-MM-DD HH:MM:SS' followed by a UTC offset. The values are
    lists, of which the title (index 0), link (index 1) and description
    (index 4) are used. Entries whose date can't be converted are
    skipped with a warning. An empty string is returned if the newswire
    is empty. The header is always the 'Newswire' one, whatever title
    is given.
    """
    if not newswire:
        return ''
    rssStr = rss2Header(httpPrefix,
                        None, domainFull,
                        'Newswire', translate)
    for published, fields in newswire.items():
        try:
            if '+00:00' in published:
                published = published.replace('+00:00', 'Z').strip()
                published = published.replace(' ', 'T')
            else:
                publishedWithOffset = \
                    datetime.strptime(published, "%Y-%m-%d %H:%M:%S%z")
                # shift the clock time to UTC, so that the Z suffix
                # added below is true
                publishedUTC = \
                    publishedWithOffset - publishedWithOffset.utcoffset()
                published = publishedUTC.strftime("%Y-%m-%dT%H:%M:%SZ")
            pubDate = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
        except (ValueError, TypeError) as e:
            # one bad date should not prevent the rest of the feed
            # from being produced
            print('WARN: Unable to convert date ' +
                  str(published) + ' ' + str(e))
            continue
        rssStr += '<item>\n'
        rssStr += '  <title>' + fields[0] + '</title>\n'
        description = removeCDATA(firstParagraphFromString(fields[4]))
        rssStr += '  <description>' + description + '</description>\n'
        url = fields[1]
        # links without a protocol are local to this instance
        if '://' not in url:
            if domainFull not in url:
                url = httpPrefix + '://' + domainFull + url
        rssStr += '  <link>' + url + '</link>\n'

        rssDateStr = pubDate.strftime("%a, %d %b %Y %H:%M:%S UT")
        rssStr += '  <pubDate>' + rssDateStr + '</pubDate>\n'
        rssStr += '</item>\n'
    rssStr += rss2Footer()
    return rssStr


def isNewswireBlogPost(postJsonObject: {}) -> bool:
    """Returns true if the given post can appear on the newswire.
    Its object needs to be a dictionary having a summary, url, content
    and published date, and the post needs to be public.
    This is kept separate from isBlogPost in case different properties
    have to be checked
    """
    if not postJsonObject:
        return False
    postObject = postJsonObject.get('object')
    if not postObject or not isinstance(postObject, dict):
        return False
    for fieldName in ('summary', 'url', 'content', 'published'):
        if not postObject.get(fieldName):
            return False
    return isPublicPost(postJsonObject)


def getHashtagsFromPost(postJsonObject: {}) -> []:
    """Returns the names of the hashtags attached to a post, without
    duplicates and in the order in which they are listed.
    Only tag entries whose type is Hashtag are included
    """
    postObject = postJsonObject.get('object')
    if not postObject or not isinstance(postObject, dict):
        return []
    tagList = postObject.get('tag')
    if not tagList or not isinstance(tagList, list):
        return []

    hashtags = []
    for tagItem in tagList:
        if not isinstance(tagItem, dict):
            continue
        tagName = tagItem.get('name')
        if not tagName:
            continue
        if tagItem.get('type') != 'Hashtag':
            continue
        if tagName not in hashtags:
            hashtags.append(tagName)
    return hashtags


def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
                              newswire: {},
                              maxBlogsPerAccount: int,
                              indexFilename: str,
                              maxTags: int) -> None:
    """Adds the blog posts listed within the given index file to the
    newswire. No more than maxBlogsPerAccount lines of the index are
    read, whether or not the posts they refer to could be added
    """
    if not os.path.isfile(indexFilename):
        return

    # local blog entries are unmoderated, unless the account
    # directory contains a .newswiremoderated file
    accountDir = baseDir + '/accounts/' + nickname + '@' + domain
    moderated = os.path.isfile(accountDir + '/.newswiremoderated')

    with open(indexFilename, 'r') as indexFile:
        for lineCtr, indexLine in enumerate(indexFile, 1):
            # Reduce the index line to the filename of the post, without
            # any path, line ending or extension. This should also
            # correspond to any index entry in the posts cache
            postUrl = indexLine
            if '/' in postUrl:
                postUrl = postUrl.split('/')[-1]
            postUrl = postUrl.replace('\n', '').replace('\r', '')
            postUrl = postUrl.replace('.json', '').strip()

            fullPostFilename = \
                locatePost(baseDir, nickname,
                           domain, postUrl, False)
            if not fullPostFilename:
                print('Unable to locate post ' + postUrl)
            else:
                # read the post from file
                postJsonObject = loadJson(fullPostFilename)
                if isNewswireBlogPost(postJsonObject):
                    postObject = postJsonObject['object']
                    # same date format as the keys of feed entries
                    published = postObject['published'].replace('T', ' ')
                    published = published.replace('Z', '+00:00')
                    votesFilename = fullPostFilename + '.votes'
                    votes = []
                    if os.path.isfile(votesFilename):
                        votes = loadJson(votesFilename)
                    description = \
                        firstParagraphFromString(postObject['content'])
                    description = removeCDATA(description)
                    addNewswireDictEntry(baseDir, domain,
                                         newswire, published,
                                         postObject['summary'],
                                         postObject['url'],
                                         votes, fullPostFilename,
                                         description, moderated, False,
                                         getHashtagsFromPost(postJsonObject),
                                         maxTags)

            # every line read counts towards the limit
            if lineCtr >= maxBlogsPerAccount:
                break


def addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
                       maxBlogsPerAccount: int,
                       maxTags: int) -> None:
    """Adds blogs from each user account into the newswire

    Account directories are the directories directly within
    baseDir/accounts whose name contains an '@'. Inbox accounts,
    suspended accounts and accounts having a .nonewswire file are
    skipped. The domain used for each account is taken from its
    handle rather than from the domain parameter.
    """
    moderationDict = {}

    # go through each account
    for subdir, dirs, files in os.walk(baseDir + '/accounts'):
        for handle in dirs:
            if '@' not in handle:
                continue
            if 'inbox@' in handle:
                continue

            nickname = handle.split('@')[0]

            # has this account been suspended?
            if isSuspended(baseDir, nickname):
                continue

            if os.path.isfile(baseDir + '/accounts/' + handle +
                              '/.nonewswire'):
                continue

            # is there a blogs timeline for this account?
            accountDir = os.path.join(baseDir + '/accounts', handle)
            blogsIndex = accountDir + '/tlblogs.index'
            if os.path.isfile(blogsIndex):
                accountDomain = handle.split('@')[1]
                addAccountBlogsToNewswire(baseDir, nickname, accountDomain,
                                          newswire, maxBlogsPerAccount,
                                          blogsIndex, maxTags)
        # Handles are resolved as direct children of the accounts
        # directory, so only the top level is of interest. Without this
        # os.walk would descend through every account subdirectory
        break

    # sort the moderation dict into chronological order, latest first
    # Nothing within this function adds to moderationDict, so at present
    # it is always empty and the moderation file is removed
    sortedModerationDict = \
        OrderedDict(sorted(moderationDict.items(), reverse=True))
    # save the moderation queue details for later display
    newswireModerationFilename = baseDir + '/accounts/newswiremoderation.txt'
    if sortedModerationDict:
        saveJson(sortedModerationDict, newswireModerationFilename)
    else:
        # remove the file if there is nothing to moderate
        if os.path.isfile(newswireModerationFilename):
            os.remove(newswireModerationFilename)


def getDictFromNewswire(session, baseDir: str, domain: str,
                        maxPostsPerSource: int, maxFeedSizeKb: int,
                        maxTags: int, maxFeedItemSizeKb: int,
                        maxNewswirePosts: int) -> {}:
    """Gets rss feeds as a dictionary from newswire file

    Each line of accounts/newswire.txt containing a url is a feed.
    Lines beginning with '#' are comments, a '*' within the line marks
    the feed as moderated and a '!' marks it as mirrored. Blog posts
    from local accounts are then added, and the result is returned
    sorted by date with the latest first, limited to the newest
    maxNewswirePosts entries. An empty dictionary is returned if there
    is no newswire file.
    """
    subscriptionsFilename = baseDir + '/accounts/newswire.txt'
    if not os.path.isfile(subscriptionsFilename):
        return {}

    # maxPostsPerSource is used as given by the caller. It was
    # previously always overwritten with a value of 5 here

    # add rss feeds
    rssFeed = []
    with open(subscriptionsFilename, 'r') as fp:
        rssFeed = fp.readlines()
    result = {}
    for url in rssFeed:
        url = url.strip()

        # Does this contain a url?
        if '://' not in url:
            continue

        # is this a comment?
        if url.startswith('#'):
            continue

        # should this feed be moderated?
        moderated = False
        if '*' in url:
            moderated = True
            url = url.replace('*', '').strip()

        # should this feed content be mirrored?
        mirrored = False
        if '!' in url:
            mirrored = True
            url = url.replace('!', '').strip()

        itemsList = getRSS(baseDir, domain, session, url,
                           moderated, mirrored,
                           maxPostsPerSource, maxFeedSizeKb,
                           maxFeedItemSizeKb)
        if itemsList:
            result.update(itemsList)

    # add blogs from each user account
    addBlogsToNewswire(baseDir, domain, result,
                       maxPostsPerSource, maxTags)

    # sort into chronological order, latest first
    sortedResult = OrderedDict(sorted(result.items(), reverse=True))

    # are there too many posts? If so then remove the oldest ones
    if len(sortedResult) > maxNewswirePosts:
        removals = [dateStr
                    for ctr, dateStr in enumerate(sortedResult, 1)
                    if ctr > maxNewswirePosts]
        for dateStr in removals:
            sortedResult.pop(dateStr)

    return sortedResult