# epicyon/newswire.py
__filename__ = "newswire.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.1.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
import os
import requests
from socket import error as SocketError
import errno
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from collections import OrderedDict
from utils import firstParagraphFromString
from utils import isPublicPost
from utils import locatePost
from utils import loadJson
from utils import saveJson
from utils import isSuspended
from utils import containsInvalidChars
from utils import removeHtml
from blocking import isBlockedDomain
from blocking import isBlockedHashtag
from filters import isFiltered
def removeCDATA(text: str) -> str:
    """Removes any CDATA wrapper from the given text

    A CDATA section has the form <![CDATA[ ... ]]>, so the terminator
    to look for is ']]' rather than the first ']' — otherwise content
    containing a ']' (e.g. "[1]") would be truncated. The single ']'
    split is kept as a fallback for malformed sections.
    """
    if 'CDATA[' in text:
        text = text.split('CDATA[')[1]
        if ']]' in text:
            text = text.split(']]')[0]
        elif ']' in text:
            text = text.split(']')[0]
    return text
def rss2Header(httpPrefix: str,
               nickname: str, domainFull: str,
               title: str, translate: {}) -> str:
    """Header for an RSS 2.0 feed

    The feed title and link depend on the kind of feed:
    'News*' -> the instance newswire, 'Site*' -> the site blog,
    anything else -> the named user's rss feed (title is translated)
    """
    if title.startswith('News'):
        feedTitle = 'Newswire'
        feedLink = httpPrefix + '://' + domainFull + '/newswire.xml'
    elif title.startswith('Site'):
        feedTitle = domainFull
        feedLink = httpPrefix + '://' + domainFull + '/blog/rss.xml'
    else:
        feedTitle = translate[title]
        feedLink = httpPrefix + '://' + domainFull + \
            '/users/' + nickname + '/rss.xml'
    return ("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
            "<rss version=\"2.0\">" + '<channel>' +
            ' <title>' + feedTitle + '</title>' +
            ' <link>' + feedLink + '</link>')
def rss2Footer() -> str:
    """Footer for an RSS 2.0 feed
    """
    return '</channel>' + '</rss>'
def getNewswireTags(text: str, maxTags: int) -> []:
    """Returns a list of hashtags found in the given text

    Duplicates are removed and at most maxTags tags are returned.
    Text with no '#' or no spaces yields an empty list.
    """
    if '#' not in text or ' ' not in text:
        return []
    simplified = \
        text.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
    simplified = simplified.replace('. ', ' ').strip()
    if simplified.endswith('.'):
        simplified = simplified[:-1]
    tags = []
    for token in simplified.split(' '):
        # only consider words of the form "#something"
        if not token.startswith('#') or len(token) <= 1:
            continue
        if token in tags:
            continue
        tags.append(token)
        if len(tags) >= maxTags:
            break
    return tags
def addNewswireDictEntry(baseDir: str, domain: str,
                         newswire: {}, dateStr: str,
                         title: str, link: str,
                         votesStatus: str, postFilename: str,
                         description: str, moderated: bool,
                         mirrored: bool,
                         tags=None, maxTags=32) -> None:
    """Update the newswire dictionary

    Stores an entry under its published date string as a list of
    [title, link, votesStatus, postFilename, description, moderated,
    tags, mirrored]. Entries whose text matches a filter, or which
    contain a blocked hashtag, are silently dropped.
    """
    allText = removeHtml(title + ' ' + description)
    # check that none of the text is filtered against
    if isFiltered(baseDir, 'news', domain, allText):
        return
    # the default is None rather than [] to avoid the shared
    # mutable default argument pitfall
    if tags is None:
        tags = []
    # extract hashtags from the text of the feed post
    postTags = getNewswireTags(allText, maxTags)
    # combine the tags into a single list, capped at maxTags
    for tag in tags:
        if tag not in postTags:
            if len(postTags) < maxTags:
                postTags.append(tag)
    # check that no tags are blocked
    for tag in postTags:
        if isBlockedHashtag(baseDir, tag.replace('#', '')):
            return
    newswire[dateStr] = [
        title,
        link,
        votesStatus,
        postFilename,
        description,
        moderated,
        postTags,
        mirrored
    ]
def parseFeedDate(pubDate: str) -> str:
    """Returns a UTC date string based on the given date string
    This tries a number of formats to see which work

    Returns a string of the form "YYYY-MM-DD HH:MM:SS+00:00",
    or None if no format matched
    """
    formats = ("%a, %d %b %Y %H:%M:%S %z",
               "%a, %d %b %Y %H:%M:%S EST",
               "%a, %d %b %Y %H:%M:%S UT",
               "%Y-%m-%dT%H:%M:%SZ",
               "%Y-%m-%dT%H:%M:%S%z")
    # distinguishing markers: a format is only attempted when each
    # marker is either present in both the date string and the format,
    # or absent from both
    markers = (',', '-', 'T', 'Z', 'EST', 'UT')
    publishedDate = None
    for dateFormat in formats:
        if any((marker in pubDate) != (marker in dateFormat)
               for marker in markers):
            continue
        try:
            publishedDate = \
                datetime.strptime(pubDate, dateFormat)
        except BaseException:
            print('WARN: unrecognized date format: ' +
                  pubDate + ' ' + dateFormat)
            continue
        if publishedDate:
            if pubDate.endswith(' EST'):
                # EST is five hours behind UTC
                hoursAdded = timedelta(hours=5)
                publishedDate = publishedDate + hoursAdded
            break
    pubDateStr = None
    if publishedDate:
        offset = publishedDate.utcoffset()
        # naive datetimes (formats without %z) have no utc offset;
        # subtracting None would raise TypeError
        if offset:
            publishedDate = publishedDate - offset
        # convert local date to UTC
        publishedDate = publishedDate.replace(tzinfo=timezone.utc)
        pubDateStr = str(publishedDate)
        if not pubDateStr.endswith('+00:00'):
            pubDateStr += '+00:00'
    return pubDateStr
def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
                  moderated: bool, mirrored: bool,
                  maxPostsPerSource: int,
                  maxFeedItemSizeKb: int) -> {}:
    """Converts an xml 2.0 string to a dictionary

    Each feed item must contain a title, link and pubDate.
    Items from blocked domains, oversized items, and items with an
    unparseable date are skipped. At most maxPostsPerSource items
    are returned, keyed by their published date string.
    """
    if '<item>' not in xmlStr:
        return {}
    result = {}
    postCtr = 0
    maxBytes = maxFeedItemSizeKb * 1024
    requiredTags = ('<title>', '</title>', '<link>', '</link>',
                    '<pubDate>', '</pubDate>')
    for rssItem in xmlStr.split('<item>'):
        if len(rssItem) > maxBytes:
            print('WARN: rss feed item is too big')
            continue
        if not all(tagStr in rssItem for tagStr in requiredTags):
            continue
        title = \
            removeCDATA(rssItem.split('<title>')[1].split('</title>')[0])
        description = ''
        if '<description>' in rssItem and '</description>' in rssItem:
            descStr = rssItem.split('<description>')[1]
            description = removeCDATA(descStr.split('</description>')[0])
        elif '<media:description>' in rssItem and \
                '</media:description>' in rssItem:
            descStr = rssItem.split('<media:description>')[1]
            description = \
                removeCDATA(descStr.split('</media:description>')[0])
        link = rssItem.split('<link>')[1].split('</link>')[0]
        if '://' not in link:
            continue
        itemDomain = link.split('://')[1].split('/')[0]
        if isBlockedDomain(baseDir, itemDomain):
            continue
        pubDate = rssItem.split('<pubDate>')[1].split('</pubDate>')[0]
        pubDateStr = parseFeedDate(pubDate)
        if not pubDateStr:
            continue
        addNewswireDictEntry(baseDir, domain,
                             result, pubDateStr,
                             title, link,
                             [], '',
                             description, moderated, mirrored)
        postCtr += 1
        if postCtr >= maxPostsPerSource:
            break
    return result
def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
                   moderated: bool, mirrored: bool,
                   maxPostsPerSource: int,
                   maxFeedItemSizeKb: int) -> {}:
    """Converts an atom feed string to a dictionary

    Each entry must contain a title, link and updated date.
    Entries from blocked domains, oversized entries, and entries with
    an unparseable date are skipped. At most maxPostsPerSource entries
    are returned, keyed by their published date string.
    """
    if '<entry>' not in xmlStr:
        return {}
    result = {}
    postCtr = 0
    itemSizeLimit = maxFeedItemSizeKb * 1024
    neededTags = ('<title>', '</title>', '<link>', '</link>',
                  '<updated>', '</updated>')
    for atomItem in xmlStr.split('<entry>'):
        if len(atomItem) > itemSizeLimit:
            print('WARN: atom feed item is too big')
            continue
        if not all(tagStr in atomItem for tagStr in neededTags):
            continue
        title = \
            removeCDATA(atomItem.split('<title>')[1].split('</title>')[0])
        description = ''
        if '<summary>' in atomItem and '</summary>' in atomItem:
            summaryStr = atomItem.split('<summary>')[1]
            description = removeCDATA(summaryStr.split('</summary>')[0])
        elif '<media:description>' in atomItem and \
                '</media:description>' in atomItem:
            mediaStr = atomItem.split('<media:description>')[1]
            description = \
                removeCDATA(mediaStr.split('</media:description>')[0])
        link = atomItem.split('<link>')[1].split('</link>')[0]
        if '://' not in link:
            continue
        entryDomain = link.split('://')[1].split('/')[0]
        if isBlockedDomain(baseDir, entryDomain):
            continue
        updated = atomItem.split('<updated>')[1].split('</updated>')[0]
        pubDateStr = parseFeedDate(updated)
        if not pubDateStr:
            continue
        addNewswireDictEntry(baseDir, domain,
                             result, pubDateStr,
                             title, link,
                             [], '',
                             description, moderated, mirrored)
        postCtr += 1
        if postCtr >= maxPostsPerSource:
            break
    return result
def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
                     moderated: bool, mirrored: bool,
                     maxPostsPerSource: int,
                     maxFeedItemSizeKb: int) -> {}:
    """Converts an atom-style YouTube feed string to a dictionary

    Each entry must contain a title, updated date and yt:videoId.
    The video id is turned into a watch url. Returns a dict of
    published date string -> newswire entry. Returns {} if
    www.youtube.com is a blocked domain.
    """
    if '<entry>' not in xmlStr:
        return {}
    if isBlockedDomain(baseDir, 'www.youtube.com'):
        return {}
    result = {}
    atomItems = xmlStr.split('<entry>')
    postCtr = 0
    maxBytes = maxFeedItemSizeKb * 1024
    for atomItem in atomItems:
        # NOTE: a debug print of the whole feed item was removed here;
        # it flooded the log on every feed refresh
        if len(atomItem) > maxBytes:
            print('WARN: atom feed item is too big')
            continue
        if '<title>' not in atomItem:
            continue
        if '</title>' not in atomItem:
            continue
        if '<updated>' not in atomItem:
            continue
        if '</updated>' not in atomItem:
            continue
        if '<yt:videoId>' not in atomItem:
            continue
        if '</yt:videoId>' not in atomItem:
            continue
        title = atomItem.split('<title>')[1]
        title = removeCDATA(title.split('</title>')[0])
        description = ''
        if '<media:description>' in atomItem and \
           '</media:description>' in atomItem:
            description = atomItem.split('<media:description>')[1]
            description = description.split('</media:description>')[0]
            description = removeCDATA(description)
        elif '<summary>' in atomItem and '</summary>' in atomItem:
            description = atomItem.split('<summary>')[1]
            description = description.split('</summary>')[0]
            description = removeCDATA(description)
        # build a watch url from the video id
        link = atomItem.split('<yt:videoId>')[1]
        link = link.split('</yt:videoId>')[0]
        link = 'https://www.youtube.com/watch?v=' + link.strip()
        pubDate = atomItem.split('<updated>')[1]
        pubDate = pubDate.split('</updated>')[0]
        pubDateStr = parseFeedDate(pubDate)
        if pubDateStr:
            postFilename = ''
            votesStatus = []
            addNewswireDictEntry(baseDir, domain,
                                 result, pubDateStr,
                                 title, link,
                                 votesStatus, postFilename,
                                 description, moderated, mirrored)
            postCtr += 1
            if postCtr >= maxPostsPerSource:
                break
    return result
def xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
                 moderated: bool, mirrored: bool,
                 maxPostsPerSource: int,
                 maxFeedItemSizeKb: int) -> {}:
    """Converts an xml string to a dictionary

    Dispatches to the YouTube, RSS 2.0 or Atom parser depending on
    markers found in the feed text; returns {} for anything else.
    """
    feedArgs = (baseDir, domain, xmlStr, moderated, mirrored,
                maxPostsPerSource, maxFeedItemSizeKb)
    if '<yt:videoId>' in xmlStr and '<yt:channelId>' in xmlStr:
        print('YouTube feed: reading')
        return atomFeedYTToDict(*feedArgs)
    if 'rss version="2.0"' in xmlStr:
        return xml2StrToDict(*feedArgs)
    if 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
        return atomFeedToDict(*feedArgs)
    return {}
def YTchannelToAtomFeed(url: str) -> str:
    """Converts a YouTube channel url into an atom feed url

    Non-channel urls are returned unchanged. Trailing path sections
    (eg. .../channel/ID/videos) and query parameters are stripped so
    that only the channel id is used.
    """
    if 'youtube.com/channel/' not in url:
        return url
    channelId = url.split('youtube.com/channel/')[1].strip()
    # channel urls may have trailing sections, eg. .../channel/ID/videos
    if '/' in channelId:
        channelId = channelId.split('/')[0]
    # strip any query parameters
    if '?' in channelId:
        channelId = channelId.split('?')[0]
    channelUrl = \
        'https://www.youtube.com/feeds/videos.xml?channel_id=' + channelId
    print('YouTube feed: ' + channelUrl)
    return channelUrl
def getRSS(baseDir: str, domain: str, session, url: str,
           moderated: bool, mirrored: bool,
           maxPostsPerSource: int, maxFeedSizeKb: int,
           maxFeedItemSizeKb: int) -> {}:
    """Returns an RSS url as a dict

    Returns None if the url is not a string, no session was given,
    the fetch failed, or the feed was too large / contained invalid
    characters.
    """
    if not isinstance(url, str):
        print('url: ' + str(url))
        print('ERROR: getRSS url should be a string')
        return None
    if not session:
        # previously execution continued after this warning and then
        # crashed with AttributeError on session.get
        print('WARN: no session specified for getRSS')
        return None
    sessionHeaders = {
        'Accept': 'text/xml; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) ' +
                      'Gecko/20100101 Firefox/81.0'
    }
    sessionParams = {}
    # convert YouTube channel urls into their atom feed equivalent
    url = YTchannelToAtomFeed(url)
    try:
        result = session.get(url, headers=sessionHeaders,
                             params=sessionParams)
        if result:
            if int(len(result.text) / 1024) < maxFeedSizeKb and \
               not containsInvalidChars(result.text):
                return xmlStrToDict(baseDir, domain, result.text,
                                    moderated, mirrored,
                                    maxPostsPerSource,
                                    maxFeedItemSizeKb)
            print('WARN: feed is too large, ' +
                  'or contains invalid characters: ' + url)
        else:
            print('WARN: no result returned for feed ' + url)
    except (requests.exceptions.RequestException, ValueError) as e:
        print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' +
              'headers: ' + str(sessionHeaders) + '\n' +
              'params: ' + str(sessionParams) + '\n')
        print(e)
    except SocketError as e:
        if e.errno == errno.ECONNRESET:
            print('WARN: connection was reset during getRSS')
        print(e)
    return None
def getRSSfromDict(baseDir: str, newswire: {},
                   httpPrefix: str, domainFull: str,
                   title: str, translate: {}) -> str:
    """Returns an rss feed from the current newswire dict.
    This allows other instances to subscribe to the same newswire
    """
    rssStr = rss2Header(httpPrefix, None, domainFull,
                        'Newswire', translate)
    if not newswire:
        return ''
    for published, fields in newswire.items():
        # normalize the published date into ISO form ending with Z
        if '+00:00' in published:
            published = published.replace('+00:00', 'Z').strip()
            published = published.replace(' ', 'T')
        else:
            withOffset = \
                datetime.strptime(published, "%Y-%m-%d %H:%M:%S%z")
            published = withOffset.strftime("%Y-%m-%dT%H:%M:%SZ")
        try:
            pubDate = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
        except Exception as e:
            print('WARN: Unable to convert date ' + published + ' ' + str(e))
            continue
        description = removeCDATA(firstParagraphFromString(fields[4]))
        url = fields[1]
        # make relative local urls absolute
        if '://' not in url and domainFull not in url:
            url = httpPrefix + '://' + domainFull + url
        itemStr = '<item>\n'
        itemStr += ' <title>' + fields[0] + '</title>\n'
        itemStr += ' <description>' + description + '</description>\n'
        itemStr += ' <link>' + url + '</link>\n'
        itemStr += ' <pubDate>' + \
            pubDate.strftime("%a, %d %b %Y %H:%M:%S UT") + '</pubDate>\n'
        itemStr += '</item>\n'
        rssStr += itemStr
    rssStr += rss2Footer()
    return rssStr
def isNewswireBlogPost(postJsonObject: {}) -> bool:
    """Is the given object a blog post?
    There isn't any difference between a blog post and a newswire blog post
    but we may here need to check for different properties than
    isBlogPost does
    """
    if not postJsonObject:
        return False
    obj = postJsonObject.get('object')
    if not isinstance(obj, dict):
        return False
    # all of these fields must be present and non-empty
    requiredFields = ('summary', 'url', 'content', 'published')
    if all(obj.get(fieldName) for fieldName in requiredFields):
        return isPublicPost(postJsonObject)
    return False
def getHashtagsFromPost(postJsonObject: {}) -> []:
    """Returns a list of any hashtags within a post

    Only tag entries of type 'Hashtag' with a name are included,
    with duplicates removed.
    """
    obj = postJsonObject.get('object')
    if not obj or not isinstance(obj, dict):
        return []
    tagList = obj.get('tag')
    if not tagList or not isinstance(tagList, list):
        return []
    tags = []
    for tagItem in tagList:
        if not isinstance(tagItem, dict):
            continue
        if not tagItem.get('name') or not tagItem.get('type'):
            continue
        if tagItem['type'] != 'Hashtag':
            continue
        if tagItem['name'] not in tags:
            tags.append(tagItem['name'])
    return tags
def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
                              newswire: {},
                              maxBlogsPerAccount: int,
                              indexFilename: str,
                              maxTags: int) -> None:
    """Adds blogs for the given account to the newswire

    Reads up to maxBlogsPerAccount entries from the account's blog
    timeline index file, loads each post and, if it qualifies as a
    public blog post, adds it to the newswire dict keyed by its
    published date. Posts which cannot be located still count
    towards the maxBlogsPerAccount limit.
    """
    if not os.path.isfile(indexFilename):
        return
    # local blog entries are unmoderated by default
    moderated = False
    # local blogs can potentially be moderated
    moderatedFilename = \
        baseDir + '/accounts/' + nickname + '@' + domain + \
        '/.newswiremoderated'
    if os.path.isfile(moderatedFilename):
        moderated = True
    with open(indexFilename, 'r') as indexFile:
        # sentinel value so that the loop runs at least once;
        # readline returns '' at end of file, which ends the loop
        postFilename = 'start'
        ctr = 0
        while postFilename:
            postFilename = indexFile.readline()
            if postFilename:
                # if this is a full path then remove the directories
                if '/' in postFilename:
                    postFilename = postFilename.split('/')[-1]
                # filename of the post without any extension or path
                # This should also correspond to any index entry in
                # the posts cache
                postUrl = \
                    postFilename.replace('\n', '').replace('\r', '')
                postUrl = postUrl.replace('.json', '').strip()
                # read the post from file
                fullPostFilename = \
                    locatePost(baseDir, nickname,
                               domain, postUrl, False)
                if not fullPostFilename:
                    # the missing post still consumes one slot of
                    # the per-account quota
                    print('Unable to locate post ' + postUrl)
                    ctr += 1
                    if ctr >= maxBlogsPerAccount:
                        break
                    continue
                postJsonObject = None
                if fullPostFilename:
                    postJsonObject = loadJson(fullPostFilename)
                if isNewswireBlogPost(postJsonObject):
                    published = postJsonObject['object']['published']
                    # convert ISO form to "YYYY-MM-DD HH:MM:SS+00:00"
                    published = published.replace('T', ' ')
                    published = published.replace('Z', '+00:00')
                    votes = []
                    if os.path.isfile(fullPostFilename + '.votes'):
                        votes = loadJson(fullPostFilename + '.votes')
                    content = postJsonObject['object']['content']
                    description = firstParagraphFromString(content)
                    description = removeCDATA(description)
                    addNewswireDictEntry(baseDir, domain,
                                         newswire, published,
                                         postJsonObject['object']['summary'],
                                         postJsonObject['object']['url'],
                                         votes, fullPostFilename,
                                         description, moderated, False,
                                         getHashtagsFromPost(postJsonObject),
                                         maxTags)
                ctr += 1
                if ctr >= maxBlogsPerAccount:
                    break
def addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
                       maxBlogsPerAccount: int,
                       maxTags: int) -> None:
    """Adds blogs from each user account into the newswire
    """
    moderationDict = {}
    accountsDir = baseDir + '/accounts'
    # go through each account
    for subdir, dirs, files in os.walk(accountsDir):
        for handle in dirs:
            if '@' not in handle or 'inbox@' in handle:
                continue
            nickname = handle.split('@')[0]
            # has this account been suspended?
            if isSuspended(baseDir, nickname):
                continue
            # accounts can opt out of the newswire
            if os.path.isfile(accountsDir + '/' + handle +
                              '/.nonewswire'):
                continue
            # is there a blogs timeline for this account?
            accountDir = os.path.join(accountsDir, handle)
            blogsIndex = accountDir + '/tlblogs.index'
            if os.path.isfile(blogsIndex):
                domain = handle.split('@')[1]
                addAccountBlogsToNewswire(baseDir, nickname, domain,
                                          newswire, maxBlogsPerAccount,
                                          blogsIndex, maxTags)
    # sort the moderation dict into chronological order, latest first
    # NOTE(review): moderationDict is never populated in this function,
    # so the sorted result is always empty — confirm whether this is
    # leftover scaffolding
    sortedModerationDict = \
        OrderedDict(sorted(moderationDict.items(), reverse=True))
    # save the moderation queue details for later display
    newswireModerationFilename = baseDir + '/accounts/newswiremoderation.txt'
    if sortedModerationDict:
        saveJson(sortedModerationDict, newswireModerationFilename)
    else:
        # remove the file if there is nothing to moderate
        if os.path.isfile(newswireModerationFilename):
            os.remove(newswireModerationFilename)
def getDictFromNewswire(session, baseDir: str, domain: str,
                        maxPostsPerSource: int, maxFeedSizeKb: int,
                        maxTags: int, maxFeedItemSizeKb: int,
                        maxNewswirePosts: int) -> {}:
    """Gets rss feeds as a dictionary from newswire file

    Reads feed urls from accounts/newswire.txt (one per line; '*'
    marks a moderated feed, '!' a mirrored one, '#' a comment),
    merges in local account blogs, and returns an OrderedDict of
    date string -> entry, newest first, trimmed to maxNewswirePosts.
    """
    subscriptionsFilename = baseDir + '/accounts/newswire.txt'
    if not os.path.isfile(subscriptionsFilename):
        return {}
    # NOTE: previously maxPostsPerSource was overwritten with a
    # hard-coded 5 here, which made the parameter ineffective
    # add rss feeds
    rssFeed = []
    with open(subscriptionsFilename, 'r') as fp:
        rssFeed = fp.readlines()
    result = {}
    for url in rssFeed:
        url = url.strip()
        # Does this contain a url?
        if '://' not in url:
            continue
        # is this a comment?
        if url.startswith('#'):
            continue
        # should this feed be moderated?
        moderated = False
        if '*' in url:
            moderated = True
            url = url.replace('*', '').strip()
        # should this feed content be mirrored?
        mirrored = False
        if '!' in url:
            mirrored = True
            url = url.replace('!', '').strip()
        itemsList = getRSS(baseDir, domain, session, url,
                           moderated, mirrored,
                           maxPostsPerSource, maxFeedSizeKb,
                           maxFeedItemSizeKb)
        if itemsList:
            for dateStr, item in itemsList.items():
                result[dateStr] = item
    # add blogs from each user account
    addBlogsToNewswire(baseDir, domain, result,
                       maxPostsPerSource, maxTags)
    # sort into chronological order, latest first
    sortedResult = OrderedDict(sorted(result.items(), reverse=True))
    # are there too many posts? If so then remove the oldest ones
    if len(sortedResult) > maxNewswirePosts:
        for dateStr in list(sortedResult)[maxNewswirePosts:]:
            sortedResult.pop(dateStr)
    return sortedResult