epicyon/newswire.py

633 lines
22 KiB
Python
Raw Normal View History

2020-10-04 09:51:12 +00:00
__filename__ = "newswire.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.1.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
import os
import requests
from socket import error as SocketError
import errno
from datetime import datetime
from collections import OrderedDict
2020-10-25 10:42:38 +00:00
from utils import isPublicPost
2020-10-05 11:11:48 +00:00
from utils import locatePost
from utils import loadJson
from utils import saveJson
2020-10-06 08:58:44 +00:00
from utils import isSuspended
from utils import containsInvalidChars
from utils import removeHtml
from blocking import isBlockedDomain
2020-10-17 14:23:35 +00:00
from blocking import isBlockedHashtag
2020-10-17 16:08:07 +00:00
from filters import isFiltered
2020-10-04 09:51:12 +00:00
2020-10-16 12:11:05 +00:00
2020-10-04 12:29:07 +00:00
def rss2Header(httpPrefix: str,
               nickname: str, domainFull: str,
               title: str, translate: {}) -> str:
    """Returns the opening markup of an RSS 2.0 feed, ending with
    the channel title and link.
    A title starting with 'News' selects the newswire feed, a title
    starting with 'Site' selects the site blog feed, and any other
    title is looked up in translate and linked to the rss feed of
    the given nickname
    """
    if title.startswith('News'):
        channelTitle = 'Newswire'
        channelPath = '/newswire.xml'
    elif title.startswith('Site'):
        channelTitle = domainFull
        channelPath = '/blog/rss.xml'
    else:
        # raises KeyError if there is no translation for the title
        channelTitle = translate[title]
        channelPath = '/users/' + nickname + '/rss.xml'

    header = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + \
        "<rss version=\"2.0\">" + \
        '<channel>'
    header += ' <title>' + channelTitle + '</title>'
    header += ' <link>' + httpPrefix + '://' + domainFull + \
        channelPath + '</link>'
    return header
def rss2Footer() -> str:
    """Returns the closing markup of an RSS 2.0 feed, which ends
    the channel and then the rss document
    """
    closingTags = ('</channel>', '</rss>')
    return ''.join(closingTags)
def getNewswireTags(text: str, maxTags: int) -> []:
    """Extracts the distinct hashtags from the given text, in the
    order in which they appear.
    Text without a '#' or without any space yields no tags.
    Collection stops once maxTags tags have been gathered
    """
    if '#' not in text or ' ' not in text:
        return []

    # turn common punctuation into word separators
    simplified = text
    for separator in (',', ';', '- ', '. '):
        simplified = simplified.replace(separator, ' ')
    simplified = simplified.strip()
    if simplified.endswith('.'):
        simplified = simplified[:-1]

    foundTags = []
    for word in simplified.split(' '):
        # a lone '#' is not a hashtag
        if not word.startswith('#') or len(word) <= 1:
            continue
        if word in foundTags:
            continue
        foundTags.append(word)
        if len(foundTags) >= maxTags:
            break
    return foundTags
2020-10-17 16:08:07 +00:00
def addNewswireDictEntry(baseDir: str, domain: str,
                         newswire: {}, dateStr: str,
                         title: str, link: str,
                         votesStatus: str, postFilename: str,
                         description: str, moderated: bool,
                         mirrored: bool,
                         tags=None, maxTags=32) -> None:
    """Update the newswire dictionary
    Adds an entry to newswire, keyed by dateStr, unless the title
    or description is filtered or any of the hashtags is blocked.
    The stored entry is the list:
    [title, link, votesStatus, postFilename, description,
     moderated, tags, mirrored]
    where tags combines hashtags found within the title and
    description with those passed in via the tags argument, up to
    a maximum of maxTags
    """
    allText = removeHtml(title + ' ' + description)

    # check that none of the text is filtered against
    if isFiltered(baseDir, 'news', domain, allText):
        return

    # the default is None rather than a shared mutable list
    if tags is None:
        tags = []

    # extract hashtags from the text of the feed post
    postTags = getNewswireTags(allText, maxTags)

    # combine the tags into a single list
    for tag in tags:
        if len(postTags) >= maxTags:
            # no room for any further tags
            break
        if tag not in postTags:
            postTags.append(tag)

    # check that no tags are blocked
    for tag in postTags:
        if isBlockedHashtag(baseDir, tag.replace('#', '')):
            return

    newswire[dateStr] = [
        title,
        link,
        votesStatus,
        postFilename,
        description,
        moderated,
        postTags,
        mirrored
    ]
def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
                  moderated: bool, mirrored: bool,
                  maxPostsPerSource: int,
                  maxFeedItemSizeKb: int) -> {}:
    """Converts an xml 2.0 string to a dictionary
    Each <item> having a title, link and pubDate is passed to
    addNewswireDictEntry, keyed by its publication date.
    Items bigger than maxFeedItemSizeKb, items whose link has no
    '://' or belongs to a blocked domain, and items with an
    unrecognized date format are skipped.
    Stops after maxPostsPerSource items have been handled.
    Returns an empty dictionary if the feed contains no items
    """
    if '<item>' not in xmlStr:
        return {}

    requiredTags = (
        '<title>', '</title>',
        '<link>', '</link>',
        '<pubDate>', '</pubDate>'
    )
    # recognized date formats, each with the suffix needed so that
    # the resulting dictionary key ends with a utc offset
    dateFormats = (
        ("%a, %d %b %Y %H:%M:%S %z", ''),
        ("%a, %d %b %Y %H:%M:%S UT", '+00:00')
    )

    result = {}
    postCtr = 0
    maxBytes = maxFeedItemSizeKb * 1024
    # The text before the first <item> is the channel header, which
    # has its own title, link and date but is not a post
    for rssItem in xmlStr.split('<item>')[1:]:
        if len(rssItem) > maxBytes:
            print('WARN: rss feed item is too big')
            continue
        if not all(tag in rssItem for tag in requiredTags):
            continue

        title = rssItem.split('<title>')[1].split('</title>')[0]
        description = ''
        if '<description>' in rssItem and '</description>' in rssItem:
            description = rssItem.split('<description>')[1]
            description = description.split('</description>')[0]
        link = rssItem.split('<link>')[1].split('</link>')[0]
        if '://' not in link:
            continue
        itemDomain = link.split('://')[1]
        if '/' in itemDomain:
            itemDomain = itemDomain.split('/')[0]
        if isBlockedDomain(baseDir, itemDomain):
            continue
        pubDate = rssItem.split('<pubDate>')[1].split('</pubDate>')[0]

        # Only the date conversion is expected to fail, so only its
        # ValueError is handled here
        dateStr = None
        for dateFormat, utcSuffix in dateFormats:
            try:
                publishedDate = datetime.strptime(pubDate, dateFormat)
            except ValueError:
                continue
            dateStr = str(publishedDate) + utcSuffix
            break
        if not dateStr:
            print('WARN: unrecognized RSS date format: ' + pubDate)
            continue

        postFilename = ''
        votesStatus = []
        addNewswireDictEntry(baseDir, domain,
                             result, dateStr,
                             title, link,
                             votesStatus, postFilename,
                             description, moderated, mirrored)
        postCtr += 1
        if postCtr >= maxPostsPerSource:
            break
    return result
def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
                   moderated: bool, mirrored: bool,
                   maxPostsPerSource: int,
                   maxFeedItemSizeKb: int) -> {}:
    """Converts an atom feed string to a dictionary
    Each <entry> having a title, link and updated date is passed
    to addNewswireDictEntry, keyed by its updated date. The
    optional <summary> is used as the description.
    Entries bigger than maxFeedItemSizeKb, entries whose link has
    no '://' or belongs to a blocked domain, and entries with an
    unrecognized date format are skipped.
    Stops after maxPostsPerSource entries have been handled.
    Returns an empty dictionary if the feed contains no entries
    """
    if '<entry>' not in xmlStr:
        return {}

    requiredTags = (
        '<title>', '</title>',
        '<link>', '</link>',
        '<updated>', '</updated>'
    )
    # Both recognized formats denote UTC but parse to a datetime
    # without timezone, so the offset is appended to the key. This
    # keeps the keys in the same form as the rss feed keys
    dateFormats = (
        ("%Y-%m-%dT%H:%M:%SZ", '+00:00'),
        ("%a, %d %b %Y %H:%M:%S UT", '+00:00')
    )

    result = {}
    postCtr = 0
    maxBytes = maxFeedItemSizeKb * 1024
    # The text before the first <entry> is the feed header, which
    # has its own title and updated date but is not a post
    for atomItem in xmlStr.split('<entry>')[1:]:
        if len(atomItem) > maxBytes:
            print('WARN: atom feed item is too big')
            continue
        if not all(tag in atomItem for tag in requiredTags):
            continue

        title = atomItem.split('<title>')[1].split('</title>')[0]
        description = ''
        if '<summary>' in atomItem and '</summary>' in atomItem:
            description = atomItem.split('<summary>')[1]
            description = description.split('</summary>')[0]
        link = atomItem.split('<link>')[1].split('</link>')[0]
        if '://' not in link:
            continue
        itemDomain = link.split('://')[1]
        if '/' in itemDomain:
            itemDomain = itemDomain.split('/')[0]
        if isBlockedDomain(baseDir, itemDomain):
            continue
        pubDate = atomItem.split('<updated>')[1].split('</updated>')[0]

        # Only the date conversion is expected to fail, so only its
        # ValueError is handled here
        dateStr = None
        for dateFormat, utcSuffix in dateFormats:
            try:
                publishedDate = datetime.strptime(pubDate, dateFormat)
            except ValueError:
                continue
            dateStr = str(publishedDate) + utcSuffix
            break
        if not dateStr:
            print('WARN: unrecognized atom feed date format: ' + pubDate)
            continue

        postFilename = ''
        votesStatus = []
        addNewswireDictEntry(baseDir, domain,
                             result, dateStr,
                             title, link,
                             votesStatus, postFilename,
                             description, moderated, mirrored)
        postCtr += 1
        if postCtr >= maxPostsPerSource:
            break
    return result
def xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
                 moderated: bool, mirrored: bool,
                 maxPostsPerSource: int,
                 maxFeedItemSizeKb: int) -> {}:
    """Converts a feed string into a dictionary, choosing the
    parser from markers within the text: RSS 2.0 is tried first,
    then Atom. Any other text gives an empty dictionary
    """
    if 'rss version="2.0"' in xmlStr:
        feedParser = xml2StrToDict
    elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
        feedParser = atomFeedToDict
    else:
        return {}
    return feedParser(baseDir, domain,
                      xmlStr, moderated, mirrored,
                      maxPostsPerSource, maxFeedItemSizeKb)
def getRSS(baseDir: str, domain: str, session, url: str,
           moderated: bool, mirrored: bool,
           maxPostsPerSource: int, maxFeedSizeKb: int,
           maxFeedItemSizeKb: int) -> {}:
    """Returns an RSS url as a dict
    The feed is fetched with session.get and converted using
    xmlStrToDict. None is returned if the url is not a string,
    if there is no session, if the request fails, or if the feed
    is rejected for being maxFeedSizeKb or larger or for
    containing invalid characters
    """
    if not isinstance(url, str):
        print('url: ' + str(url))
        print('ERROR: getRSS url should be a string')
        return None
    if not session:
        # without a session nothing can be fetched
        print('WARN: no session specified for getRSS')
        return None

    sessionHeaders = {
        'Accept': 'text/xml; charset=UTF-8',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'
    }
    sessionParams = {}
    try:
        result = session.get(url, headers=sessionHeaders, params=sessionParams)
        if result:
            feedText = result.text
            if int(len(feedText) / 1024) >= maxFeedSizeKb:
                print('WARN: feed is too large: ' + url)
            elif containsInvalidChars(feedText):
                print('WARN: feed contains invalid characters: ' + url)
            else:
                return xmlStrToDict(baseDir, domain, feedText,
                                    moderated, mirrored,
                                    maxPostsPerSource,
                                    maxFeedItemSizeKb)
    except requests.exceptions.RequestException as e:
        print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' +
              'headers: ' + str(sessionHeaders) + '\n' +
              'params: ' + str(sessionParams) + '\n')
        print(e)
    except ValueError as e:
        print('ERROR: getRSS failed\nurl: ' + str(url) + '\n' +
              'headers: ' + str(sessionHeaders) + '\n' +
              'params: ' + str(sessionParams) + '\n')
        print(e)
    except SocketError as e:
        if e.errno == errno.ECONNRESET:
            print('WARN: connection was reset during getRSS')
        print(e)
    return None
2020-10-04 12:29:07 +00:00
def getRSSfromDict(baseDir: str, newswire: {},
                   httpPrefix: str, domainFull: str,
                   title: str, translate: {}) -> str:
    """Returns an rss feed from the current newswire dict.
    This allows other instances to subscribe to the same newswire
    Each newswire key is a date of the form
    'YYYY-MM-DD HH:MM:SS' followed by a utc offset, and each value
    is a list whose first two fields are the title and the link.
    Entries whose date can't be converted are skipped with a
    warning. An empty string is returned if the newswire is empty
    """
    if not newswire:
        return ''
    rssStr = rss2Header(httpPrefix,
                        None, domainFull,
                        'Newswire', translate)
    for published, fields in newswire.items():
        try:
            if '+00:00' in published:
                utcStr = published.replace('+00:00', 'Z').strip()
                utcStr = utcStr.replace(' ', 'T')
                pubDate = datetime.strptime(utcStr, "%Y-%m-%dT%H:%M:%SZ")
            else:
                publishedWithOffset = \
                    datetime.strptime(published, "%Y-%m-%d %H:%M:%S%z")
                # The feed dates are labelled UT, so shift by the
                # offset rather than just dropping it
                pubDate = \
                    publishedWithOffset - publishedWithOffset.utcoffset()
                pubDate = pubDate.replace(tzinfo=None)
        except (ValueError, TypeError) as e:
            # one bad date should not prevent the feed being produced
            print('WARN: Unable to convert date ' +
                  str(published) + ' ' + str(e))
            continue
        rssStr += '<item>\n'
        rssStr += ' <title>' + fields[0] + '</title>\n'
        url = fields[1]
        # only relative links to local posts need the domain prefixed,
        # links from other feeds are already absolute
        if '://' not in url and domainFull not in url:
            url = httpPrefix + '://' + domainFull + url
        rssStr += ' <link>' + url + '</link>\n'
        rssDateStr = pubDate.strftime("%a, %d %b %Y %H:%M:%S UT")
        rssStr += ' <pubDate>' + rssDateStr + '</pubDate>\n'
        rssStr += '</item>\n'
    rssStr += rss2Footer()
    return rssStr
2020-10-25 10:45:42 +00:00
def isNewswireBlogPost(postJsonObject: {}) -> bool:
    """Returns true if the given post can appear in the newswire.
    The post needs an object dictionary with non-empty summary,
    url, content and published fields, and it has to be public.
    This is kept separate from the ordinary blog post check so
    that different properties can be tested for here
    """
    if not postJsonObject:
        return False
    postObject = postJsonObject.get('object')
    if not postObject or not isinstance(postObject, dict):
        return False
    for fieldName in ('summary', 'url', 'content', 'published'):
        if not postObject.get(fieldName):
            return False
    return isPublicPost(postJsonObject)
2020-10-16 20:13:23 +00:00
def getHashtagsFromPost(postJsonObject: {}) -> []:
    """Returns the distinct hashtag names from the tag list of
    the post object, in the order in which they appear.
    Only tag dictionaries of type Hashtag having a name are used.
    An empty list is returned if the post has no usable tag list
    """
    postObject = postJsonObject.get('object')
    if not postObject or not isinstance(postObject, dict):
        return []
    tagList = postObject.get('tag')
    if not tagList or not isinstance(tagList, list):
        return []

    hashtags = []
    for tagItem in tagList:
        if not isinstance(tagItem, dict):
            continue
        if not tagItem.get('name') or not tagItem.get('type'):
            continue
        if tagItem['type'] != 'Hashtag':
            continue
        if tagItem['name'] not in hashtags:
            hashtags.append(tagItem['name'])
    return hashtags
2020-11-08 09:47:01 +00:00
def firstParagraph(postJsonObject: {}) -> str:
    """Returns the text of the first paragraph of the post content
    with html removed, for use as the summary within the newswire.
    If the content has no <p> and </p> pair then the whole content
    is returned with html removed
    """
    content = postJsonObject['object']['content']
    if '<p>' in content and '</p>' in content:
        afterOpeningTag = content.split('<p>')[1]
        # splitting leaves the text unchanged if the closing tag
        # only occurs before the first opening tag
        paragraph = afterOpeningTag.split('</p>')[0]
        return removeHtml(paragraph)
    return removeHtml(content)
2020-10-05 11:11:48 +00:00
def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
                              newswire: {},
                              maxBlogsPerAccount: int,
                              indexFilename: str,
                              maxTags: int) -> None:
    """Reads the blog index file of an account and adds each
    newswire blog post listed within it to the newswire.
    At most maxBlogsPerAccount index lines are handled, including
    lines for posts which can't be located or aren't suitable.
    Does nothing if the index file doesn't exist
    """
    if not os.path.isfile(indexFilename):
        return

    # local blogs are unmoderated unless the account has the
    # .newswiremoderated flag file
    accountDir = baseDir + '/accounts/' + nickname + '@' + domain
    moderated = os.path.isfile(accountDir + '/.newswiremoderated')

    with open(indexFilename, 'r') as indexFile:
        linesHandled = 0
        for indexLine in indexFile:
            # if this is a full path then remove the directories
            postName = indexLine
            if '/' in postName:
                postName = postName.split('/')[-1]

            # name of the post without any extension or line ending,
            # which is what locatePost is given
            postUrl = postName.replace('\n', '').replace('\r', '')
            postUrl = postUrl.replace('.json', '').strip()

            fullPostFilename = \
                locatePost(baseDir, nickname,
                           domain, postUrl, False)
            if not fullPostFilename:
                print('Unable to locate post ' + postUrl)
                linesHandled += 1
                if linesHandled >= maxBlogsPerAccount:
                    break
                continue

            # read the post from file
            postJsonObject = loadJson(fullPostFilename)
            if isNewswireBlogPost(postJsonObject):
                postObject = postJsonObject['object']
                # convert the date into the form used for newswire keys
                published = postObject['published'].replace('T', ' ')
                published = published.replace('Z', '+00:00')

                votesFilename = fullPostFilename + '.votes'
                votes = []
                if os.path.isfile(votesFilename):
                    votes = loadJson(votesFilename)

                description = firstParagraph(postJsonObject)
                addNewswireDictEntry(baseDir, domain,
                                     newswire, published,
                                     postObject['summary'],
                                     postObject['url'],
                                     votes, fullPostFilename,
                                     description, moderated, False,
                                     getHashtagsFromPost(postJsonObject),
                                     maxTags)

            linesHandled += 1
            if linesHandled >= maxBlogsPerAccount:
                break
2020-10-17 16:08:07 +00:00
def addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
                       maxBlogsPerAccount: int,
                       maxTags: int) -> None:
    """Adds blogs from each user account into the newswire
    Accounts are the directories directly within baseDir/accounts
    whose name contains '@'. The shared inbox, suspended accounts
    and accounts having a .nonewswire file are skipped.
    The moderation queue file is also saved here, or removed if
    there is nothing to moderate
    """
    moderationDict = {}

    # go through each account
    for subdir, dirs, files in os.walk(baseDir + '/accounts'):
        for handle in dirs:
            if '@' not in handle:
                continue
            if 'inbox@' in handle:
                continue

            nickname = handle.split('@')[0]

            # has this account been suspended?
            if isSuspended(baseDir, nickname):
                continue

            if os.path.isfile(baseDir + '/accounts/' + handle +
                              '/.nonewswire'):
                continue

            # is there a blogs timeline for this account?
            accountDir = os.path.join(baseDir + '/accounts', handle)
            blogsIndex = accountDir + '/tlblogs.index'
            if os.path.isfile(blogsIndex):
                # use the domain of the account rather than
                # overwriting the domain argument
                accountDomain = handle.split('@')[1]
                addAccountBlogsToNewswire(baseDir, nickname, accountDomain,
                                          newswire, maxBlogsPerAccount,
                                          blogsIndex, maxTags)
        # Accounts only exist at the top level. Without this the walk
        # descends into every account subdirectory and treats any
        # nested directory with '@' in its name as an account
        break

    # sort the moderation dict into chronological order, latest first
    sortedModerationDict = \
        OrderedDict(sorted(moderationDict.items(), reverse=True))

    # save the moderation queue details for later display
    newswireModerationFilename = baseDir + '/accounts/newswiremoderation.txt'
    if sortedModerationDict:
        saveJson(sortedModerationDict, newswireModerationFilename)
    else:
        # remove the file if there is nothing to moderate
        if os.path.isfile(newswireModerationFilename):
            os.remove(newswireModerationFilename)
2020-10-05 11:11:48 +00:00
2020-10-17 16:08:07 +00:00
def getDictFromNewswire(session, baseDir: str, domain: str,
                        maxPostsPerSource: int, maxFeedSizeKb: int,
                        maxTags: int, maxFeedItemSizeKb: int) -> {}:
    """Gets rss feeds as a dictionary from newswire file
    Each line of baseDir/accounts/newswire.txt containing '://' is
    a feed url, unless it starts with '#', which is a comment.
    A '*' within the line marks the feed as moderated and a '!'
    marks it as mirrored. Blog posts from the local accounts are
    then added.
    Returns the entries keyed by date string and sorted with the
    latest first, or an empty dictionary if there is no newswire
    file
    """
    subscriptionsFilename = baseDir + '/accounts/newswire.txt'
    if not os.path.isfile(subscriptionsFilename):
        return {}

    # Use the limit given by the caller, which was previously always
    # overwritten, falling back to 5 posts if it is unusable
    if not isinstance(maxPostsPerSource, int) or maxPostsPerSource < 1:
        maxPostsPerSource = 5

    # add rss feeds
    rssFeed = []
    with open(subscriptionsFilename, 'r') as fp:
        rssFeed = fp.readlines()
    result = {}
    for url in rssFeed:
        url = url.strip()

        # Does this contain a url?
        if '://' not in url:
            continue

        # is this a comment?
        if url.startswith('#'):
            continue

        # should this feed be moderated?
        moderated = False
        if '*' in url:
            moderated = True
            url = url.replace('*', '').strip()

        # should this feed content be mirrored?
        mirrored = False
        if '!' in url:
            mirrored = True
            url = url.replace('!', '').strip()

        itemsList = getRSS(baseDir, domain, session, url,
                           moderated, mirrored,
                           maxPostsPerSource, maxFeedSizeKb,
                           maxFeedItemSizeKb)
        if itemsList:
            for dateStr, item in itemsList.items():
                result[dateStr] = item

    # add blogs from each user account
    addBlogsToNewswire(baseDir, domain, result,
                       maxPostsPerSource, maxTags)

    # sort into chronological order, latest first
    sortedResult = OrderedDict(sorted(result.items(), reverse=True))
    return sortedResult