2020-10-04 09:51:12 +00:00
|
|
|
__filename__ = "newswire.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
2021-01-26 10:07:42 +00:00
|
|
|
__version__ = "1.2.0"
|
2020-10-04 09:51:12 +00:00
|
|
|
__maintainer__ = "Bob Mottram"
|
|
|
|
__email__ = "bob@freedombone.net"
|
|
|
|
__status__ = "Production"
|
|
|
|
|
|
|
|
import os
|
2021-02-12 11:28:00 +00:00
|
|
|
import json
|
2020-10-04 09:51:12 +00:00
|
|
|
import requests
|
|
|
|
from socket import error as SocketError
|
|
|
|
import errno
|
|
|
|
from datetime import datetime
|
2020-11-22 15:33:11 +00:00
|
|
|
from datetime import timedelta
|
2020-11-22 19:09:35 +00:00
|
|
|
from datetime import timezone
|
2020-10-04 09:51:12 +00:00
|
|
|
from collections import OrderedDict
|
2020-12-21 12:11:45 +00:00
|
|
|
from utils import validPostDate
|
2020-12-22 10:30:52 +00:00
|
|
|
from categories import setHashtagCategory
|
2020-11-08 10:45:33 +00:00
|
|
|
from utils import firstParagraphFromString
|
2020-10-25 10:42:38 +00:00
|
|
|
from utils import isPublicPost
|
2020-10-05 11:11:48 +00:00
|
|
|
from utils import locatePost
|
|
|
|
from utils import loadJson
|
2020-10-06 11:28:32 +00:00
|
|
|
from utils import saveJson
|
2020-10-06 08:58:44 +00:00
|
|
|
from utils import isSuspended
|
2020-10-16 11:58:31 +00:00
|
|
|
from utils import containsInvalidChars
|
2020-10-25 12:47:16 +00:00
|
|
|
from utils import removeHtml
|
2020-10-16 11:58:31 +00:00
|
|
|
from blocking import isBlockedDomain
|
2020-10-17 14:23:35 +00:00
|
|
|
from blocking import isBlockedHashtag
|
2020-10-17 16:08:07 +00:00
|
|
|
from filters import isFiltered
|
2020-10-04 09:51:12 +00:00
|
|
|
|
2020-10-16 12:11:05 +00:00
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _removeCDATA(text: str) -> str:
|
2020-11-22 12:18:43 +00:00
|
|
|
"""Removes any CDATA from the given text
|
|
|
|
"""
|
|
|
|
if 'CDATA[' in text:
|
|
|
|
text = text.split('CDATA[')[1]
|
|
|
|
if ']' in text:
|
|
|
|
text = text.split(']')[0]
|
|
|
|
return text
|
|
|
|
|
|
|
|
|
2020-10-04 12:29:07 +00:00
|
|
|
def rss2Header(httpPrefix: str,
               nickname: str, domainFull: str,
               title: str, translate: {}) -> str:
    """Header for an RSS 2.0 feed

    httpPrefix and domainFull are used to build the channel link.
    A title beginning with 'News' gives the newswire channel, one
    beginning with 'Site' gives the site blog channel, and any other
    title gives the blog channel for the given nickname.
    In the last case the title is looked up within the translate
    dictionary, and is used as-is if it has no translation.
    Returns the opening xml, up to and including the channel link.
    """
    rssStr = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
    rssStr += "<rss version=\"2.0\">"
    rssStr += '<channel>'

    if title.startswith('News'):
        rssStr += ' <title>Newswire</title>'
        rssStr += ' <link>' + httpPrefix + '://' + domainFull + \
            '/newswire.xml' + '</link>'
    elif title.startswith('Site'):
        rssStr += ' <title>' + domainFull + '</title>'
        rssStr += ' <link>' + httpPrefix + '://' + domainFull + \
            '/blog/rss.xml' + '</link>'
    else:
        # a missing translation should not raise a KeyError
        # and prevent the feed from being generated
        titleStr = title
        if translate:
            titleStr = translate.get(title, title)
        rssStr += ' <title>' + titleStr + '</title>'
        rssStr += ' <link>' + httpPrefix + '://' + domainFull + \
            '/users/' + nickname + '/rss.xml' + '</link>'
    return rssStr
|
|
|
|
|
|
|
|
|
|
|
|
def rss2Footer() -> str:
    """Footer for an RSS 2.0 feed

    Returns the tags which close the channel and the rss document.
    """
    closingTags = ('</channel>', '</rss>')
    return ''.join(closingTags)
|
|
|
|
|
|
|
|
|
2020-10-23 14:41:29 +00:00
|
|
|
def getNewswireTags(text: str, maxTags: int) -> []:
    """Returns a list of hashtags found in the given text

    Hashtags are words beginning with '#' which are longer than one
    character. Each hashtag appears once, in the order in which it
    was found, and the search stops when maxTags have been collected.
    Text without a '#' or without any space gives an empty list.
    """
    if '#' not in text or ' ' not in text:
        return []

    # turn punctuation which can follow a hashtag into spaces
    simplified = text
    for separator in (',', ';', '- ', '. '):
        simplified = simplified.replace(separator, ' ')
    simplified = simplified.strip()
    if simplified.endswith('.'):
        simplified = simplified[:-1]

    found = []
    for word in simplified.split(' '):
        if len(word) <= 1 or not word.startswith('#'):
            continue
        if word in found:
            continue
        found.append(word)
        if len(found) >= maxTags:
            break
    return found
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _addNewswireDictEntry(baseDir: str, domain: str,
                          newswire: {}, dateStr: str,
                          title: str, link: str,
                          votesStatus: str, postFilename: str,
                          description: str, moderated: bool,
                          mirrored: bool,
                          tags=None, maxTags=32) -> None:
    """Update the newswire dictionary

    Adds an entry to newswire, keyed by dateStr, which is a list of
    title, link, votesStatus, postFilename, description, moderated,
    hashtags and mirrored, in that order.
    Markup is removed from the title and description before use.
    Nothing is added if the text is filtered or if any of the
    hashtags is blocked.
    tags is an optional list of hashtags to be combined with the ones
    found within the title and description. The number of hashtags
    is limited by maxTags.
    """
    # remove any markup
    title = removeHtml(title)
    description = removeHtml(description)

    allText = title + ' ' + description

    # check that none of the text is filtered against
    if isFiltered(baseDir, None, None, allText):
        return

    # the default is None rather than a list, because a default list
    # would be a single object shared between all calls
    if tags is None:
        tags = []

    # extract hashtags from the text of the feed post
    postTags = getNewswireTags(allText, maxTags)

    # combine the tags into a single list
    for tag in tags:
        if tag not in postTags and len(postTags) < maxTags:
            postTags.append(tag)

    # check that no tags are blocked
    if any(isBlockedHashtag(baseDir, tag) for tag in postTags):
        return

    newswire[dateStr] = [
        title,
        link,
        votesStatus,
        postFilename,
        description,
        moderated,
        postTags,
        mirrored
    ]
|
2020-10-16 19:25:55 +00:00
|
|
|
|
|
|
|
|
2021-03-14 19:53:22 +00:00
|
|
|
def _validFeedDate(pubDate: str, debug=False) -> bool:
    """Returns the result of validPostDate for the given feed date

    pubDate is expected in the form produced by parseFeedDate.
    NOTE(review): the 90 passed to validPostDate is presumably a
    maximum age in days - confirm against utils.validPostDate
    """
    # convert from YY-MM-DD HH:MM:SS+00:00 to
    # YY-MM-DDTHH:MM:SSZ
    isoDate = pubDate.replace(' ', 'T')
    isoDate = isoDate.replace('+00:00', 'Z')
    return validPostDate(isoDate, 90, debug)
|
2020-12-21 12:11:45 +00:00
|
|
|
|
|
|
|
|
2020-11-22 19:01:18 +00:00
|
|
|
def parseFeedDate(pubDate: str) -> str:
    """Returns a UTC date string based on the given date string

    This tries a number of formats to see which work.
    Whitespace around the date is ignored.
    The returned string has the form YYYY-MM-DD HH:MM:SS+00:00
    and None is returned if none of the formats match.
    """
    formats = ("%a, %d %b %Y %H:%M:%S %z",
               "%a, %d %b %Y %H:%M:%S GMT",
               "%a, %d %b %Y %H:%M:%S EST",
               "%a, %d %b %Y %H:%M:%S UT",
               "%Y-%m-%dT%H:%M:%SZ",
               "%Y-%m-%dT%H:%M:%S%z")
    # text which has to be present within both the date and the
    # format, or within neither, for the format to be worth trying
    markers = (',', 'Z', 'GMT', 'EST', 'UT')

    # feeds can have whitespace or newlines around the date
    pubDate = pubDate.strip()

    publishedDate = None
    for dateFormat in formats:
        if any((marker in pubDate) != (marker in dateFormat)
               for marker in markers):
            continue

        try:
            publishedDate = \
                datetime.strptime(pubDate, dateFormat)
        except ValueError:
            # strptime raises ValueError when the format doesn't match
            print('WARN: unrecognized date format: ' +
                  pubDate + ' ' + dateFormat)
            continue

        # EST is matched as literal text, so convert it to UTC here
        if pubDate.endswith(' EST'):
            publishedDate = publishedDate + timedelta(hours=5)
        break

    if not publishedDate:
        return None

    # remove any timezone offset so that the time is UTC
    offset = publishedDate.utcoffset()
    if offset:
        publishedDate = publishedDate - offset
    # convert local date to UTC
    publishedDate = publishedDate.replace(tzinfo=timezone.utc)

    pubDateStr = str(publishedDate)
    if not pubDateStr.endswith('+00:00'):
        pubDateStr += '+00:00'
    return pubDateStr
|
2020-11-22 18:14:40 +00:00
|
|
|
|
|
|
|
|
2020-12-05 13:38:07 +00:00
|
|
|
def loadHashtagCategories(baseDir: str, language: str) -> None:
    """Loads an rss file containing hashtag categories

    categories.xml within baseDir is used if it exists, otherwise
    the file for the given language within the defaultcategories
    directory. Nothing happens if neither file exists.
    """
    categoriesFilename = baseDir + '/categories.xml'
    if not os.path.isfile(categoriesFilename):
        # fall back to the default categories for this language
        categoriesFilename = \
            baseDir + '/defaultcategories/' + language + '.xml'
        if not os.path.isfile(categoriesFilename):
            return

    with open(categoriesFilename, 'r') as categoriesFile:
        categoriesXml = categoriesFile.read()
        _xml2StrToHashtagCategories(baseDir, categoriesXml, 1024, True)
|
2020-12-05 13:38:07 +00:00
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _xml2StrToHashtagCategories(baseDir: str, xmlStr: str,
|
|
|
|
maxCategoriesFeedItemSizeKb: int,
|
|
|
|
force=False) -> None:
|
2020-12-02 16:18:36 +00:00
|
|
|
"""Updates hashtag categories based upon an rss feed
|
|
|
|
"""
|
|
|
|
rssItems = xmlStr.split('<item>')
|
|
|
|
maxBytes = maxCategoriesFeedItemSizeKb * 1024
|
|
|
|
for rssItem in rssItems:
|
|
|
|
if not rssItem:
|
|
|
|
continue
|
|
|
|
if len(rssItem) > maxBytes:
|
|
|
|
print('WARN: rss categories feed item is too big')
|
|
|
|
continue
|
|
|
|
if '<title>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '</title>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '<description>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '</description>' not in rssItem:
|
|
|
|
continue
|
|
|
|
categoryStr = rssItem.split('<title>')[1]
|
|
|
|
categoryStr = categoryStr.split('</title>')[0].strip()
|
|
|
|
if not categoryStr:
|
|
|
|
continue
|
2020-12-03 10:12:09 +00:00
|
|
|
if 'CDATA' in categoryStr:
|
|
|
|
continue
|
2020-12-02 16:18:36 +00:00
|
|
|
hashtagListStr = rssItem.split('<description>')[1]
|
|
|
|
hashtagListStr = hashtagListStr.split('</description>')[0].strip()
|
|
|
|
if not hashtagListStr:
|
|
|
|
continue
|
2020-12-03 10:12:09 +00:00
|
|
|
if 'CDATA' in hashtagListStr:
|
|
|
|
continue
|
2020-12-02 16:18:36 +00:00
|
|
|
hashtagList = hashtagListStr.split(' ')
|
2020-12-02 22:40:46 +00:00
|
|
|
if not isBlockedHashtag(baseDir, categoryStr):
|
|
|
|
for hashtag in hashtagList:
|
2020-12-05 14:43:29 +00:00
|
|
|
setHashtagCategory(baseDir, hashtag, categoryStr, force)
|
2020-12-02 16:18:36 +00:00
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
|
|
|
|
moderated: bool, mirrored: bool,
|
|
|
|
maxPostsPerSource: int,
|
|
|
|
maxFeedItemSizeKb: int,
|
|
|
|
maxCategoriesFeedItemSizeKb: int) -> {}:
|
2020-12-14 14:22:44 +00:00
|
|
|
"""Converts an xml RSS 2.0 string to a dictionary
|
2020-10-04 09:51:12 +00:00
|
|
|
"""
|
|
|
|
if '<item>' not in xmlStr:
|
|
|
|
return {}
|
|
|
|
result = {}
|
2020-12-09 10:38:09 +00:00
|
|
|
|
|
|
|
# is this an rss feed containing hashtag categories?
|
2020-12-02 16:18:36 +00:00
|
|
|
if '<title>#categories</title>' in xmlStr:
|
2020-12-22 18:06:23 +00:00
|
|
|
_xml2StrToHashtagCategories(baseDir, xmlStr,
|
|
|
|
maxCategoriesFeedItemSizeKb)
|
2020-12-02 16:18:36 +00:00
|
|
|
return {}
|
2020-12-09 10:38:09 +00:00
|
|
|
|
2020-10-04 09:51:12 +00:00
|
|
|
rssItems = xmlStr.split('<item>')
|
2020-10-16 10:13:14 +00:00
|
|
|
postCtr = 0
|
2020-11-03 16:04:25 +00:00
|
|
|
maxBytes = maxFeedItemSizeKb * 1024
|
2020-10-04 09:51:12 +00:00
|
|
|
for rssItem in rssItems:
|
2020-11-27 22:43:34 +00:00
|
|
|
if not rssItem:
|
|
|
|
continue
|
2020-11-03 16:04:25 +00:00
|
|
|
if len(rssItem) > maxBytes:
|
|
|
|
print('WARN: rss feed item is too big')
|
|
|
|
continue
|
2020-10-04 09:51:12 +00:00
|
|
|
if '<title>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '</title>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '<link>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '</link>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '<pubDate>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '</pubDate>' not in rssItem:
|
|
|
|
continue
|
|
|
|
title = rssItem.split('<title>')[1]
|
2020-12-22 18:06:23 +00:00
|
|
|
title = _removeCDATA(title.split('</title>')[0])
|
2021-02-13 21:48:24 +00:00
|
|
|
title = removeHtml(title)
|
2020-10-07 12:05:49 +00:00
|
|
|
description = ''
|
|
|
|
if '<description>' in rssItem and '</description>' in rssItem:
|
|
|
|
description = rssItem.split('<description>')[1]
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description.split('</description>')[0])
|
2020-11-21 23:18:34 +00:00
|
|
|
else:
|
|
|
|
if '<media:description>' in rssItem and \
|
|
|
|
'</media:description>' in rssItem:
|
|
|
|
description = rssItem.split('<media:description>')[1]
|
|
|
|
description = description.split('</media:description>')[0]
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description)
|
2020-10-04 09:51:12 +00:00
|
|
|
link = rssItem.split('<link>')[1]
|
|
|
|
link = link.split('</link>')[0]
|
2020-10-16 11:58:31 +00:00
|
|
|
if '://' not in link:
|
|
|
|
continue
|
2020-10-17 20:53:36 +00:00
|
|
|
itemDomain = link.split('://')[1]
|
|
|
|
if '/' in itemDomain:
|
|
|
|
itemDomain = itemDomain.split('/')[0]
|
|
|
|
if isBlockedDomain(baseDir, itemDomain):
|
2020-10-16 11:58:31 +00:00
|
|
|
continue
|
2020-10-04 09:51:12 +00:00
|
|
|
pubDate = rssItem.split('<pubDate>')[1]
|
|
|
|
pubDate = pubDate.split('</pubDate>')[0]
|
2020-11-22 18:14:40 +00:00
|
|
|
|
2020-11-22 19:01:18 +00:00
|
|
|
pubDateStr = parseFeedDate(pubDate)
|
|
|
|
if pubDateStr:
|
2020-12-22 18:06:23 +00:00
|
|
|
if _validFeedDate(pubDateStr):
|
2020-12-21 12:11:45 +00:00
|
|
|
postFilename = ''
|
|
|
|
votesStatus = []
|
2020-12-22 18:06:23 +00:00
|
|
|
_addNewswireDictEntry(baseDir, domain,
|
|
|
|
result, pubDateStr,
|
|
|
|
title, link,
|
|
|
|
votesStatus, postFilename,
|
2021-02-12 11:28:00 +00:00
|
|
|
description, moderated,
|
|
|
|
mirrored)
|
2020-12-21 12:11:45 +00:00
|
|
|
postCtr += 1
|
|
|
|
if postCtr >= maxPostsPerSource:
|
|
|
|
break
|
2020-11-27 22:43:34 +00:00
|
|
|
if postCtr > 0:
|
2021-02-12 11:28:00 +00:00
|
|
|
print('Added ' + str(postCtr) +
|
|
|
|
' rss 2.0 feed items to newswire')
|
2020-10-04 09:51:12 +00:00
|
|
|
return result
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
|
|
|
|
moderated: bool, mirrored: bool,
|
|
|
|
maxPostsPerSource: int,
|
|
|
|
maxFeedItemSizeKb: int,
|
|
|
|
maxCategoriesFeedItemSizeKb: int) -> {}:
|
2020-12-14 14:22:44 +00:00
|
|
|
"""Converts an xml RSS 1.0 string to a dictionary
|
|
|
|
https://validator.w3.org/feed/docs/rss1.html
|
|
|
|
"""
|
2020-12-14 17:18:16 +00:00
|
|
|
itemStr = '<item'
|
|
|
|
if itemStr not in xmlStr:
|
2020-12-14 14:22:44 +00:00
|
|
|
return {}
|
|
|
|
result = {}
|
|
|
|
|
|
|
|
# is this an rss feed containing hashtag categories?
|
|
|
|
if '<title>#categories</title>' in xmlStr:
|
2020-12-22 18:06:23 +00:00
|
|
|
_xml2StrToHashtagCategories(baseDir, xmlStr,
|
|
|
|
maxCategoriesFeedItemSizeKb)
|
2020-12-14 14:22:44 +00:00
|
|
|
return {}
|
|
|
|
|
2020-12-14 17:18:16 +00:00
|
|
|
rssItems = xmlStr.split(itemStr)
|
2020-12-14 14:22:44 +00:00
|
|
|
postCtr = 0
|
|
|
|
maxBytes = maxFeedItemSizeKb * 1024
|
|
|
|
for rssItem in rssItems:
|
|
|
|
if not rssItem:
|
|
|
|
continue
|
|
|
|
if len(rssItem) > maxBytes:
|
2020-12-14 17:18:16 +00:00
|
|
|
print('WARN: rss 1.0 feed item is too big')
|
|
|
|
continue
|
|
|
|
if rssItem.startswith('s>'):
|
2020-12-14 14:22:44 +00:00
|
|
|
continue
|
|
|
|
if '<title>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '</title>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '<link>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '</link>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '<dc:date>' not in rssItem:
|
|
|
|
continue
|
|
|
|
if '</dc:date>' not in rssItem:
|
|
|
|
continue
|
|
|
|
title = rssItem.split('<title>')[1]
|
2020-12-22 18:06:23 +00:00
|
|
|
title = _removeCDATA(title.split('</title>')[0])
|
2021-02-13 21:48:24 +00:00
|
|
|
title = removeHtml(title)
|
2020-12-14 14:22:44 +00:00
|
|
|
description = ''
|
|
|
|
if '<description>' in rssItem and '</description>' in rssItem:
|
|
|
|
description = rssItem.split('<description>')[1]
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description.split('</description>')[0])
|
2020-12-14 14:22:44 +00:00
|
|
|
else:
|
|
|
|
if '<media:description>' in rssItem and \
|
|
|
|
'</media:description>' in rssItem:
|
|
|
|
description = rssItem.split('<media:description>')[1]
|
|
|
|
description = description.split('</media:description>')[0]
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description)
|
2020-12-14 14:22:44 +00:00
|
|
|
link = rssItem.split('<link>')[1]
|
|
|
|
link = link.split('</link>')[0]
|
|
|
|
if '://' not in link:
|
|
|
|
continue
|
|
|
|
itemDomain = link.split('://')[1]
|
|
|
|
if '/' in itemDomain:
|
|
|
|
itemDomain = itemDomain.split('/')[0]
|
|
|
|
if isBlockedDomain(baseDir, itemDomain):
|
|
|
|
continue
|
|
|
|
pubDate = rssItem.split('<dc:date>')[1]
|
|
|
|
pubDate = pubDate.split('</dc:date>')[0]
|
|
|
|
|
|
|
|
pubDateStr = parseFeedDate(pubDate)
|
|
|
|
if pubDateStr:
|
2020-12-22 18:06:23 +00:00
|
|
|
if _validFeedDate(pubDateStr):
|
2020-12-21 12:11:45 +00:00
|
|
|
postFilename = ''
|
|
|
|
votesStatus = []
|
2020-12-22 18:06:23 +00:00
|
|
|
_addNewswireDictEntry(baseDir, domain,
|
|
|
|
result, pubDateStr,
|
|
|
|
title, link,
|
|
|
|
votesStatus, postFilename,
|
2021-02-12 11:28:00 +00:00
|
|
|
description, moderated,
|
|
|
|
mirrored)
|
2020-12-21 12:11:45 +00:00
|
|
|
postCtr += 1
|
|
|
|
if postCtr >= maxPostsPerSource:
|
|
|
|
break
|
2020-12-14 14:22:44 +00:00
|
|
|
if postCtr > 0:
|
2021-02-12 11:28:00 +00:00
|
|
|
print('Added ' + str(postCtr) +
|
|
|
|
' rss 1.0 feed items to newswire')
|
2020-12-14 14:22:44 +00:00
|
|
|
return result
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
|
|
|
|
moderated: bool, mirrored: bool,
|
|
|
|
maxPostsPerSource: int,
|
|
|
|
maxFeedItemSizeKb: int) -> {}:
|
2020-10-10 12:24:14 +00:00
|
|
|
"""Converts an atom feed string to a dictionary
|
|
|
|
"""
|
|
|
|
if '<entry>' not in xmlStr:
|
|
|
|
return {}
|
|
|
|
result = {}
|
2020-11-22 12:41:54 +00:00
|
|
|
atomItems = xmlStr.split('<entry>')
|
2020-10-16 10:13:14 +00:00
|
|
|
postCtr = 0
|
2020-11-03 16:04:25 +00:00
|
|
|
maxBytes = maxFeedItemSizeKb * 1024
|
2020-11-22 12:41:54 +00:00
|
|
|
for atomItem in atomItems:
|
2020-11-27 22:43:34 +00:00
|
|
|
if not atomItem:
|
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if len(atomItem) > maxBytes:
|
2020-11-03 16:04:25 +00:00
|
|
|
print('WARN: atom feed item is too big')
|
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '<title>' not in atomItem:
|
2020-10-10 12:24:14 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '</title>' not in atomItem:
|
2020-10-10 12:24:14 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '<link>' not in atomItem:
|
2020-10-10 12:24:14 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '</link>' not in atomItem:
|
2020-10-10 12:24:14 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '<updated>' not in atomItem:
|
2020-10-10 12:24:14 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '</updated>' not in atomItem:
|
2020-10-10 12:24:14 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
title = atomItem.split('<title>')[1]
|
2020-12-22 18:06:23 +00:00
|
|
|
title = _removeCDATA(title.split('</title>')[0])
|
2021-02-13 21:48:24 +00:00
|
|
|
title = removeHtml(title)
|
2020-10-10 12:24:14 +00:00
|
|
|
description = ''
|
2020-11-22 12:41:54 +00:00
|
|
|
if '<summary>' in atomItem and '</summary>' in atomItem:
|
|
|
|
description = atomItem.split('<summary>')[1]
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description.split('</summary>')[0])
|
2020-11-21 23:29:46 +00:00
|
|
|
else:
|
2020-11-22 12:41:54 +00:00
|
|
|
if '<media:description>' in atomItem and \
|
|
|
|
'</media:description>' in atomItem:
|
|
|
|
description = atomItem.split('<media:description>')[1]
|
2020-11-21 23:29:46 +00:00
|
|
|
description = description.split('</media:description>')[0]
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description)
|
2020-11-22 12:41:54 +00:00
|
|
|
link = atomItem.split('<link>')[1]
|
2020-10-10 12:24:14 +00:00
|
|
|
link = link.split('</link>')[0]
|
2020-10-16 11:58:31 +00:00
|
|
|
if '://' not in link:
|
|
|
|
continue
|
2020-10-17 20:53:36 +00:00
|
|
|
itemDomain = link.split('://')[1]
|
|
|
|
if '/' in itemDomain:
|
|
|
|
itemDomain = itemDomain.split('/')[0]
|
|
|
|
if isBlockedDomain(baseDir, itemDomain):
|
2020-10-16 11:58:31 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
pubDate = atomItem.split('<updated>')[1]
|
2020-10-10 12:24:14 +00:00
|
|
|
pubDate = pubDate.split('</updated>')[0]
|
2020-11-22 18:14:40 +00:00
|
|
|
|
2020-11-22 19:01:18 +00:00
|
|
|
pubDateStr = parseFeedDate(pubDate)
|
|
|
|
if pubDateStr:
|
2020-12-22 18:06:23 +00:00
|
|
|
if _validFeedDate(pubDateStr):
|
2020-12-21 12:11:45 +00:00
|
|
|
postFilename = ''
|
|
|
|
votesStatus = []
|
2020-12-22 18:06:23 +00:00
|
|
|
_addNewswireDictEntry(baseDir, domain,
|
|
|
|
result, pubDateStr,
|
|
|
|
title, link,
|
|
|
|
votesStatus, postFilename,
|
2021-02-12 11:28:00 +00:00
|
|
|
description, moderated,
|
|
|
|
mirrored)
|
|
|
|
postCtr += 1
|
|
|
|
if postCtr >= maxPostsPerSource:
|
|
|
|
break
|
|
|
|
if postCtr > 0:
|
|
|
|
print('Added ' + str(postCtr) +
|
|
|
|
' atom feed items to newswire')
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
2021-02-12 11:30:23 +00:00
|
|
|
def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str,
|
|
|
|
moderated: bool, mirrored: bool,
|
|
|
|
maxPostsPerSource: int,
|
|
|
|
maxFeedItemSizeKb: int) -> {}:
|
2021-02-12 11:28:00 +00:00
|
|
|
"""Converts a json feed string to a dictionary
|
2021-02-12 11:46:26 +00:00
|
|
|
See https://jsonfeed.org/version/1.1
|
2021-02-12 11:28:00 +00:00
|
|
|
"""
|
2021-02-12 11:46:26 +00:00
|
|
|
if '"items"' not in xmlStr:
|
2021-02-12 11:28:00 +00:00
|
|
|
return {}
|
|
|
|
try:
|
|
|
|
feedJson = json.loads(xmlStr)
|
|
|
|
except BaseException:
|
|
|
|
return {}
|
|
|
|
maxBytes = maxFeedItemSizeKb * 1024
|
|
|
|
if not feedJson.get('version'):
|
|
|
|
return {}
|
2021-02-12 11:46:26 +00:00
|
|
|
if not feedJson['version'].startswith('https://jsonfeed.org/version/1'):
|
2021-02-12 11:28:00 +00:00
|
|
|
return {}
|
|
|
|
if not feedJson.get('items'):
|
|
|
|
return {}
|
|
|
|
if not isinstance(feedJson['items'], list):
|
|
|
|
return {}
|
2021-02-12 11:50:05 +00:00
|
|
|
postCtr = 0
|
2021-02-12 11:47:49 +00:00
|
|
|
result = {}
|
2021-02-12 11:28:00 +00:00
|
|
|
for jsonFeedItem in feedJson['items']:
|
|
|
|
if not jsonFeedItem:
|
|
|
|
continue
|
|
|
|
if not isinstance(jsonFeedItem, dict):
|
|
|
|
continue
|
|
|
|
if not jsonFeedItem.get('url'):
|
|
|
|
continue
|
|
|
|
if not isinstance(jsonFeedItem['url'], str):
|
|
|
|
continue
|
|
|
|
if not jsonFeedItem.get('date_published'):
|
|
|
|
if not jsonFeedItem.get('date_modified'):
|
|
|
|
continue
|
|
|
|
if not jsonFeedItem.get('content_text'):
|
|
|
|
if not jsonFeedItem.get('content_html'):
|
|
|
|
continue
|
|
|
|
if jsonFeedItem.get('content_html'):
|
|
|
|
if not isinstance(jsonFeedItem['content_html'], str):
|
|
|
|
continue
|
|
|
|
title = removeHtml(jsonFeedItem['content_html'])
|
|
|
|
else:
|
|
|
|
if not isinstance(jsonFeedItem['content_text'], str):
|
|
|
|
continue
|
2021-02-12 12:02:09 +00:00
|
|
|
title = removeHtml(jsonFeedItem['content_text'])
|
2021-02-12 11:28:00 +00:00
|
|
|
if len(title) > maxBytes:
|
|
|
|
print('WARN: json feed title is too long')
|
|
|
|
continue
|
|
|
|
description = ''
|
|
|
|
if jsonFeedItem.get('description'):
|
|
|
|
if not isinstance(jsonFeedItem['description'], str):
|
|
|
|
continue
|
2021-02-12 12:02:09 +00:00
|
|
|
description = removeHtml(jsonFeedItem['description'])
|
2021-02-12 11:28:00 +00:00
|
|
|
if len(description) > maxBytes:
|
|
|
|
print('WARN: json feed description is too long')
|
|
|
|
continue
|
2021-02-12 12:09:16 +00:00
|
|
|
if jsonFeedItem.get('tags'):
|
2021-02-12 12:09:48 +00:00
|
|
|
if isinstance(jsonFeedItem['tags'], list):
|
2021-02-12 12:09:16 +00:00
|
|
|
for tagName in jsonFeedItem['tags']:
|
|
|
|
if not isinstance(tagName, str):
|
|
|
|
continue
|
|
|
|
if ' ' in tagName:
|
|
|
|
continue
|
|
|
|
if not tagName.startswith('#'):
|
|
|
|
tagName = '#' + tagName
|
|
|
|
if tagName not in description:
|
|
|
|
description += ' ' + tagName
|
|
|
|
|
2021-02-12 11:28:00 +00:00
|
|
|
link = jsonFeedItem['url']
|
|
|
|
if '://' not in link:
|
|
|
|
continue
|
|
|
|
if len(link) > maxBytes:
|
|
|
|
print('WARN: json feed link is too long')
|
|
|
|
continue
|
|
|
|
itemDomain = link.split('://')[1]
|
|
|
|
if '/' in itemDomain:
|
|
|
|
itemDomain = itemDomain.split('/')[0]
|
|
|
|
if isBlockedDomain(baseDir, itemDomain):
|
|
|
|
continue
|
|
|
|
if jsonFeedItem.get('date_published'):
|
|
|
|
if not isinstance(jsonFeedItem['date_published'], str):
|
|
|
|
continue
|
|
|
|
pubDate = jsonFeedItem['date_published']
|
|
|
|
else:
|
|
|
|
if not isinstance(jsonFeedItem['date_modified'], str):
|
|
|
|
continue
|
|
|
|
pubDate = jsonFeedItem['date_modified']
|
|
|
|
|
|
|
|
pubDateStr = parseFeedDate(pubDate)
|
|
|
|
if pubDateStr:
|
|
|
|
if _validFeedDate(pubDateStr):
|
|
|
|
postFilename = ''
|
|
|
|
votesStatus = []
|
|
|
|
_addNewswireDictEntry(baseDir, domain,
|
|
|
|
result, pubDateStr,
|
|
|
|
title, link,
|
|
|
|
votesStatus, postFilename,
|
|
|
|
description, moderated,
|
|
|
|
mirrored)
|
2020-12-21 12:11:45 +00:00
|
|
|
postCtr += 1
|
|
|
|
if postCtr >= maxPostsPerSource:
|
|
|
|
break
|
2020-11-27 22:43:34 +00:00
|
|
|
if postCtr > 0:
|
2021-02-12 11:28:00 +00:00
|
|
|
print('Added ' + str(postCtr) +
|
|
|
|
' json feed items to newswire')
|
2020-10-10 12:24:14 +00:00
|
|
|
return result
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
|
|
|
|
moderated: bool, mirrored: bool,
|
|
|
|
maxPostsPerSource: int,
|
|
|
|
maxFeedItemSizeKb: int) -> {}:
|
2020-11-22 10:34:42 +00:00
|
|
|
"""Converts an atom-style YouTube feed string to a dictionary
|
|
|
|
"""
|
|
|
|
if '<entry>' not in xmlStr:
|
|
|
|
return {}
|
|
|
|
if isBlockedDomain(baseDir, 'www.youtube.com'):
|
|
|
|
return {}
|
|
|
|
result = {}
|
2020-11-22 12:41:54 +00:00
|
|
|
atomItems = xmlStr.split('<entry>')
|
2020-11-22 10:34:42 +00:00
|
|
|
postCtr = 0
|
|
|
|
maxBytes = maxFeedItemSizeKb * 1024
|
2020-11-22 12:41:54 +00:00
|
|
|
for atomItem in atomItems:
|
2020-11-27 22:43:34 +00:00
|
|
|
if not atomItem:
|
|
|
|
continue
|
|
|
|
if not atomItem.strip():
|
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if len(atomItem) > maxBytes:
|
2020-11-22 10:34:42 +00:00
|
|
|
print('WARN: atom feed item is too big')
|
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '<title>' not in atomItem:
|
2020-11-22 10:34:42 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '</title>' not in atomItem:
|
2020-11-22 10:34:42 +00:00
|
|
|
continue
|
2020-11-28 20:46:52 +00:00
|
|
|
if '<published>' not in atomItem:
|
2020-11-22 10:34:42 +00:00
|
|
|
continue
|
2020-11-28 20:46:52 +00:00
|
|
|
if '</published>' not in atomItem:
|
2020-11-22 10:34:42 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '<yt:videoId>' not in atomItem:
|
2020-11-22 10:34:42 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
if '</yt:videoId>' not in atomItem:
|
2020-11-22 10:34:42 +00:00
|
|
|
continue
|
2020-11-22 12:41:54 +00:00
|
|
|
title = atomItem.split('<title>')[1]
|
2020-12-22 18:06:23 +00:00
|
|
|
title = _removeCDATA(title.split('</title>')[0])
|
2020-11-22 10:34:42 +00:00
|
|
|
description = ''
|
2020-11-22 12:41:54 +00:00
|
|
|
if '<media:description>' in atomItem and \
|
|
|
|
'</media:description>' in atomItem:
|
|
|
|
description = atomItem.split('<media:description>')[1]
|
2020-11-22 10:34:42 +00:00
|
|
|
description = description.split('</media:description>')[0]
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description)
|
2020-11-22 12:41:54 +00:00
|
|
|
elif '<summary>' in atomItem and '</summary>' in atomItem:
|
|
|
|
description = atomItem.split('<summary>')[1]
|
2020-11-22 10:34:42 +00:00
|
|
|
description = description.split('</summary>')[0]
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description)
|
2020-11-22 12:41:54 +00:00
|
|
|
link = atomItem.split('<yt:videoId>')[1]
|
2020-11-22 10:34:42 +00:00
|
|
|
link = link.split('</yt:videoId>')[0]
|
|
|
|
link = 'https://www.youtube.com/watch?v=' + link.strip()
|
2020-11-28 20:46:52 +00:00
|
|
|
pubDate = atomItem.split('<published>')[1]
|
|
|
|
pubDate = pubDate.split('</published>')[0]
|
2020-11-22 18:14:40 +00:00
|
|
|
|
2020-11-22 19:01:18 +00:00
|
|
|
pubDateStr = parseFeedDate(pubDate)
|
|
|
|
if pubDateStr:
|
2020-12-22 18:06:23 +00:00
|
|
|
if _validFeedDate(pubDateStr):
|
2020-12-21 12:11:45 +00:00
|
|
|
postFilename = ''
|
|
|
|
votesStatus = []
|
2020-12-22 18:06:23 +00:00
|
|
|
_addNewswireDictEntry(baseDir, domain,
|
|
|
|
result, pubDateStr,
|
|
|
|
title, link,
|
|
|
|
votesStatus, postFilename,
|
|
|
|
description, moderated, mirrored)
|
2020-12-21 12:11:45 +00:00
|
|
|
postCtr += 1
|
|
|
|
if postCtr >= maxPostsPerSource:
|
|
|
|
break
|
2020-11-27 22:43:34 +00:00
|
|
|
if postCtr > 0:
|
|
|
|
print('Added ' + str(postCtr) + ' YouTube feed items to newswire')
|
2020-11-22 10:34:42 +00:00
|
|
|
return result
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _xmlStrToDict(baseDir: str, domain: str, xmlStr: str,
|
|
|
|
moderated: bool, mirrored: bool,
|
|
|
|
maxPostsPerSource: int,
|
|
|
|
maxFeedItemSizeKb: int,
|
|
|
|
maxCategoriesFeedItemSizeKb: int) -> {}:
|
2020-10-04 09:51:12 +00:00
|
|
|
"""Converts an xml string to a dictionary
|
|
|
|
"""
|
2020-11-22 16:10:58 +00:00
|
|
|
if '<yt:videoId>' in xmlStr and '<yt:channelId>' in xmlStr:
|
|
|
|
print('YouTube feed: reading')
|
2020-12-22 18:06:23 +00:00
|
|
|
return _atomFeedYTToDict(baseDir, domain,
|
|
|
|
xmlStr, moderated, mirrored,
|
|
|
|
maxPostsPerSource, maxFeedItemSizeKb)
|
2020-11-22 16:10:58 +00:00
|
|
|
elif 'rss version="2.0"' in xmlStr:
|
2020-12-22 18:06:23 +00:00
|
|
|
return _xml2StrToDict(baseDir, domain,
|
|
|
|
xmlStr, moderated, mirrored,
|
|
|
|
maxPostsPerSource, maxFeedItemSizeKb,
|
|
|
|
maxCategoriesFeedItemSizeKb)
|
2020-12-14 20:22:05 +00:00
|
|
|
elif '<?xml version="1.0"' in xmlStr:
|
2020-12-22 18:06:23 +00:00
|
|
|
return _xml1StrToDict(baseDir, domain,
|
2020-11-03 16:04:25 +00:00
|
|
|
xmlStr, moderated, mirrored,
|
2020-12-22 18:06:23 +00:00
|
|
|
maxPostsPerSource, maxFeedItemSizeKb,
|
|
|
|
maxCategoriesFeedItemSizeKb)
|
|
|
|
elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
|
|
|
|
return _atomFeedToDict(baseDir, domain,
|
|
|
|
xmlStr, moderated, mirrored,
|
|
|
|
maxPostsPerSource, maxFeedItemSizeKb)
|
2021-02-12 11:28:00 +00:00
|
|
|
elif 'https://jsonfeed.org/version/1' in xmlStr:
|
2021-02-12 11:30:23 +00:00
|
|
|
return _jsonFeedV1ToDict(baseDir, domain,
|
|
|
|
xmlStr, moderated, mirrored,
|
|
|
|
maxPostsPerSource, maxFeedItemSizeKb)
|
2020-10-04 09:51:12 +00:00
|
|
|
return {}
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _YTchannelToAtomFeed(url: str) -> str:
|
2020-11-22 10:46:54 +00:00
|
|
|
"""Converts a YouTube channel url into an atom feed url
|
|
|
|
"""
|
|
|
|
if 'youtube.com/channel/' not in url:
|
|
|
|
return url
|
2020-11-22 12:27:42 +00:00
|
|
|
channelId = url.split('youtube.com/channel/')[1].strip()
|
2020-11-22 12:36:21 +00:00
|
|
|
channelUrl = \
|
|
|
|
'https://www.youtube.com/feeds/videos.xml?channel_id=' + channelId
|
|
|
|
print('YouTube feed: ' + channelUrl)
|
|
|
|
return channelUrl
|
2020-11-22 10:46:54 +00:00
|
|
|
|
|
|
|
|
2020-10-19 14:37:17 +00:00
|
|
|
def getRSS(baseDir: str, domain: str, session, url: str,
           moderated: bool, mirrored: bool,
           maxPostsPerSource: int, maxFeedSizeKb: int,
           maxFeedItemSizeKb: int,
           maxCategoriesFeedItemSizeKb: int) -> {}:
    """Downloads the feed at the given url and returns it as a dict

    YouTube channel urls are first converted into their atom feed url.
    The downloaded text is only parsed if it is smaller than
    maxFeedSizeKb and contains no invalid characters.
    Returns None if the url is not a string, if there is no session,
    if the download fails or if the feed is rejected.
    """
    if not isinstance(url, str):
        print('url: ' + str(url))
        print('ERROR: getRSS url should be a string')
        return None
    if not session:
        # without a session the request below would raise an
        # AttributeError, which none of the handlers catch
        print('WARN: no session specified for getRSS')
        return None

    sessionHeaders = {
        'Accept': 'text/xml, application/xml; charset=UTF-8'
    }
    sessionHeaders['User-Agent'] = \
        'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'
    sessionParams = {}

    url = _YTchannelToAtomFeed(url)
    try:
        result = session.get(url, headers=sessionHeaders, params=sessionParams)
        if not result:
            print('WARN: no result returned for feed ' + url)
        elif int(len(result.text) / 1024) < maxFeedSizeKb and \
                not containsInvalidChars(result.text):
            # parsing stays inside the try block so that a ValueError
            # raised while converting the feed is reported below
            return _xmlStrToDict(baseDir, domain, result.text,
                                 moderated, mirrored,
                                 maxPostsPerSource,
                                 maxFeedItemSizeKb,
                                 maxCategoriesFeedItemSizeKb)
        else:
            print('WARN: feed is too large, ' +
                  'or contains invalid characters: ' + url)
    except (requests.exceptions.RequestException, ValueError) as e:
        print('WARN: getRSS failed\nurl: ' + str(url) + ', ' +
              'headers: ' + str(sessionHeaders) + ', ' +
              'params: ' + str(sessionParams) + ', ' + str(e))
    except SocketError as e:
        if e.errno == errno.ECONNRESET:
            print('WARN: connection was reset during getRSS ' + str(e))
        else:
            print('WARN: getRSS, ' + str(e))
    return None
|
|
|
|
|
|
|
|
|
2020-10-04 12:29:07 +00:00
|
|
|
def getRSSfromDict(baseDir: str, newswire: {},
                   httpPrefix: str, domainFull: str,
                   title: str, translate: {}) -> str:
    """Returns an rss feed from the current newswire dict.
    This allows other instances to subscribe to the same newswire

    Each newswire key is a published date string and each value is a
    list of fields, of which index 0 is used as the item title,
    index 1 as the link and index 4 as the source of the description.
    Links without a protocol which do not contain domainFull are
    prefixed with the local domain.
    Items whose date cannot be parsed are skipped with a warning.
    Returns an empty string if the newswire is empty.

    NOTE(review): baseDir and title are not used; the feed title
    passed to rss2Header is always 'Newswire'.
    """
    if not newswire:
        return ''

    rssParts = [rss2Header(httpPrefix,
                           None, domainFull,
                           'Newswire', translate)]
    for published, fields in newswire.items():
        try:
            if '+00:00' in published:
                published = published.replace('+00:00', 'Z').strip()
                published = published.replace(' ', 'T')
            else:
                publishedWithOffset = \
                    datetime.strptime(published, "%Y-%m-%d %H:%M:%S%z")
                # convert to UTC first, because the formatted string
                # is labelled with Z regardless of the original offset
                publishedUTC = publishedWithOffset.astimezone(timezone.utc)
                published = publishedUTC.strftime("%Y-%m-%dT%H:%M:%SZ")
            pubDate = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
        except ValueError as e:
            # a single malformed date should not abort the whole feed
            print('WARN: Unable to convert date ' + published + ' ' + str(e))
            continue

        description = removeHtml(firstParagraphFromString(fields[4]))
        url = fields[1]
        if '://' not in url:
            if domainFull not in url:
                url = httpPrefix + '://' + domainFull + url
        rssDateStr = pubDate.strftime("%a, %d %b %Y %H:%M:%S UT")

        rssParts.append('<item>\n')
        rssParts.append(' <title>' + fields[0] + '</title>\n')
        rssParts.append(' <description>' + description + '</description>\n')
        rssParts.append(' <link>' + url + '</link>\n')
        rssParts.append(' <pubDate>' + rssDateStr + '</pubDate>\n')
        rssParts.append('</item>\n')
    rssParts.append(rss2Footer())
    return ''.join(rssParts)
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _isNewswireBlogPost(postJsonObject: {}) -> bool:
|
2020-10-06 11:28:32 +00:00
|
|
|
"""Is the given object a blog post?
|
2020-10-25 10:47:39 +00:00
|
|
|
There isn't any difference between a blog post and a newswire blog post
|
|
|
|
but we may here need to check for different properties than
|
|
|
|
isBlogPost does
|
2020-10-06 11:28:32 +00:00
|
|
|
"""
|
|
|
|
if not postJsonObject:
|
|
|
|
return False
|
|
|
|
if not postJsonObject.get('object'):
|
|
|
|
return False
|
|
|
|
if not isinstance(postJsonObject['object'], dict):
|
|
|
|
return False
|
|
|
|
if postJsonObject['object'].get('summary') and \
|
|
|
|
postJsonObject['object'].get('url') and \
|
2020-11-08 09:47:01 +00:00
|
|
|
postJsonObject['object'].get('content') and \
|
2020-10-06 11:28:32 +00:00
|
|
|
postJsonObject['object'].get('published'):
|
2020-10-25 10:42:38 +00:00
|
|
|
return isPublicPost(postJsonObject)
|
2020-10-06 11:28:32 +00:00
|
|
|
return False
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _getHashtagsFromPost(postJsonObject: {}) -> []:
|
2020-10-16 20:13:23 +00:00
|
|
|
"""Returns a list of any hashtags within a post
|
|
|
|
"""
|
|
|
|
if not postJsonObject.get('object'):
|
|
|
|
return []
|
|
|
|
if not isinstance(postJsonObject['object'], dict):
|
|
|
|
return []
|
|
|
|
if not postJsonObject['object'].get('tag'):
|
|
|
|
return []
|
2020-10-18 09:28:43 +00:00
|
|
|
if not isinstance(postJsonObject['object']['tag'], list):
|
2020-10-16 20:13:23 +00:00
|
|
|
return []
|
|
|
|
tags = []
|
2020-10-18 09:28:43 +00:00
|
|
|
for tg in postJsonObject['object']['tag']:
|
2020-10-16 20:13:23 +00:00
|
|
|
if not isinstance(tg, dict):
|
|
|
|
continue
|
|
|
|
if not tg.get('name'):
|
|
|
|
continue
|
|
|
|
if not tg.get('type'):
|
|
|
|
continue
|
|
|
|
if tg['type'] != 'Hashtag':
|
|
|
|
continue
|
|
|
|
if tg['name'] not in tags:
|
|
|
|
tags.append(tg['name'])
|
|
|
|
return tags
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
|
|
|
|
newswire: {},
|
|
|
|
maxBlogsPerAccount: int,
|
|
|
|
indexFilename: str,
|
|
|
|
maxTags: int) -> None:
|
2020-10-05 11:11:48 +00:00
|
|
|
"""Adds blogs for the given account to the newswire
|
|
|
|
"""
|
|
|
|
if not os.path.isfile(indexFilename):
|
|
|
|
return
|
2020-10-09 10:33:06 +00:00
|
|
|
# local blog entries are unmoderated by default
|
|
|
|
moderated = False
|
|
|
|
|
|
|
|
# local blogs can potentially be moderated
|
|
|
|
moderatedFilename = \
|
|
|
|
baseDir + '/accounts/' + nickname + '@' + domain + \
|
|
|
|
'/.newswiremoderated'
|
|
|
|
if os.path.isfile(moderatedFilename):
|
|
|
|
moderated = True
|
|
|
|
|
2020-10-05 11:11:48 +00:00
|
|
|
with open(indexFilename, 'r') as indexFile:
|
|
|
|
postFilename = 'start'
|
|
|
|
ctr = 0
|
|
|
|
while postFilename:
|
|
|
|
postFilename = indexFile.readline()
|
|
|
|
if postFilename:
|
|
|
|
# if this is a full path then remove the directories
|
|
|
|
if '/' in postFilename:
|
|
|
|
postFilename = postFilename.split('/')[-1]
|
|
|
|
|
|
|
|
# filename of the post without any extension or path
|
|
|
|
# This should also correspond to any index entry in
|
|
|
|
# the posts cache
|
|
|
|
postUrl = \
|
|
|
|
postFilename.replace('\n', '').replace('\r', '')
|
|
|
|
postUrl = postUrl.replace('.json', '').strip()
|
|
|
|
|
|
|
|
# read the post from file
|
|
|
|
fullPostFilename = \
|
|
|
|
locatePost(baseDir, nickname,
|
|
|
|
domain, postUrl, False)
|
2020-10-06 13:05:15 +00:00
|
|
|
if not fullPostFilename:
|
2021-02-11 12:40:56 +00:00
|
|
|
print('Unable to locate post for newswire ' + postUrl)
|
2020-10-06 13:05:15 +00:00
|
|
|
ctr += 1
|
|
|
|
if ctr >= maxBlogsPerAccount:
|
|
|
|
break
|
2020-10-06 13:34:04 +00:00
|
|
|
continue
|
2020-10-06 13:05:15 +00:00
|
|
|
|
2020-10-05 11:11:48 +00:00
|
|
|
postJsonObject = None
|
|
|
|
if fullPostFilename:
|
|
|
|
postJsonObject = loadJson(fullPostFilename)
|
2020-12-22 18:06:23 +00:00
|
|
|
if _isNewswireBlogPost(postJsonObject):
|
2020-10-06 11:28:32 +00:00
|
|
|
published = postJsonObject['object']['published']
|
|
|
|
published = published.replace('T', ' ')
|
|
|
|
published = published.replace('Z', '+00:00')
|
2020-10-06 20:17:34 +00:00
|
|
|
votes = []
|
|
|
|
if os.path.isfile(fullPostFilename + '.votes'):
|
|
|
|
votes = loadJson(fullPostFilename + '.votes')
|
2020-11-08 10:45:33 +00:00
|
|
|
content = postJsonObject['object']['content']
|
|
|
|
description = firstParagraphFromString(content)
|
2021-01-11 21:54:25 +00:00
|
|
|
description = removeHtml(description)
|
2020-12-22 21:24:46 +00:00
|
|
|
tagsFromPost = _getHashtagsFromPost(postJsonObject)
|
2020-12-22 18:06:23 +00:00
|
|
|
_addNewswireDictEntry(baseDir, domain,
|
|
|
|
newswire, published,
|
|
|
|
postJsonObject['object']['summary'],
|
|
|
|
postJsonObject['object']['url'],
|
|
|
|
votes, fullPostFilename,
|
|
|
|
description, moderated, False,
|
2020-12-22 21:24:46 +00:00
|
|
|
tagsFromPost,
|
2020-12-22 18:06:23 +00:00
|
|
|
maxTags)
|
2020-10-05 11:11:48 +00:00
|
|
|
|
|
|
|
ctr += 1
|
|
|
|
if ctr >= maxBlogsPerAccount:
|
|
|
|
break
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
|
|
|
|
maxBlogsPerAccount: int,
|
|
|
|
maxTags: int) -> None:
|
2020-10-06 09:47:58 +00:00
|
|
|
"""Adds blogs from each user account into the newswire
|
2020-10-06 09:37:22 +00:00
|
|
|
"""
|
2020-10-06 10:34:56 +00:00
|
|
|
moderationDict = {}
|
|
|
|
|
2020-10-05 11:11:48 +00:00
|
|
|
# go through each account
|
|
|
|
for subdir, dirs, files in os.walk(baseDir + '/accounts'):
|
|
|
|
for handle in dirs:
|
|
|
|
if '@' not in handle:
|
|
|
|
continue
|
2021-02-11 12:40:56 +00:00
|
|
|
if 'inbox@' in handle or 'news@' in handle:
|
2020-10-05 11:11:48 +00:00
|
|
|
continue
|
2020-10-06 10:34:56 +00:00
|
|
|
|
2020-10-06 09:37:22 +00:00
|
|
|
nickname = handle.split('@')[0]
|
2020-10-05 11:30:11 +00:00
|
|
|
|
|
|
|
# has this account been suspended?
|
2020-10-06 08:58:44 +00:00
|
|
|
if isSuspended(baseDir, nickname):
|
|
|
|
continue
|
2020-10-05 11:30:11 +00:00
|
|
|
|
2020-10-06 21:28:40 +00:00
|
|
|
if os.path.isfile(baseDir + '/accounts/' + handle +
|
|
|
|
'/.nonewswire'):
|
|
|
|
continue
|
|
|
|
|
2020-10-05 11:11:48 +00:00
|
|
|
# is there a blogs timeline for this account?
|
2020-10-06 09:41:04 +00:00
|
|
|
accountDir = os.path.join(baseDir + '/accounts', handle)
|
2020-10-05 11:11:48 +00:00
|
|
|
blogsIndex = accountDir + '/tlblogs.index'
|
|
|
|
if os.path.isfile(blogsIndex):
|
|
|
|
domain = handle.split('@')[1]
|
2020-12-22 18:06:23 +00:00
|
|
|
_addAccountBlogsToNewswire(baseDir, nickname, domain,
|
|
|
|
newswire, maxBlogsPerAccount,
|
|
|
|
blogsIndex, maxTags)
|
2020-12-13 22:13:45 +00:00
|
|
|
break
|
2020-10-05 11:11:48 +00:00
|
|
|
|
2020-10-06 11:28:32 +00:00
|
|
|
# sort the moderation dict into chronological order, latest first
|
|
|
|
sortedModerationDict = \
|
|
|
|
OrderedDict(sorted(moderationDict.items(), reverse=True))
|
2020-10-06 12:15:35 +00:00
|
|
|
# save the moderation queue details for later display
|
2020-10-06 11:28:32 +00:00
|
|
|
newswireModerationFilename = baseDir + '/accounts/newswiremoderation.txt'
|
2020-10-06 14:32:53 +00:00
|
|
|
if sortedModerationDict:
|
|
|
|
saveJson(sortedModerationDict, newswireModerationFilename)
|
|
|
|
else:
|
|
|
|
# remove the file if there is nothing to moderate
|
|
|
|
if os.path.isfile(newswireModerationFilename):
|
|
|
|
os.remove(newswireModerationFilename)
|
2020-10-06 11:28:32 +00:00
|
|
|
|
2020-10-05 11:11:48 +00:00
|
|
|
|
2020-10-17 16:08:07 +00:00
|
|
|
def getDictFromNewswire(session, baseDir: str, domain: str,
                        maxPostsPerSource: int, maxFeedSizeKb: int,
                        maxTags: int, maxFeedItemSizeKb: int,
                        maxNewswirePosts: int,
                        maxCategoriesFeedItemSizeKb: int) -> {}:
    """Gets rss feeds as a dictionary from newswire file

    Each line of baseDir/accounts/newswire.txt containing '://' and
    not beginning with '#' is treated as a feed url. A '*' within
    the line marks the feed as moderated and a '!' marks it as
    mirrored; both characters are removed from the url.
    Blogs from local accounts are then added, and the result is
    returned sorted by date string, latest first, and limited to
    maxNewswirePosts entries.
    Returns an empty dictionary if the newswire file does not exist.
    """
    subscriptionsFilename = baseDir + '/accounts/newswire.txt'
    if not os.path.isfile(subscriptionsFilename):
        return {}

    # NOTE: maxPostsPerSource was previously overwritten with a
    # fixed value of 5 here, which ignored the supplied argument

    # add rss feeds
    with open(subscriptionsFilename, 'r') as fp:
        rssFeed = fp.readlines()
    result = {}
    for url in rssFeed:
        url = url.strip()

        # Does this contain a url?
        if '://' not in url:
            continue

        # is this a comment?
        if url.startswith('#'):
            continue

        # should this feed be moderated?
        moderated = '*' in url
        if moderated:
            url = url.replace('*', '').strip()

        # should this feed content be mirrored?
        mirrored = '!' in url
        if mirrored:
            url = url.replace('!', '').strip()

        itemsList = getRSS(baseDir, domain, session, url,
                           moderated, mirrored,
                           maxPostsPerSource, maxFeedSizeKb,
                           maxFeedItemSizeKb,
                           maxCategoriesFeedItemSizeKb)
        if itemsList:
            result.update(itemsList)

    # add blogs from each user account
    _addBlogsToNewswire(baseDir, domain, result,
                        maxPostsPerSource, maxTags)

    # sort into chronological order, latest first
    sortedItems = sorted(result.items(), reverse=True)

    # are there too many posts? If so then remove the oldest ones,
    # which are at the end of the sorted list
    return OrderedDict(sortedItems[:max(maxNewswirePosts, 0)])
|