Apply word filter to newswire

main
Bob Mottram 2020-10-17 17:08:07 +01:00
parent c536a8a0c8
commit 22fcc7be06
2 changed files with 36 additions and 19 deletions

View File

@ -468,7 +468,7 @@ def runNewswireDaemon(baseDir: str, httpd,
newNewswire = None newNewswire = None
try: try:
newNewswire = \ newNewswire = \
getDictFromNewswire(httpd.session, baseDir, getDictFromNewswire(httpd.session, baseDir, domain,
httpd.maxNewswirePostsPerSource, httpd.maxNewswirePostsPerSource,
httpd.maxNewswireFeedSizeKb) httpd.maxNewswireFeedSizeKb)
except Exception as e: except Exception as e:

View File

@ -19,6 +19,7 @@ from utils import isSuspended
from utils import containsInvalidChars from utils import containsInvalidChars
from blocking import isBlockedDomain from blocking import isBlockedDomain
from blocking import isBlockedHashtag from blocking import isBlockedHashtag
from filters import isFiltered
def rss2Header(httpPrefix: str, def rss2Header(httpPrefix: str,
@ -75,15 +76,19 @@ def getNewswireTags(text: str) -> []:
return tags return tags
def addNewswireDictEntry(baseDir: str, newswire: {}, dateStr: str, def addNewswireDictEntry(baseDir: str, domain: str,
newswire: {}, dateStr: str,
title: str, link: str, title: str, link: str,
votesStatus: str, postFilename: str, votesStatus: str, postFilename: str,
description: str, moderated: bool, description: str, moderated: bool,
tags=[]) -> None: tags=[]) -> None:
"""Update the newswire dictionary """Update the newswire dictionary
""" """
allText = title + ' ' + description
if isFiltered(baseDir, 'news', domain, allText):
return
if not tags: if not tags:
tags = getNewswireTags(title + ' ' + description) tags = getNewswireTags(allText)
newswireItemBlocked = False newswireItemBlocked = False
if tags: if tags:
for tag in tags: for tag in tags:
@ -102,7 +107,8 @@ def addNewswireDictEntry(baseDir: str, newswire: {}, dateStr: str,
] ]
def xml2StrToDict(baseDir: str, xmlStr: str, moderated: bool, def xml2StrToDict(baseDir: str, domain: str,
xmlStr: str, moderated: bool,
maxPostsPerSource: int) -> {}: maxPostsPerSource: int) -> {}:
"""Converts an xml 2.0 string to a dictionary """Converts an xml 2.0 string to a dictionary
""" """
@ -147,7 +153,8 @@ def xml2StrToDict(baseDir: str, xmlStr: str, moderated: bool,
datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z") datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z")
postFilename = '' postFilename = ''
votesStatus = [] votesStatus = []
addNewswireDictEntry(baseDir, result, str(publishedDate), addNewswireDictEntry(baseDir, domain,
result, str(publishedDate),
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated) description, moderated)
@ -163,7 +170,8 @@ def xml2StrToDict(baseDir: str, xmlStr: str, moderated: bool,
datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT") datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT")
postFilename = '' postFilename = ''
votesStatus = [] votesStatus = []
addNewswireDictEntry(baseDir, result, addNewswireDictEntry(baseDir, domain,
result,
str(publishedDate) + '+00:00', str(publishedDate) + '+00:00',
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
@ -178,7 +186,8 @@ def xml2StrToDict(baseDir: str, xmlStr: str, moderated: bool,
return result return result
def atomFeedToDict(baseDir: str, xmlStr: str, moderated: bool, def atomFeedToDict(baseDir: str, domain: str,
xmlStr: str, moderated: bool,
maxPostsPerSource: int) -> {}: maxPostsPerSource: int) -> {}:
"""Converts an atom feed string to a dictionary """Converts an atom feed string to a dictionary
""" """
@ -223,7 +232,8 @@ def atomFeedToDict(baseDir: str, xmlStr: str, moderated: bool,
datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%SZ") datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%SZ")
postFilename = '' postFilename = ''
votesStatus = [] votesStatus = []
addNewswireDictEntry(baseDir, result, str(publishedDate), addNewswireDictEntry(baseDir, domain,
result, str(publishedDate),
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
description, moderated) description, moderated)
@ -239,7 +249,7 @@ def atomFeedToDict(baseDir: str, xmlStr: str, moderated: bool,
datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT") datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT")
postFilename = '' postFilename = ''
votesStatus = [] votesStatus = []
addNewswireDictEntry(baseDir, result, addNewswireDictEntry(baseDir, domain, result,
str(publishedDate) + '+00:00', str(publishedDate) + '+00:00',
title, link, title, link,
votesStatus, postFilename, votesStatus, postFilename,
@ -254,18 +264,22 @@ def atomFeedToDict(baseDir: str, xmlStr: str, moderated: bool,
return result return result
def xmlStrToDict(baseDir: str, xmlStr: str, moderated: bool, def xmlStrToDict(baseDir: str, domain: str,
xmlStr: str, moderated: bool,
maxPostsPerSource: int) -> {}: maxPostsPerSource: int) -> {}:
"""Converts an xml string to a dictionary """Converts an xml string to a dictionary
""" """
if 'rss version="2.0"' in xmlStr: if 'rss version="2.0"' in xmlStr:
return xml2StrToDict(baseDir, xmlStr, moderated, maxPostsPerSource) return xml2StrToDict(baseDir, domain,
xmlStr, moderated, maxPostsPerSource)
elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr: elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr:
return atomFeedToDict(baseDir, xmlStr, moderated, maxPostsPerSource) return atomFeedToDict(baseDir, domain,
xmlStr, moderated, maxPostsPerSource)
return {} return {}
def getRSS(baseDir: str, session, url: str, moderated: bool, def getRSS(baseDir: str, domain: str,
session, url: str, moderated: bool,
maxPostsPerSource: int, maxPostsPerSource: int,
maxFeedSizeKb: int) -> {}: maxFeedSizeKb: int) -> {}:
"""Returns an RSS url as a dict """Returns an RSS url as a dict
@ -293,7 +307,8 @@ def getRSS(baseDir: str, session, url: str, moderated: bool,
if result: if result:
if int(len(result.text) / 1024) < maxFeedSizeKb and \ if int(len(result.text) / 1024) < maxFeedSizeKb and \
not containsInvalidChars(result.text): not containsInvalidChars(result.text):
return xmlStrToDict(baseDir, result.text, moderated, return xmlStrToDict(baseDir, domain,
result.text, moderated,
maxPostsPerSource) maxPostsPerSource)
else: else:
print('WARN: feed is too large: ' + url) print('WARN: feed is too large: ' + url)
@ -443,7 +458,8 @@ def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
if os.path.isfile(fullPostFilename + '.votes'): if os.path.isfile(fullPostFilename + '.votes'):
votes = loadJson(fullPostFilename + '.votes') votes = loadJson(fullPostFilename + '.votes')
description = '' description = ''
addNewswireDictEntry(baseDir, newswire, published, addNewswireDictEntry(baseDir, domain,
newswire, published,
postJsonObject['object']['summary'], postJsonObject['object']['summary'],
postJsonObject['object']['url'], postJsonObject['object']['url'],
votes, fullPostFilename, votes, fullPostFilename,
@ -455,7 +471,7 @@ def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
break break
def addBlogsToNewswire(baseDir: str, newswire: {}, def addBlogsToNewswire(baseDir: str, domain: str, newswire: {},
maxBlogsPerAccount: int) -> None: maxBlogsPerAccount: int) -> None:
"""Adds blogs from each user account into the newswire """Adds blogs from each user account into the newswire
""" """
@ -501,7 +517,7 @@ def addBlogsToNewswire(baseDir: str, newswire: {},
os.remove(newswireModerationFilename) os.remove(newswireModerationFilename)
def getDictFromNewswire(session, baseDir: str, def getDictFromNewswire(session, baseDir: str, domain: str,
maxPostsPerSource: int, maxFeedSizeKb: int) -> {}: maxPostsPerSource: int, maxFeedSizeKb: int) -> {}:
"""Gets rss feeds as a dictionary from newswire file """Gets rss feeds as a dictionary from newswire file
""" """
@ -533,13 +549,14 @@ def getDictFromNewswire(session, baseDir: str,
moderated = True moderated = True
url = url.replace('*', '').strip() url = url.replace('*', '').strip()
itemsList = getRSS(baseDir, session, url, moderated, itemsList = getRSS(baseDir, domain,
session, url, moderated,
maxPostsPerSource, maxFeedSizeKb) maxPostsPerSource, maxFeedSizeKb)
for dateStr, item in itemsList.items(): for dateStr, item in itemsList.items():
result[dateStr] = item result[dateStr] = item
# add blogs from each user account # add blogs from each user account
addBlogsToNewswire(baseDir, result, maxPostsPerSource) addBlogsToNewswire(baseDir, domain, result, maxPostsPerSource)
# sort into chronological order, latest first # sort into chronological order, latest first
sortedResult = OrderedDict(sorted(result.items(), reverse=True)) sortedResult = OrderedDict(sorted(result.items(), reverse=True))