Tidy extraction of tags from RSS feeds

main
Bob Mottram 2020-10-25 10:17:12 +00:00
parent 3d30aa55ce
commit 361df8a2ae
1 changed file with 33 additions and 18 deletions

View File

@@ -88,27 +88,42 @@ def addNewswireDictEntry(baseDir: str, domain: str,
"""Update the newswire dictionary """Update the newswire dictionary
""" """
allText = title + ' ' + description allText = title + ' ' + description
# check that none of the text is filtered against
if isFiltered(baseDir, 'news', domain, allText): if isFiltered(baseDir, 'news', domain, allText):
return return
if not tags:
tags = getNewswireTags(allText, maxTags) if tags is None:
tags = []
# extract hashtags from the text of the feed post
postTags = getNewswireTags(allText, maxTags)
# combine the tags into a single list
for tag in postTags:
if tag not in tags:
tags.append(tag)
# check that no tags are blocked
newswireItemBlocked = False newswireItemBlocked = False
if tags: for tag in tags:
for tag in tags: if isBlockedHashtag(baseDir, tag.replace('#', '')):
if isBlockedHashtag(baseDir, tag.replace('#', '')): newswireItemBlocked = True
newswireItemBlocked = True break
break
if not newswireItemBlocked: if newswireItemBlocked:
newswire[dateStr] = [ return
title,
link, newswire[dateStr] = [
votesStatus, title,
postFilename, link,
description, votesStatus,
moderated, postFilename,
tags, description,
mirrored moderated,
] tags,
mirrored
]
def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,