From 361df8a2ae783b79ab9ccf2f0fbba6499fab2b57 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 25 Oct 2020 10:17:12 +0000 Subject: [PATCH] Tidy extraction of tags from rss feeds --- newswire.py | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/newswire.py b/newswire.py index a73753dc..f5319206 100644 --- a/newswire.py +++ b/newswire.py @@ -88,27 +88,42 @@ def addNewswireDictEntry(baseDir: str, domain: str, """Update the newswire dictionary """ allText = title + ' ' + description + + # check that none of the text is filtered against if isFiltered(baseDir, 'news', domain, allText): return - if not tags: - tags = getNewswireTags(allText, maxTags) + + if tags is None: + tags = [] + + # extract hashtags from the text of the feed post + postTags = getNewswireTags(allText, maxTags) + + # combine the tags into a single list + for tag in postTags: + if tag not in tags: + tags.append(tag) + + # check that no tags are blocked newswireItemBlocked = False - if tags: - for tag in tags: - if isBlockedHashtag(baseDir, tag.replace('#', '')): - newswireItemBlocked = True - break - if not newswireItemBlocked: - newswire[dateStr] = [ - title, - link, - votesStatus, - postFilename, - description, - moderated, - tags, - mirrored - ] + for tag in tags: + if isBlockedHashtag(baseDir, tag.replace('#', '')): + newswireItemBlocked = True + break + + if newswireItemBlocked: + return + + newswire[dateStr] = [ + title, + link, + votesStatus, + postFilename, + description, + moderated, + tags, + mirrored + ] def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,