Extract hashtags from newswire feeds

main
Bob Mottram 2020-10-16 21:13:23 +01:00
parent a5d21852bb
commit af3b7baf12
2 changed files with 34 additions and 5 deletions

View File

@ -374,7 +374,7 @@ def addHashTags(wordStr: str, httpPrefix: str, domain: str,
hashtagUrl = httpPrefix + "://" + domain + "/tags/" + hashtag hashtagUrl = httpPrefix + "://" + domain + "/tags/" + hashtag
postHashtags[hashtag] = { postHashtags[hashtag] = {
'href': hashtagUrl, 'href': hashtagUrl,
'name': '#'+hashtag, 'name': '#' + hashtag,
'type': 'Hashtag' 'type': 'Hashtag'
} }
replaceHashTags[wordStr] = "<a href=\"" + hashtagUrl + \ replaceHashTags[wordStr] = "<a href=\"" + hashtagUrl + \

View File

@ -75,13 +75,15 @@ def getNewswireTags(text: str) -> []:
def addNewswireDictEntry(newswire: {}, dateStr: str,
                         title: str, link: str,
                         votesStatus: str, postFilename: str,
                         description: str, moderated: bool,
                         tags=None) -> None:
    """Update the newswire dictionary

    Stores an entry under newswire[dateStr] as the list
    [title, link, votesStatus, postFilename, description, moderated, tags]

    If no tags are supplied (None or an empty list) then they are
    obtained by calling getNewswireTags on the title and description.
    """
    # The default is None rather than [] so that no mutable list object
    # is shared between calls. An empty list passed by a caller is
    # handled the same way as before.
    if not tags:
        tags = getNewswireTags(title + ' ' + description)
    newswire[dateStr] = [title, link,
                         votesStatus, postFilename,
                         description, moderated, tags]
def xml2StrToDict(baseDir: str, xmlStr: str, moderated: bool, def xml2StrToDict(baseDir: str, xmlStr: str, moderated: bool,
@ -340,6 +342,32 @@ def isaBlogPost(postJsonObject: {}) -> bool:
return False return False
def getHashtagsFromPost(postJsonObject: {}) -> []:
    """Returns a list of any hashtags within a post

    Looks at postJsonObject['object']['tag'] and collects the 'name' of
    every entry whose 'type' is 'Hashtag'. Names are returned in the
    order in which they first appear, without duplicates.

    The tag field may be a list of tag objects, a single tag object, or
    a dictionary mapping names to tag objects. Anything else, or a post
    without an 'object' dictionary, results in an empty list.
    """
    postObject = postJsonObject.get('object')
    if not isinstance(postObject, dict):
        return []
    tagField = postObject.get('tag')
    if not tagField:
        return []

    if isinstance(tagField, list):
        tagEntries = tagField
    elif isinstance(tagField, dict):
        if tagField.get('type'):
            # a single tag object rather than a collection
            tagEntries = [tagField]
        else:
            # a mapping of name -> tag object. Iterate the values:
            # items() would yield (key, value) tuples, which can never
            # pass the dict check below
            tagEntries = list(tagField.values())
    else:
        return []

    hashtags = []
    for tg in tagEntries:
        if not isinstance(tg, dict):
            continue
        tagName = tg.get('name')
        if not tagName:
            continue
        if tg.get('type') != 'Hashtag':
            continue
        if tagName not in hashtags:
            hashtags.append(tagName)
    return hashtags
def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str, def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
newswire: {}, newswire: {},
maxBlogsPerAccount: int, maxBlogsPerAccount: int,
@ -401,7 +429,8 @@ def addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
postJsonObject['object']['summary'], postJsonObject['object']['summary'],
postJsonObject['object']['url'], postJsonObject['object']['url'],
votes, fullPostFilename, votes, fullPostFilename,
description, moderated) description, moderated,
getHashtagsFromPost(postJsonObject))
ctr += 1 ctr += 1
if ctr >= maxBlogsPerAccount: if ctr >= maxBlogsPerAccount: