Extract hashtags from feeds

main
Bob Mottram 2020-10-16 20:49:34 +01:00
parent 5bd133ffff
commit a60491585e
2 changed files with 27 additions and 2 deletions

View File

@ -701,7 +701,12 @@ def addHtmlTags(baseDir: str, httpPrefix: str,
content = content.replace('\r', '')
content = content.replace('\n', ' --linebreak-- ')
content = addMusicTag(content, 'nowplaying')
words = content.replace(',', ' ').replace(';', ' ').split(' ')
contentSimplified = \
content.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
contentSimplified = contentSimplified.replace('. ', ' ').strip()
if contentSimplified.endswith('.'):
contentSimplified = contentSimplified[:len(contentSimplified)-1]
words = contentSimplified.split(' ')
# remove . for words which are not mentions
newWords = []

View File

@ -52,6 +52,25 @@ def rss2Footer() -> str:
return rssStr
def getNewswireTags(text: str) -> []:
    """Returns a list of hashtags found in the given text

    Commas, semicolons, '- ' and '. ' sequences are replaced by spaces
    and a single trailing full stop is dropped, then the text is split
    on spaces. Every word beginning with '#' is returned once, in order
    of first appearance, with the leading '#' kept.

    Text containing no '#' or no space yields an empty list, and a '#'
    standing on its own is not treated as a hashtag.
    """
    if '#' not in text or ' ' not in text:
        return []
    textSimplified = \
        text.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
    textSimplified = textSimplified.replace('. ', ' ').strip()
    if textSimplified.endswith('.'):
        textSimplified = textSimplified[:-1]
    tags = []
    for wrd in textSimplified.split(' '):
        # a bare '#' has no tag name after it, so it is not a hashtag
        if not wrd.startswith('#') or len(wrd) < 2:
            continue
        if wrd not in tags:
            tags.append(wrd)
    return tags
def addNewswireDictEntry(newswire: {}, dateStr: str,
title: str, link: str,
votesStatus: str, postFilename: str,
@ -60,7 +79,8 @@ def addNewswireDictEntry(newswire: {}, dateStr: str,
"""
newswire[dateStr] = [title, link,
votesStatus, postFilename,
description, moderated]
description, moderated,
getNewswireTags(title + ' ' + description)]
def xml2StrToDict(baseDir: str, xmlStr: str, moderated: bool,