Extract hashtags from feeds

2020-10-16 20:49:34 +01:00 · 2020-10-16 20:49:34 +01:00 · a60491585e
parent 5bd133ffff
commit a60491585e
2 changed files with 27 additions and 2 deletions
--- a/content.py
+++ b/content.py
@@ -701,7 +701,12 @@ def addHtmlTags(baseDir: str, httpPrefix: str,
    content = content.replace('\r', '')
    content = content.replace('\n', ' --linebreak-- ')
    content = addMusicTag(content, 'nowplaying')
-    words = content.replace(',', ' ').replace(';', ' ').split(' ')
+    contentSimplified = \
+        content.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
+    contentSimplified = contentSimplified.replace('. ', ' ').strip()
+    if contentSimplified.endswith('.'):
+        contentSimplified = contentSimplified[:len(contentSimplified)-1]
+    words = contentSimplified.split(' ')

    # remove . for words which are not mentions
    newWords = []
--- a/newswire.py
+++ b/newswire.py
@@ -52,6 +52,25 @@ def rss2Footer() -> str:
    return rssStr


+def getNewswireTags(text: str) -> []:
+    """Returns the distinct '#'-prefixed words of text, in first-seen order
+    """
+    if ' ' not in text:  # NOTE(review): so a lone "#tag" yields no tags
+        return []
+    textSimplified = \
+        text.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
+    textSimplified = textSimplified.replace('. ', ' ').strip()
+    if textSimplified.endswith('.'):  # drop one trailing full stop
+        textSimplified = textSimplified[:len(textSimplified)-1]
+    words = textSimplified.split(' ')  # splits on single spaces only
+    tags = []
+    for wrd in words:
+        if wrd.startswith('#'):
+            if wrd not in tags:  # keep each tag once, first-seen order
+                tags.append(wrd)
+    return tags
+
+
 def addNewswireDictEntry(newswire: {}, dateStr: str,
                         title: str, link: str,
                         votesStatus: str, postFilename: str,
@@ -60,7 +79,8 @@ def addNewswireDictEntry(newswire: {}, dateStr: str,
    """
    newswire[dateStr] = [title, link,
                         votesStatus, postFilename,
-                         description, moderated]
+                         description, moderated,
+                         getNewswireTags(title + ' ' + description)]


 def xml2StrToDict(baseDir: str, xmlStr: str, moderated: bool,