From 67e06f65c9ef03950b327e7229a12a9b992f4537 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Sat, 5 Dec 2020 11:11:32 +0000
Subject: [PATCH] Guess hashtag categorisations

---
 inbox.py | 35 +++++++++++++++++++++++++++++++++++
 tests.py | 27 +++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/inbox.py b/inbox.py
index cf681eb4e..d0c2a21dc 100644
--- a/inbox.py
+++ b/inbox.py
@@ -30,6 +30,8 @@ from utils import loadJson
 from utils import saveJson
 from utils import updateLikesCollection
 from utils import undoLikesCollectionEntry
+from utils import getHashtagCategories
+from utils import setHashtagCategory
 from httpsig import verifyPostHeaders
 from session import createSession
 from session import getJson
@@ -68,6 +70,29 @@ from happening import saveEventPost
 from delete import removeOldHashtags
 
 
+def guessHashtagCategory(tagName: str, hashtagCategories: {}) -> str:
+    """Tries to guess a category for the given hashtag.
+    This works by trying to find the longest known hashtag which
+    is contained within the given one. hashtagCategories maps each
+    category name to a list of hashtags belonging to it.
+    Returns the category of the longest match, or an empty string
+    if no known hashtag is contained within tagName
+    """
+    categoryMatched = ''
+    tagMatchedLen = 0
+
+    for categoryStr, hashtagList in hashtagCategories.items():
+        for hashtag in hashtagList:
+            if hashtag not in tagName:
+                continue
+            # only a strictly longer match replaces the current best,
+            # so the best length must be updated along with the category
+            if len(hashtag) > tagMatchedLen:
+                tagMatchedLen = len(hashtag)
+                categoryMatched = categoryStr
+    return categoryMatched
+
+
 def storeHashTags(baseDir: str, nickname: str, postJsonObject: {}) -> None:
     """Extracts hashtags from an incoming post and updates the
     relevant tags files.
@@ -91,6 +116,8 @@ def storeHashTags(baseDir: str, nickname: str, postJsonObject: {}) -> None:
         print('Creating tags directory')
         os.mkdir(tagsDir)
 
+    hashtagCategories = getHashtagCategories(baseDir)
+
     for tag in postJsonObject['object']['tag']:
         if not tag.get('type'):
             continue
@@ -122,6 +149,14 @@ def storeHashTags(baseDir: str, nickname: str, postJsonObject: {}) -> None:
                           tagsFilename + ' ' + str(e))
                 removeOldHashtags(baseDir, 3)
 
+        # automatically assign a category to the tag if possible
+        categoryFilename = tagsDir + '/' + tagName + '.category'
+        if not os.path.isfile(categoryFilename):
+            categoryStr = \
+                guessHashtagCategory(tagName, hashtagCategories)
+            if categoryStr:
+                setHashtagCategory(baseDir, tagName, categoryStr)
+
 
 def inboxStorePostToHtmlCache(recentPostsCache: {}, maxRecentPosts: int,
                               translate: {},
diff --git a/tests.py b/tests.py
index aae2345b3..be85000b3 100644
--- a/tests.py
+++ b/tests.py
@@ -71,6 +71,7 @@ from delete import sendDeleteViaServer
 from inbox import jsonPostAllowsComments
 from inbox import validInbox
 from inbox import validInboxFilenames
+from inbox import guessHashtagCategory
 from content import htmlReplaceEmailQuote
 from content import htmlReplaceQuoteMarks
 from content import dangerousMarkup
@@ -2421,8 +2422,34 @@ def testValidNickname():
     assert not validNickname(domain, nickname)
 
 
+def testGuessHashtagCategory() -> None:
+    print('testGuessHashtagCategory')
+    hashtagCategories = {
+        "foo": ["swan", "goose"],
+        "bar": ["cat", "mouse"]
+    }
+    guess = guessHashtagCategory("unspecifiedgoose", hashtagCategories)
+    assert guess == "foo"
+
+    guess = guessHashtagCategory("catpic", hashtagCategories)
+    assert guess == "bar"
+
+    # a shorter match seen later must not displace a longer earlier one
+    hashtagCategories = {
+        "foo": ["cat"],
+        "bar": ["category"],
+        "baz": ["cate"]
+    }
+    guess = guessHashtagCategory("category", hashtagCategories)
+    assert guess == "bar"
+
+    guess = guessHashtagCategory("xyz", hashtagCategories)
+    assert guess == ""
+
+
 def runAllTests():
     print('Running tests...')
+    testGuessHashtagCategory()
     testValidNickname()
     testParseFeedDate()
     testFirstParagraphFromString()