Guess hashtag categorisations

main
Bob Mottram 2020-12-05 11:11:32 +00:00
parent 60abb1a0a6
commit 67e06f65c9
2 changed files with 52 additions and 0 deletions

View File

@ -30,6 +30,8 @@ from utils import loadJson
from utils import saveJson
from utils import updateLikesCollection
from utils import undoLikesCollectionEntry
from utils import getHashtagCategories
from utils import setHashtagCategory
from httpsig import verifyPostHeaders
from session import createSession
from session import getJson
@ -68,6 +70,31 @@ from happening import saveEventPost
from delete import removeOldHashtags
def guessHashtagCategory(tagName: str, hashtagCategories: {}) -> str:
    """Tries to guess a category for the given hashtag.
    This works by finding the longest known hashtag which is
    contained within the given tag name, and returning the
    category which that known hashtag belongs to.

    tagName: the hashtag to be categorised
    hashtagCategories: dictionary mapping each category name to
        the list of hashtags within that category

    Returns the category of the longest matching hashtag. If two
    matches have the same length then the first one encountered
    is kept. Returns None if there is no match.
    """
    categoryMatched = ''
    tagMatchedLen = 0
    for categoryStr, hashtagList in hashtagCategories.items():
        for hashtag in hashtagList:
            if hashtag not in tagName:
                continue
            # Only a strictly longer match replaces the current one.
            # Starting from zero also skips empty hashtags, which
            # are a substring of every tag name
            if len(hashtag) > tagMatchedLen:
                # the length must be updated along with the category,
                # otherwise a later shorter match could replace this one
                tagMatchedLen = len(hashtag)
                categoryMatched = categoryStr
    if not categoryMatched:
        return None
    return categoryMatched
def storeHashTags(baseDir: str, nickname: str, postJsonObject: {}) -> None:
"""Extracts hashtags from an incoming post and updates the
relevant tags files.
@ -91,6 +118,8 @@ def storeHashTags(baseDir: str, nickname: str, postJsonObject: {}) -> None:
print('Creating tags directory')
os.mkdir(tagsDir)
hashtagCategories = getHashtagCategories(baseDir)
for tag in postJsonObject['object']['tag']:
if not tag.get('type'):
continue
@ -122,6 +151,14 @@ def storeHashTags(baseDir: str, nickname: str, postJsonObject: {}) -> None:
tagsFilename + ' ' + str(e))
removeOldHashtags(baseDir, 3)
# automatically assign a category to the tag if possible
categoryFilename = tagsDir + '/' + tagName + '.category'
if not os.path.isfile(categoryFilename):
categoryStr = \
guessHashtagCategory(tagName, hashtagCategories)
if categoryStr:
setHashtagCategory(baseDir, tagName, categoryStr)
def inboxStorePostToHtmlCache(recentPostsCache: {}, maxRecentPosts: int,
translate: {},

View File

@ -71,6 +71,7 @@ from delete import sendDeleteViaServer
from inbox import jsonPostAllowsComments
from inbox import validInbox
from inbox import validInboxFilenames
from inbox import guessHashtagCategory
from content import htmlReplaceEmailQuote
from content import htmlReplaceQuoteMarks
from content import dangerousMarkup
@ -2421,8 +2422,22 @@ def testValidNickname():
assert not validNickname(domain, nickname)
def testGuessHashtagCategory() -> None:
    """Checks that a hashtag is assigned to the category which
    contains a known hashtag appearing within it
    """
    print('testGuessHashtagCategory')
    hashtagCategories = {
        "foo": ["swan", "goose"],
        "bar": ["cat", "mouse"]
    }
    # pairs of (hashtag to be categorised, expected category)
    expectedGuesses = (
        ("unspecifiedgoose", "foo"),
        ("catpic", "bar")
    )
    for tagName, expectedCategory in expectedGuesses:
        assert guessHashtagCategory(tagName, hashtagCategories) == \
            expectedCategory
def runAllTests():
print('Running tests...')
testGuessHashtagCategory()
testValidNickname()
testParseFeedDate()
testFirstParagraphFromString()