epicyon/categories.py

188 lines
6.1 KiB
Python

__filename__ = "categories.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.2.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
import os
import datetime
def getHashtagCategory(baseDir: str, hashtag: str) -> str:
"""Returns the category for the hashtag
"""
categoryFilename = baseDir + '/tags/' + hashtag + '.category'
if not os.path.isfile(categoryFilename):
categoryFilename = baseDir + '/tags/' + hashtag.title() + '.category'
if not os.path.isfile(categoryFilename):
categoryFilename = \
baseDir + '/tags/' + hashtag.upper() + '.category'
if not os.path.isfile(categoryFilename):
return ''
with open(categoryFilename, 'r') as fp:
categoryStr = fp.read()
if categoryStr:
return categoryStr
return ''
def getHashtagCategories(baseDir: str, recent=False, category=None) -> None:
"""Returns a dictionary containing hashtag categories
"""
maxTagLength = 42
hashtagCategories = {}
if recent:
currTime = datetime.datetime.utcnow()
daysSinceEpoch = (currTime - datetime.datetime(1970, 1, 1)).days
recently = daysSinceEpoch - 1
for subdir, dirs, files in os.walk(baseDir + '/tags'):
for f in files:
if not f.endswith('.category'):
continue
categoryFilename = os.path.join(baseDir + '/tags', f)
if not os.path.isfile(categoryFilename):
continue
hashtag = f.split('.')[0]
if len(hashtag) > maxTagLength:
continue
with open(categoryFilename, 'r') as fp:
categoryStr = fp.read()
if not categoryStr:
continue
if category:
# only return a dictionary for a specific category
if categoryStr != category:
continue
if recent:
tagsFilename = baseDir + '/tags/' + hashtag + '.txt'
if not os.path.isfile(tagsFilename):
continue
modTimesinceEpoc = \
os.path.getmtime(tagsFilename)
lastModifiedDate = \
datetime.datetime.fromtimestamp(modTimesinceEpoc)
fileDaysSinceEpoch = \
(lastModifiedDate -
datetime.datetime(1970, 1, 1)).days
if fileDaysSinceEpoch < recently:
continue
if not hashtagCategories.get(categoryStr):
hashtagCategories[categoryStr] = [hashtag]
else:
if hashtag not in hashtagCategories[categoryStr]:
hashtagCategories[categoryStr].append(hashtag)
break
return hashtagCategories
def _updateHashtagCategories(baseDir: str) -> None:
"""Regenerates the list of hashtag categories
"""
categoryListFilename = baseDir + '/accounts/categoryList.txt'
hashtagCategories = getHashtagCategories(baseDir)
if not hashtagCategories:
if os.path.isfile(categoryListFilename):
os.remove(categoryListFilename)
return
categoryList = []
for categoryStr, hashtagList in hashtagCategories.items():
categoryList.append(categoryStr)
categoryList.sort()
categoryListStr = ''
for categoryStr in categoryList:
categoryListStr += categoryStr + '\n'
# save a list of available categories for quick lookup
with open(categoryListFilename, 'w+') as fp:
fp.write(categoryListStr)
def _validHashtagCategory(category: str) -> bool:
"""Returns true if the category name is valid
"""
if not category:
return False
invalidChars = (',', ' ', '<', ';', '\\')
for ch in invalidChars:
if ch in category:
return False
# too long
if len(category) > 40:
return False
return True
def setHashtagCategory(baseDir: str, hashtag: str, category: str,
force=False) -> bool:
"""Sets the category for the hashtag
"""
if not _validHashtagCategory(category):
return False
if not force:
hashtagFilename = baseDir + '/tags/' + hashtag + '.txt'
if not os.path.isfile(hashtagFilename):
hashtag = hashtag.title()
hashtagFilename = baseDir + '/tags/' + hashtag + '.txt'
if not os.path.isfile(hashtagFilename):
hashtag = hashtag.upper()
hashtagFilename = baseDir + '/tags/' + hashtag + '.txt'
if not os.path.isfile(hashtagFilename):
return False
if not os.path.isdir(baseDir + '/tags'):
os.mkdir(baseDir + '/tags')
categoryFilename = baseDir + '/tags/' + hashtag + '.category'
if force:
# don't overwrite any existing categories
if os.path.isfile(categoryFilename):
return False
with open(categoryFilename, 'w+') as fp:
fp.write(category)
_updateHashtagCategories(baseDir)
return True
return False
def guessHashtagCategory(tagName: str, hashtagCategories: {}) -> str:
"""Tries to guess a category for the given hashtag.
This works by trying to find the longest similar hashtag
"""
categoryMatched = ''
tagMatchedLen = 0
for categoryStr, hashtagList in hashtagCategories.items():
for hashtag in hashtagList:
if len(hashtag) < 3:
# avoid matching very small strings which often
# lead to spurious categories
continue
if hashtag not in tagName:
if tagName not in hashtag:
continue
if not categoryMatched:
tagMatchedLen = len(hashtag)
categoryMatched = categoryStr
else:
# match the longest tag
if len(hashtag) > tagMatchedLen:
categoryMatched = categoryStr
if not categoryMatched:
return
return categoryMatched