epicyon/categories.py

215 lines
6.9 KiB
Python

# Module-level metadata: file name, authorship, licence, version,
# maintainer contact, release status and the module group this file
# belongs to.
__filename__ = "categories.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.2.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "RSS Feeds"
import os
import datetime
def getHashtagCategory(base_dir: str, hashtag: str) -> str:
    """Looks up the category assigned to a hashtag.

    The category is stored in base_dir/tags/<hashtag>.category. The
    hashtag is tried as given, then title-cased, then upper-cased, and
    the first file which exists is read.

    Returns the contents of that file, or an empty string if no
    category file exists, it could not be read, or it is empty.
    """
    tagsPrefix = base_dir + '/tags/'

    # try each casing variant in turn and stop at the first existing file
    for variant in (hashtag, hashtag.title(), hashtag.upper()):
        categoryFilename = tagsPrefix + variant + '.category'
        if os.path.isfile(categoryFilename):
            break
    else:
        # none of the casing variants has a category file
        return ''

    contents = None
    try:
        with open(categoryFilename, 'r') as fp:
            contents = fp.read()
    except OSError:
        print('EX: unable to read category ' + categoryFilename)

    return contents if contents else ''
def getHashtagCategories(base_dir: str,
                         recent: bool = False,
                         category: str = None) -> dict:
    """Returns a dictionary containing hashtag categories

    Scans the top level of base_dir/tags for *.category files and groups
    the hashtags (the filename up to its first dot) by the category text
    stored inside each file.

    base_dir: directory containing the tags subdirectory
    recent: if True, only include hashtags whose corresponding
        base_dir/tags/<hashtag>.txt file exists and was modified
        no earlier than the previous UTC day
    category: if given, only hashtags having exactly this category
        are included

    Returns a dict mapping category name to a list of hashtags. The
    dict is empty if the tags directory is missing or nothing matches.
    Category files which cannot be read are reported and skipped.
    """
    maxTagLength = 42
    secondsPerDay = 86400
    tagsDir = base_dir + '/tags'
    hashtagCategories = {}

    recently = 0
    if recent:
        # Day numbers are derived directly from epoch seconds so that the
        # current time and the file modification times are both in UTC.
        # Mixing a UTC "now" with a local-time mtime could shift the
        # comparison by a day.
        currTime = datetime.datetime.now(datetime.timezone.utc)
        daysSinceEpoch = int(currTime.timestamp() // secondsPerDay)
        recently = daysSinceEpoch - 1

    for _, _, files in os.walk(tagsDir):
        for f in files:
            if not f.endswith('.category'):
                continue
            categoryFilename = os.path.join(tagsDir, f)
            if not os.path.isfile(categoryFilename):
                continue
            hashtag = f.split('.')[0]
            if len(hashtag) > maxTagLength:
                continue

            # a single unreadable file should not abort the whole scan
            try:
                with open(categoryFilename, 'r') as fp:
                    categoryStr = fp.read()
            except OSError:
                print('EX: unable to read category ' + categoryFilename)
                continue

            if not categoryStr:
                continue

            if category:
                # only return a dictionary for a specific category
                if categoryStr != category:
                    continue

            if recent:
                tagsFilename = tagsDir + '/' + hashtag + '.txt'
                if not os.path.isfile(tagsFilename):
                    continue
                try:
                    modTimeSinceEpoch = os.path.getmtime(tagsFilename)
                except OSError:
                    # the file vanished between the check and the stat
                    print('EX: unable to get modification time ' +
                          tagsFilename)
                    continue
                fileDaysSinceEpoch = \
                    int(modTimeSinceEpoch // secondsPerDay)
                if fileDaysSinceEpoch < recently:
                    continue

            hashtagList = hashtagCategories.setdefault(categoryStr, [])
            if hashtag not in hashtagList:
                hashtagList.append(hashtag)
        # only the top level of the tags directory is examined
        break
    return hashtagCategories
def updateHashtagCategories(base_dir: str) -> None:
    """Rebuilds the cached list of hashtag categories.

    The category names returned by getHashtagCategories are written in
    sorted order, one per line, to base_dir/accounts/categoryList.txt.
    If there are no categories then any existing list file is removed.
    """
    categoryListFilename = base_dir + '/accounts/categoryList.txt'
    hashtagCategories = getHashtagCategories(base_dir)

    if hashtagCategories:
        # save a list of available categories for quick lookup
        listing = ''.join(name + '\n' for name in sorted(hashtagCategories))
        try:
            with open(categoryListFilename, 'w+') as fp:
                fp.write(listing)
        except OSError:
            print('EX: unable to write category ' + categoryListFilename)
        return

    # nothing is categorised, so a previously cached list is now stale
    if not os.path.isfile(categoryListFilename):
        return
    try:
        os.remove(categoryListFilename)
    except OSError:
        print('EX: updateHashtagCategories ' +
              'unable to delete cached category list ' +
              categoryListFilename)
def _validHashtagCategory(category: str) -> bool:
    """Checks whether a category name is acceptable.

    A name is rejected if it is empty, contains any of the characters
    , space < ; backslash " & # or is longer than 40 characters.
    """
    maxCategoryLength = 40
    forbidden = (',', ' ', '<', ';', '\\', '"', '&', '#')

    if not category:
        return False
    if any(ch in category for ch in forbidden):
        return False
    # anything beyond the maximum length is too long
    return len(category) <= maxCategoryLength
def setHashtagCategory(base_dir: str, hashtag: str, category: str,
                       update: bool, force: bool = False) -> bool:
    """Assigns a category to a hashtag.

    The category is written to base_dir/tags/<hashtag>.category.

    Without force, base_dir/tags/<hashtag>.txt must already exist. The
    hashtag is tried as given, then title-cased, then the title-cased
    form upper-cased, and the first variant having a .txt file is the
    one used for the category filename.

    With force, the .txt file is not required but an existing category
    file is never overwritten.

    If update is True the cached category list is regenerated after a
    successful write.

    Returns True if the category was written, otherwise False.
    """
    if not _validHashtagCategory(category):
        return False

    tagsDir = base_dir + '/tags'

    if not force:
        # find which casing of the hashtag has an existing tag file
        titled = hashtag.title()
        for candidate in (hashtag, titled, titled.upper()):
            if os.path.isfile(tagsDir + '/' + candidate + '.txt'):
                hashtag = candidate
                break
        else:
            return False

    if not os.path.isdir(tagsDir):
        os.mkdir(tagsDir)

    categoryFilename = tagsDir + '/' + hashtag + '.category'
    # don't overwrite any existing categories
    if force and os.path.isfile(categoryFilename):
        return False

    written = False
    try:
        with open(categoryFilename, 'w+') as fp:
            fp.write(category)
            written = True
    except OSError as ex:
        print('EX: unable to write category ' + categoryFilename +
              ' ' + str(ex))

    if not written:
        return False
    if update:
        updateHashtagCategories(base_dir)
    return True
def guessHashtagCategory(tagName: str, hashtagCategories: dict) -> str:
    """Tries to guess a category for the given hashtag.
    This works by trying to find the longest similar hashtag

    tagName: the hashtag to be categorised
    hashtagCategories: dict mapping category name to a list of hashtags

    A known hashtag is considered similar if it is contained within
    tagName or tagName is contained within it. Names shorter than four
    characters, on either side, are never matched. When several known
    hashtags of the same longest length match, the first one encountered
    wins.

    Returns the category of the longest similar hashtag, or an empty
    string if there is no match.
    """
    minTagLength = 4
    if len(tagName) < minTagLength:
        return ''

    categoryMatched = ''
    tagMatchedLen = 0

    for categoryStr, hashtagList in hashtagCategories.items():
        for hashtag in hashtagList:
            if len(hashtag) < minTagLength:
                # avoid matching very small strings which often
                # lead to spurious categories
                continue
            if hashtag not in tagName and tagName not in hashtag:
                continue
            # match the longest tag. The recorded length must follow the
            # current best match, otherwise a later tag which is shorter
            # than the best but longer than the first would replace it.
            if len(hashtag) > tagMatchedLen:
                tagMatchedLen = len(hashtag)
                categoryMatched = categoryStr
    return categoryMatched