epicyon/categories.py

200 lines
6.5 KiB
Python
Raw Normal View History

__filename__ = "categories.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2021-01-26 10:07:42 +00:00
__version__ = "1.2.0"
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
__status__ = "Production"
2021-06-15 15:08:12 +00:00
__module_group__ = "RSS Feeds"
import os
import datetime
def getHashtagCategory(baseDir: str, hashtag: str) -> str:
    """Returns the category for the hashtag

    The category is stored in baseDir/tags/<hashtag>.category. The hashtag
    is tried as given, then title-cased, then upper-cased, and the first
    file which exists is read. An empty string is returned if no category
    file is found or if the file is empty.
    """
    tagsPrefix = baseDir + '/tags/'
    for variant in (hashtag, hashtag.title(), hashtag.upper()):
        candidateFilename = tagsPrefix + variant + '.category'
        if os.path.isfile(candidateFilename):
            # only the first existing variant is ever consulted
            with open(candidateFilename, 'r') as categoryFile:
                return categoryFile.read() or ''
    return ''
2021-06-20 11:28:35 +00:00
def getHashtagCategories(baseDir: str,
                         recent: bool = False,
                         category: str = None) -> dict:
    """Returns a dictionary containing hashtag categories

    Reads every *.category file in the top level of baseDir/tags and
    groups the hashtags (the filename up to the first dot) by the category
    text stored in each file.

    baseDir: base directory containing the tags subdirectory
    recent: if True then only include hashtags whose <hashtag>.txt file
        exists and was modified no earlier than yesterday (UTC)
    category: if given then only this category is returned

    Returns a dict mapping category name to a list of hashtags. It is
    empty if the tags directory does not exist or nothing matches.
    """
    maxTagLength = 42
    tagsDir = baseDir + '/tags'
    epoch = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)

    recently = 0
    if recent:
        # both "now" and the file modification time are expressed in UTC
        # so that the day comparison does not depend on the local timezone
        currTime = datetime.datetime.now(datetime.timezone.utc)
        recently = (currTime - epoch).days - 1

    hashtagCategories = {}
    # only the top level of the tags directory is examined
    _, _, files = next(os.walk(tagsDir), ('', [], []))
    for f in files:
        if not f.endswith('.category'):
            continue
        categoryFilename = os.path.join(tagsDir, f)
        if not os.path.isfile(categoryFilename):
            continue
        hashtag = f.split('.')[0]
        if len(hashtag) > maxTagLength:
            continue
        with open(categoryFilename, 'r') as fp:
            categoryStr = fp.read()

        if not categoryStr:
            continue

        # only return a dictionary for a specific category
        if category and categoryStr != category:
            continue

        if recent:
            tagsFilename = tagsDir + '/' + hashtag + '.txt'
            if not os.path.isfile(tagsFilename):
                continue
            lastModifiedDate = \
                datetime.datetime.fromtimestamp(
                    os.path.getmtime(tagsFilename),
                    datetime.timezone.utc)
            fileDaysSinceEpoch = (lastModifiedDate - epoch).days
            if fileDaysSinceEpoch < recently:
                continue

        hashtagList = hashtagCategories.setdefault(categoryStr, [])
        if hashtag not in hashtagList:
            hashtagList.append(hashtag)
    return hashtagCategories
def updateHashtagCategories(baseDir: str) -> None:
    """Regenerates the list of hashtag categories

    Writes the sorted category names, one per line, to
    baseDir/accounts/categoryList.txt. If there are no categories then
    any existing list file is removed instead.
    """
    categoryListFilename = baseDir + '/accounts/categoryList.txt'
    hashtagCategories = getHashtagCategories(baseDir)
    if not hashtagCategories:
        if os.path.isfile(categoryListFilename):
            try:
                os.remove(categoryListFilename)
            except OSError:
                # removal is best-effort, but only filesystem errors are
                # tolerated so that interrupts and exits still propagate
                print('EX: updateHashtagCategories ' +
                      'unable to delete cached category list ' +
                      categoryListFilename)
        return

    categoryListStr = ''.join(
        categoryStr + '\n' for categoryStr in sorted(hashtagCategories))

    # save a list of available categories for quick lookup
    with open(categoryListFilename, 'w+') as fp:
        fp.write(categoryListStr)
def _validHashtagCategory(category: str) -> bool:
    """Returns true if the category name is valid

    A valid category is non-empty, is at most 40 characters long and
    contains none of the characters , space < ; backslash " & #
    """
    maxCategoryLength = 40
    forbiddenChars = (',', ' ', '<', ';', '\\', '"', '&', '#')

    if not category:
        return False
    if any(symbol in category for symbol in forbiddenChars):
        return False
    # anything longer than the maximum is rejected
    return len(category) <= maxCategoryLength
def setHashtagCategory(baseDir: str, hashtag: str, category: str,
                       update: bool, force: bool = False) -> bool:
    """Sets the category for the hashtag

    baseDir: base directory containing the tags subdirectory
    hashtag: the hashtag to be categorised
    category: category name, which must pass _validHashtagCategory
    update: if True then the cached list of categories is regenerated
    force: if False the hashtag must already have a .txt file within the
        tags directory (as given, title-cased or upper-cased). If True
        that check is skipped, but an existing category is never
        overwritten.

    Returns True if the category file was written, otherwise False.
    """
    if not _validHashtagCategory(category):
        return False

    tagsDir = baseDir + '/tags'

    if not force:
        # use whichever capitalisation of the hashtag already exists
        titled = hashtag.title()
        for candidate in (hashtag, titled, titled.upper()):
            if os.path.isfile(tagsDir + '/' + candidate + '.txt'):
                hashtag = candidate
                break
        else:
            return False

    if not os.path.isdir(tagsDir):
        os.mkdir(tagsDir)
    categoryFilename = tagsDir + '/' + hashtag + '.category'
    if force:
        # don't overwrite any existing categories
        if os.path.isfile(categoryFilename):
            return False

    with open(categoryFilename, 'w+') as fp:
        fp.write(category)

    # regenerate the list only after the file has been closed, so that
    # the newly written category has reached disk and can be read back
    if update:
        updateHashtagCategories(baseDir)
    return True
def guessHashtagCategory(tagName: str, hashtagCategories: dict) -> str:
    """Tries to guess a category for the given hashtag.
    This works by trying to find the longest similar hashtag

    tagName: the hashtag to be categorised
    hashtagCategories: dict mapping category name to a list of hashtags

    A known hashtag is similar if it contains tagName or is contained
    within it. Returns the category of the longest similar hashtag, or an
    empty string if tagName is too short or nothing matches.
    """
    # avoid matching very small strings which often
    # lead to spurious categories
    minTagLength = 4
    if len(tagName) < minTagLength:
        return ''

    categoryMatched = ''
    tagMatchedLen = 0

    for categoryStr, hashtagList in hashtagCategories.items():
        for hashtag in hashtagList:
            if len(hashtag) < minTagLength:
                continue
            if hashtag not in tagName and tagName not in hashtag:
                continue
            # match the longest tag, remembering its length so that a
            # later shorter match cannot replace it
            if len(hashtag) > tagMatchedLen:
                tagMatchedLen = len(hashtag)
                categoryMatched = categoryStr
    return categoryMatched