epicyon/categories.py

215 lines
6.9 KiB
Python
Raw Normal View History

__filename__ = "categories.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2021-01-26 10:07:42 +00:00
__version__ = "1.2.0"
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
__status__ = "Production"
2021-06-15 15:08:12 +00:00
__module_group__ = "RSS Feeds"
import os
import datetime
2021-12-25 16:17:53 +00:00
def getHashtagCategory(base_dir: str, hashtag: str) -> str:
    """Looks up the category which has been assigned to a hashtag.
    The category file is searched for using the tag as given, then
    title-cased, then upper-cased.
    Returns an empty string if no category file exists, if the file
    could not be read, or if it is empty
    """
    tagsPrefix = base_dir + '/tags/'
    for tagVariant in (hashtag, hashtag.title(), hashtag.upper()):
        filename = tagsPrefix + tagVariant + '.category'
        if os.path.isfile(filename):
            break
    else:
        # none of the case variants has a category file
        return ''

    try:
        with open(filename, 'r') as fp:
            content = fp.read()
    except OSError:
        print('EX: unable to read category ' + filename)
        return ''
    return content or ''
2021-12-25 16:17:53 +00:00
def getHashtagCategories(base_dir: str,
                         recent: bool = False, category: str = None) -> dict:
    """Returns a dictionary mapping each category name to the list of
    hashtags filed under it, built from the .category files found
    directly inside the tags directory of base_dir.

    base_dir: directory which contains the tags subdirectory
    recent: if True then only include hashtags whose .txt file in the
            tags directory was modified today or yesterday (UTC days)
    category: if given then only hashtags having exactly this
              category are returned

    Hashtags longer than 42 characters, and category files which are
    empty or cannot be read, are skipped. If the tags directory does
    not exist then an empty dictionary is returned
    """
    maxTagLength = 42
    hashtagCategories = {}
    tagsDir = base_dir + '/tags'

    epoch = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    recently = None
    if recent:
        # Days are counted since the epoch. Timezone-aware UTC values
        # are used both here and for the file modification time below,
        # so the two day counts are comparable whatever the local
        # timezone of the server is
        curr_time = datetime.datetime.now(datetime.timezone.utc)
        recently = (curr_time - epoch).days - 1

    for _, _, files in os.walk(tagsDir):
        for f in files:
            if not f.endswith('.category'):
                continue
            categoryFilename = os.path.join(tagsDir, f)
            if not os.path.isfile(categoryFilename):
                continue
            hashtag = f.split('.')[0]
            if len(hashtag) > maxTagLength:
                continue

            # one unreadable file should not abort the whole scan
            try:
                with open(categoryFilename, 'r') as fp:
                    categoryStr = fp.read()
            except OSError:
                print('EX: unable to read category ' + categoryFilename)
                continue
            if not categoryStr:
                continue

            # only return a dictionary for a specific category
            if category and categoryStr != category:
                continue

            if recent:
                tagsFilename = tagsDir + '/' + hashtag + '.txt'
                if not os.path.isfile(tagsFilename):
                    continue
                lastModifiedDate = \
                    datetime.datetime.fromtimestamp(
                        os.path.getmtime(tagsFilename),
                        datetime.timezone.utc)
                fileDaysSinceEpoch = (lastModifiedDate - epoch).days
                if fileDaysSinceEpoch < recently:
                    continue

            tagList = hashtagCategories.setdefault(categoryStr, [])
            if hashtag not in tagList:
                tagList.append(hashtag)
        # only the top level of the tags directory is scanned
        break
    return hashtagCategories
2021-12-25 16:17:53 +00:00
def updateHashtagCategories(base_dir: str) -> None:
    """Rebuilds the cached list of category names, which is stored
    sorted, one name per line, within accounts/categoryList.txt.
    If there are no categories then any existing cached list is removed
    """
    categoryListFilename = base_dir + '/accounts/categoryList.txt'
    categoriesDict = getHashtagCategories(base_dir)

    if categoriesDict:
        sortedNames = sorted(categoriesDict)
        listText = ''.join(name + '\n' for name in sortedNames)
        # save a list of available categories for quick lookup
        try:
            with open(categoryListFilename, 'w+') as fp:
                fp.write(listText)
        except OSError:
            print('EX: unable to write category ' + categoryListFilename)
        return

    # there is nothing to list, so remove any stale cached list
    if not os.path.isfile(categoryListFilename):
        return
    try:
        os.remove(categoryListFilename)
    except OSError:
        print('EX: updateHashtagCategories ' +
              'unable to delete cached category list ' +
              categoryListFilename)
def _validHashtagCategory(category: str) -> bool:
    """Returns true if the category name is valid, which means that
    it is not empty, contains none of the disallowed characters
    and is no more than 40 characters long
    """
    if not category:
        return False

    # characters which may not appear within a category name
    forbidden = (',', ' ', '<', ';', '\\', '"', '&', '#')
    if any(symbol in category for symbol in forbidden):
        return False

    # reject names which are too long
    return len(category) <= 40
2021-12-25 16:17:53 +00:00
def setHashtagCategory(base_dir: str, hashtag: str, category: str,
                       update: bool, force: bool = False) -> bool:
    """Assigns a category to a hashtag by writing it to the .category
    file for that hashtag within the tags directory.

    Unless force is set the hashtag must already have a .txt file in
    the tags directory, which is looked for using the tag as given,
    then title-cased, then upper-cased. The variant found is the one
    which the category gets stored under.
    When force is set no .txt file is needed, but an existing category
    is never overwritten.
    If update is set then the cached category list is regenerated
    after a successful write.
    Returns true if the category was written
    """
    if not _validHashtagCategory(category):
        return False

    tagsDir = base_dir + '/tags'

    if not force:
        titled = hashtag.title()
        for candidate in (hashtag, titled, titled.upper()):
            if os.path.isfile(tagsDir + '/' + candidate + '.txt'):
                hashtag = candidate
                break
        else:
            # the hashtag is not known under any case variant
            return False

    if not os.path.isdir(tagsDir):
        os.mkdir(tagsDir)

    categoryFilename = tagsDir + '/' + hashtag + '.category'
    # don't overwrite any existing categories
    if force and os.path.isfile(categoryFilename):
        return False

    try:
        with open(categoryFilename, 'w+') as fp:
            fp.write(category)
    except OSError as ex:
        print('EX: unable to write category ' + categoryFilename +
              ' ' + str(ex))
        return False

    if update:
        updateHashtagCategories(base_dir)
    return True
def guessHashtagCategory(tagName: str, hashtagCategories: dict) -> str:
    """Tries to guess a category for the given hashtag.
    This works by finding the longest already categorised hashtag
    which either contains tagName or is contained within it, and
    returning the category of that hashtag.

    tagName: the hashtag to find a category for
    hashtagCategories: dictionary mapping each category name to a
                       list of hashtags, in the form returned by
                       getHashtagCategories

    Returns an empty string if tagName is shorter than 4 characters
    or if nothing similar was found. If several matches share the
    longest length then the first one encountered is used
    """
    minTagLength = 4
    if len(tagName) < minTagLength:
        return ''

    categoryMatched = ''
    tagMatchedLen = 0
    for categoryStr, hashtagList in hashtagCategories.items():
        for hashtag in hashtagList:
            if len(hashtag) < minTagLength:
                # avoid matching very small strings which often
                # lead to spurious categories
                continue
            if hashtag not in tagName and tagName not in hashtag:
                continue
            # match the longest tag. The length of the best match so far
            # has to be recorded each time, otherwise a later match which
            # is shorter than the best one could replace it
            if len(hashtag) > tagMatchedLen:
                tagMatchedLen = len(hashtag)
                categoryMatched = categoryStr
    return categoryMatched