2020-12-22 10:30:52 +00:00
|
|
|
__filename__ = "categories.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
2021-01-26 10:07:42 +00:00
|
|
|
__version__ = "1.2.0"
|
2020-12-22 10:30:52 +00:00
|
|
|
__maintainer__ = "Bob Mottram"
|
2021-09-10 16:14:50 +00:00
|
|
|
__email__ = "bob@libreserver.org"
|
2020-12-22 10:30:52 +00:00
|
|
|
__status__ = "Production"
|
2021-06-15 15:08:12 +00:00
|
|
|
__module_group__ = "RSS Feeds"
|
2020-12-22 10:30:52 +00:00
|
|
|
|
|
|
|
import os
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
def getHashtagCategory(base_dir: str, hashtag: str) -> str:
    """Looks up the category which has been assigned to a hashtag.

    The category is stored within base_dir/tags/<hashtag>.category.
    The hashtag is tried as given, then title-cased, then upper-cased,
    and the first category file which exists is the one which is read.

    Returns the contents of that file, or an empty string if no category
    file exists, if it could not be read, or if it is empty.
    """
    tagsPrefix = base_dir + '/tags/'
    spellings = (hashtag, hashtag.title(), hashtag.upper())

    for spelling in spellings:
        filename = tagsPrefix + spelling + '.category'
        if os.path.isfile(filename):
            break
    else:
        # none of the spellings has a category file
        return ''

    try:
        with open(filename, 'r') as categoryFile:
            contents = categoryFile.read()
    except OSError:
        print('EX: unable to read category ' + filename)
        return ''

    return contents if contents else ''
|
|
|
|
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
def getHashtagCategories(base_dir: str,
                         recent: bool = False, category: str = None) -> dict:
    """Returns a dictionary containing hashtag categories

    Only the top level of base_dir/tags is scanned. Every file there
    ending with '.category' contributes one hashtag (the filename up to
    its first '.') to the category named by the contents of the file.
    Hashtags longer than 42 characters, and category files which are
    empty or cannot be read, are skipped.

    base_dir: directory which contains the 'tags' subdirectory
    recent: if True then only include hashtags whose
        base_dir/tags/<hashtag>.txt file exists and was last modified
        during the current or the previous UTC day
    category: if set then only hashtags having exactly this category
        are included

    Returns a dict mapping each category name to a list of its
    hashtags, without duplicates. The dict is empty if nothing matches
    or if the tags directory does not exist.
    """
    maxTagLength = 42
    secondsPerDay = 86400
    tagsDir = base_dir + '/tags'
    hashtagCategories = {}

    if recent:
        # Day numbers are whole days since the epoch in UTC. The same
        # calculation is applied to file modification times below, so
        # that both sides of the comparison use the same timezone
        currTime = datetime.datetime.now(datetime.timezone.utc)
        daysSinceEpoch = int(currTime.timestamp() // secondsPerDay)
        recently = daysSinceEpoch - 1

    for subdir, dirs, files in os.walk(tagsDir):
        for f in files:
            if not f.endswith('.category'):
                continue
            categoryFilename = os.path.join(tagsDir, f)
            if not os.path.isfile(categoryFilename):
                continue
            hashtag = f.split('.')[0]
            if len(hashtag) > maxTagLength:
                continue

            # one unreadable file should not abort the whole scan
            try:
                with open(categoryFilename, 'r') as fp:
                    categoryStr = fp.read()
            except OSError:
                print('EX: unable to read category ' + categoryFilename)
                continue

            if not categoryStr:
                continue

            # only return a dictionary for a specific category
            if category and categoryStr != category:
                continue

            if recent:
                tagsFilename = tagsDir + '/' + hashtag + '.txt'
                if not os.path.isfile(tagsFilename):
                    continue
                try:
                    modTimeSinceEpoch = os.path.getmtime(tagsFilename)
                except OSError:
                    # the file disappeared or became inaccessible
                    continue
                fileDaysSinceEpoch = \
                    int(modTimeSinceEpoch // secondsPerDay)
                if fileDaysSinceEpoch < recently:
                    continue

            hashtagList = hashtagCategories.setdefault(categoryStr, [])
            if hashtag not in hashtagList:
                hashtagList.append(hashtag)
        # subdirectories of the tags directory are not scanned
        break
    return hashtagCategories
|
|
|
|
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
def updateHashtagCategories(base_dir: str) -> None:
    """Rebuilds the cached list of hashtag category names.

    The category names obtained from getHashtagCategories are sorted and
    written, one per line, to base_dir/accounts/categoryList.txt.
    If there are no categories then any existing cached list is deleted
    instead. Failures to delete or write are printed, not raised.
    """
    listFilename = base_dir + '/accounts/categoryList.txt'
    categories = getHashtagCategories(base_dir)

    if not categories:
        # nothing to list, so remove any stale cached list
        if os.path.isfile(listFilename):
            try:
                os.remove(listFilename)
            except OSError:
                print('EX: updateHashtagCategories ' +
                      'unable to delete cached category list ' +
                      listFilename)
        return

    # one category name per line, in sorted order
    listText = ''.join([name + '\n' for name in sorted(categories)])

    # save a list of available categories for quick lookup
    try:
        with open(listFilename, 'w+') as listFile:
            listFile.write(listText)
    except OSError:
        print('EX: unable to write category ' + listFilename)
|
2020-12-22 10:30:52 +00:00
|
|
|
|
|
|
|
|
|
|
|
def _validHashtagCategory(category: str) -> bool:
    """Checks whether a category name is acceptable.

    A valid name is non-empty, is no more than 40 characters long and
    contains no comma, space, less-than sign, semicolon, backslash,
    double quote, ampersand or hash character.
    """
    if not category:
        return False

    # too long
    if len(category) > 40:
        return False

    forbiddenChars = (',', ' ', '<', ';', '\\', '"', '&', '#')
    return not any(ch in category for ch in forbiddenChars)
|
|
|
|
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
def setHashtagCategory(base_dir: str, hashtag: str, category: str,
                       update: bool, force: bool = False) -> bool:
    """Sets the category for the hashtag

    The category is written to base_dir/tags/<hashtag>.category.

    base_dir: directory which contains the 'tags' subdirectory
    hashtag: the hashtag to be categorised
    category: the category name, which must pass _validHashtagCategory
    update: if True then the cached category list is regenerated via
        updateHashtagCategories after a successful write
    force: if False then base_dir/tags/<hashtag>.txt must already exist
        (the hashtag is tried as given, then title-cased, then
        upper-cased, and the spelling which was found is the one used)
        and any existing category is overwritten. If True then no
        .txt file is needed, but an existing category file is never
        overwritten.

    Returns True if the category file was written, otherwise False.
    """
    if not _validHashtagCategory(category):
        return False

    tagsDir = base_dir + '/tags'

    if not force:
        # the hashtag needs to be known under one of these spellings
        titled = hashtag.title()
        for spelling in (hashtag, titled, titled.upper()):
            if os.path.isfile(tagsDir + '/' + spelling + '.txt'):
                hashtag = spelling
                break
        else:
            return False

    if not os.path.isdir(tagsDir):
        # Failure is reported through the return value, like every
        # other failure of this function, rather than by raising
        try:
            os.mkdir(tagsDir)
        except FileExistsError:
            # something else created it in the meantime
            pass
        except OSError as ex:
            print('EX: unable to create tags directory ' + tagsDir +
                  ' ' + str(ex))
            return False

    categoryFilename = tagsDir + '/' + hashtag + '.category'
    if force and os.path.isfile(categoryFilename):
        # don't overwrite any existing categories
        return False

    # Success is only assumed once the file has been closed, so that an
    # error when flushing the data is not reported as a successful write
    try:
        with open(categoryFilename, 'w+') as fp:
            fp.write(category)
    except OSError as ex:
        print('EX: unable to write category ' + categoryFilename +
              ' ' + str(ex))
        return False

    if update:
        updateHashtagCategories(base_dir)
    return True
|
|
|
|
|
|
|
|
|
|
|
|
def guessHashtagCategory(tagName: str, hashtagCategories: dict) -> str:
    """Tries to guess a category for the given hashtag.
    This works by trying to find the longest similar hashtag

    A known hashtag is similar if it contains tagName or is contained
    within it. Tags shorter than four characters, on either side of the
    comparison, are never matched.

    tagName: the hashtag to find a category for
    hashtagCategories: dict mapping category name to a list of hashtags

    Returns the category of the longest similar hashtag, or an empty
    string if there is none. If several similar hashtags share the
    longest length then the first one encountered wins.
    """
    minTagLength = 4
    if len(tagName) < minTagLength:
        return ''

    categoryMatched = ''
    tagMatchedLen = 0

    for categoryStr, hashtagList in hashtagCategories.items():
        for hashtag in hashtagList:
            if len(hashtag) < minTagLength:
                # avoid matching very small strings which often
                # lead to spurious categories
                continue
            if hashtag not in tagName and tagName not in hashtag:
                continue
            # match the longest tag. The best length so far has to be
            # recorded on every improvement, otherwise a later match
            # which is shorter than the current best could replace it
            if len(hashtag) > tagMatchedLen:
                tagMatchedLen = len(hashtag)
                categoryMatched = categoryStr
    return categoryMatched
|