2020-12-22 10:30:52 +00:00
|
|
|
__filename__ = "categories.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
2024-12-22 23:37:30 +00:00
|
|
|
__version__ = "1.6.0"
|
2020-12-22 10:30:52 +00:00
|
|
|
__maintainer__ = "Bob Mottram"
|
2021-09-10 16:14:50 +00:00
|
|
|
__email__ = "bob@libreserver.org"
|
2020-12-22 10:30:52 +00:00
|
|
|
__status__ = "Production"
|
2021-06-15 15:08:12 +00:00
|
|
|
__module_group__ = "RSS Feeds"
|
2020-12-22 10:30:52 +00:00
|
|
|
|
|
|
|
import os
|
|
|
|
import datetime
|
2024-05-12 12:35:26 +00:00
|
|
|
from utils import data_dir
|
2023-11-20 22:27:58 +00:00
|
|
|
from utils import date_utcnow
|
|
|
|
from utils import date_epoch
|
2024-08-08 17:23:33 +00:00
|
|
|
from utils import replace_strings
|
2020-12-22 10:30:52 +00:00
|
|
|
|
2021-12-30 18:38:36 +00:00
|
|
|
# Hashtags longer than this number of characters are skipped
# when the dictionary of hashtag categories is built
MAX_TAG_LENGTH = 42
|
|
|
|
|
|
|
|
# A category name containing any of these characters is considered invalid
INVALID_HASHTAG_CHARS = (',', ' ', '<', ';', '\\', '"', '&', '#')
|
|
|
|
|
2020-12-22 10:30:52 +00:00
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def get_hashtag_category(base_dir: str, hashtag: str) -> str:
    """Returns the category for the hashtag

    The category is read from base_dir/tags/<hashtag>.category, trying
    the hashtag as given, then in title case, then in upper case.
    An empty string is returned if no category file exists, if the
    file is empty or if it could not be read or decoded.
    """
    tags_dir = base_dir + '/tags/'
    category_filename = ''
    for variant in (hashtag, hashtag.title(), hashtag.upper()):
        candidate = tags_dir + variant + '.category'
        if os.path.isfile(candidate):
            category_filename = candidate
            break
    if not category_filename:
        return ''

    category_str = None
    try:
        with open(category_filename, 'r', encoding='utf-8') as fp_category:
            category_str = fp_category.read()
    except OSError:
        print('EX: unable to read category ' + category_filename)
    except UnicodeError as ex:
        # reading invalid utf-8 raises UnicodeDecodeError, which is not
        # an OSError. UnicodeError covers both decode and encode errors
        print('EX: unable to read category unicode ' + category_filename +
              ' ' + str(ex))
    return category_str or ''
|
|
|
|
|
|
|
|
|
2023-09-02 14:42:59 +00:00
|
|
|
def _write_city_category(city_filename: str, category_str: str,
                         label: str) -> None:
    """Writes the category to the given file, unless it already exists
    label appears within the message printed if the write fails
    """
    if os.path.isfile(city_filename):
        return
    try:
        with open(city_filename, 'w+', encoding='utf-8') as fp_city:
            fp_city.write(category_str)
    except OSError:
        print('EX: unable to write city ' + label + ' ' + city_filename)


def load_city_hashtags(base_dir: str, translate: {}) -> None:
    """create hashtag categories for cities

    Each line of every .txt file within base_dir/data/cities is treated
    as a city name. For each one a base_dir/tags/<hashtag>.category
    file containing the category name is created if it does not
    already exist. The category name is 'places', or its entry within
    the translate dictionary if there is one.
    For names containing '-' or ' ' an additional category file is
    created, named by joining the title cased sections.
    """
    category_str = 'places'
    if translate.get(category_str):
        category_str = translate[category_str]

    replacements = {
        ' & ': ' and ',
        '/': ''
    }
    replacements2 = {
        '-': '',
        ' ': ''
    }
    tags_dir = base_dir + '/tags/'
    for _, _, files in os.walk(base_dir + '/data/cities'):
        for cities_file in files:
            if not cities_file.endswith('.txt'):
                continue
            cities_filename = base_dir + '/data/cities/' + cities_file
            if not os.path.isfile(cities_filename):
                continue
            cities: list[str] = []
            try:
                with open(cities_filename, 'r',
                          encoding='utf-8') as fp_cities:
                    cities = fp_cities.read().split('\n')
            except OSError:
                print('EX: unable to load cities file ' + cities_filename)
            if not cities:
                continue
            for hashtag in cities:
                hashtag = hashtag.lower().strip()
                # presumably replace_strings substitutes each dictionary
                # key with its value - verify against utils
                hashtag = replace_strings(hashtag, replacements)

                hashtag2 = replace_strings(hashtag, replacements2)
                if not hashtag2:
                    # blank lines, such as the one produced by a trailing
                    # newline, would otherwise create a '.category' file
                    # with no hashtag name
                    continue
                _write_city_category(tags_dir + hashtag2 + '.category',
                                     category_str, 'category')

                # also create title cased variants, such as
                # NewYork for "new york"
                for separator, label in (('-', 'category2'),
                                         (' ', 'category3')):
                    if separator not in hashtag:
                        continue
                    hashtag2 = ''.join(text.lower().title()
                                       for text in hashtag.split(separator))
                    _write_city_category(tags_dir + hashtag2 + '.category',
                                         category_str, label)
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def get_hashtag_categories(base_dir: str,
                           recent: bool, category: str) -> dict:
    """Returns a dictionary containing hashtag categories

    The dictionary maps each category name to a list of hashtags,
    built from the .category files directly within base_dir/tags.
    If category is set then only hashtags having that category are
    included. If recent is True then only hashtags having a
    base_dir/tags/<hashtag>.txt file whose modification day is no
    earlier than the day before the current one are included.
    """
    hashtag_categories = {}

    # always bound, but only used when recent is True
    recently = 0
    if recent:
        curr_time = date_utcnow()
        days_since_epoch = (curr_time - date_epoch()).days
        recently = days_since_epoch - 1

    tags_dir = base_dir + '/tags'
    for _, _, files in os.walk(tags_dir):
        for catfile in files:
            if not catfile.endswith('.category'):
                continue
            category_filename = os.path.join(tags_dir, catfile)
            if not os.path.isfile(category_filename):
                continue
            hashtag = catfile.split('.')[0]
            if len(hashtag) > MAX_TAG_LENGTH:
                continue

            category_str = None
            try:
                with open(category_filename, 'r',
                          encoding='utf-8') as fp_category:
                    category_str = fp_category.read()
            except OSError:
                print('EX: get_hashtag_categories ' + category_filename)
            except UnicodeError as ex:
                # reading invalid utf-8 raises UnicodeDecodeError, which
                # is not an OSError and would otherwise propagate
                print('EX: get_hashtag_categories unicode ' +
                      category_filename + ' ' + str(ex))

            if not category_str:
                continue

            # only return a dictionary for a specific category
            if category and category_str != category:
                continue

            if recent:
                tags_filename = tags_dir + '/' + hashtag + '.txt'
                if not os.path.isfile(tags_filename):
                    continue
                mod_time_since_epoc = os.path.getmtime(tags_filename)
                last_modified_date = \
                    datetime.datetime.fromtimestamp(mod_time_since_epoc,
                                                    datetime.timezone.utc)
                file_days_since_epoch = \
                    (last_modified_date - date_epoch()).days
                if file_days_since_epoch < recently:
                    continue

            hashtag_list = hashtag_categories.setdefault(category_str, [])
            if hashtag not in hashtag_list:
                hashtag_list.append(hashtag)
        # only the top level of the tags directory is examined
        break
    return hashtag_categories
|
2020-12-22 10:30:52 +00:00
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def update_hashtag_categories(base_dir: str) -> None:
    """Regenerates the list of hashtag categories

    The sorted category names are written, one per line, to
    categoryList.txt within the directory returned by data_dir.
    If there are no categories then any existing list file is removed.
    """
    category_list_filename = data_dir(base_dir) + '/categoryList.txt'
    hashtag_categories = get_hashtag_categories(base_dir, False, None)

    if not hashtag_categories:
        # nothing to list, so remove any previously saved list
        if not os.path.isfile(category_list_filename):
            return
        try:
            os.remove(category_list_filename)
        except OSError:
            print('EX: update_hashtag_categories ' +
                  'unable to delete cached category list ' +
                  category_list_filename)
        return

    category_list_str = \
        ''.join(name + '\n' for name in sorted(hashtag_categories))

    # save a list of available categories for quick lookup
    try:
        with open(category_list_filename, 'w+',
                  encoding='utf-8') as fp_category:
            fp_category.write(category_list_str)
    except OSError:
        print('EX: unable to write category ' + category_list_filename)
|
2020-12-22 10:30:52 +00:00
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def _valid_hashtag_category(category: str) -> bool:
    """Returns true if the category name is valid

    A valid name is not empty, contains none of the characters within
    INVALID_HASHTAG_CHARS and is no longer than 40 characters.
    """
    if not category:
        return False

    if any(bad_char in category for bad_char in INVALID_HASHTAG_CHARS):
        return False

    # too long
    return len(category) <= 40
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def set_hashtag_category(base_dir: str, hashtag: str, category: str,
                         update: bool, force: bool) -> bool:
    """Sets the category for the hashtag

    Returns True if the category was written to
    base_dir/tags/<hashtag>.category, otherwise False.
    Without force the hashtag must already have a .txt file within
    the tags directory, looked up as given, then in title case, then
    in upper case. With force no such file is needed, but an existing
    category file is never overwritten.
    If update is True the list of categories is regenerated after a
    successful write.
    """
    if not _valid_hashtag_category(category):
        return False

    tags_dir = base_dir + '/tags'

    if not force:
        # use the first case variant which has a hashtag file
        titled = hashtag.title()
        for variant in (hashtag, titled, titled.upper()):
            if os.path.isfile(tags_dir + '/' + variant + '.txt'):
                hashtag = variant
                break
        else:
            return False

    if not os.path.isdir(tags_dir):
        os.mkdir(tags_dir)
    category_filename = tags_dir + '/' + hashtag + '.category'

    # don't overwrite any existing categories
    if force and os.path.isfile(category_filename):
        return False

    try:
        with open(category_filename, 'w+', encoding='utf-8') as fp_category:
            fp_category.write(category)
    except OSError as ex:
        print('EX: unable to write category ' + category_filename +
              ' ' + str(ex))
        return False
    except UnicodeEncodeError as ex:
        print('EX: unable to write category unicode ' + category_filename +
              ' ' + str(ex))
        return False

    if update:
        update_hashtag_categories(base_dir)
    return True
|
|
|
|
|
|
|
|
|
2024-01-10 13:51:19 +00:00
|
|
|
def guess_hashtag_category(tag_name: str, hashtag_categories: {},
                           min_tag_length: int) -> str:
    """Tries to guess a category for the given hashtag.
    This works by trying to find the longest similar hashtag

    hashtag_categories maps each category name to a list of hashtags.
    A hashtag equal to tag_name gives its category immediately.
    Otherwise the category of the longest hashtag which contains
    tag_name, or is contained within it, is returned. Hashtags shorter
    than min_tag_length are not considered for these partial matches.
    Returns an empty string if tag_name is shorter than min_tag_length
    or if nothing matches.
    """
    if len(tag_name) < min_tag_length:
        return ''

    category_matched = ''
    tag_matched_len = 0

    for category_str, hashtag_list in hashtag_categories.items():
        for hashtag in hashtag_list:
            if hashtag == tag_name:
                # exact match
                return category_str
            if len(hashtag) < min_tag_length:
                # avoid matching very small strings which often
                # lead to spurious categories
                continue
            if hashtag not in tag_name and tag_name not in hashtag:
                continue
            # match the longest tag. The matched length is updated each
            # time so that a later, shorter match cannot replace a
            # longer one
            if not category_matched or len(hashtag) > tag_matched_len:
                tag_matched_len = len(hashtag)
                category_matched = category_str
    return category_matched
|