epicyon/categories.py

305 lines
10 KiB
Python
Raw Normal View History

__filename__ = "categories.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2026-01-02 11:36:24 +00:00
__version__ = "1.7.0"
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
__status__ = "Production"
2021-06-15 15:08:12 +00:00
__module_group__ = "RSS Feeds"
import os
import datetime
from timeFunctions import date_utcnow
from timeFunctions import date_epoch
2024-05-12 12:35:26 +00:00
from utils import data_dir
2024-08-08 17:23:33 +00:00
from utils import replace_strings
2026-03-18 12:00:20 +00:00
from utils import get_invalid_characters
2026-04-25 19:43:39 +00:00
from data import load_string
from data import save_string
2026-05-02 09:43:35 +00:00
from data import erase_file
2026-05-02 11:34:27 +00:00
from data import is_a_file
2021-12-30 18:38:36 +00:00
# hashtags longer than this are skipped when gathering hashtag categories
MAX_TAG_LENGTH = 42
# characters which are not permitted within a hashtag category name
INVALID_HASHTAG_CHARS = (',', ' ', '<', ';', '\\', '"', '&', '#')
2021-12-29 21:55:09 +00:00
def get_hashtag_category(base_dir: str, hashtag: str) -> str:
    """Looks up the category assigned to a hashtag

    The category is read from base_dir/tags/<hashtag>.category, trying
    the hashtag as given, then title cased, then upper cased.
    Returns an empty string if no such file exists or nothing was loaded.
    """
    tags_prefix = base_dir + '/tags/'
    # try each case variant in turn and stop at the first file found
    for variant in (hashtag, hashtag.title(), hashtag.upper()):
        category_filename = tags_prefix + variant + '.category'
        if is_a_file(category_filename):
            break
    else:
        # none of the case variants has a category file
        return ''

    category_str: str = \
        load_string(category_filename,
                    'EX: unable to read category ' +
                    category_filename + ' [ex]')
    return category_str if category_str else ''
2023-09-02 14:42:59 +00:00
def load_city_hashtags(base_dir: str, translate: dict) -> None:
    """create hashtag categories for cities

    Reads every .txt file found by walking base_dir/data/cities, treating
    each line as a city name, and creates base_dir/tags/<name>.category
    files containing the (translated, if available) 'places' category.
    Existing category files are never overwritten.

    For each city up to three hashtag spellings are created:
      - the lower case name with hyphens and spaces removed
      - if the name contains a hyphen, its hyphen separated parts
        title cased and joined together
      - if the name contains a space, its space separated parts
        title cased and joined together
    Blank lines are ignored, so that no '.category' file with an empty
    hashtag name is created.
    """
    category_str: str = 'places'
    if translate.get(category_str):
        category_str = translate[category_str]

    replacements = {
        ' & ': ' and ',
        '/': ''
    }
    replacements2 = {
        '-': '',
        ' ': ''
    }
    # separator within the city name, paired with the label used within
    # the error message if the category file can't be written
    separators = (
        ('-', 'category2'),
        (' ', 'category3')
    )
    cities_dir = base_dir + '/data/cities'
    tags_prefix = base_dir + '/tags/'

    for _, _, files in os.walk(cities_dir):
        for cities_file in files:
            if not cities_file.endswith('.txt'):
                continue
            # NOTE: the filename is always built relative to the top level
            # cities directory, so files within subdirectories are only
            # used if a file with the same name exists at the top level
            cities_filename = cities_dir + '/' + cities_file
            if not is_a_file(cities_filename):
                continue
            cities_str = \
                load_string(cities_filename,
                            'EX: unable to load cities file ' +
                            cities_filename + ' [ex]')
            if not cities_str:
                continue

            for line in cities_str.split('\n'):
                hashtag = line.lower().strip()
                hashtag = replace_strings(hashtag, replacements)
                if not hashtag:
                    # blank line, such as the one after a trailing newline
                    continue

                # each entry is (hashtag spelling, error message label)
                candidates = [
                    (replace_strings(hashtag, replacements2), 'category')
                ]
                for separator, label in separators:
                    if separator not in hashtag:
                        continue
                    joined: str = ''
                    for text in hashtag.split(separator):
                        joined += text.lower().title()
                    candidates.append((joined, label))

                for tag_name, label in candidates:
                    if not tag_name:
                        # nothing left after removing separators
                        continue
                    city_filename = tags_prefix + tag_name + '.category'
                    if is_a_file(city_filename):
                        # don't overwrite an existing category
                        continue
                    save_string(category_str, city_filename,
                                'EX: unable to write city ' + label + ' ' +
                                city_filename)
2023-09-02 14:42:59 +00:00
2021-12-29 21:55:09 +00:00
def get_hashtag_categories(base_dir: str,
                           recent: bool, category: str) -> dict:
    """Returns a dictionary containing hashtag categories

    The .category files directly within base_dir/tags are read and the
    result maps each category name to the list of hashtags assigned
    to it. Subdirectories are not searched.

    If category is set then only hashtags having exactly that category
    are included.
    If recent is True then a hashtag is only included if its
    base_dir/tags/<hashtag>.txt file exists and was last modified on the
    current or previous day, counted in whole days relative to
    date_epoch().
    Hashtags longer than MAX_TAG_LENGTH, and category files for which
    nothing could be loaded, are skipped.
    """
    hashtag_categories: dict[str, list[str]] = {}
    tags_dir = base_dir + '/tags'

    # earliest day number (relative to date_epoch) which counts as recent
    recently: int = 0
    if recent:
        curr_time = date_utcnow()
        days_since_epoch = (curr_time - date_epoch()).days
        recently = days_since_epoch - 1

    for _, _, files in os.walk(tags_dir):
        for catfile in files:
            if not catfile.endswith('.category'):
                continue
            category_filename = os.path.join(tags_dir, catfile)
            if not is_a_file(category_filename):
                continue
            # the hashtag is the part of the filename before the first dot
            hashtag = catfile.split('.')[0]
            if len(hashtag) > MAX_TAG_LENGTH:
                continue
            category_str: str = \
                load_string(category_filename,
                            'EX: get_hashtag_categories ' +
                            category_filename + ' [ex]')
            if not category_str:
                continue

            if category:
                # only return a dictionary for a specific category
                if category_str != category:
                    continue

            if recent:
                tags_filename = tags_dir + '/' + hashtag + '.txt'
                if not is_a_file(tags_filename):
                    continue
                mod_time_since_epoc = \
                    os.path.getmtime(tags_filename)
                last_modified_date = \
                    datetime.datetime.fromtimestamp(mod_time_since_epoc,
                                                    datetime.timezone.utc)
                file_days_since_epoch = \
                    (last_modified_date - date_epoch()).days
                if file_days_since_epoch < recently:
                    continue

            hashtags_in_category = \
                hashtag_categories.setdefault(category_str, [])
            if hashtag not in hashtags_in_category:
                hashtags_in_category.append(hashtag)
        # only the top level of the tags directory is examined
        break
    return hashtag_categories
2021-12-29 21:55:09 +00:00
def update_hashtag_categories(base_dir: str) -> None:
    """Rebuilds the cached list of hashtag category names

    The names of all categories currently in use are written, sorted and
    one per line, to categoryList.txt within the data directory.
    If there are no categories then any existing list file is erased.
    """
    category_list_filename = data_dir(base_dir) + '/categoryList.txt'
    hashtag_categories = get_hashtag_categories(base_dir, False, None)

    if hashtag_categories:
        # save a list of available categories for quick lookup
        category_list_str: str = \
            ''.join(name + '\n' for name in sorted(hashtag_categories))
        save_string(category_list_str, category_list_filename,
                    'EX: unable to write category ' +
                    category_list_filename)
        return

    # no categories exist, so remove any stale cached list
    if is_a_file(category_list_filename):
        erase_file(category_list_filename,
                   'EX: update_hashtag_categories ' +
                   'unable to delete cached category list ' +
                   category_list_filename)
2021-12-29 21:55:09 +00:00
def _valid_hashtag_category(category: str) -> bool:
"""Returns true if the category name is valid
"""
if not category:
return False
2026-03-18 13:03:41 +00:00
for char in category:
if ord(char) == 0:
return False
2026-03-18 13:03:41 +00:00
for char in get_invalid_characters():
2026-03-18 12:00:20 +00:00
if char in category:
return False
2021-12-30 18:38:36 +00:00
for char in INVALID_HASHTAG_CHARS:
if char in category:
return False
2026-03-18 11:54:56 +00:00
# too long or too short
if len(category) > 40:
return False
2026-03-18 12:00:20 +00:00
if len(category) == 0:
2026-03-18 11:54:56 +00:00
return False
return True
2021-12-29 21:55:09 +00:00
def set_hashtag_category(base_dir: str, hashtag: str, category: str,
                         update: bool, force: bool) -> bool:
    """Assigns a category to a hashtag

    The category is saved to base_dir/tags/<hashtag>.category and True
    is returned if it was written.

    If force is False then the hashtag must already have a
    base_dir/tags/<hashtag>.txt file, which is looked for using the
    hashtag as given, then title cased, then upper cased. The spelling
    which was found is the one used for the category filename.
    If force is True then that check is skipped, but an existing
    category file is never overwritten.
    If update is True then the cached list of categories is regenerated
    after a successful write.
    """
    if not _valid_hashtag_category(category):
        return False

    tags_dir = base_dir + '/tags'

    if not force:
        # find the case variant of the hashtag which has a tags file
        titled = hashtag.title()
        for candidate in (hashtag, titled, titled.upper()):
            if is_a_file(tags_dir + '/' + candidate + '.txt'):
                hashtag = candidate
                break
        else:
            return False

    if not os.path.isdir(tags_dir):
        os.mkdir(tags_dir)

    category_filename = tags_dir + '/' + hashtag + '.category'
    # don't overwrite any existing categories
    if force and is_a_file(category_filename):
        return False

    category_written: bool = \
        bool(save_string(category, category_filename,
                         'EX: unable to write category ' +
                         category_filename + ' [ex]'))
    if not category_written:
        return False

    if update:
        update_hashtag_categories(base_dir)
    return True
def guess_hashtag_category(tag_name: str,
                           hashtag_categories: dict[str, list[str]],
                           min_tag_length: int) -> str:
    """Tries to guess a category for the given hashtag.
    This works by trying to find the longest similar hashtag

    hashtag_categories maps each category name to its list of hashtags.
    A hashtag is similar if it contains tag_name or is contained
    within it. An exact match always wins. Otherwise the category of the
    longest similar hashtag is returned, with the first one found
    winning a tie.
    Returns an empty string if tag_name is shorter than min_tag_length
    or if nothing similar was found.
    """
    if len(tag_name) < min_tag_length:
        return ''

    category_matched: str = ''
    tag_matched_len: int = 0
    for category_str, hashtag_list in hashtag_categories.items():
        for hashtag in hashtag_list:
            if hashtag == tag_name:
                # exact match
                return category_str
            if len(hashtag) < min_tag_length:
                # avoid matching very small strings which often
                # lead to spurious categories
                continue
            if hashtag not in tag_name and tag_name not in hashtag:
                continue
            # match the longest tag. The matched length has to be updated
            # together with the category, otherwise a later shorter
            # hashtag could replace the longest match
            if not category_matched or len(hashtag) > tag_matched_len:
                tag_matched_len = len(hashtag)
                category_matched = category_str
    return category_matched