From 36915c35d3bb7504b930c2c79fa796d7eacae8c2 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 31 Aug 2024 11:26:14 +0100 Subject: [PATCH] Move hashtag functions to hashtag module --- inbox.py | 180 +--------------------------------------- newsdaemon.py | 2 +- outbox.py | 2 +- webapp_hashtagswarm.py | 181 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 184 insertions(+), 181 deletions(-) diff --git a/inbox.py b/inbox.py index 04bdefd76..9be04dfd0 100644 --- a/inbox.py +++ b/inbox.py @@ -21,9 +21,7 @@ from reaction import valid_emoji_content from utils import harmless_markup from utils import quote_toots_allowed from utils import lines_in_file -from utils import resembles_url from utils import get_url_from_post -from utils import date_from_string_format from utils import date_epoch from utils import date_utcnow from utils import contains_statuses @@ -44,7 +42,6 @@ from utils import is_system_account from utils import invalid_ciphertext from utils import contains_private_key from utils import remove_html -from utils import file_last_modified from utils import has_object_string from utils import has_object_string_object from utils import get_reply_interval_hours @@ -65,7 +62,6 @@ from utils import remove_id_ending from utils import get_protocol_prefixes from utils import is_blog_post from utils import remove_avatar_from_cache -from utils import is_public_post from utils import get_cached_post_filename from utils import remove_post_from_cache from utils import url_permitted @@ -83,13 +79,10 @@ from utils import undo_reaction_collection_entry from utils import has_group_type from utils import local_actor_url from utils import has_object_string_type -from utils import valid_hash_tag from utils import get_attributed_to from utils import get_reply_to from utils import get_actor_from_post from utils import data_dir -from categories import get_hashtag_categories -from categories import set_hashtag_category from httpsig import get_digest_algorithm_from_headers from httpsig import verify_post_headers from session import create_session @@ -149,15 +142,13 @@ from git import is_git_patch from git import receive_git_patch from followingCalendar import receiving_calendar_events from happening import save_event_post -from delete import remove_old_hashtags -from categories import guess_hashtag_category from context import has_valid_context from speaker import update_speaker from announce import is_self_announce from announce import create_announce from notifyOnPost import notify_when_person_posts from conversation import update_conversation -from webapp_hashtagswarm import html_hash_tag_swarm +from webapp_hashtagswarm import store_hash_tags from person import valid_sending_actor from person import get_person_avatar_url from fitnessFunctions import fitness_performance @@ -166,10 +157,6 @@ from content import reject_twitter_summary from content import load_dogwhistles from content import valid_url_lengths from threads import begin_thread -from maps import get_map_links_from_post_content -from maps import get_location_from_post -from maps import add_tag_map_links -from maps import geocoords_from_map_link from reading import store_book_events @@ -207,171 +194,6 @@ def _store_last_post_id(base_dir: str, nickname: str, domain: str, print('EX: Unable to write last post id to ' + actor_filename) -def _update_cached_hashtag_swarm(base_dir: str, nickname: str, domain: str, - http_prefix: str, domain_full: str, - translate: {}) -> bool: - """Updates the hashtag swarm stored as a file - """ - cached_hashtag_swarm_filename = \ - acct_dir(base_dir, nickname, domain) + '/.hashtagSwarm' - save_swarm = True - if os.path.isfile(cached_hashtag_swarm_filename): - last_modified = file_last_modified(cached_hashtag_swarm_filename) - modified_date = None - try: - modified_date = \ - date_from_string_format(last_modified, ["%Y-%m-%dT%H:%M:%S%z"]) - except BaseException: - print('EX: unable to parse last modified cache date ' + - str(last_modified)) - if modified_date: - curr_date = date_utcnow() - time_diff = curr_date - modified_date - diff_mins = int(time_diff.total_seconds() / 60) - if diff_mins < 30: - # was saved recently, so don't save again - # This avoids too much disk I/O - save_swarm = False - print('Not updating hashtag swarm') - else: - print('Updating cached hashtag swarm, last changed ' + - str(diff_mins) + ' minutes ago') - else: - print('WARN: no modified date for ' + str(last_modified)) - if save_swarm: - actor = local_actor_url(http_prefix, nickname, domain_full) - new_swarm_str = html_hash_tag_swarm(base_dir, actor, translate) - if new_swarm_str: - try: - with open(cached_hashtag_swarm_filename, 'w+', - encoding='utf-8') as fp_swarm: - fp_swarm.write(new_swarm_str) - return True - except OSError: - print('EX: unable to write cached hashtag swarm ' + - cached_hashtag_swarm_filename) - remove_old_hashtags(base_dir, 3) - return False - - -def store_hash_tags(base_dir: str, nickname: str, domain: str, - http_prefix: str, domain_full: str, - post_json_object: {}, translate: {}) -> None: - """Extracts hashtags from an incoming post and updates the - relevant tags files. - """ - if not is_public_post(post_json_object): - return - if not has_object_dict(post_json_object): - return - if not post_json_object['object'].get('tag'): - return - if not post_json_object.get('id'): - return - if not isinstance(post_json_object['object']['tag'], list): - return - tags_dir = base_dir + '/tags' - - # add tags directory if it doesn't exist - if not os.path.isdir(tags_dir): - print('Creating tags directory') - os.mkdir(tags_dir) - - # obtain any map links and these can be associated with hashtags - # get geolocations from content - map_links = [] - published = None - if 'content' in post_json_object['object']: - published = post_json_object['object']['published'] - post_content = post_json_object['object']['content'] - map_links += get_map_links_from_post_content(post_content) - # get geolocation from tags - location_str = get_location_from_post(post_json_object) - if location_str: - if resembles_url(location_str): - zoom, latitude, longitude = \ - geocoords_from_map_link(location_str, - 'openstreetmap.org') - if latitude and longitude and zoom and \ - location_str not in map_links: - map_links.append(location_str) - tag_maps_dir = base_dir + '/tagmaps' - if map_links: - # add tagmaps directory if it doesn't exist - if not os.path.isdir(tag_maps_dir): - print('Creating tagmaps directory') - os.mkdir(tag_maps_dir) - - post_url = remove_id_ending(post_json_object['id']) - post_url = post_url.replace('/', '#') - hashtags_ctr = 0 - for tag in post_json_object['object']['tag']: - if not tag.get('type'): - continue - if not isinstance(tag['type'], str): - continue - if tag['type'] != 'Hashtag': - continue - if not tag.get('name'): - continue - tag_name = tag['name'].replace('#', '').strip() - if not valid_hash_tag(tag_name): - continue - tags_filename = tags_dir + '/' + tag_name + '.txt' - days_diff = date_utcnow() - date_epoch() - days_since_epoch = days_diff.days - tag_line = \ - str(days_since_epoch) + ' ' + nickname + ' ' + post_url + '\n' - if map_links and published: - add_tag_map_links(tag_maps_dir, tag_name, map_links, - published, post_url) - hashtag_added = False - if not os.path.isfile(tags_filename): - try: - with open(tags_filename, 'w+', encoding='utf-8') as fp_tags: - fp_tags.write(tag_line) - hashtag_added = True - except OSError: - print('EX: store_hash_tags unable to write ' + tags_filename) - else: - content = '' - try: - with open(tags_filename, 'r', encoding='utf-8') as fp_tags: - content = fp_tags.read() - except OSError: - print('EX: store_hash_tags failed to read ' + tags_filename) - if post_url not in content: - content = tag_line + content - try: - with open(tags_filename, 'w+', - encoding='utf-8') as fp_tags2: - fp_tags2.write(content) - hashtag_added = True - except OSError as ex: - print('EX: Failed to write entry to tags file ' + - tags_filename + ' ' + str(ex)) - - if hashtag_added: - hashtags_ctr += 1 - - # automatically assign a category to the tag if possible - category_filename = tags_dir + '/' + tag_name + '.category' - if not os.path.isfile(category_filename): - hashtag_categories = \ - get_hashtag_categories(base_dir, False, None) - category_str = \ - guess_hashtag_category(tag_name, hashtag_categories, 6) - if category_str: - set_hashtag_category(base_dir, tag_name, - category_str, False, False) - - # if some hashtags were found then recalculate the swarm - # ready for later display - if hashtags_ctr > 0: - _update_cached_hashtag_swarm(base_dir, nickname, domain, - http_prefix, domain_full, translate) - - def _inbox_store_post_to_html_cache(recent_posts_cache: {}, max_recent_posts: int, translate: {}, diff --git a/newsdaemon.py b/newsdaemon.py index fb1dba0d8..8014f6580 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -37,9 +37,9 @@ from utils import dangerous_markup from utils import local_actor_url from utils import text_in_file from utils import data_dir -from inbox import store_hash_tags from session import create_session from threads import begin_thread +from webapp_hashtagswarm import store_hash_tags def _update_feeds_outbox_index(base_dir: str, domain: str, diff --git a/outbox.py b/outbox.py index 8b644a2c3..a3f609be5 100644 --- a/outbox.py +++ b/outbox.py @@ -47,7 +47,6 @@ from media import replace_you_tube from media import replace_twitter from media import get_media_path from media import create_media_dirs -from inbox import store_hash_tags from inbox import inbox_update_index from announce import outbox_announce from announce import outbox_undo_announce @@ -65,6 +64,7 @@ from delete import outbox_delete from shares import outbox_share_upload from shares import outbox_undo_share_upload from webapp_post import individual_post_as_html +from webapp_hashtagswarm import store_hash_tags from speaker import update_speaker from reading import store_book_events from reading import has_edition_tag diff --git a/webapp_hashtagswarm.py b/webapp_hashtagswarm.py index 8dd2a418f..69b2624de 100644 --- a/webapp_hashtagswarm.py +++ b/webapp_hashtagswarm.py @@ -9,6 +9,15 @@ __module_group__ = "Web Interface" import os from datetime import datetime, timezone +from utils import valid_hash_tag +from utils import remove_id_ending +from utils import resembles_url +from utils import has_object_dict +from utils import is_public_post +from utils import local_actor_url +from utils import date_from_string_format +from utils import file_last_modified +from utils import acct_dir from utils import data_dir from utils import get_nickname_from_actor from utils import get_config_param @@ -16,6 +25,13 @@ from utils import escape_text from utils import date_utcnow from utils import date_epoch from utils import string_contains +from delete import remove_old_hashtags +from maps import add_tag_map_links +from maps import geocoords_from_map_link +from maps import get_map_links_from_post_content +from maps import get_location_from_post +from categories import set_hashtag_category +from categories import guess_hashtag_category from categories import get_hashtag_categories from categories import get_hashtag_category from webapp_utils import set_custom_background @@ -271,3 +287,168 @@ def html_search_hashtag_category(translate: {}, '' html_str += html_footer() return html_str + + +def _update_cached_hashtag_swarm(base_dir: str, nickname: str, domain: str, + http_prefix: str, domain_full: str, + translate: {}) -> bool: + """Updates the hashtag swarm stored as a file + """ + cached_hashtag_swarm_filename = \ + acct_dir(base_dir, nickname, domain) + '/.hashtagSwarm' + save_swarm = True + if os.path.isfile(cached_hashtag_swarm_filename): + last_modified = file_last_modified(cached_hashtag_swarm_filename) + modified_date = None + try: + modified_date = \ + date_from_string_format(last_modified, ["%Y-%m-%dT%H:%M:%S%z"]) + except BaseException: + print('EX: unable to parse last modified cache date ' + + str(last_modified)) + if modified_date: + curr_date = date_utcnow() + time_diff = curr_date - modified_date + diff_mins = int(time_diff.total_seconds() / 60) + if diff_mins < 30: + # was saved recently, so don't save again + # This avoids too much disk I/O + save_swarm = False + print('Not updating hashtag swarm') + else: + print('Updating cached hashtag swarm, last changed ' + + str(diff_mins) + ' minutes ago') + else: + print('WARN: no modified date for ' + str(last_modified)) + if save_swarm: + actor = local_actor_url(http_prefix, nickname, domain_full) + new_swarm_str = html_hash_tag_swarm(base_dir, actor, translate) + if new_swarm_str: + try: + with open(cached_hashtag_swarm_filename, 'w+', + encoding='utf-8') as fp_swarm: + fp_swarm.write(new_swarm_str) + return True + except OSError: + print('EX: unable to write cached hashtag swarm ' + + cached_hashtag_swarm_filename) + remove_old_hashtags(base_dir, 3) + return False + + +def store_hash_tags(base_dir: str, nickname: str, domain: str, + http_prefix: str, domain_full: str, + post_json_object: {}, translate: {}) -> None: + """Extracts hashtags from an incoming post and updates the + relevant tags files. + """ + if not is_public_post(post_json_object): + return + if not has_object_dict(post_json_object): + return + if not post_json_object['object'].get('tag'): + return + if not post_json_object.get('id'): + return + if not isinstance(post_json_object['object']['tag'], list): + return + tags_dir = base_dir + '/tags' + + # add tags directory if it doesn't exist + if not os.path.isdir(tags_dir): + print('Creating tags directory') + os.mkdir(tags_dir) + + # obtain any map links and these can be associated with hashtags + # get geolocations from content + map_links = [] + published = None + if 'content' in post_json_object['object']: + published = post_json_object['object']['published'] + post_content = post_json_object['object']['content'] + map_links += get_map_links_from_post_content(post_content) + # get geolocation from tags + location_str = get_location_from_post(post_json_object) + if location_str: + if resembles_url(location_str): + zoom, latitude, longitude = \ + geocoords_from_map_link(location_str, + 'openstreetmap.org') + if latitude and longitude and zoom and \ + location_str not in map_links: + map_links.append(location_str) + tag_maps_dir = base_dir + '/tagmaps' + if map_links: + # add tagmaps directory if it doesn't exist + if not os.path.isdir(tag_maps_dir): + print('Creating tagmaps directory') + os.mkdir(tag_maps_dir) + + post_url = remove_id_ending(post_json_object['id']) + post_url = post_url.replace('/', '#') + hashtags_ctr = 0 + for tag in post_json_object['object']['tag']: + if not tag.get('type'): + continue + if not isinstance(tag['type'], str): + continue + if tag['type'] != 'Hashtag': + continue + if not tag.get('name'): + continue + tag_name = tag['name'].replace('#', '').strip() + if not valid_hash_tag(tag_name): + continue + tags_filename = tags_dir + '/' + tag_name + '.txt' + days_diff = date_utcnow() - date_epoch() + days_since_epoch = days_diff.days + tag_line = \ + str(days_since_epoch) + ' ' + nickname + ' ' + post_url + '\n' + if map_links and published: + add_tag_map_links(tag_maps_dir, tag_name, map_links, + published, post_url) + hashtag_added = False + if not os.path.isfile(tags_filename): + try: + with open(tags_filename, 'w+', encoding='utf-8') as fp_tags: + fp_tags.write(tag_line) + hashtag_added = True + except OSError: + print('EX: store_hash_tags unable to write ' + tags_filename) + else: + content = '' + try: + with open(tags_filename, 'r', encoding='utf-8') as fp_tags: + content = fp_tags.read() + except OSError: + print('EX: store_hash_tags failed to read ' + tags_filename) + if post_url not in content: + content = tag_line + content + try: + with open(tags_filename, 'w+', + encoding='utf-8') as fp_tags2: + fp_tags2.write(content) + hashtag_added = True + except OSError as ex: + print('EX: Failed to write entry to tags file ' + + tags_filename + ' ' + str(ex)) + + if hashtag_added: + hashtags_ctr += 1 + + # automatically assign a category to the tag if possible + category_filename = tags_dir + '/' + tag_name + '.category' + if not os.path.isfile(category_filename): + hashtag_categories = \ + get_hashtag_categories(base_dir, False, None) + category_str = \ + guess_hashtag_category(tag_name, hashtag_categories, 6) + if category_str: + set_hashtag_category(base_dir, tag_name, + category_str, False, False) + + # if some hashtags were found then recalculate the swarm + # ready for later display + if hashtags_ctr > 0: + _update_cached_hashtag_swarm(base_dir, nickname, domain, + http_prefix, domain_full, translate)