Move hashtag functions to hashtag module

main
Bob Mottram 2024-08-31 11:26:14 +01:00
parent 89f6e3d26d
commit 36915c35d3
4 changed files with 184 additions and 181 deletions

180
inbox.py
View File

@ -21,9 +21,7 @@ from reaction import valid_emoji_content
from utils import harmless_markup from utils import harmless_markup
from utils import quote_toots_allowed from utils import quote_toots_allowed
from utils import lines_in_file from utils import lines_in_file
from utils import resembles_url
from utils import get_url_from_post from utils import get_url_from_post
from utils import date_from_string_format
from utils import date_epoch from utils import date_epoch
from utils import date_utcnow from utils import date_utcnow
from utils import contains_statuses from utils import contains_statuses
@ -44,7 +42,6 @@ from utils import is_system_account
from utils import invalid_ciphertext from utils import invalid_ciphertext
from utils import contains_private_key from utils import contains_private_key
from utils import remove_html from utils import remove_html
from utils import file_last_modified
from utils import has_object_string from utils import has_object_string
from utils import has_object_string_object from utils import has_object_string_object
from utils import get_reply_interval_hours from utils import get_reply_interval_hours
@ -65,7 +62,6 @@ from utils import remove_id_ending
from utils import get_protocol_prefixes from utils import get_protocol_prefixes
from utils import is_blog_post from utils import is_blog_post
from utils import remove_avatar_from_cache from utils import remove_avatar_from_cache
from utils import is_public_post
from utils import get_cached_post_filename from utils import get_cached_post_filename
from utils import remove_post_from_cache from utils import remove_post_from_cache
from utils import url_permitted from utils import url_permitted
@ -83,13 +79,10 @@ from utils import undo_reaction_collection_entry
from utils import has_group_type from utils import has_group_type
from utils import local_actor_url from utils import local_actor_url
from utils import has_object_string_type from utils import has_object_string_type
from utils import valid_hash_tag
from utils import get_attributed_to from utils import get_attributed_to
from utils import get_reply_to from utils import get_reply_to
from utils import get_actor_from_post from utils import get_actor_from_post
from utils import data_dir from utils import data_dir
from categories import get_hashtag_categories
from categories import set_hashtag_category
from httpsig import get_digest_algorithm_from_headers from httpsig import get_digest_algorithm_from_headers
from httpsig import verify_post_headers from httpsig import verify_post_headers
from session import create_session from session import create_session
@ -149,15 +142,13 @@ from git import is_git_patch
from git import receive_git_patch from git import receive_git_patch
from followingCalendar import receiving_calendar_events from followingCalendar import receiving_calendar_events
from happening import save_event_post from happening import save_event_post
from delete import remove_old_hashtags
from categories import guess_hashtag_category
from context import has_valid_context from context import has_valid_context
from speaker import update_speaker from speaker import update_speaker
from announce import is_self_announce from announce import is_self_announce
from announce import create_announce from announce import create_announce
from notifyOnPost import notify_when_person_posts from notifyOnPost import notify_when_person_posts
from conversation import update_conversation from conversation import update_conversation
from webapp_hashtagswarm import html_hash_tag_swarm from webapp_hashtagswarm import store_hash_tags
from person import valid_sending_actor from person import valid_sending_actor
from person import get_person_avatar_url from person import get_person_avatar_url
from fitnessFunctions import fitness_performance from fitnessFunctions import fitness_performance
@ -166,10 +157,6 @@ from content import reject_twitter_summary
from content import load_dogwhistles from content import load_dogwhistles
from content import valid_url_lengths from content import valid_url_lengths
from threads import begin_thread from threads import begin_thread
from maps import get_map_links_from_post_content
from maps import get_location_from_post
from maps import add_tag_map_links
from maps import geocoords_from_map_link
from reading import store_book_events from reading import store_book_events
@ -207,171 +194,6 @@ def _store_last_post_id(base_dir: str, nickname: str, domain: str,
print('EX: Unable to write last post id to ' + actor_filename) print('EX: Unable to write last post id to ' + actor_filename)
def _update_cached_hashtag_swarm(base_dir: str, nickname: str, domain: str,
                                 http_prefix: str, domain_full: str,
                                 translate: {}) -> bool:
    """Updates the hashtag swarm stored as a file

    The swarm html is cached within the account directory as
    .hashtagSwarm. If that file was last modified less than 30 minutes
    ago then it is left untouched.

    Returns True if a new swarm was written to the cache file,
    otherwise False.
    """
    cached_hashtag_swarm_filename = \
        acct_dir(base_dir, nickname, domain) + '/.hashtagSwarm'
    save_swarm = True
    if os.path.isfile(cached_hashtag_swarm_filename):
        last_modified = file_last_modified(cached_hashtag_swarm_filename)
        modified_date = None
        try:
            modified_date = \
                date_from_string_format(last_modified,
                                        ["%Y-%m-%dT%H:%M:%S%z"])
        except Exception:
            # Exception rather than BaseException, so that
            # KeyboardInterrupt and SystemExit are not swallowed here
            print('EX: unable to parse last modified cache date ' +
                  str(last_modified))
        if modified_date:
            curr_date = date_utcnow()
            time_diff = curr_date - modified_date
            diff_mins = int(time_diff.total_seconds() / 60)
            if diff_mins < 30:
                # was saved recently, so don't save again
                # This avoids too much disk I/O
                save_swarm = False
                print('Not updating hashtag swarm')
            else:
                print('Updating cached hashtag swarm, last changed ' +
                      str(diff_mins) + ' minutes ago')
        else:
            print('WARN: no modified date for ' + str(last_modified))
    if save_swarm:
        actor = local_actor_url(http_prefix, nickname, domain_full)
        new_swarm_str = html_hash_tag_swarm(base_dir, actor, translate)
        if new_swarm_str:
            try:
                with open(cached_hashtag_swarm_filename, 'w+',
                          encoding='utf-8') as fp_swarm:
                    fp_swarm.write(new_swarm_str)
            except OSError:
                print('EX: unable to write cached hashtag swarm ' +
                      cached_hashtag_swarm_filename)
            else:
                return True
        # only reached when there was no swarm html, or when it
        # could not be written
        # NOTE(review): old hashtags are not removed after a
        # successful write - confirm that this is intended
        remove_old_hashtags(base_dir, 3)
    return False
def store_hash_tags(base_dir: str, nickname: str, domain: str,
                    http_prefix: str, domain_full: str,
                    post_json_object: {}, translate: {}) -> None:
    """Extracts hashtags from an incoming post and updates the
    relevant tags files.

    For each valid Hashtag entry within the post's list of tags a line
    containing the days since the epoch, the nickname and the post url
    is prepended to base_dir/tags/<tag>.txt, unless the post url
    already appears within that file. Map links found for the post are
    passed to add_tag_map_links, a category is guessed for any added
    tag which has no .category file, and if at least one tag was added
    then the cached hashtag swarm is updated.

    Posts which are not public, or which have no object dict, no id
    or no list of tags, are ignored.
    """
    if not is_public_post(post_json_object):
        return
    if not has_object_dict(post_json_object):
        return
    post_obj = post_json_object['object']
    if not post_obj.get('tag'):
        return
    if not post_json_object.get('id'):
        return
    if not isinstance(post_obj['tag'], list):
        return
    tags_dir = base_dir + '/tags'

    # add tags directory if it doesn't exist
    if not os.path.isdir(tags_dir):
        print('Creating tags directory')
        os.mkdir(tags_dir)

    # obtain any map links and these can be associated with hashtags
    # get geolocations from content
    map_links = []
    published = None
    if 'content' in post_obj:
        # the published date may be missing, so don't index it directly.
        # Without a date no tag map links get stored (see below)
        published = post_obj.get('published')
        post_content = post_obj['content']
        map_links += get_map_links_from_post_content(post_content)
    # get geolocation from tags
    location_str = get_location_from_post(post_json_object)
    if location_str:
        if resembles_url(location_str):
            zoom, latitude, longitude = \
                geocoords_from_map_link(location_str,
                                        'openstreetmap.org')
            if latitude and longitude and zoom and \
               location_str not in map_links:
                map_links.append(location_str)
    tag_maps_dir = base_dir + '/tagmaps'
    if map_links:
        # add tagmaps directory if it doesn't exist
        if not os.path.isdir(tag_maps_dir):
            print('Creating tagmaps directory')
            os.mkdir(tag_maps_dir)

    post_url = remove_id_ending(post_json_object['id'])
    post_url = post_url.replace('/', '#')

    # the day number is the same for every tag within this post
    days_diff = date_utcnow() - date_epoch()
    days_since_epoch = days_diff.days
    # NOTE(review): confirm that the field separator matches what
    # readers of the tags files expect
    tag_line = \
        str(days_since_epoch) + ' ' + nickname + ' ' + post_url + '\n'

    hashtags_ctr = 0
    for tag in post_obj['tag']:
        # the post is incoming data, so don't assume that
        # every entry within the list is a dict
        if not isinstance(tag, dict):
            continue
        if not tag.get('type'):
            continue
        if not isinstance(tag['type'], str):
            continue
        if tag['type'] != 'Hashtag':
            continue
        if not tag.get('name'):
            continue
        if not isinstance(tag['name'], str):
            continue
        tag_name = tag['name'].replace('#', '').strip()
        if not valid_hash_tag(tag_name):
            continue
        tags_filename = tags_dir + '/' + tag_name + '.txt'
        if map_links and published:
            add_tag_map_links(tag_maps_dir, tag_name, map_links,
                              published, post_url)
        hashtag_added = False
        if not os.path.isfile(tags_filename):
            # first post seen for this hashtag
            try:
                with open(tags_filename, 'w+', encoding='utf-8') as fp_tags:
                    fp_tags.write(tag_line)
                    hashtag_added = True
            except OSError:
                print('EX: store_hash_tags unable to write ' + tags_filename)
        else:
            content = ''
            try:
                with open(tags_filename, 'r', encoding='utf-8') as fp_tags:
                    content = fp_tags.read()
            except OSError:
                print('EX: store_hash_tags failed to read ' + tags_filename)
            if post_url not in content:
                # newest entries go at the top of the file
                content = tag_line + content
                try:
                    with open(tags_filename, 'w+',
                              encoding='utf-8') as fp_tags2:
                        fp_tags2.write(content)
                        hashtag_added = True
                except OSError as ex:
                    print('EX: Failed to write entry to tags file ' +
                          tags_filename + ' ' + str(ex))

        if hashtag_added:
            hashtags_ctr += 1

            # automatically assign a category to the tag if possible
            category_filename = tags_dir + '/' + tag_name + '.category'
            if not os.path.isfile(category_filename):
                hashtag_categories = \
                    get_hashtag_categories(base_dir, False, None)
                category_str = \
                    guess_hashtag_category(tag_name, hashtag_categories, 6)
                if category_str:
                    set_hashtag_category(base_dir, tag_name,
                                         category_str, False, False)

    # if some hashtags were found then recalculate the swarm
    # ready for later display
    if hashtags_ctr > 0:
        _update_cached_hashtag_swarm(base_dir, nickname, domain,
                                     http_prefix, domain_full, translate)
def _inbox_store_post_to_html_cache(recent_posts_cache: {}, def _inbox_store_post_to_html_cache(recent_posts_cache: {},
max_recent_posts: int, max_recent_posts: int,
translate: {}, translate: {},

View File

@ -37,9 +37,9 @@ from utils import dangerous_markup
from utils import local_actor_url from utils import local_actor_url
from utils import text_in_file from utils import text_in_file
from utils import data_dir from utils import data_dir
from inbox import store_hash_tags
from session import create_session from session import create_session
from threads import begin_thread from threads import begin_thread
from webapp_hashtagswarm import store_hash_tags
def _update_feeds_outbox_index(base_dir: str, domain: str, def _update_feeds_outbox_index(base_dir: str, domain: str,

View File

@ -47,7 +47,6 @@ from media import replace_you_tube
from media import replace_twitter from media import replace_twitter
from media import get_media_path from media import get_media_path
from media import create_media_dirs from media import create_media_dirs
from inbox import store_hash_tags
from inbox import inbox_update_index from inbox import inbox_update_index
from announce import outbox_announce from announce import outbox_announce
from announce import outbox_undo_announce from announce import outbox_undo_announce
@ -65,6 +64,7 @@ from delete import outbox_delete
from shares import outbox_share_upload from shares import outbox_share_upload
from shares import outbox_undo_share_upload from shares import outbox_undo_share_upload
from webapp_post import individual_post_as_html from webapp_post import individual_post_as_html
from webapp_hashtagswarm import store_hash_tags
from speaker import update_speaker from speaker import update_speaker
from reading import store_book_events from reading import store_book_events
from reading import has_edition_tag from reading import has_edition_tag

View File

@ -9,6 +9,15 @@ __module_group__ = "Web Interface"
import os import os
from datetime import datetime, timezone from datetime import datetime, timezone
from utils import valid_hash_tag
from utils import remove_id_ending
from utils import resembles_url
from utils import has_object_dict
from utils import is_public_post
from utils import local_actor_url
from utils import date_from_string_format
from utils import file_last_modified
from utils import acct_dir
from utils import data_dir from utils import data_dir
from utils import get_nickname_from_actor from utils import get_nickname_from_actor
from utils import get_config_param from utils import get_config_param
@ -16,6 +25,13 @@ from utils import escape_text
from utils import date_utcnow from utils import date_utcnow
from utils import date_epoch from utils import date_epoch
from utils import string_contains from utils import string_contains
from delete import remove_old_hashtags
from maps import add_tag_map_links
from maps import geocoords_from_map_link
from maps import get_map_links_from_post_content
from maps import get_location_from_post
from categories import set_hashtag_category
from categories import guess_hashtag_category
from categories import get_hashtag_categories from categories import get_hashtag_categories
from categories import get_hashtag_category from categories import get_hashtag_category
from webapp_utils import set_custom_background from webapp_utils import set_custom_background
@ -271,3 +287,168 @@ def html_search_hashtag_category(translate: {},
'</div>' '</div>'
html_str += html_footer() html_str += html_footer()
return html_str return html_str
def _update_cached_hashtag_swarm(base_dir: str, nickname: str, domain: str,
                                 http_prefix: str, domain_full: str,
                                 translate: {}) -> bool:
    """Updates the hashtag swarm stored as a file

    The swarm html is cached within the account directory as
    .hashtagSwarm. If that file was last modified less than 30 minutes
    ago then it is left untouched.

    Returns True if a new swarm was written to the cache file,
    otherwise False.
    """
    cached_hashtag_swarm_filename = \
        acct_dir(base_dir, nickname, domain) + '/.hashtagSwarm'
    save_swarm = True
    if os.path.isfile(cached_hashtag_swarm_filename):
        last_modified = file_last_modified(cached_hashtag_swarm_filename)
        modified_date = None
        try:
            modified_date = \
                date_from_string_format(last_modified,
                                        ["%Y-%m-%dT%H:%M:%S%z"])
        except Exception:
            # Exception rather than BaseException, so that
            # KeyboardInterrupt and SystemExit are not swallowed here
            print('EX: unable to parse last modified cache date ' +
                  str(last_modified))
        if modified_date:
            curr_date = date_utcnow()
            time_diff = curr_date - modified_date
            diff_mins = int(time_diff.total_seconds() / 60)
            if diff_mins < 30:
                # was saved recently, so don't save again
                # This avoids too much disk I/O
                save_swarm = False
                print('Not updating hashtag swarm')
            else:
                print('Updating cached hashtag swarm, last changed ' +
                      str(diff_mins) + ' minutes ago')
        else:
            print('WARN: no modified date for ' + str(last_modified))
    if save_swarm:
        actor = local_actor_url(http_prefix, nickname, domain_full)
        new_swarm_str = html_hash_tag_swarm(base_dir, actor, translate)
        if new_swarm_str:
            try:
                with open(cached_hashtag_swarm_filename, 'w+',
                          encoding='utf-8') as fp_swarm:
                    fp_swarm.write(new_swarm_str)
            except OSError:
                print('EX: unable to write cached hashtag swarm ' +
                      cached_hashtag_swarm_filename)
            else:
                return True
        # only reached when there was no swarm html, or when it
        # could not be written
        # NOTE(review): old hashtags are not removed after a
        # successful write - confirm that this is intended
        remove_old_hashtags(base_dir, 3)
    return False
def store_hash_tags(base_dir: str, nickname: str, domain: str,
                    http_prefix: str, domain_full: str,
                    post_json_object: {}, translate: {}) -> None:
    """Extracts hashtags from an incoming post and updates the
    relevant tags files.

    For each valid Hashtag entry within the post's list of tags a line
    containing the days since the epoch, the nickname and the post url
    is prepended to base_dir/tags/<tag>.txt, unless the post url
    already appears within that file. Map links found for the post are
    passed to add_tag_map_links, a category is guessed for any added
    tag which has no .category file, and if at least one tag was added
    then the cached hashtag swarm is updated.

    Posts which are not public, or which have no object dict, no id
    or no list of tags, are ignored.
    """
    if not is_public_post(post_json_object):
        return
    if not has_object_dict(post_json_object):
        return
    post_obj = post_json_object['object']
    if not post_obj.get('tag'):
        return
    if not post_json_object.get('id'):
        return
    if not isinstance(post_obj['tag'], list):
        return
    tags_dir = base_dir + '/tags'

    # add tags directory if it doesn't exist
    if not os.path.isdir(tags_dir):
        print('Creating tags directory')
        os.mkdir(tags_dir)

    # obtain any map links and these can be associated with hashtags
    # get geolocations from content
    map_links = []
    published = None
    if 'content' in post_obj:
        # the published date may be missing, so don't index it directly.
        # Without a date no tag map links get stored (see below)
        published = post_obj.get('published')
        post_content = post_obj['content']
        map_links += get_map_links_from_post_content(post_content)
    # get geolocation from tags
    location_str = get_location_from_post(post_json_object)
    if location_str:
        if resembles_url(location_str):
            zoom, latitude, longitude = \
                geocoords_from_map_link(location_str,
                                        'openstreetmap.org')
            if latitude and longitude and zoom and \
               location_str not in map_links:
                map_links.append(location_str)
    tag_maps_dir = base_dir + '/tagmaps'
    if map_links:
        # add tagmaps directory if it doesn't exist
        if not os.path.isdir(tag_maps_dir):
            print('Creating tagmaps directory')
            os.mkdir(tag_maps_dir)

    post_url = remove_id_ending(post_json_object['id'])
    post_url = post_url.replace('/', '#')

    # the day number is the same for every tag within this post
    days_diff = date_utcnow() - date_epoch()
    days_since_epoch = days_diff.days
    # NOTE(review): confirm that the field separator matches what
    # readers of the tags files expect
    tag_line = \
        str(days_since_epoch) + ' ' + nickname + ' ' + post_url + '\n'

    hashtags_ctr = 0
    for tag in post_obj['tag']:
        # the post is incoming data, so don't assume that
        # every entry within the list is a dict
        if not isinstance(tag, dict):
            continue
        if not tag.get('type'):
            continue
        if not isinstance(tag['type'], str):
            continue
        if tag['type'] != 'Hashtag':
            continue
        if not tag.get('name'):
            continue
        if not isinstance(tag['name'], str):
            continue
        tag_name = tag['name'].replace('#', '').strip()
        if not valid_hash_tag(tag_name):
            continue
        tags_filename = tags_dir + '/' + tag_name + '.txt'
        if map_links and published:
            add_tag_map_links(tag_maps_dir, tag_name, map_links,
                              published, post_url)
        hashtag_added = False
        if not os.path.isfile(tags_filename):
            # first post seen for this hashtag
            try:
                with open(tags_filename, 'w+', encoding='utf-8') as fp_tags:
                    fp_tags.write(tag_line)
                    hashtag_added = True
            except OSError:
                print('EX: store_hash_tags unable to write ' + tags_filename)
        else:
            content = ''
            try:
                with open(tags_filename, 'r', encoding='utf-8') as fp_tags:
                    content = fp_tags.read()
            except OSError:
                print('EX: store_hash_tags failed to read ' + tags_filename)
            if post_url not in content:
                # newest entries go at the top of the file
                content = tag_line + content
                try:
                    with open(tags_filename, 'w+',
                              encoding='utf-8') as fp_tags2:
                        fp_tags2.write(content)
                        hashtag_added = True
                except OSError as ex:
                    print('EX: Failed to write entry to tags file ' +
                          tags_filename + ' ' + str(ex))

        if hashtag_added:
            hashtags_ctr += 1

            # automatically assign a category to the tag if possible
            category_filename = tags_dir + '/' + tag_name + '.category'
            if not os.path.isfile(category_filename):
                hashtag_categories = \
                    get_hashtag_categories(base_dir, False, None)
                category_str = \
                    guess_hashtag_category(tag_name, hashtag_categories, 6)
                if category_str:
                    set_hashtag_category(base_dir, tag_name,
                                         category_str, False, False)

    # if some hashtags were found then recalculate the swarm
    # ready for later display
    if hashtags_ctr > 0:
        _update_cached_hashtag_swarm(base_dir, nickname, domain,
                                     http_prefix, domain_full, translate)