# Patch summary (review notes). Text ahead of the first "diff --git" header
# is skipped by git-apply and patch.
#
# filters.py: _is_twitter_post() replaces its two substring checks with a
#   loop over a tuple of markers, adding '/nitter.', '@nitter.', '_tw<',
#   '_tw@', 'tweet' and 'Tweet'.
#   NOTE(review): 'tweet' is a plain substring test, so it also matches
#   inside longer words; confirm that breadth is intended.
# inbox.py: save_post_to_inbox_queue() now passes summary + content + media
#   descriptions to is_filtered().
#   NOTE(review): summary_str is concatenated without a truthiness check;
#   assumes get_summary_from_post() never returns None - confirm.
# inbox.py (_valid_post_content) and posts.py (download_announce): the
#   combined text is built only when the summary is truthy; otherwise only
#   content_str is filtered, so media descriptions are skipped for posts
#   that have no summary.
# utils.py: adds get_media_descriptions_from_post(), which joins the 'name'
#   (and, when present, 'url') of each attachment that has a name.
#   NOTE(review): attach['url'] is concatenated as a string; confirm it is
#   never a list or dict.
#
# Line structure and leading indentation were reconstructed from the hunk
# headers and Python syntax; verify context-line indentation against the
# target tree before applying.
diff --git a/filters.py b/filters.py
index 1382f3ded..77968a47d 100644
--- a/filters.py
+++ b/filters.py
@@ -100,10 +100,13 @@ def remove_global_filter(base_dir: str, words: str) -> bool:
 def _is_twitter_post(content: str) -> bool:
     """Returns true if the given post content is a retweet or twitter crosspost
     """
-    if '/twitter.' in content or '@twitter.' in content:
-        return True
-    if '>RT <' in content:
-        return True
+    features = (
+        '/twitter.', '/nitter.', '@twitter.', '@nitter.',
+        '>RT <', '_tw<', '_tw@', 'tweet', 'Tweet'
+    )
+    for feat in features:
+        if feat in content:
+            return True
     return False
 
 
diff --git a/inbox.py b/inbox.py
index 3617676b0..1ed30a05f 100644
--- a/inbox.py
+++ b/inbox.py
@@ -18,6 +18,8 @@ from languages import understood_post_language
 from like import update_likes_collection
 from reaction import update_reaction_collection
 from reaction import valid_emoji_content
+from utils import get_media_descriptions_from_post
+from utils import get_summary_from_post
 from utils import delete_cached_html
 from utils import get_account_timezone
 from utils import domain_permitted
@@ -636,7 +638,14 @@ def save_post_to_inbox_queue(base_dir: str, http_prefix: str,
         content_str = \
             get_base_content_from_post(post_json_object, system_language)
         if content_str:
-            if is_filtered(base_dir, nickname, domain, content_str):
+            summary_str = \
+                get_summary_from_post(post_json_object,
+                                      system_language, [])
+            media_descriptions = \
+                get_media_descriptions_from_post(post_json_object)
+            content_all = \
+                summary_str + ' ' + content_str + ' ' + media_descriptions
+            if is_filtered(base_dir, nickname, domain, content_all):
                 if debug:
                     print('WARN: post was filtered out due to content')
                 return None
@@ -2683,7 +2692,11 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
         return False
 
     # check for filtered content
-    if is_filtered(base_dir, nickname, domain, content_str):
+    media_descriptions = get_media_descriptions_from_post(message_json)
+    content_all = content_str
+    if summary:
+        content_all = summary + ' ' + content_str + ' ' + media_descriptions
+    if is_filtered(base_dir, nickname, domain, content_all):
         print('REJECT: content filtered')
         return False
     if message_json['object'].get('inReplyTo'):
diff --git a/posts.py b/posts.py
index c93244d6c..733a34c8b 100644
--- a/posts.py
+++ b/posts.py
@@ -32,6 +32,7 @@ from webfinger import webfinger_handle
 from httpsig import create_signed_header
 from siteactive import site_is_active
 from languages import understood_post_language
+from utils import get_media_descriptions_from_post
 from utils import valid_hash_tag
 from utils import get_audio_extensions
 from utils import get_summary_from_post
@@ -5002,8 +5003,15 @@ def download_announce(session, base_dir: str, http_prefix: str,
                              base_dir, nickname, domain,
                              post_id, recent_posts_cache)
             return None
-
-        if is_filtered(base_dir, nickname, domain, content_str):
+        summary_str = \
+            get_summary_from_post(announced_json, system_language, [])
+        media_descriptions = \
+            get_media_descriptions_from_post(announced_json)
+        content_all = content_str
+        if summary_str:
+            content_all = \
+                summary_str + ' ' + content_str + ' ' + media_descriptions
+        if is_filtered(base_dir, nickname, domain, content_all):
             print('WARN: announced post has been filtered ' +
                   str(announced_json))
             _reject_announce(announce_filename,
diff --git a/utils.py b/utils.py
index e9e985d13..b741cb520 100644
--- a/utils.py
+++ b/utils.py
@@ -138,6 +138,25 @@ def get_content_from_post(post_json_object: {}, system_language: str,
     return content
 
 
+def get_media_descriptions_from_post(post_json_object: {}) -> str:
+    """Returns all attached media descriptions as a single text.
+    This is used for filtering
+    """
+    this_post_json = post_json_object
+    if has_object_dict(post_json_object):
+        this_post_json = post_json_object['object']
+    if not this_post_json.get('attachment'):
+        return ''
+    descriptions = ''
+    for attach in this_post_json['attachment']:
+        if not attach.get('name'):
+            continue
+        descriptions += attach['name'] + ' '
+        if attach.get('url'):
+            descriptions += attach['url'] + ' '
+    return descriptions.strip()
+
+
 def get_summary_from_post(post_json_object: {}, system_language: str,
                           languages_understood: []) -> str:
     """Returns the summary from the post in the given language