From fba106842f3372fb2f108302dcb5598ae8e7c6ff Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 2 Jun 2022 14:39:09 +0100 Subject: [PATCH 1/5] Also exclude nitter when twitter posts are disallowed --- filters.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/filters.py b/filters.py index 1382f3ded..244e990a7 100644 --- a/filters.py +++ b/filters.py @@ -100,7 +100,9 @@ def remove_global_filter(base_dir: str, words: str) -> bool: def _is_twitter_post(content: str) -> bool: """Returns true if the given post content is a retweet or twitter crosspost """ - if '/twitter.' in content or '@twitter.' in content: + if '/twitter.' in content or \ + '/nitter.' in content or \ + '@twitter.' in content: return True if '>RT <' in content: return True From ef20655eaff00893fba7a55da233edbc22588208 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 2 Jun 2022 14:54:17 +0100 Subject: [PATCH 2/5] Tidying --- filters.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/filters.py b/filters.py index 244e990a7..dd97ab1be 100644 --- a/filters.py +++ b/filters.py @@ -100,12 +100,12 @@ def remove_global_filter(base_dir: str, words: str) -> bool: def _is_twitter_post(content: str) -> bool: """Returns true if the given post content is a retweet or twitter crosspost """ - if '/twitter.' in content or \ - '/nitter.' in content or \ - '@twitter.' in content: - return True - if '>RT <' in content: - return True + features = ( + '/twitter.', '/nitter.', '@twitter.', '>RT <', '_tw<' + ) + for feat in features: + if feat in content: + return True return False From be5360dc79d9aa336205a53ad56f0f08a96deac3 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 2 Jun 2022 14:56:53 +0100 Subject: [PATCH 3/5] Extra twitter features --- filters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/filters.py b/filters.py index dd97ab1be..d4ed37872 100644 --- a/filters.py +++ b/filters.py @@ -101,7 +101,8 @@ def _is_twitter_post(content: str) -> bool: """Returns true if the given post content is a retweet or twitter crosspost """ features = ( - '/twitter.', '/nitter.', '@twitter.', '>RT <', '_tw<' + '/twitter.', '/nitter.', '@twitter.', '@nitter.', + '>RT <', '_tw<', '_tw@' ) for feat in features: if feat in content: From 1708a928eff053c85f3b8aba937ba84e83a3884b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 2 Jun 2022 18:47:56 +0100 Subject: [PATCH 4/5] Post filtering includes summary --- filters.py | 2 +- inbox.py | 12 ++++++++++-- posts.py | 8 ++++++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/filters.py b/filters.py index d4ed37872..77968a47d 100644 --- a/filters.py +++ b/filters.py @@ -102,7 +102,7 @@ def _is_twitter_post(content: str) -> bool: """ features = ( '/twitter.', '/nitter.', '@twitter.', '@nitter.', - '>RT <', '_tw<', '_tw@' + '>RT <', '_tw<', '_tw@', 'tweet', 'Tweet' ) for feat in features: if feat in content: diff --git a/inbox.py b/inbox.py index 3617676b0..f2f4b4761 100644 --- a/inbox.py +++ b/inbox.py @@ -18,6 +18,7 @@ from languages import understood_post_language from like import update_likes_collection from reaction import update_reaction_collection from reaction import valid_emoji_content +from utils import get_summary_from_post from utils import delete_cached_html from utils import get_account_timezone from utils import domain_permitted @@ -636,7 +637,11 @@ def save_post_to_inbox_queue(base_dir: str, http_prefix: str, content_str = \ get_base_content_from_post(post_json_object, system_language) if content_str: - if is_filtered(base_dir, nickname, domain, content_str): + summary_str = \ + get_summary_from_post(post_json_object, + system_language, []) + if is_filtered(base_dir, nickname, domain, + summary_str + ' ' + content_str): if debug: print('WARN: post was filtered out due to content') return None @@ -2683,7 +2688,10 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str, return False # check for filtered content - if is_filtered(base_dir, nickname, domain, content_str): + content_all = content_str + if summary: + content_all = summary + ' ' + content_str + if is_filtered(base_dir, nickname, domain, content_all): print('REJECT: content filtered') return False if message_json['object'].get('inReplyTo'): diff --git a/posts.py b/posts.py index c93244d6c..64cb9183f 100644 --- a/posts.py +++ b/posts.py @@ -5002,8 +5002,12 @@ def download_announce(session, base_dir: str, http_prefix: str, base_dir, nickname, domain, post_id, recent_posts_cache) return None - - if is_filtered(base_dir, nickname, domain, content_str): + summary_str = \ + get_summary_from_post(announced_json, system_language, []) + content_all = content_str + if summary_str: + content_all = summary_str + ' ' + content_str + if is_filtered(base_dir, nickname, domain, content_all): print('WARN: announced post has been filtered ' + str(announced_json)) _reject_announce(announce_filename, From ec31deb73952d70920e000c68d005ce75faed65e Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 2 Jun 2022 19:07:07 +0100 Subject: [PATCH 5/5] Filtering include image descriptions and urls --- inbox.py | 11 ++++++++--- posts.py | 6 +++++- utils.py | 19 +++++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/inbox.py b/inbox.py index f2f4b4761..1ed30a05f 100644 --- a/inbox.py +++ b/inbox.py @@ -18,6 +18,7 @@ from languages import understood_post_language from like import update_likes_collection from reaction import update_reaction_collection from reaction import valid_emoji_content +from utils import get_media_descriptions_from_post from utils import get_summary_from_post from utils import delete_cached_html from utils import get_account_timezone @@ -640,8 +641,11 @@ def save_post_to_inbox_queue(base_dir: str, http_prefix: str, summary_str = \ get_summary_from_post(post_json_object, system_language, []) - if is_filtered(base_dir, nickname, domain, - summary_str + ' ' + content_str): + media_descriptions = \ + get_media_descriptions_from_post(post_json_object) + content_all = \ + summary_str + ' ' + content_str + ' ' + media_descriptions + if is_filtered(base_dir, nickname, domain, content_all): if debug: print('WARN: post was filtered out due to content') return None @@ -2688,9 +2692,10 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str, return False # check for filtered content + media_descriptions = get_media_descriptions_from_post(message_json) content_all = content_str if summary: - content_all = summary + ' ' + content_str + content_all = summary + ' ' + content_str + ' ' + media_descriptions if is_filtered(base_dir, nickname, domain, content_all): print('REJECT: content filtered') return False diff --git a/posts.py b/posts.py index 64cb9183f..733a34c8b 100644 --- a/posts.py +++ b/posts.py @@ -32,6 +32,7 @@ from webfinger import webfinger_handle from httpsig import create_signed_header from siteactive import site_is_active from languages import understood_post_language +from utils import get_media_descriptions_from_post from utils import valid_hash_tag from utils import get_audio_extensions from utils import get_summary_from_post @@ -5004,9 +5005,12 @@ def download_announce(session, base_dir: str, http_prefix: str, return None summary_str = \ get_summary_from_post(announced_json, system_language, []) + media_descriptions = \ + get_media_descriptions_from_post(announced_json) content_all = content_str if summary_str: - content_all = summary_str + ' ' + content_str + content_all = \ + summary_str + ' ' + content_str + ' ' + media_descriptions if is_filtered(base_dir, nickname, domain, content_all): print('WARN: announced post has been filtered ' + str(announced_json)) diff --git a/utils.py b/utils.py index e9e985d13..b741cb520 100644 --- a/utils.py +++ b/utils.py @@ -138,6 +138,25 @@ def get_content_from_post(post_json_object: {}, system_language: str, return content +def get_media_descriptions_from_post(post_json_object: {}) -> str: + """Returns all attached media descriptions as a single text. + This is used for filtering + """ + this_post_json = post_json_object + if has_object_dict(post_json_object): + this_post_json = post_json_object['object'] + if not this_post_json.get('attachment'): + return '' + descriptions = '' + for attach in this_post_json['attachment']: + if not attach.get('name'): + continue + descriptions += attach['name'] + ' ' + if attach.get('url'): + descriptions += attach['url'] + ' ' + return descriptions.strip() + + def get_summary_from_post(post_json_object: {}, system_language: str, languages_understood: []) -> str: """Returns the summary from the post in the given language