From fba106842f3372fb2f108302dcb5598ae8e7c6ff Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Thu, 2 Jun 2022 14:39:09 +0100
Subject: [PATCH 1/5] Also exclude nitter when twitter posts are disallowed

---
 filters.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/filters.py b/filters.py
index 1382f3ded..244e990a7 100644
--- a/filters.py
+++ b/filters.py
@@ -100,7 +100,9 @@ def remove_global_filter(base_dir: str, words: str) -> bool:
 def _is_twitter_post(content: str) -> bool:
     """Returns true if the given post content is a retweet or twitter crosspost
     """
-    if '/twitter.' in content or '@twitter.' in content:
+    if '/twitter.' in content or \
+       '/nitter.' in content or \
+       '@twitter.' in content:
         return True
     if '>RT <' in content:
         return True

From ef20655eaff00893fba7a55da233edbc22588208 Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Thu, 2 Jun 2022 14:54:17 +0100
Subject: [PATCH 2/5] Tidying

---
 filters.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/filters.py b/filters.py
index 244e990a7..dd97ab1be 100644
--- a/filters.py
+++ b/filters.py
@@ -100,12 +100,12 @@ def remove_global_filter(base_dir: str, words: str) -> bool:
 def _is_twitter_post(content: str) -> bool:
     """Returns true if the given post content is a retweet or twitter crosspost
     """
-    if '/twitter.' in content or \
-       '/nitter.' in content or \
-       '@twitter.' in content:
-        return True
-    if '>RT <' in content:
-        return True
+    features = (
+        '/twitter.', '/nitter.', '@twitter.', '>RT <', '_tw<'
+    )
+    for feat in features:
+        if feat in content:
+            return True
     return False
 
 

From be5360dc79d9aa336205a53ad56f0f08a96deac3 Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Thu, 2 Jun 2022 14:56:53 +0100
Subject: [PATCH 3/5] Extra twitter features

---
 filters.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/filters.py b/filters.py
index dd97ab1be..d4ed37872 100644
--- a/filters.py
+++ b/filters.py
@@ -101,7 +101,8 @@ def _is_twitter_post(content: str) -> bool:
     """Returns true if the given post content is a retweet or twitter crosspost
     """
     features = (
-        '/twitter.', '/nitter.', '@twitter.', '>RT <', '_tw<'
+        '/twitter.', '/nitter.', '@twitter.', '@nitter.',
+        '>RT <', '_tw<', '_tw@'
     )
     for feat in features:
         if feat in content:

From 1708a928eff053c85f3b8aba937ba84e83a3884b Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Thu, 2 Jun 2022 18:47:56 +0100
Subject: [PATCH 4/5] Post filtering includes summary

---
 filters.py |  2 +-
 inbox.py   | 12 ++++++++++--
 posts.py   |  8 ++++++--
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/filters.py b/filters.py
index d4ed37872..77968a47d 100644
--- a/filters.py
+++ b/filters.py
@@ -102,7 +102,7 @@ def _is_twitter_post(content: str) -> bool:
     """
     features = (
         '/twitter.', '/nitter.', '@twitter.', '@nitter.',
-        '>RT <', '_tw<', '_tw@'
+        '>RT <', '_tw<', '_tw@', 'tweet', 'Tweet'
     )
     for feat in features:
         if feat in content:
diff --git a/inbox.py b/inbox.py
index 3617676b0..f2f4b4761 100644
--- a/inbox.py
+++ b/inbox.py
@@ -18,6 +18,7 @@ from languages import understood_post_language
 from like import update_likes_collection
 from reaction import update_reaction_collection
 from reaction import valid_emoji_content
+from utils import get_summary_from_post
 from utils import delete_cached_html
 from utils import get_account_timezone
 from utils import domain_permitted
@@ -636,7 +637,11 @@ def save_post_to_inbox_queue(base_dir: str, http_prefix: str,
             content_str = \
                 get_base_content_from_post(post_json_object, system_language)
             if content_str:
-                if is_filtered(base_dir, nickname, domain, content_str):
+                summary_str = \
+                    get_summary_from_post(post_json_object,
+                                          system_language, [])
+                if is_filtered(base_dir, nickname, domain,
+                               summary_str + ' ' + content_str):
                     if debug:
                         print('WARN: post was filtered out due to content')
                     return None
@@ -2683,7 +2688,10 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
         return False
 
     # check for filtered content
-    if is_filtered(base_dir, nickname, domain, content_str):
+    content_all = content_str
+    if summary:
+        content_all = summary + ' ' + content_str
+    if is_filtered(base_dir, nickname, domain, content_all):
         print('REJECT: content filtered')
         return False
     if message_json['object'].get('inReplyTo'):
diff --git a/posts.py b/posts.py
index c93244d6c..64cb9183f 100644
--- a/posts.py
+++ b/posts.py
@@ -5002,8 +5002,12 @@ def download_announce(session, base_dir: str, http_prefix: str,
                              base_dir, nickname, domain, post_id,
                              recent_posts_cache)
             return None
-
-        if is_filtered(base_dir, nickname, domain, content_str):
+        summary_str = \
+            get_summary_from_post(announced_json, system_language, [])
+        content_all = content_str
+        if summary_str:
+            content_all = summary_str + ' ' + content_str
+        if is_filtered(base_dir, nickname, domain, content_all):
             print('WARN: announced post has been filtered ' +
                   str(announced_json))
             _reject_announce(announce_filename,

From ec31deb73952d70920e000c68d005ce75faed65e Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Thu, 2 Jun 2022 19:07:07 +0100
Subject: [PATCH 5/5] Filtering include image descriptions and urls

---
 inbox.py | 11 ++++++++---
 posts.py |  6 +++++-
 utils.py | 19 +++++++++++++++++++
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/inbox.py b/inbox.py
index f2f4b4761..1ed30a05f 100644
--- a/inbox.py
+++ b/inbox.py
@@ -18,6 +18,7 @@ from languages import understood_post_language
 from like import update_likes_collection
 from reaction import update_reaction_collection
 from reaction import valid_emoji_content
+from utils import get_media_descriptions_from_post
 from utils import get_summary_from_post
 from utils import delete_cached_html
 from utils import get_account_timezone
@@ -640,8 +641,11 @@ def save_post_to_inbox_queue(base_dir: str, http_prefix: str,
                 summary_str = \
                     get_summary_from_post(post_json_object,
                                           system_language, [])
-                if is_filtered(base_dir, nickname, domain,
-                               summary_str + ' ' + content_str):
+                media_descriptions = \
+                    get_media_descriptions_from_post(post_json_object)
+                content_all = \
+                    summary_str + ' ' + content_str + ' ' + media_descriptions
+                if is_filtered(base_dir, nickname, domain, content_all):
                     if debug:
                         print('WARN: post was filtered out due to content')
                     return None
@@ -2688,9 +2692,10 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
         return False
 
     # check for filtered content
+    media_descriptions = get_media_descriptions_from_post(message_json)
     content_all = content_str
     if summary:
-        content_all = summary + ' ' + content_str
+        content_all = summary + ' ' + content_str + ' ' + media_descriptions
     if is_filtered(base_dir, nickname, domain, content_all):
         print('REJECT: content filtered')
         return False
diff --git a/posts.py b/posts.py
index 64cb9183f..733a34c8b 100644
--- a/posts.py
+++ b/posts.py
@@ -32,6 +32,7 @@ from webfinger import webfinger_handle
 from httpsig import create_signed_header
 from siteactive import site_is_active
 from languages import understood_post_language
+from utils import get_media_descriptions_from_post
 from utils import valid_hash_tag
 from utils import get_audio_extensions
 from utils import get_summary_from_post
@@ -5004,9 +5005,12 @@ def download_announce(session, base_dir: str, http_prefix: str,
             return None
         summary_str = \
             get_summary_from_post(announced_json, system_language, [])
+        media_descriptions = \
+            get_media_descriptions_from_post(announced_json)
         content_all = content_str
         if summary_str:
-            content_all = summary_str + ' ' + content_str
+            content_all = \
+                summary_str + ' ' + content_str + ' ' + media_descriptions
         if is_filtered(base_dir, nickname, domain, content_all):
             print('WARN: announced post has been filtered ' +
                   str(announced_json))
diff --git a/utils.py b/utils.py
index e9e985d13..b741cb520 100644
--- a/utils.py
+++ b/utils.py
@@ -138,6 +138,25 @@ def get_content_from_post(post_json_object: {}, system_language: str,
     return content
 
 
+def get_media_descriptions_from_post(post_json_object: {}) -> str:
+    """Returns attached media descriptions and urls as a single text.
+    This is used for filtering. Attachments which are not dicts, and
+    name or url values which are not text, are skipped
+    """
+    this_post_json = post_json_object
+    if has_object_dict(post_json_object):
+        this_post_json = post_json_object['object']
+    attachments = this_post_json.get('attachment')
+    if not isinstance(attachments, list):
+        return ''
+    # the url is kept even when an attachment has no description
+    texts = [attach[key]
+             for attach in attachments if isinstance(attach, dict)
+             for key in ('name', 'url')
+             if attach.get(key) and isinstance(attach[key], str)]
+    return ' '.join(texts).strip()
+
+
 def get_summary_from_post(post_json_object: {}, system_language: str,
                           languages_understood: []) -> str:
     """Returns the summary from the post in the given language