Filtering includes image descriptions and URLs

merge-requests/30/head
Bob Mottram 2022-06-02 19:07:07 +01:00
parent 1708a928ef
commit ec31deb739
3 changed files with 32 additions and 4 deletions

View File

@ -18,6 +18,7 @@ from languages import understood_post_language
from like import update_likes_collection
from reaction import update_reaction_collection
from reaction import valid_emoji_content
from utils import get_media_descriptions_from_post
from utils import get_summary_from_post
from utils import delete_cached_html
from utils import get_account_timezone
@ -640,8 +641,11 @@ def save_post_to_inbox_queue(base_dir: str, http_prefix: str,
summary_str = \
get_summary_from_post(post_json_object,
system_language, [])
if is_filtered(base_dir, nickname, domain,
summary_str + ' ' + content_str):
media_descriptions = \
get_media_descriptions_from_post(post_json_object)
content_all = \
summary_str + ' ' + content_str + ' ' + media_descriptions
if is_filtered(base_dir, nickname, domain, content_all):
if debug:
print('WARN: post was filtered out due to content')
return None
@ -2688,9 +2692,10 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
return False
# check for filtered content
media_descriptions = get_media_descriptions_from_post(message_json)
content_all = content_str
if summary:
content_all = summary + ' ' + content_str
content_all = summary + ' ' + content_str + ' ' + media_descriptions
if is_filtered(base_dir, nickname, domain, content_all):
print('REJECT: content filtered')
return False

View File

@ -32,6 +32,7 @@ from webfinger import webfinger_handle
from httpsig import create_signed_header
from siteactive import site_is_active
from languages import understood_post_language
from utils import get_media_descriptions_from_post
from utils import valid_hash_tag
from utils import get_audio_extensions
from utils import get_summary_from_post
@ -5004,9 +5005,12 @@ def download_announce(session, base_dir: str, http_prefix: str,
return None
summary_str = \
get_summary_from_post(announced_json, system_language, [])
media_descriptions = \
get_media_descriptions_from_post(announced_json)
content_all = content_str
if summary_str:
content_all = summary_str + ' ' + content_str
content_all = \
summary_str + ' ' + content_str + ' ' + media_descriptions
if is_filtered(base_dir, nickname, domain, content_all):
print('WARN: announced post has been filtered ' +
str(announced_json))

View File

@ -138,6 +138,25 @@ def get_content_from_post(post_json_object: {}, system_language: str,
return content
def get_media_descriptions_from_post(post_json_object: {}) -> str:
    """Returns all attached media descriptions as a single text.
    This is used for filtering

    If the post has an 'object' dict then its attachments are used,
    otherwise the attachments of the post itself are used.
    For every attachment having a non-empty 'name' the name is added,
    followed by the attachment url(s) if there are any. The pieces are
    separated by single spaces.
    Returns an empty string if there is nothing to describe.
    """
    this_post_json = post_json_object
    if has_object_dict(post_json_object):
        this_post_json = post_json_object['object']

    attachments = this_post_json.get('attachment')
    if not attachments:
        return ''
    # Posts arrive from remote servers, so the attachment field may not
    # be a well formed list. A single attachment object is treated as a
    # list of one, and anything else is ignored rather than iterated
    if isinstance(attachments, dict):
        attachments = [attachments]
    elif not isinstance(attachments, list):
        return ''

    pieces = []
    for attach in attachments:
        # skip malformed entries instead of failing the filter check
        if not isinstance(attach, dict):
            continue
        name = attach.get('name')
        # NOTE(review): an attachment without a name contributes nothing,
        # not even its url. This matches the previous behavior - confirm
        # that it is intended
        if not name or not isinstance(name, str):
            continue
        pieces.append(name)

        # url can be a plain string, a link object having an href,
        # or a list of either
        url_field = attach.get('url')
        if isinstance(url_field, list):
            url_items = url_field
        else:
            url_items = [url_field]
        for url_item in url_items:
            if isinstance(url_item, dict):
                url_item = url_item.get('href')
            if url_item and isinstance(url_item, str):
                pieces.append(url_item)
    return ' '.join(pieces).strip()
def get_summary_from_post(post_json_object: {}, system_language: str,
languages_understood: []) -> str:
"""Returns the summary from the post in the given language