epicyon/cwlists.py

183 lines
5.7 KiB
Python
Raw Normal View History

2023-03-20 14:50:19 +00:00
__filename__ = "cwlists.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2024-01-21 19:01:20 +00:00
__version__ = "1.5.0"
2023-03-20 14:50:19 +00:00
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
import os
from utils import load_json
from utils import get_content_from_post
def load_cw_lists(base_dir: str, verbose: bool) -> {}:
    """Load lists used for content warnings

    Walks base_dir/cwlists, including any subdirectories, and loads
    every file ending in .json. A list is only kept if it has a 'name'
    and at least one of 'words', 'hashtags' or 'domains'.
    Returns a dict mapping each list name to its loaded json, or an
    empty dict if the cwlists directory does not exist.
    If verbose is set then the name of each accepted list is printed.
    """
    cwlists_dir = base_dir + '/cwlists'
    if not os.path.isdir(cwlists_dir):
        return {}
    result = {}
    # NOTE: here we do want to allow recursive walk through
    # possible subdirectories
    for dirpath, _, files in os.walk(cwlists_dir):
        for fname in files:
            if not fname.endswith('.json'):
                continue
            # join with the directory currently being walked, otherwise
            # files within subdirectories resolve to a path which
            # does not exist
            list_filename = os.path.join(dirpath, fname)
            print('list_filename: ' + list_filename)
            list_json = load_json(list_filename, 0, 1)
            if not list_json:
                continue
            if not list_json.get('name'):
                continue
            # a list with nothing to match against is of no use
            if not list_json.get('words') and \
               not list_json.get('hashtags') and \
               not list_json.get('domains'):
                continue
            name = list_json['name']
            if verbose:
                print('List: ' + name)
            # a later list with the same name replaces an earlier one
            result[name] = list_json
    return result
2024-05-08 10:22:38 +00:00
def _add_cw_match_tags(item: {}, post_tags: {}, cw_text: str,
                       warning: str) -> (bool, str):
    """Updates content warning text using hashtags from within
    the post content

    item is a content warning list containing 'hashtags', post_tags
    is the sequence of tag entries attached to the post, cw_text is
    the existing warning text and warning is the text to be added.
    Hashtags are compared without regard to case, and a leading '#'
    is added to list hashtags which lack one.
    Returns whether any hashtag matched together with the updated
    warning text, with the new warning placed before any existing text.
    """
    # normalise the hashtags of the list once, rather than on
    # every comparison
    list_tags = set()
    for tag in item['hashtags']:
        tag = tag.strip()
        if not tag:
            continue
        if not tag.startswith('#'):
            tag = '#' + tag
        list_tags.add(tag.lower())
    if not list_tags:
        return False, cw_text

    for tag_dict in post_tags:
        if not isinstance(tag_dict, dict):
            continue
        # NOTE(review): ActivityPub tag objects mark hashtags with
        # "type": "Hashtag" - confirm against the tags produced
        # elsewhere in this project. A truthy 'Hashtag' key, which was
        # the only thing previously checked, is still accepted
        if tag_dict.get('type') != 'Hashtag' and \
           not tag_dict.get('Hashtag'):
            continue
        tag_name = tag_dict.get('name')
        # skip missing or non-text names rather than failing on lower()
        if not tag_name or not isinstance(tag_name, str):
            continue
        if tag_name.lower() in list_tags:
            if cw_text:
                return True, warning + ' / ' + cw_text
            return True, warning
    return False, cw_text
def _add_cw_match_domains(item: {}, content: str, cw_text: str,
                          warning: str) -> (bool, str):
    """Updates content warning text using domains from within
    the post content

    Each entry of item['domains'] is searched for within content.
    When the part of a domain before its first '.' is shorter than
    four characters the domain must be directly preceded by '.' or '/',
    otherwise any occurrence counts.
    Returns whether a domain was found together with the updated
    warning text, with the new warning placed before any existing text.
    """
    for candidate in item['domains']:
        short_first_section = \
            '.' in candidate and len(candidate.split('.')[0]) < 4
        if short_first_section:
            # short names are too likely to appear inside other text,
            # so require a preceding separator
            found = ('.' + candidate) in content or \
                ('/' + candidate) in content
        else:
            found = candidate in content
        if not found:
            continue
        if cw_text:
            return True, warning + ' / ' + cw_text
        return True, warning
    return False, cw_text
2023-03-20 14:50:19 +00:00
def add_cw_from_lists(post_json_object: {}, cw_lists: {}, translate: {},
                      lists_enabled: str, system_language: str,
                      languages_understood: []) -> None:
    """Adds content warnings by matching the post content
    against hashtags, domains or keywords

    For every list within cw_lists whose name appears in lists_enabled
    the list's warning, translated if a translation exists, is added
    in front of the post summary when one of the list's hashtags,
    domains or words matches. Hashtags are tried first, then domains,
    then words. The post is altered in place: if the resulting summary
    is not empty it is stored and the post is marked as sensitive.
    """
    if not lists_enabled:
        return
    has_content = 'content' in post_json_object['object'] or \
        'contentMap' in post_json_object['object']
    if not has_content:
        return

    # begin with whatever content warning the post already has
    existing_summary = post_json_object['object'].get('summary')
    cw_text = existing_summary if existing_summary else ''

    content = get_content_from_post(post_json_object, system_language,
                                    languages_understood, "content")
    if not content:
        return

    # only a list of tags is usable for hashtag matching
    post_tags = []
    tag_field = post_json_object['object'].get('tag')
    if tag_field and isinstance(tag_field, list):
        post_tags = tag_field

    for name, item in cw_lists.items():
        if name not in lists_enabled:
            continue
        warning = item.get('warning')
        if not warning:
            continue

        # is there a translated version of the warning?
        translated_warning = translate.get(warning)
        if translated_warning:
            warning = translated_warning

        # is the warning already in the CW?
        if warning in cw_text:
            continue

        # match hashtags within the post
        if post_tags and item.get('hashtags'):
            matched, cw_text = \
                _add_cw_match_tags(item, post_tags, cw_text, warning)
            if matched:
                continue

        # match domains within the content
        if item.get('domains'):
            matched, cw_text = \
                _add_cw_match_domains(item, content, cw_text, warning)
            if matched:
                continue

        # match words within the content, also trying title case
        words = item.get('words')
        if not words:
            continue
        if any(word_str in content or word_str.title() in content
               for word_str in words):
            cw_text = warning + ' / ' + cw_text if cw_text else warning

    if cw_text:
        post_json_object['object']['summary'] = cw_text
        post_json_object['object']['sensitive'] = True
def get_cw_list_variable(list_name: str) -> str:
    """Returns the variable associated with a CW list

    This is the text 'list' followed by the list name with all
    spaces and single quotes removed.
    """
    strip_chars = str.maketrans('', '', " '")
    compact_name = list_name.translate(strip_chars)
    return 'list' + compact_name