2023-03-20 14:50:19 +00:00
|
|
|
__filename__ = "cwlists.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
2024-01-21 19:01:20 +00:00
|
|
|
__version__ = "1.5.0"
|
2023-03-20 14:50:19 +00:00
|
|
|
__maintainer__ = "Bob Mottram"
|
|
|
|
__email__ = "bob@libreserver.org"
|
|
|
|
__status__ = "Production"
|
|
|
|
__module_group__ = "Core"
|
|
|
|
|
|
|
|
import os
|
|
|
|
from utils import load_json
|
|
|
|
from utils import get_content_from_post
|
|
|
|
|
|
|
|
|
|
|
|
def load_cw_lists(base_dir: str, verbose: bool) -> {}:
    """Load lists used for content warnings

    Walks the 'cwlists' directory beneath base_dir, including any
    subdirectories, and loads every file ending in '.json'.
    A list is only kept if it has a 'name' and at least one of
    'words', 'hashtags' or 'domains'.

    base_dir: directory which contains the 'cwlists' directory
    verbose: if True then print the name of each list accepted

    Returns a dict mapping each list name to its loaded json, or an
    empty dict if the 'cwlists' directory does not exist. If two files
    declare the same name then the one loaded last wins.
    """
    lists_dir = base_dir + '/cwlists'
    if not os.path.isdir(lists_dir):
        return {}
    result = {}
    # NOTE: here we do want to allow recursive walk through
    # possible subdirectories
    for dirpath, _, files in os.walk(lists_dir):
        for fname in files:
            if not fname.endswith('.json'):
                continue
            # join with the directory currently being walked, rather than
            # the top level directory, so that lists located within
            # subdirectories resolve to a path which actually exists
            list_filename = os.path.join(dirpath, fname)
            print('list_filename: ' + list_filename)
            list_json = load_json(list_filename, 0, 1)
            if not list_json:
                continue
            if not list_json.get('name'):
                continue
            # a list with nothing to match against is of no use
            if not list_json.get('words') and \
               not list_json.get('hashtags') and \
               not list_json.get('domains'):
                continue
            name = list_json['name']
            if verbose:
                print('List: ' + name)
            result[name] = list_json
    return result
|
|
|
|
|
|
|
|
|
2024-05-08 10:22:38 +00:00
|
|
|
def _add_cw_match_tags(item: {}, post_tags: {}, cw_text: str,
                       warning: str) -> (bool, str):
    """Updates content warning text using hashtags from within
    the post content

    item: a content warning list, whose 'hashtags' entry is the list
        of hashtags to look for (with or without a leading '#')
    post_tags: the tag entries attached to the post
    cw_text: the existing content warning text, which may be empty
    warning: the warning text to be prepended if a hashtag matches

    Returns (matched, cw_text). When matched is True the warning has
    been prepended to cw_text, separated by ' / ' if there was already
    some content warning text. Comparison of hashtags is case
    insensitive.
    """
    # Collect the lowercased names of the hashtags attached to the post.
    # Hashtag tag entries are normally identified by "type": "Hashtag".
    # The previous test for a truthy 'Hashtag' key never matched such
    # entries, but is still accepted in case any caller relies upon it.
    post_hashtags = set()
    for tag_dict in post_tags:
        if not isinstance(tag_dict, dict):
            continue
        if tag_dict.get('type') != 'Hashtag' and \
           not tag_dict.get('Hashtag'):
            continue
        tag_name = tag_dict.get('name')
        if not tag_name or not isinstance(tag_name, str):
            continue
        post_hashtags.add(tag_name.lower())

    for tag in item['hashtags']:
        tag = tag.strip()
        if not tag:
            continue
        # tags within the post are expected to begin with '#'
        if not tag.startswith('#'):
            tag = '#' + tag
        if tag.lower() not in post_hashtags:
            continue
        # one matching hashtag is enough to apply the warning
        if cw_text:
            cw_text = warning + ' / ' + cw_text
        else:
            cw_text = warning
        return True, cw_text
    return False, cw_text
|
|
|
|
|
|
|
|
|
|
|
|
def _add_cw_match_domains(item: {}, content: str, cw_text: str,
                          warning: str) -> (bool, str):
    """Updates content warning text using domains from within
    the post content

    item: a content warning list, whose 'domains' entry is the list
        of domains to look for
    content: the content of the post
    cw_text: the existing content warning text, which may be empty
    warning: the warning text to be prepended if a domain matches

    Returns (matched, cw_text). When matched is True the warning has
    been prepended to cw_text, separated by ' / ' if there was already
    some content warning text.
    """
    for domain_name in item['domains']:
        # Does the domain have a first section of less than four
        # characters? If so then it only counts when preceded by '.' or
        # '/', rather than anywhere within the content.
        short_first_section = False
        if '.' in domain_name:
            short_first_section = len(domain_name.split('.')[0]) < 4

        if short_first_section:
            found = ('.' + domain_name in content or
                     '/' + domain_name in content)
        else:
            found = domain_name in content
        if not found:
            continue

        # the first domain found is enough to apply the warning
        new_cw_text = warning
        if cw_text:
            new_cw_text = warning + ' / ' + cw_text
        return True, new_cw_text
    return False, cw_text
|
|
|
|
|
|
|
|
|
2023-03-20 14:50:19 +00:00
|
|
|
def add_cw_from_lists(post_json_object: {}, cw_lists: {}, translate: {},
                      lists_enabled: str, system_language: str,
                      languages_understood: []) -> None:
    """Adds content warnings by matching the post content
    against hashtags, domains or keywords

    For every list within cw_lists whose name appears within
    lists_enabled, the hashtags of the post are checked first, then
    the domains and finally the words. The warning for a list is
    applied at most once, and not at all if it is already contained
    within the existing content warning.

    The post is altered in place: if there is any content warning text
    by the end (including a summary which the post already had) then
    the 'summary' of the post object is set to it and 'sensitive' is
    set to True. Nothing is returned.
    """
    if not lists_enabled:
        return
    post_obj = post_json_object['object']
    if 'content' not in post_obj and 'contentMap' not in post_obj:
        return

    # begin with any content warning which the post already has
    cw_text = post_obj.get('summary') or ''

    content = get_content_from_post(post_json_object, system_language,
                                    languages_understood, "content")
    if not content:
        return

    # tags are only usable if they are supplied as a non-empty list
    tag_field = post_obj.get('tag')
    post_tags = []
    if tag_field and isinstance(tag_field, list):
        post_tags = tag_field

    for list_name, cw_list in cw_lists.items():
        if list_name not in lists_enabled:
            continue
        warning = cw_list.get('warning')
        if not warning:
            continue

        # use the translated version of the warning if there is one
        translated_warning = translate.get(warning)
        if translated_warning:
            warning = translated_warning

        # nothing to do if the warning is already in the CW
        if warning in cw_text:
            continue

        # match hashtags within the post
        if post_tags and cw_list.get('hashtags'):
            matched, cw_text = \
                _add_cw_match_tags(cw_list, post_tags, cw_text, warning)
            if matched:
                continue

        # match domains within the content
        if cw_list.get('domains'):
            matched, cw_text = \
                _add_cw_match_domains(cw_list, content, cw_text, warning)
            if matched:
                continue

        # match words within the content, either as given or
        # in title case
        words = cw_list.get('words')
        if not words:
            continue
        if any(word_str in content or word_str.title() in content
               for word_str in words):
            cw_text = warning + ' / ' + cw_text if cw_text else warning

    if cw_text:
        post_obj['summary'] = cw_text
        post_obj['sensitive'] = True
|
|
|
|
|
|
|
|
|
|
|
|
def get_cw_list_variable(list_name: str) -> str:
    """Returns the variable associated with a CW list

    This is 'list' followed by the list name with all of its spaces
    and apostrophes removed.
    """
    # a single pass which deletes both spaces and apostrophes
    removals = str.maketrans('', '', " '")
    compact_name = list_name.translate(removals)
    return 'list' + compact_name
|