# epicyon/cwlists.py

__filename__ = "cwlists.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.5.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"

import os
from utils import load_json
from utils import get_content_from_post


def load_cw_lists(base_dir: str, verbose: bool) -> {}:
    """Load lists used for content warnings

    Walks base_dir/cwlists, including any subdirectories, and loads
    every file ending with .json. A list is only kept if it has a
    'name' and at least one of 'words', 'hashtags' or 'domains'.

    If verbose is set then the name of each accepted list is printed.

    Returns a dictionary mapping each list name to its loaded json,
    or an empty dictionary if the cwlists directory does not exist.
    If more than one file declares the same name then the one
    loaded last replaces the earlier ones.
    """
    cw_lists_dir = base_dir + '/cwlists'
    if not os.path.isdir(cw_lists_dir):
        return {}
    result = {}
    # NOTE: here we do want to allow recursive walk through
    # possible subdirectories
    for dir_path, _, files in os.walk(cw_lists_dir):
        for fname in files:
            if not fname.endswith('.json'):
                continue
            # join with the directory currently being walked, so that
            # files within subdirectories resolve to their real path
            list_filename = os.path.join(dir_path, fname)
            print('list_filename: ' + list_filename)
            list_json = load_json(list_filename)
            if not list_json:
                continue
            # json whose top level is not an object (eg. a plain array)
            # can't be a list definition
            if not isinstance(list_json, dict):
                continue
            if not list_json.get('name'):
                continue
            # a list with nothing to match against is of no use
            if not list_json.get('words') and \
               not list_json.get('hashtags') and \
               not list_json.get('domains'):
                continue
            name = list_json['name']
            if verbose:
                print('List: ' + name)
            result[name] = list_json
    return result


def _add_cw_match_tags(item: {}, post_tags: {}, cw_text: str,
                       warning: str) -> (bool, str):
    """Updates content warning text using hashtags from within
    the post content
    """
    matched = False
    for tag in item['hashtags']:
        tag = tag.strip()
        if not tag:
            continue
        if not tag.startswith('#'):
            tag = '#' + tag
        tag = tag.lower()
        for tag_dict in post_tags:
            if not isinstance(tag_dict, dict):
                continue
            if not tag_dict.get('Hashtag'):
                continue
            if not tag_dict.get('name'):
                continue
            if tag_dict['name'].lower() == tag:
                if cw_text:
                    cw_text = warning + ' / ' + cw_text
                else:
                    cw_text = warning
                matched = True
                break
        if matched:
            break
    return matched, cw_text


def _add_cw_match_domains(item: {}, content: str, cw_text: str,
                          warning: str) -> (bool, str):
    """Updates content warning text using domains from within
    the post content
    """
    matched = False
    for domain in item['domains']:
        if '.' in domain:
            first_section = domain.split('.')[0]
            if len(first_section) < 4:
                if '.' + domain in content or \
                   '/' + domain in content:
                    if cw_text:
                        cw_text = warning + ' / ' + cw_text
                    else:
                        cw_text = warning
                    matched = True
                    break
                continue

        if domain in content:
            if cw_text:
                cw_text = warning + ' / ' + cw_text
            else:
                cw_text = warning
            matched = True
            break
    return matched, cw_text


def add_cw_from_lists(post_json_object: {}, cw_lists: {}, translate: {},
                      lists_enabled: str, system_language: str,
                      languages_understood: []) -> None:
    """Adds content warnings by matching the post content
    against hashtags, domains or keywords

    For each list within cw_lists whose name is contained in
    lists_enabled and which has a 'warning', the post is checked
    against the list's hashtags, then its domains, then its words.
    On the first match the warning, translated if translate has an
    entry for it, is prepended to the content warning text.

    The post is altered in place. If there is any content warning
    text at the end, including a summary which the post already had,
    then the object's 'summary' is set to it and 'sensitive' is
    set to True.
    """
    if not lists_enabled:
        return
    post_obj = post_json_object.get('object')
    # the object may be missing, or may be a url string rather than
    # a dictionary, in which case there is no content to examine
    if not isinstance(post_obj, dict):
        return
    if 'content' not in post_obj:
        if 'contentMap' not in post_obj:
            return
    cw_text = ''
    if post_obj.get('summary'):
        cw_text = post_obj['summary']

    content = get_content_from_post(post_json_object, system_language,
                                    languages_understood, "content")
    if not content:
        return

    post_tags = []
    if post_obj.get('tag'):
        if isinstance(post_obj['tag'], list):
            post_tags = post_obj['tag']

    for name, item in cw_lists.items():
        if name not in lists_enabled:
            continue
        if not item.get('warning'):
            continue
        warning = item['warning']

        # is there a translated version of the warning?
        if translate.get(warning):
            warning = translate[warning]

        # is the warning already in the CW?
        if warning in cw_text:
            continue

        matched = False

        # match hashtags within the post
        if post_tags and item.get('hashtags'):
            matched, cw_text = \
                _add_cw_match_tags(item, post_tags, cw_text, warning)

        if matched:
            continue

        # match domains within the content
        if item.get('domains'):
            matched, cw_text = \
                _add_cw_match_domains(item, content, cw_text, warning)

        if matched:
            continue

        # match words within the content
        if item.get('words'):
            for word_str in item['words']:
                # the word as listed, or with its first letter
                # of each part capitalised
                if word_str in content or word_str.title() in content:
                    if cw_text:
                        cw_text = warning + ' / ' + cw_text
                    else:
                        cw_text = warning
                    break
    if cw_text:
        post_obj['summary'] = cw_text
        post_obj['sensitive'] = True


def get_cw_list_variable(list_name: str) -> str:
    """Returns the variable name used for a CW list, which is
    'list' followed by the list name with any spaces and
    apostrophes removed
    """
    kept_chars = [char for char in list_name if char not in (' ', "'")]
    return 'list' + ''.join(kept_chars)
cwlists in a separate module 2023-03-20 14:50:19 +00:00			`__filename__ = "cwlists.py"`
			`__author__ = "Bob Mottram"`
			`__license__ = "AGPL3+"`
Version 1.5.0 2024-01-21 19:01:20 +00:00			`__version__ = "1.5.0"`
cwlists in a separate module 2023-03-20 14:50:19 +00:00			`__maintainer__ = "Bob Mottram"`
			`__email__ = "bob@libreserver.org"`
			`__status__ = "Production"`
			`__module_group__ = "Core"`

			`import os`
			`from utils import load_json`
			`from utils import get_content_from_post`


			`def load_cw_lists(base_dir: str, verbose: bool) -> {}:`
			`"""Load lists used for content warnings`
			`"""`
			`if not os.path.isdir(base_dir + '/cwlists'):`
			`return {}`
			`result = {}`
			`# NOTE: here we do want to allow recursive walk through`
			`# possible subdirectories`
			`for _, _, files in os.walk(base_dir + '/cwlists'):`
			`for fname in files:`
			`if not fname.endswith('.json'):`
			`continue`
			`list_filename = os.path.join(base_dir + '/cwlists', fname)`
			`print('list_filename: ' + list_filename)`
json loading retries may not be needed 2024-06-20 10:47:58 +00:00			`list_json = load_json(list_filename)`
cwlists in a separate module 2023-03-20 14:50:19 +00:00			`if not list_json:`
			`continue`
			`if not list_json.get('name'):`
			`continue`
Include hashtags within cw lists 2024-02-23 10:32:46 +00:00			`if not list_json.get('words') and \`
			`not list_json.get('hashtags') and \`
			`not list_json.get('domains'):`
cwlists in a separate module 2023-03-20 14:50:19 +00:00			`continue`
			`name = list_json['name']`
			`if verbose:`
			`print('List: ' + name)`
			`result[name] = list_json`
			`return result`


Tidying 2024-05-08 10:22:38 +00:00			`def _add_cw_match_tags(item: {}, post_tags: {}, cw_text: str,`
			`warning: str) -> (bool, str):`
			`"""Updates content warning text using hashtags from within`
			`the post content`
			`"""`
			`matched = False`
			`for tag in item['hashtags']:`
			`tag = tag.strip()`
			`if not tag:`
			`continue`
			`if not tag.startswith('#'):`
			`tag = '#' + tag`
			`tag = tag.lower()`
			`for tag_dict in post_tags:`
			`if not isinstance(tag_dict, dict):`
			`continue`
			`if not tag_dict.get('Hashtag'):`
			`continue`
			`if not tag_dict.get('name'):`
			`continue`
			`if tag_dict['name'].lower() == tag:`
			`if cw_text:`
			`cw_text = warning + ' / ' + cw_text`
			`else:`
			`cw_text = warning`
			`matched = True`
			`break`
			`if matched:`
			`break`
			`return matched, cw_text`


			`def _add_cw_match_domains(item: {}, content: str, cw_text: str,`
			`warning: str) -> (bool, str):`
			`"""Updates content warning text using domains from within`
			`the post content`
			`"""`
			`matched = False`
			`for domain in item['domains']:`
			`if '.' in domain:`
			`first_section = domain.split('.')[0]`
			`if len(first_section) < 4:`
			`if '.' + domain in content or \`
			`'/' + domain in content:`
			`if cw_text:`
			`cw_text = warning + ' / ' + cw_text`
			`else:`
			`cw_text = warning`
			`matched = True`
			`break`
			`continue`

			`if domain in content:`
			`if cw_text:`
			`cw_text = warning + ' / ' + cw_text`
			`else:`
			`cw_text = warning`
			`matched = True`
			`break`
			`return matched, cw_text`


cwlists in a separate module 2023-03-20 14:50:19 +00:00			`def add_cw_from_lists(post_json_object: {}, cw_lists: {}, translate: {},`
			`lists_enabled: str, system_language: str,`
			`languages_understood: []) -> None:`
			`"""Adds content warnings by matching the post content`
			`against domains or keywords`
			`"""`
			`if not lists_enabled:`
			`return`
			`if 'content' not in post_json_object['object']:`
			`if 'contentMap' not in post_json_object['object']:`
			`return`
			`cw_text = ''`
			`if post_json_object['object'].get('summary'):`
			`cw_text = post_json_object['object']['summary']`

			`content = get_content_from_post(post_json_object, system_language,`
			`languages_understood, "content")`
			`if not content:`
			`return`
Include hashtags within cw lists 2024-02-23 10:32:46 +00:00
			`post_tags = []`
			`if post_json_object['object'].get('tag'):`
			`if isinstance(post_json_object['object']['tag'], list):`
			`post_tags = post_json_object['object']['tag']`

cwlists in a separate module 2023-03-20 14:50:19 +00:00			`for name, item in cw_lists.items():`
			`if name not in lists_enabled:`
			`continue`
			`if not item.get('warning'):`
			`continue`
			`warning = item['warning']`

			`# is there a translated version of the warning?`
			`if translate.get(warning):`
			`warning = translate[warning]`

			`# is the warning already in the CW?`
			`if warning in cw_text:`
			`continue`

			`matched = False`

Include hashtags within cw lists 2024-02-23 10:32:46 +00:00			`# match hashtags within the post`
			`if post_tags and item.get('hashtags'):`
Tidying 2024-05-08 10:22:38 +00:00			`matched, cw_text = \`
			`_add_cw_match_tags(item, post_tags, cw_text, warning)`
Include hashtags within cw lists 2024-02-23 10:32:46 +00:00
			`if matched:`
			`continue`

cwlists in a separate module 2023-03-20 14:50:19 +00:00			`# match domains within the content`
			`if item.get('domains'):`
Tidying 2024-05-08 10:22:38 +00:00			`matched, cw_text = \`
			`_add_cw_match_domains(item, content, cw_text, warning)`
cwlists in a separate module 2023-03-20 14:50:19 +00:00
			`if matched:`
			`continue`

			`# match words within the content`
			`if item.get('words'):`
			`for word_str in item['words']:`
			`if word_str in content or word_str.title() in content:`
			`if cw_text:`
			`cw_text = warning + ' / ' + cw_text`
			`else:`
			`cw_text = warning`
			`break`
			`if cw_text:`
			`post_json_object['object']['summary'] = cw_text`
			`post_json_object['object']['sensitive'] = True`


			`def get_cw_list_variable(list_name: str) -> str:`
			`"""Returns the variable associated with a CW list`
			`"""`
			`return 'list' + list_name.replace(' ', '').replace("'", '')`