epicyon/filters.py

__filename__ = "filters.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.5.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Moderation"

import os
from utils import data_dir
from utils import acct_dir
from utils import text_in_file
from utils import remove_eol
from utils import standardize_text
from utils import remove_inverted_text
from utils import remove_square_capitals


def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:
    """Adds a filter for particular words within the content of
    incoming posts, for the given account.
    The words are appended as a single line to the account's filters.txt
    Returns True if the filter was appended. Returns False if the words
    are shorter than two characters, if text_in_file already finds them
    in the filters file, or if the file could not be written
    """
    # Same validation as add_global_filter. _is_filtered_base skips any
    # filter line shorter than two characters, so storing one is pointless
    if not words or len(words) < 2:
        return False
    filters_filename = acct_dir(base_dir, nickname, domain) + '/filters.txt'
    if os.path.isfile(filters_filename):
        # avoid adding the same filter twice
        if text_in_file(words, filters_filename):
            return False
    try:
        with open(filters_filename, 'a+',
                  encoding='utf-8') as fp_filters:
            fp_filters.write(words + '\n')
    except OSError:
        print('EX: unable to append filters ' + filters_filename)
        return False
    return True


def add_global_filter(base_dir: str, words: str) -> bool:
    """Adds a global filter for particular words within
    the content of incoming posts.
    Returns True if the words were appended to the global filters file,
    otherwise False
    """
    # nothing to add, or too short to be a usable filter
    if not words or len(words) < 2:
        return False

    global_filename = data_dir(base_dir) + '/filters.txt'

    # don't append words which the existing file already contains
    already_listed = \
        os.path.isfile(global_filename) and \
        text_in_file(words, global_filename)
    if already_listed:
        return False

    try:
        with open(global_filename, 'a+',
                  encoding='utf-8') as fp_global:
            fp_global.write(words + '\n')
    except OSError:
        print('EX: unable to append filters ' + global_filename)
        return False
    return True


def remove_filter(base_dir: str, nickname: str, domain: str,
                  words: str) -> bool:
    """Removes a word filter from the given account's filters.txt
    Each line of the file is passed through remove_eol and is dropped
    if it then equals the given words. The remaining lines are written
    to a temporary '.new' file, which then replaces the original.
    Returns True if the filters file was rewritten. Returns False if the
    file doesn't exist, if text_in_file does not find the words in it,
    or if reading, writing or replacing the file fails
    """
    filters_filename = acct_dir(base_dir, nickname, domain) + '/filters.txt'
    if not os.path.isfile(filters_filename):
        return False
    if not text_in_file(words, filters_filename):
        return False
    new_filters_filename = filters_filename + '.new'
    try:
        with open(filters_filename, 'r', encoding='utf-8') as fp_filt:
            with open(new_filters_filename, 'w+', encoding='utf-8') as fp_new:
                for line in fp_filt:
                    line = remove_eol(line)
                    if line != words:
                        fp_new.write(line + '\n')
        # os.replace overwrites an existing destination on every platform.
        # It is inside the try so that a failed replacement is reported
        # via the return value rather than raised to the caller
        os.replace(new_filters_filename, filters_filename)
    except OSError as ex:
        print('EX: unable to remove filter ' +
              filters_filename + ' ' + str(ex))
        # don't leave a partially written temporary file behind
        if os.path.isfile(new_filters_filename):
            try:
                os.remove(new_filters_filename)
            except OSError:
                print('EX: unable to remove temporary filters file ' +
                      new_filters_filename)
        return False
    return True


def remove_global_filter(base_dir: str, words: str) -> bool:
    """Removes a global word filter from the global filters.txt
    Each line of the file is passed through remove_eol and is dropped
    if it then equals the given words. The remaining lines are written
    to a temporary '.new' file, which then replaces the original.
    Returns True if the filters file was rewritten. Returns False if the
    file doesn't exist, if text_in_file does not find the words in it,
    or if reading, writing or replacing the file fails
    """
    filters_filename = data_dir(base_dir) + '/filters.txt'
    if not os.path.isfile(filters_filename):
        return False
    if not text_in_file(words, filters_filename):
        return False
    new_filters_filename = filters_filename + '.new'
    try:
        with open(filters_filename, 'r', encoding='utf-8') as fp_filt:
            with open(new_filters_filename, 'w+', encoding='utf-8') as fp_new:
                for line in fp_filt:
                    line = remove_eol(line)
                    if line != words:
                        fp_new.write(line + '\n')
        # os.replace overwrites an existing destination on every platform.
        # It is inside the try so that a failed replacement is reported
        # via the return value rather than raised to the caller
        os.replace(new_filters_filename, filters_filename)
    except OSError as ex:
        print('EX: unable to remove global filter ' +
              filters_filename + ' ' + str(ex))
        # don't leave a partially written temporary file behind
        if os.path.isfile(new_filters_filename):
            try:
                os.remove(new_filters_filename)
            except OSError:
                print('EX: unable to remove temporary filters file ' +
                      new_filters_filename)
        return False
    return True


def _is_twitter_post(content: str) -> bool:
    """Returns true if the given post content is a retweet or twitter crosspost
    """
    features = (
        '/x.com', '/twitter.', '/nitter.',
        '@twitter.', '@nitter.', '@x.com',
        '>RT <', '_tw<', '_tw@', 'tweet', 'Tweet', '🐦🔗'
    )
    for feat in features:
        if feat in content:
            return True
    return False


def _is_filtered_base(filename: str, content: str,
                      system_language: str) -> bool:
    """Uses the given file containing filtered words to check
    the given content
    """
    if not os.path.isfile(filename):
        return False

    content = remove_inverted_text(content, system_language)
    content = remove_square_capitals(content, system_language)

    # convert any fancy characters to ordinary ones
    content = standardize_text(content)

    try:
        with open(filename, 'r', encoding='utf-8') as fp_filt:
            for line in fp_filt:
                filter_str = remove_eol(line)
                if not filter_str:
                    continue
                if len(filter_str) < 2:
                    continue
                if '+' not in filter_str:
                    if filter_str in content:
                        return True
                else:
                    filter_words = filter_str.replace('"', '').split('+')
                    for word in filter_words:
                        if word not in content:
                            return False
                    return True
    except OSError as ex:
        print('EX: _is_filtered_base ' + filename + ' ' + str(ex))
    return False


def is_filtered_globally(base_dir: str, content: str,
                         system_language: str) -> bool:
    """Checks the given content against the global filters file
    and returns True if it is filtered
    """
    filters_path = data_dir(base_dir) + '/filters.txt'
    return bool(_is_filtered_base(filters_path, content, system_language))


def is_filtered_bio(base_dir: str,
                    nickname: str, domain: str, bio: str,
                    system_language: str) -> bool:
    """Returns True if the given actor bio matches the global filters
    or the bio filters of the given account
    """
    # global filters always apply, even when no account is given
    if is_filtered_globally(base_dir, bio, system_language):
        return True

    # account specific bio filters
    if nickname and domain:
        bio_filters_filename = \
            acct_dir(base_dir, nickname, domain) + '/filters_bio.txt'
        return _is_filtered_base(bio_filters_filename, bio,
                                 system_language)
    return False


def is_filtered(base_dir: str, nickname: str, domain: str,
                content: str, system_language: str) -> bool:
    """Returns True if the given content should be filtered out.
    Matching is done on plain words rather than with a regex.
    A filter is either a single entry, or word1+word2 meaning that
    both words must be present, although not necessarily adjacent.
    The global filters are checked first, then the optional twitter
    removal for the account, and finally the account's own filters
    """
    # global filters always apply, even when no account is given
    if is_filtered_globally(base_dir, content, system_language):
        return True

    # the remaining checks are account specific
    if not (nickname and domain):
        return False

    # the presence of this file enables removal of retweets
    twitter_flag_filename = \
        acct_dir(base_dir, nickname, domain) + '/.removeTwitter'
    if os.path.isfile(twitter_flag_filename) and _is_twitter_post(content):
        return True

    filters_path = acct_dir(base_dir, nickname, domain) + '/filters.txt'
    return _is_filtered_base(filters_path, content, system_language)


def is_question_filtered(base_dir: str, nickname: str, domain: str,
                         system_language: str, question_json: {}) -> bool:
    """Is the given question filtered based on its options?
    The options are read from the 'oneOf' list of question_json or,
    if that is absent or empty, from the 'oneOf' list of its 'object'.
    Returns True if the name of any option is filtered by is_filtered.
    A question without any usable options is not filtered
    """
    if not isinstance(question_json, dict):
        return False

    question_options = question_json.get('oneOf')
    if not question_options:
        # the options may instead be within the wrapped object
        post_object = question_json.get('object')
        if isinstance(post_object, dict):
            question_options = post_object.get('oneOf')

    # a missing or malformed options list means nothing to filter on,
    # rather than being an error
    if not isinstance(question_options, (list, tuple)):
        return False

    for option in question_options:
        if not isinstance(option, dict):
            continue
        option_name = option.get('name')
        if not option_name or not isinstance(option_name, str):
            continue
        if is_filtered(base_dir, nickname, domain, option_name,
                       system_language):
            return True
    return False
flake8 format 2020-04-03 10:11:54 +00:00			`__filename__ = "filters.py"`
			`__author__ = "Bob Mottram"`
			`__license__ = "AGPL3+"`
Version 1.5.0 2024-01-21 19:01:20 +00:00			`__version__ = "1.5.0"`
flake8 format 2020-04-03 10:11:54 +00:00			`__maintainer__ = "Bob Mottram"`
Change domain to libreserver.org 2021-09-10 16:14:50 +00:00			`__email__ = "bob@libreserver.org"`
flake8 format 2020-04-03 10:11:54 +00:00			`__status__ = "Production"`
Module groups 2021-06-26 11:16:41 +00:00			`__module_group__ = "Moderation"`
Word filters 2019-07-14 20:50:27 +00:00
			`import os`
Function for accounts data directory 2024-05-12 12:35:26 +00:00			`from utils import data_dir`
Snake case 2021-12-26 12:02:29 +00:00			`from utils import acct_dir`
Function for finding text in file 2022-06-10 11:43:33 +00:00			`from utils import text_in_file`
Function for line ending characters 2022-06-21 11:58:50 +00:00			`from utils import remove_eol`
Convert fancy characters to ordinary ones This will help screen readers 2022-07-09 10:54:05 +00:00			`from utils import standardize_text`
Check for inverted text 2022-09-25 17:26:11 +00:00			`from utils import remove_inverted_text`
Remove square capitals when filtering 2022-10-05 17:55:24 +00:00			`from utils import remove_square_capitals`
Standardize text prior to filtering 2022-07-09 10:37:33 +00:00

Moving to snake case 2021-12-29 21:55:09 +00:00			`def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:`
Fix line lengths 2022-09-21 20:00:57 +00:00			`"""Adds a filter for particular words within the content of a`
			`incoming posts`
Word filters 2019-07-14 20:50:27 +00:00			`"""`
Snake case 2022-01-02 12:53:25 +00:00			`filters_filename = acct_dir(base_dir, nickname, domain) + '/filters.txt'`
			`if os.path.isfile(filters_filename):`
Replacing open statements 2022-06-10 13:01:39 +00:00			`if text_in_file(words, filters_filename):`
Word filters 2019-07-14 20:50:27 +00:00			`return False`
Exception handling 2021-11-26 14:35:26 +00:00			`try:`
Explicitly set file encoding 2022-06-09 14:46:30 +00:00			`with open(filters_filename, 'a+',`
Standardise file pointer names 2024-07-14 13:01:46 +00:00			`encoding='utf-8') as fp_filters:`
			`fp_filters.write(words + '\n')`
Exception handling 2021-11-26 14:35:26 +00:00			`except OSError:`
Snake case 2022-01-02 12:53:25 +00:00			`print('EX: unable to append filters ' + filters_filename)`
Return values 2023-05-02 11:47:42 +00:00			`return False`
Word filters 2019-07-14 20:50:27 +00:00			`return True`

flake8 format 2020-04-03 10:11:54 +00:00
Moving to snake case 2021-12-29 21:55:09 +00:00			`def add_global_filter(base_dir: str, words: str) -> bool:`
Add and remove global word filters from moderator screen 2020-12-19 11:29:55 +00:00			`"""Adds a global filter for particular words within`
			`the content of a incoming posts`
			`"""`
Check minimum words length 2020-12-19 13:10:32 +00:00			`if not words:`
			`return False`
			`if len(words) < 2:`
			`return False`
Function for accounts data directory 2024-05-12 12:35:26 +00:00			`filters_filename = data_dir(base_dir) + '/filters.txt'`
Snake case 2022-01-02 12:53:25 +00:00			`if os.path.isfile(filters_filename):`
Replacing open statements 2022-06-10 13:01:39 +00:00			`if text_in_file(words, filters_filename):`
Add and remove global word filters from moderator screen 2020-12-19 11:29:55 +00:00			`return False`
Exception handling 2021-11-26 14:35:26 +00:00			`try:`
Standardise file pointer names 2024-07-14 13:01:46 +00:00			`with open(filters_filename, 'a+', encoding='utf-8') as fp_filters:`
			`fp_filters.write(words + '\n')`
Exception handling 2021-11-26 14:35:26 +00:00			`except OSError:`
Snake case 2022-01-02 12:53:25 +00:00			`print('EX: unable to append filters ' + filters_filename)`
Return values 2023-05-02 11:47:42 +00:00			`return False`
Add and remove global word filters from moderator screen 2020-12-19 11:29:55 +00:00			`return True`


Moving to snake case 2021-12-29 21:55:09 +00:00			`def remove_filter(base_dir: str, nickname: str, domain: str,`
			`words: str) -> bool:`
Word filters 2019-07-14 20:50:27 +00:00			`"""Removes a word filter`
			`"""`
Snake case 2022-01-02 12:53:25 +00:00			`filters_filename = acct_dir(base_dir, nickname, domain) + '/filters.txt'`
			`if not os.path.isfile(filters_filename):`
Tidying 2021-06-07 08:56:08 +00:00			`return False`
Function for finding text in file 2022-06-10 11:43:33 +00:00			`if not text_in_file(words, filters_filename):`
Tidying 2021-06-07 08:56:08 +00:00			`return False`
Snake case 2022-01-02 12:53:25 +00:00			`new_filters_filename = filters_filename + '.new'`
Exception handling 2021-11-26 14:35:26 +00:00			`try:`
Explicitly set file encoding 2022-06-09 14:46:30 +00:00			`with open(filters_filename, 'r', encoding='utf-8') as fp_filt:`
File pointer naming convention 2024-07-16 12:20:58 +00:00			`with open(new_filters_filename, 'w+', encoding='utf-8') as fp_new:`
Snake case 2022-01-02 12:53:25 +00:00			`for line in fp_filt:`
Function for line ending characters 2022-06-21 11:58:50 +00:00			`line = remove_eol(line)`
Exception handling 2021-11-26 14:35:26 +00:00			`if line != words:`
File pointer naming convention 2024-07-16 12:20:58 +00:00			`fp_new.write(line + '\n')`
Exception variable names 2021-12-25 15:28:52 +00:00			`except OSError as ex:`
Snake case 2022-01-02 12:53:25 +00:00			`print('EX: unable to remove filter ' +`
			`filters_filename + ' ' + str(ex))`
Return values 2023-05-02 11:47:42 +00:00			`return False`
Snake case 2022-01-02 12:53:25 +00:00			`if os.path.isfile(new_filters_filename):`
			`os.rename(new_filters_filename, filters_filename)`
Tidying 2021-06-07 08:56:08 +00:00			`return True`
Word filters 2019-07-14 20:50:27 +00:00			`return False`
Optionally remove twitter crossposts 2020-02-05 14:57:10 +00:00
flake8 format 2020-04-03 10:11:54 +00:00
Moving to snake case 2021-12-29 21:55:09 +00:00			`def remove_global_filter(base_dir: str, words: str) -> bool:`
Add and remove global word filters from moderator screen 2020-12-19 11:29:55 +00:00			`"""Removes a global word filter`
			`"""`
Function for accounts data directory 2024-05-12 12:35:26 +00:00			`filters_filename = data_dir(base_dir) + '/filters.txt'`
Snake case 2022-01-02 12:53:25 +00:00			`if not os.path.isfile(filters_filename):`
Tidying 2021-06-07 08:56:08 +00:00			`return False`
Function for finding text in file 2022-06-10 11:43:33 +00:00			`if not text_in_file(words, filters_filename):`
Tidying 2021-06-07 08:56:08 +00:00			`return False`
Snake case 2022-01-02 12:53:25 +00:00			`new_filters_filename = filters_filename + '.new'`
Exception handling 2021-11-26 14:35:26 +00:00			`try:`
Explicitly set file encoding 2022-06-09 14:46:30 +00:00			`with open(filters_filename, 'r', encoding='utf-8') as fp_filt:`
File pointer naming convention 2024-07-16 12:20:58 +00:00			`with open(new_filters_filename, 'w+', encoding='utf-8') as fp_new:`
Snake case 2022-01-02 12:53:25 +00:00			`for line in fp_filt:`
Function for line ending characters 2022-06-21 11:58:50 +00:00			`line = remove_eol(line)`
Exception handling 2021-11-26 14:35:26 +00:00			`if line != words:`
File pointer naming convention 2024-07-16 12:20:58 +00:00			`fp_new.write(line + '\n')`
Exception variable names 2021-12-25 15:28:52 +00:00			`except OSError as ex:`
Exception handling 2021-11-26 14:35:26 +00:00			`print('EX: unable to remove global filter ' +`
Snake case 2022-01-02 12:53:25 +00:00			`filters_filename + ' ' + str(ex))`
Return values 2023-05-02 11:47:42 +00:00			`return False`
Snake case 2022-01-02 12:53:25 +00:00			`if os.path.isfile(new_filters_filename):`
			`os.rename(new_filters_filename, filters_filename)`
Tidying 2021-06-07 08:56:08 +00:00			`return True`
Add and remove global word filters from moderator screen 2020-12-19 11:29:55 +00:00			`return False`


Moving to snake case 2021-12-29 21:55:09 +00:00			`def _is_twitter_post(content: str) -> bool:`
Optionally remove twitter crossposts 2020-02-05 14:57:10 +00:00			`"""Returns true if the given post content is a retweet or twitter crosspost`
			`"""`
Tidying 2022-06-02 13:54:17 +00:00			`features = (`
Twitter filter 2024-08-03 10:05:55 +00:00			`'/x.com', '/twitter.', '/nitter.',`
Less indentation 2024-08-03 15:00:28 +00:00			`'@twitter.', '@nitter.', '@x.com',`
Emojis representing twitter link 2022-07-17 15:56:06 +00:00			`'>RT <', '_tw<', '_tw@', 'tweet', 'Tweet', '🐦🔗'`
Tidying 2022-06-02 13:54:17 +00:00			`)`
			`for feat in features:`
			`if feat in content:`
			`return True`
Optionally remove twitter crossposts 2020-02-05 14:57:10 +00:00			`return False`

flake8 format 2020-04-03 10:11:54 +00:00
Check for inverted text 2022-09-25 17:26:11 +00:00			`def _is_filtered_base(filename: str, content: str,`
			`system_language: str) -> bool:`
Filtering base function So that global and account filtering are treated the same 2020-12-19 13:21:06 +00:00			`"""Uses the given file containing filtered words to check`
			`the given content`
			`"""`
Check that file exists 2020-12-19 13:23:30 +00:00			`if not os.path.isfile(filename):`
			`return False`

Check for inverted text 2022-09-25 17:26:11 +00:00			`content = remove_inverted_text(content, system_language)`
Remove square capitals when filtering 2022-10-05 17:55:24 +00:00			`content = remove_square_capitals(content, system_language)`
Check for inverted text 2022-09-25 17:26:11 +00:00
Standardize text prior to filtering 2022-07-09 10:37:33 +00:00			`# convert any fancy characters to ordinary ones`
			`content = standardize_text(content)`

Exception handling 2021-11-26 14:35:26 +00:00			`try:`
Explicitly set file encoding 2022-06-09 14:46:30 +00:00			`with open(filename, 'r', encoding='utf-8') as fp_filt:`
Snake case 2022-01-02 12:53:25 +00:00			`for line in fp_filt:`
Function for line ending characters 2022-06-21 11:58:50 +00:00			`filter_str = remove_eol(line)`
Snake case 2022-01-02 12:53:25 +00:00			`if not filter_str:`
Exception handling 2021-11-26 14:35:26 +00:00			`continue`
Snake case 2022-01-02 12:53:25 +00:00			`if len(filter_str) < 2:`
Exception handling 2021-11-26 14:35:26 +00:00			`continue`
Snake case 2022-01-02 12:53:25 +00:00			`if '+' not in filter_str:`
			`if filter_str in content:`
Exception handling 2021-11-26 14:35:26 +00:00			`return True`
			`else:`
Snake case 2022-01-02 12:53:25 +00:00			`filter_words = filter_str.replace('"', '').split('+')`
			`for word in filter_words:`
Exception handling 2021-11-26 14:35:26 +00:00			`if word not in content:`
			`return False`
Filtering base function So that global and account filtering are treated the same 2020-12-19 13:21:06 +00:00			`return True`
Exception variable names 2021-12-25 15:28:52 +00:00			`except OSError as ex:`
Moving to snake case 2021-12-29 21:55:09 +00:00			`print('EX: _is_filtered_base ' + filename + ' ' + str(ex))`
Filtering base function So that global and account filtering are treated the same 2020-12-19 13:21:06 +00:00			`return False`


Check for inverted text 2022-09-25 17:26:11 +00:00			`def is_filtered_globally(base_dir: str, content: str,`
			`system_language: str) -> bool:`
Filter federated shared items 2021-07-28 21:28:41 +00:00			`"""Is the given content globally filtered?`
			`"""`
Function for accounts data directory 2024-05-12 12:35:26 +00:00			`global_filters_filename = data_dir(base_dir) + '/filters.txt'`
Check for inverted text 2022-09-25 17:26:11 +00:00			`if _is_filtered_base(global_filters_filename, content,`
			`system_language):`
Filter federated shared items 2021-07-28 21:28:41 +00:00			`return True`
			`return False`


Moving to snake case 2021-12-29 21:55:09 +00:00			`def is_filtered_bio(base_dir: str,`
Check for inverted text 2022-09-25 17:26:11 +00:00			`nickname: str, domain: str, bio: str,`
			`system_language: str) -> bool:`
Validate sending actor of incoming post 2021-12-14 13:27:00 +00:00			`"""Should the given actor bio be filtered out?`
			`"""`
Check for inverted text 2022-09-25 17:26:11 +00:00			`if is_filtered_globally(base_dir, bio, system_language):`
Validate sending actor of incoming post 2021-12-14 13:27:00 +00:00			`return True`

			`if not nickname or not domain:`
			`return False`

Snake case 2022-01-02 12:53:25 +00:00			`account_filters_filename = \`
Snake case 2021-12-26 12:02:29 +00:00			`acct_dir(base_dir, nickname, domain) + '/filters_bio.txt'`
Check for inverted text 2022-09-25 17:26:11 +00:00			`return _is_filtered_base(account_filters_filename, bio, system_language)`
Validate sending actor of incoming post 2021-12-14 13:27:00 +00:00

Moving to snake case 2021-12-29 21:55:09 +00:00			`def is_filtered(base_dir: str, nickname: str, domain: str,`
Check for inverted text 2022-09-25 17:26:11 +00:00			`content: str, system_language: str) -> bool:`
Word filters 2019-07-14 20:50:27 +00:00			`"""Should the given content be filtered out?`
			`This is a simple type of filter which just matches words, not a regex`
			`You can add individual words or use word1+word2 to indicate that two`
			`words must be present although not necessarily adjacent`
			`"""`
Check for inverted text 2022-09-25 17:26:11 +00:00			`if is_filtered_globally(base_dir, content, system_language):`
Filtering base function So that global and account filtering are treated the same 2020-12-19 13:21:06 +00:00			`return True`
Allow filtering without an account 2020-12-19 11:41:40 +00:00
			`if not nickname or not domain:`
			`return False`

Optionally remove twitter crossposts 2020-02-05 14:57:10 +00:00			`# optionally remove retweets`
Snake case 2022-01-02 12:53:25 +00:00			`remove_twitter = acct_dir(base_dir, nickname, domain) + '/.removeTwitter'`
			`if os.path.isfile(remove_twitter):`
Moving to snake case 2021-12-29 21:55:09 +00:00			`if _is_twitter_post(content):`
Optionally remove twitter crossposts 2020-02-05 14:57:10 +00:00			`return True`

Snake case 2022-01-02 12:53:25 +00:00			`account_filters_filename = \`
Snake case 2021-12-26 12:02:29 +00:00			`acct_dir(base_dir, nickname, domain) + '/filters.txt'`
Check for inverted text 2022-09-25 17:26:11 +00:00			`return _is_filtered_base(account_filters_filename, content,`
			`system_language)`
Filtering applied to question options 2023-02-28 17:38:04 +00:00

			`def is_question_filtered(base_dir: str, nickname: str, domain: str,`
			`system_language: str, question_json: {}) -> bool:`
			`"""is the given question filtered based on its options?`
			`"""`
			`if question_json.get('oneOf'):`
			`question_options = question_json['oneOf']`
			`else:`
			`question_options = question_json['object']['oneOf']`
			`for option in question_options:`
			`if option.get('name'):`
			`if is_filtered(base_dir, nickname, domain, option['name'],`
			`system_language):`
			`return True`
			`return False`