From 0e69a6278531cd410d1f76b3b7e8a16b6f53b059 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 9 Jul 2022 11:37:33 +0100 Subject: [PATCH 1/6] Standardize text prior to filtering --- filters.py | 41 +++++++++++++++++++++++++++++++++++++++++ tests.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/filters.py b/filters.py index 329528ac6..318b179be 100644 --- a/filters.py +++ b/filters.py @@ -13,6 +13,44 @@ from utils import text_in_file from utils import remove_eol +def _standardize_text_range(text: str, + range_start: int, range_end: int, + offset: str) -> str: + """Convert any fancy characters within the given range into ordinary ones + """ + offset = ord(offset) + ctr = 0 + text = list(text) + while ctr < len(text): + val = ord(text[ctr]) + if val in range(range_start, range_end): + text[ctr] = chr(val - range_start + offset) + ctr += 1 + return "".join(text) + + +def standardize_text(text: str) -> str: + """Converts fancy unicode text to ordinary letters + """ + fancy_ranges = ( + 119990, 120094, 120198, 120042, 119990, 120146, 119886 + ) + + for range_start in fancy_ranges: + range_end = range_start + 26 + text = _standardize_text_range(text, range_start, range_end, 'a') + + range_start = range_end + range_end = range_start + 26 + text = _standardize_text_range(text, range_start, range_end, 'A') + + text = _standardize_text_range(text, 65345, 65345 + 26, 'a') + text = _standardize_text_range(text, 65313, 65313 + 26, 'A') + text = _standardize_text_range(text, 119964, 119964 + 26, 'A') + + return text + + def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool: """Adds a filter for particular words within the content of a incoming posts """ @@ -120,6 +158,9 @@ def _is_filtered_base(filename: str, content: str) -> bool: if not os.path.isfile(filename): return False + # convert any fancy characters to ordinary ones + content = standardize_text(content) + try: with open(filename, 'r', encoding='utf-8') as fp_filt: for line in fp_filt: diff --git a/tests.py b/tests.py index 990299132..a466b6b6b 100644 --- a/tests.py +++ b/tests.py @@ -189,6 +189,7 @@ from blocking import add_cw_from_lists from happening import dav_month_via_server from happening import dav_day_via_server from webapp_theme_designer import color_contrast +from filters import standardize_text TEST_SERVER_GROUP_RUNNING = False @@ -7351,6 +7352,52 @@ def _test_dogwhistles(): assert result['hamstered']['category'] == "hamsterism" +def _test_text_standardize(): + print('text_standardize') + expected = 'This is a test' + + result = standardize_text(expected) + if result != expected: + print(result) + assert result == expected + + text = '𝔗𝔥𝔦𝔰 𝔦𝔰 𝔞 𝔱𝔢𝔰𝔱' + result = standardize_text(text) + if result != expected: + print(result) + assert result == expected + + text = '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖆 𝖙𝖊𝖘𝖙' + result = standardize_text(text) + if result != expected: + print(result) + assert result == expected + + text = '𝓣𝓱𝓲𝓼 𝓲𝓼 𝓪 𝓽𝓮𝓼𝓽' + result = standardize_text(text) + if result != expected: + print(result) + assert result == expected + + text = '𝒯𝒽𝒾𝓈 𝒾𝓈 𝒶 𝓉𝑒𝓈𝓉' + result = standardize_text(text) + if result != expected: + print(result) + assert result == expected + + text = '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕒 𝕥𝕖𝕤𝕥' + result = standardize_text(text) + if result != expected: + print(result) + assert result == expected + + text = 'This is a test' + result = standardize_text(text) + if result != expected: + print(result) + assert result == expected + + def run_all_tests(): base_dir = os.getcwd() print('Running tests...') @@ -7368,6 +7415,7 @@ def run_all_tests(): _test_checkbox_names() _test_thread_functions() _test_functions() + _test_text_standardize() _test_dogwhistles() _test_remove_end_of_line() _test_translation_labels() From 6e8f07a916abec046731fde55ad55dbd2c6663be Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 9 Jul 2022 11:54:05 +0100 Subject: [PATCH 2/6] Convert fancy characters to ordinary ones This will help screen readers --- filters.py | 39 +-------------------------------------- tests.py | 2 +- utils.py | 46 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 42 deletions(-) diff --git a/filters.py b/filters.py index 318b179be..d471481ce 100644 --- a/filters.py +++ b/filters.py @@ -11,44 +11,7 @@ import os from utils import acct_dir from utils import text_in_file from utils import remove_eol - - -def _standardize_text_range(text: str, - range_start: int, range_end: int, - offset: str) -> str: - """Convert any fancy characters within the given range into ordinary ones - """ - offset = ord(offset) - ctr = 0 - text = list(text) - while ctr < len(text): - val = ord(text[ctr]) - if val in range(range_start, range_end): - text[ctr] = chr(val - range_start + offset) - ctr += 1 - return "".join(text) - - -def standardize_text(text: str) -> str: - """Converts fancy unicode text to ordinary letters - """ - fancy_ranges = ( - 119990, 120094, 120198, 120042, 119990, 120146, 119886 - ) - - for range_start in fancy_ranges: - range_end = range_start + 26 - text = _standardize_text_range(text, range_start, range_end, 'a') - - range_start = range_end - range_end = range_start + 26 - text = _standardize_text_range(text, range_start, range_end, 'A') - - text = _standardize_text_range(text, 65345, 65345 + 26, 'a') - text = _standardize_text_range(text, 65313, 65313 + 26, 'A') - text = _standardize_text_range(text, 119964, 119964 + 26, 'A') - - return text +from utils import standardize_text def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool: diff --git a/tests.py b/tests.py index a466b6b6b..6bbe9e32d 100644 --- a/tests.py +++ b/tests.py @@ -54,6 +54,7 @@ from follow import clear_followers from follow import send_follow_request_via_server from follow import send_unfollow_request_via_server from siteactive import site_is_active +from utils import standardize_text from utils import remove_eol from utils import text_in_file from utils import convert_published_to_local_timezone @@ -189,7 +190,6 @@ from blocking import add_cw_from_lists from happening import dav_month_via_server from happening import dav_day_via_server from webapp_theme_designer import color_contrast -from filters import standardize_text TEST_SERVER_GROUP_RUNNING = False diff --git a/utils.py b/utils.py index 8603383ef..da4d062e5 100644 --- a/utils.py +++ b/utils.py @@ -40,6 +40,44 @@ INVALID_CHARACTERS = ( ) +def _standardize_text_range(text: str, + range_start: int, range_end: int, + offset: str) -> str: + """Convert any fancy characters within the given range into ordinary ones + """ + offset = ord(offset) + ctr = 0 + text = list(text) + while ctr < len(text): + val = ord(text[ctr]) + if val in range(range_start, range_end): + text[ctr] = chr(val - range_start + offset) + ctr += 1 + return "".join(text) + + +def standardize_text(text: str) -> str: + """Converts fancy unicode text to ordinary letters + """ + fancy_ranges = ( + 119990, 120094, 120198, 120042, 119990, 120146, 119886 + ) + + for range_start in fancy_ranges: + range_end = range_start + 26 + text = _standardize_text_range(text, range_start, range_end, 'a') + + range_start = range_end + range_end = range_start + 26 + text = _standardize_text_range(text, range_start, range_end, 'A') + + text = _standardize_text_range(text, 65345, 65345 + 26, 'a') + text = _standardize_text_range(text, 65313, 65313 + 26, 'A') + text = _standardize_text_range(text, 119964, 119964 + 26, 'A') + + return text + + def remove_eol(line: str): """Removes line ending characters """ @@ -150,17 +188,19 @@ def get_content_from_post(post_json_object: {}, system_language: str, if this_post_json[map_dict].get(system_language): sys_lang = this_post_json[map_dict][system_language] if isinstance(sys_lang, str): - return this_post_json[map_dict][system_language] + content = this_post_json[map_dict][system_language] + return standardize_text(content) else: # is there a contentMap/summaryMap entry for one of # the understood languages? for lang in languages_understood: if this_post_json[map_dict].get(lang): - return this_post_json[map_dict][lang] + content = this_post_json[map_dict][lang] + return standardize_text(content) else: if isinstance(this_post_json[content_type], str): content = this_post_json[content_type] - return content + return standardize_text(content) def get_media_descriptions_from_post(post_json_object: {}) -> str: From bdb20ae7348a4428c7e461353e99274cc7cec6eb Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 9 Jul 2022 12:03:12 +0100 Subject: [PATCH 3/6] Tidying --- utils.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/utils.py b/utils.py index da4d062e5..3c3888f89 100644 --- a/utils.py +++ b/utils.py @@ -59,21 +59,30 @@ def _standardize_text_range(text: str, def standardize_text(text: str) -> str: """Converts fancy unicode text to ordinary letters """ - fancy_ranges = ( - 119990, 120094, 120198, 120042, 119990, 120146, 119886 + char_ranges = ( + [119886, 'a'], + [120146, 'a'], + [119990, 'a'], + [120042, 'a'], + [120198, 'a'], + [120094, 'a'], + [119990, 'a'], + [65345, 'a'], + [119886 + 26, 'A'], + [120146 + 26, 'A'], + [119990 + 26, 'A'], + [120042 + 26, 'A'], + [120198 + 26, 'A'], + [119990 + 26, 'A'], + [120094 + 26, 'A'], + [65313, 'A'], + [119964, 'A'] ) - - for range_start in fancy_ranges: + for char_range in char_ranges: + range_start = char_range[0] range_end = range_start + 26 - text = _standardize_text_range(text, range_start, range_end, 'a') - - range_start = range_end - range_end = range_start + 26 - text = _standardize_text_range(text, range_start, range_end, 'A') - - text = _standardize_text_range(text, 65345, 65345 + 26, 'a') - text = _standardize_text_range(text, 65313, 65313 + 26, 'A') - text = _standardize_text_range(text, 119964, 119964 + 26, 'A') + offset = char_range[1] + text = _standardize_text_range(text, range_start, range_end, offset) return text From 91ad251642115bef7d3e9a283e80f15b4d9d06d9 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 9 Jul 2022 12:09:29 +0100 Subject: [PATCH 4/6] Tidying --- tests.py | 2 ++ utils.py | 22 ++++++++++------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests.py b/tests.py index 6bbe9e32d..c47f6a657 100644 --- a/tests.py +++ b/tests.py @@ -7399,6 +7399,8 @@ def _test_text_standardize(): def run_all_tests(): + _test_text_standardize() + return base_dir = os.getcwd() print('Running tests...') update_default_themes_list(os.getcwd()) diff --git a/utils.py b/utils.py index 3c3888f89..73c0cf4b4 100644 --- a/utils.py +++ b/utils.py @@ -60,23 +60,21 @@ def standardize_text(text: str) -> str: """Converts fancy unicode text to ordinary letters """ char_ranges = ( + [65345, 'a'], [119886, 'a'], - [120146, 'a'], [119990, 'a'], [120042, 'a'], - [120198, 'a'], [120094, 'a'], - [119990, 'a'], - [65345, 'a'], - [119886 + 26, 'A'], - [120146 + 26, 'A'], - [119990 + 26, 'A'], - [120042 + 26, 'A'], - [120198 + 26, 'A'], - [119990 + 26, 'A'], - [120094 + 26, 'A'], + [120146, 'a'], + [120198, 'a'], [65313, 'A'], - [119964, 'A'] + [119912, 'A'], + [119964, 'A'], + [120016, 'A'], + [120068, 'A'], + [120120, 'A'], + [120172, 'A'], + [120224, 'A'] ) for char_range in char_ranges: range_start = char_range[0] From 11e105945a87849c6c39deba5ba99fed6c7e2269 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 9 Jul 2022 12:10:01 +0100 Subject: [PATCH 5/6] Tidying --- tests.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests.py b/tests.py index c47f6a657..6bbe9e32d 100644 --- a/tests.py +++ b/tests.py @@ -7399,8 +7399,6 @@ def _test_text_standardize(): def run_all_tests(): - _test_text_standardize() - return base_dir = os.getcwd() print('Running tests...') update_default_themes_list(os.getcwd()) From 66fcf918a84e60e6e2191a76155d52e45eb6ddfb Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 9 Jul 2022 12:24:11 +0100 Subject: [PATCH 6/6] More fancy unicode character ranges --- utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 73c0cf4b4..9d0603b66 100644 --- a/utils.py +++ b/utils.py @@ -67,6 +67,9 @@ def standardize_text(text: str) -> str: [120094, 'a'], [120146, 'a'], [120198, 'a'], + [120302, 'a'], + [120354, 'a'], + [120406, 'a'], [65313, 'A'], [119912, 'A'], [119964, 'A'], @@ -74,7 +77,10 @@ def standardize_text(text: str) -> str: [120068, 'A'], [120120, 'A'], [120172, 'A'], - [120224, 'A'] + [120224, 'A'], + [120328, 'A'], + [120380, 'A'], + [120432, 'A'] ) for char_range in char_ranges: range_start = char_range[0]