diff --git a/filters.py b/filters.py index 62ed018fb..7c419af62 100644 --- a/filters.py +++ b/filters.py @@ -13,9 +13,9 @@ from utils import data_dir from utils import acct_dir from utils import text_in_file from utils import remove_eol -from utils import standardize_text -from utils import remove_inverted_text -from utils import remove_square_capitals +from unicodetext import standardize_text +from unicodetext import remove_inverted_text +from unicodetext import remove_square_capitals def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool: diff --git a/status.py b/status.py index 2046d9256..e7ba4c27c 100644 --- a/status.py +++ b/status.py @@ -12,7 +12,7 @@ __accounts_data_path_tests__ = False from utils import date_utcnow from utils import date_from_string_format from utils import remove_html -from utils import standardize_text +from unicodetext import standardize_text MAX_STATUS_LENGTH = 100 diff --git a/tests.py b/tests.py index 3d48d42a0..1a2581230 100644 --- a/tests.py +++ b/tests.py @@ -64,21 +64,21 @@ from flags import is_group_account from flags import is_right_to_left_text from status import actor_status_expired from status import get_actor_status +from unicodetext import uninvert_text from utils import replace_strings from utils import valid_content_warning from utils import data_dir from utils import data_dir_testing from utils import remove_link_tracking -from utils import uninvert_text from utils import get_url_from_post from utils import date_from_string_format from utils import date_utcnow from utils import remove_markup_tag from utils import remove_style_within_html from utils import html_tag_has_closing -from utils import remove_inverted_text -from utils import remove_square_capitals -from utils import standardize_text +from unicodetext import remove_inverted_text +from unicodetext import remove_square_capitals +from unicodetext import standardize_text from utils import remove_eol from utils import text_in_file from utils import convert_published_to_local_timezone diff --git a/unicodetext.py b/unicodetext.py new file mode 100644 index 000000000..c2d6d7149 --- /dev/null +++ b/unicodetext.py @@ -0,0 +1,228 @@ +__filename__ = "unicodetext.py" +__author__ = "Bob Mottram" +__license__ = "AGPL3+" +__version__ = "1.6.0" +__maintainer__ = "Bob Mottram" +__email__ = "bob@libreserver.org" +__status__ = "Production" +__module_group__ = "Core" + +# functions which deal with fancy unicode text characters. +# Such text is "clever", but fucks up screen readers and accessibility +# in general + + +def uninvert_text(text: str) -> str: + """uninverts inverted text + """ + if len(text) < 4: + return text + + flip_table = { + '\u0021': '\u00A1', + '\u0022': '\u201E', + '\u0026': '\u214B', + '\u002E': '\u02D9', + '\u0033': '\u0190', + '\u0034': '\u152D', + '\u0037': '\u2C62', + '\u003B': '\u061B', + '\u003F': '\u00BF', + '\u0041': '\u2200', + '\u0042': '\u10412', + '\u0043': '\u2183', + '\u0044': '\u25D6', + '\u0045': '\u018E', + '\u0046': '\u2132', + '\u0047': '\u2141', + '\u004A': '\u017F', + '\u004B': '\u22CA', + '\u004C': '\u2142', + '\u004D': '\u0057', + '\u004E': '\u1D0E', + '\u0050': '\u0500', + '\u0051': '\u038C', + '\u0052': '\u1D1A', + '\u0054': '\u22A5', + '\u0055': '\u2229', + '\u0056': '\u1D27', + '\u0059': '\u2144', + '\u005F': '\u203E', + '\u0061': '\u0250', + '\u0062': '\u0071', + '\u0063': '\u0254', + '\u0064': '\u0070', + '\u0065': '\u01DD', + '\u0066': '\u025F', + '\u0067': '\u0183', + '\u0068': '\u0265', + '\u0069': '\u0131', + '\u006A': '\u027E', + '\u006B': '\u029E', + '\u006C': '\u0283', + '\u006D': '\u026F', + '\u006E': '\u0075', + '\u0072': '\u0279', + '\u0074': '\u0287', + '\u0076': '\u028C', + '\u0077': '\u028D', + '\u0079': '\u028E', + '\u203F': '\u2040', + '\u2234': '\u2235' + } + + matches = 0 + possible_result = '' + for ch_test in text: + ch_result = ch_test + for ch1, ch_inv in flip_table.items(): + if ch_test == ch_inv: + matches += 1 + ch_result = ch1 + break + possible_result = ch_result + possible_result + + result = text + if matches > len(text)/2: + result = possible_result + new_result = '' + extra_replace = { + '[': ']', + ']': '[', + '(': ')', + ')': '(', + '<': '>', + '>': '<', + '9': '6', + '6': '9' + } + for ch1 in result: + ch_result = ch1 + for ch2, rep in extra_replace.items(): + if ch1 == ch2: + ch_result = rep + break + new_result += ch_result + result = new_result + return result + + +def remove_inverted_text(text: str, system_language: str) -> str: + """Removes any inverted text from the given string + """ + if system_language != 'en': + return text + + text = uninvert_text(text) + + inverted_lower = [*"_ʎ_ʍʌ_ʇ_ɹ____ɯʃʞɾıɥƃɟǝ_ɔ_ɐ"] + inverted_upper = [*"_⅄__ᴧ∩⊥_ᴚΌԀ_ᴎ_⅂⋊ſ__⅁ℲƎ◖Ↄ𐐒∀"] + + start_separator = '' + separator = '\n' + if '
' in text: + text = text.replace('', '') + start_separator = '
' + separator = '
' + paragraphs = text.split(separator) + new_text = '' + inverted_list = (inverted_lower, inverted_upper) + z_value = (ord('z'), ord('Z')) + for para in paragraphs: + replaced_chars = 0 + + for idx in range(2): + index = 0 + for test_ch in inverted_list[idx]: + if test_ch == '_': + index += 1 + continue + if test_ch in para: + para = para.replace(test_ch, chr(z_value[idx] - index)) + replaced_chars += 1 + index += 1 + + if replaced_chars > 2: + para = para[::-1] + if para: + new_text += start_separator + para + if separator in text: + new_text += separator + + return new_text + + +def remove_square_capitals(text: str, system_language: str) -> str: + """Removes any square capital text from the given string + """ + if system_language != 'en': + return text + offset = ord('A') + start_value = ord('🅰') + end_value = start_value + 26 + result = '' + for text_ch in text: + text_value = ord(text_ch) + if text_value < start_value or text_value > end_value: + result += text_ch + else: + result += chr(offset + text_value - start_value) + return result + + +def _standardize_text_range(text: str, + range_start: int, range_end: int, + offset: str) -> str: + """Convert any fancy characters within the given range into ordinary ones + """ + offset = ord(offset) + ctr = 0 + text = list(text) + while ctr < len(text): + val = ord(text[ctr]) + if val in range(range_start, range_end): + text[ctr] = chr(val - range_start + offset) + ctr += 1 + return "".join(text) + + +def standardize_text(text: str) -> str: + """Converts fancy unicode text to ordinary letters + """ + if not text: + return text + + char_ranges = ( + [65345, 'a'], + [119886, 'a'], + [119990, 'a'], + [120042, 'a'], + [120094, 'a'], + [120146, 'a'], + [120198, 'a'], + [120302, 'a'], + [120354, 'a'], + [120406, 'a'], + [65313, 'A'], + [119912, 'A'], + [119964, 'A'], + [120016, 'A'], + [120068, 'A'], + [120120, 'A'], + [120172, 'A'], + [120224, 'A'], + [120328, 'A'], + [120380, 'A'], + [120432, 'A'], + [127344, 'A'], + [127312, 'A'], + [127280, 'A'], + [127248, 'A'] + ) + for char_range in char_ranges: + range_start = char_range[0] + range_end = range_start + 26 + offset = char_range[1] + text = _standardize_text_range(text, range_start, range_end, offset) + + return uninvert_text(text) diff --git a/utils.py b/utils.py index f57966690..128f062be 100644 --- a/utils.py +++ b/utils.py @@ -22,6 +22,7 @@ from dateutil.tz import tz from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import hashes from followingCalendar import add_person_to_calendar +from unicodetext import standardize_text VALID_HASHTAG_CHARS = \ set('_0123456789' + @@ -189,159 +190,6 @@ def get_attributed_to(field) -> str: return None -def uninvert_text(text: str) -> str: - """uninverts inverted text - """ - if len(text) < 4: - return text - - flip_table = { - '\u0021': '\u00A1', - '\u0022': '\u201E', - '\u0026': '\u214B', - '\u002E': '\u02D9', - '\u0033': '\u0190', - '\u0034': '\u152D', - '\u0037': '\u2C62', - '\u003B': '\u061B', - '\u003F': '\u00BF', - '\u0041': '\u2200', - '\u0042': '\u10412', - '\u0043': '\u2183', - '\u0044': '\u25D6', - '\u0045': '\u018E', - '\u0046': '\u2132', - '\u0047': '\u2141', - '\u004A': '\u017F', - '\u004B': '\u22CA', - '\u004C': '\u2142', - '\u004D': '\u0057', - '\u004E': '\u1D0E', - '\u0050': '\u0500', - '\u0051': '\u038C', - '\u0052': '\u1D1A', - '\u0054': '\u22A5', - '\u0055': '\u2229', - '\u0056': '\u1D27', - '\u0059': '\u2144', - '\u005F': '\u203E', - '\u0061': '\u0250', - '\u0062': '\u0071', - '\u0063': '\u0254', - '\u0064': '\u0070', - '\u0065': '\u01DD', - '\u0066': '\u025F', - '\u0067': '\u0183', - '\u0068': '\u0265', - '\u0069': '\u0131', - '\u006A': '\u027E', - '\u006B': '\u029E', - '\u006C': '\u0283', - '\u006D': '\u026F', - '\u006E': '\u0075', - '\u0072': '\u0279', - '\u0074': '\u0287', - '\u0076': '\u028C', - '\u0077': '\u028D', - '\u0079': '\u028E', - '\u203F': '\u2040', - '\u2234': '\u2235' - } - - matches = 0 - possible_result = '' - for ch_test in text: - ch_result = ch_test - for ch1, ch_inv in flip_table.items(): - if ch_test == ch_inv: - matches += 1 - ch_result = ch1 - break - possible_result = ch_result + possible_result - - result = text - if matches > len(text)/2: - result = possible_result - new_result = '' - extra_replace = { - '[': ']', - ']': '[', - '(': ')', - ')': '(', - '<': '>', - '>': '<', - '9': '6', - '6': '9' - } - for ch1 in result: - ch_result = ch1 - for ch2, rep in extra_replace.items(): - if ch1 == ch2: - ch_result = rep - break - new_result += ch_result - result = new_result - return result - - -def _standardize_text_range(text: str, - range_start: int, range_end: int, - offset: str) -> str: - """Convert any fancy characters within the given range into ordinary ones - """ - offset = ord(offset) - ctr = 0 - text = list(text) - while ctr < len(text): - val = ord(text[ctr]) - if val in range(range_start, range_end): - text[ctr] = chr(val - range_start + offset) - ctr += 1 - return "".join(text) - - -def standardize_text(text: str) -> str: - """Converts fancy unicode text to ordinary letters - """ - if not text: - return text - - char_ranges = ( - [65345, 'a'], - [119886, 'a'], - [119990, 'a'], - [120042, 'a'], - [120094, 'a'], - [120146, 'a'], - [120198, 'a'], - [120302, 'a'], - [120354, 'a'], - [120406, 'a'], - [65313, 'A'], - [119912, 'A'], - [119964, 'A'], - [120016, 'A'], - [120068, 'A'], - [120120, 'A'], - [120172, 'A'], - [120224, 'A'], - [120328, 'A'], - [120380, 'A'], - [120432, 'A'], - [127344, 'A'], - [127312, 'A'], - [127280, 'A'], - [127248, 'A'] - ) - for char_range in char_ranges: - range_start = char_range[0] - range_end = range_start + 26 - offset = char_range[1] - text = _standardize_text_range(text, range_start, range_end, offset) - - return uninvert_text(text) - - def remove_eol(line: str) -> str: """Removes line ending characters """ @@ -4515,69 +4363,6 @@ def get_json_content_from_accept(accept: str) -> str: return protocol_str -def remove_inverted_text(text: str, system_language: str) -> str: - """Removes any inverted text from the given string - """ - if system_language != 'en': - return text - - text = uninvert_text(text) - - inverted_lower = [*"_ʎ_ʍʌ_ʇ_ɹ____ɯʃʞɾıɥƃɟǝ_ɔ_ɐ"] - inverted_upper = [*"_⅄__ᴧ∩⊥_ᴚΌԀ_ᴎ_⅂⋊ſ__⅁ℲƎ◖Ↄ𐐒∀"] - - start_separator = '' - separator = '\n' - if '' in text: - text = text.replace('', '') - start_separator = '
' - separator = '
' - paragraphs = text.split(separator) - new_text = '' - inverted_list = (inverted_lower, inverted_upper) - z_value = (ord('z'), ord('Z')) - for para in paragraphs: - replaced_chars = 0 - - for idx in range(2): - index = 0 - for test_ch in inverted_list[idx]: - if test_ch == '_': - index += 1 - continue - if test_ch in para: - para = para.replace(test_ch, chr(z_value[idx] - index)) - replaced_chars += 1 - index += 1 - - if replaced_chars > 2: - para = para[::-1] - if para: - new_text += start_separator + para - if separator in text: - new_text += separator - - return new_text - - -def remove_square_capitals(text: str, system_language: str) -> str: - """Removes any square capital text from the given string - """ - if system_language != 'en': - return text - offset = ord('A') - start_value = ord('🅰') - end_value = start_value + 26 - result = '' - for text_ch in text: - text_value = ord(text_ch) - if text_value < start_value or text_value > end_value: - result += text_ch - else: - result += chr(offset + text_value - start_value) - return result - - def dont_speak_hashtags(content: str) -> str: """Ensure that hashtags aren't announced by screen readers """ diff --git a/webapp_profile.py b/webapp_profile.py index 1b56b0f63..bfe817c51 100644 --- a/webapp_profile.py +++ b/webapp_profile.py @@ -19,11 +19,12 @@ from flags import is_premium_account from status import actor_status_expired from status import get_actor_status from textmode import text_mode_removals +from unicodetext import uninvert_text +from unicodetext import standardize_text from utils import get_person_icon from utils import replace_strings from utils import data_dir from utils import time_days_ago -from utils import uninvert_text from utils import get_attributed_to from utils import get_url_from_post from utils import get_memorials @@ -31,7 +32,6 @@ from utils import text_in_file from utils import dangerous_markup from utils import ap_proxy_type from utils import remove_id_ending -from utils import standardize_text from utils import get_display_name from utils import has_object_dict from utils import get_occupation_name