fancy unicode text functions in their own module

2025-05-27 12:14:15 +01:00 · 2025-05-27 12:14:15 +01:00 · c8cb2d74ee
parent 54adba4575
commit c8cb2d74ee
6 changed files with 239 additions and 226 deletions
--- a/filters.py
+++ b/filters.py
@ -13,9 +13,9 @@ from utils import data_dir
 from utils import acct_dir
 from utils import text_in_file
 from utils import remove_eol
-from utils import standardize_text
-from utils import remove_inverted_text
-from utils import remove_square_capitals
+from unicodetext import standardize_text
+from unicodetext import remove_inverted_text
+from unicodetext import remove_square_capitals


 def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:
--- a/status.py
+++ b/status.py
@ -12,7 +12,7 @@ __accounts_data_path_tests__ = False
 from utils import date_utcnow
 from utils import date_from_string_format
 from utils import remove_html
-from utils import standardize_text
+from unicodetext import standardize_text

 MAX_STATUS_LENGTH = 100

--- a/tests.py
+++ b/tests.py
@ -64,21 +64,21 @@ from flags import is_group_account
 from flags import is_right_to_left_text
 from status import actor_status_expired
 from status import get_actor_status
+from unicodetext import uninvert_text
 from utils import replace_strings
 from utils import valid_content_warning
 from utils import data_dir
 from utils import data_dir_testing
 from utils import remove_link_tracking
-from utils import uninvert_text
 from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import date_utcnow
 from utils import remove_markup_tag
 from utils import remove_style_within_html
 from utils import html_tag_has_closing
-from utils import remove_inverted_text
-from utils import remove_square_capitals
-from utils import standardize_text
+from unicodetext import remove_inverted_text
+from unicodetext import remove_square_capitals
+from unicodetext import standardize_text
 from utils import remove_eol
 from utils import text_in_file
 from utils import convert_published_to_local_timezone
--- a/unicodetext.py
+++ b/unicodetext.py
@ -0,0 +1,228 @@
+__filename__ = "unicodetext.py"
+__author__ = "Bob Mottram"
+__license__ = "AGPL3+"
+__version__ = "1.6.0"
+__maintainer__ = "Bob Mottram"
+__email__ = "bob@libreserver.org"
+__status__ = "Production"
+__module_group__ = "Core"
+
+# functions which deal with fancy unicode text characters.
+# Such text is "clever", but breaks screen readers and accessibility
+# in general
+
+
+def uninvert_text(text: str) -> str:
+    """Returns the text the right way up if it looks upside-down
+
+    Text is treated as inverted when more than half of its characters
+    are flipped glyphs from the table below. It is then reversed, the
+    glyphs are mapped back to ordinary characters, and brackets and
+    the digits 6 and 9 are mirrored.
+    Any other text, including text shorter than four characters, is
+    returned unchanged
+    """
+    if len(text) < 4:
+        return text
+
+    # ordinary character -> its upside-down look-alike
+    flip_table = {
+        '\u0021': '\u00A1',
+        '\u0022': '\u201E',
+        '\u0026': '\u214B',
+        '\u002E': '\u02D9',
+        '\u0033': '\u0190',
+        '\u0034': '\u152D',
+        '\u0037': '\u2C62',
+        '\u003B': '\u061B',
+        '\u003F': '\u00BF',
+        '\u0041': '\u2200',
+        # code points above U+FFFF need the eight digit \U escape;
+        # '\u10412' would be the two characters U+1041 and '2'
+        '\u0042': '\U00010412',
+        '\u0043': '\u2183',
+        '\u0044': '\u25D6',
+        '\u0045': '\u018E',
+        '\u0046': '\u2132',
+        '\u0047': '\u2141',
+        '\u004A': '\u017F',
+        '\u004B': '\u22CA',
+        '\u004C': '\u2142',
+        '\u004D': '\u0057',
+        '\u004E': '\u1D0E',
+        '\u0050': '\u0500',
+        '\u0051': '\u038C',
+        '\u0052': '\u1D1A',
+        '\u0054': '\u22A5',
+        '\u0055': '\u2229',
+        '\u0056': '\u1D27',
+        '\u0059': '\u2144',
+        '\u005F': '\u203E',
+        '\u0061': '\u0250',
+        '\u0062': '\u0071',
+        '\u0063': '\u0254',
+        '\u0064': '\u0070',
+        '\u0065': '\u01DD',
+        '\u0066': '\u025F',
+        '\u0067': '\u0183',
+        '\u0068': '\u0265',
+        '\u0069': '\u0131',
+        '\u006A': '\u027E',
+        '\u006B': '\u029E',
+        '\u006C': '\u0283',
+        '\u006D': '\u026F',
+        '\u006E': '\u0075',
+        '\u0072': '\u0279',
+        '\u0074': '\u0287',
+        '\u0076': '\u028C',
+        '\u0077': '\u028D',
+        '\u0079': '\u028E',
+        '\u203F': '\u2040',
+        '\u2234': '\u2235'
+    }
+
+    # each look-alike appears once, so the table can be reversed
+    unflip = {ch_inv: ch1 for ch1, ch_inv in flip_table.items()}
+
+    matches = sum(1 for ch_test in text if ch_test in unflip)
+    if matches <= len(text)/2:
+        return text
+
+    # inverted text reads right to left, so reverse it while mapping
+    result = ''.join(unflip.get(ch_test, ch_test)
+                     for ch_test in reversed(text))
+
+    # mirror the characters which are valid either way up
+    extra_replace = {
+        '[': ']',
+        ']': '[',
+        '(': ')',
+        ')': '(',
+        '<': '>',
+        '>': '<',
+        '9': '6',
+        '6': '9'
+    }
+    result = ''.join(extra_replace.get(ch1, ch1) for ch1 in result)
+    return result
+
+
+def remove_inverted_text(text: str, system_language: str) -> str:
+    """Replaces upside-down text within each paragraph of the given
+    string with ordinary letters. Only applies to English
+    """
+    if system_language != 'en':
+        return text
+
+    text = uninvert_text(text)
+
+    # inverted glyphs listed from z down to a, with _ where none exists
+    inverted_lower = "_ʎ_ʍʌ_ʇ_ɹ____ɯʃʞɾıɥƃɟǝ_ɔ_ɐ"
+    inverted_upper = "_⅄__ᴧ∩⊥_ᴚΌԀ_ᴎ_⅂⋊ſ__⅁ℲƎ◖Ↄ𐐒∀"
+    upright = {}
+    for glyphs, last_letter in ((inverted_lower, 'z'),
+                                (inverted_upper, 'Z')):
+        for index, glyph in enumerate(glyphs):
+            if glyph != '_':
+                upright[ord(glyph)] = chr(ord(last_letter) - index)
+
+    if '</p>' in text:
+        text = text.replace('<p>', '')
+        start_separator = '<p>'
+        separator = '</p>'
+    else:
+        start_separator = ''
+        separator = '\n'
+    has_separator = separator in text
+
+    new_text = ''
+    for para in text.split(separator):
+        # count the distinct inverted glyphs used by this paragraph
+        replaced_chars = sum(1 for glyph in upright if chr(glyph) in para)
+        para = para.translate(upright)
+        # several distinct glyphs means the paragraph reads backwards too
+        if replaced_chars > 2:
+            para = para[::-1]
+        if not para:
+            continue
+        new_text += start_separator + para
+        if has_separator:
+            new_text += separator
+
+    return new_text
+
+
+def remove_square_capitals(text: str, system_language: str) -> str:
+    """Returns the given string with the 26 square capital letters
+    which begin at 🅰 replaced by the ordinary capitals A to Z.
+    Text is returned unchanged if the language is not English
+    """
+    if system_language != 'en':
+        return text
+    offset = ord('A')
+    start_value = ord('🅰')
+    # 26 letters, so the last one is start_value + 25. Going one further
+    # would turn the character after the square Z into '['
+    end_value = start_value + 25
+    return ''.join(
+        chr(offset + ord(text_ch) - start_value)
+        if start_value <= ord(text_ch) <= end_value else text_ch
+        for text_ch in text)
+
+
+def _standardize_text_range(text: str,
+                            range_start: int, range_end: int,
+                            offset: str) -> str:
+    """Maps each character whose code point lies between range_start
+    (inclusive) and range_end (exclusive) onto the run of characters
+    which begins with offset. Other characters are kept as they are
+    """
+    shift = ord(offset) - range_start
+    converted = []
+    for char in text:
+        if range_start <= ord(char) < range_end:
+            char = chr(ord(char) + shift)
+        converted.append(char)
+    return "".join(converted)
+
+
+def standardize_text(text: str) -> str:
+    """Replaces letters from fancy unicode alphabets with ordinary
+    ones, then passes the result through uninvert_text
+    """
+    if not text:
+        return text
+
+    # first code point of each 26 letter run and the letter it maps to
+    char_ranges = (
+        (65345, 'a'),
+        (119886, 'a'),
+        (119990, 'a'),
+        (120042, 'a'),
+        (120094, 'a'),
+        (120146, 'a'),
+        (120198, 'a'),
+        (120302, 'a'),
+        (120354, 'a'),
+        (120406, 'a'),
+        (65313, 'A'),
+        (119912, 'A'),
+        (119964, 'A'),
+        (120016, 'A'),
+        (120068, 'A'),
+        (120120, 'A'),
+        (120172, 'A'),
+        (120224, 'A'),
+        (120328, 'A'),
+        (120380, 'A'),
+        (120432, 'A'),
+        (127344, 'A'),
+        (127312, 'A'),
+        (127280, 'A'),
+        (127248, 'A')
+    )
+    for range_start, offset in char_ranges:
+        text = _standardize_text_range(text, range_start,
+                                       range_start + 26, offset)
+
+    return uninvert_text(text)
--- a/utils.py
+++ b/utils.py
@ -22,6 +22,7 @@ from dateutil.tz import tz
 from cryptography.hazmat.backends import default_backend
 from cryptography.hazmat.primitives import hashes
 from followingCalendar import add_person_to_calendar
+from unicodetext import standardize_text

 VALID_HASHTAG_CHARS = \
    set('_0123456789' +
@ -189,159 +190,6 @@ def get_attributed_to(field) -> str:
    return None


-def uninvert_text(text: str) -> str:
-    """uninverts inverted text
-    """
-    if len(text) < 4:
-        return text
-
-    flip_table = {
-        '\u0021': '\u00A1',
-        '\u0022': '\u201E',
-        '\u0026': '\u214B',
-        '\u002E': '\u02D9',
-        '\u0033': '\u0190',
-        '\u0034': '\u152D',
-        '\u0037': '\u2C62',
-        '\u003B': '\u061B',
-        '\u003F': '\u00BF',
-        '\u0041': '\u2200',
-        '\u0042': '\u10412',
-        '\u0043': '\u2183',
-        '\u0044': '\u25D6',
-        '\u0045': '\u018E',
-        '\u0046': '\u2132',
-        '\u0047': '\u2141',
-        '\u004A': '\u017F',
-        '\u004B': '\u22CA',
-        '\u004C': '\u2142',
-        '\u004D': '\u0057',
-        '\u004E': '\u1D0E',
-        '\u0050': '\u0500',
-        '\u0051': '\u038C',
-        '\u0052': '\u1D1A',
-        '\u0054': '\u22A5',
-        '\u0055': '\u2229',
-        '\u0056': '\u1D27',
-        '\u0059': '\u2144',
-        '\u005F': '\u203E',
-        '\u0061': '\u0250',
-        '\u0062': '\u0071',
-        '\u0063': '\u0254',
-        '\u0064': '\u0070',
-        '\u0065': '\u01DD',
-        '\u0066': '\u025F',
-        '\u0067': '\u0183',
-        '\u0068': '\u0265',
-        '\u0069': '\u0131',
-        '\u006A': '\u027E',
-        '\u006B': '\u029E',
-        '\u006C': '\u0283',
-        '\u006D': '\u026F',
-        '\u006E': '\u0075',
-        '\u0072': '\u0279',
-        '\u0074': '\u0287',
-        '\u0076': '\u028C',
-        '\u0077': '\u028D',
-        '\u0079': '\u028E',
-        '\u203F': '\u2040',
-        '\u2234': '\u2235'
-    }
-
-    matches = 0
-    possible_result = ''
-    for ch_test in text:
-        ch_result = ch_test
-        for ch1, ch_inv in flip_table.items():
-            if ch_test == ch_inv:
-                matches += 1
-                ch_result = ch1
-                break
-        possible_result = ch_result + possible_result
-
-    result = text
-    if matches > len(text)/2:
-        result = possible_result
-        new_result = ''
-        extra_replace = {
-            '[': ']',
-            ']': '[',
-            '(': ')',
-            ')': '(',
-            '<': '>',
-            '>': '<',
-            '9': '6',
-            '6': '9'
-        }
-        for ch1 in result:
-            ch_result = ch1
-            for ch2, rep in extra_replace.items():
-                if ch1 == ch2:
-                    ch_result = rep
-                    break
-            new_result += ch_result
-        result = new_result
-    return result
-
-
-def _standardize_text_range(text: str,
-                            range_start: int, range_end: int,
-                            offset: str) -> str:
-    """Convert any fancy characters within the given range into ordinary ones
-    """
-    offset = ord(offset)
-    ctr = 0
-    text = list(text)
-    while ctr < len(text):
-        val = ord(text[ctr])
-        if val in range(range_start, range_end):
-            text[ctr] = chr(val - range_start + offset)
-        ctr += 1
-    return "".join(text)
-
-
-def standardize_text(text: str) -> str:
-    """Converts fancy unicode text to ordinary letters
-    """
-    if not text:
-        return text
-
-    char_ranges = (
-        [65345, 'a'],
-        [119886, 'a'],
-        [119990, 'a'],
-        [120042, 'a'],
-        [120094, 'a'],
-        [120146, 'a'],
-        [120198, 'a'],
-        [120302, 'a'],
-        [120354, 'a'],
-        [120406, 'a'],
-        [65313, 'A'],
-        [119912, 'A'],
-        [119964, 'A'],
-        [120016, 'A'],
-        [120068, 'A'],
-        [120120, 'A'],
-        [120172, 'A'],
-        [120224, 'A'],
-        [120328, 'A'],
-        [120380, 'A'],
-        [120432, 'A'],
-        [127344, 'A'],
-        [127312, 'A'],
-        [127280, 'A'],
-        [127248, 'A']
-    )
-    for char_range in char_ranges:
-        range_start = char_range[0]
-        range_end = range_start + 26
-        offset = char_range[1]
-        text = _standardize_text_range(text, range_start, range_end, offset)
-
-    return uninvert_text(text)
-
-
 def remove_eol(line: str) -> str:
    """Removes line ending characters
    """
@ -4515,69 +4363,6 @@ def get_json_content_from_accept(accept: str) -> str:
    return protocol_str


-def remove_inverted_text(text: str, system_language: str) -> str:
-    """Removes any inverted text from the given string
-    """
-    if system_language != 'en':
-        return text
-
-    text = uninvert_text(text)
-
-    inverted_lower = [*"_ʎ_ʍʌ_ʇ_ɹ____ɯʃʞɾıɥƃɟǝ_ɔ_ɐ"]
-    inverted_upper = [*"_⅄__ᴧ∩⊥_ᴚΌԀ_ᴎ_⅂⋊ſ__⅁ℲƎ◖Ↄ𐐒∀"]
-
-    start_separator = ''
-    separator = '\n'
-    if '</p>' in text:
-        text = text.replace('<p>', '')
-        start_separator = '<p>'
-        separator = '</p>'
-    paragraphs = text.split(separator)
-    new_text = ''
-    inverted_list = (inverted_lower, inverted_upper)
-    z_value = (ord('z'), ord('Z'))
-    for para in paragraphs:
-        replaced_chars = 0
-
-        for idx in range(2):
-            index = 0
-            for test_ch in inverted_list[idx]:
-                if test_ch == '_':
-                    index += 1
-                    continue
-                if test_ch in para:
-                    para = para.replace(test_ch, chr(z_value[idx] - index))
-                    replaced_chars += 1
-                index += 1
-
-        if replaced_chars > 2:
-            para = para[::-1]
-        if para:
-            new_text += start_separator + para
-            if separator in text:
-                new_text += separator
-
-    return new_text
-
-
-def remove_square_capitals(text: str, system_language: str) -> str:
-    """Removes any square capital text from the given string
-    """
-    if system_language != 'en':
-        return text
-    offset = ord('A')
-    start_value = ord('🅰')
-    end_value = start_value + 26
-    result = ''
-    for text_ch in text:
-        text_value = ord(text_ch)
-        if text_value < start_value or text_value > end_value:
-            result += text_ch
-        else:
-            result += chr(offset + text_value - start_value)
-    return result
-
-
 def dont_speak_hashtags(content: str) -> str:
    """Ensure that hashtags aren't announced by screen readers
    """
--- a/webapp_profile.py
+++ b/webapp_profile.py
@ -19,11 +19,12 @@ from flags import is_premium_account
 from status import actor_status_expired
 from status import get_actor_status
 from textmode import text_mode_removals
+from unicodetext import uninvert_text
+from unicodetext import standardize_text
 from utils import get_person_icon
 from utils import replace_strings
 from utils import data_dir
 from utils import time_days_ago
-from utils import uninvert_text
 from utils import get_attributed_to
 from utils import get_url_from_post
 from utils import get_memorials
@ -31,7 +32,6 @@ from utils import text_in_file
 from utils import dangerous_markup
 from utils import ap_proxy_type
 from utils import remove_id_ending
-from utils import standardize_text
 from utils import get_display_name
 from utils import has_object_dict
 from utils import get_occupation_name