epicyon/unicodetext.py

__filename__ = "unicodetext.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.6.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"

# functions which deal with fancy unicode text characters.
# Such text is "clever", but fucks up screen readers and accessibility
# in general


def uninvert_text(text: str) -> str:
    """Converts upside-down (inverted) unicode text back to ordinary text

    Inverted text is written using look-alike glyphs in reverse order.
    If more than half of the characters within the text are known
    inverted glyphs then each such glyph is mapped back to its ordinary
    character, the character order is reversed, and brackets and the
    digits 6 and 9 are swapped for their mirror images.
    Otherwise the text is returned unchanged.
    Text shorter than 4 characters is always returned unchanged.
    """
    if len(text) < 4:
        return text

    # ordinary character -> its inverted look-alike
    flip_table = {
        '\u0021': '\u00A1',
        '\u0022': '\u201E',
        '\u0026': '\u214B',
        '\u002E': '\u02D9',
        '\u0033': '\u0190',
        '\u0034': '\u152D',
        '\u0037': '\u2C62',
        '\u003B': '\u061B',
        '\u003F': '\u00BF',
        '\u0041': '\u2200',
        # NOTE: code points above U+FFFF need the 8 digit \U escape.
        # '\u10412' would be the two characters '\u1041' and '2'
        '\u0042': '\U00010412',
        '\u0043': '\u2183',
        '\u0044': '\u25D6',
        '\u0045': '\u018E',
        '\u0046': '\u2132',
        '\u0047': '\u2141',
        '\u004A': '\u017F',
        '\u004B': '\u22CA',
        '\u004C': '\u2142',
        '\u004D': '\u0057',
        '\u004E': '\u1D0E',
        '\u0050': '\u0500',
        '\u0051': '\u038C',
        '\u0052': '\u1D1A',
        '\u0054': '\u22A5',
        '\u0055': '\u2229',
        '\u0056': '\u1D27',
        '\u0059': '\u2144',
        '\u005F': '\u203E',
        '\u0061': '\u0250',
        '\u0062': '\u0071',
        '\u0063': '\u0254',
        '\u0064': '\u0070',
        '\u0065': '\u01DD',
        '\u0066': '\u025F',
        '\u0067': '\u0183',
        '\u0068': '\u0265',
        '\u0069': '\u0131',
        '\u006A': '\u027E',
        '\u006B': '\u029E',
        '\u006C': '\u0283',
        '\u006D': '\u026F',
        '\u006E': '\u0075',
        '\u0072': '\u0279',
        '\u0074': '\u0287',
        '\u0076': '\u028C',
        '\u0077': '\u028D',
        '\u0079': '\u028E',
        '\u203F': '\u2040',
        '\u2234': '\u2235'
    }
    # inverted look-alike -> ordinary character.
    # Every inverted glyph appears only once within flip_table, so the
    # reversed mapping is unambiguous
    unflip_table = {ch_inv: ch1 for ch1, ch_inv in flip_table.items()}

    # Some ordinary letters (p, q, u, W) are also the inverted forms of
    # other letters, so they count as matches here. The "more than half"
    # threshold is what stops ordinary text from being altered
    matches = sum(1 for ch_test in text if ch_test in unflip_table)
    if matches <= len(text) / 2:
        return text

    # characters which are their own mirror image pair when inverted
    extra_replace = {
        '[': ']',
        ']': '[',
        '(': ')',
        ')': '(',
        '<': '>',
        '>': '<',
        '9': '6',
        '6': '9'
    }

    result = []
    # inverted text reads back to front, so walk it in reverse
    for ch_test in reversed(text):
        ch_result = unflip_table.get(ch_test, ch_test)
        result.append(extra_replace.get(ch_result, ch_result))
    return ''.join(result)


def remove_inverted_text(text: str, system_language: str) -> str:
    """Returns the given text with inverted (upside-down) characters
    replaced by ordinary letters.
    This only applies when the system language is English, otherwise
    the text is returned unaltered.
    The text is first passed through uninvert_text, then handled one
    paragraph at a time, where paragraphs are delimited by </p> if
    that is present and by newlines otherwise. Empty paragraphs are
    not included within the returned text.
    """
    if system_language != 'en':
        return text

    text = uninvert_text(text)

    # Each table lists inverted glyphs in reverse alphabetical order,
    # so position 0 corresponds to z/Z and position 25 to a/A.
    # An underscore marks a letter with no inverted glyph
    glyph_tables = (
        (ord('z'), "_ʎ_ʍʌ_ʇ_ɹ____ɯʃʞɾıɥƃɟǝ_ɔ_ɐ"),
        (ord('Z'), "_⅄__ᴧ∩⊥_ᴚΌԀ_ᴎ_⅂⋊ſ__⅁ℲƎ◖Ↄ𐐒∀")
    )

    if '</p>' in text:
        # html: drop the opening tags and re-add them when rebuilding
        text = text.replace('<p>', '')
        para_prefix = '<p>'
        delimiter = '</p>'
    else:
        para_prefix = ''
        delimiter = '\n'

    # the delimiter is only re-added if the text contained one
    has_delimiter = delimiter in text

    pieces = []
    for para in text.split(delimiter):
        # number of distinct inverted glyphs found in this paragraph
        glyphs_found = 0
        for last_letter, glyphs in glyph_tables:
            for position, glyph in enumerate(glyphs):
                if glyph == '_' or glyph not in para:
                    continue
                para = para.replace(glyph, chr(last_letter - position))
                glyphs_found += 1

        # more than two distinct inverted glyphs is taken to mean that
        # the paragraph was written upside-down, hence back to front
        if glyphs_found > 2:
            para = para[::-1]

        if not para:
            continue
        pieces.append(para_prefix + para)
        if has_delimiter:
            pieces.append(delimiter)

    return ''.join(pieces)


def remove_square_capitals(text: str, system_language: str) -> str:
    """Replaces any square capital letters (🅰 to 🆉) within the given
    text with the ordinary capital letters A to Z.
    This only applies when the system language is English, otherwise
    the text is returned unaltered. All other characters are kept as
    they are.
    """
    if system_language != 'en':
        return text
    offset = ord('A')
    start_value = ord('🅰')
    # Inclusive code point of the final letter (🆉). There are 26
    # letters, so this is start + 25. Using start + 26 would also
    # convert the following, unrelated, character into '['
    end_value = start_value + 25
    return ''.join(
        chr(offset + ord(text_ch) - start_value)
        if start_value <= ord(text_ch) <= end_value
        else text_ch
        for text_ch in text
    )


def _standardize_text_range(text: str,
                            range_start: int, range_end: int,
                            offset: str) -> str:
    """Convert any fancy characters within the given range into ordinary ones
    """
    offset = ord(offset)
    ctr = 0
    text = list(text)
    while ctr < len(text):
        val = ord(text[ctr])
        if val in range(range_start, range_end):
            text[ctr] = chr(val - range_start + offset)
        ctr += 1
    return "".join(text)


def standardize_text(text: str) -> str:
    """Returns the text with fancy unicode letters replaced by their
    ordinary a-z or A-Z equivalents, and with any inverted text
    uninverted. Empty text is returned as it is.
    """
    if not text:
        return text

    alphabet_length = 26

    # code point of the first letter of each fancy lower case alphabet
    lower_case_starts = (
        65345, 119886, 119990, 120042, 120094,
        120146, 120198, 120302, 120354, 120406
    )
    # code point of the first letter of each fancy upper case alphabet
    upper_case_starts = (
        65313, 119912, 119964, 120016, 120068,
        120120, 120172, 120224, 120328, 120380,
        120432, 127344, 127312, 127280, 127248
    )

    alphabets = (
        ('a', lower_case_starts),
        ('A', upper_case_starts)
    )
    for first_letter, alphabet_starts in alphabets:
        for alphabet_start in alphabet_starts:
            text = _standardize_text_range(text, alphabet_start,
                                           alphabet_start + alphabet_length,
                                           first_letter)

    return uninvert_text(text)