epicyon/unicodetext.py

229 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Module metadata: file name, authorship, license, version and status
__filename__ = "unicodetext.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.6.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
# Functions which deal with fancy unicode text characters.
# Such text is "clever", but breaks screen readers and harms accessibility
# in general
def uninvert_text(text: str) -> str:
    """Returns the given text with upside-down ("flipped") characters
    converted back into ordinary ones.

    Every character which is a known inverted glyph counts as a match.
    If more than half of the characters match then the text is assumed
    to be inverted: each glyph is swapped for its ordinary equivalent,
    the character order is reversed, and brackets, angle brackets and
    the digits 6 and 9 are mirrored. Otherwise the text is returned
    unchanged. Text shorter than four characters is always returned
    unchanged.
    """
    if len(text) < 4:
        return text

    # ordinary character -> its upside-down equivalent
    flip_table = {
        '\u0021': '\u00A1',
        '\u0022': '\u201E',
        '\u0026': '\u214B',
        '\u002E': '\u02D9',
        '\u0033': '\u0190',
        '\u0034': '\u152D',
        '\u0037': '\u2C62',
        '\u003B': '\u061B',
        '\u003F': '\u00BF',
        '\u0041': '\u2200',
        # U+10412 lies outside the basic multilingual plane, so it needs
        # the eight digit \U escape. '\u10412' would be the two character
        # string '\u1041' + '2', which can never equal a single character.
        '\u0042': '\U00010412',
        '\u0043': '\u2183',
        '\u0044': '\u25D6',
        '\u0045': '\u018E',
        '\u0046': '\u2132',
        '\u0047': '\u2141',
        '\u004A': '\u017F',
        '\u004B': '\u22CA',
        '\u004C': '\u2142',
        '\u004D': '\u0057',
        '\u004E': '\u1D0E',
        '\u0050': '\u0500',
        '\u0051': '\u038C',
        '\u0052': '\u1D1A',
        '\u0054': '\u22A5',
        '\u0055': '\u2229',
        '\u0056': '\u1D27',
        '\u0059': '\u2144',
        '\u005F': '\u203E',
        '\u0061': '\u0250',
        '\u0062': '\u0071',
        '\u0063': '\u0254',
        '\u0064': '\u0070',
        '\u0065': '\u01DD',
        '\u0066': '\u025F',
        '\u0067': '\u0183',
        '\u0068': '\u0265',
        '\u0069': '\u0131',
        '\u006A': '\u027E',
        '\u006B': '\u029E',
        '\u006C': '\u0283',
        '\u006D': '\u026F',
        '\u006E': '\u0075',
        '\u0072': '\u0279',
        '\u0074': '\u0287',
        '\u0076': '\u028C',
        '\u0077': '\u028D',
        '\u0079': '\u028E',
        '\u203F': '\u2040',
        '\u2234': '\u2235'
    }
    # Reverse lookup (inverted glyph -> ordinary character). Each inverted
    # glyph appears only once in the table, so nothing is lost.
    unflip_table = {ch_inv: ch1 for ch1, ch_inv in flip_table.items()}

    matches = sum(1 for ch_test in text if ch_test in unflip_table)
    if matches <= len(text) / 2:
        # not enough inverted glyphs to treat this as inverted text
        return text

    # characters which look like each other when turned upside down
    extra_replace = {
        '[': ']',
        ']': '[',
        '(': ')',
        ')': '(',
        '<': '>',
        '>': '<',
        '9': '6',
        '6': '9'
    }
    uninverted = []
    # inverted text reads backwards, so walk it from the end
    for ch_test in reversed(text):
        ch_result = unflip_table.get(ch_test, ch_test)
        uninverted.append(extra_replace.get(ch_result, ch_result))
    return ''.join(uninverted)
def remove_inverted_text(text: str, system_language: str) -> str:
    """Replaces upside-down characters within the given text with
    ordinary letters. Only applied when the system language is 'en',
    otherwise the text is returned untouched.

    The text is first passed through uninvert_text. It is then split
    into paragraphs, on '</p>' if that is present (any '<p>' is
    stripped and put back afterwards) or else on newlines. Within each
    paragraph any remaining inverted glyphs are replaced, and if more
    than two different glyphs were found the paragraph is also
    reversed. Empty paragraphs are dropped, and every kept paragraph is
    followed by the separator if the text contained one.
    """
    if system_language != 'en':
        return text

    text = uninvert_text(text)

    # Inverted alphabets listed from z down to a, and from Z down to A.
    # An underscore means that there is no inverted glyph for that letter
    inverted_lower = "_ʎ_ʍʌ_ʇ_ɹ____ɯʃʞɾıɥƃɟǝ_ɔ_ɐ"
    inverted_upper = "_⅄__ᴧ∩⊥_ᴚΌԀ_ᴎ_⅂⋊ſ__⅁ℲƎ◖Ↄ𐐒∀"
    alphabets = (
        (inverted_lower, ord('z')),
        (inverted_upper, ord('Z'))
    )

    if '</p>' in text:
        text = text.replace('<p>', '')
        para_open, para_close = '<p>', '</p>'
    else:
        para_open, para_close = '', '\n'
    # the text does not change below, so this only needs checking once
    has_separator = para_close in text

    pieces = []
    for para in text.split(para_close):
        glyphs_found = 0
        for glyphs, last_letter in alphabets:
            for position, glyph in enumerate(glyphs):
                if glyph == '_' or glyph not in para:
                    continue
                # position counts backwards from the end of the alphabet
                para = para.replace(glyph, chr(last_letter - position))
                glyphs_found += 1
        if glyphs_found > 2:
            # enough inverted glyphs that the paragraph reads backwards
            para = para[::-1]
        if not para:
            continue
        pieces.append(para_open + para)
        if has_separator:
            pieces.append(para_close)
    return ''.join(pieces)
def remove_square_capitals(text: str, system_language: str) -> str:
    """Replaces any negative squared capital letters within the given
    text with ordinary capitals. Only applied when the system language
    is 'en', otherwise the text is returned untouched.
    """
    if system_language != 'en':
        return text
    offset = ord('A')
    start_value = ord('🅰')
    # The alphabet is 26 consecutive code points, so the last letter is
    # 25 after the first. Including one more would turn the following
    # code point into '[', which is the character after 'Z'.
    end_value = start_value + 25
    return ''.join(
        chr(offset + ord(text_ch) - start_value)
        if start_value <= ord(text_ch) <= end_value
        else text_ch
        for text_ch in text
    )
def _standardize_text_range(text: str,
                            range_start: int, range_end: int,
                            offset: str) -> str:
    """Maps every character whose code point is at or above range_start
    and below range_end onto ordinary characters, with range_start
    becoming the offset character. Anything else is left alone.
    """
    base_value = ord(offset)
    converted = []
    for char in text:
        code_point = ord(char)
        if range_start <= code_point < range_end:
            converted.append(chr(code_point - range_start + base_value))
        else:
            converted.append(char)
    return "".join(converted)
def standardize_text(text: str) -> str:
    """Returns the given text with fancy unicode letters replaced by
    ordinary ones, and with any inverted text uninverted.
    Empty text is returned as it is.
    """
    if not text:
        return text

    # Each value is the code point at which a run of 26 fancy letters
    # begins. The first group map onto a..z and the second onto A..Z
    lowercase_starts = (
        65345, 119886, 119990, 120042, 120094,
        120146, 120198, 120302, 120354, 120406
    )
    uppercase_starts = (
        65313, 119912, 119964, 120016, 120068,
        120120, 120172, 120224, 120328, 120380,
        120432, 127344, 127312, 127280, 127248
    )
    letter_groups = (
        ('a', lowercase_starts),
        ('A', uppercase_starts)
    )
    for first_letter, starts in letter_groups:
        for range_start in starts:
            text = _standardize_text_range(text, range_start,
                                           range_start + 26,
                                           first_letter)
    return uninvert_text(text)