mirror of https://gitlab.com/bashrc2/epicyon
fancy unicode text functions in their own module
parent
54adba4575
commit
c8cb2d74ee
|
@ -13,9 +13,9 @@ from utils import data_dir
|
|||
from utils import acct_dir
|
||||
from utils import text_in_file
|
||||
from utils import remove_eol
|
||||
from utils import standardize_text
|
||||
from utils import remove_inverted_text
|
||||
from utils import remove_square_capitals
|
||||
from unicodetext import standardize_text
|
||||
from unicodetext import remove_inverted_text
|
||||
from unicodetext import remove_square_capitals
|
||||
|
||||
|
||||
def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:
|
||||
|
|
|
@ -12,7 +12,7 @@ __accounts_data_path_tests__ = False
|
|||
from utils import date_utcnow
|
||||
from utils import date_from_string_format
|
||||
from utils import remove_html
|
||||
from utils import standardize_text
|
||||
from unicodetext import standardize_text
|
||||
|
||||
MAX_STATUS_LENGTH = 100
|
||||
|
||||
|
|
8
tests.py
8
tests.py
|
@ -64,21 +64,21 @@ from flags import is_group_account
|
|||
from flags import is_right_to_left_text
|
||||
from status import actor_status_expired
|
||||
from status import get_actor_status
|
||||
from unicodetext import uninvert_text
|
||||
from utils import replace_strings
|
||||
from utils import valid_content_warning
|
||||
from utils import data_dir
|
||||
from utils import data_dir_testing
|
||||
from utils import remove_link_tracking
|
||||
from utils import uninvert_text
|
||||
from utils import get_url_from_post
|
||||
from utils import date_from_string_format
|
||||
from utils import date_utcnow
|
||||
from utils import remove_markup_tag
|
||||
from utils import remove_style_within_html
|
||||
from utils import html_tag_has_closing
|
||||
from utils import remove_inverted_text
|
||||
from utils import remove_square_capitals
|
||||
from utils import standardize_text
|
||||
from unicodetext import remove_inverted_text
|
||||
from unicodetext import remove_square_capitals
|
||||
from unicodetext import standardize_text
|
||||
from utils import remove_eol
|
||||
from utils import text_in_file
|
||||
from utils import convert_published_to_local_timezone
|
||||
|
|
|
@ -0,0 +1,228 @@
|
|||
__filename__ = "unicodetext.py"
|
||||
__author__ = "Bob Mottram"
|
||||
__license__ = "AGPL3+"
|
||||
__version__ = "1.6.0"
|
||||
__maintainer__ = "Bob Mottram"
|
||||
__email__ = "bob@libreserver.org"
|
||||
__status__ = "Production"
|
||||
__module_group__ = "Core"
|
||||
|
||||
# functions which deal with fancy unicode text characters.
|
||||
# Such text is "clever", but fucks up screen readers and accessibility
|
||||
# in general
|
||||
|
||||
|
||||
def uninvert_text(text: str) -> str:
    """Returns the given text with upside-down lettering turned upright

    Text shorter than four characters is returned unchanged. Longer text
    is only rewritten when more than half of its characters are known
    upside-down lookalikes, otherwise it is returned unchanged.
    When rewritten, the character order is reversed, each lookalike is
    replaced by its upright character, and brackets and the digits 6/9
    are swapped for their mirrored partner.
    """
    if len(text) < 4:
        return text

    # upright character -> its upside-down lookalike.
    # Some lookalikes are ordinary letters (W, q, p, u), so those also
    # count towards the "more than half" threshold below.
    flip_table = {
        '\u0021': '\u00A1',
        '\u0022': '\u201E',
        '\u0026': '\u214B',
        '\u002E': '\u02D9',
        '\u0033': '\u0190',
        '\u0034': '\u152D',
        '\u0037': '\u2C62',
        '\u003B': '\u061B',
        '\u003F': '\u00BF',
        '\u0041': '\u2200',
        # U+10412 is outside the BMP, so it needs the 8 digit \U escape.
        # '\u10412' would be the two characters U+1041 followed by '2'
        '\u0042': '\U00010412',
        '\u0043': '\u2183',
        '\u0044': '\u25D6',
        '\u0045': '\u018E',
        '\u0046': '\u2132',
        '\u0047': '\u2141',
        '\u004A': '\u017F',
        '\u004B': '\u22CA',
        '\u004C': '\u2142',
        '\u004D': '\u0057',
        '\u004E': '\u1D0E',
        '\u0050': '\u0500',
        '\u0051': '\u038C',
        '\u0052': '\u1D1A',
        '\u0054': '\u22A5',
        '\u0055': '\u2229',
        '\u0056': '\u1D27',
        '\u0059': '\u2144',
        '\u005F': '\u203E',
        '\u0061': '\u0250',
        '\u0062': '\u0071',
        '\u0063': '\u0254',
        '\u0064': '\u0070',
        '\u0065': '\u01DD',
        '\u0066': '\u025F',
        '\u0067': '\u0183',
        '\u0068': '\u0265',
        '\u0069': '\u0131',
        '\u006A': '\u027E',
        '\u006B': '\u029E',
        '\u006C': '\u0283',
        '\u006D': '\u026F',
        '\u006E': '\u0075',
        '\u0072': '\u0279',
        '\u0074': '\u0287',
        '\u0076': '\u028C',
        '\u0077': '\u028D',
        '\u0079': '\u028E',
        '\u203F': '\u2040',
        '\u2234': '\u2235'
    }
    # reverse lookup: upside-down lookalike -> upright character.
    # The lookalikes are all distinct, so nothing is lost by inverting
    upright_for = {
        inverted: upright for upright, inverted in flip_table.items()
    }

    matches = sum(1 for char in text if char in upright_for)
    if matches <= len(text) / 2:
        # not enough upside-down characters to treat this as inverted
        return text

    # upside-down text is written back to front, so restore the order
    # while substituting the upright characters
    uninverted = ''.join(upright_for.get(char, char)
                         for char in reversed(text))

    # reversing the order leaves these characters facing the wrong way
    mirrored = str.maketrans('[]()<>96', '][)(><69')
    return uninverted.translate(mirrored)
|
||||
|
||||
|
||||
def remove_inverted_text(text: str, system_language: str) -> str:
    """Replaces upside-down lettering within each paragraph of the text

    Only applied when the system language is 'en', otherwise the text
    is returned unchanged. Paragraphs are split on '</p>' when the text
    contains it, and on newlines otherwise. A paragraph in which more
    than two distinct upside-down glyphs were replaced is also reversed.
    Empty paragraphs are dropped from the result.
    """
    if system_language != 'en':
        return text

    text = uninvert_text(text)

    # Each alphabet is written from z down to a in upside-down glyphs,
    # with '_' standing in for letters that have no glyph here. It is
    # paired with the code point of the letter at position zero.
    glyph_alphabets = (
        ("_ʎ_ʍʌ_ʇ_ɹ____ɯʃʞɾıɥƃɟǝ_ɔ_ɐ", ord('z')),
        ("_⅄__ᴧ∩⊥_ᴚΌԀ_ᴎ_⅂⋊ſ__⅁ℲƎ◖Ↄ𐐒∀", ord('Z'))
    )

    if '</p>' in text:
        # html paragraphs: opening tags are stripped and added back later
        text = text.replace('<p>', '')
        para_prefix, para_end = '<p>', '</p>'
    else:
        para_prefix, para_end = '', '\n'
    text_has_para_end = para_end in text

    pieces = []
    for para in text.split(para_end):
        glyphs_replaced = 0
        for glyphs, last_letter in glyph_alphabets:
            for position, glyph in enumerate(glyphs):
                if glyph == '_' or glyph not in para:
                    continue
                para = para.replace(glyph, chr(last_letter - position))
                glyphs_replaced += 1

        if glyphs_replaced > 2:
            # enough upside-down glyphs to assume it was back to front
            para = para[::-1]
        if not para:
            continue
        pieces.append(para_prefix + para)
        if text_has_para_end:
            pieces.append(para_end)

    return ''.join(pieces)
|
||||
|
||||
|
||||
def remove_square_capitals(text: str, system_language: str) -> str:
    """Replaces any square capital letters in the text with plain capitals

    Only applied when the system language is 'en', otherwise the text
    is returned unchanged. The 26 characters starting at U+1F170
    (negative squared A) are mapped onto A-Z. Every other character,
    including the one just after squared Z, is kept as it is.
    """
    if system_language != 'en':
        return text

    plain_start = ord('A')
    square_start = ord('\U0001F170')
    # inclusive end of the alphabet: 26 letters means start + 25.
    # Using start + 26 would turn U+1F18A into '['
    square_end = square_start + 25

    return ''.join(
        chr(plain_start + ord(text_ch) - square_start)
        if square_start <= ord(text_ch) <= square_end else text_ch
        for text_ch in text
    )
|
||||
|
||||
|
||||
def _standardize_text_range(text: str,
|
||||
range_start: int, range_end: int,
|
||||
offset: str) -> str:
|
||||
"""Convert any fancy characters within the given range into ordinary ones
|
||||
"""
|
||||
offset = ord(offset)
|
||||
ctr = 0
|
||||
text = list(text)
|
||||
while ctr < len(text):
|
||||
val = ord(text[ctr])
|
||||
if val in range(range_start, range_end):
|
||||
text[ctr] = chr(val - range_start + offset)
|
||||
ctr += 1
|
||||
return "".join(text)
|
||||
|
||||
|
||||
def standardize_text(text: str) -> str:
    """Converts fancy unicode lettering into ordinary letters

    Each listed run of 26 fancy code points is mapped onto a-z or A-Z,
    then the result is passed through uninvert_text. Empty text is
    returned as it is.
    """
    if not text:
        return text

    alphabet_length = 26
    # first code point of each fancy alphabet that maps onto a-z
    lower_case_starts = (
        65345, 119886, 119990, 120042, 120094,
        120146, 120198, 120302, 120354, 120406
    )
    # first code point of each fancy alphabet that maps onto A-Z
    upper_case_starts = (
        65313, 119912, 119964, 120016, 120068,
        120120, 120172, 120224, 120328, 120380,
        120432, 127344, 127312, 127280, 127248
    )

    for first_letter, starts in (('a', lower_case_starts),
                                 ('A', upper_case_starts)):
        for range_start in starts:
            text = _standardize_text_range(text, range_start,
                                           range_start + alphabet_length,
                                           first_letter)

    return uninvert_text(text)
|
217
utils.py
217
utils.py
|
@ -22,6 +22,7 @@ from dateutil.tz import tz
|
|||
from cryptography.hazmat.backends import default_backend
|
||||
from cryptography.hazmat.primitives import hashes
|
||||
from followingCalendar import add_person_to_calendar
|
||||
from unicodetext import standardize_text
|
||||
|
||||
VALID_HASHTAG_CHARS = \
|
||||
set('_0123456789' +
|
||||
|
@ -189,159 +190,6 @@ def get_attributed_to(field) -> str:
|
|||
return None
|
||||
|
||||
|
||||
def uninvert_text(text: str) -> str:
    """Returns the given text with upside-down lettering turned upright

    Text shorter than four characters is returned unchanged. Longer text
    is only rewritten when more than half of its characters are known
    upside-down lookalikes, otherwise it is returned unchanged.
    When rewritten, the character order is reversed, each lookalike is
    replaced by its upright character, and brackets and the digits 6/9
    are swapped for their mirrored partner.
    """
    if len(text) < 4:
        return text

    # upright character -> its upside-down lookalike.
    # Some lookalikes are ordinary letters (W, q, p, u), so those also
    # count towards the "more than half" threshold below.
    flip_table = {
        '\u0021': '\u00A1',
        '\u0022': '\u201E',
        '\u0026': '\u214B',
        '\u002E': '\u02D9',
        '\u0033': '\u0190',
        '\u0034': '\u152D',
        '\u0037': '\u2C62',
        '\u003B': '\u061B',
        '\u003F': '\u00BF',
        '\u0041': '\u2200',
        # U+10412 is outside the BMP, so it needs the 8 digit \U escape.
        # '\u10412' would be the two characters U+1041 followed by '2'
        '\u0042': '\U00010412',
        '\u0043': '\u2183',
        '\u0044': '\u25D6',
        '\u0045': '\u018E',
        '\u0046': '\u2132',
        '\u0047': '\u2141',
        '\u004A': '\u017F',
        '\u004B': '\u22CA',
        '\u004C': '\u2142',
        '\u004D': '\u0057',
        '\u004E': '\u1D0E',
        '\u0050': '\u0500',
        '\u0051': '\u038C',
        '\u0052': '\u1D1A',
        '\u0054': '\u22A5',
        '\u0055': '\u2229',
        '\u0056': '\u1D27',
        '\u0059': '\u2144',
        '\u005F': '\u203E',
        '\u0061': '\u0250',
        '\u0062': '\u0071',
        '\u0063': '\u0254',
        '\u0064': '\u0070',
        '\u0065': '\u01DD',
        '\u0066': '\u025F',
        '\u0067': '\u0183',
        '\u0068': '\u0265',
        '\u0069': '\u0131',
        '\u006A': '\u027E',
        '\u006B': '\u029E',
        '\u006C': '\u0283',
        '\u006D': '\u026F',
        '\u006E': '\u0075',
        '\u0072': '\u0279',
        '\u0074': '\u0287',
        '\u0076': '\u028C',
        '\u0077': '\u028D',
        '\u0079': '\u028E',
        '\u203F': '\u2040',
        '\u2234': '\u2235'
    }
    # reverse lookup: upside-down lookalike -> upright character.
    # The lookalikes are all distinct, so nothing is lost by inverting
    upright_for = {
        inverted: upright for upright, inverted in flip_table.items()
    }

    matches = sum(1 for char in text if char in upright_for)
    if matches <= len(text) / 2:
        # not enough upside-down characters to treat this as inverted
        return text

    # upside-down text is written back to front, so restore the order
    # while substituting the upright characters
    uninverted = ''.join(upright_for.get(char, char)
                         for char in reversed(text))

    # reversing the order leaves these characters facing the wrong way
    mirrored = str.maketrans('[]()<>96', '][)(><69')
    return uninverted.translate(mirrored)
|
||||
|
||||
|
||||
def _standardize_text_range(text: str,
|
||||
range_start: int, range_end: int,
|
||||
offset: str) -> str:
|
||||
"""Convert any fancy characters within the given range into ordinary ones
|
||||
"""
|
||||
offset = ord(offset)
|
||||
ctr = 0
|
||||
text = list(text)
|
||||
while ctr < len(text):
|
||||
val = ord(text[ctr])
|
||||
if val in range(range_start, range_end):
|
||||
text[ctr] = chr(val - range_start + offset)
|
||||
ctr += 1
|
||||
return "".join(text)
|
||||
|
||||
|
||||
def standardize_text(text: str) -> str:
    """Converts fancy unicode lettering into ordinary letters

    Each listed run of 26 fancy code points is mapped onto a-z or A-Z,
    then the result is passed through uninvert_text. Empty text is
    returned as it is.
    """
    if not text:
        return text

    alphabet_length = 26
    # first code point of each fancy alphabet that maps onto a-z
    lower_case_starts = (
        65345, 119886, 119990, 120042, 120094,
        120146, 120198, 120302, 120354, 120406
    )
    # first code point of each fancy alphabet that maps onto A-Z
    upper_case_starts = (
        65313, 119912, 119964, 120016, 120068,
        120120, 120172, 120224, 120328, 120380,
        120432, 127344, 127312, 127280, 127248
    )

    for first_letter, starts in (('a', lower_case_starts),
                                 ('A', upper_case_starts)):
        for range_start in starts:
            text = _standardize_text_range(text, range_start,
                                           range_start + alphabet_length,
                                           first_letter)

    return uninvert_text(text)
|
||||
|
||||
|
||||
def remove_eol(line: str) -> str:
|
||||
"""Removes line ending characters
|
||||
"""
|
||||
|
@ -4515,69 +4363,6 @@ def get_json_content_from_accept(accept: str) -> str:
|
|||
return protocol_str
|
||||
|
||||
|
||||
def remove_inverted_text(text: str, system_language: str) -> str:
    """Replaces upside-down lettering within each paragraph of the text

    Only applied when the system language is 'en', otherwise the text
    is returned unchanged. Paragraphs are split on '</p>' when the text
    contains it, and on newlines otherwise. A paragraph in which more
    than two distinct upside-down glyphs were replaced is also reversed.
    Empty paragraphs are dropped from the result.
    """
    if system_language != 'en':
        return text

    text = uninvert_text(text)

    # Each alphabet is written from z down to a in upside-down glyphs,
    # with '_' standing in for letters that have no glyph here. It is
    # paired with the code point of the letter at position zero.
    glyph_alphabets = (
        ("_ʎ_ʍʌ_ʇ_ɹ____ɯʃʞɾıɥƃɟǝ_ɔ_ɐ", ord('z')),
        ("_⅄__ᴧ∩⊥_ᴚΌԀ_ᴎ_⅂⋊ſ__⅁ℲƎ◖Ↄ𐐒∀", ord('Z'))
    )

    if '</p>' in text:
        # html paragraphs: opening tags are stripped and added back later
        text = text.replace('<p>', '')
        para_prefix, para_end = '<p>', '</p>'
    else:
        para_prefix, para_end = '', '\n'
    text_has_para_end = para_end in text

    pieces = []
    for para in text.split(para_end):
        glyphs_replaced = 0
        for glyphs, last_letter in glyph_alphabets:
            for position, glyph in enumerate(glyphs):
                if glyph == '_' or glyph not in para:
                    continue
                para = para.replace(glyph, chr(last_letter - position))
                glyphs_replaced += 1

        if glyphs_replaced > 2:
            # enough upside-down glyphs to assume it was back to front
            para = para[::-1]
        if not para:
            continue
        pieces.append(para_prefix + para)
        if text_has_para_end:
            pieces.append(para_end)

    return ''.join(pieces)
|
||||
|
||||
|
||||
def remove_square_capitals(text: str, system_language: str) -> str:
    """Replaces any square capital letters in the text with plain capitals

    Only applied when the system language is 'en', otherwise the text
    is returned unchanged. The 26 characters starting at U+1F170
    (negative squared A) are mapped onto A-Z. Every other character,
    including the one just after squared Z, is kept as it is.
    """
    if system_language != 'en':
        return text

    plain_start = ord('A')
    square_start = ord('\U0001F170')
    # inclusive end of the alphabet: 26 letters means start + 25.
    # Using start + 26 would turn U+1F18A into '['
    square_end = square_start + 25

    return ''.join(
        chr(plain_start + ord(text_ch) - square_start)
        if square_start <= ord(text_ch) <= square_end else text_ch
        for text_ch in text
    )
|
||||
|
||||
|
||||
def dont_speak_hashtags(content: str) -> str:
|
||||
"""Ensure that hashtags aren't announced by screen readers
|
||||
"""
|
||||
|
|
|
@ -19,11 +19,12 @@ from flags import is_premium_account
|
|||
from status import actor_status_expired
|
||||
from status import get_actor_status
|
||||
from textmode import text_mode_removals
|
||||
from unicodetext import uninvert_text
|
||||
from unicodetext import standardize_text
|
||||
from utils import get_person_icon
|
||||
from utils import replace_strings
|
||||
from utils import data_dir
|
||||
from utils import time_days_ago
|
||||
from utils import uninvert_text
|
||||
from utils import get_attributed_to
|
||||
from utils import get_url_from_post
|
||||
from utils import get_memorials
|
||||
|
@ -31,7 +32,6 @@ from utils import text_in_file
|
|||
from utils import dangerous_markup
|
||||
from utils import ap_proxy_type
|
||||
from utils import remove_id_ending
|
||||
from utils import standardize_text
|
||||
from utils import get_display_name
|
||||
from utils import has_object_dict
|
||||
from utils import get_occupation_name
|
||||
|
|
Loading…
Reference in New Issue