Standardize text prior to filtering

main
Bob Mottram 2022-07-09 11:37:33 +01:00
parent caaef8eeae
commit 0e69a62785
2 changed files with 89 additions and 0 deletions

View File

@ -13,6 +13,44 @@ from utils import text_in_file
from utils import remove_eol from utils import remove_eol
def _standardize_text_range(text: str,
range_start: int, range_end: int,
offset: str) -> str:
"""Convert any fancy characters within the given range into ordinary ones
"""
offset = ord(offset)
ctr = 0
text = list(text)
while ctr < len(text):
val = ord(text[ctr])
if val in range(range_start, range_end):
text[ctr] = chr(val - range_start + offset)
ctr += 1
return "".join(text)
def standardize_text(text: str) -> str:
    """Converts fancy unicode text to ordinary letters

    Letters from the Mathematical Alphanumeric Symbols block (bold,
    italic, script, fraktur, double-struck, sans-serif and monospace
    styles) and the fullwidth latin letters are replaced by their plain
    ASCII equivalents, so that styled text can't be used to evade
    word filters.

    text: the text to be converted. Empty text or None is returned as is
    Returns the text with fancy letters replaced by ordinary ones

    NOTE: a few styled letters live in the Letterlike Symbols block
    instead (eg. script small e U+212F or double-struck capital R U+211D)
    and are not converted here.
    """
    if not text:
        return text

    # The Mathematical Alphanumeric Symbols block starts at U+1D400 and
    # contains thirteen latin alphabets back to back. Within each one the
    # 26 capitals come first, followed by the 26 small letters.
    math_start = 119808
    math_alphabets = 13
    for idx in range(math_alphabets):
        upper_start = math_start + (idx * 52)
        lower_start = upper_start + 26
        lower_end = lower_start + 26
        text = _standardize_text_range(text, upper_start, lower_start, 'A')
        text = _standardize_text_range(text, lower_start, lower_end, 'a')

    # fullwidth latin letters (U+FF21 and U+FF41 onwards)
    text = _standardize_text_range(text, 65313, 65313 + 26, 'A')
    text = _standardize_text_range(text, 65345, 65345 + 26, 'a')

    return text
def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool: def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:
"""Adds a filter for particular words within the content of a incoming posts """Adds a filter for particular words within the content of a incoming posts
""" """
@ -120,6 +158,9 @@ def _is_filtered_base(filename: str, content: str) -> bool:
if not os.path.isfile(filename): if not os.path.isfile(filename):
return False return False
# convert any fancy characters to ordinary ones
content = standardize_text(content)
try: try:
with open(filename, 'r', encoding='utf-8') as fp_filt: with open(filename, 'r', encoding='utf-8') as fp_filt:
for line in fp_filt: for line in fp_filt:

View File

@ -189,6 +189,7 @@ from blocking import add_cw_from_lists
from happening import dav_month_via_server from happening import dav_month_via_server
from happening import dav_day_via_server from happening import dav_day_via_server
from webapp_theme_designer import color_contrast from webapp_theme_designer import color_contrast
from filters import standardize_text
TEST_SERVER_GROUP_RUNNING = False TEST_SERVER_GROUP_RUNNING = False
@ -7351,6 +7352,52 @@ def _test_dogwhistles():
assert result['hamstered']['category'] == "hamsterism" assert result['hamstered']['category'] == "hamsterism"
def _test_text_standardize():
    """Tests that standardize_text converts fancy unicode letters into
    ordinary ones and leaves plain text unchanged
    """
    print('text_standardize')
    expected = 'This is a test'

    # plain text should pass through unchanged
    result = standardize_text(expected)
    if result != expected:
        print(result)
    assert result == expected

    # 'This is a test' written in various fancy styles. Escapes are used
    # rather than literal characters so that the test data can't be
    # corrupted by an editor or tool which mishandles the file encoding
    fancy_texts = (
        # fraktur
        '\U0001d517\U0001d525\U0001d526\U0001d530 '
        '\U0001d526\U0001d530 \U0001d51e '
        '\U0001d531\U0001d522\U0001d530\U0001d531',
        # bold fraktur
        '\U0001d57f\U0001d58d\U0001d58e\U0001d598 '
        '\U0001d58e\U0001d598 \U0001d586 '
        '\U0001d599\U0001d58a\U0001d598\U0001d599',
        # bold script
        '\U0001d4e3\U0001d4f1\U0001d4f2\U0001d4fc '
        '\U0001d4f2\U0001d4fc \U0001d4ea '
        '\U0001d4fd\U0001d4ee\U0001d4fc\U0001d4fd',
        # script, with an italic small e (U+1D452) because the script
        # small e is not part of the mathematical alphanumeric block
        '\U0001d4af\U0001d4bd\U0001d4be\U0001d4c8 '
        '\U0001d4be\U0001d4c8 \U0001d4b6 '
        '\U0001d4c9\U0001d452\U0001d4c8\U0001d4c9',
        # double-struck
        '\U0001d54b\U0001d559\U0001d55a\U0001d564 '
        '\U0001d55a\U0001d564 \U0001d552 '
        '\U0001d565\U0001d556\U0001d564\U0001d565',
        # fullwidth
        '\uff34\uff48\uff49\uff53 '
        '\uff49\uff53 \uff41 '
        '\uff54\uff45\uff53\uff54'
    )
    for text in fancy_texts:
        result = standardize_text(text)
        if result != expected:
            # show what was actually produced before failing
            print(result)
        assert result == expected
def run_all_tests(): def run_all_tests():
base_dir = os.getcwd() base_dir = os.getcwd()
print('Running tests...') print('Running tests...')
@ -7368,6 +7415,7 @@ def run_all_tests():
_test_checkbox_names() _test_checkbox_names()
_test_thread_functions() _test_thread_functions()
_test_functions() _test_functions()
_test_text_standardize()
_test_dogwhistles() _test_dogwhistles()
_test_remove_end_of_line() _test_remove_end_of_line()
_test_translation_labels() _test_translation_labels()