mirror of https://gitlab.com/bashrc2/epicyon
Standardize text prior to filtering
parent
caaef8eeae
commit
0e69a62785
41
filters.py
41
filters.py
|
@ -13,6 +13,44 @@ from utils import text_in_file
|
||||||
from utils import remove_eol
|
from utils import remove_eol
|
||||||
|
|
||||||
|
|
||||||
|
def _standardize_text_range(text: str,
|
||||||
|
range_start: int, range_end: int,
|
||||||
|
offset: str) -> str:
|
||||||
|
"""Convert any fancy characters within the given range into ordinary ones
|
||||||
|
"""
|
||||||
|
offset = ord(offset)
|
||||||
|
ctr = 0
|
||||||
|
text = list(text)
|
||||||
|
while ctr < len(text):
|
||||||
|
val = ord(text[ctr])
|
||||||
|
if val in range(range_start, range_end):
|
||||||
|
text[ctr] = chr(val - range_start + offset)
|
||||||
|
ctr += 1
|
||||||
|
return "".join(text)
|
||||||
|
|
||||||
|
|
||||||
|
def standardize_text(text: str) -> str:
    """Converts fancy unicode text to ordinary letters

    Styled Latin letters from the Unicode Mathematical Alphanumeric
    Symbols block (bold, italic, script, fraktur, double-struck,
    sans-serif, monospace and their combinations) and the fullwidth Latin
    letters are replaced by their plain a-z / A-Z equivalents, so that
    filters cannot be evaded by using styled lettering.
    A few styled letters which Unicode encodes outside of that block
    (within Letterlike Symbols) are not converted.
    Returns the converted text. Text containing no convertible characters
    is returned unchanged.
    """
    # code points of fullwidth 'A' and 'a'
    fullwidth_upper = 65313
    fullwidth_lower = 65345

    # Mathematical Alphanumeric Symbols begins at U+1D400 with 13 styled
    # Latin alphabets placed back to back, each being 26 capitals
    # followed by 26 small letters
    math_start = 119808
    math_styles = 13

    # every convertible character is at or above fullwidth 'A', so text
    # which contains nothing that high needs no conversion passes
    if not text or ord(max(text)) < fullwidth_upper:
        return text

    for style in range(math_styles):
        upper_start = math_start + (style * 52)
        lower_start = upper_start + 26
        text = _standardize_text_range(text, upper_start,
                                       upper_start + 26, 'A')
        text = _standardize_text_range(text, lower_start,
                                       lower_start + 26, 'a')

    text = _standardize_text_range(text, fullwidth_lower,
                                   fullwidth_lower + 26, 'a')
    text = _standardize_text_range(text, fullwidth_upper,
                                   fullwidth_upper + 26, 'A')

    return text
|
||||||
|
|
||||||
|
|
||||||
def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:
|
def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:
|
||||||
"""Adds a filter for particular words within the content of a incoming posts
|
"""Adds a filter for particular words within the content of a incoming posts
|
||||||
"""
|
"""
|
||||||
|
@ -120,6 +158,9 @@ def _is_filtered_base(filename: str, content: str) -> bool:
|
||||||
if not os.path.isfile(filename):
|
if not os.path.isfile(filename):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# convert any fancy characters to ordinary ones
|
||||||
|
content = standardize_text(content)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(filename, 'r', encoding='utf-8') as fp_filt:
|
with open(filename, 'r', encoding='utf-8') as fp_filt:
|
||||||
for line in fp_filt:
|
for line in fp_filt:
|
||||||
|
|
48
tests.py
48
tests.py
|
@ -189,6 +189,7 @@ from blocking import add_cw_from_lists
|
||||||
from happening import dav_month_via_server
|
from happening import dav_month_via_server
|
||||||
from happening import dav_day_via_server
|
from happening import dav_day_via_server
|
||||||
from webapp_theme_designer import color_contrast
|
from webapp_theme_designer import color_contrast
|
||||||
|
from filters import standardize_text
|
||||||
|
|
||||||
|
|
||||||
TEST_SERVER_GROUP_RUNNING = False
|
TEST_SERVER_GROUP_RUNNING = False
|
||||||
|
@ -7351,6 +7352,52 @@ def _test_dogwhistles():
|
||||||
assert result['hamstered']['category'] == "hamsterism"
|
assert result['hamstered']['category'] == "hamsterism"
|
||||||
|
|
||||||
|
|
||||||
|
def _test_text_standardize():
    """Checks that standardize_text leaves ordinary text unchanged and
    converts several styles of fancy lettering back into ordinary letters
    """
    print('text_standardize')
    expected = 'This is a test'

    # ordinary text should pass through unchanged
    result = standardize_text(expected)
    if result != expected:
        print(result)
    assert result == expected

    # Code points of 'A' and 'a' within each fancy alphabet.
    # The styled strings are generated from code points rather than being
    # embedded as literals, so that the test does not depend upon the
    # encoding of this source file
    fancy_alphabets = (
        (120068, 120094),  # fraktur
        (120172, 120198),  # bold fraktur
        (120016, 120042),  # bold script
        (119964, 119990),  # script
        (120120, 120146),  # double-struck
        (65313, 65345)     # fullwidth
    )
    for upper_start, lower_start in fancy_alphabets:
        styled = []
        for char in expected:
            if char.isupper():
                styled.append(chr(upper_start + ord(char) - ord('A')))
            elif char.islower():
                styled.append(chr(lower_start + ord(char) - ord('a')))
            else:
                # spaces are not styled
                styled.append(char)
        text = ''.join(styled)

        result = standardize_text(text)
        if result != expected:
            print(result)
        assert result == expected
|
||||||
|
|
||||||
|
|
||||||
def run_all_tests():
|
def run_all_tests():
|
||||||
base_dir = os.getcwd()
|
base_dir = os.getcwd()
|
||||||
print('Running tests...')
|
print('Running tests...')
|
||||||
|
@ -7368,6 +7415,7 @@ def run_all_tests():
|
||||||
_test_checkbox_names()
|
_test_checkbox_names()
|
||||||
_test_thread_functions()
|
_test_thread_functions()
|
||||||
_test_functions()
|
_test_functions()
|
||||||
|
_test_text_standardize()
|
||||||
_test_dogwhistles()
|
_test_dogwhistles()
|
||||||
_test_remove_end_of_line()
|
_test_remove_end_of_line()
|
||||||
_test_translation_labels()
|
_test_translation_labels()
|
||||||
|
|
Loading…
Reference in New Issue