mirror of https://gitlab.com/bashrc2/epicyon
Merge branch 'main' of gitlab.com:bashrc2/epicyon
commit 1982a5afb2

@@ -11,6 +11,7 @@ import os
 from utils import acct_dir
 from utils import text_in_file
 from utils import remove_eol
+from utils import standardize_text
 
 
 def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:

@@ -120,6 +121,9 @@ def _is_filtered_base(filename: str, content: str) -> bool:
     if not os.path.isfile(filename):
         return False
 
+    # convert any fancy characters to ordinary ones
+    content = standardize_text(content)
+
     try:
         with open(filename, 'r', encoding='utf-8') as fp_filt:
             for line in fp_filt:
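The practical effect of the new call in _is_filtered_base is that filter words are matched against normalised text, so a muted word written in a decorative Unicode alphabet no longer slips past the filter. Below is a minimal sketch of that behaviour, assuming the standardize_text helper added in the utils.py hunk further down; looks_filtered and filter_words are illustrative stand-ins, not the module's real matching logic, which reads filter words from a per-account file.

from utils import standardize_text

# Illustrative only: a simplified stand-in for the real filter matching.
filter_words = ['spam']

def looks_filtered(content: str) -> bool:
    # convert any fancy characters to ordinary ones before matching,
    # mirroring the call added to _is_filtered_base
    content = standardize_text(content).lower()
    return any(word in content for word in filter_words)

# Without normalisation, '𝖘𝖕𝖆𝖒' does not contain the plain string 'spam'
print(looks_filtered('Buy 𝖘𝖕𝖆𝖒 now'))  # True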
tests.py (48 lines changed)
@@ -54,6 +54,7 @@ from follow import clear_followers
 from follow import send_follow_request_via_server
 from follow import send_unfollow_request_via_server
 from siteactive import site_is_active
+from utils import standardize_text
 from utils import remove_eol
 from utils import text_in_file
 from utils import convert_published_to_local_timezone

@@ -7351,6 +7352,52 @@ def _test_dogwhistles():
     assert result['hamstered']['category'] == "hamsterism"
 
 
+def _test_text_standardize():
+    print('text_standardize')
+    expected = 'This is a test'
+
+    result = standardize_text(expected)
+    if result != expected:
+        print(result)
+    assert result == expected
+
+    text = '𝔗𝔥𝔦𝔰 𝔦𝔰 𝔞 𝔱𝔢𝔰𝔱'
+    result = standardize_text(text)
+    if result != expected:
+        print(result)
+    assert result == expected
+
+    text = '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖆 𝖙𝖊𝖘𝖙'
+    result = standardize_text(text)
+    if result != expected:
+        print(result)
+    assert result == expected
+
+    text = '𝓣𝓱𝓲𝓼 𝓲𝓼 𝓪 𝓽𝓮𝓼𝓽'
+    result = standardize_text(text)
+    if result != expected:
+        print(result)
+    assert result == expected
+
+    text = '𝒯𝒽𝒾𝓈 𝒾𝓈 𝒶 𝓉𝑒𝓈𝓉'
+    result = standardize_text(text)
+    if result != expected:
+        print(result)
+    assert result == expected
+
+    text = '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕒 𝕥𝕖𝕤𝕥'
+    result = standardize_text(text)
+    if result != expected:
+        print(result)
+    assert result == expected
+
+    text = 'This is a test'
+    result = standardize_text(text)
+    if result != expected:
+        print(result)
+    assert result == expected
+
+
 def run_all_tests():
     base_dir = os.getcwd()
     print('Running tests...')

@@ -7368,6 +7415,7 @@ def run_all_tests():
     _test_checkbox_names()
     _test_thread_functions()
     _test_functions()
+    _test_text_standardize()
     _test_dogwhistles()
     _test_remove_end_of_line()
     _test_translation_labels()
utils.py (59 lines changed)
@@ -40,6 +40,57 @@ INVALID_CHARACTERS = (
 )
 
 
+def _standardize_text_range(text: str,
+                            range_start: int, range_end: int,
+                            offset: str) -> str:
+    """Convert any fancy characters within the given range into ordinary ones
+    """
+    offset = ord(offset)
+    ctr = 0
+    text = list(text)
+    while ctr < len(text):
+        val = ord(text[ctr])
+        if val in range(range_start, range_end):
+            text[ctr] = chr(val - range_start + offset)
+        ctr += 1
+    return "".join(text)
+
+
+def standardize_text(text: str) -> str:
+    """Converts fancy unicode text to ordinary letters
+    """
+    char_ranges = (
+        [65345, 'a'],
+        [119886, 'a'],
+        [119990, 'a'],
+        [120042, 'a'],
+        [120094, 'a'],
+        [120146, 'a'],
+        [120198, 'a'],
+        [120302, 'a'],
+        [120354, 'a'],
+        [120406, 'a'],
+        [65313, 'A'],
+        [119912, 'A'],
+        [119964, 'A'],
+        [120016, 'A'],
+        [120068, 'A'],
+        [120120, 'A'],
+        [120172, 'A'],
+        [120224, 'A'],
+        [120328, 'A'],
+        [120380, 'A'],
+        [120432, 'A']
+    )
+    for char_range in char_ranges:
+        range_start = char_range[0]
+        range_end = range_start + 26
+        offset = char_range[1]
+        text = _standardize_text_range(text, range_start, range_end, offset)
+
+    return text
+
+
 def remove_eol(line: str):
     """Removes line ending characters
     """
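Each entry in char_ranges is the first code point of a 26-letter styled alphabet together with the ASCII letter it maps onto, and _standardize_text_range shifts any character in that window back by its distance from the range start. For example, 120068 is the first mathematical Fraktur capital (𝔄), so 𝔅 at code point 120069 becomes chr(120069 - 120068 + ord('A')) = 'B'. A worked example of that arithmetic, not part of the commit itself:

# Worked example of the offset arithmetic used by _standardize_text_range.
range_start = 120068     # first mathematical Fraktur capital, '𝔄'
offset = ord('A')        # ASCII letter the range maps onto

fancy = '𝔅'              # Fraktur capital B, code point 120069
val = ord(fancy)
assert range_start <= val < range_start + 26
plain = chr(val - range_start + offset)
assert plain == 'B'

# The fullwidth ranges work the same way: 65313 is 'Ａ', 65345 is 'ａ'.
assert chr(ord('Ｑ') - 65313 + ord('A')) == 'Q'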

@@ -150,17 +201,19 @@ def get_content_from_post(post_json_object: {}, system_language: str,
         if this_post_json[map_dict].get(system_language):
             sys_lang = this_post_json[map_dict][system_language]
             if isinstance(sys_lang, str):
-                return this_post_json[map_dict][system_language]
+                content = this_post_json[map_dict][system_language]
+                return standardize_text(content)
         else:
             # is there a contentMap/summaryMap entry for one of
             # the understood languages?
             for lang in languages_understood:
                 if this_post_json[map_dict].get(lang):
-                    return this_post_json[map_dict][lang]
+                    content = this_post_json[map_dict][lang]
+                    return standardize_text(content)
     else:
         if isinstance(this_post_json[content_type], str):
             content = this_post_json[content_type]
-            return content
+            return standardize_text(content)
 
 
 def get_media_descriptions_from_post(post_json_object: {}) -> str:
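With this hunk every return path of get_content_from_post passes the selected content, contentMap or summaryMap value through standardize_text, so callers always receive ordinary letters regardless of how the post text was styled. A rough sketch of the effect on a contentMap entry, using a hand-built dictionary rather than the function's real arguments:

from utils import standardize_text

# Hypothetical, minimal stand-in for a post object's contentMap;
# the real function also handles summaryMap and languages_understood.
content_map = {'en': '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕒 𝕥𝕖𝕤𝕥'}

system_language = 'en'
if content_map.get(system_language):
    content = content_map[system_language]
    print(standardize_text(content))  # This is a test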