diff --git a/filters.py b/filters.py index 4a22ab164..6bdc4d364 100644 --- a/filters.py +++ b/filters.py @@ -13,6 +13,7 @@ from utils import text_in_file from utils import remove_eol from utils import standardize_text from utils import remove_inverted_text +from utils import remove_square_capitals def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool: @@ -125,6 +126,7 @@ def _is_filtered_base(filename: str, content: str, return False content = remove_inverted_text(content, system_language) + content = remove_square_capitals(content, system_language) # convert any fancy characters to ordinary ones content = standardize_text(content) diff --git a/tests.py b/tests.py index 83643acdf..7f4fb1449 100644 --- a/tests.py +++ b/tests.py @@ -55,6 +55,7 @@ from follow import send_follow_request_via_server from follow import send_unfollow_request_via_server from siteactive import site_is_active from utils import remove_inverted_text +from utils import remove_square_capitals from utils import standardize_text from utils import remove_eol from utils import text_in_file @@ -7562,6 +7563,15 @@ def _test_uninvert(): print('result: ' + result) assert result == expected + text = 'π »π ΄π π Έπ °ππ °π abc' + expected = "LEVIATAR abc" + result = remove_square_capitals(text, 'en') + if result != expected: + print('expected: ' + expected) + print('result: ' + result) + print('text: ' + text) + assert result == expected + text = '
Some ordinary text
ΚsΗΚ Ι sΔ± sΔ±Ι₯Κ
' expected = "Some ordinary text
this is a test
def remove_square_capitals(text: str, system_language: str) -> str:
    """Replaces any square capital letters within the given string
    with their ordinary capital letter equivalents.

    The characters handled are the 26 NEGATIVE SQUARED LATIN CAPITAL
    LETTER code points, U+1F170 (A) through U+1F189 (Z). Every other
    character is passed through unchanged.

    text: the string to be converted
    system_language: language code; conversion only happens for 'en',
        otherwise the text is returned unaltered

    Returns the converted string.
    """
    if system_language != 'en':
        return text
    # Use the explicit code point rather than a literal glyph, so that
    # this does not depend on the encoding of the source file
    start_value = 0x1F170
    offset = ord('A')
    # Exactly 26 letters: U+1F170..U+1F189 inclusive. The code point
    # which follows (U+1F18A) is not a letter of this alphabet and
    # must not be converted.
    square_to_ascii = {
        start_value + index: offset + index for index in range(26)
    }
    return text.translate(square_to_ascii)