Remove square capitals when filtering

main
Bob Mottram 2022-10-05 18:55:24 +01:00
parent e40ac467fd
commit e2ba518b96
3 changed files with 30 additions and 0 deletions

View File

@ -13,6 +13,7 @@ from utils import text_in_file
from utils import remove_eol
from utils import standardize_text
from utils import remove_inverted_text
from utils import remove_square_capitals
def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:
@ -125,6 +126,7 @@ def _is_filtered_base(filename: str, content: str,
return False
content = remove_inverted_text(content, system_language)
content = remove_square_capitals(content, system_language)
# convert any fancy characters to ordinary ones
content = standardize_text(content)

View File

@ -55,6 +55,7 @@ from follow import send_follow_request_via_server
from follow import send_unfollow_request_via_server
from siteactive import site_is_active
from utils import remove_inverted_text
from utils import remove_square_capitals
from utils import standardize_text
from utils import remove_eol
from utils import text_in_file
@ -7562,6 +7563,15 @@ def _test_uninvert():
print('result: ' + result)
assert result == expected
text = '🅻🅴🆅🅸🅰🆃🅰🆁 abc'
expected = "LEVIATAR abc"
result = remove_square_capitals(text, 'en')
if result != expected:
print('expected: ' + expected)
print('result: ' + result)
print('text: ' + text)
assert result == expected
text = '<p>Some ordinary text</p><p>ʇsǝʇ ɐ sı sıɥʇ</p>'
expected = "<p>Some ordinary text</p><p>this is a test</p>"
result = remove_inverted_text(text, 'en')

View File

@ -3915,3 +3915,21 @@ def remove_inverted_text(text: str, system_language: str) -> str:
new_text += separator
return new_text
def remove_square_capitals(text: str, system_language: str) -> str:
    """Converts any square capital letters within the given string
    into ordinary capital letters, so that text written with them
    can be matched by filters.

    Only the 26 consecutive code points beginning at '🅰' (🅰 to 🆉)
    are converted. All other characters are returned unchanged.
    If system_language is not 'en' the text is returned as-is.
    """
    if system_language != 'en':
        return text
    first_square = ord('🅰')
    first_plain = ord('A')
    # exactly 26 letters: the last square capital is 25 code points
    # after the first. Going one further would turn the following
    # (non-letter) symbol into '['
    square_to_plain = {
        first_square + idx: first_plain + idx for idx in range(26)
    }
    return text.translate(square_to_plain)