Merge branch 'main' of gitlab.com:bashrc2/epicyon

2022-07-09 12:43:47 +01:00 · 2022-07-09 12:43:47 +01:00 · 1982a5afb2
parent fca6801716 66fcf918a8
commit 1982a5afb2
3 changed files with 108 additions and 3 deletions
--- a/filters.py
+++ b/filters.py
@@ -11,6 +11,7 @@ import os
 from utils import acct_dir
 from utils import text_in_file
 from utils import remove_eol
+from utils import standardize_text


 def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool:
@@ -120,6 +121,9 @@ def _is_filtered_base(filename: str, content: str) -> bool:
    if not os.path.isfile(filename):
        return False

+    # convert any fancy characters to ordinary ones
+    content = standardize_text(content)
+
    try:
        with open(filename, 'r', encoding='utf-8') as fp_filt:
            for line in fp_filt:
--- a/tests.py
+++ b/tests.py
@@ -54,6 +54,7 @@ from follow import clear_followers
 from follow import send_follow_request_via_server
 from follow import send_unfollow_request_via_server
 from siteactive import site_is_active
+from utils import standardize_text
 from utils import remove_eol
 from utils import text_in_file
 from utils import convert_published_to_local_timezone
@@ -7351,6 +7352,52 @@ def _test_dogwhistles():
    assert result['hamstered']['category'] == "hamsterism"


+def _test_text_standardize():
+    """Checks that standardize_text turns a phrase written with styled
+    unicode letters back into plain text, and that text which is already
+    plain is returned as it was
+    """
+    print('text_standardize')
+    expected = 'This is a test'
+
+    # the plain phrase itself comes first, followed by the same phrase
+    # written using different styled alphabets
+    samples = (
+        expected,
+        '𝔗𝔥𝔦𝔰 𝔦𝔰 𝔞 𝔱𝔢𝔰𝔱',
+        '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖆 𝖙𝖊𝖘𝖙',
+        '𝓣𝓱𝓲𝓼 𝓲𝓼 𝓪 𝓽𝓮𝓼𝓽',
+        '𝒯𝒽𝒾𝓈 𝒾𝓈 𝒶 𝓉𝑒𝓈𝓉',
+        '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕒 𝕥𝕖𝕤𝕥',
+        'Ｔｈｉｓ ｉｓ ａ ｔｅｓｔ'
+    )
+    for sample in samples:
+        converted = standardize_text(sample)
+        # show what was actually produced before the assertion fails
+        if converted != expected:
+            print(converted)
+        assert converted == expected
+
 def run_all_tests():
    base_dir = os.getcwd()
    print('Running tests...')
@@ -7368,6 +7415,7 @@ def run_all_tests():
    _test_checkbox_names()
    _test_thread_functions()
    _test_functions()
+    _test_text_standardize()
    _test_dogwhistles()
    _test_remove_end_of_line()
    _test_translation_labels()
--- a/utils.py
+++ b/utils.py
@@ -40,6 +40,57 @@ INVALID_CHARACTERS = (
 )


+def _standardize_text_range(text: str,
+                            range_start: int, range_end: int,
+                            offset: str) -> str:
+    """Replaces every character whose codepoint is at least range_start
+    and less than range_end with an ordinary character.
+    offset is the character which range_start itself gets converted to,
+    with the rest of the range following on consecutively from it.
+    Characters outside of the range are returned unchanged
+    """
+    base_codepoint = ord(offset)
+    converted = []
+    for char in text:
+        codepoint = ord(char)
+        if range_start <= codepoint < range_end:
+            # keep the same position within the ordinary alphabet
+            char = chr(codepoint - range_start + base_codepoint)
+        converted.append(char)
+    return "".join(converted)
+
+
+def _standardize_text_table() -> {}:
+    """Returns a lookup table, suitable for str.translate, which maps the
+    codepoints of fancy unicode letters onto the ordinary letters a-z
+    and A-Z
+    """
+    # each entry is the first codepoint of a run of 26 consecutive
+    # letters, together with the ordinary letter which that run begins at
+    char_ranges = [
+        # fullwidth latin letters
+        [65345, 'a'],
+        [65313, 'A']
+    ]
+    # The Mathematical Alphanumeric Symbols block holds 13 styled
+    # alphabets back to back between U+1D400 and U+1D6A3, each one being
+    # 26 capitals followed by 26 small letters. Generating the starting
+    # points means that no style (bold, monospace, etc) gets missed out
+    for upper_start in range(119808, 120484, 52):
+        char_ranges.append([upper_start, 'A'])
+        char_ranges.append([upper_start + 26, 'a'])
+
+    table = {}
+    for range_start, offset in char_ranges:
+        for index in range(26):
+            table[range_start + index] = chr(ord(offset) + index)
+    return table
+
+
+# built once when the module is imported, so that converting a piece of
+# text is a single pass over it rather than one pass per alphabet
+_STANDARDIZE_TEXT_TABLE = _standardize_text_table()
+
+
+def standardize_text(text: str) -> str:
+    """Converts fancy unicode text to ordinary letters
+
+    Fullwidth letters and the styled alphabets of the Mathematical
+    Alphanumeric Symbols block are replaced by plain a-z and A-Z.
+    Any other characters are returned unchanged.
+    NOTE: the few styled letters which unicode encodes outside of that
+    block, such as U+210E for italic h, are not converted
+    """
+    return text.translate(_STANDARDIZE_TEXT_TABLE)
+
+
 def remove_eol(line: str):
    """Removes line ending characters
    """
@@ -150,17 +201,19 @@ def get_content_from_post(post_json_object: {}, system_language: str,
            if this_post_json[map_dict].get(system_language):
                sys_lang = this_post_json[map_dict][system_language]
                if isinstance(sys_lang, str):
-                    return this_post_json[map_dict][system_language]
+                    content = this_post_json[map_dict][system_language]
+                    return standardize_text(content)
            else:
                # is there a contentMap/summaryMap entry for one of
                # the understood languages?
                for lang in languages_understood:
                    if this_post_json[map_dict].get(lang):
-                        return this_post_json[map_dict][lang]
+                        content = this_post_json[map_dict][lang]
+                        return standardize_text(content)
    else:
        if isinstance(this_post_json[content_type], str):
            content = this_post_json[content_type]
-    return content
+    return standardize_text(content)


 def get_media_descriptions_from_post(post_json_object: {}) -> str: