mirror of https://gitlab.com/bashrc2/epicyon
				
				
				
			Merge branch 'main' of gitlab.com:bashrc2/epicyon
						commit
						1982a5afb2
					
				|  | @ -11,6 +11,7 @@ import os | ||||||
| from utils import acct_dir | from utils import acct_dir | ||||||
| from utils import text_in_file | from utils import text_in_file | ||||||
| from utils import remove_eol | from utils import remove_eol | ||||||
|  | from utils import standardize_text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool: | def add_filter(base_dir: str, nickname: str, domain: str, words: str) -> bool: | ||||||
|  | @ -120,6 +121,9 @@ def _is_filtered_base(filename: str, content: str) -> bool: | ||||||
|     if not os.path.isfile(filename): |     if not os.path.isfile(filename): | ||||||
|         return False |         return False | ||||||
| 
 | 
 | ||||||
|  |     # convert any fancy characters to ordinary ones | ||||||
|  |     content = standardize_text(content) | ||||||
|  | 
 | ||||||
|     try: |     try: | ||||||
|         with open(filename, 'r', encoding='utf-8') as fp_filt: |         with open(filename, 'r', encoding='utf-8') as fp_filt: | ||||||
|             for line in fp_filt: |             for line in fp_filt: | ||||||
|  |  | ||||||
							
								
								
									
										48
									
								
								tests.py
								
								
								
								
							
							
						
						
									
										48
									
								
								tests.py
								
								
								
								
							|  | @ -54,6 +54,7 @@ from follow import clear_followers | ||||||
| from follow import send_follow_request_via_server | from follow import send_follow_request_via_server | ||||||
| from follow import send_unfollow_request_via_server | from follow import send_unfollow_request_via_server | ||||||
| from siteactive import site_is_active | from siteactive import site_is_active | ||||||
|  | from utils import standardize_text | ||||||
| from utils import remove_eol | from utils import remove_eol | ||||||
| from utils import text_in_file | from utils import text_in_file | ||||||
| from utils import convert_published_to_local_timezone | from utils import convert_published_to_local_timezone | ||||||
|  | @ -7351,6 +7352,52 @@ def _test_dogwhistles(): | ||||||
|     assert result['hamstered']['category'] == "hamsterism" |     assert result['hamstered']['category'] == "hamsterism" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def _test_text_standardize():
    """Check that standardize_text converts the various fancy unicode
    alphabets (fraktur, bold fraktur, script, double-struck, etc.)
    back into ordinary ASCII letters, and leaves plain text untouched.
    """
    print('text_standardize')
    expected = 'This is a test'

    # every variant below should standardize to the same plain sentence
    variants = (
        expected,
        '𝔗𝔥𝔦𝔰 𝔦𝔰 𝔞 𝔱𝔢𝔰𝔱',
        '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖆 𝖙𝖊𝖘𝖙',
        '𝓣𝓱𝓲𝓼 𝓲𝓼 𝓪 𝓽𝓮𝓼𝓽',
        '𝒯𝒽𝒾𝓈 𝒾𝓈 𝒶 𝓉𝑒𝓈𝓉',
        '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕒 𝕥𝕖𝕤𝕥',
        'This is a test'
    )
    for text in variants:
        result = standardize_text(text)
        if result != expected:
            # print the offending conversion before failing, to aid debugging
            print(result)
        assert result == expected
|  | 
 | ||||||
|  | 
 | ||||||
| def run_all_tests(): | def run_all_tests(): | ||||||
|     base_dir = os.getcwd() |     base_dir = os.getcwd() | ||||||
|     print('Running tests...') |     print('Running tests...') | ||||||
|  | @ -7368,6 +7415,7 @@ def run_all_tests(): | ||||||
|     _test_checkbox_names() |     _test_checkbox_names() | ||||||
|     _test_thread_functions() |     _test_thread_functions() | ||||||
|     _test_functions() |     _test_functions() | ||||||
|  |     _test_text_standardize() | ||||||
|     _test_dogwhistles() |     _test_dogwhistles() | ||||||
|     _test_remove_end_of_line() |     _test_remove_end_of_line() | ||||||
|     _test_translation_labels() |     _test_translation_labels() | ||||||
|  |  | ||||||
							
								
								
									
										59
									
								
								utils.py
								
								
								
								
							
							
						
						
									
										59
									
								
								utils.py
								
								
								
								
							|  | @ -40,6 +40,57 @@ INVALID_CHARACTERS = ( | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def _standardize_text_range(text: str, | ||||||
|  |                             range_start: int, range_end: int, | ||||||
|  |                             offset: str) -> str: | ||||||
|  |     """Convert any fancy characters within the given range into ordinary ones | ||||||
|  |     """ | ||||||
|  |     offset = ord(offset) | ||||||
|  |     ctr = 0 | ||||||
|  |     text = list(text) | ||||||
|  |     while ctr < len(text): | ||||||
|  |         val = ord(text[ctr]) | ||||||
|  |         if val in range(range_start, range_end): | ||||||
|  |             text[ctr] = chr(val - range_start + offset) | ||||||
|  |         ctr += 1 | ||||||
|  |     return "".join(text) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def standardize_text(text: str) -> str:
    """Converts fancy unicode text to ordinary letters

    Each entry is the first codepoint of a 26-letter run (fullwidth forms
    and runs from the Mathematical Alphanumeric Symbols block) paired with
    the ordinary letter that the run's first character maps onto.
    """
    char_ranges = (
        # lowercase runs
        (65345, 'a'),
        (119886, 'a'),
        (119990, 'a'),
        (120042, 'a'),
        (120094, 'a'),
        (120146, 'a'),
        (120198, 'a'),
        (120302, 'a'),
        (120354, 'a'),
        (120406, 'a'),
        # uppercase runs
        (65313, 'A'),
        (119912, 'A'),
        (119964, 'A'),
        (120016, 'A'),
        (120068, 'A'),
        (120120, 'A'),
        (120172, 'A'),
        (120224, 'A'),
        (120328, 'A'),
        (120380, 'A'),
        (120432, 'A')
    )
    # Build a single translation table so the text is scanned once,
    # instead of once per range as with repeated per-range passes.
    table = {}
    for range_start, offset in char_ranges:
        base = ord(offset)
        for idx in range(26):
            table[range_start + idx] = chr(base + idx)
    return text.translate(table)
|  | 
 | ||||||
|  | 
 | ||||||
| def remove_eol(line: str): | def remove_eol(line: str): | ||||||
|     """Removes line ending characters |     """Removes line ending characters | ||||||
|     """ |     """ | ||||||
|  | @ -150,17 +201,19 @@ def get_content_from_post(post_json_object: {}, system_language: str, | ||||||
|             if this_post_json[map_dict].get(system_language): |             if this_post_json[map_dict].get(system_language): | ||||||
|                 sys_lang = this_post_json[map_dict][system_language] |                 sys_lang = this_post_json[map_dict][system_language] | ||||||
|                 if isinstance(sys_lang, str): |                 if isinstance(sys_lang, str): | ||||||
|                     return this_post_json[map_dict][system_language] |                     content = this_post_json[map_dict][system_language] | ||||||
|  |                     return standardize_text(content) | ||||||
|             else: |             else: | ||||||
|                 # is there a contentMap/summaryMap entry for one of |                 # is there a contentMap/summaryMap entry for one of | ||||||
|                 # the understood languages? |                 # the understood languages? | ||||||
|                 for lang in languages_understood: |                 for lang in languages_understood: | ||||||
|                     if this_post_json[map_dict].get(lang): |                     if this_post_json[map_dict].get(lang): | ||||||
|                         return this_post_json[map_dict][lang] |                         content = this_post_json[map_dict][lang] | ||||||
|  |                         return standardize_text(content) | ||||||
|     else: |     else: | ||||||
|         if isinstance(this_post_json[content_type], str): |         if isinstance(this_post_json[content_type], str): | ||||||
|             content = this_post_json[content_type] |             content = this_post_json[content_type] | ||||||
|     return content |     return standardize_text(content) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_media_descriptions_from_post(post_json_object: {}) -> str: | def get_media_descriptions_from_post(post_json_object: {}) -> str: | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue