# Character-class body for the code points treated as Arabic script:
# Arabic, Arabic Supplement, and the Arabic Presentation Forms A/B ranges.
_ARABIC_CHARS = ('\u0600-\u06ff\u0750-\u077f'
                 '\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f'
                 '\ufdf0-\ufdfd\ufe70-\ufefc')

# Matches a single Arabic-script character
_ARABIC_LETTER = re.compile('[' + _ARABIC_CHARS + ']')

# Matches a run of characters which are neither Arabic script nor
# ASCII digits
_NOT_ARABIC_OR_DIGIT = re.compile('[^0-9' + _ARABIC_CHARS + ']+')


def is_arabic(content: str) -> bool:
    """Returns true if the given text is predominantly arabic

    Every run of characters which is neither arabic script nor an ASCII
    digit is collapsed into a single space. If what remains (after
    stripping) is longer than a third of the original content then the
    text is considered to be arabic.

    ASCII digits are script-neutral, so they only count towards the
    total when at least one arabic-script character is present.
    Otherwise purely numeric text such as a phone number would be
    classified as arabic.

    Returns False for empty content and for anything which is not a
    string, such as a non-string contentMap entry.
    """
    if not content or not isinstance(content, str):
        return False
    # digits alone are not evidence of arabic script
    if not _ARABIC_LETTER.search(content):
        return False
    retained = _NOT_ARABIC_OR_DIGIT.sub(' ', content).strip()
    # more than a third of the content
    return len(retained) > len(content) / 3


def _test_is_arabic() -> None:
    """Unit test for is_arabic
    """
    print('is_arabic')
    test = "Some English. هذا نص عربي"
    assert is_arabic(test)
    test = "Some English"
    assert not is_arabic(test)
    test = ""
    assert not is_arabic(test)
    # numeric text without any arabic script is not arabic
    test = "1234567890"
    assert not is_arabic(test)
    test = "Call 555-123-4567 now"
    assert not is_arabic(test)