Automatically detect Arabic language

2022-12-17 15:43:34 +00:00 · 2022-12-17 15:43:34 +00:00 · 08d8650efd
parent 667f4dcd45
commit 08d8650efd
3 changed files with 44 additions and 0 deletions
--- a/happening.py
+++ b/happening.py
@ -26,6 +26,7 @@ from utils import get_status_number
 from utils import get_full_domain
 from utils import text_in_file
 from utils import remove_eol
+from utils import is_arabic
 from filters import is_filtered
 from context import get_individual_post_context
 from session import get_method
@ -288,6 +289,9 @@ def get_todays_events(base_dir: str, nickname: str, domain: str,
                if content:
                    if not _event_text_match(content, text_match):
                        continue
+                    if content_language != 'ar':
+                        if is_arabic(content):
+                            content_language = 'ar'

            public_event = is_public_post(post_json_object)

--- a/tests.py
+++ b/tests.py
@ -54,6 +54,7 @@ from follow import clear_followers
 from follow import send_follow_request_via_server
 from follow import send_unfollow_request_via_server
 from siteactive import site_is_active
+from utils import is_arabic
 from utils import remove_inverted_text
 from utils import remove_square_capitals
 from utils import standardize_text
@ -7638,6 +7639,16 @@ def _test_reply_language(base_dir: str) -> None:
    assert not get_reply_language(base_dir, post_json_object)


+def _test_is_arabic() -> None:
+    """Checks is_arabic against mixed, english-only and empty text
+    """
+    print('is_arabic')
+    # (text, whether it is expected to be detected as Arabic)
+    expectations = (
+        ("Some English. هذا نص عربي", True),
+        ("Some English", False),
+        ("", False)
+    )
+    for text, expected in expectations:
+        assert bool(is_arabic(text)) == expected
+
+
 def run_all_tests():
    base_dir = os.getcwd()
    print('Running tests...')
@ -7655,6 +7666,7 @@ def run_all_tests():
    _test_checkbox_names()
    _test_thread_functions()
    _test_functions()
+    _test_is_arabic()
    _test_reply_language(base_dir)
    _test_emoji_in_actor_name(base_dir)
    _test_uninvert()
--- a/utils.py
+++ b/utils.py
@ -219,6 +219,23 @@ def get_content_from_post(post_json_object: {}, system_language: str,
    return standardize_text(content)


+def is_arabic(content: str) -> bool:
+    """Returns true if more than a third of the given text is Arabic
+
+    Every run of characters outside of the Arabic unicode ranges is
+    collapsed into a single space, the ends are stripped, and the length
+    of what remains is compared against the length of the original text.
+    Western digits (0-9) are not counted as Arabic, so that purely
+    numeric text is not misdetected.
+    """
+    if not content:
+        return False
+    # Arabic, Arabic Supplement and Arabic Presentation Forms A and B.
+    # Raw strings throughout: the re module expands the \uXXXX escapes
+    non_arabic = r'[^\u0600-\u06ff\u0750-\u077f' + \
+        r'\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f' + \
+        r'\ufdf0-\ufdfd\ufe70-\ufefc]+'
+    result = re.sub(non_arabic, ' ', content).strip()
+    # more than a third of the content
+    return len(result) > len(content) / 3
+
+
 def get_language_from_post(post_json_object: {}, system_language: str,
                           languages_understood: [],
                           content_type: str = "content") -> str:
@ -236,13 +253,24 @@ def get_language_from_post(post_json_object: {}, system_language: str,
            if this_post_json[map_dict].get(system_language):
                sys_lang = this_post_json[map_dict][system_language]
                if isinstance(sys_lang, str):
+                    content = this_post_json[map_dict][system_language]
+                    if is_arabic(content):
+                        return 'ar'
                    return system_language
            else:
                # is there a contentMap/summaryMap entry for one of
                # the understood languages?
                for lang in languages_understood:
                    if this_post_json[map_dict].get(lang):
+                        content = this_post_json[map_dict][lang]
+                        if is_arabic(content):
+                            return 'ar'
                        return lang
+    else:
+        if isinstance(this_post_json[content_type], str):
+            content = this_post_json[content_type]
+            if is_arabic(content):
+                return 'ar'
    return system_language