Automatically detect the Arabic language

merge-requests/30/head
Bob Mottram 2022-12-17 15:43:34 +00:00
parent 667f4dcd45
commit 08d8650efd
3 changed files with 44 additions and 0 deletions

View File

@ -26,6 +26,7 @@ from utils import get_status_number
from utils import get_full_domain
from utils import text_in_file
from utils import remove_eol
from utils import is_arabic
from filters import is_filtered
from context import get_individual_post_context
from session import get_method
@ -288,6 +289,9 @@ def get_todays_events(base_dir: str, nickname: str, domain: str,
if content:
if not _event_text_match(content, text_match):
continue
if content_language != 'ar':
if is_arabic(content):
content_language = 'ar'
public_event = is_public_post(post_json_object)

View File

@ -54,6 +54,7 @@ from follow import clear_followers
from follow import send_follow_request_via_server
from follow import send_unfollow_request_via_server
from siteactive import site_is_active
from utils import is_arabic
from utils import remove_inverted_text
from utils import remove_square_capitals
from utils import standardize_text
@ -7638,6 +7639,16 @@ def _test_reply_language(base_dir: str) -> None:
assert not get_reply_language(base_dir, post_json_object)
def _test_is_arabic() -> None:
    """Exercises is_arabic with mixed english/arabic text, english-only
    text and an empty string
    """
    print('is_arabic')
    # text containing both english and arabic is expected to be detected
    mixed_text = "Some English. هذا نص عربي"
    assert is_arabic(mixed_text)
    # english-only text and the empty string are expected to be rejected
    for non_arabic_text in ("Some English", ""):
        assert not is_arabic(non_arabic_text)
def run_all_tests():
base_dir = os.getcwd()
print('Running tests...')
@ -7655,6 +7666,7 @@ def run_all_tests():
_test_checkbox_names()
_test_thread_functions()
_test_functions()
_test_is_arabic()
_test_reply_language(base_dir)
_test_emoji_in_actor_name(base_dir)
_test_uninvert()

View File

@ -219,6 +219,23 @@ def get_content_from_post(post_json_object: {}, system_language: str,
return standardize_text(content)
def is_arabic(content: str) -> bool:
    """Returns true if more than a third of the given text is arabic script

    Characters are treated as arabic if they fall within the unicode
    Arabic, Arabic Supplement or Arabic Presentation Forms ranges.
    ASCII digits are not treated as arabic, so that numbers, dates and
    times within text of other languages are not misdetected.
    Returns false for empty content.
    """
    if not content:
        return False
    # collapse each run of non-arabic characters into a single space,
    # leaving only the arabic script words separated by spaces
    arabic_only = re.sub(r'[^\u0600-\u06ff\u0750-\u077f'
                         r'\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f'
                         r'\ufdf0-\ufdfd\ufe70-\ufefc]+',
                         ' ', content).strip()
    # more than a third of the content
    return len(arabic_only) > len(content) / 3
def get_language_from_post(post_json_object: {}, system_language: str,
languages_understood: [],
content_type: str = "content") -> str:
@ -236,13 +253,24 @@ def get_language_from_post(post_json_object: {}, system_language: str,
if this_post_json[map_dict].get(system_language):
sys_lang = this_post_json[map_dict][system_language]
if isinstance(sys_lang, str):
content = this_post_json[map_dict][system_language]
if is_arabic(content):
return 'ar'
return system_language
else:
# is there a contentMap/summaryMap entry for one of
# the understood languages?
for lang in languages_understood:
if this_post_json[map_dict].get(lang):
content = this_post_json[map_dict][lang]
if is_arabic(content):
return 'ar'
return lang
else:
if isinstance(this_post_json[content_type], str):
content = this_post_json[content_type]
if is_arabic(content):
return 'ar'
return system_language