mirror of https://gitlab.com/bashrc2/epicyon
Automatically detect Arabic language
parent
667f4dcd45
commit
08d8650efd
|
@ -26,6 +26,7 @@ from utils import get_status_number
|
||||||
from utils import get_full_domain
|
from utils import get_full_domain
|
||||||
from utils import text_in_file
|
from utils import text_in_file
|
||||||
from utils import remove_eol
|
from utils import remove_eol
|
||||||
|
from utils import is_arabic
|
||||||
from filters import is_filtered
|
from filters import is_filtered
|
||||||
from context import get_individual_post_context
|
from context import get_individual_post_context
|
||||||
from session import get_method
|
from session import get_method
|
||||||
|
@ -288,6 +289,9 @@ def get_todays_events(base_dir: str, nickname: str, domain: str,
|
||||||
if content:
|
if content:
|
||||||
if not _event_text_match(content, text_match):
|
if not _event_text_match(content, text_match):
|
||||||
continue
|
continue
|
||||||
|
if content_language != 'ar':
|
||||||
|
if is_arabic(content):
|
||||||
|
content_language = 'ar'
|
||||||
|
|
||||||
public_event = is_public_post(post_json_object)
|
public_event = is_public_post(post_json_object)
|
||||||
|
|
||||||
|
|
12
tests.py
12
tests.py
|
@ -54,6 +54,7 @@ from follow import clear_followers
|
||||||
from follow import send_follow_request_via_server
|
from follow import send_follow_request_via_server
|
||||||
from follow import send_unfollow_request_via_server
|
from follow import send_unfollow_request_via_server
|
||||||
from siteactive import site_is_active
|
from siteactive import site_is_active
|
||||||
|
from utils import is_arabic
|
||||||
from utils import remove_inverted_text
|
from utils import remove_inverted_text
|
||||||
from utils import remove_square_capitals
|
from utils import remove_square_capitals
|
||||||
from utils import standardize_text
|
from utils import standardize_text
|
||||||
|
@ -7638,6 +7639,16 @@ def _test_reply_language(base_dir: str) -> None:
|
||||||
assert not get_reply_language(base_dir, post_json_object)
|
assert not get_reply_language(base_dir, post_json_object)
|
||||||
|
|
||||||
|
|
||||||
|
def _test_is_arabic() -> None:
    """Checks arabic detection on mixed, english-only and empty text
    """
    print('is_arabic')
    # text paired with whether it is expected to be detected as arabic
    expectations = (
        ("Some English. هذا نص عربي", True),
        ("Some English", False),
        ("", False)
    )
    for test, expected in expectations:
        if expected:
            assert is_arabic(test)
        else:
            assert not is_arabic(test)
|
||||||
|
|
||||||
|
|
||||||
def run_all_tests():
|
def run_all_tests():
|
||||||
base_dir = os.getcwd()
|
base_dir = os.getcwd()
|
||||||
print('Running tests...')
|
print('Running tests...')
|
||||||
|
@ -7655,6 +7666,7 @@ def run_all_tests():
|
||||||
_test_checkbox_names()
|
_test_checkbox_names()
|
||||||
_test_thread_functions()
|
_test_thread_functions()
|
||||||
_test_functions()
|
_test_functions()
|
||||||
|
_test_is_arabic()
|
||||||
_test_reply_language(base_dir)
|
_test_reply_language(base_dir)
|
||||||
_test_emoji_in_actor_name(base_dir)
|
_test_emoji_in_actor_name(base_dir)
|
||||||
_test_uninvert()
|
_test_uninvert()
|
||||||
|
|
28
utils.py
28
utils.py
|
@ -219,6 +219,23 @@ def get_content_from_post(post_json_object: {}, system_language: str,
|
||||||
return standardize_text(content)
|
return standardize_text(content)
|
||||||
|
|
||||||
|
|
||||||
|
def is_arabic(content: str) -> bool:
    """Returns true if a significant part of the given text is arabic

    Every run of characters outside of the arabic unicode ranges is
    collapsed into a single space. The text is considered to be arabic
    if what remains, with surrounding whitespace removed, is longer
    than a third of the original content.

    ASCII digits are not counted as arabic, otherwise dates, times or
    phone numbers within text in other languages would be detected
    as arabic. Arabic-indic digits (u0660-u0669) are still counted,
    because they fall inside the main arabic range.

    Returns False for empty or non-string content.
    """
    if not content or not isinstance(content, str):
        return False
    # arabic, arabic supplement, presentation forms A and
    # presentation forms B
    non_arabic = re.compile(r'[^\u0600-\u06ff\u0750-\u077f'
                            r'\ufb50-\ufbc1\ufbd3-\ufd3f'
                            r'\ufd50-\ufd8f\ufd92-\ufdc7'
                            r'\ufdf0-\ufdfd\ufe70-\ufefc]+')
    arabic_text = non_arabic.sub(' ', content).strip()
    # more than a third of the content
    return len(arabic_text) > len(content) / 3
|
||||||
|
|
||||||
|
|
||||||
def get_language_from_post(post_json_object: {}, system_language: str,
|
def get_language_from_post(post_json_object: {}, system_language: str,
|
||||||
languages_understood: [],
|
languages_understood: [],
|
||||||
content_type: str = "content") -> str:
|
content_type: str = "content") -> str:
|
||||||
|
@ -236,13 +253,24 @@ def get_language_from_post(post_json_object: {}, system_language: str,
|
||||||
if this_post_json[map_dict].get(system_language):
|
if this_post_json[map_dict].get(system_language):
|
||||||
sys_lang = this_post_json[map_dict][system_language]
|
sys_lang = this_post_json[map_dict][system_language]
|
||||||
if isinstance(sys_lang, str):
|
if isinstance(sys_lang, str):
|
||||||
|
content = this_post_json[map_dict][system_language]
|
||||||
|
if is_arabic(content):
|
||||||
|
return 'ar'
|
||||||
return system_language
|
return system_language
|
||||||
else:
|
else:
|
||||||
# is there a contentMap/summaryMap entry for one of
|
# is there a contentMap/summaryMap entry for one of
|
||||||
# the understood languages?
|
# the understood languages?
|
||||||
for lang in languages_understood:
|
for lang in languages_understood:
|
||||||
if this_post_json[map_dict].get(lang):
|
if this_post_json[map_dict].get(lang):
|
||||||
|
content = this_post_json[map_dict][lang]
|
||||||
|
if is_arabic(content):
|
||||||
|
return 'ar'
|
||||||
return lang
|
return lang
|
||||||
|
else:
|
||||||
|
if isinstance(this_post_json[content_type], str):
|
||||||
|
content = this_post_json[content_type]
|
||||||
|
if is_arabic(content):
|
||||||
|
return 'ar'
|
||||||
return system_language
|
return system_language
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue