mirror of https://gitlab.com/bashrc2/epicyon
Automatically detect Arabic language
parent
667f4dcd45
commit
08d8650efd
|
@ -26,6 +26,7 @@ from utils import get_status_number
|
|||
from utils import get_full_domain
|
||||
from utils import text_in_file
|
||||
from utils import remove_eol
|
||||
from utils import is_arabic
|
||||
from filters import is_filtered
|
||||
from context import get_individual_post_context
|
||||
from session import get_method
|
||||
|
@ -288,6 +289,9 @@ def get_todays_events(base_dir: str, nickname: str, domain: str,
|
|||
if content:
|
||||
if not _event_text_match(content, text_match):
|
||||
continue
|
||||
if content_language != 'ar':
|
||||
if is_arabic(content):
|
||||
content_language = 'ar'
|
||||
|
||||
public_event = is_public_post(post_json_object)
|
||||
|
||||
|
|
12
tests.py
12
tests.py
|
@ -54,6 +54,7 @@ from follow import clear_followers
|
|||
from follow import send_follow_request_via_server
|
||||
from follow import send_unfollow_request_via_server
|
||||
from siteactive import site_is_active
|
||||
from utils import is_arabic
|
||||
from utils import remove_inverted_text
|
||||
from utils import remove_square_capitals
|
||||
from utils import standardize_text
|
||||
|
@ -7638,6 +7639,16 @@ def _test_reply_language(base_dir: str) -> None:
|
|||
assert not get_reply_language(base_dir, post_json_object)
|
||||
|
||||
|
||||
def _test_is_arabic() -> None:
    """Checks is_arabic against mixed, english-only and empty text
    """
    print('is_arabic')
    # each entry is (text to be tested, whether arabic should be detected)
    expectations = (
        ("Some English. هذا نص عربي", True),
        ("Some English", False),
        ("", False)
    )
    for sample_text, should_detect in expectations:
        detected = bool(is_arabic(sample_text))
        assert detected == should_detect
|
||||
|
||||
|
||||
def run_all_tests():
|
||||
base_dir = os.getcwd()
|
||||
print('Running tests...')
|
||||
|
@ -7655,6 +7666,7 @@ def run_all_tests():
|
|||
_test_checkbox_names()
|
||||
_test_thread_functions()
|
||||
_test_functions()
|
||||
_test_is_arabic()
|
||||
_test_reply_language(base_dir)
|
||||
_test_emoji_in_actor_name(base_dir)
|
||||
_test_uninvert()
|
||||
|
|
28
utils.py
28
utils.py
|
@ -219,6 +219,23 @@ def get_content_from_post(post_json_object: {}, system_language: str,
|
|||
return standardize_text(content)
|
||||
|
||||
|
||||
def is_arabic(content: str) -> bool:
    """Returns true if the given text is predominantly arabic

    Counts the characters of content which fall within the arabic
    unicode ranges and returns True when they make up more than a
    third of the total number of characters.
    ASCII digits and the spaces between words are not counted as
    arabic, so that numeric or digit-heavy text in other languages
    is not misdetected.
    Empty or None content returns False.
    """
    if not content:
        return False
    # every individual character within the arabic ranges
    arabic_chars = re.findall(r'[\u0600-\u06ff\u0750-\u077f'
                              r'\ufb50-\ufbc1\ufbd3-\ufd3f'
                              r'\ufd50-\ufd8f\ufdf0-\ufdfd'
                              r'\ufe70-\ufefc]', content)
    # more than a third of the content
    return len(arabic_chars) > len(content) / 3
|
||||
|
||||
|
||||
def get_language_from_post(post_json_object: {}, system_language: str,
|
||||
languages_understood: [],
|
||||
content_type: str = "content") -> str:
|
||||
|
@ -236,13 +253,24 @@ def get_language_from_post(post_json_object: {}, system_language: str,
|
|||
if this_post_json[map_dict].get(system_language):
|
||||
sys_lang = this_post_json[map_dict][system_language]
|
||||
if isinstance(sys_lang, str):
|
||||
content = this_post_json[map_dict][system_language]
|
||||
if is_arabic(content):
|
||||
return 'ar'
|
||||
return system_language
|
||||
else:
|
||||
# is there a contentMap/summaryMap entry for one of
|
||||
# the understood languages?
|
||||
for lang in languages_understood:
|
||||
if this_post_json[map_dict].get(lang):
|
||||
content = this_post_json[map_dict][lang]
|
||||
if is_arabic(content):
|
||||
return 'ar'
|
||||
return lang
|
||||
else:
|
||||
if isinstance(this_post_json[content_type], str):
|
||||
content = this_post_json[content_type]
|
||||
if is_arabic(content):
|
||||
return 'ar'
|
||||
return system_language
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue