diff --git a/content.py b/content.py index 5d69dfff4..52eb33391 100644 --- a/content.py +++ b/content.py @@ -15,6 +15,8 @@ import email.parser import urllib.parse from shutil import copyfile from dateutil.parser import parse +from utils import is_right_to_left_text +from utils import language_right_to_left from utils import binary_is_image from utils import get_content_from_post from utils import get_full_domain @@ -2195,3 +2197,29 @@ def add_name_emojis_to_tags(base_dir: str, http_prefix: str, if updated: new_tag['updated'] = updated actor_json['tag'].append(new_tag) + + +def format_mixed_right_to_left(content: str, + language: str) -> str: + """Adds RTL direction formatting for non-RTL language + eg. where some paragraphs are English and others are Arabic + """ + # not a RTL language + if language_right_to_left(language): + return content + paragraphs = content.split('

') + result = '' + changed = False + for text_html in paragraphs: + if '

' not in text_html: + continue + text_html = '

' + text_html + text_plain = remove_html(text_html) + if is_right_to_left_text(text_plain): + text_html = text_html.replace('

', '

', 1) + text_html = text_html.replace('

', '

', 1) + changed = True + result += text_html + if not changed: + return content + return result diff --git a/languages.py b/languages.py index 668cddba2..12ce8cc72 100644 --- a/languages.py +++ b/languages.py @@ -381,7 +381,7 @@ def get_reply_language(base_dir: str, post_obj = post_json_object['object'] if not post_obj.get('contentMap'): return None - for lang, content in post_obj['contentMap'].items(): + for lang, _ in post_obj['contentMap'].items(): lang_filename = base_dir + '/translations/' + lang + '.json' if not os.path.isfile(lang_filename): continue diff --git a/tests.py b/tests.py index 8a74c3814..df9fca23a 100644 --- a/tests.py +++ b/tests.py @@ -56,6 +56,7 @@ from follow import clear_followers from follow import send_follow_request_via_server from follow import send_unfollow_request_via_server from siteactive import site_is_active +from utils import is_right_to_left_text from utils import remove_markup_tag from utils import remove_style_within_html from utils import html_tag_has_closing @@ -142,6 +143,7 @@ from inbox import valid_inbox from inbox import valid_inbox_filenames from inbox import cache_svg_images from categories import guess_hashtag_category +from content import format_mixed_right_to_left from content import replace_remote_hashtags from content import add_name_emojis_to_tags from content import combine_textarea_lines @@ -8090,6 +8092,63 @@ def _test_remove_tag() -> None: assert result == 'This is a test
something
again' +def _test_is_right_to_left() -> None: + print('is_right_to_left') + text = 'This is a test' + assert not is_right_to_left_text(text) + + # arabic + text = 'هذا اختبار' + assert is_right_to_left_text(text) + + text = 'Das ist ein Test' + assert not is_right_to_left_text(text) + + # persian + text = 'این یک امتحان است' + assert is_right_to_left_text(text) + + # chinese + text = '这是一个测试' + assert not is_right_to_left_text(text) + + # hebrew + text = 'זה מבחן' + assert is_right_to_left_text(text) + + # yiddish + text = 'דאָס איז אַ פּראָבע' + assert is_right_to_left_text(text) + + +def _test_format_mixed_rtl() -> None: + print('format_mixed_rtl') + content = '

This is some English

' + \ + '

هذه عربية

' + \ + '

And more English

' + result = format_mixed_right_to_left(content, 'en') + expected = '

This is some English

' + \ + '

هذه عربية

' + \ + '

And more English

' + assert result == expected + + content = '

This is some only English

' + result = format_mixed_right_to_left(content, 'en') + assert result == content + + content = 'This is some only English without markup' + result = format_mixed_right_to_left(content, 'en') + assert result == content + + content = '

هذا عربي فقط

' + result = format_mixed_right_to_left(content, 'en') + expected = '

هذا عربي فقط

' + assert result == expected + + result = format_mixed_right_to_left(content, 'ar') + assert result == content + + def run_all_tests(): base_dir = os.getcwd() print('Running tests...') @@ -8107,6 +8166,8 @@ def run_all_tests(): _test_checkbox_names() _test_thread_functions() _test_functions() + _test_is_right_to_left() + _test_format_mixed_rtl() _test_remove_tag() _test_featured_tags() _test_xor_hashes() diff --git a/utils.py b/utils.py index ce083f64f..60c6b0557 100644 --- a/utils.py +++ b/utils.py @@ -4542,6 +4542,19 @@ def language_right_to_left(language: str) -> bool: return False +def is_right_to_left_text(text: str) -> bool: + """Is the given text right to left? + Persian \u0600-\u06FF + Arabic \u0627-\u064a + Hebrew/Yiddish \u0590-\u05FF\uFB2A-\uFB4E + """ + unicode_str = '[\u0627-\u064a]|[\u0600-\u06FF]|' + \ + '[\u0590-\u05FF\uFB2A-\uFB4E]' + pattern = re.compile(unicode_str) + + return len(re.findall(pattern, text)) > (len(text)/2) + + def binary_is_image(filename: str, media_binary) -> bool: """Returns true if the given file binary data contains an image """ diff --git a/webapp_post.py b/webapp_post.py index e06d14950..87ed82543 100644 --- a/webapp_post.py +++ b/webapp_post.py @@ -71,6 +71,7 @@ from utils import acct_dir from utils import local_actor_url from utils import is_unlisted_post from utils import language_right_to_left +from content import format_mixed_right_to_left from content import replace_remote_hashtags from content import detect_dogwhistles from content import create_edits_html @@ -2720,6 +2721,8 @@ def individual_post_as_html(signing_priv_key_pem: str, switch_words(base_dir, nickname, domain, object_content) object_content = html_replace_email_quote(object_content) object_content = html_replace_quote_marks(object_content) + object_content = \ + format_mixed_right_to_left(object_content, system_language) # append any edits object_content += edits_str else: