Handling of mixed direction post content

2023-09-20 13:23:45 +01:00 · 2023-09-20 13:23:45 +01:00 · b77389400d
parent d76f69d162
commit b77389400d
5 changed files with 106 additions and 1 deletions
--- a/content.py
+++ b/content.py
@ -15,6 +15,8 @@ import email.parser
 import urllib.parse
 from shutil import copyfile
 from dateutil.parser import parse
 from utils import is_right_to_left_text
 from utils import language_right_to_left
 from utils import binary_is_image
 from utils import get_content_from_post
 from utils import get_full_domain
@ -2195,3 +2197,29 @@ def add_name_emojis_to_tags(base_dir: str, http_prefix: str,
        if updated:
            new_tag['updated'] = updated
        actor_json['tag'].append(new_tag)
 def format_mixed_right_to_left(content: str,
                               language: str) -> str:
    """Adds RTL direction formatting for non-RTL language
    eg. where some paragraphs are English and others are Arabic

    content is the html of the post and language is the system
    language code. Each <p>...</p> paragraph whose plain text is
    detected as right to left has its inner content wrapped in
    <div dir="rtl">. Any text outside of a closed paragraph is
    passed through unaltered, rather than being lost.
    Returns the original content unchanged if the language is
    itself RTL or if no RTL paragraph was found.
    """
    # the language is itself RTL, so no per-paragraph marking is applied
    if language_right_to_left(language):
        return content
    segments = content.split('<p>')
    # the first segment is whatever precedes the first <p>, so it is
    # kept as it is and never given a paragraph tag of its own
    pieces = [segments[0]]
    changed = False
    for segment in segments[1:]:
        # restore the tag which was consumed by the split
        text_html = '<p>' + segment
        # only closed paragraphs are candidates for wrapping; an
        # unclosed one is still kept so that no content is dropped
        if '</p>' in segment:
            text_plain = remove_html(text_html)
            if is_right_to_left_text(text_plain):
                text_html = \
                    text_html.replace('<p>', '<p><div dir="rtl">', 1)
                text_html = text_html.replace('</p>', '</div></p>', 1)
                changed = True
        pieces.append(text_html)
    if not changed:
        return content
    return ''.join(pieces)
--- a/languages.py
+++ b/languages.py
@ -381,7 +381,7 @@ def get_reply_language(base_dir: str,
        post_obj = post_json_object['object']
    if not post_obj.get('contentMap'):
        return None
-    for lang, content in post_obj['contentMap'].items():
+    for lang, _ in post_obj['contentMap'].items():
        lang_filename = base_dir + '/translations/' + lang + '.json'
        if not os.path.isfile(lang_filename):
            continue
--- a/tests.py
+++ b/tests.py
@ -56,6 +56,7 @@ from follow import clear_followers
 from follow import send_follow_request_via_server
 from follow import send_unfollow_request_via_server
 from siteactive import site_is_active
 from utils import is_right_to_left_text
 from utils import remove_markup_tag
 from utils import remove_style_within_html
 from utils import html_tag_has_closing
@ -142,6 +143,7 @@ from inbox import valid_inbox
 from inbox import valid_inbox_filenames
 from inbox import cache_svg_images
 from categories import guess_hashtag_category
 from content import format_mixed_right_to_left
 from content import replace_remote_hashtags
 from content import add_name_emojis_to_tags
 from content import combine_textarea_lines
@ -8090,6 +8092,63 @@ def _test_remove_tag() -> None:
    assert result == 'This is a test<br>something<br>again'
 def _test_is_right_to_left() -> None:
    """Checks right to left detection against short sample phrases
    in a number of languages
    """
    print('is_right_to_left')
    # each sample is paired with whether it should be detected as RTL
    samples = (
        ('This is a test', False),
        # arabic
        ('هذا اختبار', True),
        ('Das ist ein Test', False),
        # persian
        ('این یک امتحان است', True),
        # chinese
        ('这是一个测试', False),
        # hebrew
        ('זה מבחן', True),
        # yiddish
        ('דאָס איז אַ פּראָבע', True),
    )
    for sample, expect_rtl in samples:
        if expect_rtl:
            assert is_right_to_left_text(sample)
        else:
            assert not is_right_to_left_text(sample)
 def _test_format_mixed_rtl() -> None:
    """Checks that RTL paragraphs within a post get wrapped with a
    dir="rtl" div when the system language is not RTL, and that
    other content is returned unchanged
    """
    print('format_mixed_rtl')

    # an arabic paragraph between two english ones
    mixed = ('<p>This is some English</p>'
             '<p>هذه عربية</p>'
             '<p>And more English</p>')
    mixed_expected = ('<p>This is some English</p>'
                      '<p><div dir="rtl">هذه عربية</div></p>'
                      '<p>And more English</p>')
    formatted = format_mixed_right_to_left(mixed, 'en')
    assert formatted == mixed_expected

    # english only, with paragraph markup
    english_html = '<p>This is some only English</p>'
    formatted = format_mixed_right_to_left(english_html, 'en')
    assert formatted == english_html

    # english only, without any markup
    english_plain = 'This is some only English without markup'
    formatted = format_mixed_right_to_left(english_plain, 'en')
    assert formatted == english_plain

    # arabic only, viewed with an english system language
    arabic_html = '<p>هذا عربي فقط</p>'
    formatted = format_mixed_right_to_left(arabic_html, 'en')
    arabic_expected = '<p><div dir="rtl">هذا عربي فقط</div></p>'
    assert formatted == arabic_expected

    # arabic only, viewed with an arabic system language
    formatted = format_mixed_right_to_left(arabic_html, 'ar')
    assert formatted == arabic_html
 def run_all_tests():
    base_dir = os.getcwd()
    print('Running tests...')
@ -8107,6 +8166,8 @@ def run_all_tests():
    _test_checkbox_names()
    _test_thread_functions()
    _test_functions()
    _test_is_right_to_left()
    _test_format_mixed_rtl()
    _test_remove_tag()
    _test_featured_tags()
    _test_xor_hashes()
--- a/utils.py
+++ b/utils.py
@ -4542,6 +4542,19 @@ def language_right_to_left(language: str) -> bool:
    return False
 def is_right_to_left_text(text: str) -> bool:
    """Returns true if more than half of the characters within the
    given text fall inside one of these right to left unicode ranges
    Arabic \u0627-\u064a
    Persian \u0600-\u06FF
    Hebrew/Yiddish \u0590-\u05FF\uFB2A-\uFB4E
    """
    rtl_chars = re.compile('[\u0627-\u064a]|[\u0600-\u06FF]|'
                           '[\u0590-\u05FF\uFB2A-\uFB4E]')
    # every match is a single character, so counting the matches
    # gives the number of RTL characters
    rtl_count = sum(1 for _ in rtl_chars.finditer(text))
    # strictly more than half of all characters, spaces included
    return rtl_count * 2 > len(text)
 def binary_is_image(filename: str, media_binary) -> bool:
    """Returns true if the given file binary data contains an image
    """
--- a/webapp_post.py
+++ b/webapp_post.py
@ -71,6 +71,7 @@ from utils import acct_dir
 from utils import local_actor_url
 from utils import is_unlisted_post
 from utils import language_right_to_left
 from content import format_mixed_right_to_left
 from content import replace_remote_hashtags
 from content import detect_dogwhistles
 from content import create_edits_html
@ -2720,6 +2721,8 @@ def individual_post_as_html(signing_priv_key_pem: str,
                switch_words(base_dir, nickname, domain, object_content)
            object_content = html_replace_email_quote(object_content)
            object_content = html_replace_quote_marks(object_content)
            object_content = \
                format_mixed_right_to_left(object_content, system_language)
            # append any edits
            object_content += edits_str
        else: