Handling of mixed direction post content

main
Bob Mottram 2023-09-20 13:23:45 +01:00
parent d76f69d162
commit b77389400d
5 changed files with 106 additions and 1 deletions

View File

@ -15,6 +15,8 @@ import email.parser
import urllib.parse
from shutil import copyfile
from dateutil.parser import parse
from utils import is_right_to_left_text
from utils import language_right_to_left
from utils import binary_is_image
from utils import get_content_from_post
from utils import get_full_domain
@ -2195,3 +2197,29 @@ def add_name_emojis_to_tags(base_dir: str, http_prefix: str,
if updated:
new_tag['updated'] = updated
actor_json['tag'].append(new_tag)
def format_mixed_right_to_left(content: str,
                               language: str) -> str:
    """Adds RTL direction formatting for non-RTL language
    eg. where some paragraphs are English and others are Arabic

    content is the html of the post and language is the language
    code used to display it. Each complete <p>...</p> paragraph whose
    text is detected as right to left gets its contents wrapped in
    <div dir="rtl">. If nothing needed wrapping then the original
    content is returned unchanged.
    """
    # for a RTL display language no per-paragraph markup is added
    if language_right_to_left(language):
        return content
    sections = content.split('<p>')
    result_parts = []
    changed = False
    for index, section in enumerate(sections):
        if '</p>' not in section:
            # Not a complete paragraph (text before the first <p>,
            # or an unclosed <p>). Keep it verbatim, rather than
            # dropping it, so that no post content is lost.
            if index > 0:
                # restore the separator consumed by split()
                result_parts.append('<p>')
            result_parts.append(section)
            continue
        text_html = '<p>' + section
        # NOTE(review): remove_html presumably strips the markup so that
        # only the visible text is tested for direction - confirm
        text_plain = remove_html(text_html)
        if is_right_to_left_text(text_plain):
            # only the first open/close tag belongs to this paragraph
            text_html = text_html.replace('<p>', '<p><div dir="rtl">', 1)
            text_html = text_html.replace('</p>', '</div></p>', 1)
            changed = True
        result_parts.append(text_html)
    if not changed:
        return content
    return ''.join(result_parts)

View File

@ -381,7 +381,7 @@ def get_reply_language(base_dir: str,
post_obj = post_json_object['object']
if not post_obj.get('contentMap'):
return None
for lang, content in post_obj['contentMap'].items():
for lang, _ in post_obj['contentMap'].items():
lang_filename = base_dir + '/translations/' + lang + '.json'
if not os.path.isfile(lang_filename):
continue

View File

@ -56,6 +56,7 @@ from follow import clear_followers
from follow import send_follow_request_via_server
from follow import send_unfollow_request_via_server
from siteactive import site_is_active
from utils import is_right_to_left_text
from utils import remove_markup_tag
from utils import remove_style_within_html
from utils import html_tag_has_closing
@ -142,6 +143,7 @@ from inbox import valid_inbox
from inbox import valid_inbox_filenames
from inbox import cache_svg_images
from categories import guess_hashtag_category
from content import format_mixed_right_to_left
from content import replace_remote_hashtags
from content import add_name_emojis_to_tags
from content import combine_textarea_lines
@ -8090,6 +8092,63 @@ def _test_remove_tag() -> None:
assert result == 'This is a test<br>something<br>again'
def _test_is_right_to_left() -> None:
    """Checks right to left detection against sample sentences
    written in a variety of languages
    """
    print('is_right_to_left')
    # (sample text, whether it should be detected as right to left)
    samples = (
        ('This is a test', False),
        # arabic
        ('هذا اختبار', True),
        ('Das ist ein Test', False),
        # persian
        ('این یک امتحان است', True),
        # chinese
        ('这是一个测试', False),
        # hebrew
        ('זה מבחן', True),
        # yiddish
        ('דאָס איז אַ פּראָבע', True),
    )
    for sample, expected_rtl in samples:
        if expected_rtl:
            assert is_right_to_left_text(sample)
        else:
            assert not is_right_to_left_text(sample)
def _test_format_mixed_rtl() -> None:
    """Checks that right to left paragraphs within a post get wrapped
    with direction markup when the display language is left to right,
    and that content is otherwise returned unaltered
    """
    print('format_mixed_rtl')

    # arabic paragraph between two english ones
    mixed = ''.join(('<p>This is some English</p>',
                     '<p>هذه عربية</p>',
                     '<p>And more English</p>'))
    mixed_expected = ''.join(('<p>This is some English</p>',
                              '<p><div dir="rtl">هذه عربية</div></p>',
                              '<p>And more English</p>'))
    assert format_mixed_right_to_left(mixed, 'en') == mixed_expected

    # english only, with and without paragraph markup
    english_html = '<p>This is some only English</p>'
    assert format_mixed_right_to_left(english_html, 'en') == english_html

    english_plain = 'This is some only English without markup'
    assert format_mixed_right_to_left(english_plain, 'en') == english_plain

    # arabic only, shown with a left to right display language
    arabic_html = '<p>هذا عربي فقط</p>'
    arabic_expected = '<p><div dir="rtl">هذا عربي فقط</div></p>'
    assert format_mixed_right_to_left(arabic_html, 'en') == arabic_expected

    # arabic only, shown with a right to left display language
    assert format_mixed_right_to_left(arabic_html, 'ar') == arabic_html
def run_all_tests():
base_dir = os.getcwd()
print('Running tests...')
@ -8107,6 +8166,8 @@ def run_all_tests():
_test_checkbox_names()
_test_thread_functions()
_test_functions()
_test_is_right_to_left()
_test_format_mixed_rtl()
_test_remove_tag()
_test_featured_tags()
_test_xor_hashes()

View File

@ -4542,6 +4542,19 @@ def language_right_to_left(language: str) -> bool:
return False
def is_right_to_left_text(text: str) -> bool:
    """Returns true if more than half of the characters within the
    given text belong to one of these right to left ranges:
    Persian U+0600 to U+06FF
    Arabic U+0627 to U+064A
    Hebrew/Yiddish U+0590 to U+05FF and U+FB2A to U+FB4E
    Every character of the text, including spaces and punctuation,
    counts towards the total.
    """
    rtl_chars = re.compile('[\u0627-\u064a]|[\u0600-\u06FF]|' +
                           '[\u0590-\u05FF\uFB2A-\uFB4E]')
    # each match is a single right to left character
    rtl_count = sum(1 for _ in rtl_chars.finditer(text))
    return rtl_count > len(text) / 2
def binary_is_image(filename: str, media_binary) -> bool:
"""Returns true if the given file binary data contains an image
"""

View File

@ -71,6 +71,7 @@ from utils import acct_dir
from utils import local_actor_url
from utils import is_unlisted_post
from utils import language_right_to_left
from content import format_mixed_right_to_left
from content import replace_remote_hashtags
from content import detect_dogwhistles
from content import create_edits_html
@ -2720,6 +2721,8 @@ def individual_post_as_html(signing_priv_key_pem: str,
switch_words(base_dir, nickname, domain, object_content)
object_content = html_replace_email_quote(object_content)
object_content = html_replace_quote_marks(object_content)
object_content = \
format_mixed_right_to_left(object_content, system_language)
# append any edits
object_content += edits_str
else: