Fix for removing long words

2024-09-21 21:52:13 +01:00 · 2024-09-21 21:52:13 +01:00 · bdc52a1fce
parent 29d4f68fd5
commit bdc52a1fce
2 changed files with 75 additions and 0 deletions
--- a/content.py
+++ b/content.py
@@ -1088,6 +1088,16 @@ def remove_long_words(content: str, max_word_length: int,
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    content = replace_content_duplicates(content)
+
+    non_html_list = False
+    if '\n\n' in content and '<p>' not in content:
+        content = '<p>' + content.replace('\n\n', '</p> <p>') + '</p>'
+        non_html_list = True
+    non_html_list2 = False
+    if '\n' in content and '<p>' not in content:
+        content = '<p>' + content.replace('\n', '</p> <p>') + '</p>'
+        non_html_list2 = True
+
    if ' ' not in content and '</p><p>' not in content:
        # handle a single very long string with no spaces
        content_str = content.replace('<p>', '').replace(r'<\p>', '')
@@ -1166,6 +1176,16 @@ def remove_long_words(content: str, max_word_length: int,
        if not content.endswith('</p>'):
            content = content.strip() + '</p>'
    content = content.replace('<p> </p>', '<p></p>')
+    if non_html_list:
+        content = content.replace('</p> <p>', '\n\n')
+        content = content.replace('<p>', '')
+        content = content.replace('</p>', '')
+    if non_html_list2:
+        content = content.replace('</p> <p>', '\n')
+        content = content.replace('<p>', '')
+        content = content.replace('</p>', '')
+    content = content.replace('</p> <p>', '</p><p>')
+
    return content


--- a/tests.py
+++ b/tests.py
@@ -8855,6 +8855,61 @@ def _test_uninvert2():

 def _test_check_individual_post_content():
    print('check_individual_post_content')
+
+    content = "Unenshitification?\n\n" + \
+        "Counter-enshitification?\n\n" + \
+        "Anti-enshitification?"
+    content2 = remove_style_within_html(content)
+    if content2 != content:
+        print(content)
+        print(content2)
+    assert content2 == content
+
+    content3 = remove_long_words(content, 40, [])
+    if content3 != content:
+        print(content)
+        print(content3)
+    assert content3 == content
+
+    content4 = remove_text_formatting(content, False)
+    if content4 != content:
+        print(content)
+        print(content4)
+    assert content4 == content
+
+    content5 = limit_repeated_words(content, 6)
+    if content5 != content:
+        print(content)
+        print(content5)
+    assert content5 == content
+
+    content = "Unenshitification?\n" + \
+        "Counter-enshitification?\n" + \
+        "Anti-enshitification?"
+    content2 = remove_style_within_html(content)
+    if content2 != content:
+        print(content)
+        print(content2)
+    assert content2 == content
+
+    content3 = remove_long_words(content, 40, [])
+    if content3 != content:
+        print(content)
+        print(content3)
+    assert content3 == content
+
+    content4 = remove_text_formatting(content, False)
+    if content4 != content:
+        print(content)
+        print(content4)
+    assert content4 == content
+
+    content5 = limit_repeated_words(content, 6)
+    if content5 != content:
+        print(content)
+        print(content5)
+    assert content5 == content
+
    content = "<p>Unenshitification?</p><p></p><p>" + \
        "Counter-enshitification?</p><p></p>" + \
        "<p>Anti-enshitification?</p><p></p><p>Nonshitification?</p>"