From bdc52a1fceba0b6ca2301128e6978c85bf771a4a Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 21 Sep 2024 21:52:13 +0100 Subject: [PATCH] Fix for removing long words --- content.py | 20 ++++++++++++++++++++ tests.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/content.py b/content.py index 8d0d7edbb..5fa6e136e 100644 --- a/content.py +++ b/content.py @@ -1088,6 +1088,16 @@ def remove_long_words(content: str, max_word_length: int, if is_pgp_encrypted(content) or contains_pgp_public_key(content): return content content = replace_content_duplicates(content) + + non_html_list = False + if '\n\n' in content and '

' not in content: + content = '

' + content.replace('\n\n', '

') + '

' + non_html_list = True + non_html_list2 = False + if '\n' in content and '

' not in content: + content = '

' + content.replace('\n', '

') + '

' + non_html_list2 = True + if ' ' not in content and '

' not in content: # handle a single very long string with no spaces content_str = content.replace('

', '').replace(r'<\p>', '') @@ -1166,6 +1176,16 @@ def remove_long_words(content: str, max_word_length: int, if not content.endswith('

'): content = content.strip() + '

' content = content.replace('

', '

') + if non_html_list: + content = content.replace('

', '\n\n') + content = content.replace('

', '') + content = content.replace('

', '') + if non_html_list2: + content = content.replace('

', '\n') + content = content.replace('

', '') + content = content.replace('

', '') + content = content.replace('

', '

') + return content diff --git a/tests.py b/tests.py index 1862cac39..8b8e6241f 100644 --- a/tests.py +++ b/tests.py @@ -8855,6 +8855,61 @@ def _test_uninvert2(): def _test_check_individual_post_content(): print('check_individual_post_content') + + content = "Unenshitification?\n\n" + \ + "Counter-enshitification?\n\n" + \ + "Anti-enshitification?" + content2 = remove_style_within_html(content) + if content2 != content: + print(content) + print(content2) + assert content2 == content + + content3 = remove_long_words(content, 40, []) + if content3 != content: + print(content) + print(content3) + assert content3 == content + + content4 = remove_text_formatting(content, False) + if content4 != content: + print(content) + print(content4) + assert content4 == content + + content5 = limit_repeated_words(content, 6) + if content5 != content: + print(content) + print(content5) + assert content5 == content + + content = "Unenshitification?\n" + \ + "Counter-enshitification?\n" + \ + "Anti-enshitification?" + content2 = remove_style_within_html(content) + if content2 != content: + print(content) + print(content2) + assert content2 == content + + content3 = remove_long_words(content, 40, []) + if content3 != content: + print(content) + print(content3) + assert content3 == content + + content4 = remove_text_formatting(content, False) + if content4 != content: + print(content) + print(content4) + assert content4 == content + + content5 = limit_repeated_words(content, 6) + if content5 != content: + print(content) + print(content5) + assert content5 == content + content = "

Unenshitification?

" + \ "Counter-enshitification?

" + \ "

Anti-enshitification?

Nonshitification?

"