Another fix for removing long words

2024-02-07 10:48:46 +00:00 · 2024-02-07 10:48:46 +00:00 · b8372aa02e
parent 3e0e6d6cde
commit b8372aa02e
2 changed files with 33 additions and 2 deletions
--- a/content.py
+++ b/content.py
@ -1049,7 +1049,7 @@ def remove_long_words(content: str, max_word_length: int,
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    content = replace_content_duplicates(content)
-    if ' ' not in content and '<p></p>' not in content:
+    if ' ' not in content and '</p><p>' not in content:
        # handle a single very long string with no spaces
        content_str = content.replace('<p>', '').replace(r'<\p>', '')
        if '://' not in content_str:
@ -1068,6 +1068,7 @@ def remove_long_words(content: str, max_word_length: int,
                if word_str not in long_words_list:
                    long_words_list.append(word_str)
    for word_str in long_words_list:
        original_word_str = word_str
        if word_str.startswith('<p>'):
            word_str = word_str.replace('<p>', '')
        if word_str.startswith('<'):
@ -1112,8 +1113,12 @@ def remove_long_words(content: str, max_word_length: int,
        if '/' in word_str:
            continue
        if len(word_str[max_word_length:]) < max_word_length:
            end_of_line_char = '\n'
            if '<br>' in original_word_str:
                end_of_line_char = ''
            content = content.replace(word_str,
-                                      word_str[:max_word_length] + '\n' +
+                                      word_str[:max_word_length] +
                                      end_of_line_char +
                                      word_str[max_word_length:])
        else:
            content = content.replace(word_str,
--- a/tests.py
+++ b/tests.py
@ -8644,6 +8644,32 @@ def _test_check_individual_post_content():
        print(content5)
    assert content5 == content
    content = "<p>D-A-N-G-E-R-O-U-S<br>A-N-I-M-A-L</p>" + \
        "<p>D-A-N-G-E-R-O-U-S<br>A-N-I-M-A-L</p>"
    content2 = remove_style_within_html(content)
    if content2 != content:
        print(content)
        print(content2)
    assert content2 == content
    content3 = remove_long_words(content, 40, [])
    if content3 != content:
        print(content)
        print(content3)
    assert content3 == content
    content4 = remove_text_formatting(content, False)
    if content4 != content:
        print(content)
        print(content4)
    assert content4 == content
    content5 = limit_repeated_words(content, 6)
    if content5 != content:
        print(content)
        print(content5)
    assert content5 == content
 def run_all_tests():
    base_dir = os.getcwd()