Fix for removing long words

main
Bob Mottram 2024-09-21 21:52:13 +01:00
parent 29d4f68fd5
commit bdc52a1fce
2 changed files with 75 additions and 0 deletions

View File

@@ -1088,6 +1088,16 @@ def remove_long_words(content: str, max_word_length: int,
if is_pgp_encrypted(content) or contains_pgp_public_key(content):
return content
content = replace_content_duplicates(content)
non_html_list = False
if '\n\n' in content and '<p>' not in content:
content = '<p>' + content.replace('\n\n', '</p> <p>') + '</p>'
non_html_list = True
non_html_list2 = False
if '\n' in content and '<p>' not in content:
content = '<p>' + content.replace('\n', '</p> <p>') + '</p>'
non_html_list2 = True
if ' ' not in content and '</p><p>' not in content:
# handle a single very long string with no spaces
content_str = content.replace('<p>', '').replace(r'<\p>', '')
@@ -1166,6 +1176,16 @@ def remove_long_words(content: str, max_word_length: int,
if not content.endswith('</p>'):
content = content.strip() + '</p>'
content = content.replace('<p> </p>', '<p></p>')
if non_html_list:
content = content.replace('</p> <p>', '\n\n')
content = content.replace('<p>', '')
content = content.replace('</p>', '')
if non_html_list2:
content = content.replace('</p> <p>', '\n')
content = content.replace('<p>', '')
content = content.replace('</p>', '')
content = content.replace('</p> <p>', '</p><p>')
return content

View File

@@ -8855,6 +8855,61 @@ def _test_uninvert2():
def _test_check_individual_post_content():
print('check_individual_post_content')
content = "Unenshitification?\n\n" + \
"Counter-enshitification?\n\n" + \
"Anti-enshitification?"
content2 = remove_style_within_html(content)
if content2 != content:
print(content)
print(content2)
assert content2 == content
content3 = remove_long_words(content, 40, [])
if content3 != content:
print(content)
print(content3)
assert content3 == content
content4 = remove_text_formatting(content, False)
if content4 != content:
print(content)
print(content4)
assert content4 == content
content5 = limit_repeated_words(content, 6)
if content5 != content:
print(content)
print(content5)
assert content5 == content
content = "Unenshitification?\n" + \
"Counter-enshitification?\n" + \
"Anti-enshitification?"
content2 = remove_style_within_html(content)
if content2 != content:
print(content)
print(content2)
assert content2 == content
content3 = remove_long_words(content, 40, [])
if content3 != content:
print(content)
print(content3)
assert content3 == content
content4 = remove_text_formatting(content, False)
if content4 != content:
print(content)
print(content4)
assert content4 == content
content5 = limit_repeated_words(content, 6)
if content5 != content:
print(content)
print(content5)
assert content5 == content
content = "<p>Unenshitification?</p><p></p><p>" + \
"Counter-enshitification?</p><p></p>" + \
"<p>Anti-enshitification?</p><p></p><p>Nonshitification?</p>"