From e6976bbcccaeeb215d6f152238a0b696b7d35ed1 Mon Sep 17 00:00:00 2001 From: bashrc Date: Fri, 13 Feb 2026 22:04:40 +0000 Subject: [PATCH] Remove web link concatenations --- content.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/content.py b/content.py index 12b8ba882..e289b8a05 100644 --- a/content.py +++ b/content.py @@ -798,6 +798,25 @@ def add_web_links(content: str) -> str: return content +def _web_link_concatenations(html_text: str) -> str: + """If any web links are concatenated with other words then undo them + """ + if 'https://' not in html_text: + return html_text + words = html_text.split(' ') + new_html_text = '' + for wrd in words: + if 'https://' not in wrd: + new_html_text += wrd + ' ' + continue + if wrd.startswith('https://'): + new_html_text += wrd + ' ' + continue + new_html_text += wrd.split('https://')[0] + ' ' + new_html_text += 'https://' + wrd.split('https://')[1] + ' ' + return new_html_text.strip() + + def safe_web_text(arbitrary_html: str) -> str: """Turns arbitrary html into something safe. So if the arbitrary html contains attack scripts those will be removed @@ -810,6 +829,7 @@ def safe_web_text(arbitrary_html: str) -> str: remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]', '__') for remchar in remove_chars: safe_text = safe_text.replace(remchar, '') + safe_text = _web_link_concatenations(safe_text) # recreate any url links safely return add_web_links(safe_text)