diff --git a/content.py b/content.py index 2394e21a1..2135006b8 100644 --- a/content.py +++ b/content.py @@ -486,6 +486,22 @@ def add_web_links(content: str) -> str: return content +def safe_web_text(arbitrary_html: str) -> str: + """Turns arbitrary html into something safe. + So if the arbitrary html contains attack scripts those will be removed + """ + # first remove the markup, so that we have something safe + safe_text = remove_html(arbitrary_html) + if not safe_text: + return '' + # remove any spurious characters found in podcast descriptions + remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]') + for remchar in remove_chars: + safe_text = safe_text.replace(remchar, '') + # recreate any url links safely + return add_web_links(safe_text) + + def _add_hash_tags(word_str: str, http_prefix: str, domain: str, replace_hashtags: {}, post_hashtags: {}) -> bool: """Detects hashtags and adds them to the replacements dict diff --git a/tests.py b/tests.py index ea7349a6c..07baf487c 100644 --- a/tests.py +++ b/tests.py @@ -128,6 +128,7 @@ from inbox import json_post_allows_comments from inbox import valid_inbox from inbox import valid_inbox_filenames from categories import guess_hashtag_category +from content import safe_web_text from content import words_similarity from content import get_price_from_string from content import limit_repeated_words @@ -6488,6 +6489,30 @@ def _test_get_link_from_rss_item() -> None: assert link.startswith('https://test.link/creativecommons') +def _test_safe_webtext() -> None: + print('test_safe_webtext') + web_text = '

Some text including a link https://some.site/some-path

' + expected_text = 'Some text including a link ' + \ + '' not in safe_text + assert '

' not in safe_text + + web_text = 'Some text with ' + expected_text = 'Some text with some script' + safe_text = safe_web_text(web_text) + if expected_text != safe_text: + print('Original html: ' + web_text) + print('Expected html: ' + expected_text) + print('Actual html: ' + safe_text) + assert expected_text == safe_text + + def run_all_tests(): base_dir = os.getcwd() print('Running tests...') @@ -6504,6 +6529,7 @@ def run_all_tests(): 'message_json', 'liked_post_json']) _test_checkbox_names() _test_functions() + _test_safe_webtext() _test_get_link_from_rss_item() _test_xml_podcast_dict() _test_get_actor_from_in_reply_to() diff --git a/webapp_podcast.py b/webapp_podcast.py index 8b12520d6..435400772 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -14,7 +14,7 @@ from shutil import copyfile from utils import get_config_param from utils import remove_html from media import path_is_audio -from content import add_web_links +from content import safe_web_text from webapp_utils import get_broken_link_substitute from webapp_utils import html_header_with_external_style from webapp_utils import html_footer @@ -210,15 +210,8 @@ def html_podcast_episode(css_cache: {}, translate: {}, if newswire_item[4]: podcast_description = \ html.unescape(urllib.parse.unquote_plus(newswire_item[4])) - # Why remove html? Potentially podcast descriptions could contain - # arbitrary html with attack scripts, etc - podcast_description = remove_html(podcast_description) + podcast_description = safe_web_text(podcast_description) if podcast_description: - remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]') - for remchar in remove_chars: - podcast_description = podcast_description.replace(remchar, '') - # recreate any url links safely - podcast_description = add_web_links(podcast_description) podcast_str += '

' + podcast_description + '

\n' # donate button