From 9a0185ef3ca98d94c12dfc4ddd5ac78b5ef4dcd6 Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@freedombone.net>
Date: Fri, 14 Jan 2022 10:20:37 +0000
Subject: [PATCH] Unit test for safe html

---
 content.py        | 16 ++++++++++++++++
 tests.py          | 26 ++++++++++++++++++++++++++
 webapp_podcast.py | 11 ++---------
 3 files changed, 44 insertions(+), 9 deletions(-)
diff --git a/content.py b/content.py
index 2394e21a1..2135006b8 100644
--- a/content.py
+++ b/content.py
@@ -486,6 +486,22 @@ def add_web_links(content: str) -> str:
     return content
 
 
+def safe_web_text(arbitrary_html: str) -> str:
+    """Returns a safe version of arbitrary html: markup is stripped
+    with remove_html, then url links are recreated by add_web_links
+    """
+    # strip the markup first, leaving only text to work with
+    plain_text = remove_html(arbitrary_html)
+    if not plain_text:
+        return ''
+    # drop spurious character sequences seen in podcast descriptions
+    for unwanted in ('Œ', 'â€', 'ğŸ', '�', ']]'):
+        plain_text = plain_text.replace(unwanted, '')
+    # any urls within the text are turned back into links
+    linked_text = add_web_links(plain_text)
+    return linked_text
+
+
 def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
                    replace_hashtags: {}, post_hashtags: {}) -> bool:
     """Detects hashtags and adds them to the replacements dict
diff --git a/tests.py b/tests.py
index ea7349a6c..07baf487c 100644
--- a/tests.py
+++ b/tests.py
@@ -128,6 +128,7 @@ from inbox import json_post_allows_comments
 from inbox import valid_inbox
 from inbox import valid_inbox_filenames
 from categories import guess_hashtag_category
+from content import safe_web_text
 from content import words_similarity
 from content import get_price_from_string
 from content import limit_repeated_words
@@ -6488,6 +6489,30 @@ def _test_get_link_from_rss_item() -> None:
     assert link.startswith('https://test.link/creativecommons')
 
 
+def _test_safe_webtext() -> None:
+    """Checks that safe_web_text strips tags and recreates url links"""
+    def _show_mismatch(original: str, want: str, actual: str) -> None:
+        print('Original html: ' + original)
+        print('Expected html: ' + want)
+        print('Actual html: ' + actual)
+
+    print('test_safe_webtext')
+    html_in = '<p>Some text including a link https://some.site/some-path</p>'
+    want = 'Some text including a link <a href="https://some.site/some-path"'
+    result = safe_web_text(html_in)
+    if want not in result:
+        _show_mismatch(html_in, want, result)
+    assert want in result
+    assert '<p>' not in result and '</p>' not in result
+
+    html_in = 'Some text with <script>some script</script>'
+    want = 'Some text with some script'
+    result = safe_web_text(html_in)
+    if want != result:
+        _show_mismatch(html_in, want, result)
+    assert want == result
+
+
 def run_all_tests():
     base_dir = os.getcwd()
     print('Running tests...')
@@ -6504,6 +6529,7 @@ def run_all_tests():
                             'message_json', 'liked_post_json'])
     _test_checkbox_names()
     _test_functions()
+    _test_safe_webtext()
     _test_get_link_from_rss_item()
     _test_xml_podcast_dict()
     _test_get_actor_from_in_reply_to()
diff --git a/webapp_podcast.py b/webapp_podcast.py
index 8b12520d6..435400772 100644
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@@ -14,7 +14,7 @@ from shutil import copyfile
 from utils import get_config_param
 from utils import remove_html
 from media import path_is_audio
-from content import add_web_links
+from content import safe_web_text
 from webapp_utils import get_broken_link_substitute
 from webapp_utils import html_header_with_external_style
 from webapp_utils import html_footer
@@ -210,15 +210,8 @@ def html_podcast_episode(css_cache: {}, translate: {},
     if newswire_item[4]:
         podcast_description = \
             html.unescape(urllib.parse.unquote_plus(newswire_item[4]))
-        # Why remove html? Potentially podcast descriptions could contain
-        # arbitrary html with attack scripts, etc
-        podcast_description = remove_html(podcast_description)
+        podcast_description = safe_web_text(podcast_description)
         if podcast_description:
-            remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]')
-            for remchar in remove_chars:
-                podcast_description = podcast_description.replace(remchar, '')
-            # recreate any url links safely
-            podcast_description = add_web_links(podcast_description)
             podcast_str += '<p>' + podcast_description + '</p>\n'
 
     # donate button