Allow pre tag through dangerous markup filter in some cases, then remove it when rendering

main
Bob Mottram 2023-05-18 12:15:18 +01:00
parent 8efb5bedd4
commit 4caa930f67
11 changed files with 115 additions and 38 deletions

View File

@ -267,7 +267,7 @@ def dangerous_css(filename: str, allow_local_network_access: bool) -> bool:
# an attacker can include html inside of the css
# file as a comment and this may then be run from the html
if dangerous_markup(content, allow_local_network_access):
if dangerous_markup(content, allow_local_network_access, []):
return True
return False

View File

@ -5544,7 +5544,7 @@ class PubServer(BaseHTTPRequestHandler):
if fields.get('editedAbout'):
about_str = fields['editedAbout']
if not dangerous_markup(about_str,
allow_local_network_access):
allow_local_network_access, []):
try:
with open(about_filename, 'w+',
encoding='utf-8') as aboutfile:
@ -5563,7 +5563,7 @@ class PubServer(BaseHTTPRequestHandler):
if fields.get('editedTOS'):
tos_str = fields['editedTOS']
if not dangerous_markup(tos_str,
allow_local_network_access):
allow_local_network_access, []):
try:
with open(tos_filename, 'w+',
encoding='utf-8') as tosfile:

View File

@ -1360,7 +1360,7 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
if summary != valid_content_warning(summary):
print('WARN: invalid content warning ' + summary)
return False
if dangerous_markup(summary, allow_local_network_access):
if dangerous_markup(summary, allow_local_network_access, []):
if message_json['object'].get('id'):
print('REJECT ARBITRARY HTML: ' + message_json['object']['id'])
print('REJECT ARBITRARY HTML: bad string in summary - ' +
@ -1384,7 +1384,7 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
return False
content_str = get_base_content_from_post(message_json, system_language)
if dangerous_markup(content_str, allow_local_network_access):
if dangerous_markup(content_str, allow_local_network_access, ['pre']):
if message_json['object'].get('id'):
print('REJECT ARBITRARY HTML: ' + message_json['object']['id'])
if debug:

View File

@ -601,8 +601,8 @@ def _convert_rss_to_activitypub(base_dir: str, http_prefix: str,
rss_title = _remove_control_characters(item[0])
url = item[1]
if dangerous_markup(url, allow_local_network_access) or \
dangerous_markup(rss_title, allow_local_network_access):
if dangerous_markup(url, allow_local_network_access, []) or \
dangerous_markup(rss_title, allow_local_network_access, []):
continue
rss_description = ''

View File

@ -303,7 +303,7 @@ def post_message_to_outbox(session, translate: {},
system_language, translate,
'nowplaying', 'NowPlaying')
if dangerous_markup(content_str, allow_local_network_access):
if dangerous_markup(content_str, allow_local_network_access, []):
print('POST to outbox contains dangerous markup: ' +
str(message_json))
return False

View File

@ -413,7 +413,7 @@ def get_person_box(signing_priv_key_pem: str, origin_domain: str,
display_name = None
if person_json.get('name'):
display_name = person_json['name']
if dangerous_markup(person_json['name'], False):
if dangerous_markup(person_json['name'], False, []):
display_name = '*ADVERSARY*'
elif is_filtered(base_dir,
nickname, domain,
@ -5549,7 +5549,7 @@ def download_announce(session, base_dir: str, http_prefix: str,
if announced_json['contentMap'].get(system_language):
content_str = announced_json['contentMap'][system_language]
using_content_map = True
if dangerous_markup(content_str, allow_local_network_access):
if dangerous_markup(content_str, allow_local_network_access, []):
print('WARN: announced post contains dangerous markup ' +
str(announced_json))
_reject_announce(announce_filename,

View File

@ -229,6 +229,7 @@ def dangerous_question(question_json: {},
question_options = question_json['object']['oneOf']
for option in question_options:
if option.get('name'):
if dangerous_markup(option['name'], allow_local_network_access):
if dangerous_markup(option['name'],
allow_local_network_access, []):
return True
return False

View File

@ -55,6 +55,7 @@ from follow import clear_followers
from follow import send_follow_request_via_server
from follow import send_unfollow_request_via_server
from siteactive import site_is_active
from utils import remove_markup_tag
from utils import remove_style_within_html
from utils import html_tag_has_closing
from utils import remove_inverted_text
@ -4189,75 +4190,75 @@ def _test_danger_markup():
print('test_dangerous_markup')
allow_local_network_access = False
content = '<p>This is a valid message</p>'
assert not dangerous_markup(content, allow_local_network_access)
assert not dangerous_markup(content, allow_local_network_access, [])
content = 'This is a valid message without markup'
assert not dangerous_markup(content, allow_local_network_access)
assert not dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But wait... ' + \
'<script>document.getElementById("concentrated")' + \
'.innerHTML = "evil";</script></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But wait... ' + \
'&lt;script&gt;document.getElementById("concentrated")' + \
'.innerHTML = "evil";&lt;/script&gt;</p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This html contains more than you expected... ' + \
'<script language="javascript">document.getElementById("abc")' + \
'.innerHTML = "def";</script></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This html contains more than you expected... ' + \
'<?php $server_output = curl_exec($ch); ?></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But wait... ' + \
'<script src="https://evilsite/payload.js" /></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But it contains ' + \
'spyware. <amp-analytics type="gtag" ' + \
'data-credentials="include"></amp-analytics></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But it contains ' + \
'<a href="something.googleapis.com/anotherthing">spyware.</a></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message embeds an evil frame.' + \
'<iframe src="somesite"></iframe></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message tries to obfuscate an evil frame.' + \
'< iframe src = "somesite"></ iframe ></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message is not necessarily evil, but annoying.' + \
'<hr><br><br><br><br><br><br><br><hr><hr></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message contans a ' + \
'<a href="https://validsite/index.html">valid link.</a></p>'
assert not dangerous_markup(content, allow_local_network_access)
assert not dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message contans a ' + \
'<a href="https://validsite/iframe.html">' + \
'valid link having invalid but harmless name.</a></p>'
assert not dangerous_markup(content, allow_local_network_access)
assert not dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message which <a href="127.0.0.1:8736">' + \
'tries to access the local network</a></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message which <a href="http://192.168.5.10:7235">' + \
'tries to access the local network</a></p>'
assert dangerous_markup(content, allow_local_network_access)
assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>127.0.0.1 This message which does not access ' + \
'the local network</a></p>'
assert not dangerous_markup(content, allow_local_network_access)
assert not dangerous_markup(content, allow_local_network_access, [])
def _run_html_replace_quote_marks():
@ -7983,6 +7984,35 @@ def _test_featured_tags() -> None:
assert result == featured_tags
def _test_remove_tag() -> None:
    """Checks that remove_markup_tag strips pre tags from markup
    while leaving the text around and inside of them in place
    """
    print('remove_tag')

    def _check(markup: str, expected: str, report: bool) -> None:
        # Runs one case, optionally showing the mismatch before asserting
        stripped = remove_markup_tag(markup, 'pre')
        if report and stripped != expected:
            print('expected: ' + expected)
            print('result: ' + stripped)
        assert stripped == expected

    # markup which does not contain the tag is returned as it was
    plain_text = 'This is a test'
    _check(plain_text, plain_text, False)

    # a tag wrapping the whole of the text
    _check('<pre>This is a test</pre>', 'This is a test', True)

    # text appearing before the tag is kept
    _check('Previous <pre>this is a test</pre>',
           'Previous this is a test', True)

    # more than one tag, with other markup in between
    _check('<pre>This is a test</pre><br>' +
           'something<br><pre>again</pre>',
           'This is a test<br>something<br>again', True)
def run_all_tests():
base_dir = os.getcwd()
print('Running tests...')
@ -8000,6 +8030,7 @@ def run_all_tests():
_test_checkbox_names()
_test_thread_functions()
_test_functions()
_test_remove_tag()
_test_featured_tags()
_test_xor_hashes()
_test_convert_markdown()

View File

@ -192,6 +192,38 @@ def has_object_dict(post_json_object: {}) -> bool:
return False
def remove_markup_tag(html: str, tag: str) -> str:
    """Remove the given tag from the given html markup

    Opening markers ('<tag ...>') and closing markers ('</tag>') are
    deleted and the text between them is kept. Matching is done on the
    '<tag' prefix, so any attributes are removed along with the tag,
    and a tag whose name merely begins with the given one is removed too.

    html: the markup to be cleaned, such as the content of a post
    tag: the tag name without angle brackets, for example 'pre'

    Returns the markup with the tag removed. If there is no opening
    marker then the markup is returned unchanged. Markup with a marker
    which is never closed by '>' does not raise an exception: the marker
    is dropped and the text after it is kept.
    """
    def _strip_markers(markup: str, marker: str) -> str:
        # Deletes every marker along with whatever follows it,
        # up to and including the next '>'
        pieces = markup.split(marker)
        kept = [pieces[0]]
        for piece in pieces[1:]:
            if '>' in piece:
                kept.append(piece.split('>', 1)[1])
            else:
                # Unterminated marker. Indexing the split result here
                # would raise IndexError, so keep the remaining text
                kept.append(piece)
        return ''.join(kept)

    if '<' + tag not in html:
        return html

    without_opening = _strip_markers(html, '<' + tag)
    return _strip_markers(without_opening, '</' + tag)
def get_content_from_post(post_json_object: {}, system_language: str,
languages_understood: [],
content_type: str = "content") -> str:
@ -213,6 +245,7 @@ def get_content_from_post(post_json_object: {}, system_language: str,
sys_lang = this_post_json[map_dict][system_language]
if isinstance(sys_lang, str):
content = this_post_json[map_dict][system_language]
content = remove_markup_tag(content, 'pre')
return standardize_text(content)
else:
# is there a contentMap/summaryMap entry for one of
@ -220,10 +253,12 @@ def get_content_from_post(post_json_object: {}, system_language: str,
for lang in languages_understood:
if this_post_json[map_dict].get(lang):
content = this_post_json[map_dict][lang]
content = remove_markup_tag(content, 'pre')
return standardize_text(content)
else:
if isinstance(this_post_json[content_type], str):
content = this_post_json[content_type]
content = remove_markup_tag(content, 'pre')
return standardize_text(content)
@ -1182,7 +1217,8 @@ def html_tag_has_closing(tag_name: str, content: str) -> bool:
return True
def dangerous_markup(content: str, allow_local_network_access: bool) -> bool:
def dangerous_markup(content: str, allow_local_network_access: bool,
allow_tags: []) -> bool:
"""Returns true if the given content contains dangerous html markup
"""
separators = [['<', '>'], ['&lt;', '&gt;']]
@ -1198,8 +1234,11 @@ def dangerous_markup(content: str, allow_local_network_access: bool) -> bool:
invalid_strings = [
'script', 'noscript', 'canvas', 'style', 'abbr', 'input',
'frame', 'iframe', 'html', 'body', 'hr', 'allow-popups',
'allow-scripts', 'amp-', '?php'
'allow-scripts', 'amp-', '?php', 'pre'
]
for allowed in allow_tags:
if allowed in invalid_strings:
invalid_strings.remove(allowed)
return _is_dangerous_string_tag(content, allow_local_network_access,
separators, invalid_strings)
@ -1236,7 +1275,7 @@ def get_display_name(base_dir: str, actor: str, person_cache: {}) -> str:
if actor_json.get('name'):
name_found = actor_json['name']
if name_found:
if dangerous_markup(name_found, False):
if dangerous_markup(name_found, False, []):
name_found = "*ADVERSARY*"
return standardize_text(name_found)
@ -4333,19 +4372,25 @@ def harmless_markup(post_json_object: {}) -> None:
for field_name in ('content', 'summary'):
if post_json_object['object'].get(field_name):
if dangerous_markup(post_json_object['object'][field_name],
False):
False, ['pre']):
post_json_object['object'][field_name] = \
remove_html(post_json_object['object'][field_name])
post_json_object['object'][field_name] = \
remove_markup_tag(post_json_object['object'][field_name],
'pre')
map_name = field_name + 'Map'
if post_json_object['object'].get(map_name):
map_dict = post_json_object['object'][map_name].items()
for lang, content in map_dict:
if not isinstance(content, str):
continue
if dangerous_markup(content, False):
if dangerous_markup(content, False, ['pre']):
content = remove_html(content)
post_json_object['object'][map_name][lang] = \
content
content = post_json_object['object'][map_name][lang]
post_json_object['object'][map_name][lang] = \
remove_markup_tag(content, 'pre')
def ap_proxy_type(json_object: {}) -> str:

View File

@ -92,7 +92,7 @@ def _get_help_for_timeline(base_dir: str, box_name: str) -> str:
instance_title = 'Epicyon'
with open(help_filename, 'r', encoding='utf-8') as help_file:
help_text = help_file.read()
if dangerous_markup(help_text, False):
if dangerous_markup(help_text, False, []):
return ''
help_text = help_text.replace('INSTANCE', instance_title)
return '<div class="container">\n' + \

View File

@ -1269,7 +1269,7 @@ def get_post_attachments_as_html(base_dir: str,
continue
media_license = ''
if attach.get('schema:license'):
if not dangerous_markup(attach['schema:license'], False):
if not dangerous_markup(attach['schema:license'], False, []):
if not is_filtered(base_dir, nickname, domain,
attach['schema:license'],
system_language):
@ -1279,7 +1279,7 @@ def get_post_attachments_as_html(base_dir: str,
else:
media_license = attach['schema:license']
elif attach.get('license'):
if not dangerous_markup(attach['license'], False):
if not dangerous_markup(attach['license'], False, []):
if not is_filtered(base_dir, nickname, domain,
attach['license'],
system_language):
@ -1291,7 +1291,7 @@ def get_post_attachments_as_html(base_dir: str,
media_creator = ''
if attach.get('schema:creator'):
if len(attach['schema:creator']) < 120:
if not dangerous_markup(attach['schema:creator'], False):
if not dangerous_markup(attach['schema:creator'], False, []):
if not is_filtered(base_dir, nickname, domain,
attach['schema:creator'],
system_language):
@ -1300,7 +1300,7 @@ def get_post_attachments_as_html(base_dir: str,
if isinstance(attach['attribution'], list):
if len(attach['attribution']) > 0:
attrib_str = attach['attribution'][0]
if not dangerous_markup(attrib_str, False):
if not dangerous_markup(attrib_str, False, []):
if not is_filtered(base_dir, nickname, domain,
attrib_str, system_language):
media_creator = attrib_str