Allow pre tag through dangerous markup filter in some cases, then remove it when rendering

merge-requests/30/head
Bob Mottram 2023-05-18 12:15:18 +01:00
parent 8efb5bedd4
commit 4caa930f67
11 changed files with 115 additions and 38 deletions

View File

@ -267,7 +267,7 @@ def dangerous_css(filename: str, allow_local_network_access: bool) -> bool:
# an attacker can include html inside of the css # an attacker can include html inside of the css
# file as a comment and this may then be run from the html # file as a comment and this may then be run from the html
if dangerous_markup(content, allow_local_network_access): if dangerous_markup(content, allow_local_network_access, []):
return True return True
return False return False

View File

@ -5544,7 +5544,7 @@ class PubServer(BaseHTTPRequestHandler):
if fields.get('editedAbout'): if fields.get('editedAbout'):
about_str = fields['editedAbout'] about_str = fields['editedAbout']
if not dangerous_markup(about_str, if not dangerous_markup(about_str,
allow_local_network_access): allow_local_network_access, []):
try: try:
with open(about_filename, 'w+', with open(about_filename, 'w+',
encoding='utf-8') as aboutfile: encoding='utf-8') as aboutfile:
@ -5563,7 +5563,7 @@ class PubServer(BaseHTTPRequestHandler):
if fields.get('editedTOS'): if fields.get('editedTOS'):
tos_str = fields['editedTOS'] tos_str = fields['editedTOS']
if not dangerous_markup(tos_str, if not dangerous_markup(tos_str,
allow_local_network_access): allow_local_network_access, []):
try: try:
with open(tos_filename, 'w+', with open(tos_filename, 'w+',
encoding='utf-8') as tosfile: encoding='utf-8') as tosfile:

View File

@ -1360,7 +1360,7 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
if summary != valid_content_warning(summary): if summary != valid_content_warning(summary):
print('WARN: invalid content warning ' + summary) print('WARN: invalid content warning ' + summary)
return False return False
if dangerous_markup(summary, allow_local_network_access): if dangerous_markup(summary, allow_local_network_access, []):
if message_json['object'].get('id'): if message_json['object'].get('id'):
print('REJECT ARBITRARY HTML: ' + message_json['object']['id']) print('REJECT ARBITRARY HTML: ' + message_json['object']['id'])
print('REJECT ARBITRARY HTML: bad string in summary - ' + print('REJECT ARBITRARY HTML: bad string in summary - ' +
@ -1384,7 +1384,7 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
return False return False
content_str = get_base_content_from_post(message_json, system_language) content_str = get_base_content_from_post(message_json, system_language)
if dangerous_markup(content_str, allow_local_network_access): if dangerous_markup(content_str, allow_local_network_access, ['pre']):
if message_json['object'].get('id'): if message_json['object'].get('id'):
print('REJECT ARBITRARY HTML: ' + message_json['object']['id']) print('REJECT ARBITRARY HTML: ' + message_json['object']['id'])
if debug: if debug:

View File

@ -601,8 +601,8 @@ def _convert_rss_to_activitypub(base_dir: str, http_prefix: str,
rss_title = _remove_control_characters(item[0]) rss_title = _remove_control_characters(item[0])
url = item[1] url = item[1]
if dangerous_markup(url, allow_local_network_access) or \ if dangerous_markup(url, allow_local_network_access, []) or \
dangerous_markup(rss_title, allow_local_network_access): dangerous_markup(rss_title, allow_local_network_access, []):
continue continue
rss_description = '' rss_description = ''

View File

@ -303,7 +303,7 @@ def post_message_to_outbox(session, translate: {},
system_language, translate, system_language, translate,
'nowplaying', 'NowPlaying') 'nowplaying', 'NowPlaying')
if dangerous_markup(content_str, allow_local_network_access): if dangerous_markup(content_str, allow_local_network_access, []):
print('POST to outbox contains dangerous markup: ' + print('POST to outbox contains dangerous markup: ' +
str(message_json)) str(message_json))
return False return False

View File

@ -413,7 +413,7 @@ def get_person_box(signing_priv_key_pem: str, origin_domain: str,
display_name = None display_name = None
if person_json.get('name'): if person_json.get('name'):
display_name = person_json['name'] display_name = person_json['name']
if dangerous_markup(person_json['name'], False): if dangerous_markup(person_json['name'], False, []):
display_name = '*ADVERSARY*' display_name = '*ADVERSARY*'
elif is_filtered(base_dir, elif is_filtered(base_dir,
nickname, domain, nickname, domain,
@ -5549,7 +5549,7 @@ def download_announce(session, base_dir: str, http_prefix: str,
if announced_json['contentMap'].get(system_language): if announced_json['contentMap'].get(system_language):
content_str = announced_json['contentMap'][system_language] content_str = announced_json['contentMap'][system_language]
using_content_map = True using_content_map = True
if dangerous_markup(content_str, allow_local_network_access): if dangerous_markup(content_str, allow_local_network_access, []):
print('WARN: announced post contains dangerous markup ' + print('WARN: announced post contains dangerous markup ' +
str(announced_json)) str(announced_json))
_reject_announce(announce_filename, _reject_announce(announce_filename,

View File

@ -229,6 +229,7 @@ def dangerous_question(question_json: {},
question_options = question_json['object']['oneOf'] question_options = question_json['object']['oneOf']
for option in question_options: for option in question_options:
if option.get('name'): if option.get('name'):
if dangerous_markup(option['name'], allow_local_network_access): if dangerous_markup(option['name'],
allow_local_network_access, []):
return True return True
return False return False

View File

@ -55,6 +55,7 @@ from follow import clear_followers
from follow import send_follow_request_via_server from follow import send_follow_request_via_server
from follow import send_unfollow_request_via_server from follow import send_unfollow_request_via_server
from siteactive import site_is_active from siteactive import site_is_active
from utils import remove_markup_tag
from utils import remove_style_within_html from utils import remove_style_within_html
from utils import html_tag_has_closing from utils import html_tag_has_closing
from utils import remove_inverted_text from utils import remove_inverted_text
@ -4189,75 +4190,75 @@ def _test_danger_markup():
print('test_dangerous_markup') print('test_dangerous_markup')
allow_local_network_access = False allow_local_network_access = False
content = '<p>This is a valid message</p>' content = '<p>This is a valid message</p>'
assert not dangerous_markup(content, allow_local_network_access) assert not dangerous_markup(content, allow_local_network_access, [])
content = 'This is a valid message without markup' content = 'This is a valid message without markup'
assert not dangerous_markup(content, allow_local_network_access) assert not dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But wait... ' + \ content = '<p>This is a valid-looking message. But wait... ' + \
'<script>document.getElementById("concentrated")' + \ '<script>document.getElementById("concentrated")' + \
'.innerHTML = "evil";</script></p>' '.innerHTML = "evil";</script></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But wait... ' + \ content = '<p>This is a valid-looking message. But wait... ' + \
'&lt;script&gt;document.getElementById("concentrated")' + \ '&lt;script&gt;document.getElementById("concentrated")' + \
'.innerHTML = "evil";&lt;/script&gt;</p>' '.innerHTML = "evil";&lt;/script&gt;</p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This html contains more than you expected... ' + \ content = '<p>This html contains more than you expected... ' + \
'<script language="javascript">document.getElementById("abc")' + \ '<script language="javascript">document.getElementById("abc")' + \
'.innerHTML = "def";</script></p>' '.innerHTML = "def";</script></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This html contains more than you expected... ' + \ content = '<p>This html contains more than you expected... ' + \
'<?php $server_output = curl_exec($ch); ?></p>' '<?php $server_output = curl_exec($ch); ?></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But wait... ' + \ content = '<p>This is a valid-looking message. But wait... ' + \
'<script src="https://evilsite/payload.js" /></p>' '<script src="https://evilsite/payload.js" /></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But it contains ' + \ content = '<p>This is a valid-looking message. But it contains ' + \
'spyware. <amp-analytics type="gtag" ' + \ 'spyware. <amp-analytics type="gtag" ' + \
'data-credentials="include"></amp-analytics></p>' 'data-credentials="include"></amp-analytics></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This is a valid-looking message. But it contains ' + \ content = '<p>This is a valid-looking message. But it contains ' + \
'<a href="something.googleapis.com/anotherthing">spyware.</a></p>' '<a href="something.googleapis.com/anotherthing">spyware.</a></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message embeds an evil frame.' + \ content = '<p>This message embeds an evil frame.' + \
'<iframe src="somesite"></iframe></p>' '<iframe src="somesite"></iframe></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message tries to obfuscate an evil frame.' + \ content = '<p>This message tries to obfuscate an evil frame.' + \
'< iframe src = "somesite"></ iframe ></p>' '< iframe src = "somesite"></ iframe ></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message is not necessarily evil, but annoying.' + \ content = '<p>This message is not necessarily evil, but annoying.' + \
'<hr><br><br><br><br><br><br><br><hr><hr></p>' '<hr><br><br><br><br><br><br><br><hr><hr></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message contans a ' + \ content = '<p>This message contans a ' + \
'<a href="https://validsite/index.html">valid link.</a></p>' '<a href="https://validsite/index.html">valid link.</a></p>'
assert not dangerous_markup(content, allow_local_network_access) assert not dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message contans a ' + \ content = '<p>This message contans a ' + \
'<a href="https://validsite/iframe.html">' + \ '<a href="https://validsite/iframe.html">' + \
'valid link having invalid but harmless name.</a></p>' 'valid link having invalid but harmless name.</a></p>'
assert not dangerous_markup(content, allow_local_network_access) assert not dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message which <a href="127.0.0.1:8736">' + \ content = '<p>This message which <a href="127.0.0.1:8736">' + \
'tries to access the local network</a></p>' 'tries to access the local network</a></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>This message which <a href="http://192.168.5.10:7235">' + \ content = '<p>This message which <a href="http://192.168.5.10:7235">' + \
'tries to access the local network</a></p>' 'tries to access the local network</a></p>'
assert dangerous_markup(content, allow_local_network_access) assert dangerous_markup(content, allow_local_network_access, [])
content = '<p>127.0.0.1 This message which does not access ' + \ content = '<p>127.0.0.1 This message which does not access ' + \
'the local network</a></p>' 'the local network</a></p>'
assert not dangerous_markup(content, allow_local_network_access) assert not dangerous_markup(content, allow_local_network_access, [])
def _run_html_replace_quote_marks(): def _run_html_replace_quote_marks():
@ -7983,6 +7984,35 @@ def _test_featured_tags() -> None:
assert result == featured_tags assert result == featured_tags
def _test_remove_tag() -> None:
    """Checks that remove_markup_tag strips pre tags while keeping
    the text which they enclose
    """
    print('remove_tag')

    # markup without the tag is returned untouched
    plain_text = 'This is a test'
    assert remove_markup_tag(plain_text, 'pre') == plain_text

    # pairs of (input markup, expected output)
    cases = (
        ('<pre>This is a test</pre>',
         'This is a test'),
        ('Previous <pre>this is a test</pre>',
         'Previous this is a test'),
        ('<pre>This is a test</pre><br>' +
         'something<br><pre>again</pre>',
         'This is a test<br>something<br>again')
    )
    for test_html, expected in cases:
        result = remove_markup_tag(test_html, 'pre')
        if result != expected:
            # show the mismatch before the assertion fires
            print('expected: ' + expected)
            print('result: ' + result)
        assert result == expected
def run_all_tests(): def run_all_tests():
base_dir = os.getcwd() base_dir = os.getcwd()
print('Running tests...') print('Running tests...')
@ -8000,6 +8030,7 @@ def run_all_tests():
_test_checkbox_names() _test_checkbox_names()
_test_thread_functions() _test_thread_functions()
_test_functions() _test_functions()
_test_remove_tag()
_test_featured_tags() _test_featured_tags()
_test_xor_hashes() _test_xor_hashes()
_test_convert_markdown() _test_convert_markdown()

View File

@ -192,6 +192,38 @@ def has_object_dict(post_json_object: {}) -> bool:
return False return False
def remove_markup_tag(html: str, tag: str) -> str:
    """Remove the given tag from the given html markup, keeping the
    text which the tag enclosed

    Both the opening form of the tag (including any attributes) and
    its closing form are deleted. Matching is done on the '<' + tag
    prefix, so a longer tag name beginning with the same characters
    is removed as well. If the markup contains no opening tag then it
    is returned unchanged.
    """
    if '<' + tag not in html:
        return html

    # strip the opening tags first, then the closing tags
    for marker in ('<' + tag, '</' + tag):
        sections = html.split(marker)
        # text before the first occurrence is always kept as it is
        kept = [sections[0]]
        for text in sections[1:]:
            # Drop the rest of the tag, up to and including its '>'.
            # An unterminated tag has no '>', in which case the split
            # returns a single item; taking the last item keeps the
            # trailing text rather than raising IndexError on
            # malformed content.
            kept.append(text.split('>', 1)[-1])
        html = ''.join(kept)
    return html
def get_content_from_post(post_json_object: {}, system_language: str, def get_content_from_post(post_json_object: {}, system_language: str,
languages_understood: [], languages_understood: [],
content_type: str = "content") -> str: content_type: str = "content") -> str:
@ -213,6 +245,7 @@ def get_content_from_post(post_json_object: {}, system_language: str,
sys_lang = this_post_json[map_dict][system_language] sys_lang = this_post_json[map_dict][system_language]
if isinstance(sys_lang, str): if isinstance(sys_lang, str):
content = this_post_json[map_dict][system_language] content = this_post_json[map_dict][system_language]
content = remove_markup_tag(content, 'pre')
return standardize_text(content) return standardize_text(content)
else: else:
# is there a contentMap/summaryMap entry for one of # is there a contentMap/summaryMap entry for one of
@ -220,10 +253,12 @@ def get_content_from_post(post_json_object: {}, system_language: str,
for lang in languages_understood: for lang in languages_understood:
if this_post_json[map_dict].get(lang): if this_post_json[map_dict].get(lang):
content = this_post_json[map_dict][lang] content = this_post_json[map_dict][lang]
content = remove_markup_tag(content, 'pre')
return standardize_text(content) return standardize_text(content)
else: else:
if isinstance(this_post_json[content_type], str): if isinstance(this_post_json[content_type], str):
content = this_post_json[content_type] content = this_post_json[content_type]
content = remove_markup_tag(content, 'pre')
return standardize_text(content) return standardize_text(content)
@ -1182,7 +1217,8 @@ def html_tag_has_closing(tag_name: str, content: str) -> bool:
return True return True
def dangerous_markup(content: str, allow_local_network_access: bool) -> bool: def dangerous_markup(content: str, allow_local_network_access: bool,
allow_tags: []) -> bool:
"""Returns true if the given content contains dangerous html markup """Returns true if the given content contains dangerous html markup
""" """
separators = [['<', '>'], ['&lt;', '&gt;']] separators = [['<', '>'], ['&lt;', '&gt;']]
@ -1198,8 +1234,11 @@ def dangerous_markup(content: str, allow_local_network_access: bool) -> bool:
invalid_strings = [ invalid_strings = [
'script', 'noscript', 'canvas', 'style', 'abbr', 'input', 'script', 'noscript', 'canvas', 'style', 'abbr', 'input',
'frame', 'iframe', 'html', 'body', 'hr', 'allow-popups', 'frame', 'iframe', 'html', 'body', 'hr', 'allow-popups',
'allow-scripts', 'amp-', '?php' 'allow-scripts', 'amp-', '?php', 'pre'
] ]
for allowed in allow_tags:
if allowed in invalid_strings:
invalid_strings.remove(allowed)
return _is_dangerous_string_tag(content, allow_local_network_access, return _is_dangerous_string_tag(content, allow_local_network_access,
separators, invalid_strings) separators, invalid_strings)
@ -1236,7 +1275,7 @@ def get_display_name(base_dir: str, actor: str, person_cache: {}) -> str:
if actor_json.get('name'): if actor_json.get('name'):
name_found = actor_json['name'] name_found = actor_json['name']
if name_found: if name_found:
if dangerous_markup(name_found, False): if dangerous_markup(name_found, False, []):
name_found = "*ADVERSARY*" name_found = "*ADVERSARY*"
return standardize_text(name_found) return standardize_text(name_found)
@ -4333,19 +4372,25 @@ def harmless_markup(post_json_object: {}) -> None:
for field_name in ('content', 'summary'): for field_name in ('content', 'summary'):
if post_json_object['object'].get(field_name): if post_json_object['object'].get(field_name):
if dangerous_markup(post_json_object['object'][field_name], if dangerous_markup(post_json_object['object'][field_name],
False): False, ['pre']):
post_json_object['object'][field_name] = \ post_json_object['object'][field_name] = \
remove_html(post_json_object['object'][field_name]) remove_html(post_json_object['object'][field_name])
post_json_object['object'][field_name] = \
remove_markup_tag(post_json_object['object'][field_name],
'pre')
map_name = field_name + 'Map' map_name = field_name + 'Map'
if post_json_object['object'].get(map_name): if post_json_object['object'].get(map_name):
map_dict = post_json_object['object'][map_name].items() map_dict = post_json_object['object'][map_name].items()
for lang, content in map_dict: for lang, content in map_dict:
if not isinstance(content, str): if not isinstance(content, str):
continue continue
if dangerous_markup(content, False): if dangerous_markup(content, False, ['pre']):
content = remove_html(content) content = remove_html(content)
post_json_object['object'][map_name][lang] = \ post_json_object['object'][map_name][lang] = \
content content
content = post_json_object['object'][map_name][lang]
post_json_object['object'][map_name][lang] = \
remove_markup_tag(content, 'pre')
def ap_proxy_type(json_object: {}) -> str: def ap_proxy_type(json_object: {}) -> str:

View File

@ -92,7 +92,7 @@ def _get_help_for_timeline(base_dir: str, box_name: str) -> str:
instance_title = 'Epicyon' instance_title = 'Epicyon'
with open(help_filename, 'r', encoding='utf-8') as help_file: with open(help_filename, 'r', encoding='utf-8') as help_file:
help_text = help_file.read() help_text = help_file.read()
if dangerous_markup(help_text, False): if dangerous_markup(help_text, False, []):
return '' return ''
help_text = help_text.replace('INSTANCE', instance_title) help_text = help_text.replace('INSTANCE', instance_title)
return '<div class="container">\n' + \ return '<div class="container">\n' + \

View File

@ -1269,7 +1269,7 @@ def get_post_attachments_as_html(base_dir: str,
continue continue
media_license = '' media_license = ''
if attach.get('schema:license'): if attach.get('schema:license'):
if not dangerous_markup(attach['schema:license'], False): if not dangerous_markup(attach['schema:license'], False, []):
if not is_filtered(base_dir, nickname, domain, if not is_filtered(base_dir, nickname, domain,
attach['schema:license'], attach['schema:license'],
system_language): system_language):
@ -1279,7 +1279,7 @@ def get_post_attachments_as_html(base_dir: str,
else: else:
media_license = attach['schema:license'] media_license = attach['schema:license']
elif attach.get('license'): elif attach.get('license'):
if not dangerous_markup(attach['license'], False): if not dangerous_markup(attach['license'], False, []):
if not is_filtered(base_dir, nickname, domain, if not is_filtered(base_dir, nickname, domain,
attach['license'], attach['license'],
system_language): system_language):
@ -1291,7 +1291,7 @@ def get_post_attachments_as_html(base_dir: str,
media_creator = '' media_creator = ''
if attach.get('schema:creator'): if attach.get('schema:creator'):
if len(attach['schema:creator']) < 120: if len(attach['schema:creator']) < 120:
if not dangerous_markup(attach['schema:creator'], False): if not dangerous_markup(attach['schema:creator'], False, []):
if not is_filtered(base_dir, nickname, domain, if not is_filtered(base_dir, nickname, domain,
attach['schema:creator'], attach['schema:creator'],
system_language): system_language):
@ -1300,7 +1300,7 @@ def get_post_attachments_as_html(base_dir: str,
if isinstance(attach['attribution'], list): if isinstance(attach['attribution'], list):
if len(attach['attribution']) > 0: if len(attach['attribution']) > 0:
attrib_str = attach['attribution'][0] attrib_str = attach['attribution'][0]
if not dangerous_markup(attrib_str, False): if not dangerous_markup(attrib_str, False, []):
if not is_filtered(base_dir, nickname, domain, if not is_filtered(base_dir, nickname, domain,
attrib_str, system_language): attrib_str, system_language):
media_creator = attrib_str media_creator = attrib_str