mirror of https://gitlab.com/bashrc2/epicyon
Unit test for safe html
parent
bdf1c77408
commit
9a0185ef3c
16
content.py
16
content.py
|
@ -486,6 +486,22 @@ def add_web_links(content: str) -> str:
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
def safe_web_text(arbitrary_html: str) -> str:
    """Turns arbitrary html into something safe.

    The markup is stripped with remove_html, a fixed set of spurious
    character sequences is deleted from what remains, and the resulting
    text is passed through add_web_links so that urls become links again.
    So if the arbitrary html contains attack scripts those will be removed

    Returns an empty string if nothing remains once the markup is removed,
    otherwise the value returned by add_web_links.
    """
    # first remove the markup, so that we have something safe
    safe_text = remove_html(arbitrary_html)
    if not safe_text:
        return ''
    # remove any spurious characters found in podcast descriptions.
    # EF BF BD is the utf-8 encoding of U+FFFD, the unicode replacement
    # character. The '<EFBFBD>' spelling is kept for backward compatibility,
    # and the real character is listed too so that it actually gets removed
    remove_chars = ('Œ', 'â€', 'ğŸ', '<EFBFBD>', '\ufffd', ']]')
    for remchar in remove_chars:
        safe_text = safe_text.replace(remchar, '')
    # recreate any url links safely
    return add_web_links(safe_text)
|
||||||
|
|
||||||
|
|
||||||
def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
|
def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
|
||||||
replace_hashtags: {}, post_hashtags: {}) -> bool:
|
replace_hashtags: {}, post_hashtags: {}) -> bool:
|
||||||
"""Detects hashtags and adds them to the replacements dict
|
"""Detects hashtags and adds them to the replacements dict
|
||||||
|
|
26
tests.py
26
tests.py
|
@ -128,6 +128,7 @@ from inbox import json_post_allows_comments
|
||||||
from inbox import valid_inbox
|
from inbox import valid_inbox
|
||||||
from inbox import valid_inbox_filenames
|
from inbox import valid_inbox_filenames
|
||||||
from categories import guess_hashtag_category
|
from categories import guess_hashtag_category
|
||||||
|
from content import safe_web_text
|
||||||
from content import words_similarity
|
from content import words_similarity
|
||||||
from content import get_price_from_string
|
from content import get_price_from_string
|
||||||
from content import limit_repeated_words
|
from content import limit_repeated_words
|
||||||
|
@ -6488,6 +6489,30 @@ def _test_get_link_from_rss_item() -> None:
|
||||||
assert link.startswith('https://test.link/creativecommons')
|
assert link.startswith('https://test.link/creativecommons')
|
||||||
|
|
||||||
|
|
||||||
|
def _test_safe_webtext() -> None:
    """Checks the text which safe_web_text produces from html.

    Two cases are exercised: a paragraph containing a bare url, whose
    result must contain the start of an anchor for that url and no <p>
    tags, and a string containing a <script> element, whose result must
    equal the same text with the script tags taken out.
    """
    print('test_safe_webtext')

    def _show_mismatch(original: str, expected: str, actual: str) -> None:
        # diagnostics shown just before an assertion which is going to fail
        print('Original html: ' + original)
        print('Expected html: ' + expected)
        print('Actual html: ' + actual)

    # paragraph markup is dropped and the url is turned into a link
    html_with_link = \
        '<p>Some text including a link https://some.site/some-path</p>'
    link_fragment = \
        'Some text including a link <a href="https://some.site/some-path"'
    result = safe_web_text(html_with_link)
    if link_fragment not in result:
        _show_mismatch(html_with_link, link_fragment, result)
    assert link_fragment in result
    for para_tag in ('<p>', '</p>'):
        assert para_tag not in result

    # script tags are dropped, leaving only the text between them
    html_with_script = 'Some text with <script>some script</script>'
    plain_text = 'Some text with some script'
    result = safe_web_text(html_with_script)
    if plain_text != result:
        _show_mismatch(html_with_script, plain_text, result)
    assert plain_text == result
|
||||||
|
|
||||||
|
|
||||||
def run_all_tests():
|
def run_all_tests():
|
||||||
base_dir = os.getcwd()
|
base_dir = os.getcwd()
|
||||||
print('Running tests...')
|
print('Running tests...')
|
||||||
|
@ -6504,6 +6529,7 @@ def run_all_tests():
|
||||||
'message_json', 'liked_post_json'])
|
'message_json', 'liked_post_json'])
|
||||||
_test_checkbox_names()
|
_test_checkbox_names()
|
||||||
_test_functions()
|
_test_functions()
|
||||||
|
_test_safe_webtext()
|
||||||
_test_get_link_from_rss_item()
|
_test_get_link_from_rss_item()
|
||||||
_test_xml_podcast_dict()
|
_test_xml_podcast_dict()
|
||||||
_test_get_actor_from_in_reply_to()
|
_test_get_actor_from_in_reply_to()
|
||||||
|
|
|
@ -14,7 +14,7 @@ from shutil import copyfile
|
||||||
from utils import get_config_param
|
from utils import get_config_param
|
||||||
from utils import remove_html
|
from utils import remove_html
|
||||||
from media import path_is_audio
|
from media import path_is_audio
|
||||||
from content import add_web_links
|
from content import safe_web_text
|
||||||
from webapp_utils import get_broken_link_substitute
|
from webapp_utils import get_broken_link_substitute
|
||||||
from webapp_utils import html_header_with_external_style
|
from webapp_utils import html_header_with_external_style
|
||||||
from webapp_utils import html_footer
|
from webapp_utils import html_footer
|
||||||
|
@ -210,15 +210,8 @@ def html_podcast_episode(css_cache: {}, translate: {},
|
||||||
if newswire_item[4]:
|
if newswire_item[4]:
|
||||||
podcast_description = \
|
podcast_description = \
|
||||||
html.unescape(urllib.parse.unquote_plus(newswire_item[4]))
|
html.unescape(urllib.parse.unquote_plus(newswire_item[4]))
|
||||||
# Why remove html? Potentially podcast descriptions could contain
|
podcast_description = safe_web_text(podcast_description)
|
||||||
# arbitrary html with attack scripts, etc
|
|
||||||
podcast_description = remove_html(podcast_description)
|
|
||||||
if podcast_description:
|
if podcast_description:
|
||||||
remove_chars = ('Œ', 'â€', 'ğŸ', '<EFBFBD>', ']]')
|
|
||||||
for remchar in remove_chars:
|
|
||||||
podcast_description = podcast_description.replace(remchar, '')
|
|
||||||
# recreate any url links safely
|
|
||||||
podcast_description = add_web_links(podcast_description)
|
|
||||||
podcast_str += '<p>' + podcast_description + '</p>\n'
|
podcast_str += '<p>' + podcast_description + '</p>\n'
|
||||||
|
|
||||||
# donate button
|
# donate button
|
||||||
|
|
Loading…
Reference in New Issue