From 786d53622bc3ea8a6032acbbb50ad1f43023fd99 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 09:17:33 +0000 Subject: [PATCH 01/16] Extra characters removal --- webapp_podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp_podcast.py b/webapp_podcast.py index b4d3ae48e..66031a2e0 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -211,7 +211,7 @@ def html_podcast_episode(css_cache: {}, translate: {}, html.unescape(urllib.parse.unquote_plus(newswire_item[4])) podcast_description = remove_html(podcast_description) if podcast_description: - remove_chars = ('Œ', 'â€', 'ğŸ', '�') + remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]') for remchar in remove_chars: podcast_description = podcast_description.replace(remchar, '') podcast_str += '

' + podcast_description + '

\n' From bdf1c77408adede77e615556271acffce4af5a26 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 09:25:41 +0000 Subject: [PATCH 02/16] Regenerate links within podcast descriptions --- webapp_podcast.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/webapp_podcast.py b/webapp_podcast.py index 66031a2e0..8b12520d6 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -14,6 +14,7 @@ from shutil import copyfile from utils import get_config_param from utils import remove_html from media import path_is_audio +from content import add_web_links from webapp_utils import get_broken_link_substitute from webapp_utils import html_header_with_external_style from webapp_utils import html_footer @@ -209,11 +210,15 @@ def html_podcast_episode(css_cache: {}, translate: {}, if newswire_item[4]: podcast_description = \ html.unescape(urllib.parse.unquote_plus(newswire_item[4])) + # Why remove html? Potentially podcast descriptions could contain + # arbitrary html with attack scripts, etc podcast_description = remove_html(podcast_description) if podcast_description: remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]') for remchar in remove_chars: podcast_description = podcast_description.replace(remchar, '') + # recreate any url links safely + podcast_description = add_web_links(podcast_description) podcast_str += '

' + podcast_description + '

\n' # donate button From 9a0185ef3ca98d94c12dfc4ddd5ac78b5ef4dcd6 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 10:20:37 +0000 Subject: [PATCH 03/16] Unit test for safe html --- content.py | 16 ++++++++++++++++ tests.py | 26 ++++++++++++++++++++++++++ webapp_podcast.py | 11 ++--------- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/content.py b/content.py index 2394e21a1..2135006b8 100644 --- a/content.py +++ b/content.py @@ -486,6 +486,22 @@ def add_web_links(content: str) -> str: return content +def safe_web_text(arbitrary_html: str) -> str: + """Turns arbitrary html into something safe. + So if the arbitrary html contains attack scripts those will be removed + """ + # first remove the markup, so that we have something safe + safe_text = remove_html(arbitrary_html) + if not safe_text: + return '' + # remove any spurious characters found in podcast descriptions + remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]') + for remchar in remove_chars: + safe_text = safe_text.replace(remchar, '') + # recreate any url links safely + return add_web_links(safe_text) + + def _add_hash_tags(word_str: str, http_prefix: str, domain: str, replace_hashtags: {}, post_hashtags: {}) -> bool: """Detects hashtags and adds them to the replacements dict diff --git a/tests.py b/tests.py index ea7349a6c..07baf487c 100644 --- a/tests.py +++ b/tests.py @@ -128,6 +128,7 @@ from inbox import json_post_allows_comments from inbox import valid_inbox from inbox import valid_inbox_filenames from categories import guess_hashtag_category +from content import safe_web_text from content import words_similarity from content import get_price_from_string from content import limit_repeated_words @@ -6488,6 +6489,30 @@ def _test_get_link_from_rss_item() -> None: assert link.startswith('https://test.link/creativecommons') +def _test_safe_webtext() -> None: + print('test_safe_webtext') + web_text = '

Some text including a link https://some.site/some-path

' + expected_text = 'Some text including a link ' + \ + '' not in safe_text + assert '

' not in safe_text + + web_text = 'Some text with ' + expected_text = 'Some text with some script' + safe_text = safe_web_text(web_text) + if expected_text != safe_text: + print('Original html: ' + web_text) + print('Expected html: ' + expected_text) + print('Actual html: ' + safe_text) + assert expected_text == safe_text + + def run_all_tests(): base_dir = os.getcwd() print('Running tests...') @@ -6504,6 +6529,7 @@ def run_all_tests(): 'message_json', 'liked_post_json']) _test_checkbox_names() _test_functions() + _test_safe_webtext() _test_get_link_from_rss_item() _test_xml_podcast_dict() _test_get_actor_from_in_reply_to() diff --git a/webapp_podcast.py b/webapp_podcast.py index 8b12520d6..435400772 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -14,7 +14,7 @@ from shutil import copyfile from utils import get_config_param from utils import remove_html from media import path_is_audio -from content import add_web_links +from content import safe_web_text from webapp_utils import get_broken_link_substitute from webapp_utils import html_header_with_external_style from webapp_utils import html_footer @@ -210,15 +210,8 @@ def html_podcast_episode(css_cache: {}, translate: {}, if newswire_item[4]: podcast_description = \ html.unescape(urllib.parse.unquote_plus(newswire_item[4])) - # Why remove html? Potentially podcast descriptions could contain - # arbitrary html with attack scripts, etc - podcast_description = remove_html(podcast_description) + podcast_description = safe_web_text(podcast_description) if podcast_description: - remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]') - for remchar in remove_chars: - podcast_description = podcast_description.replace(remchar, '') - # recreate any url links safely - podcast_description = add_web_links(podcast_description) podcast_str += '

' + podcast_description + '

\n' # donate button From c395261523bef0dba18f40359af5839e40736283 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 12:53:29 +0000 Subject: [PATCH 04/16] Snake case --- follow.py | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/follow.py b/follow.py index 70e7c0b71..d65636899 100644 --- a/follow.py +++ b/follow.py @@ -999,20 +999,20 @@ def send_follow_requestViaServer(base_dir: str, session, # get the actor inbox for the To handle origin_domain = from_domain - (inboxUrl, _, _, fromPersonId, sharedInbox, avatarUrl, - displayName, _) = get_person_box(signing_priv_key_pem, origin_domain, - base_dir, session, wf_request, - person_cache, - project_version, http_prefix, - from_nickname, - from_domain, post_to_box, 52025) + (inbox_url, _, _, from_person_id, _, _, + _, _) = get_person_box(signing_priv_key_pem, origin_domain, + base_dir, session, wf_request, + person_cache, + project_version, http_prefix, + from_nickname, + from_domain, post_to_box, 52025) - if not inboxUrl: + if not inbox_url: if debug: print('DEBUG: follow request no ' + post_to_box + ' was found for ' + handle) return 3 - if not fromPersonId: + if not from_person_id: if debug: print('DEBUG: follow request no actor was found for ' + handle) return 4 @@ -1026,10 +1026,10 @@ def send_follow_requestViaServer(base_dir: str, session, } post_result = \ post_json(http_prefix, from_domain_full, - session, new_follow_json, [], inboxUrl, headers, 3, True) + session, new_follow_json, [], inbox_url, headers, 3, True) if not post_result: if debug: - print('DEBUG: POST follow request failed for c2s to ' + inboxUrl) + print('DEBUG: POST follow request failed for c2s to ' + inbox_url) return 5 if debug: @@ -1095,22 +1095,22 @@ def send_unfollow_request_via_server(base_dir: str, session, # get the actor inbox for the To handle origin_domain = from_domain - (inboxUrl, pubKeyId, pubKey, fromPersonId, sharedInbox, avatarUrl, - displayName, _) = get_person_box(signing_priv_key_pem, - origin_domain, - base_dir, session, - wf_request, person_cache, - project_version, http_prefix, - from_nickname, - from_domain, post_to_box, - 76536) + (inbox_url, _, _, from_person_id, _, _, + _, _) = get_person_box(signing_priv_key_pem, + origin_domain, + base_dir, session, + wf_request, person_cache, + project_version, http_prefix, + from_nickname, + from_domain, post_to_box, + 76536) - if not inboxUrl: + if not inbox_url: if debug: print('DEBUG: unfollow no ' + post_to_box + ' was found for ' + handle) return 3 - if not fromPersonId: + if not from_person_id: if debug: print('DEBUG: unfollow no actor was found for ' + handle) return 4 @@ -1124,10 +1124,10 @@ def send_unfollow_request_via_server(base_dir: str, session, } post_result = \ post_json(http_prefix, from_domain_full, - session, unfollow_json, [], inboxUrl, headers, 3, True) + session, unfollow_json, [], inbox_url, headers, 3, True) if not post_result: if debug: - print('DEBUG: POST unfollow failed for c2s to ' + inboxUrl) + print('DEBUG: POST unfollow failed for c2s to ' + inbox_url) return 5 if debug: From 75a21345cc353240eedbccdfe87febc230073202 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 13:15:43 +0000 Subject: [PATCH 05/16] Lower case appears to be the standard --- newswire.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/newswire.py b/newswire.py index 425220c5b..717b706d5 100644 --- a/newswire.py +++ b/newswire.py @@ -229,8 +229,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str, # Include tags from podcast categories if podcast_properties: if podcast_properties.get('explicit'): - if '#NSFW' not in post_tags: - post_tags.append('#NSFW') + if '#nsfw' not in post_tags: + post_tags.append('#nsfw') post_tags += podcast_properties['categories'] From 6cceef2386d032eee613ed88eebd457ecca7f35b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 17:40:42 +0000 Subject: [PATCH 06/16] Podcast processing for youtube feeds --- newswire.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/newswire.py b/newswire.py index 717b706d5..0bd032607 100644 --- a/newswire.py +++ b/newswire.py @@ -501,7 +501,7 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}: # get the image for the podcast, if it exists podcast_episode_image = None - episode_image_tags = [' {}: episode_image = episode_image.split('"')[0] podcast_episode_image = episode_image break + elif 'url="' in episode_image: + episode_image = episode_image.split('url="')[1] + if '"' in episode_image: + episode_image = episode_image.split('"')[0] + podcast_episode_image = episode_image + break else: if '>' in episode_image: episode_image = episode_image.split('>')[1] @@ -1019,9 +1025,15 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, description = atom_item.split('')[1] description = description.split('')[0] description = remove_html(description) - link = atom_item.split('')[1] - link = link.split('')[0] - link = 'https://www.youtube.com/watch?v=' + link.strip() + + link, link_mime_type = get_link_from_rss_item(atom_item) + if not link: + link = atom_item.split('')[1] + link = link.split('')[0] + link = 'https://www.youtube.com/watch?v=' + link.strip() + if not link: + continue + pub_date = atom_item.split('')[1] pub_date = pub_date.split('')[0] @@ -1030,13 +1042,16 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, if _valid_feed_date(pub_date_str): post_filename = '' votes_status = [] + podcast_properties = xml_podcast_to_dict(atom_item, xml_str) + if podcast_properties: + podcast_properties['linkMimeType'] = link_mime_type _add_newswire_dict_entry(base_dir, domain, result, pub_date_str, title, link, votes_status, post_filename, description, moderated, mirrored, [], 32, session, debug, - None) + podcast_properties) post_ctr += 1 if post_ctr >= max_posts_per_source: break From e539d3afc09ab6129a159f33eec8a2ef2ba25d51 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 17:55:56 +0000 Subject: [PATCH 07/16] More precise obtaining of podcast image --- newswire.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/newswire.py b/newswire.py index 0bd032607..2aba075f3 100644 --- a/newswire.py +++ b/newswire.py @@ -510,6 +510,9 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}: item_str = xml_str episode_image = item_str.split(image_tag)[1] + if image_tag + ' ' in item_str and '>' in episode_image: + episode_image = episode_image.split('>')[0] + if 'href="' in episode_image: episode_image = episode_image.split('href="')[1] if '"' in episode_image: From c05b569ce69aadbf1d9f12d5abecab3fae81be09 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 18:05:29 +0000 Subject: [PATCH 08/16] Tidying --- newswire.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/newswire.py b/newswire.py index 2aba075f3..87ac339b3 100644 --- a/newswire.py +++ b/newswire.py @@ -441,7 +441,8 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}: """ if ' {}: episode_image = episode_image.split('"')[0] podcast_episode_image = episode_image break - else: - if '>' in episode_image: - episode_image = episode_image.split('>')[1] - if '<' in episode_image: - episode_image = episode_image.split('<')[0] - if '://' in episode_image and '.' in episode_image: - podcast_episode_image = episode_image - break + elif '>' in episode_image: + episode_image = episode_image.split('>')[1] + if '<' in episode_image: + episode_image = episode_image.split('<')[0] + if '://' in episode_image and '.' in episode_image: + podcast_episode_image = episode_image + break # get categories if they exist. These can be turned into hashtags podcast_categories = _get_podcast_categories(xml_item, xml_str) From e3a702efe6740a1ce09afb351ff00052276b81dd Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 18:48:43 +0000 Subject: [PATCH 09/16] Mime type for youtube videos --- newswire.py | 4 ++-- webapp_media.py | 20 ++++++++++---------- webapp_podcast.py | 9 ++++++++- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/newswire.py b/newswire.py index 87ac339b3..e0a27f8a9 100644 --- a/newswire.py +++ b/newswire.py @@ -1029,7 +1029,7 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, description = description.split('')[0] description = remove_html(description) - link, link_mime_type = get_link_from_rss_item(atom_item) + link, _ = get_link_from_rss_item(atom_item) if not link: link = atom_item.split('')[1] link = link.split('')[0] @@ -1047,7 +1047,7 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, votes_status = [] podcast_properties = xml_podcast_to_dict(atom_item, xml_str) if podcast_properties: - podcast_properties['linkMimeType'] = link_mime_type + podcast_properties['linkMimeType'] = 'video/youtube' _add_newswire_dict_entry(base_dir, domain, result, pub_date_str, title, link, diff --git a/webapp_media.py b/webapp_media.py index 3f019243e..03cb19faa 100644 --- a/webapp_media.py +++ b/webapp_media.py @@ -39,8 +39,8 @@ def _add_embedded_video_from_sites(translate: {}, content: str, url = content.split('>vimeo.com/')[1] if '<' in url: url = url.split('<')[0] - content = \ - content + "
\n\n" + elif 'video' in podcast_properties['linkMimeType']: video_mime_type = podcast_properties['linkMimeType'] video_msg = 'Your browser does not support the video element.' podcast_str += \ From 0bc49a476bd550c74f9fb6b54353f01f5fe82b37 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 19:05:26 +0000 Subject: [PATCH 10/16] Extra removal --- content.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content.py b/content.py index 2135006b8..40b380487 100644 --- a/content.py +++ b/content.py @@ -495,7 +495,7 @@ def safe_web_text(arbitrary_html: str) -> str: if not safe_text: return '' # remove any spurious characters found in podcast descriptions - remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]') + remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]', '__') for remchar in remove_chars: safe_text = safe_text.replace(remchar, '') # recreate any url links safely From 99bb7b867fbcdbbb3a0fbe8c148ee325dd8e0ee2 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 19:08:01 +0000 Subject: [PATCH 11/16] Embedded youtube video --- webapp_podcast.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/webapp_podcast.py b/webapp_podcast.py index 8987bde77..91d7fcfd1 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -190,11 +190,16 @@ def html_podcast_episode(css_cache: {}, translate: {}, translate['Your browser does not support the audio element.'] + \ '\n \n' elif podcast_properties.get('linkMimeType'): - if '/youtube' in podcast_properties['linkMimeType'] or \ - '/vimeo' in podcast_properties['linkMimeType']: + if '/youtube' in podcast_properties['linkMimeType']: + video_site = 'https://www.youtube.com' + url = link_url.replace('/watch?v=', '/embed/') + if '&' in url: + url = url.split('&')[0] + if '?utm_' in url: + url = url.split('?utm_')[0] podcast_str += \ " \n" elif 'video' in podcast_properties['linkMimeType']: From 066d399df1d26d03791651012acfbed4024ea092 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 19:20:39 +0000 Subject: [PATCH 12/16] Video url --- webapp_podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp_podcast.py b/webapp_podcast.py index 91d7fcfd1..73e4472a1 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -199,7 +199,7 @@ def html_podcast_episode(css_cache: {}, translate: {}, url = url.split('?utm_')[0] podcast_str += \ " \n" elif 'video' in podcast_properties['linkMimeType']: From e517f224dfc9c1506c8e2692f14a3322d20d8ae8 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 19:35:07 +0000 Subject: [PATCH 13/16] Video height --- webapp_podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp_podcast.py b/webapp_podcast.py index 73e4472a1..a74db6b5b 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -199,7 +199,7 @@ def html_podcast_episode(css_cache: {}, translate: {}, url = url.split('?utm_')[0] podcast_str += \ " \n" elif 'video' in podcast_properties['linkMimeType']: From 7a8d419ac51c09ee619f72c2656b655f57327c06 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 19:47:57 +0000 Subject: [PATCH 14/16] Height percent --- webapp_podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp_podcast.py b/webapp_podcast.py index a74db6b5b..0fbdc6872 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -199,7 +199,7 @@ def html_podcast_episode(css_cache: {}, translate: {}, url = url.split('?utm_')[0] podcast_str += \ " \n" elif 'video' in podcast_properties['linkMimeType']: From 89f6d4c374aed45c3991a772d0667c1d2d647e81 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 20:04:39 +0000 Subject: [PATCH 15/16] Video dimensions --- webapp_podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp_podcast.py b/webapp_podcast.py index 0fbdc6872..1143a5f4f 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -199,7 +199,7 @@ def html_podcast_episode(css_cache: {}, translate: {}, url = url.split('?utm_')[0] podcast_str += \ " \n" elif 'video' in podcast_properties['linkMimeType']: From 889e57239d8720ce0aa6d8c0a93eedbcf80dbdf9 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Fri, 14 Jan 2022 20:20:58 +0000 Subject: [PATCH 16/16] Tidying --- webapp_podcast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/webapp_podcast.py b/webapp_podcast.py index 1143a5f4f..18bef82e7 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -191,7 +191,6 @@ def html_podcast_episode(css_cache: {}, translate: {}, '\n \n' elif podcast_properties.get('linkMimeType'): if '/youtube' in podcast_properties['linkMimeType']: - video_site = 'https://www.youtube.com' url = link_url.replace('/watch?v=', '/embed/') if '&' in url: url = url.split('&')[0]