diff --git a/content.py b/content.py index 2394e21a1..40b380487 100644 --- a/content.py +++ b/content.py @@ -486,6 +486,22 @@ def add_web_links(content: str) -> str: return content +def safe_web_text(arbitrary_html: str) -> str: + """Turns arbitrary html into something safe. + So if the arbitrary html contains attack scripts those will be removed + """ + # first remove the markup, so that we have something safe + safe_text = remove_html(arbitrary_html) + if not safe_text: + return '' + # remove any spurious characters found in podcast descriptions + remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]', '__') + for remchar in remove_chars: + safe_text = safe_text.replace(remchar, '') + # recreate any url links safely + return add_web_links(safe_text) + + def _add_hash_tags(word_str: str, http_prefix: str, domain: str, replace_hashtags: {}, post_hashtags: {}) -> bool: """Detects hashtags and adds them to the replacements dict diff --git a/follow.py b/follow.py index 70e7c0b71..d65636899 100644 --- a/follow.py +++ b/follow.py @@ -999,20 +999,20 @@ def send_follow_requestViaServer(base_dir: str, session, # get the actor inbox for the To handle origin_domain = from_domain - (inboxUrl, _, _, fromPersonId, sharedInbox, avatarUrl, - displayName, _) = get_person_box(signing_priv_key_pem, origin_domain, - base_dir, session, wf_request, - person_cache, - project_version, http_prefix, - from_nickname, - from_domain, post_to_box, 52025) + (inbox_url, _, _, from_person_id, _, _, + _, _) = get_person_box(signing_priv_key_pem, origin_domain, + base_dir, session, wf_request, + person_cache, + project_version, http_prefix, + from_nickname, + from_domain, post_to_box, 52025) - if not inboxUrl: + if not inbox_url: if debug: print('DEBUG: follow request no ' + post_to_box + ' was found for ' + handle) return 3 - if not fromPersonId: + if not from_person_id: if debug: print('DEBUG: follow request no actor was found for ' + handle) return 4 @@ -1026,10 +1026,10 @@ def send_follow_requestViaServer(base_dir: str, session, } post_result = \ post_json(http_prefix, from_domain_full, - session, new_follow_json, [], inboxUrl, headers, 3, True) + session, new_follow_json, [], inbox_url, headers, 3, True) if not post_result: if debug: - print('DEBUG: POST follow request failed for c2s to ' + inboxUrl) + print('DEBUG: POST follow request failed for c2s to ' + inbox_url) return 5 if debug: @@ -1095,22 +1095,22 @@ def send_unfollow_request_via_server(base_dir: str, session, # get the actor inbox for the To handle origin_domain = from_domain - (inboxUrl, pubKeyId, pubKey, fromPersonId, sharedInbox, avatarUrl, - displayName, _) = get_person_box(signing_priv_key_pem, - origin_domain, - base_dir, session, - wf_request, person_cache, - project_version, http_prefix, - from_nickname, - from_domain, post_to_box, - 76536) + (inbox_url, _, _, from_person_id, _, _, + _, _) = get_person_box(signing_priv_key_pem, + origin_domain, + base_dir, session, + wf_request, person_cache, + project_version, http_prefix, + from_nickname, + from_domain, post_to_box, + 76536) - if not inboxUrl: + if not inbox_url: if debug: print('DEBUG: unfollow no ' + post_to_box + ' was found for ' + handle) return 3 - if not fromPersonId: + if not from_person_id: if debug: print('DEBUG: unfollow no actor was found for ' + handle) return 4 @@ -1124,10 +1124,10 @@ def send_unfollow_request_via_server(base_dir: str, session, } post_result = \ post_json(http_prefix, from_domain_full, - session, unfollow_json, [], inboxUrl, headers, 3, True) + session, unfollow_json, [], inbox_url, headers, 3, True) if not post_result: if debug: - print('DEBUG: POST unfollow failed for c2s to ' + inboxUrl) + print('DEBUG: POST unfollow failed for c2s to ' + inbox_url) return 5 if debug: diff --git a/newswire.py b/newswire.py index 8c35fb921..11867ba2d 100644 --- a/newswire.py +++ b/newswire.py @@ -229,8 +229,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str, # Include tags from podcast categories if podcast_properties: if podcast_properties.get('explicit'): - if '#NSFW' not in post_tags: - post_tags.append('#NSFW') + if '#nsfw' not in post_tags: + post_tags.append('#nsfw') post_tags += podcast_properties['categories'] @@ -446,7 +446,8 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}: """ if ' {}: # get the image for the podcast, if it exists podcast_episode_image = None - episode_image_tags = [' {}: item_str = xml_str episode_image = item_str.split(image_tag)[1] + if image_tag + ' ' in item_str and '>' in episode_image: + episode_image = episode_image.split('>')[0] + if 'href="' in episode_image: episode_image = episode_image.split('href="')[1] if '"' in episode_image: episode_image = episode_image.split('"')[0] podcast_episode_image = episode_image break - else: - if '>' in episode_image: - episode_image = episode_image.split('>')[1] - if '<' in episode_image: - episode_image = episode_image.split('<')[0] - if '://' in episode_image and '.' in episode_image: - podcast_episode_image = episode_image - break + elif 'url="' in episode_image: + episode_image = episode_image.split('url="')[1] + if '"' in episode_image: + episode_image = episode_image.split('"')[0] + podcast_episode_image = episode_image + break + elif '>' in episode_image: + episode_image = episode_image.split('>')[1] + if '<' in episode_image: + episode_image = episode_image.split('<')[0] + if '://' in episode_image and '.' in episode_image: + podcast_episode_image = episode_image + break # get categories if they exist. These can be turned into hashtags podcast_categories = _get_podcast_categories(xml_item, xml_str) @@ -1024,9 +1033,15 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, description = atom_item.split('')[1] description = description.split('')[0] description = remove_html(description) - link = atom_item.split('')[1] - link = link.split('')[0] - link = 'https://www.youtube.com/watch?v=' + link.strip() + + link, _ = get_link_from_rss_item(atom_item) + if not link: + link = atom_item.split('')[1] + link = link.split('')[0] + link = 'https://www.youtube.com/watch?v=' + link.strip() + if not link: + continue + pub_date = atom_item.split('')[1] pub_date = pub_date.split('')[0] @@ -1035,13 +1050,16 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, if _valid_feed_date(pub_date_str): post_filename = '' votes_status = [] + podcast_properties = xml_podcast_to_dict(atom_item, xml_str) + if podcast_properties: + podcast_properties['linkMimeType'] = 'video/youtube' _add_newswire_dict_entry(base_dir, domain, result, pub_date_str, title, link, votes_status, post_filename, description, moderated, mirrored, [], 32, session, debug, - None) + podcast_properties) post_ctr += 1 if post_ctr >= max_posts_per_source: break diff --git a/tests.py b/tests.py index ea7349a6c..07baf487c 100644 --- a/tests.py +++ b/tests.py @@ -128,6 +128,7 @@ from inbox import json_post_allows_comments from inbox import valid_inbox from inbox import valid_inbox_filenames from categories import guess_hashtag_category +from content import safe_web_text from content import words_similarity from content import get_price_from_string from content import limit_repeated_words @@ -6488,6 +6489,30 @@ def _test_get_link_from_rss_item() -> None: assert link.startswith('https://test.link/creativecommons') +def _test_safe_webtext() -> None: + print('test_safe_webtext') + web_text = '

Some text including a link https://some.site/some-path

' + expected_text = 'Some text including a link ' + \ + '' not in safe_text + assert '

' not in safe_text + + web_text = 'Some text with ' + expected_text = 'Some text with some script' + safe_text = safe_web_text(web_text) + if expected_text != safe_text: + print('Original html: ' + web_text) + print('Expected html: ' + expected_text) + print('Actual html: ' + safe_text) + assert expected_text == safe_text + + def run_all_tests(): base_dir = os.getcwd() print('Running tests...') @@ -6504,6 +6529,7 @@ def run_all_tests(): 'message_json', 'liked_post_json']) _test_checkbox_names() _test_functions() + _test_safe_webtext() _test_get_link_from_rss_item() _test_xml_podcast_dict() _test_get_actor_from_in_reply_to() diff --git a/webapp_media.py b/webapp_media.py index 3f019243e..03cb19faa 100644 --- a/webapp_media.py +++ b/webapp_media.py @@ -39,8 +39,8 @@ def _add_embedded_video_from_sites(translate: {}, content: str, url = content.split('>vimeo.com/')[1] if '<' in url: url = url.split('<')[0] - content = \ - content + "
\n\n" + elif 'video' in podcast_properties['linkMimeType']: video_mime_type = podcast_properties['linkMimeType'] video_msg = 'Your browser does not support the video element.' podcast_str += \ @@ -209,11 +221,8 @@ def html_podcast_episode(css_cache: {}, translate: {}, if newswire_item[4]: podcast_description = \ html.unescape(urllib.parse.unquote_plus(newswire_item[4])) - podcast_description = remove_html(podcast_description) + podcast_description = safe_web_text(podcast_description) if podcast_description: - remove_chars = ('Œ', 'â€', 'ğŸ', '�') - for remchar in remove_chars: - podcast_description = podcast_description.replace(remchar, '') podcast_str += '

' + podcast_description + '

\n' # donate button