mirror of https://gitlab.com/bashrc2/epicyon
Merge branch 'main' of gitlab.com:bashrc2/epicyon
commit 42f5c97601
content.py (16 changes)
@@ -486,6 +486,22 @@ def add_web_links(content: str) -> str:
     return content


+def safe_web_text(arbitrary_html: str) -> str:
+    """Turns arbitrary html into something safe.
+    So if the arbitrary html contains attack scripts those will be removed
+    """
+    # first remove the markup, so that we have something safe
+    safe_text = remove_html(arbitrary_html)
+    if not safe_text:
+        return ''
+    # remove any spurious characters found in podcast descriptions
+    remove_chars = ('Œ', 'â€', 'ğŸ', '<EFBFBD>', ']]', '__')
+    for remchar in remove_chars:
+        safe_text = safe_text.replace(remchar, '')
+    # recreate any url links safely
+    return add_web_links(safe_text)
+
+
 def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
                    replace_hashtags: {}, post_hashtags: {}) -> bool:
     """Detects hashtags and adds them to the replacements dict
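Note: safe_web_text composes the existing helpers remove_html and add_web_links. A minimal usage sketch (the input string is made up; assumes content.py from this repo is importable):

    from content import safe_web_text

    raw = '<p>New episode <script>alert(1)</script> https://some.site/ep1</p>'
    print(safe_web_text(raw))
    # markup is stripped, spurious characters removed, and the bare URL
    # is re-wrapped as a safe <a href=...> link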
follow.py (48 changes)
@@ -999,20 +999,20 @@ def send_follow_requestViaServer(base_dir: str, session,

     # get the actor inbox for the To handle
     origin_domain = from_domain
-    (inboxUrl, _, _, fromPersonId, sharedInbox, avatarUrl,
-     displayName, _) = get_person_box(signing_priv_key_pem, origin_domain,
-                                      base_dir, session, wf_request,
-                                      person_cache,
-                                      project_version, http_prefix,
-                                      from_nickname,
-                                      from_domain, post_to_box, 52025)
+    (inbox_url, _, _, from_person_id, _, _,
+     _, _) = get_person_box(signing_priv_key_pem, origin_domain,
+                            base_dir, session, wf_request,
+                            person_cache,
+                            project_version, http_prefix,
+                            from_nickname,
+                            from_domain, post_to_box, 52025)

-    if not inboxUrl:
+    if not inbox_url:
         if debug:
             print('DEBUG: follow request no ' + post_to_box +
                   ' was found for ' + handle)
         return 3
-    if not fromPersonId:
+    if not from_person_id:
         if debug:
             print('DEBUG: follow request no actor was found for ' + handle)
         return 4
@@ -1026,10 +1026,10 @@ def send_follow_requestViaServer(base_dir: str, session,
     }
     post_result = \
         post_json(http_prefix, from_domain_full,
-                  session, new_follow_json, [], inboxUrl, headers, 3, True)
+                  session, new_follow_json, [], inbox_url, headers, 3, True)
     if not post_result:
         if debug:
-            print('DEBUG: POST follow request failed for c2s to ' + inboxUrl)
+            print('DEBUG: POST follow request failed for c2s to ' + inbox_url)
         return 5

     if debug:
@@ -1095,22 +1095,22 @@ def send_unfollow_request_via_server(base_dir: str, session,

     # get the actor inbox for the To handle
     origin_domain = from_domain
-    (inboxUrl, pubKeyId, pubKey, fromPersonId, sharedInbox, avatarUrl,
-     displayName, _) = get_person_box(signing_priv_key_pem,
-                                      origin_domain,
-                                      base_dir, session,
-                                      wf_request, person_cache,
-                                      project_version, http_prefix,
-                                      from_nickname,
-                                      from_domain, post_to_box,
-                                      76536)
+    (inbox_url, _, _, from_person_id, _, _,
+     _, _) = get_person_box(signing_priv_key_pem,
+                            origin_domain,
+                            base_dir, session,
+                            wf_request, person_cache,
+                            project_version, http_prefix,
+                            from_nickname,
+                            from_domain, post_to_box,
+                            76536)

-    if not inboxUrl:
+    if not inbox_url:
         if debug:
             print('DEBUG: unfollow no ' + post_to_box +
                   ' was found for ' + handle)
         return 3
-    if not fromPersonId:
+    if not from_person_id:
         if debug:
             print('DEBUG: unfollow no actor was found for ' + handle)
         return 4
@@ -1124,10 +1124,10 @@ def send_unfollow_request_via_server(base_dir: str, session,
     }
     post_result = \
         post_json(http_prefix, from_domain_full,
-                  session, unfollow_json, [], inboxUrl, headers, 3, True)
+                  session, unfollow_json, [], inbox_url, headers, 3, True)
     if not post_result:
         if debug:
-            print('DEBUG: POST unfollow failed for c2s to ' + inboxUrl)
+            print('DEBUG: POST unfollow failed for c2s to ' + inbox_url)
         return 5

     if debug:
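Note: these renames bring the locals in line with the snake_case used elsewhere, and unused slots of the 8-tuple returned by get_person_box are now discarded with '_'. An illustrative stub of the unpacking pattern (get_person_box_stub and its values are hypothetical):

    def get_person_box_stub():
        # stands in for get_person_box, which returns an 8-tuple
        return ('https://host/users/alice/inbox', None, None,
                'https://host/users/alice', None, None, None, None)

    (inbox_url, _, _, from_person_id, _, _,
     _, _) = get_person_box_stub()
    print(inbox_url, from_person_id)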
newswire.py (50 changes)
@@ -229,8 +229,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str,
     # Include tags from podcast categories
     if podcast_properties:
         if podcast_properties.get('explicit'):
-            if '#NSFW' not in post_tags:
-                post_tags.append('#NSFW')
+            if '#nsfw' not in post_tags:
+                post_tags.append('#nsfw')

         post_tags += podcast_properties['categories']

@@ -446,7 +446,8 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
     """
     if '<podcast:' not in xml_item:
         if '<itunes:' not in xml_item:
-            return {}
+            if '<media:thumbnail' not in xml_item:
+                return {}

     podcast_properties = {
         "locations": [],
@@ -506,7 +507,7 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:

     # get the image for the podcast, if it exists
     podcast_episode_image = None
-    episode_image_tags = ['<itunes:image']
+    episode_image_tags = ['<itunes:image', '<media:thumbnail']
     for image_tag in episode_image_tags:
         item_str = xml_item
         if image_tag not in xml_item:
@@ -515,20 +516,28 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
             item_str = xml_str

         episode_image = item_str.split(image_tag)[1]
+        if image_tag + ' ' in item_str and '>' in episode_image:
+            episode_image = episode_image.split('>')[0]
+
         if 'href="' in episode_image:
             episode_image = episode_image.split('href="')[1]
             if '"' in episode_image:
                 episode_image = episode_image.split('"')[0]
                 podcast_episode_image = episode_image
                 break
-        else:
-            if '>' in episode_image:
-                episode_image = episode_image.split('>')[1]
-                if '<' in episode_image:
-                    episode_image = episode_image.split('<')[0]
-                    if '://' in episode_image and '.' in episode_image:
-                        podcast_episode_image = episode_image
-                        break
+        elif 'url="' in episode_image:
+            episode_image = episode_image.split('url="')[1]
+            if '"' in episode_image:
+                episode_image = episode_image.split('"')[0]
+                podcast_episode_image = episode_image
+                break
+        elif '>' in episode_image:
+            episode_image = episode_image.split('>')[1]
+            if '<' in episode_image:
+                episode_image = episode_image.split('<')[0]
+                if '://' in episode_image and '.' in episode_image:
+                    podcast_episode_image = episode_image
+                    break

     # get categories if they exist. These can be turned into hashtags
     podcast_categories = _get_podcast_categories(xml_item, xml_str)
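Note: the new url="..." branch covers image tags such as <media:thumbnail url="..."/>, which carry the address in a url attribute rather than an href. A standalone sketch of that extraction (example URL is hypothetical):

    xml_item = '<media:thumbnail url="https://cdn.example/ep1.jpg"/>'
    episode_image = xml_item.split('<media:thumbnail')[1]
    if 'url="' in episode_image:
        episode_image = episode_image.split('url="')[1].split('"')[0]
    print(episode_image)  # https://cdn.example/ep1.jpg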
@@ -1024,9 +1033,15 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
         description = atom_item.split('<summary>')[1]
         description = description.split('</summary>')[0]
         description = remove_html(description)
-        link = atom_item.split('<yt:videoId>')[1]
-        link = link.split('</yt:videoId>')[0]
-        link = 'https://www.youtube.com/watch?v=' + link.strip()
+
+        link, _ = get_link_from_rss_item(atom_item)
+        if not link:
+            link = atom_item.split('<yt:videoId>')[1]
+            link = link.split('</yt:videoId>')[0]
+            link = 'https://www.youtube.com/watch?v=' + link.strip()
+        if not link:
+            continue
+

         pub_date = atom_item.split('<published>')[1]
         pub_date = pub_date.split('</published>')[0]
@@ -1035,13 +1050,16 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
     if _valid_feed_date(pub_date_str):
         post_filename = ''
         votes_status = []
+        podcast_properties = xml_podcast_to_dict(atom_item, xml_str)
+        if podcast_properties:
+            podcast_properties['linkMimeType'] = 'video/youtube'
         _add_newswire_dict_entry(base_dir, domain,
                                  result, pub_date_str,
                                  title, link,
                                  votes_status, post_filename,
                                  description, moderated, mirrored,
                                  [], 32, session, debug,
-                                 None)
+                                 podcast_properties)
         post_ctr += 1
         if post_ctr >= max_posts_per_source:
             break
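Note: the YouTube atom parser now prefers whatever link get_link_from_rss_item finds, and only falls back to rebuilding a watch URL from <yt:videoId>. A standalone sketch of that fallback (example video id is hypothetical):

    atom_item = '<entry><yt:videoId>abc123</yt:videoId></entry>'
    link = ''  # suppose get_link_from_rss_item found nothing
    if not link and '<yt:videoId>' in atom_item:
        link = atom_item.split('<yt:videoId>')[1].split('</yt:videoId>')[0]
        link = 'https://www.youtube.com/watch?v=' + link.strip()
    print(link)  # https://www.youtube.com/watch?v=abc123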
tests.py (26 changes)
@@ -128,6 +128,7 @@ from inbox import json_post_allows_comments
 from inbox import valid_inbox
 from inbox import valid_inbox_filenames
 from categories import guess_hashtag_category
+from content import safe_web_text
 from content import words_similarity
 from content import get_price_from_string
 from content import limit_repeated_words
@@ -6488,6 +6489,30 @@ def _test_get_link_from_rss_item() -> None:
     assert link.startswith('https://test.link/creativecommons')


+def _test_safe_webtext() -> None:
+    print('test_safe_webtext')
+    web_text = '<p>Some text including a link https://some.site/some-path</p>'
+    expected_text = 'Some text including a link ' + \
+        '<a href="https://some.site/some-path"'
+    safe_text = safe_web_text(web_text)
+    if expected_text not in safe_text:
+        print('Original html: ' + web_text)
+        print('Expected html: ' + expected_text)
+        print('Actual html: ' + safe_text)
+    assert expected_text in safe_text
+    assert '<p>' not in safe_text
+    assert '</p>' not in safe_text
+
+    web_text = 'Some text with <script>some script</script>'
+    expected_text = 'Some text with some script'
+    safe_text = safe_web_text(web_text)
+    if expected_text != safe_text:
+        print('Original html: ' + web_text)
+        print('Expected html: ' + expected_text)
+        print('Actual html: ' + safe_text)
+    assert expected_text == safe_text
+
+
 def run_all_tests():
     base_dir = os.getcwd()
     print('Running tests...')
@@ -6504,6 +6529,7 @@ def run_all_tests():
                                  'message_json', 'liked_post_json'])
     _test_checkbox_names()
     _test_functions()
+    _test_safe_webtext()
     _test_get_link_from_rss_item()
     _test_xml_podcast_dict()
     _test_get_actor_from_in_reply_to()
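Note: one way to exercise the new test on its own, assuming the repo root is the working directory and the suite's dependencies are installed:

    from tests import _test_safe_webtext
    _test_safe_webtext()  # prints 'test_safe_webtext'; raises AssertionError on failure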
@@ -39,8 +39,8 @@ def _add_embedded_video_from_sites(translate: {}, content: str,
             url = content.split('>vimeo.com/')[1]
             if '<' in url:
                 url = url.split('<')[0]
-            content = \
-                content + "<center>\n<iframe loading=\"lazy\" " + \
+            content += \
+                "<center>\n<iframe loading=\"lazy\" " + \
                 "src=\"https://player.vimeo.com/video/" + \
                 url + "\" width=\"" + str(width) + \
                 "\" height=\"" + str(height) + \
@@ -57,8 +57,8 @@ def _add_embedded_video_from_sites(translate: {}, content: str,
                 url = url.split('&')[0]
             if '?utm_' in url:
                 url = url.split('?utm_')[0]
-            content = \
-                content + "<center>\n<iframe loading=\"lazy\" src=\"" + \
+            content += \
+                "<center>\n<iframe loading=\"lazy\" src=\"" + \
                 video_site + url + "\" width=\"" + str(width) + \
                 "\" height=\"" + str(height) + \
                 "\" frameborder=\"0\" allow=\"autoplay; fullscreen\" " + \
@@ -88,8 +88,8 @@ def _add_embedded_video_from_sites(translate: {}, content: str,
                 url = url.split('&')[0]
             if '?utm_' in url:
                 url = url.split('?utm_')[0]
-            content = \
-                content + "<center>\n<iframe loading=\"lazy\" src=\"" + \
+            content += \
+                "<center>\n<iframe loading=\"lazy\" src=\"" + \
                 video_site + url + "\" width=\"" + \
                 str(width) + "\" height=\"" + str(height) + \
                 "\" frameborder=\"0\" allow=\"autoplay; fullscreen\" " + \
@@ -103,8 +103,8 @@ def _add_embedded_video_from_sites(translate: {}, content: str,
             url = url.split('"')[0]
             if not url.endswith('/oembed'):
                 url = url + '/oembed'
-            content = \
-                content + "<center>\n<iframe loading=\"lazy\" src=\"" + \
+            content += \
+                "<center>\n<iframe loading=\"lazy\" src=\"" + \
                 video_site + url + "\" width=\"" + \
                 str(width) + "\" height=\"" + str(height) + \
                 "\" frameborder=\"0\" allow=\"fullscreen\" " + \
@@ -153,8 +153,8 @@ def _add_embedded_video_from_sites(translate: {}, content: str,
             if '"' not in url:
                 continue
             url = url.split('"')[0].replace('/watch/', '/embed/')
-            content = \
-                content + "<center>\n<iframe loading=\"lazy\" " + \
+            content += \
+                "<center>\n<iframe loading=\"lazy\" " + \
                 "sandbox=\"allow-same-origin " + \
                 "allow-scripts\" src=\"https://" + \
                 site + url + "\" width=\"" + str(width) + \
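Note: each _add_embedded_video_from_sites hunk above replaces explicit reassignment with augmented assignment. For Python strings the two spellings produce the same result:

    content = 'before'
    content = content + '<center>'   # old spelling
    content += '</center>'           # new spelling; identical effect for str
    assert content == 'before<center></center>'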
@@ -14,6 +14,7 @@ from shutil import copyfile
 from utils import get_config_param
 from utils import remove_html
 from media import path_is_audio
+from content import safe_web_text
 from webapp_utils import get_broken_link_substitute
 from webapp_utils import html_header_with_external_style
 from webapp_utils import html_footer
@@ -189,7 +190,18 @@ def html_podcast_episode(css_cache: {}, translate: {},
             translate['Your browser does not support the audio element.'] + \
             '\n      </audio>\n'
     elif podcast_properties.get('linkMimeType'):
-        if 'video' in podcast_properties['linkMimeType']:
+        if '/youtube' in podcast_properties['linkMimeType']:
+            url = link_url.replace('/watch?v=', '/embed/')
+            if '&' in url:
+                url = url.split('&')[0]
+            if '?utm_' in url:
+                url = url.split('?utm_')[0]
+            podcast_str += \
+                "      <iframe loading=\"lazy\" src=\"" + \
+                url + "\" width=\"400\" height=\"300\" " + \
+                "frameborder=\"0\" allow=\"autoplay; fullscreen\" " + \
+                "allowfullscreen>\n      </iframe>\n"
+        elif 'video' in podcast_properties['linkMimeType']:
             video_mime_type = podcast_properties['linkMimeType']
             video_msg = 'Your browser does not support the video element.'
             podcast_str += \
@@ -209,11 +221,8 @@ def html_podcast_episode(css_cache: {}, translate: {},
     if newswire_item[4]:
         podcast_description = \
             html.unescape(urllib.parse.unquote_plus(newswire_item[4]))
-        podcast_description = remove_html(podcast_description)
+        podcast_description = safe_web_text(podcast_description)
         if podcast_description:
-            remove_chars = ('Œ', 'â€', 'ğŸ', '<EFBFBD>')
-            for remchar in remove_chars:
-                podcast_description = podcast_description.replace(remchar, '')
             podcast_str += '<p>' + podcast_description + '</p>\n'

     # donate button
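Note: html_podcast_episode now routes YouTube links through an embed URL and delegates description sanitising to safe_web_text, which already strips the spurious characters previously removed inline here. A standalone sketch of the URL rewrite (example video id is hypothetical):

    link_url = 'https://www.youtube.com/watch?v=abc123&t=42'
    url = link_url.replace('/watch?v=', '/embed/')
    if '&' in url:
        url = url.split('&')[0]
    print(url)  # https://www.youtube.com/embed/abc123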