Handle situations where urls are lists

2023-12-09 14:18:24 +00:00 · 2023-12-09 14:18:24 +00:00 · 080b2ca352
parent 357896c9cf
commit 080b2ca352
23 changed files with 218 additions and 95 deletions
--- a/blog.py
+++ b/blog.py
@ -16,6 +16,7 @@ from webapp_utils import html_footer
 from webapp_utils import get_post_attachments_as_html
 from webapp_utils import edit_text_area
 from webapp_media import add_embedded_elements
+from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import get_attributed_to
 from utils import remove_eol
@ -314,7 +315,8 @@ def _html_blog_post_content(debug: bool, session, authorized: bool,
                continue
            if not tag_json.get('url'):
                continue
-            citation_url = remove_html(tag_json['url'])
+            url_str = get_url_from_post(tag_json['url'])
+            citation_url = remove_html(url_str)
            citation_name = remove_html(tag_json['name'])
            citations_str += \
                '<li><a href="' + citation_url + '">' + \
@ -482,7 +484,8 @@ def html_blog_post(session, authorized: bool,
    title = post_json_object['object']['summary']
    url = ''
    if post_json_object['object'].get('url'):
-        url = remove_html(post_json_object['object']['url'])
+        url_str = get_url_from_post(post_json_object['object']['url'])
+        url = remove_html(url_str)
    snippet = _get_snippet_from_blog_content(post_json_object,
                                             system_language)
    blog_str = html_header_with_blog_markup(css_filename, instance_title,
--- a/bookmarks.py
+++ b/bookmarks.py
@ -11,6 +11,7 @@ import os
 from pprint import pprint
 from webfinger import webfinger_handle
 from auth import create_basic_auth_header
+from utils import get_url_from_post
 from utils import remove_domain_port
 from utils import has_users_path
 from utils import get_full_domain
@ -603,7 +604,8 @@ def outbox_bookmark(recent_posts_cache: {},
    if debug:
        print('DEBUG: c2s bookmark Add request arrived in outbox')

-    message_url = remove_id_ending(message_json['object']['url'])
+    url_str = get_url_from_post(message_json['object']['url'])
+    message_url = remove_id_ending(url_str)
    message_url = remove_html(message_url)
    domain = remove_domain_port(domain)
    post_filename = locate_post(base_dir, nickname, domain, message_url)
@ -661,7 +663,8 @@ def outbox_undo_bookmark(recent_posts_cache: {},
    if debug:
        print('DEBUG: c2s unbookmark Remove request arrived in outbox')

-    message_url = remove_id_ending(message_json['object']['url'])
+    url_str = get_url_from_post(message_json['object']['url'])
+    message_url = remove_id_ending(url_str)
    message_url = remove_html(message_url)
    domain = remove_domain_port(domain)
    post_filename = locate_post(base_dir, nickname, domain, message_url)
--- a/content.py
+++ b/content.py
@ -15,6 +15,7 @@ import email.parser
 import urllib.parse
 from shutil import copyfile
 from dateutil.parser import parse
+from utils import get_url_from_post
 from utils import is_right_to_left_text
 from utils import language_right_to_left
 from utils import binary_is_image
@ -446,11 +447,14 @@ def replace_emoji_from_tags(session, base_dir: str,
            continue
        if not tag_item['icon'].get('url'):
            continue
-        if '/' not in tag_item['icon']['url']:
+        url_str = get_url_from_post(tag_item['icon']['url'])
+        if '/' not in url_str:
            continue
        if tag_item['name'] not in content:
            continue
-        tag_url = remove_html(tag_item['icon']['url'])
+        tag_url = remove_html(url_str)
+        if not tag_url:
+            continue
        icon_name = tag_url.split('/')[-1]
        if icon_name:
            if len(icon_name) > 1:
@ -532,7 +536,8 @@ def replace_emoji_from_tags(session, base_dir: str,
            emoji_tag_name = tag_item['name'].replace(':', '')
        else:
            emoji_tag_name = ''
-        tag_url = remove_html(tag_item['icon']['url'])
+        url_str = get_url_from_post(tag_item['icon']['url'])
+        tag_url = remove_html(url_str)
        emoji_html = "<img src=\"" + tag_url + "\" alt=\"" + \
            emoji_tag_name + \
            "\" align=\"middle\" class=\"" + html_class + "\"/>"
--- a/daemon.py
+++ b/daemon.py
@ -300,6 +300,7 @@ from languages import set_actor_languages
 from languages import get_understood_languages
 from like import update_likes_collection
 from reaction import update_reaction_collection
+from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import corp_servers
 from utils import get_attributed_to
@ -2209,9 +2210,19 @@ class PubServer(BaseHTTPRequestHandler):
        if has_object_dict(message_json):
            if debug:
                print('INBOX: checking object fields')
+            # check that url is a string or list
+            if message_json['object'].get('url'):
+                if not isinstance(message_json['object']['url'], str) and \
+                   not isinstance(message_json['object']['url'], list):
+                    print('INBOX: url should be a string or list ' +
+                          str(message_json['object']['url']))
+                    self._400()
+                    self.server.postreq_busy = False
+                    return 3
+            # check that some fields are strings
            string_fields = (
                'id', 'actor', 'type', 'content', 'published',
-                'summary', 'url'
+                'summary'
            )
            for check_field in string_fields:
                if not message_json['object'].get(check_field):
@ -6696,7 +6707,9 @@ class PubServer(BaseHTTPRequestHandler):
                    for m_type, last_part in uploads:
                        rep_str = '/' + last_part
                        if m_type == 'avatar':
-                            actor_url = remove_html(actor_json['icon']['url'])
+                            url_str = \
+                                get_url_from_post(actor_json['icon']['url'])
+                            actor_url = remove_html(url_str)
                            last_part_of_url = actor_url.split('/')[-1]
                            srch_str = '/' + last_part_of_url
                            actor_url = actor_url.replace(srch_str, rep_str)
@ -6709,8 +6722,10 @@ class PubServer(BaseHTTPRequestHandler):
                                actor_json['icon']['mediaType'] = \
                                    'image/' + img_ext
                        elif m_type == 'image':
+                            url_str = \
+                                get_url_from_post(actor_json['image']['url'])
                            im_url = \
-                                remove_html(actor_json['image']['url'])
+                                remove_html(url_str)
                            last_part_of_url = im_url.split('/')[-1]
                            srch_str = '/' + last_part_of_url
                            actor_json['image']['url'] = \
--- a/desktop_client.py
+++ b/desktop_client.py
@ -16,6 +16,7 @@ import webbrowser
 import urllib.parse
 from pathlib import Path
 from random import randint
+from utils import get_url_from_post
 from utils import get_actor_languages_list
 from utils import get_attributed_to
 from utils import remove_html
@ -761,7 +762,8 @@ def _show_replies_on_post(post_json_object: {}, max_replies: int) -> None:
    print('')
    ctr = 0
    for item in object_replies['items']:
-        item_url = remove_html(item['url'])
+        url_str = get_url_from_post(item['url'])
+        item_url = remove_html(url_str)
        print('  ↰ ' + str(item_url))
        ctr += 1
        if ctr >= max_replies:
--- a/inbox.py
+++ b/inbox.py
@ -18,6 +18,7 @@ from languages import understood_post_language
 from like import update_likes_collection
 from reaction import update_reaction_collection
 from reaction import valid_emoji_content
+from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import date_epoch
 from utils import date_utcnow
@ -192,9 +193,10 @@ def cache_svg_images(session, base_dir: str, http_prefix: str,
            continue
        if not attach.get('url'):
            continue
-        if attach['url'].endswith('.svg') or \
+        url_str = get_url_from_post(attach['url'])
+        if url_str.endswith('.svg') or \
           'svg' in attach['mediaType']:
-            url = remove_html(attach['url'])
+            url = remove_html(url_str)
            if not url_permitted(url, federation_list):
                continue
            # if this is a local image then it has already been
@ -1209,7 +1211,8 @@ def _person_receive_update(base_dir: str,
                           debug: bool, http_prefix: str) -> bool:
    """Changes an actor. eg: avatar or display name change
    """
-    person_url = remove_html(person_json['url'])
+    url_str = get_url_from_post(person_json['url'])
+    person_url = remove_html(url_str)
    if debug:
        print('Receiving actor update for ' + person_url +
              ' ' + str(person_json))
@ -1901,7 +1904,7 @@ def _receive_update_activity(recent_posts_cache: {}, session, base_dir: str,
                    print('Person Update: ' + str(message_json))
                    if debug:
                        print('DEBUG: Profile update was received for ' +
-                              message_json['object']['url'])
+                              str(message_json['object']['url']))
                        return True
    return False

@ -2714,14 +2717,15 @@ def _receive_bookmark(recent_posts_cache: {},
        if debug:
            print('DEBUG: inbox bookmark Add missing url')
        return False
-    if '/statuses/' not in message_json['object']['url']:
+    url_str = get_url_from_post(message_json['object']['url'])
+    if '/statuses/' not in url_str:
        if debug:
            print('DEBUG: inbox bookmark Add missing statuses un url')
        return False
    if debug:
        print('DEBUG: c2s inbox bookmark Add request arrived in outbox')

-    message_url2 = remove_html(message_json['object']['url'])
+    message_url2 = remove_html(url_str)
    message_url = remove_id_ending(message_url2)
    domain = remove_domain_port(domain)
    post_filename = locate_post(base_dir, nickname, domain, message_url)
@ -2840,7 +2844,8 @@ def _receive_undo_bookmark(recent_posts_cache: {},
        if debug:
            print('DEBUG: inbox undo bookmark Remove missing url')
        return False
-    if '/statuses/' not in message_json['object']['url']:
+    url_str = get_url_from_post(message_json['object']['url'])
+    if '/statuses/' not in url_str:
        if debug:
            print('DEBUG: inbox undo bookmark Remove missing statuses un url')
        return False
@ -2848,7 +2853,7 @@ def _receive_undo_bookmark(recent_posts_cache: {},
        print('DEBUG: c2s inbox Remove bookmark ' +
              'request arrived in outbox')

-    message_url2 = remove_html(message_json['object']['url'])
+    message_url2 = remove_html(url_str)
    message_url = remove_id_ending(message_url2)
    domain = remove_domain_port(domain)
    post_filename = locate_post(base_dir, nickname, domain, message_url)
--- a/maps.py
+++ b/maps.py
@ -9,6 +9,7 @@ __module_group__ = "Core"


 import os
+from utils import get_url_from_post
 from utils import is_float
 from utils import acct_dir
 from utils import load_json
@ -400,7 +401,8 @@ def get_map_preferences_url(base_dir: str, nickname: str, domain: str) -> str:
    if os.path.isfile(maps_filename):
        maps_json = load_json(maps_filename)
        if maps_json.get('url'):
-            return remove_html(maps_json['url'])
+            url_str = get_url_from_post(maps_json['url'])
+            return remove_html(url_str)
    return None


--- a/mastoapiv1.py
+++ b/mastoapiv1.py
@ -8,6 +8,7 @@ __status__ = "Production"
 __module_group__ = "API"

 import os
+from utils import get_url_from_post
 from utils import load_json
 from utils import get_config_param
 from utils import acct_dir
@ -79,8 +80,10 @@ def _meta_data_instance_v1(show_accounts: bool,
    if admin_actor.get('published'):
        created_at = admin_actor['published']

-    icon_url = remove_html(admin_actor['icon']['url'])
-    image_url = remove_html(admin_actor['image']['url'])
+    url_str = get_url_from_post(admin_actor['icon']['url'])
+    icon_url = remove_html(url_str)
+    url_str = get_url_from_post(admin_actor['image']['url'])
+    image_url = remove_html(url_str)
    instance = {
        'approval_required': False,
        'invites_enabled': False,
@ -205,8 +208,10 @@ def _get_masto_api_v1account(base_dir: str, nickname: str, domain: str,
    account_json = load_json(account_filename)
    if not account_json:
        return {}
-    avatar_url = remove_html(account_json['icon']['url'])
-    image_url = remove_html(account_json['image']['url'])
+    url_str = get_url_from_post(account_json['icon']['url'])
+    avatar_url = remove_html(url_str)
+    url_str = get_url_from_post(account_json['image']['url'])
+    image_url = remove_html(url_str)
    joined_date = "2016-10-05T10:30:00Z"
    if account_json.get('published'):
        joined_date = account_json['published']
--- a/mastoapiv2.py
+++ b/mastoapiv2.py
@ -8,6 +8,7 @@ __status__ = "Production"
 __module_group__ = "API"

 import os
+from utils import get_url_from_post
 from utils import load_json
 from utils import get_config_param
 from utils import acct_dir
@ -85,8 +86,10 @@ def _meta_data_instance_v2(show_accounts: bool,
    if admin_actor.get('published'):
        created_at = admin_actor['published']

-    icon_url = remove_html(admin_actor['icon']['url'])
-    image_url = remove_html(admin_actor['image']['url'])
+    url_str = get_url_from_post(admin_actor['icon']['url'])
+    icon_url = remove_html(url_str)
+    url_str = get_url_from_post(admin_actor['image']['url'])
+    image_url = remove_html(url_str)
    thumbnail_url = http_prefix + '://' + domain_full + '/login.png'
    admin_email = None
    noindex = True
--- a/newswire.py
+++ b/newswire.py
@ -19,6 +19,7 @@ from datetime import timezone
 from collections import OrderedDict
 from utils import valid_post_date
 from categories import set_hashtag_category
+from utils import get_url_from_post
 from utils import remove_zero_length_strings
 from utils import date_from_string_format
 from utils import acct_handle_dir
@ -1143,7 +1144,8 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
            continue
        if not json_feed_item.get('url'):
            continue
-        if not isinstance(json_feed_item['url'], str):
+        url_str = get_url_from_post(json_feed_item['url'])
+        if not url_str:
            continue
        if not json_feed_item.get('date_published'):
            if not json_feed_item.get('date_modified'):
@ -1182,7 +1184,7 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
                        if tag_name not in description:
                            description += ' ' + tag_name

-        link = remove_html(json_feed_item['url'])
+        link = remove_html(url_str)
        if '://' not in link:
            continue
        if len(link) > max_bytes:
@ -1602,7 +1604,9 @@ def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
                    description = remove_html(description)
                    tags_from_post = _get_hashtags_from_post(post_json_object)
                    summary = post_json_object['object']['summary']
-                    url2 = remove_html(post_json_object['object']['url'])
+                    url_str = \
+                        get_url_from_post(post_json_object['object']['url'])
+                    url2 = remove_html(url_str)
                    _add_newswire_dict_entry(base_dir, domain,
                                             newswire, published,
                                             summary, url2,
--- a/person.py
+++ b/person.py
@ -37,6 +37,7 @@ from roles import set_role
 from roles import actor_roles_from_list
 from roles import get_actor_roles_list
 from media import process_meta_data
+from utils import get_url_from_post
 from utils import date_utcnow
 from utils import get_memorials
 from utils import is_account_dir
@ -200,7 +201,8 @@ def randomize_actor_images(person_json: {}) -> None:
    This causes other instances to update their cached avatar image
    """
    person_id = person_json['id']
-    last_part_of_filename = person_json['icon']['url'].split('/')[-1]
+    url_str = get_url_from_post(person_json['icon']['url'])
+    last_part_of_filename = url_str.split('/')[-1]
    existing_extension = last_part_of_filename.split('.')[1]
    # NOTE: these files don't need to have cryptographically
    # secure names
@ -210,7 +212,8 @@ def randomize_actor_images(person_json: {}) -> None:
    person_json['icon']['url'] = \
        base_url + '/system/accounts/avatars/' + nickname + \
        '/avatar' + rand_str + '.' + existing_extension
-    last_part_of_filename = person_json['image']['url'].split('/')[-1]
+    url_str = get_url_from_post(person_json['image']['url'])
+    last_part_of_filename = url_str.split('/')[-1]
    existing_extension = last_part_of_filename.split('.')[1]
    rand_str = str(randint(10000000000000, 99999999999999))  # nosec
    person_json['image']['url'] = \
@ -229,6 +232,9 @@ def get_actor_update_json(actor_json: {}) -> {}:
    indexable = False
    if actor_json.get('indexable'):
        indexable = True
+    actor_url = get_url_from_post(actor_json['url'])
+    icon_url = get_url_from_post(actor_json['icon']['url'])
+    image_url = get_url_from_post(actor_json['image']['url'])
    return {
        '@context': [
            "https://www.w3.org/ns/activitystreams",
@ -311,11 +317,11 @@ def get_actor_update_json(actor_json: {}) -> {}:
            'type': actor_json['type'],
            'icon': {
                'type': 'Image',
-                'url': actor_json['icon']['url']
+                'url': icon_url
            },
            'image': {
                'type': 'Image',
-                'url': actor_json['image']['url']
+                'url': image_url
            },
            'attachment': actor_json['attachment'],
            'following': actor_json['id'] + '/following',
@ -327,7 +333,7 @@ def get_actor_update_json(actor_json: {}) -> {}:
            'preferredUsername': actor_json['preferredUsername'],
            'name': actor_json['name'],
            'summary': actor_json['summary'],
-            'url': actor_json['url'],
+            'url': actor_url,
            'manuallyApprovesFollowers': manually_approves_followers,
            'discoverable': actor_json['discoverable'],
            'memorial': memorial,
@ -1844,8 +1850,9 @@ def get_person_avatar_url(base_dir: str, person_url: str,

    if person_json.get('icon'):
        if person_json['icon'].get('url'):
-            if '.svg' not in person_json['icon']['url'].lower():
-                return remove_html(person_json['icon']['url'])
+            url_str = get_url_from_post(person_json['icon']['url'])
+            if '.svg' not in url_str.lower():
+                return remove_html(url_str)
    return None


--- a/pgp.py
+++ b/pgp.py
@ -12,6 +12,7 @@ import base64
 import subprocess
 from pathlib import Path
 from person import get_actor_json
+from utils import get_url_from_post
 from utils import safe_system_string
 from utils import contains_pgp_public_key
 from utils import is_pgp_encrypted
@ -712,18 +713,20 @@ def pgp_public_key_upload(base_dir: str, session,
 def actor_to_vcard(actor: {}, domain: str) -> str:
    """Returns a vcard for a given actor
    """
+    actor_url_str = get_url_from_post(actor['url'])
    vcard_str = 'BEGIN:VCARD\n'
    vcard_str += 'VERSION:4.0\n'
    vcard_str += 'REV:' + actor['published'] + '\n'
    vcard_str += 'FN:' + remove_html(actor['name']) + '\n'
    vcard_str += 'NICKNAME:' + actor['preferredUsername'] + '\n'
-    vcard_str += 'URL;TYPE=profile:' + actor['url'] + '\n'
+    vcard_str += 'URL;TYPE=profile:' + actor_url_str + '\n'
    blog_address = get_blog_address(actor)
    if blog_address:
        vcard_str += 'URL;TYPE=blog:' + blog_address + '\n'
    vcard_str += 'NOTE:' + remove_html(actor['summary']) + '\n'
-    if actor['icon']['url']:
-        vcard_str += 'PHOTO:' + actor['icon']['url'] + '\n'
+    url_str = get_url_from_post(actor['icon']['url'])
+    if url_str:
+        vcard_str += 'PHOTO:' + url_str + '\n'
    pgp_key = get_pgp_pub_key(actor)
    if pgp_key:
        vcard_str += 'KEY:data:application/pgp-keys;base64,' + \
@ -801,18 +804,20 @@ def actor_to_vcard_xml(actor: {}, domain: str) -> str:
        vcard_str += '    <impp>' + \
            '<parameters><type><text>cwtch</text></type></parameters>' + \
            '<text>' + cwtch_address + '</text></impp>\n'
+    url_str = get_url_from_post(actor['url'])
    vcard_str += '    <url>' + \
        '<parameters><type><text>profile</text></type></parameters>' + \
-        '<uri>' + actor['url'] + '</uri></url>\n'
+        '<uri>' + url_str + '</uri></url>\n'
    blog_address = get_blog_address(actor)
    if blog_address:
        vcard_str += '    <url>' + \
            '<parameters><type><text>blog</text></type></parameters>' + \
            '<uri>' + blog_address + '</uri></url>\n'
    vcard_str += '    <rev>' + actor['published'] + '</rev>\n'
-    if actor['icon']['url']:
+    url_str = get_url_from_post(actor['icon']['url'])
+    if url_str:
        vcard_str += \
-            '    <photo><uri>' + actor['icon']['url'] + '</uri></photo>\n'
+            '    <photo><uri>' + url_str + '</uri></photo>\n'
    pgp_key = get_pgp_pub_key(actor)
    if pgp_key:
        pgp_key_encoded = \
--- a/posts.py
+++ b/posts.py
@ -34,6 +34,7 @@ from webfinger import webfinger_handle
 from httpsig import create_signed_header
 from siteactive import site_is_active
 from languages import understood_post_language
+from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import date_epoch
 from utils import date_utcnow
@ -406,7 +407,8 @@ def get_person_box(signing_priv_key_pem: str, origin_domain: str,
    avatar_url = None
    if person_json.get('icon'):
        if person_json['icon'].get('url'):
-            avatar_url = remove_html(person_json['icon']['url'])
+            url_str = get_url_from_post(person_json['icon']['url'])
+            avatar_url = remove_html(url_str)
    display_name = None
    possible_display_name = None
    if person_json.get('name'):
@ -652,16 +654,16 @@ def _get_posts(session, outbox_url: str, max_posts: int,
                        if tag_item.get('name') and tag_item.get('icon'):
                            if tag_item['icon'].get('url'):
                                # No emoji from non-permitted domains
-                                if url_permitted(tag_item['icon']['url'],
+                                url_str = \
+                                    get_url_from_post(tag_item['icon']['url'])
+                                if url_permitted(url_str,
                                                 federation_list):
                                    emoji_name = tag_item['name']
-                                    emoji_icon = \
-                                        remove_html(tag_item['icon']['url'])
+                                    emoji_icon = remove_html(url_str)
                                    emoji[emoji_name] = emoji_icon
                                else:
                                    if debug:
-                                        print('url not permitted ' +
-                                              tag_item['icon']['url'])
+                                        print('url not permitted ' + url_str)
                    if tag_type == 'mention':
                        if tag_item.get('name'):
                            if tag_item['name'] not in mentions:
@ -703,15 +705,15 @@ def _get_posts(session, outbox_url: str, max_posts: int,
                    for attach in this_item['attachment']:
                        if attach.get('name') and attach.get('url'):
                            # no attachments from non-permitted domains
-                            attach_url = remove_html(attach['url'])
+                            url_str = get_url_from_post(attach['url'])
+                            attach_url = remove_html(url_str)
                            if url_permitted(attach_url,
                                             federation_list):
                                attachment.append([attach['name'],
                                                   attach_url])
                            else:
                                if debug:
-                                    print('url not permitted ' +
-                                          attach['url'])
+                                    print('url not permitted ' + url_str)

            sensitive = False
            if this_item.get('sensitive'):
@ -906,9 +908,9 @@ def _get_posts_for_blocked_domains(base_dir: str,
                    continue
                if is_blocked_domain(base_dir, post_domain):
                    if item['object'].get('url'):
-                        url = item['object']['url']
+                        url = get_url_from_post(item['object']['url'])
                    else:
-                        url = item['object']['id']
+                        url = get_url_from_post(item['object']['id'])
                    url = remove_html(url)
                    if not blocked_posts.get(post_domain):
                        blocked_posts[post_domain] = [url]
@ -929,9 +931,9 @@ def _get_posts_for_blocked_domains(base_dir: str,
                        continue
                    if is_blocked_domain(base_dir, post_domain):
                        if item['object'].get('url'):
-                            url = item['object']['url']
+                            url = get_url_from_post(item['object']['url'])
                        else:
-                            url = item['object']['id']
+                            url = get_url_from_post(item['object']['id'])
                        url = remove_html(url)
                        if not blocked_posts.get(post_domain):
                            blocked_posts[post_domain] = [url]
@ -2206,7 +2208,8 @@ def create_blog_post(base_dir: str,
                           low_bandwidth, content_license_url,
                           media_license_url, media_creator,
                           languages_understood, translate, buy_url, chat_url)
-    obj_url = remove_html(blog_json['object']['url'])
+    url_str = get_url_from_post(blog_json['object']['url'])
+    obj_url = remove_html(url_str)
    if '/@/' not in obj_url:
        blog_json['object']['url'] = obj_url.replace('/@', '/users/')
    _append_citations_to_blog_post(base_dir, nickname, domain, blog_json)
--- a/tests.py
+++ b/tests.py
@ -56,6 +56,7 @@ from follow import clear_followers
 from follow import send_follow_request_via_server
 from follow import send_unfollow_request_via_server
 from siteactive import site_is_active
+from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import date_utcnow
 from utils import is_right_to_left_text
@ -1470,10 +1471,11 @@ def test_post_message_between_servers(base_dir: str) -> None:
        assert attached.get('type')
        assert attached.get('url')
        assert attached['mediaType'] == 'image/png'
-        if '/system/media_attachments/files/' not in attached['url']:
-            print(attached['url'])
-        assert '/system/media_attachments/files/' in attached['url']
-        assert attached['url'].endswith('.png')
+        url_str = get_url_from_post(attached['url'])
+        if '/system/media_attachments/files/' not in url_str:
+            print(str(attached['url']))
+        assert '/system/media_attachments/files/' in url_str
+        assert url_str.endswith('.png')
        assert attached.get('width')
        assert attached.get('height')
        assert attached['width'] > 0
@ -4255,7 +4257,7 @@ def _test_danger_svg(base_dir: str) -> None:
                            federation_list, debug,
                            svg_image_filename)

-    url = post_json_object['object']['attachment'][0]['url']
+    url = get_url_from_post(post_json_object['object']['attachment'][0]['url'])
    assert url == 'https://ratsratsrats.live/media/1234_wibble.svg'

    with open(svg_image_filename, 'rb') as fp_svg:
@ -7317,8 +7319,8 @@ def _test_xml_podcast_dict(base_dir: str) -> None:
    assert podcast_properties.get('funding')
    assert int(podcast_properties['episode']) == 5
    assert podcast_properties['funding']['text'] == "Support the show"
-    assert podcast_properties['funding']['url'] == \
-        "https://whoframed.rodger/donate"
+    url_str = get_url_from_post(podcast_properties['funding']['url'])
+    assert url_str == "https://whoframed.rodger/donate"
    assert len(podcast_properties['transcripts']) == 3
    assert len(podcast_properties['valueRecipients']) == 2
    assert len(podcast_properties['persons']) == 5
--- a/utils.py
+++ b/utils.py
@ -110,6 +110,31 @@ def date_epoch():
    return date_from_numbers(1970, 1, 1, 0, 0)


+def get_url_from_post(url_field) -> str:  # url_field: str, list or other
+    """Returns a url from a post object
+    """
+    if isinstance(url_field, str):  # already a single url string
+        return url_field
+    if isinstance(url_field, list):  # scan entries, first match wins
+        for url_dict in url_field:
+            if not isinstance(url_dict, dict):  # skips non-dicts, incl. str
+                continue
+            if 'href' not in url_dict:  # both href and mediaType required
+                continue
+            if 'mediaType' not in url_dict:
+                continue
+            if not isinstance(url_dict['href'], str):  # must be strings
+                continue
+            if not isinstance(url_dict['mediaType'], str):
+                continue
+            if url_dict['mediaType'] != 'text/html':  # only html links
+                continue
+            if '://' not in url_dict['href']:  # needs a scheme separator
+                continue
+            return url_dict['href']
+    return ''  # any other type, or no matching list entry
+
+
 def get_attributed_to(field) -> str:
    """Returns the actor
    """
@ -404,7 +429,7 @@ def get_media_descriptions_from_post(post_json_object: {}) -> str:
            continue
        descriptions += attach['name'] + ' '
        if attach.get('url'):
-            descriptions += attach['url'] + ' '
+            descriptions += get_url_from_post(attach['url']) + ' '
    return descriptions.strip()


@ -2056,7 +2081,7 @@ def _remove_attachment(base_dir: str, http_prefix: str, domain: str,
        return
    if not post_json['attachment'][0].get('url'):
        return
-    attachment_url = post_json['attachment'][0]['url']
+    attachment_url = get_url_from_post(post_json['attachment'][0]['url'])
    if not attachment_url:
        return
    attachment_url = remove_html(attachment_url)
--- a/video.py
+++ b/video.py
@ -7,6 +7,7 @@ __email__ = "bob@libreserver.org"
 __status__ = "Production"
 __module_group__ = "Timeline"

+from utils import get_url_from_post
 from utils import remove_html
 from utils import get_full_domain
 from utils import get_nickname_from_actor
@ -204,9 +205,10 @@ def convert_video_to_note(base_dir: str, nickname: str, domain: str,
                    continue
                if not lang.get('url'):
                    continue
-                if not isinstance(lang['url'], str):
+                url_str = get_url_from_post(lang['url'])
+                if not url_str:
                    continue
-                if not lang['url'].endswith('.vtt'):
+                if not url_str.endswith('.vtt'):
                    continue
                for understood in languages_understood:
                    if understood in lang['identifier']:
@ -214,7 +216,7 @@ def convert_video_to_note(base_dir: str, nickname: str, domain: str,
                            "type": "Document",
                            "name": understood,
                            "mediaType": "text/vtt",
-                            "url": lang['url']
+                            "url": url_str
                        })
                        break

--- a/webapp_moderation.py
+++ b/webapp_moderation.py
@ -8,6 +8,7 @@ __status__ = "Production"
 __module_group__ = "Moderation"

 import os
+from utils import get_url_from_post
 from utils import remove_html
 from utils import is_artist
 from utils import is_account_dir
@ -388,7 +389,8 @@ def html_moderation_info(translate: {}, base_dir: str,
        ext = ''
        if actor_json.get('icon'):
            if actor_json['icon'].get('url'):
-                avatar_url = remove_html(actor_json['icon']['url'])
+                url_str = get_url_from_post(actor_json['icon']['url'])
+                avatar_url = remove_html(url_str)
                if '.' in avatar_url:
                    ext = '.' + avatar_url.split('.')[-1]
        acct_url = \
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@ -12,6 +12,7 @@ import html
 import datetime
 import urllib.parse
 from shutil import copyfile
+from utils import get_url_from_post
 from utils import get_config_param
 from utils import remove_html
 from media import path_is_audio
@ -39,7 +40,8 @@ def _html_podcast_chapters(link_url: str,
    if not isinstance(podcast_properties[key], dict):
        return ''
    if podcast_properties[key].get('url'):
-        chapters_url = remove_html(podcast_properties[key]['url'])
+        url_str = get_url_from_post(podcast_properties[key]['url'])
+        chapters_url = remove_html(url_str)
    elif podcast_properties[key].get('uri'):
        chapters_url = podcast_properties[key]['uri']
    else:
@ -80,7 +82,8 @@ def _html_podcast_chapters(link_url: str,
                chapter_title = chapter['title']
                chapter_url = ''
                if chapter.get('url'):
-                    chapter_url = remove_html(chapter['url'])
+                    url_str = get_url_from_post(chapter['url'])
+                    chapter_url = remove_html(url_str)
                    chapter_title = \
                        '<a href="' + chapter_url + '">' + \
                        chapter['title'] + '<\a>'
@ -122,7 +125,8 @@ def _html_podcast_transcripts(podcast_properties: {}, translate: {}) -> str:
    for _ in podcast_properties[key]:
        transcript_url = None
        if podcast_properties[key].get('url'):
-            transcript_url = remove_html(podcast_properties[key]['url'])
+            url_str = get_url_from_post(podcast_properties[key]['url'])
+            transcript_url = remove_html(url_str)
        elif podcast_properties[key].get('uri'):
            transcript_url = podcast_properties[key]['uri']
        if not transcript_url:
@ -155,7 +159,8 @@ def _html_podcast_social_interactions(podcast_properties: {},
    if podcast_properties[key].get('uri'):
        episode_post_url = podcast_properties[key]['uri']
    elif podcast_properties[key].get('url'):
-        episode_post_url = remove_html(podcast_properties[key]['url'])
+        url_str = get_url_from_post(podcast_properties[key]['url'])
+        episode_post_url = remove_html(url_str)
    elif podcast_properties[key].get('text'):
        episode_post_url = podcast_properties[key]['text']
    else:
@ -439,7 +444,8 @@ def html_podcast_episode(translate: {},
    # donate button
    if podcast_properties.get('funding'):
        if podcast_properties['funding'].get('url'):
-            donate_url = remove_html(podcast_properties['funding']['url'])
+            url_str = get_url_from_post(podcast_properties['funding']['url'])
+            donate_url = remove_html(url_str)
            podcast_str += \
                '<p><span itemprop="funding"><a href="' + donate_url + \
                '" rel="donation"><button class="donateButton">' + \
--- a/webapp_post.py
+++ b/webapp_post.py
@ -24,6 +24,7 @@ from posts import post_is_muted
 from posts import get_person_box
 from posts import download_announce
 from posts import populate_replies_json
+from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import remove_markup_tag
 from utils import ap_proxy_type
@ -155,7 +156,8 @@ def _html_post_metadata_open_graph(domain: str, post_json_object: {},
                    "    <meta content=\"@" + actor_handle + \
                    "\" property=\"og:title\" />\n"
    if obj_json.get('url'):
-        obj_url = remove_html(obj_json['url'])
+        url_str = get_url_from_post(obj_json['url'])
+        obj_url = remove_html(url_str)
        metadata += \
            "    <meta content=\"" + obj_url + \
            "\" property=\"og:url\" />\n"
@ -211,7 +213,8 @@ def _html_post_metadata_open_graph(domain: str, post_json_object: {},
            metadata += \
                "    <meta content=\"" + description + \
                "\" name=\"og:description\">\n"
-            attach_url = remove_html(attach_json['url'])
+            url_str = get_url_from_post(attach_json['url'])
+            attach_url = remove_html(url_str)
            metadata += \
                "    <meta content=\"" + attach_url + \
                "\" property=\"og:image\" />\n"
@ -1197,7 +1200,8 @@ def _get_blog_citations_html(box_name: str,
            continue
        if not tag_json.get('url'):
            continue
-        citation_url = remove_html(tag_json['url'])
+        url_str = get_url_from_post(tag_json['url'])
+        citation_url = remove_html(url_str)
        citation_name = remove_html(tag_json['name'])
        citations_str += \
            '<li><a href="' + citation_url + '" tabindex="10">' + \
--- a/webapp_profile.py
+++ b/webapp_profile.py
@ -10,6 +10,7 @@ __module_group__ = "Web Interface"
 import os
 from pprint import pprint
 from webfinger import webfinger_handle
+from utils import get_url_from_post
 from utils import get_memorials
 from utils import text_in_file
 from utils import dangerous_markup
@ -267,7 +268,8 @@ def html_profile_after_search(recent_posts_cache: {}, max_recent_posts: int,
    avatar_url = ''
    if profile_json.get('icon'):
        if profile_json['icon'].get('url'):
-            avatar_url = remove_html(profile_json['icon']['url'])
+            url_str = get_url_from_post(profile_json['icon']['url'])
+            avatar_url = remove_html(url_str)
    if not avatar_url:
        avatar_url = get_person_avatar_url(base_dir, person_url, person_cache)
    display_name = search_nickname
@ -324,8 +326,8 @@ def html_profile_after_search(recent_posts_cache: {}, max_recent_posts: int,
    # profileBackgroundImage = ''
    # if profile_json.get('image'):
    #     if profile_json['image'].get('url'):
-    #         profileBackgroundImage = \
-    #             remove_html(profile_json['image']['url'])
+    #         url_str = get_url_from_post(profile_json['image']['url'])
+    #         profileBackgroundImage = remove_html(url_str)

    # url to return to
    back_url = path
@ -348,7 +350,8 @@ def html_profile_after_search(recent_posts_cache: {}, max_recent_posts: int,
    image_url = ''
    if profile_json.get('image'):
        if profile_json['image'].get('url'):
-            image_url = remove_html(profile_json['image']['url'])
+            url_str = get_url_from_post(profile_json['image']['url'])
+            image_url = remove_html(url_str)

    also_known_as = None
    if profile_json.get('alsoKnownAs'):
@ -1194,7 +1197,8 @@ def html_profile(signing_priv_key_pem: str,
    if profile_json.get('hasOccupation'):
        occupation_name = get_occupation_name(profile_json)

-    avatar_url = remove_html(profile_json['icon']['url'])
+    url_str = get_url_from_post(profile_json['icon']['url'])
+    avatar_url = remove_html(url_str)
    # use alternate path for local avatars to avoid any caching issues
    if '://' + domain_full + '/system/accounts/avatars/' in avatar_url:
        avatar_url = \
--- a/webapp_search.py
+++ b/webapp_search.py
@ -10,6 +10,7 @@ __module_group__ = "Web Interface"
 import os
 from shutil import copyfile
 import urllib.parse
+from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import get_attributed_to
 from utils import get_actor_from_post_id
@ -571,7 +572,8 @@ def html_skills_search(actor: str, translate: {}, base_dir: str,
                        skill_level_str = '0' + skill_level_str
                    if skill_level < 10:
                        skill_level_str = '0' + skill_level_str
-                    icon_url = remove_html(actor_json['icon']['url'])
+                    url_str = get_url_from_post(actor_json['icon']['url'])
+                    icon_url = remove_html(url_str)
                    index_str = \
                        skill_level_str + ';' + actor + ';' + \
                        actor_json['name'] + \
@ -611,7 +613,9 @@ def html_skills_search(actor: str, translate: {}, base_dir: str,
                                skill_level_str = '0' + skill_level_str
                            if skill_level < 10:
                                skill_level_str = '0' + skill_level_str
-                            icon_url = remove_html(actor_json['icon']['url'])
+                            url_str = \
+                                get_url_from_post(actor_json['icon']['url'])
+                            icon_url = remove_html(url_str)
                            index_str = \
                                skill_level_str + ';' + actor + ';' + \
                                actor_json['name'] + \
@ -1375,7 +1379,8 @@ def rss_hashtag_search(nickname: str, domain: str, port: int,
                    for attach in post_json_object['object']['attachment']:
                        if not attach.get('url'):
                            continue
-                        attach_url = remove_html(attach['url'])
+                        url_str = get_url_from_post(attach['url'])
+                        attach_url = remove_html(url_str)
                        hashtag_feed += \
                            '         <link>' + attach_url + '</link>'
                hashtag_feed += '     </item>'
--- a/webapp_utils.py
+++ b/webapp_utils.py
@ -12,6 +12,7 @@ from shutil import copyfile
 from collections import OrderedDict
 from session import get_json
 from session import get_json_valid
+from utils import get_url_from_post
 from utils import get_media_url_from_video
 from utils import get_attributed_to
 from utils import local_network_host
@ -857,7 +858,8 @@ def html_header_with_person_markup(css_filename: str, instance_title: str,
    domain_full = actor_json['id'].split('://')[1].split('/')[0]
    handle = actor_json['preferredUsername'] + '@' + domain_full

-    icon_url = remove_html(actor_json['icon']['url'])
+    url_str = get_url_from_post(actor_json['icon']['url'])
+    icon_url = remove_html(url_str)
    person_markup = \
        '      "about": {\n' + \
        '        "@type" : "Person",\n' + \
@ -893,7 +895,8 @@ def html_header_with_person_markup(css_filename: str, instance_title: str,
        '    </script>\n'

    description = remove_html(description)
-    actor2_url = remove_html(actor_json['url'])
+    url_str = get_url_from_post(actor_json['url'])
+    actor2_url = remove_html(url_str)
    og_metadata = \
        "    <meta content=\"profile\" property=\"og:type\" />\n" + \
        "    <meta content=\"" + description + \
@ -1344,7 +1347,7 @@ def get_post_attachments_as_html(base_dir: str,
            name = attach['hreflang']
        url = None
        if attach.get('url'):
-            url = attach['url']
+            url = get_url_from_post(attach['url'])
        elif attach.get('href'):
            url = attach['href']
        if name and url:
@ -1397,7 +1400,8 @@ def get_post_attachments_as_html(base_dir: str,
            image_description = attach['name'].replace('"', "'")
            image_description = remove_html(image_description)
        if _is_image_mime_type(media_type):
-            image_url = remove_html(attach['url'])
+            url_str = get_url_from_post(attach['url'])
+            image_url = remove_html(url_str)
            if image_url in attached_urls:
                continue
            attached_urls.append(image_url)
@ -1452,7 +1456,8 @@ def get_post_attachments_as_html(base_dir: str,
                                '   ' + license_str + \
                                '</figcaption></figure>\n'
                    if post_json_object['object'].get('url'):
-                        image_post_url = post_json_object['object']['url']
+                        url_str = post_json_object['object']['url']
+                        image_post_url = get_url_from_post(url_str)
                    else:
                        image_post_url = post_json_object['object']['id']
                    image_post_url = remove_html(image_post_url)
@ -1554,7 +1559,8 @@ def get_post_attachments_as_html(base_dir: str,
                if box_name == 'tlmedia':
                    gallery_str += '<div class="gallery">\n'
                    if post_json_object['object'].get('url'):
-                        video_post_url = post_json_object['object']['url']
+                        url_str = post_json_object['object']['url']
+                        video_post_url = get_url_from_post(url_str)
                    else:
                        video_post_url = post_json_object['object']['id']
                    video_post_url = remove_html(video_post_url)
@ -1629,7 +1635,8 @@ def get_post_attachments_as_html(base_dir: str,
                attachment_ctr += 1
        elif _is_audio_mime_type(media_type):
            extension = '.mp3'
-            audio_url = remove_html(attach['url'])
+            url_str = get_url_from_post(attach['url'])
+            audio_url = remove_html(url_str)
            if audio_url in attached_urls:
                continue
            attached_urls.append(audio_url)
@ -1664,7 +1671,8 @@ def get_post_attachments_as_html(base_dir: str,
                        gallery_str += '    </audio>\n'
                        gallery_str += '  </a>\n'
                    if post_json_object['object'].get('url'):
-                        audio_post_url = post_json_object['object']['url']
+                        url_str = post_json_object['object']['url']
+                        audio_post_url = get_url_from_post(url_str)
                    else:
                        audio_post_url = post_json_object['object']['id']
                    audio_post_url = remove_html(audio_post_url)
--- a/webfinger.py
+++ b/webfinger.py
@ -13,6 +13,7 @@ from session import get_json
 from session import get_json_valid
 from cache import store_webfinger_in_cache
 from cache import get_webfinger_from_cache
+from utils import get_url_from_post
 from utils import remove_html
 from utils import acct_handle_dir
 from utils import get_attachment_property_value
@ -433,7 +434,8 @@ def _webfinger_update_avatar(wf_json: {}, actor_json: {}) -> bool:
    """Updates the avatar image link
    """
    found = False
-    avatar_url = remove_html(actor_json['icon']['url'])
+    url_str = get_url_from_post(actor_json['icon']['url'])
+    avatar_url = remove_html(url_str)
    media_type = actor_json['icon']['mediaType']
    for link in wf_json['links']:
        if not link.get('rel'):
@ -463,7 +465,8 @@ def _webfinger_update_vcard(wf_json: {}, actor_json: {}) -> bool:
        if link.get('type'):
            if link['type'] == 'text/vcard':
                return False
-    actor_url = remove_html(actor_json['url'])
+    url_str = get_url_from_post(actor_json['url'])
+    actor_url = remove_html(url_str)
    wf_json['links'].append({
        "href": actor_url,
        "rel": "http://webfinger.net/rel/profile-page",