From eca0fa17366caaf1471a61872ba0d43d5123b500 Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Wed, 12 Jul 2023 12:08:02 +0100
Subject: [PATCH] Sanitise links to avoid injection attacks in rendered html

---
 blog.py              |  8 +++++---
 bookmarks.py         |  3 +++
 content.py           | 19 +++++++++----------
 daemon.py            | 15 +++++++--------
 desktop_client.py    |  3 ++-
 inbox.py             | 13 ++++++++-----
 maps.py              |  3 ++-
 mastoapiv1.py        | 11 +++++++----
 metadata.py          | 11 +++++++----
 newswire.py          | 10 +++++-----
 person.py            |  8 ++++----
 posts.py             | 31 +++++++++++++++++++------------
 utils.py             |  1 +
 video.py             | 10 ++++++----
 webapp_moderation.py |  3 ++-
 webapp_podcast.py    | 12 ++++++------
 webapp_post.py       | 16 ++++++++++------
 webapp_profile.py    |  9 +++++----
 webapp_search.py     | 10 +++++++---
 webapp_utils.py      | 19 +++++++++++++------
 webfinger.py         |  6 ++++--
 21 files changed, 132 insertions(+), 89 deletions(-)
diff --git a/blog.py b/blog.py
index 6c9462bbb..85358a644 100644
--- a/blog.py
+++ b/blog.py
@@ -311,9 +311,11 @@ def _html_blog_post_content(debug: bool, session, authorized: bool,
                 continue
             if not tag_json.get('url'):
                 continue
+            citation_url = remove_html(tag_json['url'])
+            citation_name = remove_html(tag_json['name'])
             citations_str += \
-                '<li><a href="' + tag_json['url'] + '">' + \
-                '<cite>' + tag_json['name'] + '</cite></a></li>\n'
+                '<li><a href="' + citation_url + '">' + \
+                '<cite>' + citation_name + '</cite></a></li>\n'
         if citations_str:
             citations_str = '<p><b>' + translate['Citations'] + \
                 ':</b></p>' + \
@@ -475,7 +477,7 @@ def html_blog_post(session, authorized: bool,
     title = post_json_object['object']['summary']
     url = ''
     if post_json_object['object'].get('url'):
-        url = post_json_object['object']['url']
+        url = remove_html(post_json_object['object']['url'])
     snippet = _get_snippet_from_blog_content(post_json_object,
                                              system_language)
     blog_str = html_header_with_blog_markup(css_filename, instance_title,
diff --git a/bookmarks.py b/bookmarks.py
index 584afa02a..be7f6ddf0 100644
--- a/bookmarks.py
+++ b/bookmarks.py
@@ -30,6 +30,7 @@ from utils import has_actor
 from utils import has_object_string_type
 from utils import text_in_file
 from utils import remove_eol
+from utils import remove_html
 from posts import get_person_box
 from session import post_json
 
@@ -599,6 +600,7 @@ def outbox_bookmark(recent_posts_cache: {},
         print('DEBUG: c2s bookmark Add request arrived in outbox')
 
     message_url = remove_id_ending(message_json['object']['url'])
+    message_url = remove_html(message_url)
     domain = remove_domain_port(domain)
     post_filename = locate_post(base_dir, nickname, domain, message_url)
     if not post_filename:
@@ -656,6 +658,7 @@ def outbox_undo_bookmark(recent_posts_cache: {},
         print('DEBUG: c2s unbookmark Remove request arrived in outbox')
 
     message_url = remove_id_ending(message_json['object']['url'])
+    message_url = remove_html(message_url)
     domain = remove_domain_port(domain)
     post_filename = locate_post(base_dir, nickname, domain, message_url)
     if not post_filename:
diff --git a/content.py b/content.py
index 0493ad087..500ac71c7 100644
--- a/content.py
+++ b/content.py
@@ -445,7 +445,8 @@ def replace_emoji_from_tags(session, base_dir: str,
             continue
         if tag_item['name'] not in content:
             continue
-        icon_name = tag_item['icon']['url'].split('/')[-1]
+        tag_url = remove_html(tag_item['icon']['url'])
+        icon_name = tag_url.split('/')[-1]
         if icon_name:
             if len(icon_name) > 1:
                 if icon_name[0].isdigit():
@@ -472,14 +473,12 @@ def replace_emoji_from_tags(session, base_dir: str,
                                           'no conversion of ' +
                                           str(icon_name) + ' to chr ' +
                                           tag_item['name'] + ' ' +
-                                          tag_item['icon']['url'])
+                                          tag_url)
                             if not replaced:
                                 _save_custom_emoji(session, base_dir,
                                                    tag_item['name'],
-                                                   tag_item['icon']['url'],
-                                                   debug)
-                                _update_common_emoji(base_dir,
-                                                     icon_name)
+                                                   tag_url, debug)
+                                _update_common_emoji(base_dir, icon_name)
                             else:
                                 _update_common_emoji(base_dir,
                                                      "0x" + icon_name)
@@ -501,12 +500,11 @@ def replace_emoji_from_tags(session, base_dir: str,
                                               'no conversion of ' +
                                               str(icode) + ' to chr ' +
                                               tag_item['name'] + ' ' +
-                                              tag_item['icon']['url'])
+                                              tag_url)
                                 if not replaced:
                                     _save_custom_emoji(session, base_dir,
                                                        tag_item['name'],
-                                                       tag_item['icon']['url'],
-                                                       debug)
+                                                       tag_url, debug)
                                     _update_common_emoji(base_dir,
                                                          icon_name)
                                 else:
@@ -529,7 +527,8 @@ def replace_emoji_from_tags(session, base_dir: str,
             emoji_tag_name = tag_item['name'].replace(':', '')
         else:
             emoji_tag_name = ''
-        emoji_html = "<img src=\"" + tag_item['icon']['url'] + "\" alt=\"" + \
+        tag_url = remove_html(tag_item['icon']['url'])
+        emoji_html = "<img src=\"" + tag_url + "\" alt=\"" + \
             emoji_tag_name + \
             "\" align=\"middle\" class=\"" + html_class + "\"/>"
         content = content.replace(tag_item['name'], emoji_html)
diff --git a/daemon.py b/daemon.py
index d966c1b3a..9ea3248e1 100644
--- a/daemon.py
+++ b/daemon.py
@@ -6419,7 +6419,7 @@ class PubServer(BaseHTTPRequestHandler):
                     for m_type, last_part in uploads:
                         rep_str = '/' + last_part
                         if m_type == 'avatar':
-                            actor_url = actor_json['icon']['url']
+                            actor_url = remove_html(actor_json['icon']['url'])
                             last_part_of_url = actor_url.split('/')[-1]
                             srch_str = '/' + last_part_of_url
                             actor_url = actor_url.replace(srch_str, rep_str)
@@ -6432,15 +6432,14 @@ class PubServer(BaseHTTPRequestHandler):
                                 actor_json['icon']['mediaType'] = \
                                     'image/' + img_ext
                         elif m_type == 'image':
-                            last_part_of_url = \
-                                actor_json['image']['url'].split('/')[-1]
+                            im_url = \
+                                remove_html(actor_json['image']['url'])
+                            last_part_of_url = im_url.split('/')[-1]
                             srch_str = '/' + last_part_of_url
-                            actor_json['image']['url'] = \
-                                actor_json['image']['url'].replace(srch_str,
-                                                                   rep_str)
-                            if '.' in actor_json['image']['url']:
-                                img_ext = \
-                                    actor_json['image']['url'].split('.')[-1]
+                            im_url = im_url.replace(srch_str, rep_str)
+                            actor_json['image']['url'] = im_url
+                            if '.' in im_url:
+                                img_ext = im_url.split('.')[-1]
                                 if img_ext == 'jpg':
                                     img_ext = 'jpeg'
                                 actor_json['image']['mediaType'] = \
diff --git a/desktop_client.py b/desktop_client.py
index 67175b4bb..79f27224e 100644
--- a/desktop_client.py
+++ b/desktop_client.py
@@ -742,7 +742,8 @@ def _show_replies_on_post(post_json_object: {}, max_replies: int) -> None:
     print('')
     ctr = 0
     for item in object_replies['items']:
-        print('  ↰ ' + str(item['url']))
+        item_url = remove_html(item['url'])
+        print('  ↰ ' + str(item_url))
         ctr += 1
         if ctr >= max_replies:
             break
diff --git a/inbox.py b/inbox.py
index 7957a1ad0..f52ef1d66 100644
--- a/inbox.py
+++ b/inbox.py
@@ -187,7 +187,7 @@ def cache_svg_images(session, base_dir: str, http_prefix: str,
             continue
         if attach['url'].endswith('.svg') or \
            'svg' in attach['mediaType']:
-            url = attach['url']
+            url = remove_html(attach['url'])
             if not url_permitted(url, federation_list):
                 continue
             # if this is a local image then it has already been
@@ -1175,8 +1175,9 @@ def _person_receive_update(base_dir: str,
                            debug: bool, http_prefix: str) -> bool:
     """Changes an actor. eg: avatar or display name change
     """
+    person_url = remove_html(person_json['url'])
     if debug:
-        print('Receiving actor update for ' + person_json['url'] +
+        print('Receiving actor update for ' + person_url +
               ' ' + str(person_json))
     domain_full = get_full_domain(domain, port)
     update_domain_full = get_full_domain(update_domain, update_port)
@@ -2581,7 +2582,8 @@ def _receive_bookmark(recent_posts_cache: {},
     if debug:
         print('DEBUG: c2s inbox bookmark Add request arrived in outbox')
 
-    message_url = remove_id_ending(message_json['object']['url'])
+    message_url2 = remove_html(message_json['object']['url'])
+    message_url = remove_id_ending(message_url2)
     domain = remove_domain_port(domain)
     post_filename = locate_post(base_dir, nickname, domain, message_url)
     if not post_filename:
@@ -2591,7 +2593,7 @@ def _receive_bookmark(recent_posts_cache: {},
         return True
 
     update_bookmarks_collection(recent_posts_cache, base_dir, post_filename,
-                                message_json['object']['url'],
+                                message_url2,
                                 message_json['actor'], domain, debug)
     # regenerate the html
     bookmarked_post_json = load_json(post_filename, 0, 1)
@@ -2707,7 +2709,8 @@ def _receive_undo_bookmark(recent_posts_cache: {},
         print('DEBUG: c2s inbox Remove bookmark ' +
               'request arrived in outbox')
 
-    message_url = remove_id_ending(message_json['object']['url'])
+    message_url2 = remove_html(message_json['object']['url'])
+    message_url = remove_id_ending(message_url2)
     domain = remove_domain_port(domain)
     post_filename = locate_post(base_dir, nickname, domain, message_url)
     if not post_filename:
diff --git a/maps.py b/maps.py
index c9fe0a20b..48489db6f 100644
--- a/maps.py
+++ b/maps.py
@@ -15,6 +15,7 @@ from utils import acct_dir
 from utils import load_json
 from utils import save_json
 from utils import locate_post
+from utils import remove_html
 
 
 def get_location_from_tags(tags: []) -> str:
@@ -340,7 +341,7 @@ def get_map_preferences_url(base_dir: str, nickname: str, domain: str) -> str:
     if os.path.isfile(maps_filename):
         maps_json = load_json(maps_filename)
         if maps_json.get('url'):
-            return maps_json['url']
+            return remove_html(maps_json['url'])
     return None
 
 
diff --git a/mastoapiv1.py b/mastoapiv1.py
index 3b6c42129..d165fe87a 100644
--- a/mastoapiv1.py
+++ b/mastoapiv1.py
@@ -11,6 +11,7 @@ import os
 from utils import load_json
 from utils import get_config_param
 from utils import acct_dir
+from utils import remove_html
 from metadata import meta_data_instance
 
 
@@ -62,6 +63,8 @@ def _get_masto_api_v1account(base_dir: str, nickname: str, domain: str) -> {}:
     account_json = load_json(account_filename)
     if not account_json:
         return {}
+    avatar_url = remove_html(account_json['icon']['url'])
+    image_url = remove_html(account_json['image']['url'])
     masto_account_json = {
         "id": get_masto_api_v1id_from_nickname(nickname),
         "username": nickname,
@@ -74,10 +77,10 @@ def _get_masto_api_v1account(base_dir: str, nickname: str, domain: str) -> {}:
         "statuses_count": 0,
         "note": account_json['summary'],
         "url": account_json['id'],
-        "avatar": account_json['icon']['url'],
-        "avatar_static": account_json['icon']['url'],
-        "header": account_json['image']['url'],
-        "header_static": account_json['image']['url']
+        "avatar": avatar_url,
+        "avatar_static": avatar_url,
+        "header": image_url,
+        "header_static": image_url
     }
     return masto_account_json
 
diff --git a/metadata.py b/metadata.py
index f42a00806..2c2b5f401 100644
--- a/metadata.py
+++ b/metadata.py
@@ -12,6 +12,7 @@ from utils import is_account_dir
 from utils import load_json
 from utils import no_of_accounts
 from utils import no_of_active_accounts_monthly
+from utils import remove_html
 
 
 def _get_status_count(base_dir: str) -> int:
@@ -152,6 +153,8 @@ def meta_data_instance(show_accounts: bool,
     if admin_actor.get('published'):
         created_at = admin_actor['published']
 
+    icon_url = remove_html(admin_actor['icon']['url'])
+    image_url = remove_html(admin_actor['image']['url'])
     instance = {
         'approval_required': False,
         'invites_enabled': False,
@@ -159,10 +162,10 @@ def meta_data_instance(show_accounts: bool,
         'contact_account': {
             'acct': admin_actor['preferredUsername'],
             'created_at': created_at,
-            'avatar': admin_actor['icon']['url'],
-            'avatar_static': admin_actor['icon']['url'],
-            'header': admin_actor['image']['url'],
-            'header_static': admin_actor['image']['url'],
+            'avatar': icon_url,
+            'avatar_static': icon_url,
+            'header': image_url,
+            'header_static': image_url,
             'bot': is_bot,
             'discoverable': True,
             'group': is_group,
diff --git a/newswire.py b/newswire.py
index d416e6237..8026c3fcb 100644
--- a/newswire.py
+++ b/newswire.py
@@ -492,9 +492,9 @@ def _valid_podcast_entry(base_dir: str, key: str, entry: {}) -> bool:
-        if entry['protocol'].tolower() != 'activitypub':
+        if entry['protocol'].lower() != 'activitypub':
             return False
         if entry.get('uri'):
-            post_url = entry['uri']
+            post_url = remove_html(entry['uri'])
         elif entry.get('url'):
-            post_url = entry['uri']
+            post_url = remove_html(entry['url'])
         else:
-            post_url = entry['text']
+            post_url = remove_html(entry['text'])
         if '://' not in post_url:
@@ -1133,7 +1133,7 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
                         if tag_name not in description:
                             description += ' ' + tag_name
 
-        link = json_feed_item['url']
+        link = remove_html(json_feed_item['url'])
         if '://' not in link:
             continue
         if len(link) > max_bytes:
@@ -1551,10 +1551,10 @@ def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
                     description = remove_html(description)
                     tags_from_post = _get_hashtags_from_post(post_json_object)
                     summary = post_json_object['object']['summary']
+                    url2 = remove_html(post_json_object['object']['url'])
                     _add_newswire_dict_entry(base_dir, domain,
                                              newswire, published,
-                                             summary,
-                                             post_json_object['object']['url'],
+                                             summary, url2,
                                              votes, full_post_filename,
                                              description, moderated, False,
                                              tags_from_post,
diff --git a/person.py b/person.py
index 3cba8d394..7d816b2c2 100644
--- a/person.py
+++ b/person.py
@@ -1789,7 +1789,7 @@ def get_person_avatar_url(base_dir: str, person_url: str,
     if person_json.get('icon'):
         if person_json['icon'].get('url'):
             if '.svg' not in person_json['icon']['url'].lower():
-                return person_json['icon']['url']
+                return remove_html(person_json['icon']['url'])
     return None
 
 
@@ -1971,7 +1971,7 @@ def get_featured_hashtags(actor_json: {}) -> str:
             tag_name = tag_name[1:]
         if not tag_name:
             continue
-        tag_url = tag_dict['href']
+        tag_url = remove_html(tag_dict['href'])
         if '://' not in tag_url:
             continue
         if not valid_hash_tag(tag_name):
@@ -2019,13 +2019,13 @@ def get_featured_hashtags_as_html(actor_json: {},
             continue
         if ' #' + tag_name in profile_description:
             continue
-        tag_url = tag_dict['href']
+        tag_url = remove_html(tag_dict['href'])
         if '://' not in tag_url:
             continue
         if not valid_hash_tag(tag_name):
             continue
         result += \
-            '<a href="' + tag_dict['href'] + '" ' + \
+            '<a href="' + tag_url + '" ' + \
             'class="mention hashtag" rel="tag" ' + \
             'tabindex="10">#' + tag_name + '</a> '
         ctr += 1
diff --git a/posts.py b/posts.py
index fd8553d54..01e960b1b 100644
--- a/posts.py
+++ b/posts.py
@@ -216,10 +216,10 @@ def get_user_url(wf_request: {}, source_id: int, debug: bool) -> str:
             else:
                 url = link['href']
             if not contains_invalid_actor_url_chars(url):
-                return url
+                return remove_html(url)
         url = link['href']
         if not contains_invalid_actor_url_chars(url):
-            return url
+            return remove_html(url)
     return None
 
 
@@ -404,7 +404,7 @@ def get_person_box(signing_priv_key_pem: str, origin_domain: str,
     avatar_url = None
     if person_json.get('icon'):
         if person_json['icon'].get('url'):
-            avatar_url = person_json['icon']['url']
+            avatar_url = remove_html(person_json['icon']['url'])
     display_name = None
     if person_json.get('name'):
         display_name = person_json['name']
@@ -628,7 +628,8 @@ def _get_posts(session, outbox_url: str, max_posts: int,
                                 if url_permitted(tag_item['icon']['url'],
                                                  federation_list):
                                     emoji_name = tag_item['name']
-                                    emoji_icon = tag_item['icon']['url']
+                                    emoji_icon = \
+                                        remove_html(tag_item['icon']['url'])
                                     emoji[emoji_name] = emoji_icon
                                 else:
                                     if debug:
@@ -675,10 +676,11 @@ def _get_posts(session, outbox_url: str, max_posts: int,
                     for attach in this_item['attachment']:
                         if attach.get('name') and attach.get('url'):
                             # no attachments from non-permitted domains
-                            if url_permitted(attach['url'],
+                            attach_url = remove_html(attach['url'])
+                            if url_permitted(attach_url,
                                              federation_list):
                                 attachment.append([attach['name'],
-                                                   attach['url']])
+                                                   attach_url])
                             else:
                                 if debug:
                                     print('url not permitted ' +
@@ -820,8 +822,9 @@ def get_post_domains(session, outbox_url: str, max_posts: int, debug: bool,
                 tag_type = tag_item['type'].lower()
                 if tag_type == 'mention':
                     if tag_item.get('href'):
+                        tag_url = remove_html(tag_item['href'])
                         post_domain, _ = \
-                            get_domain_from_actor(tag_item['href'])
+                            get_domain_from_actor(tag_url)
                         if post_domain:
                             if post_domain not in post_domains:
                                 post_domains.append(post_domain)
@@ -879,6 +882,7 @@ def _get_posts_for_blocked_domains(base_dir: str,
                         url = item['object']['url']
                     else:
                         url = item['object']['id']
+                    url = remove_html(url)
                     if not blocked_posts.get(post_domain):
                         blocked_posts[post_domain] = [url]
                     else:
@@ -891,8 +895,9 @@ def _get_posts_for_blocked_domains(base_dir: str,
                     continue
                 tag_type = tag_item['type'].lower()
                 if tag_type == 'mention' and tag_item.get('href'):
+                    tag_url = remove_html(tag_item['href'])
                     post_domain, _ = \
-                        get_domain_from_actor(tag_item['href'])
+                        get_domain_from_actor(tag_url)
                     if not post_domain:
                         continue
                     if is_blocked_domain(base_dir, post_domain):
@@ -900,6 +905,7 @@ def _get_posts_for_blocked_domains(base_dir: str,
                             url = item['object']['url']
                         else:
                             url = item['object']['id']
+                        url = remove_html(url)
                         if not blocked_posts.get(post_domain):
                             blocked_posts[post_domain] = [url]
                         else:
@@ -1496,7 +1502,8 @@ def _create_post_mentions(cc_url: str, new_post: {},
                 if tag['type'] != 'Mention':
                     continue
                 if tag['href'] not in to_cc:
-                    new_post['object']['cc'].append(tag['href'])
+                    tag_url = remove_html(tag['href'])
+                    new_post['object']['cc'].append(tag_url)
 
         _consolidate_actors_list(new_post['object']['cc'])
         new_post['cc'] = new_post['object']['cc']
@@ -2099,9 +2106,9 @@ def create_blog_post(base_dir: str,
                            low_bandwidth, content_license_url,
                            media_license_url, media_creator,
                            languages_understood, translate, buy_url, chat_url)
-    if '/@/' not in blog_json['object']['url']:
-        blog_json['object']['url'] = \
-            blog_json['object']['url'].replace('/@', '/users/')
+    obj_url = remove_html(blog_json['object']['url'])
+    if '/@/' not in obj_url:
+        blog_json['object']['url'] = obj_url.replace('/@', '/users/')
     _append_citations_to_blog_post(base_dir, nickname, domain, blog_json)
 
     return blog_json
diff --git a/utils.py b/utils.py
index 9a4ab922d..ad8e1f1bf 100644
--- a/utils.py
+++ b/utils.py
@@ -1862,6 +1862,7 @@ def _remove_attachment(base_dir: str, http_prefix: str, domain: str,
     attachment_url = post_json['attachment'][0]['url']
     if not attachment_url:
         return
+    attachment_url = remove_html(attachment_url)
     media_filename = base_dir + '/' + \
         attachment_url.replace(http_prefix + '://' + domain + '/', '')
     if os.path.isfile(media_filename):
diff --git a/video.py b/video.py
index 77974f482..9e06ddb68 100644
--- a/video.py
+++ b/video.py
@@ -7,6 +7,7 @@ __email__ = "bob@libreserver.org"
 __status__ = "Production"
 __module_group__ = "Timeline"
 
+from utils import remove_html
 from utils import get_full_domain
 from utils import get_nickname_from_actor
 from utils import get_domain_from_actor
@@ -110,15 +111,15 @@ def convert_video_to_note(base_dir: str, nickname: str, domain: str,
         if not media_link.get('href'):
             continue
         if media_link['mediaType'] == 'application/x-bittorrent':
-            media_torrent = media_link['href']
+            media_torrent = remove_html(media_link['href'])
         if media_link['href'].startswith('magnet:'):
-            media_magnet = media_link['href']
+            media_magnet = remove_html(media_link['href'])
         if media_link['mediaType'] != 'video/mp4' and \
            media_link['mediaType'] != 'video/ogv':
             continue
         if not media_url:
             media_type = media_link['mediaType']
-            media_url = media_link['href']
+            media_url = remove_html(media_link['href'])
 
     if not media_url:
         return None
@@ -138,7 +139,8 @@ def convert_video_to_note(base_dir: str, nickname: str, domain: str,
             content += '<a href="' + media_magnet + '">🧲</a>'
         content += '</p>'
 
-    new_post_id = remove_id_ending(post_json_object['id'])
+    new_post_id2 = remove_html(post_json_object['id'])
+    new_post_id = remove_id_ending(new_post_id2)
     new_post = {
         '@context': post_json_object['@context'],
         'id': new_post_id + '/activity',
diff --git a/webapp_moderation.py b/webapp_moderation.py
index 24829bba1..102285150 100644
--- a/webapp_moderation.py
+++ b/webapp_moderation.py
@@ -8,6 +8,7 @@ __status__ = "Production"
 __module_group__ = "Moderation"
 
 import os
+from utils import remove_html
 from utils import is_artist
 from utils import is_account_dir
 from utils import get_full_domain
@@ -387,7 +388,7 @@ def html_moderation_info(translate: {}, base_dir: str,
         ext = ''
         if actor_json.get('icon'):
             if actor_json['icon'].get('url'):
-                avatar_url = actor_json['icon']['url']
+                avatar_url = remove_html(actor_json['icon']['url'])
                 if '.' in avatar_url:
                     ext = '.' + avatar_url.split('.')[-1]
         acct_url = \
diff --git a/webapp_podcast.py b/webapp_podcast.py
index 4d396c94a..b9104918c 100644
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@@ -38,7 +38,7 @@ def _html_podcast_chapters(link_url: str,
     if not isinstance(podcast_properties[key], dict):
         return ''
     if podcast_properties[key].get('url'):
-        chapters_url = podcast_properties[key]['url']
+        chapters_url = remove_html(podcast_properties[key]['url'])
     elif podcast_properties[key].get('uri'):
-        chapters_url = podcast_properties[key]['uri']
+        chapters_url = remove_html(podcast_properties[key]['uri'])
     else:
@@ -79,7 +79,7 @@ def _html_podcast_chapters(link_url: str,
                 chapter_title = chapter['title']
                 chapter_url = ''
                 if chapter.get('url'):
-                    chapter_url = chapter['url']
+                    chapter_url = remove_html(chapter['url'])
                     chapter_title = \
                         '<a href="' + chapter_url + '">' + \
                         chapter['title'] + '<\a>'
@@ -121,7 +121,7 @@ def _html_podcast_transcripts(podcast_properties: {}, translate: {}) -> str:
     for _ in podcast_properties[key]:
         transcript_url = None
         if podcast_properties[key].get('url'):
-            transcript_url = podcast_properties[key]['url']
+            transcript_url = remove_html(podcast_properties[key]['url'])
         elif podcast_properties[key].get('uri'):
-            transcript_url = podcast_properties[key]['uri']
+            transcript_url = remove_html(podcast_properties[key]['uri'])
         if not transcript_url:
@@ -154,7 +154,7 @@ def _html_podcast_social_interactions(podcast_properties: {},
     if podcast_properties[key].get('uri'):
-        episode_post_url = podcast_properties[key]['uri']
+        episode_post_url = remove_html(podcast_properties[key]['uri'])
     elif podcast_properties[key].get('url'):
-        episode_post_url = podcast_properties[key]['url']
+        episode_post_url = remove_html(podcast_properties[key]['url'])
     elif podcast_properties[key].get('text'):
-        episode_post_url = podcast_properties[key]['text']
+        episode_post_url = remove_html(podcast_properties[key]['text'])
     else:
@@ -218,7 +218,7 @@ def _html_podcast_performers(podcast_properties: {}) -> str:
 
         performer_url = ''
         if performer.get('href'):
-            performer_url = performer['href']
+            performer_url = remove_html(performer['href'])
 
         performer_img = ''
         if performer.get('img'):
@@ -431,7 +431,7 @@ def html_podcast_episode(translate: {},
     # donate button
     if podcast_properties.get('funding'):
         if podcast_properties['funding'].get('url'):
-            donate_url = podcast_properties['funding']['url']
+            donate_url = remove_html(podcast_properties['funding']['url'])
             podcast_str += \
                 '<p><span itemprop="funding"><a href="' + donate_url + \
                 '" rel="donation"><button class="donateButton">' + \
diff --git a/webapp_post.py b/webapp_post.py
index a860ec4f3..84fd71d86 100644
--- a/webapp_post.py
+++ b/webapp_post.py
@@ -149,8 +149,9 @@ def _html_post_metadata_open_graph(domain: str, post_json_object: {},
                     "    <meta content=\"@" + actor_handle + \
                     "\" property=\"og:title\" />\n"
     if obj_json.get('url'):
+        obj_url = remove_html(obj_json['url'])
         metadata += \
-            "    <meta content=\"" + obj_json['url'] + \
+            "    <meta content=\"" + obj_url + \
             "\" property=\"og:url\" />\n"
     if obj_json.get('published'):
         metadata += "    <meta name=\"DC.date\" " + \
@@ -204,8 +205,9 @@ def _html_post_metadata_open_graph(domain: str, post_json_object: {},
             metadata += \
                 "    <meta content=\"" + description + \
                 "\" name=\"og:description\">\n"
+            attach_url = remove_html(attach_json['url'])
             metadata += \
-                "    <meta content=\"" + attach_json['url'] + \
+                "    <meta content=\"" + attach_url + \
                 "\" property=\"og:image\" />\n"
             metadata += \
                 "    <meta content=\"" + attach_json['mediaType'] + \
@@ -1188,9 +1190,11 @@ def _get_blog_citations_html(box_name: str,
             continue
         if not tag_json.get('url'):
             continue
+        citation_url = remove_html(tag_json['url'])
+        citation_name = remove_html(tag_json['name'])
         citations_str += \
-            '<li><a href="' + tag_json['url'] + '" tabindex="10">' + \
-            '<cite>' + tag_json['name'] + '</cite></a></li>\n'
+            '<li><a href="' + citation_url + '" tabindex="10">' + \
+            '<cite>' + citation_name + '</cite></a></li>\n'
 
     if citations_str:
         translated_citations_str = 'Citations'
@@ -1844,9 +1848,9 @@ def _get_content_license(post_json_object: {}) -> str:
            'licence' not in name_lower:
             continue
         if item.get('value'):
-            value = item['value']
+            value = remove_html(item['value'])
         elif item.get('href'):
-            value = item['href']
+            value = remove_html(item['href'])
         else:
             continue
         if '://' not in value:
diff --git a/webapp_profile.py b/webapp_profile.py
index 961a833f1..9fe39d3ff 100644
--- a/webapp_profile.py
+++ b/webapp_profile.py
@@ -238,7 +238,7 @@ def html_profile_after_search(recent_posts_cache: {}, max_recent_posts: int,
     avatar_url = ''
     if profile_json.get('icon'):
         if profile_json['icon'].get('url'):
-            avatar_url = profile_json['icon']['url']
+            avatar_url = remove_html(profile_json['icon']['url'])
     if not avatar_url:
         avatar_url = get_person_avatar_url(base_dir, person_url, person_cache)
     display_name = search_nickname
@@ -286,7 +286,8 @@ def html_profile_after_search(recent_posts_cache: {}, max_recent_posts: int,
     # profileBackgroundImage = ''
     # if profile_json.get('image'):
     #     if profile_json['image'].get('url'):
-    #         profileBackgroundImage = profile_json['image']['url']
+    #         profileBackgroundImage = \
+    #             remove_html(profile_json['image']['url'])
 
     # url to return to
     back_url = path
@@ -318,7 +319,7 @@ def html_profile_after_search(recent_posts_cache: {}, max_recent_posts: int,
     image_url = ''
     if profile_json.get('image'):
         if profile_json['image'].get('url'):
-            image_url = profile_json['image']['url']
+            image_url = remove_html(profile_json['image']['url'])
 
     also_known_as = None
     if profile_json.get('alsoKnownAs'):
@@ -1096,7 +1097,7 @@ def html_profile(signing_priv_key_pem: str,
     if profile_json.get('hasOccupation'):
         occupation_name = get_occupation_name(profile_json)
 
-    avatar_url = profile_json['icon']['url']
+    avatar_url = remove_html(profile_json['icon']['url'])
     # use alternate path for local avatars to avoid any caching issues
     if '://' + domain_full + '/system/accounts/avatars/' in avatar_url:
         avatar_url = \
diff --git a/webapp_search.py b/webapp_search.py
index 852dd7eb2..985d34ca0 100644
--- a/webapp_search.py
+++ b/webapp_search.py
@@ -11,6 +11,7 @@ import os
 from shutil import copyfile
 import urllib.parse
 from datetime import datetime
+from utils import remove_html
 from utils import harmless_markup
 from utils import remove_id_ending
 from utils import has_object_dict
@@ -567,10 +568,11 @@ def html_skills_search(actor: str, translate: {}, base_dir: str,
                         skill_level_str = '0' + skill_level_str
                     if skill_level < 10:
                         skill_level_str = '0' + skill_level_str
+                    icon_url = remove_html(actor_json['icon']['url'])
                     index_str = \
                         skill_level_str + ';' + actor + ';' + \
                         actor_json['name'] + \
-                        ';' + actor_json['icon']['url']
+                        ';' + icon_url
                     if index_str not in results:
                         results.append(index_str)
         break
@@ -606,10 +608,11 @@ def html_skills_search(actor: str, translate: {}, base_dir: str,
                                 skill_level_str = '0' + skill_level_str
                             if skill_level < 10:
                                 skill_level_str = '0' + skill_level_str
+                            icon_url = remove_html(actor_json['icon']['url'])
                             index_str = \
                                 skill_level_str + ';' + actor + ';' + \
                                 actor_json['name'] + \
-                                ';' + actor_json['icon']['url']
+                                ';' + icon_url
                             if index_str not in results:
                                 results.append(index_str)
             break
@@ -1369,8 +1372,9 @@ def rss_hashtag_search(nickname: str, domain: str, port: int,
                     for attach in post_json_object['object']['attachment']:
                         if not attach.get('url'):
                             continue
+                        attach_url = remove_html(attach['url'])
                         hashtag_feed += \
-                            '         <link>' + attach['url'] + '</link>'
+                            '         <link>' + attach_url + '</link>'
                 hashtag_feed += '     </item>'
         index += 1
         if index >= max_feed_length:
diff --git a/webapp_utils.py b/webapp_utils.py
index d8c77e11e..6192fc5da 100644
--- a/webapp_utils.py
+++ b/webapp_utils.py
@@ -627,11 +627,12 @@ def get_shares_collection(actor: str, page_number: int, items_per_page: int,
                         file_extension = mtype
                 if file_extension:
                     media_type = 'image/' + file_extension
+                    shared_item_url = remove_html(shared_item['imageUrl'])
                     offer_item['object']['attachment'].append({
                         'mediaType': media_type,
                         'name': shared_item['displayName'],
                         'type': 'Document',
-                        'url': shared_item['imageUrl']
+                        'url': shared_item_url
                     })
         if shared_item['itemPrice'] and shared_item['itemCurrency']:
             offer_item['object']['attachment'].append({
@@ -939,11 +940,12 @@ def html_header_with_person_markup(css_filename: str, instance_title: str,
     domain_full = actor_json['id'].split('://')[1].split('/')[0]
     handle = actor_json['preferredUsername'] + '@' + domain_full
 
+    icon_url = remove_html(actor_json['icon']['url'])
     person_markup = \
         '      "about": {\n' + \
         '        "@type" : "Person",\n' + \
         '        "name": "' + name_str + '",\n' + \
-        '        "image": "' + actor_json['icon']['url'] + '",\n' + \
+        '        "image": "' + icon_url + '",\n' + \
         '        "description": "' + description + '",\n' + \
         city_markup + skills_markup + \
         '        "url": "' + actor_json['id'] + '"\n' + \
@@ -967,18 +969,19 @@ def html_header_with_person_markup(css_filename: str, instance_title: str,
         '        "name": "' + name_str + '"\n' + \
         '      },\n' + \
         '      "name": "' + name_str + '",\n' + \
-        '      "image": "' + actor_json['icon']['url'] + '",\n' + \
+        '      "image": "' + icon_url + '",\n' + \
         '      "description": "' + description + '",\n' + \
         '      "license": "' + content_license_url + '"\n' + \
         '    }\n' + \
         '    </script>\n'
 
     description = remove_html(description)
+    actor2_url = remove_html(actor_json['url'])
     og_metadata = \
         "    <meta content=\"profile\" property=\"og:type\" />\n" + \
         "    <meta content=\"" + description + \
         "\" name='description'>\n" + \
-        "    <meta content=\"" + actor_json['url'] + \
+        "    <meta content=\"" + actor2_url + \
         "\" property=\"og:url\" />\n" + \
         "    <meta content=\"" + domain_full + \
         "\" property=\"og:site_name\" />\n" + \
@@ -986,7 +989,7 @@ def html_header_with_person_markup(css_filename: str, instance_title: str,
         ")\" property=\"og:title\" />\n" + \
         "    <meta content=\"" + description + \
         "\" property=\"og:description\" />\n" + \
-        "    <meta content=\"" + actor_json['icon']['url'] + \
+        "    <meta content=\"" + icon_url + \
         "\" property=\"og:image\" />\n" + \
         "    <meta content=\"400\" property=\"og:image:width\" />\n" + \
         "    <meta content=\"400\" property=\"og:image:height\" />\n" + \
@@ -1362,7 +1365,8 @@ def get_post_attachments_as_html(base_dir: str,
             continue
         # get the domain for the chat link
         chat_domain_str = ''
-        chat_domain, _ = get_domain_from_actor(attach['href'])
+        attach_url = remove_html(attach['href'])
+        chat_domain, _ = get_domain_from_actor(attach_url)
         if chat_domain:
             if local_network_host(chat_domain):
                 print('REJECT: local network chat link ' + attach['href'])
@@ -1505,6 +1509,7 @@ def get_post_attachments_as_html(base_dir: str,
                         image_post_url = post_json_object['object']['url']
                     else:
                         image_post_url = post_json_object['object']['id']
+                    image_post_url = remove_html(image_post_url)
                     if image_description and not is_muted:
                         gallery_str += \
                             '  <a href="' + image_post_url + \
@@ -1632,6 +1637,7 @@ def get_post_attachments_as_html(base_dir: str,
                         video_post_url = post_json_object['object']['url']
                     else:
                         video_post_url = post_json_object['object']['id']
+                    video_post_url = remove_html(video_post_url)
                     if image_description and not is_muted:
                         gallery_str += \
                             '  <a href="' + video_post_url + \
@@ -1709,6 +1715,7 @@ def get_post_attachments_as_html(base_dir: str,
                         audio_post_url = post_json_object['object']['url']
                     else:
                         audio_post_url = post_json_object['object']['id']
+                    audio_post_url = remove_html(audio_post_url)
                     if image_description and not is_muted:
                         gallery_str += \
                             '  <a href="' + audio_post_url + \
diff --git a/webfinger.py b/webfinger.py
index b850794ec..559664679 100644
--- a/webfinger.py
+++ b/webfinger.py
@@ -12,6 +12,7 @@ import urllib.parse
 from session import get_json
 from cache import store_webfinger_in_cache
 from cache import get_webfinger_from_cache
+from utils import remove_html
 from utils import acct_handle_dir
 from utils import get_attachment_property_value
 from utils import get_full_domain
@@ -425,7 +426,7 @@ def _webfinger_update_avatar(wf_json: {}, actor_json: {}) -> bool:
     """Updates the avatar image link
     """
     found = False
-    avatar_url = actor_json['icon']['url']
+    avatar_url = remove_html(actor_json['icon']['url'])
     media_type = actor_json['icon']['mediaType']
     for link in wf_json['links']:
         if not link.get('rel'):
@@ -455,8 +456,9 @@ def _webfinger_update_vcard(wf_json: {}, actor_json: {}) -> bool:
         if link.get('type'):
             if link['type'] == 'text/vcard':
                 return False
+    actor_url = remove_html(actor_json['url'])
     wf_json['links'].append({
-        "href": actor_json['url'],
+        "href": actor_url,
         "rel": "http://webfinger.net/rel/profile-page",
         "type": "text/vcard"
     })