From ad3ad1b045144624a78441e49d8c83cd3b4fda00 Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Tue, 17 May 2022 12:40:05 +0100
Subject: [PATCH] Check for very long urls within incoming posts

---
 content.py | 19 +++++++++++++++++++
 daemon.py  | 26 ++++++++++++++++++++++++++
 inbox.py   |  7 +++++++
 3 files changed, 52 insertions(+)

diff --git a/content.py b/content.py
index 26c08d14b..911e34041 100644
--- a/content.py
+++ b/content.py
@@ -59,6 +59,25 @@ INVALID_CONTENT_STRINGS = (
 )
 
 
+def valid_url_lengths(content: str, max_url_length: int) -> bool:
+    """Returns False if any quoted url in the content exceeds max_url_length
+    """
+    if '://' not in content:
+        return True
+    sections = content.split('://')
+    ctr = 0
+    for text in sections:
+        if ctr == 0:  # text before the first '://' is not a url
+            ctr += 1
+            continue
+        if '"' in text:  # only urls ended by a double quote are checked
+            url = text.split('"')[0]  # url without its scheme prefix
+            if '<' not in url and '>' not in url:  # ignore text with markup
+                if len(url) > max_url_length:
+                    return False
+    return True
+
+
 def remove_html_tag(html_str: str, tag: str) -> str:
     """Removes a given tag from a html string
     """
diff --git a/daemon.py b/daemon.py
index 00a0f459d..b534f1de0 100644
--- a/daemon.py
+++ b/daemon.py
@@ -322,6 +322,7 @@ from utils import has_group_type
 from manualapprove import manual_deny_follow_request_thread
 from manualapprove import manual_approve_follow_request_thread
 from announce import create_announce
+from content import valid_url_lengths
 from content import contains_invalid_local_links
 from content import get_price_from_string
 from content import replace_emoji_from_tags
@@ -1747,6 +1748,31 @@ class PubServer(BaseHTTPRequestHandler):
                     self._400()
                     self.server.postreq_busy = False
                     return 3
+            # check that the content does not contain impossibly long urls
+            if message_json['object'].get('content'):
+                content_str = message_json['object']['content']
+                if not valid_url_lengths(content_str, 2048):
+                    print('INBOX: content contains urls which are too long ' +
+                          message_json['actor'])
+                    self._400()
+                    self.server.postreq_busy = False
+                    return 3
+            # check that the summary is not too long and contains no links
+            if message_json['object'].get('summary'):
+                if len(message_json['object']['summary']) > 1024:
+                    print('INBOX: summary is too long ' +
+                          message_json['actor'] + ' ' +
+                          message_json['object']['summary'])
+                    self._400()
+                    self.server.postreq_busy = False
+                    return 3
+                if '://' in message_json['object']['summary']:
+                    print('INBOX: summary should not contain links ' +
+                          message_json['actor'] + ' ' +
+                          message_json['object']['summary'])
+                    self._400()
+                    self.server.postreq_busy = False
+                    return 3
 
         # actor should look like a url
         if debug:
diff --git a/inbox.py b/inbox.py
index cd816268f..d8e443d76 100644
--- a/inbox.py
+++ b/inbox.py
@@ -126,6 +126,7 @@ from conversation import update_conversation
 from webapp_hashtagswarm import html_hash_tag_swarm
 from person import valid_sending_actor
 from fitnessFunctions import fitness_performance
+from content import valid_url_lengths
 
 
 def _store_last_post_id(base_dir: str, nickname: str, domain: str,
@@ -2586,6 +2587,12 @@ def _valid_post_content(base_dir: str, nickname: str, domain: str,
                                     http_prefix, domain_full,
                                     person_cache):
         return False
+
+    # check for urls which are too long
+    if not valid_url_lengths(content_str, 2048):
+        print('REJECT: url within content too long')
+        return False
+
     # check for filtered content
     if is_filtered(base_dir, nickname, domain, content_str):
         print('REJECT: content filtered')