From c9982cd428f89b731dafc793e356e4f9a03fc9ca Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Wed, 24 Apr 2024 11:38:45 +0100
Subject: [PATCH] Remove link tracking for outgoing posts

---
 content.py | 27 +++++++++++++++++++--------
 tests.py   | 22 ++++++++++++++++++++++
 2 files changed, 41 insertions(+), 8 deletions(-)
diff --git a/content.py b/content.py
index 38912abfd..c5eb57288 100644
--- a/content.py
+++ b/content.py
@@ -663,6 +663,15 @@ def _contains_academic_references(content: str) -> bool:
     return False
 
 
+def remove_link_tracking(url: str) -> str:
+    """ Removes utm_ tracking parameters (utm_medium, utm_campaign,
+    utm_source, ...) from a link, keeping any other query parameters
+    """
+    base, sep, query = url.partition('?')
+    kept = [p for p in query.split('&') if not p.startswith('utm_')]
+    return base + sep + '&'.join(kept) if kept else base
+
+
 def add_web_links(content: str) -> str:
     """Adds markup for web links
     """
@@ -697,7 +706,7 @@ def add_web_links(content: str) -> str:
             continue
         if _contains_doi_reference(wrd, replace_dict):
             continue
-        # does the word begin with a prefix?
+        # does the word begin with a link prefix?
         prefix_found = False
         for prefix in prefixes:
             if wrd.startswith(prefix):
@@ -705,16 +714,18 @@ def add_web_links(content: str) -> str:
                 break
         if not prefix_found:
             continue
-        # the word contains a prefix
-        if wrd.endswith('.') or wrd.endswith(';'):
-            wrd = wrd[:-1]
-        markup = '<a href="' + wrd + '" tabindex="10" ' + \
+        # the word contains a link prefix
+        url = wrd
+        if url.endswith('.') or wrd.endswith(';'):
+            url = url[:-1]
+        url = remove_link_tracking(url)
+        markup = '<a href="' + url + '" tabindex="10" ' + \
             'rel="nofollow noopener noreferrer" target="_blank">'
         for prefix in prefixes:
-            if wrd.startswith(prefix):
+            if url.startswith(prefix):
                 markup += '<span class="invisible">' + prefix + '</span>'
                 break
-        link_text = wrd
+        link_text = url
         for prefix in prefixes:
             link_text = link_text.replace(prefix, '')
         # prevent links from becoming too long
@@ -725,7 +736,7 @@ def add_web_links(content: str) -> str:
                 link_text[MAX_LINK_LENGTH:] + '</span></a>'
         else:
             markup += '<span class="ellipsis">' + link_text + '</span></a>'
-        replace_dict[wrd] = markup
+        replace_dict[url] = markup
 
     # do the replacements
     for url, markup in replace_dict.items():
diff --git a/tests.py b/tests.py
index 286fe0120..7d5fbc6e5 100644
--- a/tests.py
+++ b/tests.py
@@ -147,6 +147,7 @@ from inbox import valid_inbox
 from inbox import valid_inbox_filenames
 from inbox import cache_svg_images
 from categories import guess_hashtag_category
+from content import remove_link_tracking
 from content import format_mixed_right_to_left
 from content import replace_remote_hashtags
 from content import add_name_emojis_to_tags
@@ -8758,6 +8759,26 @@ def _test_remove_tags() -> None:
     assert result == 'This is some content. Some other content'
 
 
+def _test_link_tracking() -> None:
+    """Checks that utm_ tracking parameters are removed from web links
+    """
+    print('link tracking')
+    # a link without tracking parameters is returned unchanged
+    plain_url = 'someweblink.net/some/path'
+    assert remove_link_tracking(plain_url) == plain_url
+
+    clean_url = \
+        'https://somenauseating.com/we-want-to-track-your-web-browsing-' + \
+        'habits-and-then-sell-that-to-letter-agencies'
+    tracked_url = clean_url + '?utm_medium=email&' + \
+        'utm_campaign=Latest%20from%20SomeNauseating%20DotCom' + \
+        '%20for%20April%2024%202024%20-%503948479461&utm_content=' + \
+        'Latest%20from%20SomeNeuseating%20DotCom%20for%20April%2024%' + \
+        '202024%20-%34567123+CID_34678246&utm_source=campaign_monitor_uk' + \
+        '&utm_term=wibble'
+    assert remove_link_tracking(tracked_url) == clean_url
+
+
 def run_all_tests():
     base_dir = os.getcwd()
     print('Running tests...')
@@ -8775,6 +8796,7 @@ def run_all_tests():
     _test_checkbox_names()
     _test_thread_functions()
     _test_functions()
+    _test_link_tracking()
     _test_remove_tags()
     _test_check_individual_post_content()
     _test_uninvert2()