Remove link tracking for outgoing posts

2024-04-24 11:38:45 +01:00 · 2024-04-24 11:38:45 +01:00 · c9982cd428
parent 85d92d0c22
commit c9982cd428
2 changed files with 41 additions and 8 deletions
--- a/content.py
+++ b/content.py
@ -663,6 +663,15 @@ def _contains_academic_references(content: str) -> bool:
    return False


+def remove_link_tracking(url: str) -> str:
+    """Removes utm_ tracking parameters (utm_medium, utm_campaign,
+    utm_source, etc) from the query string of a web link.
+    Tracking parameters are removed wherever they appear within the
+    query, other parameters and any #fragment are kept, and a url
+    without tracking parameters is returned unchanged
+    """
+    if 'utm_' not in url:
+        return url
+    path, query_sep, remainder = url.partition('?')
+    if not query_sep:
+        # utm_ only appears within the path, so there is nothing to remove
+        return url
+    query, fragment_sep, fragment = remainder.partition('#')
+    params = query.split('&')
+    if not any(param.startswith('utm_') for param in params):
+        return url
+    # drop the tracking parameters, together with any empty parameters
+    # left behind by a stray '&'
+    kept_params = [param for param in params
+                   if param and not param.startswith('utm_')]
+    cleaned_url = path
+    if kept_params:
+        cleaned_url += '?' + '&'.join(kept_params)
+    return cleaned_url + fragment_sep + fragment
+
+
 def add_web_links(content: str) -> str:
    """Adds markup for web links
    """
@ -697,7 +706,7 @@ def add_web_links(content: str) -> str:
            continue
        if _contains_doi_reference(wrd, replace_dict):
            continue
-        # does the word begin with a prefix?
+        # does the word begin with a link prefix?
        prefix_found = False
        for prefix in prefixes:
            if wrd.startswith(prefix):
@ -705,16 +714,18 @@ def add_web_links(content: str) -> str:
                break
        if not prefix_found:
            continue
-        # the word contains a prefix
-        if wrd.endswith('.') or wrd.endswith(';'):
-            wrd = wrd[:-1]
-        markup = '<a href="' + wrd + '" tabindex="10" ' + \
+        # the word contains a link prefix
+        url = wrd
+        if url.endswith('.') or wrd.endswith(';'):
+            url = url[:-1]
+        url = remove_link_tracking(url)
+        markup = '<a href="' + url + '" tabindex="10" ' + \
            'rel="nofollow noopener noreferrer" target="_blank">'
        for prefix in prefixes:
-            if wrd.startswith(prefix):
+            if url.startswith(prefix):
                markup += '<span class="invisible">' + prefix + '</span>'
                break
-        link_text = wrd
+        link_text = url
        for prefix in prefixes:
            link_text = link_text.replace(prefix, '')
        # prevent links from becoming too long
@ -725,7 +736,7 @@ def add_web_links(content: str) -> str:
                link_text[MAX_LINK_LENGTH:] + '</span></a>'
        else:
            markup += '<span class="ellipsis">' + link_text + '</span></a>'
-        replace_dict[wrd] = markup
+        replace_dict[url] = markup

    # do the replacements
    for url, markup in replace_dict.items():
--- a/tests.py
+++ b/tests.py
@ -147,6 +147,7 @@ from inbox import valid_inbox
 from inbox import valid_inbox_filenames
 from inbox import cache_svg_images
 from categories import guess_hashtag_category
+from content import remove_link_tracking
 from content import format_mixed_right_to_left
 from content import replace_remote_hashtags
 from content import add_name_emojis_to_tags
@ -8758,6 +8759,26 @@ def _test_remove_tags() -> None:
    assert result == 'This is some content. Some other content'


+def _test_link_tracking() -> None:
+    """Checks that remove_link_tracking returns an untracked link
+    unchanged and strips the utm_ parameters from a tracked link
+    """
+    print('link tracking')
+
+    # a link without any tracking parameters comes back as given
+    untracked_url = 'someweblink.net/some/path'
+    assert remove_link_tracking(untracked_url) == untracked_url
+
+    # a link followed by a run of utm_ parameters is reduced to the
+    # part before the query
+    clean_url = \
+        'https://somenauseating.com/we-want-to-track-your-web-browsing-' + \
+        'habits-and-then-sell-that-to-letter-agencies'
+    tracking_params = \
+        '?utm_medium=email&' + \
+        'utm_campaign=Latest%20from%20SomeNauseating%20DotCom' + \
+        '%20for%20April%2024%202024%20-%503948479461&utm_content=' + \
+        'Latest%20from%20SomeNeuseating%20DotCom%20for%20April%2024%' + \
+        '202024%20-%34567123+CID_34678246&utm_source=campaign_monitor_uk' + \
+        '&utm_term=wibble'
+    tracked_url = clean_url + tracking_params
+    assert remove_link_tracking(tracked_url) == clean_url
+
+
 def run_all_tests():
    base_dir = os.getcwd()
    print('Running tests...')
@ -8775,6 +8796,7 @@ def run_all_tests():
    _test_checkbox_names()
    _test_thread_functions()
    _test_functions()
+    _test_link_tracking()
    _test_remove_tags()
    _test_check_individual_post_content()
    _test_uninvert2()