Remove link tracking gloop from post urls

merge-requests/30/head
Bob Mottram 2024-04-24 12:10:50 +01:00
parent cc71d42a3c
commit 2903cb521f
3 changed files with 42 additions and 0 deletions

View File

@ -663,6 +663,27 @@ def _contains_academic_references(content: str) -> bool:
return False
def remove_link_trackers_from_content(content: str) -> str:
    """ Removes any link trackers from urls within the content

    Everything from '?utm_' up to the end of the url is dropped.
    The url is considered to end at the first quote, angle bracket or
    whitespace character, or at the end of the content, so that
    tracked urls appearing as link text or as plain text are cleaned
    in the same way as urls inside href attributes, and no text or
    markup beyond the url is removed.

    Content without any '?utm_' marker is returned unchanged.
    """
    if '?utm_' not in content:
        return content

    # characters which cannot be part of the tracker query string and
    # therefore mark where the url finishes
    url_end_chars = '"\'<>'

    # the first section precedes any tracker and is always kept whole
    head, *tracked_sections = content.split('?utm_')
    kept = [head]
    for section_str in tracked_sections:
        # each remaining section begins with the tracker text, so skip
        # forward to where the url ends. If no terminator is found then
        # the tracker runs to the end of the section
        url_end = len(section_str)
        for idx, char in enumerate(section_str):
            if char in url_end_chars or char.isspace():
                url_end = idx
                break
        kept.append(section_str[url_end:])
    return ''.join(kept)
def remove_link_tracking(url: str) -> str:
""" Removes any web link tracking, such as utm_medium, utm_campaign
or utm_source

View File

@ -147,6 +147,7 @@ from inbox import valid_inbox
from inbox import valid_inbox_filenames
from inbox import cache_svg_images
from categories import guess_hashtag_category
from content import remove_link_trackers_from_content
from content import remove_link_tracking
from content import format_mixed_right_to_left
from content import replace_remote_hashtags
@ -8778,6 +8779,24 @@ def _test_link_tracking() -> None:
'habits-and-then-sell-that-to-letter-agencies'
assert remove_link_tracking(url) == expected
content = 'Some content'
expected = content
assert remove_link_trackers_from_content(content) == expected
content = \
'Some <a href="dreadfulsite.com/abc?utm_medium=gloop">content</a>'
expected = \
'Some <a href="dreadfulsite.com/abc">content</a>'
assert remove_link_trackers_from_content(content) == expected
content = \
'Some <a href="dreadfulsite.com/abc?utm_medium=gloop">content</a> ' + \
'<a href="surveillancecrap.com/def?utm_campaign=ohno">scurrilous</a>'
expected = \
'Some <a href="dreadfulsite.com/abc">content</a> ' + \
'<a href="surveillancecrap.com/def">scurrilous</a>'
assert remove_link_trackers_from_content(content) == expected
def run_all_tests():
base_dir = os.getcwd()

View File

@ -78,6 +78,7 @@ from utils import get_attributed_to
from utils import get_reply_to
from utils import get_actor_from_post
from utils import resembles_url
from content import remove_link_trackers_from_content
from content import format_mixed_right_to_left
from content import replace_remote_hashtags
from content import detect_dogwhistles
@ -2845,6 +2846,7 @@ def individual_post_as_html(signing_priv_key_pem: str,
not post_is_blog:
content_str = bold_reading_string(content_str)
object_content = remove_link_trackers_from_content(content_str)
object_content = \
remove_long_words(content_str, 40, [])
object_content = \