Checking for url strings

2024-01-27 17:04:21 +00:00 · 2024-01-27 17:04:21 +00:00 · d312a52c26
parent b129df0eaa
commit d312a52c26
12 changed files with 41 additions and 27 deletions
--- a/daemon.py
+++ b/daemon.py
@ -301,6 +301,7 @@ from languages import set_actor_languages
 from languages import get_understood_languages
 from like import update_likes_collection
 from reaction import update_reaction_collection
+from utils import resembles_url
 from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import corp_servers
@ -2347,8 +2348,7 @@ class PubServer(BaseHTTPRequestHandler):
        if debug:
            print('INBOX: checking that actor looks like a url')
        actor_url = get_actor_from_post(message_json)
-        if '://' not in actor_url or \
-           '.' not in actor_url:
+        if not resembles_url(actor_url):
            print('INBOX: POST actor does not look like a url ' +
                  actor_url)
            self._400()
@ -7159,8 +7159,7 @@ class PubServer(BaseHTTPRequestHandler):
                            if fields['libretranslateUrl'] != \
                               curr_libretranslate_url:
                                lt_url = fields['libretranslateUrl']
-                                if '://' in lt_url and \
-                                   '.' in lt_url:
+                                if resembles_url(lt_url):
                                    set_config_param(base_dir,
                                                     'libretranslateUrl',
                                                     lt_url)
@ -7551,8 +7550,7 @@ class PubServer(BaseHTTPRequestHandler):
                        moved_to = actor_json['movedTo']
                    if fields.get('movedTo'):
                        if fields['movedTo'] != moved_to and \
-                           '://' in fields['movedTo'] and \
-                           '.' in fields['movedTo']:
+                           resembles_url(fields['movedTo']):
                            actor_json['movedTo'] = fields['movedTo']
                            send_move_activity = True
                            actor_changed = True
@ -7615,7 +7613,7 @@ class PubServer(BaseHTTPRequestHandler):
                            also_known_as = []
                            for alt_actor in new_also_known_as:
                                alt_actor = alt_actor.strip()
-                                if '://' in alt_actor and '.' in alt_actor:
+                                if resembles_url(alt_actor):
                                    if alt_actor not in also_known_as:
                                        also_known_as.append(alt_actor)
                            actor_json['alsoKnownAs'] = also_known_as
--- a/epicyon.py
+++ b/epicyon.py
@ -87,6 +87,7 @@ from utils import follow_person
 from utils import valid_nickname
 from utils import get_protocol_prefixes
 from utils import acct_dir
+from utils import resembles_url
 from media import archive_media
 from media import get_attachment_media_type
 from delete import send_delete_via_server
@ -867,8 +868,7 @@ def _command_options() -> None:

    # automatic translations
    if argb.libretranslateUrl:
-        if '://' in argb.libretranslateUrl and \
-           '.' in argb.libretranslateUrl:
+        if resembles_url(argb.libretranslateUrl):
            set_config_param(base_dir, 'libretranslateUrl',
                             argb.libretranslateUrl)
    if argb.libretranslateApiKey:
--- a/inbox.py
+++ b/inbox.py
@ -18,6 +18,7 @@ from languages import understood_post_language
 from like import update_likes_collection
 from reaction import update_reaction_collection
 from reaction import valid_emoji_content
+from utils import resembles_url
 from utils import get_url_from_post
 from utils import date_from_string_format
 from utils import date_epoch
@ -371,7 +372,7 @@ def store_hash_tags(base_dir: str, nickname: str, domain: str,
    # get geolocation from tags
    location_str = get_location_from_post(post_json_object)
    if location_str:
-        if '://' in location_str and '.' in location_str:
+        if resembles_url(location_str):
            zoom, latitude, longitude = geocoords_from_map_link(location_str)
            if latitude and longitude and zoom and \
               location_str not in map_links:
--- a/languages.py
+++ b/languages.py
@ -17,6 +17,7 @@ from utils import remove_html
 from utils import has_object_dict
 from utils import get_config_param
 from utils import local_actor_url
+from utils import resembles_url
 from cache import get_person_from_cache


@ -209,7 +210,7 @@ def get_links_from_content(content: str) -> {}:
        if '"' not in subsection:
            continue
        url = subsection.split('"')[1].strip()
-        if '://' in url and '.' in url and \
+        if resembles_url(url) and \
           '>' in subsection:
            if url not in links:
                link_text = subsection.split('>')[1]
--- a/newswire.py
+++ b/newswire.py
@ -19,6 +19,7 @@ from datetime import timezone
 from collections import OrderedDict
 from utils import valid_post_date
 from categories import set_hashtag_category
+from utils import resembles_url
 from utils import get_url_from_post
 from utils import remove_zero_length_strings
 from utils import date_from_string_format
@ -659,7 +660,7 @@ def xml_podcast_to_dict(base_dir: str, xml_item: str, xml_str: str) -> {}:
            episode_image = episode_image.split('>')[1]
            if '<' in episode_image:
                episode_image = episode_image.split('<')[0]
-                if '://' in episode_image and '.' in episode_image:
+                if resembles_url(episode_image):
                    podcast_episode_image = episode_image
                    break

@ -756,7 +757,7 @@ def get_link_from_rss_item(rss_item: str,
                link_str = enclosure.split('url="')[1]
                if '"' in link_str:
                    link = link_str.split('"')[0]
-                    if '://' in link:
+                    if resembles_url(link):
                        return link, mime_type

    if '<link>' in rss_item and '</link>' in rss_item:
--- a/relationships.py
+++ b/relationships.py
@ -159,7 +159,7 @@ def get_moved_feed(base_dir: str, domain: str, port: int, path: str,
    curr_page = 1
    page_ctr = 0
    total_ctr = 0
-    for handle, new_handle in lines.items():
+    for handle, _ in lines.items():
        # nickname@domain
        page_ctr += 1
        total_ctr += 1
--- a/shares.py
+++ b/shares.py
@ -23,6 +23,7 @@ from session import post_json
 from session import post_image
 from session import create_session
 from session import get_json_valid
+from utils import resembles_url
 from utils import date_utcnow
 from utils import dangerous_markup
 from utils import remove_html
@ -2147,7 +2148,7 @@ def vf_proposal_from_share(shared_item: {},
            "name": shared_item['location'].title()
        }
    if shared_item['imageUrl']:
-        if '://' in shared_item['imageUrl']:
+        if resembles_url(shared_item['imageUrl']):
            file_extension = None
            accepted_types = get_media_extensions()
            for mtype in accepted_types:
--- a/utils.py
+++ b/utils.py
@ -147,8 +147,7 @@ def get_attributed_to(field) -> str:
                    if isinstance(attrib['type'], str) and \
                       isinstance(attrib['id'], str):
                        if attrib['type'] == 'Person' and \
-                           '://' in attrib['id'] and \
-                           '.' in attrib['id']:
+                           resembles_url(attrib['id']):
                            return attrib['id']
        if isinstance(field[0], str):
            return field[0]
@ -4037,7 +4036,7 @@ def get_actor_from_post(post_json_object: {}) -> str:

    if actor_id:
        # looks vaguely like a url
-        if '://' in actor_id and '.' in actor_id:
+        if resembles_url(actor_id):
            return actor_id
    return ''

@ -4915,3 +4914,14 @@ def is_valid_date(date_str: str) -> bool:
                return False
        date_sect_ctr += 1
    return True
+
+
+def resembles_url(text: str) -> bool:
+    """Does the given text look like a url?
+
+    This is a cheap plausibility check rather than a full parse.
+    The text must contain a scheme separator (://) and at least one
+    dot, and must not contain whitespace, angle brackets or double
+    quotes. Returns False for anything which is not a string.
+    """
+    if not isinstance(text, str):
+        return False
+    if '://' not in text or '.' not in text:
+        return False
+    # None of these characters may appear unencoded within a url.
+    # Some callers place the accepted text inside an href="..."
+    # attribute, so quotes and markup characters are refused, as is
+    # any kind of whitespace (tabs and newlines, not just spaces)
+    for char in text:
+        if char.isspace() or char in '<>"':
+            return False
+    return True
--- a/video.py
+++ b/video.py
@ -18,6 +18,7 @@ from utils import get_content_from_post
 from utils import dangerous_markup
 from utils import license_link_from_name
 from utils import get_media_url_from_video
+from utils import resembles_url
 from blocking import is_blocked
 from filters import is_filtered

@ -168,9 +169,7 @@ def convert_video_to_note(base_dir: str, nickname: str, domain: str,
                                   system_language):
                    new_post['object']['support'] = support_str
                    # if this is a link
-                    if ' ' not in support_str and \
-                       '://' in support_str and \
-                       '.' in support_str:
+                    if resembles_url(support_str):
                        # add a buy link
                        new_post['object']['attachment'].append({
                            'type': 'Link',
--- a/webapp_post.py
+++ b/webapp_post.py
@ -75,6 +75,7 @@ from utils import language_right_to_left
 from utils import get_attributed_to
 from utils import get_reply_to
 from utils import get_actor_from_post
+from utils import resembles_url
 from content import format_mixed_right_to_left
 from content import replace_remote_hashtags
 from content import detect_dogwhistles
@ -2364,7 +2365,7 @@ def individual_post_as_html(signing_priv_key_pem: str,
    post_proxied = ap_proxy_type(post_json_object['object'])
    if post_proxied:
        post_proxied = remove_html(post_proxied)
-        if '://' in post_proxied:
+        if resembles_url(post_proxied):
            proxy_str = 'Proxy'
            if translate.get(proxy_str):
                proxy_str = translate[proxy_str]
@ -2905,7 +2906,7 @@ def individual_post_as_html(signing_priv_key_pem: str,
        # show embedded map if the location contains a map url
        location_str = get_location_from_post(post_json_object)
        if location_str:
-            if '://' in location_str and '.' in location_str:
+            if resembles_url(location_str):
                bounding_box_degrees = 0.001
                map_str = \
                    html_open_street_map(location_str,
--- a/webapp_profile.py
+++ b/webapp_profile.py
@ -40,6 +40,7 @@ from utils import get_account_timezone
 from utils import remove_eol
 from utils import is_valid_date
 from utils import get_actor_from_post
+from utils import resembles_url
 from languages import get_actor_languages
 from skills import get_skills
 from theme import get_themes_list
@ -633,7 +634,7 @@ def _get_profile_header(base_dir: str, http_prefix: str, nickname: str,
        actor_proxied = ''
    else:
        actor_proxied = remove_html(actor_proxied)
-        if '://' in actor_proxied:
+        if resembles_url(actor_proxied):
            proxy_str = 'Proxy'
            if translate.get(proxy_str):
                proxy_str = translate[proxy_str]
@ -792,7 +793,7 @@ def _get_profile_header_after_search(base_dir: str,
        actor_proxied = ''
    else:
        actor_proxied = remove_html(actor_proxied)
-        if '://' in actor_proxied:
+        if resembles_url(actor_proxied):
            proxy_str = 'Proxy'
            if translate.get(proxy_str):
                proxy_str = translate[proxy_str]
--- a/webapp_utils.py
+++ b/webapp_utils.py
@ -37,6 +37,7 @@ from utils import local_actor_url
 from utils import text_in_file
 from utils import remove_eol
 from utils import binary_is_image
+from utils import resembles_url
 from filters import is_filtered
 from cache import get_actor_public_key_from_id
 from cache import store_person_in_cache
@ -1442,7 +1443,7 @@ def get_post_attachments_as_html(base_dir: str,
                        license_str = ''
                        if media_license and media_creator:
                            media_license = remove_html(media_license)
-                            if '://' in media_license:
+                            if resembles_url(media_license):
                                license_str += \
                                    '<a href="' + media_license + \
                                    '" target="_blank" ' + \
@ -1531,7 +1532,7 @@ def get_post_attachments_as_html(base_dir: str,
                    license_str = ''
                    attachment_str += '<figcaption>'
                    media_license = remove_html(media_license)
-                    if '://' in media_license:
+                    if resembles_url(media_license):
                        license_str += \
                            '<a href="' + media_license + \
                            '" target="_blank" ' + \