Extract any extra links from atom feed items for display on podcast screen

2024-06-05 20:34:38 +01:00 · 2024-06-05 20:34:38 +01:00 · 7524d7656d
parent 8433d9069e
commit 7524d7656d
2 changed files with 64 additions and 11 deletions
--- a/newswire.py
+++ b/newswire.py
@ -215,7 +215,8 @@ def _add_newswire_dict_entry(base_dir: str,
                             max_tags: int, session, debug: bool,
                             podcast_properties: {},
                             system_language: str,
-                             fediverse_handle: str) -> None:
+                             fediverse_handle: str,
+                             extra_links: []) -> None:
    """Update the newswire dictionary
    """
    # remove any markup
@ -268,7 +269,8 @@ def _add_newswire_dict_entry(base_dir: str,
        post_tags,
        mirrored,
        podcast_properties,
-        fediverse_handle
+        fediverse_handle,
+        extra_links
    ]


@ -872,6 +874,7 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
                if podcast_properties:
                    podcast_properties['linkMimeType'] = link_mime_type
                fediverse_handle = ''
+                extra_links = []
                _add_newswire_dict_entry(base_dir,
                                         result, pub_date_str,
                                         title, link,
@ -879,7 +882,7 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
                                         description, moderated,
                                         mirrored, [], 32, session, debug,
                                         podcast_properties, system_language,
-                                         fediverse_handle)
+                                         fediverse_handle, extra_links)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -988,6 +991,7 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
                if podcast_properties:
                    podcast_properties['linkMimeType'] = link_mime_type
                fediverse_handle = ''
+                extra_links = []
                _add_newswire_dict_entry(base_dir,
                                         result, pub_date_str,
                                         title, link,
@ -995,7 +999,7 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
                                         description, moderated,
                                         mirrored, [], 32, session, debug,
                                         podcast_properties, system_language,
-                                         fediverse_handle)
+                                         fediverse_handle, extra_links)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -1084,6 +1088,33 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
                       not is_local_network_address(actor_uri):
                        fediverse_handle = actor_uri

+        # are there any extra links?
+        extra_links = []
+        if '<activity:object>' in atom_item and \
+           '</activity:object>' in atom_item:
+            obj_str = atom_item.split('<activity:object>')[1]
+            obj_str = \
+                unescaped_text(obj_str.split('</activity:object>')[0])
+            obj_str = remove_script(obj_str, None, None, None)
+            sections = obj_str.split('<link ')
+            ctr = 0
+            for section_str in sections:
+                if ctr == 0:
+                    ctr = 1
+                    continue
+                if '>' in section_str:
+                    link_str = section_str.split('>')[0]
+                    if 'href="' in link_str and \
+                       'rel="preview"' not in link_str:
+                        link_str = link_str.split('href="')[1]
+                        if '"' in link_str:
+                            link_str = link_str.split('"')[0]
+                            link_str = remove_html(link_str)
+                            if resembles_url(link_str) and \
+                               not is_local_network_address(link_str):
+                                if link_str not in extra_links:
+                                    extra_links.append(link_str)
+
        proxy_type = None
        if domain.endswith('.onion'):
            proxy_type = 'tor'
@ -1122,7 +1153,7 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
                                         description, moderated,
                                         mirrored, [], 32, session, debug,
                                         podcast_properties, system_language,
-                                         fediverse_handle)
+                                         fediverse_handle, extra_links)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -1232,6 +1263,7 @@ def _json_feed_v1to_dict(base_dir: str, xml_str: str,
                post_filename = ''
                votes_status = []
                fediverse_handle = ''
+                extra_links = []
                _add_newswire_dict_entry(base_dir,
                                         result, pub_date_str,
                                         title, link,
@ -1239,7 +1271,7 @@ def _json_feed_v1to_dict(base_dir: str, xml_str: str,
                                         description, moderated,
                                         mirrored, [], 32, session, debug,
                                         None, system_language,
-                                         fediverse_handle)
+                                         fediverse_handle, extra_links)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -1337,6 +1369,7 @@ def _atom_feed_yt_to_dict(base_dir: str, xml_str: str,
                if podcast_properties:
                    podcast_properties['linkMimeType'] = 'video/youtube'
                fediverse_handle = ''
+                extra_links = []
                _add_newswire_dict_entry(base_dir,
                                         result, pub_date_str,
                                         title, link,
@ -1344,7 +1377,7 @@ def _atom_feed_yt_to_dict(base_dir: str, xml_str: str,
                                         description, moderated, mirrored,
                                         [], 32, session, debug,
                                         podcast_properties, system_language,
-                                         fediverse_handle)
+                                         fediverse_handle, extra_links)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -1633,6 +1666,7 @@ def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
                        get_url_from_post(post_json_object['object']['url'])
                    url2 = remove_html(url_str)
                    fediverse_handle = ''
+                    extra_links = []
                    _add_newswire_dict_entry(base_dir,
                                             newswire, published,
                                             summary, url2,
@ -1641,7 +1675,7 @@ def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
                                             tags_from_post,
                                             max_tags, session, debug,
                                             None, system_language,
-                                             fediverse_handle)
+                                             fediverse_handle, extra_links)

            ctr += 1
            if ctr >= max_blogs_per_account:
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@ -12,6 +12,7 @@ import html
 import datetime
 import urllib.parse
 from shutil import copyfile
+from utils import resembles_url
 from utils import get_nickname_from_actor
 from utils import get_domain_from_actor
 from utils import data_dir
@ -460,9 +461,27 @@ def html_podcast_episode(translate: {},
        fediverse_handle = newswire_item[9]
        podcast_nickname = get_nickname_from_actor(fediverse_handle)
        podcast_domain, _ = get_domain_from_actor(fediverse_handle)
-        podcast_str += \
-            '<p><a href="' + fediverse_handle + '">' + \
-            podcast_nickname + '@' + podcast_domain + '</a></p>\n'
+        if podcast_nickname and podcast_domain:
+            podcast_str += \
+                '<p><a href="' + fediverse_handle + '">' + \
+                podcast_nickname + '@' + podcast_domain + '</a></p>\n'
+
+    extra_links = []
+    if len(newswire_item) > 10:
+        extra_links = newswire_item[10]
+        if extra_links:
+            links_text = ''
+            for link_str in extra_links:
+                link_str = remove_html(link_str)
+                if not resembles_url(link_str):
+                    continue
+                if not links_text:
+                    links_text = '<p>\n'
+                links_text += \
+                    '<a href="' + link_str + '">' + link_str + '</a><br>\n'
+            if links_text:
+                links_text += '</p>\n'
+                podcast_str += links_text

    if podcast_properties['categories']:
        tags_str = ''