Support for podcast fields within RSS feeds

2022-01-10 18:48:57 +00:00 · 2022-01-10 18:48:57 +00:00 · df6b71009e
parent 1085f97070
commit df6b71009e
3 changed files with 170 additions and 9 deletions
--- a/newsdaemon.py
+++ b/newsdaemon.py
@ -627,6 +627,10 @@ def _convert_rs_sto_activity_pub(base_dir: str, http_prefix: str,
            '<br><a href="' + post_url + '">' + \
            translate['Read more...'] + '</a>'

+#        podcast_properties = None
+#        if len(item) > 8:
+#            podcast_properties = item[8]
+
        followers_only = False
        # NOTE: the id when the post is created will not be
        # consistent (it's based on the current time, not the
--- a/newswire.py
+++ b/newswire.py
@ -203,7 +203,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str,
                             description: str, moderated: bool,
                             mirrored: bool,
                             tags: [],
-                             max_tags: int, session, debug: bool) -> None:
+                             max_tags: int, session, debug: bool,
+                             podcast_properties: {}) -> None:
    """Update the newswire dictionary
    """
    # remove any markup
@ -246,7 +247,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str,
        description,
        moderated,
        post_tags,
-        mirrored
+        mirrored,
+        podcast_properties
    ]


@ -377,6 +379,71 @@ def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str,
                                     False, force)


+def xml_podcast_to_dict(xml_str: str) -> {}:
+    """Extracts podcasting 2.0 namespace tags from an RSS feed string
+
+    Returns {} unless both the podcastindex namespace and a <podcast: tag
+    are present. Otherwise returns a dict in which a tag without
+    attributes maps to its text, a tag with attributes maps to a dict of
+    its recognised attributes (plus 'text' when it has content), and
+    location, person, soundbite, transcript and valueRecipient tags are
+    appended to the lists 'locations', 'persons', 'soundbites',
+    'transcripts' and 'valueRecipients'.
+    """
+    if 'podcastindex.org/namespace/1.0' not in xml_str:
+        return {}
+    if '<podcast:' not in xml_str:
+        return {}
+
+    podcast_properties = {
+        "locations": [],
+        "persons": [],
+        "soundbites": [],
+        "transcripts": [],
+        "valueRecipients": []
+    }
+
+    pod_fields = (
+        'url', 'geo', 'osm', 'type', 'method', 'group',
+        'owner', 'srcset', 'img', 'role', 'address', 'suggested',
+        'startTime', 'duration', 'href', 'name'
+    )
+
+    # the first segment is whatever precedes the first <podcast: tag
+    for pod_line in xml_str.split('<podcast:')[1:]:
+        if '>' not in pod_line:
+            continue
+        # pod_tag is the opening tag only: its name and attributes
+        pod_tag, pod_text = pod_line.split('>', 1)
+        pod_text = pod_text.split('<')[0].strip()
+        if ' ' not in pod_tag:
+            # tag without attributes, eg. <podcast:episode>5</...>
+            podcast_properties[pod_tag.strip()] = pod_text
+            continue
+        pod_key = pod_tag.split(' ')[0]
+
+        pod_entry = {}
+        for pod_field in pod_fields:
+            # only search the opening tag, otherwise attributes of any
+            # markup which follows (eg. <enclosure url=...>) leak in
+            if pod_field + '="' not in pod_tag:
+                continue
+            pod_str = pod_tag.split(pod_field + '="')[1]
+            if '"' not in pod_str:
+                continue
+            pod_entry[pod_field] = pod_str.split('"')[0]
+        if pod_text:
+            pod_entry['text'] = pod_text
+
+        # repeatable tags accumulate within their plural-named list
+        if isinstance(podcast_properties.get(pod_key + 's'), list):
+            podcast_properties[pod_key + 's'].append(pod_entry)
+        else:
+            podcast_properties[pod_key] = pod_entry
+
+    return podcast_properties
+
+
 def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
                     moderated: bool, mirrored: bool,
                     max_posts_per_source: int,
@ -446,12 +513,14 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
            if _valid_feed_date(pub_date_str):
                post_filename = ''
                votes_status = []
+                podcast_properties = xml_podcast_to_dict(xml_str)
                _add_newswire_dict_entry(base_dir, domain,
                                         result, pub_date_str,
                                         title, link,
                                         votes_status, post_filename,
                                         description, moderated,
-                                         mirrored, [], 32, session, debug)
+                                         mirrored, [], 32, session, debug,
+                                         podcast_properties)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -534,12 +603,14 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
            if _valid_feed_date(pub_date_str):
                post_filename = ''
                votes_status = []
+                podcast_properties = xml_podcast_to_dict(xml_str)
                _add_newswire_dict_entry(base_dir, domain,
                                         result, pub_date_str,
                                         title, link,
                                         votes_status, post_filename,
                                         description, moderated,
-                                         mirrored, [], 32, session, debug)
+                                         mirrored, [], 32, session, debug,
+                                         podcast_properties)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -610,12 +681,14 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
            if _valid_feed_date(pub_date_str):
                post_filename = ''
                votes_status = []
+                podcast_properties = xml_podcast_to_dict(xml_str)
                _add_newswire_dict_entry(base_dir, domain,
                                         result, pub_date_str,
                                         title, link,
                                         votes_status, post_filename,
                                         description, moderated,
-                                         mirrored, [], 32, session, debug)
+                                         mirrored, [], 32, session, debug,
+                                         podcast_properties)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -727,7 +800,8 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
                                         title, link,
                                         votes_status, post_filename,
                                         description, moderated,
-                                         mirrored, [], 32, session, debug)
+                                         mirrored, [], 32, session, debug,
+                                         None)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -800,7 +874,8 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
                                         title, link,
                                         votes_status, post_filename,
                                         description, moderated, mirrored,
-                                         [], 32, session, debug)
+                                         [], 32, session, debug,
+                                         None)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break
@ -1077,7 +1152,8 @@ def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
                                             votes, full_post_filename,
                                             description, moderated, False,
                                             tags_from_post,
-                                             max_tags, session, debug)
+                                             max_tags, session, debug,
+                                             None)

            ctr += 1
            if ctr >= max_blogs_per_account:
--- a/tests.py
+++ b/tests.py
@ -150,6 +150,7 @@ from linked_data_sig import generate_json_signature
 from linked_data_sig import verify_json_signature
 from newsdaemon import hashtag_rule_tree
 from newsdaemon import hashtag_rule_resolve
+from newswire import xml_podcast_to_dict
 from newswire import get_newswire_tags
 from newswire import parse_feed_date
 from newswire import limit_word_lengths
@ -6354,7 +6355,7 @@ def _test_httpsig_base_new(with_digest: bool, base_dir: str,


 def _test_get_actor_from_in_reply_to() -> None:
-    print('testGetActorFromInReplyTo')
+    print('test_get_actor_from_in_reply_to')
    in_reply_to = \
        'https://fosstodon.org/users/bashrc/statuses/107400700612621140'
    reply_actor = get_actor_from_in_reply_to(in_reply_to)
@ -6365,6 +6366,85 @@ def _test_get_actor_from_in_reply_to() -> None:
    assert reply_actor is None


+def _test_xml_podcast_dict() -> None:
+    """Checks that podcast namespace tags are parsed from an RSS feed"""
+    print('test_xml_podcast_dict')
+    xml_str = (
+        '<?xml version="1.0" encoding="UTF-8" ?>\n'
+        '<rss version="2.0" xmlns:podcast="'
+        'https://podcastindex.org/namespace/1.0">\n'
+        '<podcast:episode>5</podcast:episode>\n'
+        '<podcast:chapters '
+        'url="https://whoframed.rodger/ep1_chapters.json" '
+        'type="application/json"/>\n'
+        '<podcast:funding '
+        'url="https://whoframed.rodger/donate">'
+        'Support the show</podcast:funding>\n'
+        '<podcast:images '
+        'srcset="https://whoframed.rodger/images/ep1/'
+        'pci_avatar-massive.jpg 1500w, '
+        'https://whoframed.rodger/images/ep1/pci_avatar-middle.jpg 600w, '
+        'https://whoframed.rodger/images/ep1/pci_avatar-small.jpg 300w, '
+        'https://whoframed.rodger/images/ep1/'
+        'pci_avatar-microfiche.jpg 50w" />\n'
+        '<podcast:location geo="geo:57.4272,34.63763" osm="R472152">'
+        'Nowheresville</podcast:location>\n'
+        '<podcast:locked owner="podcastowner@whoframed.rodger">yes'
+        '</podcast:locked>\n'
+        '<podcast:person group="visuals" role="cover art designer" '
+        'href="https://whoframed.rodger/artist/rodgetrabbit">'
+        'Rodger Rabbit</podcast:person>\n'
+        '<podcast:person href="https://whoframed.rodger" '
+        'img="http://whoframed.rodger/images/rr.jpg">Rodger Rabbit'
+        '</podcast:person>\n'
+        '<podcast:person href="https://whoframed.rodger" '
+        'img="http://whoframed.rodger/images/jr.jpg">'
+        'Jessica Rabbit</podcast:person>\n'
+        '<podcast:person role="guest" '
+        'href="https://whoframed.rodger/blog/bettyboop/" '
+        'img="http://whoframed.rodger/images/bb.jpg">'
+        'Betty Boop</podcast:person>\n'
+        '<podcast:person role="guest" '
+        'href="https://goodto.talk/bobhoskins/" '
+        'img="https://goodto.talk/images/bhosk.jpg">'
+        'Bob Hoskins</podcast:person>\n'
+        '<podcast:season name="Podcasting 2.0">1</podcast:season>\n'
+        '<podcast:soundbite startTime="15.27" duration="8.0" />\n'
+        '<podcast:soundbite startTime="21.34" duration="32.0" />\n'
+        '<podcast:transcript '
+        'url="https://whoframed.rodger/ep1/transcript.txt" '
+        'type="text/plain" />\n'
+        '<podcast:transcript '
+        'url="https://whoframed.rodger/ep2/transcript.txt" '
+        'type="text/plain" />\n'
+        '<podcast:transcript '
+        'url="https://whoframed.rodger/ep3/transcript.txt" '
+        'type="text/plain" />\n'
+        '<podcast:value type="donate" method="keysend" '
+        'suggested="2.95">\n'
+        '  <podcast:valueRecipient name="hosting company" '
+        'type="node" address="someaddress1" split="1" />\n'
+        '  <podcast:valueRecipient name="podcaster" type="node" '
+        'address="someaddress2" split="99" />\n'
+        '</podcast:value>\n'
+        '</rss>'
+    )
+    parsed = xml_podcast_to_dict(xml_str)
+    assert parsed
+    expected_keys = (
+        'valueRecipients', 'persons', 'soundbites', 'locations',
+        'transcripts', 'episode', 'funding'
+    )
+    for key in expected_keys:
+        assert parsed.get(key)
+    assert int(parsed['episode']) == 5
+    assert parsed['funding']['text'] == "Support the show"
+    assert len(parsed['transcripts']) == 3
+    assert len(parsed['valueRecipients']) == 2
+    assert len(parsed['persons']) == 5
+    assert len(parsed['locations']) == 1
+
+
 def run_all_tests():
    base_dir = os.getcwd()
    print('Running tests...')
@ -6381,6 +6461,7 @@ def run_all_tests():
                            'message_json', 'liked_post_json'])
    _test_checkbox_names()
    _test_functions()
+    _test_xml_podcast_dict()
    _test_get_actor_from_in_reply_to()
    _test_valid_emoji_content()
    _test_add_cw_lists(base_dir)