From df6b71009e46a6ad8737133e34abd228aafeea4f Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Mon, 10 Jan 2022 18:48:57 +0000 Subject: [PATCH] Support for podcast fields within rss feeds --- newsdaemon.py | 4 +++ newswire.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++----- tests.py | 83 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 170 insertions(+), 9 deletions(-) diff --git a/newsdaemon.py b/newsdaemon.py index cf1b21aeb..4d01008df 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -627,6 +627,10 @@ def _convert_rs_sto_activity_pub(base_dir: str, http_prefix: str, '
' + \ translate['Read more...'] + '' +# podcast_properties = None +# if len(item) > 8: +# podcast_properties = item[8] + followers_only = False # NOTE: the id when the post is created will not be # consistent (it's based on the current time, not the diff --git a/newswire.py b/newswire.py index a5231931e..438610dfa 100644 --- a/newswire.py +++ b/newswire.py @@ -203,7 +203,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str, description: str, moderated: bool, mirrored: bool, tags: [], - max_tags: int, session, debug: bool) -> None: + max_tags: int, session, debug: bool, + podcast_properties: {}) -> None: """Update the newswire dictionary """ # remove any markup @@ -246,7 +247,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str, description, moderated, post_tags, - mirrored + mirrored, + podcast_properties ] @@ -377,6 +379,71 @@ def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str, False, force) +def xml_podcast_to_dict(xml_str: str) -> {}: + """podcasting extensions for RSS feeds + """ + if 'podcastindex.org/namespace/1.0' not in xml_str: + return {} + if '' not in pod_line: + ctr += 1 + continue + if ' ' not in pod_line.split('>')[0]: + pod_key = pod_line.split('>')[0].strip() + pod_val = pod_line.split('>', 1)[1].strip() + if '<' in pod_val: + pod_val = pod_val.split('<')[0] + podcast_properties[pod_key] = pod_val + ctr += 1 + continue + pod_key = pod_line.split(' ')[0] + + pod_fields = ( + 'url', 'geo', 'osm', 'type', 'method', 'group', + 'owner', 'srcset', 'img', 'role', 'address', 'suggested', + 'startTime', 'duration', 'href', 'name' + ) + pod_entry = {} + for pod_field in pod_fields: + if pod_field + '="' not in pod_line: + continue + pod_str = pod_line.split(pod_field + '="')[1] + if '"' not in pod_str: + continue + pod_val = pod_str.split('"')[0] + pod_entry[pod_field] = pod_val + + pod_text = pod_line.split('>')[1] + if '<' in pod_text: + pod_text = pod_text.split('<')[0].strip() + if pod_text: + pod_entry['text'] = pod_text + + if pod_key + 's' in podcast_properties: + if isinstance(podcast_properties[pod_key + 's'], list): + podcast_properties[pod_key + 's'].append(pod_entry) + else: + podcast_properties[pod_key] = pod_entry + else: + podcast_properties[pod_key] = pod_entry + ctr += 1 + + return podcast_properties + + def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, moderated: bool, mirrored: bool, max_posts_per_source: int, @@ -446,12 +513,14 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, if _valid_feed_date(pub_date_str): post_filename = '' votes_status = [] + podcast_properties = xml_podcast_to_dict(xml_str) _add_newswire_dict_entry(base_dir, domain, result, pub_date_str, title, link, votes_status, post_filename, description, moderated, - mirrored, [], 32, session, debug) + mirrored, [], 32, session, debug, + podcast_properties) post_ctr += 1 if post_ctr >= max_posts_per_source: break @@ -534,12 +603,14 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str, if _valid_feed_date(pub_date_str): post_filename = '' votes_status = [] + podcast_properties = xml_podcast_to_dict(xml_str) _add_newswire_dict_entry(base_dir, domain, result, pub_date_str, title, link, votes_status, post_filename, description, moderated, - mirrored, [], 32, session, debug) + mirrored, [], 32, session, debug, + podcast_properties) post_ctr += 1 if post_ctr >= max_posts_per_source: break @@ -610,12 +681,14 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str, if _valid_feed_date(pub_date_str): post_filename = '' votes_status = [] + podcast_properties = xml_podcast_to_dict(xml_str) _add_newswire_dict_entry(base_dir, domain, result, pub_date_str, title, link, votes_status, post_filename, description, moderated, - mirrored, [], 32, session, debug) + mirrored, [], 32, session, debug, + podcast_properties) post_ctr += 1 if post_ctr >= max_posts_per_source: break @@ -727,7 +800,8 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str, title, link, votes_status, post_filename, description, moderated, - mirrored, [], 32, session, debug) + mirrored, [], 32, session, debug, + None) post_ctr += 1 if post_ctr >= max_posts_per_source: break @@ -800,7 +874,8 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, title, link, votes_status, post_filename, description, moderated, mirrored, - [], 32, session, debug) + [], 32, session, debug, + None) post_ctr += 1 if post_ctr >= max_posts_per_source: break @@ -1077,7 +1152,8 @@ def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str, votes, full_post_filename, description, moderated, False, tags_from_post, - max_tags, session, debug) + max_tags, session, debug, + None) ctr += 1 if ctr >= max_blogs_per_account: diff --git a/tests.py b/tests.py index c5626da89..cbe943bf1 100644 --- a/tests.py +++ b/tests.py @@ -150,6 +150,7 @@ from linked_data_sig import generate_json_signature from linked_data_sig import verify_json_signature from newsdaemon import hashtag_rule_tree from newsdaemon import hashtag_rule_resolve +from newswire import xml_podcast_to_dict from newswire import get_newswire_tags from newswire import parse_feed_date from newswire import limit_word_lengths @@ -6354,7 +6355,7 @@ def _test_httpsig_base_new(with_digest: bool, base_dir: str, def _test_get_actor_from_in_reply_to() -> None: - print('testGetActorFromInReplyTo') + print('test_get_actor_from_in_reply_to') in_reply_to = \ 'https://fosstodon.org/users/bashrc/statuses/107400700612621140' reply_actor = get_actor_from_in_reply_to(in_reply_to) @@ -6365,6 +6366,85 @@ def _test_get_actor_from_in_reply_to() -> None: assert reply_actor is None +def _test_xml_podcast_dict() -> None: + print('test_xml_podcast_dict') + xml_str = \ + '\n' + \ + '\n' + \ + '5\n' + \ + '\n' + \ + '' + \ + 'Support the show\n' + \ + '\n' + \ + '' + \ + 'Nowheresville\n' + \ + 'yes' + \ + '\n' + \ + '' + \ + 'Rodger Rabbit\n' + \ + 'Rodger Rabbit' + \ + '\n' + \ + '' + \ + 'Jessica Rabbit\n' + \ + '' + \ + 'Betty Boop\n' + \ + '' + \ + 'Bob Hoskins\n' + \ + '1\n' + \ + '\n' + \ + '\n' + \ + '\n' + \ + '\n' + \ + '\n' + \ + '\n' + \ + ' \n' + \ + ' \n' + \ + '\n' + \ + '' + podcast_properties = xml_podcast_to_dict(xml_str) + assert podcast_properties + # pprint(podcast_properties) + assert podcast_properties.get('valueRecipients') + assert podcast_properties.get('persons') + assert podcast_properties.get('soundbites') + assert podcast_properties.get('locations') + assert podcast_properties.get('transcripts') + assert podcast_properties.get('episode') + assert podcast_properties.get('funding') + assert int(podcast_properties['episode']) == 5 + assert podcast_properties['funding']['text'] == "Support the show" + assert len(podcast_properties['transcripts']) == 3 + assert len(podcast_properties['valueRecipients']) == 2 + assert len(podcast_properties['persons']) == 5 + assert len(podcast_properties['locations']) == 1 + + def run_all_tests(): base_dir = os.getcwd() print('Running tests...') @@ -6381,6 +6461,7 @@ def run_all_tests(): 'message_json', 'liked_post_json']) _test_checkbox_names() _test_functions() + _test_xml_podcast_dict() _test_get_actor_from_in_reply_to() _test_valid_emoji_content() _test_add_cw_lists(base_dir)