Support for podcast fields within RSS feeds

main
Bob Mottram 2022-01-10 18:48:57 +00:00
parent 1085f97070
commit df6b71009e
3 changed files with 170 additions and 9 deletions

View File

@ -627,6 +627,10 @@ def _convert_rs_sto_activity_pub(base_dir: str, http_prefix: str,
'<br><a href="' + post_url + '">' + \
translate['Read more...'] + '</a>'
# podcast_properties = None
# if len(item) > 8:
# podcast_properties = item[8]
followers_only = False
# NOTE: the id when the post is created will not be
# consistent (it's based on the current time, not the

View File

@ -203,7 +203,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str,
description: str, moderated: bool,
mirrored: bool,
tags: [],
max_tags: int, session, debug: bool) -> None:
max_tags: int, session, debug: bool,
podcast_properties: {}) -> None:
"""Update the newswire dictionary
"""
# remove any markup
@ -246,7 +247,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str,
description,
moderated,
post_tags,
mirrored
mirrored,
podcast_properties
]
@ -377,6 +379,71 @@ def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str,
False, force)
def _podcast_tag_end(pod_line: str) -> int:
    """Returns the index of the '>' which closes the opening tag at
    the start of pod_line, ignoring any '>' which appears inside of a
    double quoted attribute value. Returns -1 if the tag is not closed
    """
    in_quotes = False
    for idx, char in enumerate(pod_line):
        if char == '"':
            in_quotes = not in_quotes
        elif char == '>' and not in_quotes:
            return idx
    return -1


def xml_podcast_to_dict(xml_str: str) -> {}:
    """Extracts podcasting namespace tags from the XML of a feed

    An empty dict is returned if xml_str does not mention the
    podcastindex.org namespace or contains no <podcast:...> tags.

    Otherwise the returned dict is keyed by tag name. Tags having a
    pluralised list key (locations, persons, soundbites, transcripts,
    valueRecipients) are appended to that list as dicts containing
    the recognised attributes plus an optional 'text' entry. Any
    other tag is stored under its own name, as a string if it has no
    attributes or as a dict of attributes/text if it does.
    """
    if 'podcastindex.org/namespace/1.0' not in xml_str:
        return {}
    if '<podcast:' not in xml_str:
        return {}

    podcast_properties = {
        "locations": [],
        "persons": [],
        "soundbites": [],
        "transcripts": [],
        "valueRecipients": []
    }
    # attribute names which are copied into the returned entries
    pod_fields = (
        'url', 'geo', 'osm', 'type', 'method', 'group',
        'owner', 'srcset', 'img', 'role', 'address', 'suggested',
        'startTime', 'duration', 'href', 'name'
    )

    # the first segment is whatever precedes the first podcast tag
    for pod_line in xml_str.split('<podcast:')[1:]:
        tag_end = _podcast_tag_end(pod_line)
        if tag_end < 0:
            continue
        # pod_line runs up to the next podcast tag, so it may include
        # unrelated elements such as <enclosure url="...">. Only the
        # opening tag itself is searched for the name and attributes
        tag_header = pod_line[:tag_end]
        tag_body = pod_line[tag_end + 1:]
        # attributes may be separated by any whitespace character
        for separator in ('\n', '\r', '\t'):
            tag_header = tag_header.replace(separator, ' ')
        self_closing = tag_header.rstrip().endswith('/')
        if self_closing:
            # remove the slash so that it can't become part of the key
            tag_header = tag_header.rstrip()[:-1]

        if ' ' not in tag_header:
            # tag without any attributes
            pod_key = tag_header.strip()
            if not pod_key:
                continue
            pod_val = ''
            if not self_closing:
                pod_val = tag_body.split('<')[0].strip()
            pod_list = podcast_properties.get(pod_key + 's')
            if isinstance(pod_list, list):
                # repeatable tags are always list items, so that
                # multiple attribute-less tags don't overwrite
                # each other
                pod_entry = {}
                if pod_val:
                    pod_entry['text'] = pod_val
                pod_list.append(pod_entry)
            else:
                podcast_properties[pod_key] = pod_val
            continue

        pod_key = tag_header.split(' ')[0]
        pod_entry = {}
        for pod_field in pod_fields:
            # the leading space prevents matching on the end of a
            # longer attribute name, such as mimetype="..."
            field_marker = ' ' + pod_field + '="'
            if field_marker not in tag_header:
                continue
            pod_str = tag_header.split(field_marker, 1)[1]
            if '"' not in pod_str:
                continue
            pod_entry[pod_field] = pod_str.split('"')[0]

        # a self closing tag has no text of its own
        if not self_closing and '<' in tag_body:
            pod_text = tag_body.split('<')[0].strip()
            if pod_text:
                pod_entry['text'] = pod_text

        pod_list = podcast_properties.get(pod_key + 's')
        if isinstance(pod_list, list):
            pod_list.append(pod_entry)
        else:
            podcast_properties[pod_key] = pod_entry

    return podcast_properties
def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
moderated: bool, mirrored: bool,
max_posts_per_source: int,
@ -446,12 +513,14 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
if _valid_feed_date(pub_date_str):
post_filename = ''
votes_status = []
podcast_properties = xml_podcast_to_dict(xml_str)
_add_newswire_dict_entry(base_dir, domain,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated,
mirrored, [], 32, session, debug)
mirrored, [], 32, session, debug,
podcast_properties)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
@ -534,12 +603,14 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
if _valid_feed_date(pub_date_str):
post_filename = ''
votes_status = []
podcast_properties = xml_podcast_to_dict(xml_str)
_add_newswire_dict_entry(base_dir, domain,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated,
mirrored, [], 32, session, debug)
mirrored, [], 32, session, debug,
podcast_properties)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
@ -610,12 +681,14 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
if _valid_feed_date(pub_date_str):
post_filename = ''
votes_status = []
podcast_properties = xml_podcast_to_dict(xml_str)
_add_newswire_dict_entry(base_dir, domain,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated,
mirrored, [], 32, session, debug)
mirrored, [], 32, session, debug,
podcast_properties)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
@ -727,7 +800,8 @@ def _json_feed_v1to_dict(base_dir: str, domain: str, xml_str: str,
title, link,
votes_status, post_filename,
description, moderated,
mirrored, [], 32, session, debug)
mirrored, [], 32, session, debug,
None)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
@ -800,7 +874,8 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
title, link,
votes_status, post_filename,
description, moderated, mirrored,
[], 32, session, debug)
[], 32, session, debug,
None)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
@ -1077,7 +1152,8 @@ def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
votes, full_post_filename,
description, moderated, False,
tags_from_post,
max_tags, session, debug)
max_tags, session, debug,
None)
ctr += 1
if ctr >= max_blogs_per_account:

View File

@ -150,6 +150,7 @@ from linked_data_sig import generate_json_signature
from linked_data_sig import verify_json_signature
from newsdaemon import hashtag_rule_tree
from newsdaemon import hashtag_rule_resolve
from newswire import xml_podcast_to_dict
from newswire import get_newswire_tags
from newswire import parse_feed_date
from newswire import limit_word_lengths
@ -6354,7 +6355,7 @@ def _test_httpsig_base_new(with_digest: bool, base_dir: str,
def _test_get_actor_from_in_reply_to() -> None:
print('testGetActorFromInReplyTo')
print('test_get_actor_from_in_reply_to')
in_reply_to = \
'https://fosstodon.org/users/bashrc/statuses/107400700612621140'
reply_actor = get_actor_from_in_reply_to(in_reply_to)
@ -6365,6 +6366,85 @@ def _test_get_actor_from_in_reply_to() -> None:
assert reply_actor is None
def _test_xml_podcast_dict() -> None:
    """Checks that the podcast namespace tags of an example feed are
    converted into a dictionary of podcast properties
    """
    print('test_xml_podcast_dict')
    # adjacent string literals are concatenated into a single feed
    xml_str = (
        '<?xml version="1.0" encoding="UTF-8" ?>\n'
        '<rss version="2.0" xmlns:podcast="'
        'https://podcastindex.org/namespace/1.0">\n'
        '<podcast:episode>5</podcast:episode>\n'
        '<podcast:chapters '
        'url="https://whoframed.rodger/ep1_chapters.json" '
        'type="application/json"/>\n'
        '<podcast:funding '
        'url="https://whoframed.rodger/donate">'
        'Support the show</podcast:funding>\n'
        '<podcast:images '
        'srcset="https://whoframed.rodger/images/ep1/'
        'pci_avatar-massive.jpg 1500w, '
        'https://whoframed.rodger/images/ep1/pci_avatar-middle.jpg 600w, '
        'https://whoframed.rodger/images/ep1/pci_avatar-small.jpg 300w, '
        'https://whoframed.rodger/images/ep1/'
        'pci_avatar-microfiche.jpg 50w" />\n'
        '<podcast:location geo="geo:57.4272,34.63763" osm="R472152">'
        'Nowheresville</podcast:location>\n'
        '<podcast:locked owner="podcastowner@whoframed.rodger">yes'
        '</podcast:locked>\n'
        '<podcast:person group="visuals" role="cover art designer" '
        'href="https://whoframed.rodger/artist/rodgetrabbit">'
        'Rodger Rabbit</podcast:person>\n'
        '<podcast:person href="https://whoframed.rodger" '
        'img="http://whoframed.rodger/images/rr.jpg">Rodger Rabbit'
        '</podcast:person>\n'
        '<podcast:person href="https://whoframed.rodger" '
        'img="http://whoframed.rodger/images/jr.jpg">'
        'Jessica Rabbit</podcast:person>\n'
        '<podcast:person role="guest" '
        'href="https://whoframed.rodger/blog/bettyboop/" '
        'img="http://whoframed.rodger/images/bb.jpg">'
        'Betty Boop</podcast:person>\n'
        '<podcast:person role="guest" '
        'href="https://goodto.talk/bobhoskins/" '
        'img="https://goodto.talk/images/bhosk.jpg">'
        'Bob Hoskins</podcast:person>\n'
        '<podcast:season name="Podcasting 2.0">1</podcast:season>\n'
        '<podcast:soundbite startTime="15.27" duration="8.0" />\n'
        '<podcast:soundbite startTime="21.34" duration="32.0" />\n'
        '<podcast:transcript '
        'url="https://whoframed.rodger/ep1/transcript.txt" '
        'type="text/plain" />\n'
        '<podcast:transcript '
        'url="https://whoframed.rodger/ep2/transcript.txt" '
        'type="text/plain" />\n'
        '<podcast:transcript '
        'url="https://whoframed.rodger/ep3/transcript.txt" '
        'type="text/plain" />\n'
        '<podcast:value type="donate" method="keysend" '
        'suggested="2.95">\n'
        ' <podcast:valueRecipient name="hosting company" '
        'type="node" address="someaddress1" split="1" />\n'
        ' <podcast:valueRecipient name="podcaster" type="node" '
        'address="someaddress2" split="99" />\n'
        '</podcast:value>\n'
        '</rss>'
    )

    podcast_properties = xml_podcast_to_dict(xml_str)
    assert podcast_properties

    # every one of these keys should exist with a non-empty value
    expected_keys = (
        'valueRecipients', 'persons', 'soundbites', 'locations',
        'transcripts', 'episode', 'funding'
    )
    for expected_key in expected_keys:
        assert podcast_properties.get(expected_key)

    assert int(podcast_properties['episode']) == 5
    assert podcast_properties['funding']['text'] == "Support the show"

    # number of items expected within each of the list properties
    expected_counts = (
        ('transcripts', 3),
        ('valueRecipients', 2),
        ('persons', 5),
        ('locations', 1)
    )
    for list_key, expected_len in expected_counts:
        assert len(podcast_properties[list_key]) == expected_len
def run_all_tests():
base_dir = os.getcwd()
print('Running tests...')
@ -6381,6 +6461,7 @@ def run_all_tests():
'message_json', 'liked_post_json'])
_test_checkbox_names()
_test_functions()
_test_xml_podcast_dict()
_test_get_actor_from_in_reply_to()
_test_valid_emoji_content()
_test_add_cw_lists(base_dir)