Podcast processing for youtube feeds

2022-01-14 17:40:42 +00:00 · 2022-01-14 17:40:42 +00:00 · 6cceef2386
parent 75a21345cc
commit 6cceef2386
1 changed files with 20 additions and 5 deletions
--- a/newswire.py
+++ b/newswire.py
@ -501,7 +501,7 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
    # get the image for the podcast, if it exists
    podcast_episode_image = None
-    episode_image_tags = ['<itunes:image']
+    episode_image_tags = ['<itunes:image', '<media:thumbnail']
    for image_tag in episode_image_tags:
        item_str = xml_item
        if image_tag not in xml_item:
@ -516,6 +516,12 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
                episode_image = episode_image.split('"')[0]
                podcast_episode_image = episode_image
                break
        elif 'url="' in episode_image:
            episode_image = episode_image.split('url="')[1]
            if '"' in episode_image:
                episode_image = episode_image.split('"')[0]
                podcast_episode_image = episode_image
                break
        else:
            if '>' in episode_image:
                episode_image = episode_image.split('>')[1]
@ -1019,9 +1025,15 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
            description = atom_item.split('<summary>')[1]
            description = description.split('</summary>')[0]
            description = remove_html(description)
-        link = atom_item.split('<yt:videoId>')[1]
+
-        link = link.split('</yt:videoId>')[0]
+        link, link_mime_type = get_link_from_rss_item(atom_item)
-        link = 'https://www.youtube.com/watch?v=' + link.strip()
+        if not link:
            link = atom_item.split('<yt:videoId>')[1]
            link = link.split('</yt:videoId>')[0]
            link = 'https://www.youtube.com/watch?v=' + link.strip()
        if not link:
            continue
        pub_date = atom_item.split('<published>')[1]
        pub_date = pub_date.split('</published>')[0]
@ -1030,13 +1042,16 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
            if _valid_feed_date(pub_date_str):
                post_filename = ''
                votes_status = []
                podcast_properties = xml_podcast_to_dict(atom_item, xml_str)
                if podcast_properties:
                    podcast_properties['linkMimeType'] = link_mime_type
                _add_newswire_dict_entry(base_dir, domain,
                                         result, pub_date_str,
                                         title, link,
                                         votes_status, post_filename,
                                         description, moderated, mirrored,
                                         [], 32, session, debug,
-                                         None)
+                                         podcast_properties)
                post_ctr += 1
                if post_ctr >= max_posts_per_source:
                    break