From 13dbfba96b1103d3f165b9c4b794a980957c260c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Wed, 12 Jan 2022 14:02:47 +0000 Subject: [PATCH] Extract rss links from audio/video enclosures --- newswire.py | 78 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 13 deletions(-) diff --git a/newswire.py b/newswire.py index dfab73216..fdba9c3ca 100644 --- a/newswire.py +++ b/newswire.py @@ -523,14 +523,30 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, description = description.split('')[0] description = remove_html(description) - link = rss_item.split('')[1] - link = link.split('')[0] - if '://' not in link: - continue - item_domain = link.split('://')[1] + link = None + if '' in enclosure: + enclosure = enclosure.split('>')[0] + if 'url="' in enclosure and \ + ('"audio/' in enclosure or '"video/' in enclosure): + link_str = enclosure.split('url="')[1] + if '"' in link_str: + link_str = link_str.split('"')[0] + if '://' in link_str: + link = link_str + if not link: + link = rss_item.split('')[1] + link = link.split('')[0] + if '://' not in link: + continue + + item_domain = link.split('://')[1] if '/' in item_domain: item_domain = item_domain.split('/')[0] + if is_blocked_domain(base_dir, item_domain): continue pub_date = rss_item.split('')[1] @@ -614,13 +630,31 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str, description = rss_item.split('')[1] description = description.split('')[0] description = remove_html(description) - link = rss_item.split('')[1] - link = link.split('')[0] - if '://' not in link: - continue + + link = None + if '' in enclosure: + enclosure = enclosure.split('>')[0] + if 'url="' in enclosure and \ + ('"audio/' in enclosure or '"video/' in enclosure): + link_str = enclosure.split('url="')[1] + if '"' in link_str: + link_str = link_str.split('"')[0] + if '://' in link_str: + link = link_str + + if not link: + link = rss_item.split('')[1] + link = link.split('')[0] + if '://' not in link: + continue + item_domain = link.split('://')[1] if '/' in item_domain: item_domain = item_domain.split('/')[0] + if is_blocked_domain(base_dir, item_domain): continue pub_date = rss_item.split('')[1] @@ -692,13 +726,31 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str, description = atom_item.split('')[1] description = description.split('')[0] description = remove_html(description) - link = atom_item.split('')[1] - link = link.split('')[0] - if '://' not in link: - continue + + link = None + if '' in enclosure: + enclosure = enclosure.split('>')[0] + if 'url="' in enclosure and \ + ('"audio/' in enclosure or '"video/' in enclosure): + link_str = enclosure.split('url="')[1] + if '"' in link_str: + link_str = link_str.split('"')[0] + if '://' in link_str: + link = link_str + + if not link: + link = atom_item.split('')[1] + link = link.split('')[0] + if '://' not in link: + continue + item_domain = link.split('://')[1] if '/' in item_domain: item_domain = item_domain.split('/')[0] + if is_blocked_domain(base_dir, item_domain): continue pub_date = atom_item.split('')[1]