diff --git a/newswire.py b/newswire.py index dfab73216..fdba9c3ca 100644 --- a/newswire.py +++ b/newswire.py @@ -523,14 +523,30 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, description = description.split('')[0] description = remove_html(description) - link = rss_item.split('')[1] - link = link.split('')[0] - if '://' not in link: - continue - item_domain = link.split('://')[1] + link = None + if '' in enclosure: + enclosure = enclosure.split('>')[0] + if 'url="' in enclosure and \ + ('"audio/' in enclosure or '"video/' in enclosure): + link_str = enclosure.split('url="')[1] + if '"' in link_str: + link_str = link_str.split('"')[0] + if '://' in link_str: + link = link_str + if not link: + link = rss_item.split('')[1] + link = link.split('')[0] + if '://' not in link: + continue + + item_domain = link.split('://')[1] if '/' in item_domain: item_domain = item_domain.split('/')[0] + if is_blocked_domain(base_dir, item_domain): continue pub_date = rss_item.split('')[1] @@ -614,13 +630,31 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str, description = rss_item.split('')[1] description = description.split('')[0] description = remove_html(description) - link = rss_item.split('')[1] - link = link.split('')[0] - if '://' not in link: - continue + + link = None + if '' in enclosure: + enclosure = enclosure.split('>')[0] + if 'url="' in enclosure and \ + ('"audio/' in enclosure or '"video/' in enclosure): + link_str = enclosure.split('url="')[1] + if '"' in link_str: + link_str = link_str.split('"')[0] + if '://' in link_str: + link = link_str + + if not link: + link = rss_item.split('')[1] + link = link.split('')[0] + if '://' not in link: + continue + item_domain = link.split('://')[1] if '/' in item_domain: item_domain = item_domain.split('/')[0] + if is_blocked_domain(base_dir, item_domain): continue pub_date = rss_item.split('')[1] @@ -692,13 +726,31 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str, description = atom_item.split('')[1] description = description.split('')[0] description = remove_html(description) - link = atom_item.split('')[1] - link = link.split('')[0] - if '://' not in link: - continue + + link = None + if '' in enclosure: + enclosure = enclosure.split('>')[0] + if 'url="' in enclosure and \ + ('"audio/' in enclosure or '"video/' in enclosure): + link_str = enclosure.split('url="')[1] + if '"' in link_str: + link_str = link_str.split('"')[0] + if '://' in link_str: + link = link_str + + if not link: + link = atom_item.split('')[1] + link = link.split('')[0] + if '://' not in link: + continue + item_domain = link.split('://')[1] if '/' in item_domain: item_domain = item_domain.split('/')[0] + if is_blocked_domain(base_dir, item_domain): continue pub_date = atom_item.split('')[1]