diff --git a/newswire.py b/newswire.py index 6eafec39e..d416e6237 100644 --- a/newswire.py +++ b/newswire.py @@ -45,6 +45,7 @@ from blocking import is_blocked_domain from blocking import is_blocked_hashtag from filters import is_filtered from session import download_image_any_mime_type +from content import remove_script def _remove_cdata(text: str) -> str: @@ -773,6 +774,7 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, title = rss_item.split('')[1] title = _remove_cdata(title.split('')[0]) title = unescaped_text(title) + title = remove_script(title, None, None, None) title = remove_html(title) description = '' @@ -780,6 +782,7 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, description = rss_item.split('')[1] description = description.split('')[0] description = unescaped_text(description) + description = remove_script(description, None, None, None) description = remove_html(description) else: if '' in rss_item and \ @@ -787,6 +790,7 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str, description = rss_item.split('')[1] description = description.split('')[0] description = unescaped_text(description) + description = remove_script(description, None, None, None) description = remove_html(description) proxy_type = None @@ -885,12 +889,14 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str, title = rss_item.split('')[1] title = _remove_cdata(title.split('')[0]) title = unescaped_text(title) + title = remove_script(title, None, None, None) title = remove_html(title) description = '' if '' in rss_item and '' in rss_item: description = rss_item.split('')[1] description = description.split('')[0] description = unescaped_text(description) + description = remove_script(description, None, None, None) description = remove_html(description) else: if '' in rss_item and \ @@ -898,6 +904,7 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str, description = rss_item.split('')[1] description = description.split('')[0] description = unescaped_text(description) + description = remove_script(description, None, None, None) description = remove_html(description) proxy_type = None @@ -984,16 +991,19 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str, title = atom_item.split('')[1] title = _remove_cdata(title.split('')[0]) title = unescaped_text(title) + title = remove_script(title, None, None, None) title = remove_html(title) description = '' if '' in atom_item and '' in atom_item: description = atom_item.split('')[1] description = unescaped_text(description.split('')[0]) + description = remove_script(description, None, None, None) description = remove_html(description) elif '' in atom_item: description = atom_item.split('', 1)[1] description = unescaped_text(description.split('')[0]) + description = remove_script(description, None, None, None) description = remove_html(description) else: if '' in atom_item and \ @@ -1001,6 +1011,7 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str, description = atom_item.split('')[1] description = description.split('')[0] description = unescaped_text(description) + description = remove_script(description, None, None, None) description = remove_html(description) proxy_type = None @@ -1206,6 +1217,7 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, continue title = atom_item.split('')[1] title = _remove_cdata(title.split('')[0]) + title = remove_script(title, None, None, None) title = unescaped_text(title) description = '' if '' in atom_item and \ @@ -1213,17 +1225,20 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str, description = atom_item.split('')[1] description = description.split('')[0] description = unescaped_text(description) + description = remove_script(description, None, None, None) description = remove_html(description) elif '' in atom_item and '' in atom_item: description = atom_item.split('')[1] description = description.split('')[0] description = unescaped_text(description) + description = remove_script(description, None, None, None) description = remove_html(description) elif '' in atom_item: description = atom_item.split('', 1)[1] description = description.split('')[0] description = unescaped_text(description) + description = remove_script(description, None, None, None) description = remove_html(description) link, _ = get_link_from_rss_item(atom_item, None, None)