epicyon/newswire.py

1847 lines
70 KiB
Python
Raw Normal View History

2020-10-04 09:51:12 +00:00
__filename__ = "newswire.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2024-12-22 23:37:30 +00:00
__version__ = "1.6.0"
2020-10-04 09:51:12 +00:00
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
2020-10-04 09:51:12 +00:00
__status__ = "Production"
2021-06-26 11:27:14 +00:00
__module_group__ = "Web Interface Columns"
2020-10-04 09:51:12 +00:00
import os
2021-02-12 11:28:00 +00:00
import json
2020-10-04 09:51:12 +00:00
import requests
import random
2022-04-27 20:02:56 +00:00
import time
2020-10-04 09:51:12 +00:00
from socket import error as SocketError
import errno
2020-11-22 15:33:11 +00:00
from datetime import timedelta
2020-11-22 19:09:35 +00:00
from datetime import timezone
2020-10-04 09:51:12 +00:00
from collections import OrderedDict
2021-12-26 12:31:47 +00:00
from utils import valid_post_date
2021-12-29 21:55:09 +00:00
from categories import set_hashtag_category
from flags import is_suspended
from flags import is_local_network_address
from flags import is_public_post
2024-05-12 12:35:26 +00:00
from utils import data_dir
2024-04-10 13:32:03 +00:00
from utils import string_contains
2024-02-05 20:05:00 +00:00
from utils import image_mime_types_dict
2024-01-27 17:04:21 +00:00
from utils import resembles_url
2023-12-09 14:18:24 +00:00
from utils import get_url_from_post
2023-11-29 11:37:44 +00:00
from utils import remove_zero_length_strings
2023-11-20 22:27:58 +00:00
from utils import date_from_string_format
2022-12-18 15:29:54 +00:00
from utils import acct_handle_dir
2022-06-21 11:58:50 +00:00
from utils import remove_eol
2022-02-12 20:37:15 +00:00
from utils import get_domain_from_actor
2022-01-13 15:10:41 +00:00
from utils import valid_hash_tag
2021-12-27 21:44:48 +00:00
from utils import dangerous_svg
2021-12-26 16:01:32 +00:00
from utils import get_fav_filename_from_url
2021-12-26 11:29:40 +00:00
from utils import get_base_content_from_post
2021-12-26 10:57:03 +00:00
from utils import has_object_dict
2021-12-27 15:52:08 +00:00
from utils import first_paragraph_from_string
2021-12-26 20:36:08 +00:00
from utils import locate_post
2021-12-26 15:13:34 +00:00
from utils import load_json
2021-12-26 14:47:21 +00:00
from utils import save_json
2021-12-27 17:53:41 +00:00
from utils import contains_invalid_chars
2021-12-27 15:43:22 +00:00
from utils import remove_html
2021-12-26 18:46:43 +00:00
from utils import is_account_dir
2021-12-26 12:02:29 +00:00
from utils import acct_dir
2021-12-26 10:19:59 +00:00
from utils import local_actor_url
2023-01-02 10:24:35 +00:00
from utils import escape_text
from utils import unescaped_text
2021-12-28 21:55:38 +00:00
from blocking import is_blocked_domain
from blocking import is_blocked_hashtag
2021-12-29 21:55:09 +00:00
from filters import is_filtered
from session import download_image_any_mime_type
from content import remove_script
2020-10-04 09:51:12 +00:00
2020-10-16 12:11:05 +00:00
2021-12-29 21:55:09 +00:00
def _remove_cdata(text: str) -> str:
"""Removes any CDATA from the given text
"""
if 'CDATA[' in text:
text = text.split('CDATA[')[1]
if ']' in text:
text = text.split(']')[0]
return text
2021-12-29 21:55:09 +00:00
def rss2header(http_prefix: str,
               nickname: str, domain_full: str,
               title: str, translate: {}) -> str:
    """Returns the opening xml for an RSS 2.0 feed: the xml
    declaration, the rss and channel tags, then the channel title
    and link.
    A title beginning with 'News' gives the newswire channel and a
    title beginning with 'Site' gives the site blog channel. Any other
    title is looked up within translate, escaped, and linked to the
    rss feed of the account having the given nickname
    """
    site_url = http_prefix + '://' + domain_full
    if title.startswith('News'):
        channel_title = 'Newswire'
        channel_link = site_url + '/newswire.xml'
    elif title.startswith('Site'):
        channel_title = domain_full
        channel_link = site_url + '/blog/rss.xml'
    else:
        channel_title = escape_text(translate[title])
        channel_link = \
            local_actor_url(http_prefix, nickname, domain_full) + '/rss.xml'

    xml_sections = (
        "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>",
        "<rss version=\"2.0\">",
        '<channel>',
        ' <title>', channel_title, '</title>',
        ' <link>', channel_link, '</link>'
    )
    return ''.join(xml_sections)
2020-10-04 12:29:07 +00:00
2021-12-29 21:55:09 +00:00
def rss2footer() -> str:
    """Returns the closing channel and rss tags which end
    an RSS 2.0 feed
    """
    return '</channel></rss>'
2020-10-04 12:29:07 +00:00
2022-01-03 12:37:09 +00:00
def get_newswire_tags(text: str, max_tags: int) -> []:
    """Returns the unique hashtags within the given text, in the
    order in which they appear.
    Commas, semicolons, '- ' and '. ' are treated as word separators
    and a trailing full stop is ignored. Collection stops once
    max_tags hashtags have been found. Text without any '#' or
    without any space gives an empty list
    """
    if '#' not in text or ' ' not in text:
        return []

    simplified = text
    for separator in (',', ';', '- ', '. '):
        simplified = simplified.replace(separator, ' ')
    simplified = simplified.strip()
    if simplified.endswith('.'):
        simplified = simplified[:-1]

    found: list[str] = []
    for word in simplified.split(' '):
        # a hashtag needs at least one character after the '#'
        is_new_tag = \
            word.startswith('#') and len(word) > 1 and word not in found
        if not is_new_tag:
            continue
        found.append(word)
        if len(found) >= max_tags:
            break
    return found
2022-06-09 14:58:47 +00:00
def limit_word_lengths(text: str, max_word_length: int) -> str:
    """Limits the maximum length of words so that the newswire
    column cannot become too wide.

    Each space separated word longer than max_word_length is truncated
    to that length. This also applies to text consisting of a single
    word, which would otherwise be able to stretch the column.
    Any spaces at the beginning of the text are dropped.
    """
    result = ''
    for wrd in text.split(' '):
        if len(wrd) > max_word_length:
            wrd = wrd[:max_word_length]
        # only separate words once there is something to separate
        if result:
            result += ' '
        result += wrd
    return result
2021-12-29 21:55:09 +00:00
def get_newswire_favicon_url(url: str) -> str:
    """Returns a favicon url from the given article link

    The favicon is assumed to be /favicon.ico on the domain of the
    link. The default '/newswire_favicon.ico' is returned if the link
    has no protocol prefix, or if it is plain http on a domain which
    is not an onion or i2p address.
    """
    default_favicon = '/newswire_favicon.ico'
    if '://' not in url:
        return default_favicon
    domain = url.split('://')[1]
    if '/' in domain:
        domain = domain.split('/')[0]
    if url.startswith('http://'):
        # The domain is tested rather than the full url, since an
        # article link usually ends with a path and not with the
        # domain itself
        if not domain.endswith(('.onion', '.i2p')):
            return default_favicon
    return url.split('://')[0] + '://' + domain + '/favicon.ico'
2021-12-29 21:55:09 +00:00
def _download_newswire_feed_favicon(session, base_dir: str,
                                    link: str, debug: bool) -> bool:
    """Downloads the favicon for the given feed link and saves it
    within the favicons directory under base_dir.
    Returns True if the favicon is within the cache afterwards, or
    False if the link has no protocol prefix, nothing could be
    downloaded, an svg was judged to be dangerous, or the file could
    not be written
    """
    favicon_url = get_newswire_favicon_url(link)
    if '://' not in link:
        return False

    favicon_data, favicon_mime = \
        download_image_any_mime_type(session, favicon_url, 10, debug)
    if not (favicon_data and favicon_mime):
        return False

    # give the favicon url the extension matching the mime type
    # which was actually returned
    for extension, mime_suffix in image_mime_types_dict().items():
        if 'image/' + mime_suffix not in favicon_mime:
            continue
        favicon_url = favicon_url.replace('.ico', '.' + extension)
        break

    # create cached favicons directory if needed
    cache_dir = base_dir + '/favicons'
    if not os.path.isdir(cache_dir):
        os.mkdir(cache_dir)

    # check svg for dubious scripts
    if favicon_url.endswith('.svg') and \
       dangerous_svg(str(favicon_data), False):
        return False

    # nothing to do if this favicon was previously saved
    cache_filename = get_fav_filename_from_url(base_dir, favicon_url)
    if os.path.isfile(cache_filename):
        return True

    try:
        with open(cache_filename, 'wb+') as fp_fav:
            fp_fav.write(favicon_data)
    except OSError:
        print('EX: failed writing favicon ' + cache_filename)
        return False
    return True
2024-02-06 19:48:11 +00:00
def _add_newswire_dict_entry(base_dir: str,
                             newswire: {}, date_str: str,
                             title: str, link: str,
                             votes_status: str, post_filename: str,
                             description: str, moderated: bool,
                             mirrored: bool,
                             tags: [],
                             max_tags: int, session, debug: bool,
                             podcast_properties: {},
                             system_language: str,
                             fediverse_handle: str,
                             extra_links: []) -> None:
    """Update the newswire dictionary

    Adds an entry to newswire, keyed by date_str, unless the title or
    description are filtered or any of the hashtags are blocked.
    Hashtags are gathered from the title and description, from any
    podcast categories, and from the given tags.
    The favicon for the link is also downloaded, on a best effort
    basis.
    """
    # remove any markup
    title = remove_html(title)
    description = remove_html(description)

    all_text = title + ' ' + description

    # check that none of the text is filtered against
    if is_filtered(base_dir, None, None, all_text, system_language):
        return

    title = limit_word_lengths(title, 13)

    if tags is None:
        tags = []

    # extract hashtags from the text of the feed post
    post_tags = get_newswire_tags(all_text, max_tags)

    # Include tags from podcast categories
    if podcast_properties:
        if podcast_properties.get('explicit'):
            if '#nsfw' not in post_tags:
                post_tags.append('#nsfw')

        # The categories key is optional, so it is looked up with get
        # rather than indexing, which raised KeyError when it was absent
        podcast_categories = podcast_properties.get('categories')
        if podcast_categories:
            post_tags += podcast_categories

    # combine the tags into a single list
    for tag in tags:
        if tag in post_tags:
            continue
        if len(post_tags) < max_tags:
            post_tags.append(tag)

    # check that no tags are blocked
    for tag in post_tags:
        if is_blocked_hashtag(base_dir, tag):
            return

    # the result is not checked: a missing favicon does not prevent
    # the entry from being added
    _download_newswire_feed_favicon(session, base_dir, link, debug)

    newswire[date_str] = [
        title,
        link,
        votes_status,
        post_filename,
        description,
        moderated,
        post_tags,
        mirrored,
        podcast_properties,
        fediverse_handle,
        extra_links
    ]
2022-01-03 12:37:09 +00:00
def _valid_feed_date(pub_date: str, debug: bool = False) -> bool:
    """Returns the result of valid_post_date for the given feed date,
    after converting it to the YY-MM-DDTHH:MM:SSZ style and removing
    any fraction of a second
    """
    # convert from YY-MM-DD HH:MM:SS+00:00 to
    # YY-MM-DDTHH:MM:SSZ
    post_date = pub_date.replace(' ', 'T').replace('+00:00', 'Z')
    if '.' in post_date:
        sections = post_date.split('.')
        # whatever follows the decimal point and is not a digit
        # is taken to be the timezone
        timezone_str = ''.join(
            char for char in sections[1] if not char.isdigit())
        if timezone_str:
            post_date = sections[0] + timezone_str
    # NOTE(review): 90 is presumably a maximum age in days - confirm
    # against valid_post_date
    return valid_post_date(post_date, 90, debug)
def parse_feed_date(pub_date: str, unique_string_identifier: str) -> str:
    """Returns a UTC date string based on the given date string
    This tries a number of formats to see which work

    If the date is exactly on the hour then a minute and second are
    chosen pseudo-randomly, seeded by unique_string_identifier, so that
    the same item always receives the same date.
    Dates ending with one of the supported North American zone names
    are converted to UTC by adding the standard time offset for that
    zone.
    Returns None if none of the formats match.
    """
    if ':00:00' in pub_date:
        # If this was published exactly on the hour then assign a
        # random minute and second to make this item relatively unique
        randgen = random.Random(unique_string_identifier)
        rand_min = randgen.randint(0, 59)
        rand_sec = randgen.randint(0, 59)
        replace_time_str = \
            ':' + str(rand_min).zfill(2) + ':' + str(rand_sec).zfill(2)
        pub_date = pub_date.replace(':00:00', replace_time_str)

    formats = ("%a, %d %b %Y %H:%M:%S %z",
               "%a, %d %b %Y %H:%M:%S Z",
               "%a, %d %b %Y %H:%M:%S GMT",
               "%a, %d %b %Y %H:%M:%S EST",
               "%a, %d %b %Y %H:%M:%S PST",
               "%a, %d %b %Y %H:%M:%S AST",
               "%a, %d %b %Y %H:%M:%S CST",
               "%a, %d %b %Y %H:%M:%S MST",
               "%a, %d %b %Y %H:%M:%S AKST",
               "%a, %d %b %Y %H:%M:%S HST",
               "%a, %d %b %Y %H:%M:%S UT",
               "%Y-%m-%dT%H:%M:%S%z",
               "%Y-%m-%dT%H:%M:%S%Z")

    # The zone names are matched as literal text within the formats
    # above, so the parsed date carries no offset for them. These are
    # the hours to add to reach UTC from the standard time of each zone
    zone_hours_behind_utc = {
        ' EST': 5,
        ' CST': 6,
        ' MST': 7,
        ' PST': 8,
        ' AST': 4,
        ' AKST': 9,
        ' HST': 10
    }

    # remove any fraction of a second
    # This does not depend on the format being tried, so it is only
    # done once
    pub_date2 = pub_date
    if '.' in pub_date2:
        ending = pub_date2.split('.')[1]
        timezone_str = ''
        if '+' in ending:
            timezone_str = '+' + ending.split('+')[1]
        elif '-' in ending:
            timezone_str = '-' + ending.split('-')[1]
        pub_date2 = pub_date2.split('.')[0] + timezone_str

    published_date = None
    for date_format in formats:
        # skip the formats which cannot apply to this date
        if ',' in pub_date and ',' not in date_format:
            continue
        if ',' not in pub_date and ',' in date_format:
            continue
        if 'Z' in pub_date and 'Z' not in date_format:
            continue
        if 'Z' not in pub_date and 'Z' in date_format:
            continue
        if 'EST' not in pub_date and 'EST' in date_format:
            continue
        if 'GMT' not in pub_date and 'GMT' in date_format:
            continue
        if 'EST' in pub_date and 'EST' not in date_format:
            continue
        if 'UT' not in pub_date and 'UT' in date_format:
            continue
        if 'UT' in pub_date and 'UT' not in date_format:
            continue

        try:
            published_date = date_from_string_format(pub_date2, [date_format])
        except Exception:
            # a format which does not match is expected, so try the
            # next one. Interrupts are deliberately not caught here
            continue

        if published_date:
            for zone_suffix, hours in zone_hours_behind_utc.items():
                if pub_date.endswith(zone_suffix):
                    published_date = published_date + timedelta(hours=hours)
                    break
            break

    pub_date_str = None
    if published_date:
        offset = published_date.utcoffset()
        if offset:
            published_date = published_date - offset
        # convert local date to UTC
        published_date = published_date.replace(tzinfo=timezone.utc)
        pub_date_str = str(published_date)
        if not pub_date_str.endswith('+00:00'):
            pub_date_str += '+00:00'
    else:
        print('WARN: unrecognized date format: ' + pub_date)

    return pub_date_str
2020-11-22 18:14:40 +00:00
2021-12-29 21:55:09 +00:00
def load_hashtag_categories(base_dir: str, language: str) -> None:
    """Reads hashtag categories from an rss file and applies them.
    categories.xml within base_dir is used if it exists, otherwise
    the file for the given language within the defaultcategories
    directory. Nothing happens if neither file exists
    """
    categories_filename = base_dir + '/categories.xml'
    if not os.path.isfile(categories_filename):
        # fall back to the defaults for this language
        categories_filename = \
            base_dir + '/defaultcategories/' + language + '.xml'
    if not os.path.isfile(categories_filename):
        return

    try:
        with open(categories_filename, 'r', encoding='utf-8') as fp_cat:
            _xml2str_to_hashtag_categories(base_dir, fp_cat.read(),
                                           1024, True)
    except OSError:
        print('EX: load_hashtag_categories unable to read ' +
              categories_filename)
2022-01-03 12:37:09 +00:00
def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str,
2022-06-09 14:58:47 +00:00
max_categories_feed_item_size_kb: int,
2021-12-29 21:55:09 +00:00
force: bool = False) -> None:
2020-12-02 16:18:36 +00:00
"""Updates hashtag categories based upon an rss feed
"""
2022-01-03 12:37:09 +00:00
rss_items = xml_str.split('<item>')
2022-06-09 14:58:47 +00:00
max_bytes = max_categories_feed_item_size_kb * 1024
2022-01-03 12:37:09 +00:00
for rss_item in rss_items:
if not rss_item:
2020-12-02 16:18:36 +00:00
continue
2022-01-03 12:37:09 +00:00
if len(rss_item) > max_bytes:
2020-12-02 16:18:36 +00:00
print('WARN: rss categories feed item is too big')
continue
2022-01-03 12:37:09 +00:00
if '<title>' not in rss_item:
2020-12-02 16:18:36 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</title>' not in rss_item:
2020-12-02 16:18:36 +00:00
continue
2022-01-03 12:37:09 +00:00
if '<description>' not in rss_item:
2020-12-02 16:18:36 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</description>' not in rss_item:
2020-12-02 16:18:36 +00:00
continue
2022-01-03 12:37:09 +00:00
category_str = rss_item.split('<title>')[1]
category_str = category_str.split('</title>')[0].strip()
2023-01-02 10:24:35 +00:00
category_str = unescaped_text(category_str)
2022-01-03 12:37:09 +00:00
if not category_str:
2020-12-02 16:18:36 +00:00
continue
2022-01-03 12:37:09 +00:00
if 'CDATA' in category_str:
2020-12-03 10:12:09 +00:00
continue
2022-01-03 12:37:09 +00:00
hashtag_list_str = rss_item.split('<description>')[1]
hashtag_list_str = hashtag_list_str.split('</description>')[0].strip()
2023-01-02 10:24:35 +00:00
hashtag_list_str = unescaped_text(hashtag_list_str)
2022-01-03 12:37:09 +00:00
if not hashtag_list_str:
2020-12-02 16:18:36 +00:00
continue
2022-01-03 12:37:09 +00:00
if 'CDATA' in hashtag_list_str:
2020-12-03 10:12:09 +00:00
continue
2022-01-03 12:37:09 +00:00
hashtag_list = hashtag_list_str.split(' ')
2024-07-15 19:34:16 +00:00
if is_blocked_hashtag(base_dir, category_str):
continue
for hashtag in hashtag_list:
set_hashtag_category(base_dir, hashtag, category_str,
False, force)
2020-12-02 16:18:36 +00:00
2022-01-13 15:30:55 +00:00
def _get_podcast_categories(xml_item: str, xml_str: str) -> str:
""" get podcast categories if they exist. These can be turned into hashtags
2023-12-05 17:07:29 +00:00
See https://podcast-standard.org/itunes_tags
2022-01-13 15:30:55 +00:00
"""
2024-12-23 17:45:20 +00:00
podcast_categories: list[str] = []
2022-01-13 15:30:55 +00:00
2023-12-05 17:07:29 +00:00
# convert keywords to hashtags
if '<itunes:keywords' in xml_item:
keywords_str = xml_item.split('<itunes:keywords')[1]
if '>' in keywords_str:
keywords_str = keywords_str.split('>')[1]
if '<' in keywords_str:
keywords_str = keywords_str.split('<')[0]
keywords_str = remove_html(keywords_str)
keywords_list = keywords_str.split(',')
for keyword in keywords_list:
keyword_hashtag = '#' + keyword.strip()
if keyword_hashtag not in podcast_categories:
if valid_hash_tag(keyword):
podcast_categories.append(keyword_hashtag)
episode_category_tags = ['<itunes:category', '<category']
2022-01-13 15:30:55 +00:00
for category_tag in episode_category_tags:
item_str = xml_item
if category_tag not in xml_item:
if category_tag not in xml_str:
continue
item_str = xml_str
category_list = item_str.split(category_tag)
first_category = True
2022-01-13 16:12:55 +00:00
for episode_category in category_list:
if first_category:
first_category = False
continue
if 'text="' in episode_category:
episode_category = episode_category.split('text="')[1]
if '"' in episode_category:
episode_category = episode_category.split('"')[0]
episode_category = \
episode_category.lower().replace(' ', '')
episode_category = episode_category.replace('#', '')
2023-12-05 17:13:26 +00:00
episode_category_hashtag = '#' + episode_category
if episode_category_hashtag not in podcast_categories:
if valid_hash_tag(episode_category):
2023-12-05 17:13:26 +00:00
podcast_categories.append(episode_category_hashtag)
continue
2022-01-13 15:30:55 +00:00
if '>' in episode_category:
episode_category = episode_category.split('>')[1]
if '<' in episode_category:
episode_category = episode_category.split('<')[0]
episode_category = \
episode_category.lower().replace(' ', '')
episode_category = episode_category.replace('#', '')
2023-12-05 17:13:26 +00:00
episode_category_hashtag = '#' + episode_category
if episode_category_hashtag not in podcast_categories:
if valid_hash_tag(episode_category):
2023-12-05 17:13:26 +00:00
podcast_categories.append(episode_category_hashtag)
2022-01-13 15:30:55 +00:00
return podcast_categories
2023-12-05 11:32:34 +00:00
def _get_podcast_author(xml_item: str, xml_str: str) -> str:
""" get podcast author if specified.
"""
author = None
episode_author_tags = ['<itunes:author', '<author']
for author_tag in episode_author_tags:
item_str = xml_item
if author_tag not in xml_item:
if author_tag not in xml_str:
continue
item_str = xml_str
author_str = item_str.split(author_tag)[1]
if '>' not in author_str:
continue
author_str = author_str.split('>')[1]
if '<' not in author_str:
continue
author = item_str.split('>')[0]
return remove_html(author).strip()
return author
2022-02-12 20:37:15 +00:00
def _valid_podcast_entry(base_dir: str, key: str, entry: {}) -> bool:
"""Is the given podcast namespace entry valid?
https://github.com/Podcastindex-org/podcast-namespace/
blob/main/proposal-docs/social/social.md#socialinteract-element
"""
2022-05-30 15:15:17 +00:00
if key in ('socialInteract', 'discussion'):
2022-02-12 20:37:15 +00:00
if not entry.get('protocol'):
return False
if not entry.get('uri'):
if not entry.get('text'):
if not entry.get('url'):
return False
2022-02-12 20:37:15 +00:00
if entry['protocol'].tolower() != 'activitypub':
return False
if entry.get('uri'):
post_url = remove_html(entry['uri'])
elif entry.get('url'):
post_url = remove_html(entry['uri'])
else:
post_url = entry['text']
2022-02-12 20:37:15 +00:00
if '://' not in post_url:
return False
2022-06-09 14:58:47 +00:00
post_domain, _ = get_domain_from_actor(post_url)
2022-02-12 20:37:15 +00:00
if not post_domain:
return False
2024-02-11 13:42:15 +00:00
if is_blocked_domain(base_dir, post_domain, None, None):
2022-02-12 20:37:15 +00:00
return False
return True
def xml_podcast_to_dict(base_dir: str, xml_item: str, xml_str: str) -> {}:
    """podcasting extensions for RSS feeds
    See https://github.com/Podcastindex-org/podcast-namespace/
    blob/main/docs/1.0.md
    https://github.com/Podcastindex-org/podcast-namespace/
    blob/main/proposal-docs/social/social.md#socialinteract-element

    xml_item is a single feed item and xml_str is the whole feed, which
    is used as a fallback when looking for an image, categories or
    author.
    Returns an empty dictionary if the item has no podcast, itunes or
    media:thumbnail tags, or if it has neither an image nor any podcast
    namespace tags. Otherwise returns the podcast properties, which
    always include 'categories' and 'explicit'.
    """
    if '<podcast:' not in xml_item:
        if '<itunes:' not in xml_item:
            if '<media:thumbnail' not in xml_item:
                return {}

    podcast_properties = {
        "locations": [],
        "persons": [],
        "soundbites": [],
        "transcripts": [],
        "valueRecipients": [],
        "trailers": [],
        "chapters": [],
        "discussion": [],
        "episode": '',
        "socialInteract": [],
    }

    pod_fields = (
        'url', 'geo', 'osm', 'type', 'method', 'group',
        'owner', 'srcset', 'img', 'role', 'address', 'suggested',
        'startTime', 'duration', 'href', 'name', 'pubdate',
        'length', 'season', 'email', 'platform', 'protocol',
        'accountId', 'priority', 'podcastAccountId',
        'podcastAccountUrl'
    )

    # the first section is the text before the first podcast tag
    for pod_line in xml_item.split('<podcast:')[1:]:
        if '>' not in pod_line:
            continue

        if ' ' not in pod_line.split('>')[0]:
            # a tag without attributes, whose value is its text
            pod_key = pod_line.split('>')[0].strip()
            pod_val = pod_line.split('>', 1)[1].strip()
            if '<' in pod_val:
                pod_val = pod_val.split('<')[0]
            if pod_key in podcast_properties:
                podcast_properties[pod_key] = pod_val
            continue

        # a tag with attributes
        pod_key = pod_line.split(' ')[0]
        pod_entry = {}
        for pod_field in pod_fields:
            if pod_field + '="' not in pod_line:
                continue
            pod_str = pod_line.split(pod_field + '="')[1]
            if '"' not in pod_str:
                continue
            pod_entry[pod_field] = pod_str.split('"')[0]

        pod_text = pod_line.split('>')[1]
        if '<' in pod_text:
            pod_text = pod_text.split('<')[0].strip()
            if pod_text:
                pod_entry['text'] = pod_text

        # tags which can be repeated are collected within the list
        # having the plural of the tag name
        appended = False
        if pod_key + 's' in podcast_properties:
            if isinstance(podcast_properties[pod_key + 's'], list):
                podcast_properties[pod_key + 's'].append(pod_entry)
                appended = True
        if not appended:
            # if there are repeated keys then only use the first one
            if not podcast_properties.get(pod_key):
                if _valid_podcast_entry(base_dir, pod_key, pod_entry):
                    podcast_properties[pod_key] = pod_entry

    # get the image for the podcast, if it exists
    podcast_episode_image = None
    for image_tag in ('<itunes:image', '<media:thumbnail'):
        item_str = xml_item
        if image_tag not in xml_item:
            if image_tag not in xml_str:
                continue
            item_str = xml_str

        episode_image = item_str.split(image_tag)[1]
        if image_tag + ' ' in item_str and '>' in episode_image:
            episode_image = episode_image.split('>')[0]

        if 'href="' in episode_image:
            episode_image = episode_image.split('href="')[1]
            if '"' in episode_image:
                podcast_episode_image = episode_image.split('"')[0]
                break
        elif 'url="' in episode_image:
            episode_image = episode_image.split('url="')[1]
            if '"' in episode_image:
                podcast_episode_image = episode_image.split('"')[0]
                break
        elif '>' in episode_image:
            episode_image = episode_image.split('>')[1]
            if '<' in episode_image:
                episode_image = episode_image.split('<')[0]
                if resembles_url(episode_image):
                    podcast_episode_image = episode_image
                    break

    # get categories if they exist. These can be turned into hashtags
    podcast_categories = _get_podcast_categories(xml_item, xml_str)

    # get the author name
    podcast_author = _get_podcast_author(xml_item, xml_str)
    if podcast_author:
        podcast_properties['author'] = podcast_author

    if podcast_episode_image:
        podcast_properties['image'] = podcast_episode_image
    elif '<podcast:' not in xml_item:
        return {}

    # Categories and the explicit flag are set whether or not there is
    # an image, so that they are always available within a non-empty
    # result
    podcast_properties['categories'] = podcast_categories

    if string_contains(xml_item,
                       ('<itunes:explicit>Y', '<itunes:explicit>T',
                        '<itunes:explicit>1')):
        podcast_properties['explicit'] = True
    else:
        podcast_properties['explicit'] = False

    return podcast_properties
def get_link_from_rss_item(rss_item: str,
                           preferred_mime_types: [],
                           proxy_type: str) -> (str, str):
    """Extracts rss link from rss item string

    Links are looked for in this order:
    1. podcast:alternateEnclosure sources having one of the preferred
       mime types. With a proxy type of tor, onion or i2p only links
       on that type of network are accepted, and without a proxy type
       only clearnet links are accepted
    2. an audio or video enclosure
    3. the link tag of the item

    Returns the link and its mime type, if known. Returns None, None
    if there is no link or if the link has no protocol prefix.
    """
    mime_type = None

    if preferred_mime_types and '<podcast:alternateEnclosure ' in rss_item:
        enclosures = rss_item.split('<podcast:alternateEnclosure ')
        # the first section is the text before the first enclosure
        for enclosure in enclosures[1:]:
            if '</podcast:alternateEnclosure' not in enclosure:
                continue
            enclosure = enclosure.split('</podcast:alternateEnclosure')[0]
            if 'type="' not in enclosure:
                continue
            mime_type = enclosure.split('type="')[1]
            if '"' in mime_type:
                mime_type = mime_type.split('"')[0]
            if mime_type not in preferred_mime_types:
                continue
            if 'uri="' not in enclosure:
                continue
            for uri in enclosure.split('uri="')[1:]:
                if '"' not in uri:
                    continue
                link = uri.split('"')[0]
                if '://' not in link:
                    continue
                if proxy_type:
                    if proxy_type in ('tor', 'onion') and \
                       '.onion/' not in link:
                        continue
                    if proxy_type == 'i2p' and \
                       '.i2p/' not in link:
                        continue
                    return link, mime_type
                if '.onion/' not in link and \
                   '.i2p/' not in link:
                    return link, mime_type

    if '<enclosure ' in rss_item:
        # get link from audio or video enclosure
        enclosure = rss_item.split('<enclosure ')[1]
        if '>' in enclosure:
            enclosure = enclosure.split('>')[0]
            if ' type="' in enclosure:
                mime_type = enclosure.split(' type="')[1]
                if '"' in mime_type:
                    mime_type = mime_type.split('"')[0]
            if 'url="' in enclosure and \
               ('"audio/' in enclosure or '"video/' in enclosure):
                link_str = enclosure.split('url="')[1]
                if '"' in link_str:
                    link = link_str.split('"')[0]
                    if resembles_url(link):
                        return link, mime_type

    # Start from no link, so that an item without a usable link tag
    # gives None, None rather than raising UnboundLocalError
    link = None
    if '<link>' in rss_item and '</link>' in rss_item:
        link = rss_item.split('<link>')[1]
        link = link.split('</link>')[0]
    elif '<link ' in rss_item:
        link_str = rss_item.split('<link ')[1]
        if '>' in link_str:
            link_str = link_str.split('>')[0]
            if 'href="' in link_str:
                link_str = link_str.split('href="')[1]
                if '"' in link_str:
                    link = link_str.split('"')[0]

    if not link or '://' not in link:
        return None, None
    return link, mime_type
2022-01-12 14:23:07 +00:00
2022-01-03 12:37:09 +00:00
def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
2021-12-29 21:55:09 +00:00
moderated: bool, mirrored: bool,
2022-01-03 12:37:09 +00:00
max_posts_per_source: int,
2021-12-29 21:55:09 +00:00
max_feed_item_size_kb: int,
2022-06-09 14:58:47 +00:00
max_categories_feed_item_size_kb: int,
session, debug: bool,
2022-09-25 17:26:11 +00:00
preferred_podcast_formats: [],
system_language: str) -> {}:
2020-12-14 14:22:44 +00:00
"""Converts an xml RSS 2.0 string to a dictionary
2020-10-04 09:51:12 +00:00
"""
2022-01-03 12:37:09 +00:00
if '<item>' not in xml_str:
2020-10-04 09:51:12 +00:00
return {}
result = {}
2020-12-09 10:38:09 +00:00
# is this an rss feed containing hashtag categories?
2022-01-03 12:37:09 +00:00
if '<title>#categories</title>' in xml_str:
_xml2str_to_hashtag_categories(base_dir, xml_str,
2022-06-09 14:58:47 +00:00
max_categories_feed_item_size_kb)
2020-12-02 16:18:36 +00:00
return {}
2020-12-09 10:38:09 +00:00
2022-01-03 12:37:09 +00:00
rss_items = xml_str.split('<item>')
post_ctr = 0
max_bytes = max_feed_item_size_kb * 1024
2022-01-13 12:16:42 +00:00
first_item = True
2022-01-03 12:37:09 +00:00
for rss_item in rss_items:
2022-01-13 12:16:42 +00:00
if first_item:
2022-01-13 12:19:35 +00:00
first_item = False
2022-01-13 12:16:42 +00:00
continue
2022-01-03 12:37:09 +00:00
if not rss_item:
2020-11-27 22:43:34 +00:00
continue
2022-01-03 12:37:09 +00:00
if len(rss_item) > max_bytes:
print('WARN: rss feed item is too big')
continue
2022-01-03 12:37:09 +00:00
if '<title>' not in rss_item:
2020-10-04 09:51:12 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</title>' not in rss_item:
2020-10-04 09:51:12 +00:00
continue
2022-01-13 22:26:01 +00:00
if '<link' not in rss_item:
2020-10-04 09:51:12 +00:00
continue
2022-01-03 12:37:09 +00:00
if '<pubDate>' not in rss_item:
2020-10-04 09:51:12 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</pubDate>' not in rss_item:
2020-10-04 09:51:12 +00:00
continue
2022-01-11 18:25:13 +00:00
2022-01-03 12:37:09 +00:00
title = rss_item.split('<title>')[1]
2021-12-29 21:55:09 +00:00
title = _remove_cdata(title.split('</title>')[0])
2023-01-02 10:24:35 +00:00
title = unescaped_text(title)
title = remove_script(title, None, None, None)
2021-12-27 15:43:22 +00:00
title = remove_html(title)
2023-08-17 14:04:20 +00:00
title = title.replace('\n', '')
2022-01-11 18:25:13 +00:00
2020-10-07 12:05:49 +00:00
description = ''
2022-01-03 12:37:09 +00:00
if '<description>' in rss_item and '</description>' in rss_item:
description = rss_item.split('<description>')[1]
2023-06-28 11:46:24 +00:00
description = description.split('</description>')[0]
description = unescaped_text(description)
description = remove_script(description, None, None, None)
2023-06-28 11:01:03 +00:00
description = remove_html(description)
else:
2022-01-03 12:37:09 +00:00
if '<media:description>' in rss_item and \
'</media:description>' in rss_item:
description = rss_item.split('<media:description>')[1]
description = description.split('</media:description>')[0]
2023-01-02 10:24:35 +00:00
description = unescaped_text(description)
description = remove_script(description, None, None, None)
2021-12-27 15:43:22 +00:00
description = remove_html(description)
2022-01-11 18:25:13 +00:00
proxy_type = None
if domain.endswith('.onion'):
proxy_type = 'tor'
elif domain.endswith('.i2p'):
proxy_type = 'i2p'
link, link_mime_type = \
get_link_from_rss_item(rss_item, preferred_podcast_formats,
proxy_type)
if not link:
2022-01-12 14:23:07 +00:00
continue
2022-01-11 18:25:13 +00:00
item_domain = link.split('://')[1]
2022-01-03 12:37:09 +00:00
if '/' in item_domain:
item_domain = item_domain.split('/')[0]
2024-02-11 13:42:15 +00:00
if is_blocked_domain(base_dir, item_domain, None, None):
continue
2022-01-03 12:37:09 +00:00
pub_date = rss_item.split('<pubDate>')[1]
pub_date = pub_date.split('</pubDate>')[0]
2020-11-22 18:14:40 +00:00
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
2024-07-15 19:38:12 +00:00
if not pub_date_str:
continue
if not _valid_feed_date(pub_date_str):
continue
post_filename = ''
2024-12-23 17:45:20 +00:00
votes_status: list[str] = []
2024-07-15 19:38:12 +00:00
podcast_properties = \
xml_podcast_to_dict(base_dir, rss_item, xml_str)
if podcast_properties:
podcast_properties['linkMimeType'] = link_mime_type
fediverse_handle = ''
2024-12-23 17:45:20 +00:00
extra_links: list[str] = []
2024-07-15 19:38:12 +00:00
_add_newswire_dict_entry(base_dir,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated,
mirrored, [], 32, session, debug,
podcast_properties, system_language,
fediverse_handle, extra_links)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
2022-01-03 12:37:09 +00:00
if post_ctr > 0:
2022-01-12 14:31:04 +00:00
print('Added ' + str(post_ctr) + ' rss 2.0 feed items to newswire')
2020-10-04 09:51:12 +00:00
return result
2022-01-03 12:37:09 +00:00
def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
2021-12-29 21:55:09 +00:00
moderated: bool, mirrored: bool,
2022-01-03 12:37:09 +00:00
max_posts_per_source: int,
2021-12-29 21:55:09 +00:00
max_feed_item_size_kb: int,
2022-06-09 14:58:47 +00:00
max_categories_feed_item_size_kb: int,
session, debug: bool,
2022-09-25 17:26:11 +00:00
preferred_podcast_formats: [],
system_language: str) -> {}:
2020-12-14 14:22:44 +00:00
"""Converts an xml RSS 1.0 string to a dictionary
https://validator.w3.org/feed/docs/rss1.html
"""
2022-01-03 12:37:09 +00:00
item_str = '<item'
if item_str not in xml_str:
2020-12-14 14:22:44 +00:00
return {}
result = {}
# is this an rss feed containing hashtag categories?
2022-01-03 12:37:09 +00:00
if '<title>#categories</title>' in xml_str:
_xml2str_to_hashtag_categories(base_dir, xml_str,
2022-06-09 14:58:47 +00:00
max_categories_feed_item_size_kb)
2020-12-14 14:22:44 +00:00
return {}
2022-01-03 12:37:09 +00:00
rss_items = xml_str.split(item_str)
post_ctr = 0
max_bytes = max_feed_item_size_kb * 1024
2022-01-13 12:16:42 +00:00
first_item = True
2022-01-03 12:37:09 +00:00
for rss_item in rss_items:
2022-01-13 12:16:42 +00:00
if first_item:
2022-01-13 12:19:35 +00:00
first_item = False
2022-01-13 12:16:42 +00:00
continue
2022-01-03 12:37:09 +00:00
if not rss_item:
2020-12-14 14:22:44 +00:00
continue
2022-01-03 12:37:09 +00:00
if len(rss_item) > max_bytes:
2020-12-14 17:18:16 +00:00
print('WARN: rss 1.0 feed item is too big')
continue
2022-01-03 12:37:09 +00:00
if rss_item.startswith('s>'):
2020-12-14 14:22:44 +00:00
continue
2022-01-03 12:37:09 +00:00
if '<title>' not in rss_item:
2020-12-14 14:22:44 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</title>' not in rss_item:
2020-12-14 14:22:44 +00:00
continue
2022-01-13 22:26:01 +00:00
if '<link' not in rss_item:
2020-12-14 14:22:44 +00:00
continue
2022-01-03 12:37:09 +00:00
if '<dc:date>' not in rss_item:
2020-12-14 14:22:44 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</dc:date>' not in rss_item:
2020-12-14 14:22:44 +00:00
continue
2022-01-03 12:37:09 +00:00
title = rss_item.split('<title>')[1]
2021-12-29 21:55:09 +00:00
title = _remove_cdata(title.split('</title>')[0])
2023-01-02 10:24:35 +00:00
title = unescaped_text(title)
title = remove_script(title, None, None, None)
2021-12-27 15:43:22 +00:00
title = remove_html(title)
2020-12-14 14:22:44 +00:00
description = ''
2022-01-03 12:37:09 +00:00
if '<description>' in rss_item and '</description>' in rss_item:
description = rss_item.split('<description>')[1]
2023-06-28 11:46:24 +00:00
description = description.split('</description>')[0]
description = unescaped_text(description)
description = remove_script(description, None, None, None)
2023-06-28 11:01:03 +00:00
description = remove_html(description)
2020-12-14 14:22:44 +00:00
else:
2022-01-03 12:37:09 +00:00
if '<media:description>' in rss_item and \
'</media:description>' in rss_item:
description = rss_item.split('<media:description>')[1]
2020-12-14 14:22:44 +00:00
description = description.split('</media:description>')[0]
2023-01-02 10:24:35 +00:00
description = unescaped_text(description)
description = remove_script(description, None, None, None)
2021-12-27 15:43:22 +00:00
description = remove_html(description)
proxy_type = None
if domain.endswith('.onion'):
proxy_type = 'tor'
elif domain.endswith('.i2p'):
proxy_type = 'i2p'
link, link_mime_type = \
get_link_from_rss_item(rss_item, preferred_podcast_formats,
proxy_type)
if not link:
2022-01-12 14:23:07 +00:00
continue
2022-01-03 12:37:09 +00:00
item_domain = link.split('://')[1]
if '/' in item_domain:
item_domain = item_domain.split('/')[0]
2024-02-11 13:42:15 +00:00
if is_blocked_domain(base_dir, item_domain, None, None):
2020-12-14 14:22:44 +00:00
continue
2022-01-03 12:37:09 +00:00
pub_date = rss_item.split('<dc:date>')[1]
pub_date = pub_date.split('</dc:date>')[0]
2020-12-14 14:22:44 +00:00
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
2024-07-15 19:44:46 +00:00
if not pub_date_str:
continue
if not _valid_feed_date(pub_date_str):
continue
post_filename = ''
2024-12-23 17:45:20 +00:00
votes_status: list[str] = []
2024-07-15 19:44:46 +00:00
podcast_properties = \
xml_podcast_to_dict(base_dir, rss_item, xml_str)
if podcast_properties:
podcast_properties['linkMimeType'] = link_mime_type
fediverse_handle = ''
2024-12-23 17:45:20 +00:00
extra_links: list[str] = []
2024-07-15 19:44:46 +00:00
_add_newswire_dict_entry(base_dir,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated,
mirrored, [], 32, session, debug,
podcast_properties, system_language,
fediverse_handle, extra_links)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
2022-01-03 12:37:09 +00:00
if post_ctr > 0:
2022-01-12 14:31:04 +00:00
print('Added ' + str(post_ctr) + ' rss 1.0 feed items to newswire')
2020-12-14 14:22:44 +00:00
return result
2022-01-03 12:37:09 +00:00
def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
2021-12-29 21:55:09 +00:00
moderated: bool, mirrored: bool,
2022-01-03 12:37:09 +00:00
max_posts_per_source: int,
2021-12-29 21:55:09 +00:00
max_feed_item_size_kb: int,
session, debug: bool,
2022-09-25 17:26:11 +00:00
preferred_podcast_formats: [],
system_language: str) -> {}:
2020-10-10 12:24:14 +00:00
"""Converts an atom feed string to a dictionary
2024-06-05 15:12:09 +00:00
Also see https://activitystrea.ms/specs/atom/1.0/
2020-10-10 12:24:14 +00:00
"""
2022-01-03 12:37:09 +00:00
if '<entry>' not in xml_str:
2020-10-10 12:24:14 +00:00
return {}
result = {}
2022-01-03 12:37:09 +00:00
atom_items = xml_str.split('<entry>')
post_ctr = 0
max_bytes = max_feed_item_size_kb * 1024
2022-01-13 12:16:42 +00:00
first_item = True
2022-01-03 12:37:09 +00:00
for atom_item in atom_items:
2022-01-13 12:16:42 +00:00
if first_item:
2022-01-13 12:19:35 +00:00
first_item = False
2022-01-13 12:16:42 +00:00
continue
2022-01-03 12:37:09 +00:00
if not atom_item:
2020-11-27 22:43:34 +00:00
continue
2022-01-03 12:37:09 +00:00
if len(atom_item) > max_bytes:
print('WARN: atom feed item is too big')
continue
2022-01-03 12:37:09 +00:00
if '<title>' not in atom_item:
2020-10-10 12:24:14 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</title>' not in atom_item:
2020-10-10 12:24:14 +00:00
continue
2022-01-13 22:26:01 +00:00
if '<link' not in atom_item:
2020-10-10 12:24:14 +00:00
continue
2022-01-03 12:37:09 +00:00
if '<updated>' not in atom_item:
2020-10-10 12:24:14 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</updated>' not in atom_item:
2020-10-10 12:24:14 +00:00
continue
2022-01-03 12:37:09 +00:00
title = atom_item.split('<title>')[1]
2021-12-29 21:55:09 +00:00
title = _remove_cdata(title.split('</title>')[0])
2023-01-02 10:24:35 +00:00
title = unescaped_text(title)
title = remove_script(title, None, None, None)
2021-12-27 15:43:22 +00:00
title = remove_html(title)
2020-10-10 12:24:14 +00:00
description = ''
2022-01-03 12:37:09 +00:00
if '<summary>' in atom_item and '</summary>' in atom_item:
description = atom_item.split('<summary>')[1]
2023-06-28 11:01:03 +00:00
description = unescaped_text(description.split('</summary>')[0])
description = remove_script(description, None, None, None)
2023-06-28 11:01:03 +00:00
description = remove_html(description)
2023-06-27 21:27:51 +00:00
elif '<content' in atom_item and '</content>' in atom_item:
description = atom_item.split('<content', 1)[1]
description = description.split('>', 1)[1]
2023-06-28 11:01:03 +00:00
description = unescaped_text(description.split('</content>')[0])
description = remove_script(description, None, None, None)
2023-06-28 11:01:03 +00:00
description = remove_html(description)
else:
2022-01-03 12:37:09 +00:00
if '<media:description>' in atom_item and \
'</media:description>' in atom_item:
description = atom_item.split('<media:description>')[1]
description = description.split('</media:description>')[0]
2023-01-02 10:24:35 +00:00
description = unescaped_text(description)
description = remove_script(description, None, None, None)
2021-12-27 15:43:22 +00:00
description = remove_html(description)
2024-06-05 15:42:33 +00:00
# is there a fediverse handle
fediverse_handle = ''
if '<author>' in atom_item and '</author>' in atom_item:
actor_str = atom_item.split('<author>')[1]
actor_str = unescaped_text(actor_str.split('</author>')[0])
actor_str = remove_script(actor_str, None, None, None)
if '<activity:object-type>' in actor_str and \
'</activity:object-type>' in actor_str and \
'<uri>' in actor_str and '</uri>' in actor_str:
obj_type = actor_str.split('<activity:object-type>')[1]
obj_type = obj_type.split('</activity:object-type>')[0]
if obj_type == 'Person':
actor_uri = actor_str.split('<uri>')[1]
actor_uri = actor_uri.split('</uri>')[0]
if resembles_url(actor_uri) and \
not is_local_network_address(actor_uri):
fediverse_handle = actor_uri
# are there any extra links?
2024-12-23 17:45:20 +00:00
extra_links: list[str] = []
if '<activity:object>' in atom_item and \
'</activity:object>' in atom_item:
obj_str = atom_item.split('<activity:object>')[1]
obj_str = \
unescaped_text(obj_str.split('</activity:object>')[0])
obj_str = remove_script(obj_str, None, None, None)
sections = obj_str.split('<link ')
ctr = 0
for section_str in sections:
if ctr == 0:
ctr = 1
continue
if '>' in section_str:
link_str = section_str.split('>')[0]
if 'href="' in link_str and \
'rel="preview"' not in link_str:
link_str = link_str.split('href="')[1]
if '"' in link_str:
link_str = link_str.split('"')[0]
link_str = remove_html(link_str)
if resembles_url(link_str) and \
not is_local_network_address(link_str):
if link_str not in extra_links:
extra_links.append(link_str)
proxy_type = None
if domain.endswith('.onion'):
proxy_type = 'tor'
elif domain.endswith('.i2p'):
proxy_type = 'i2p'
link, link_mime_type = \
get_link_from_rss_item(atom_item, preferred_podcast_formats,
proxy_type)
if not link:
2022-01-12 14:23:07 +00:00
continue
2022-01-03 12:37:09 +00:00
item_domain = link.split('://')[1]
if '/' in item_domain:
item_domain = item_domain.split('/')[0]
2024-02-11 13:42:15 +00:00
if is_blocked_domain(base_dir, item_domain, None, None):
continue
2022-01-03 12:37:09 +00:00
pub_date = atom_item.split('<updated>')[1]
pub_date = pub_date.split('</updated>')[0]
2020-11-22 18:14:40 +00:00
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
2024-07-15 19:44:46 +00:00
if not pub_date_str:
continue
if not _valid_feed_date(pub_date_str):
continue
post_filename = ''
2024-12-23 17:45:20 +00:00
votes_status: list[str] = []
2024-07-15 19:44:46 +00:00
podcast_properties = \
xml_podcast_to_dict(base_dir, atom_item, xml_str)
if podcast_properties:
podcast_properties['linkMimeType'] = link_mime_type
_add_newswire_dict_entry(base_dir,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated,
mirrored, [], 32, session, debug,
podcast_properties, system_language,
fediverse_handle, extra_links)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
2022-01-03 12:37:09 +00:00
if post_ctr > 0:
2022-01-12 14:31:04 +00:00
print('Added ' + str(post_ctr) + ' atom feed items to newswire')
2021-02-12 11:28:00 +00:00
return result
2024-02-19 14:26:45 +00:00
def _json_feed_v1to_dict(base_dir: str, xml_str: str,
2021-12-29 21:55:09 +00:00
moderated: bool, mirrored: bool,
2022-01-03 12:37:09 +00:00
max_posts_per_source: int,
2021-12-29 21:55:09 +00:00
max_feed_item_size_kb: int,
2022-09-25 17:26:11 +00:00
session, debug: bool,
system_language: str) -> {}:
2021-02-12 11:28:00 +00:00
"""Converts a json feed string to a dictionary
2021-02-12 11:46:26 +00:00
See https://jsonfeed.org/version/1.1
2021-02-12 11:28:00 +00:00
"""
2022-01-03 12:37:09 +00:00
if '"items"' not in xml_str:
2021-02-12 11:28:00 +00:00
return {}
try:
2022-01-03 12:37:09 +00:00
feed_json = json.loads(xml_str)
2021-02-12 11:28:00 +00:00
except BaseException:
2022-01-03 12:37:09 +00:00
print('EX: _json_feed_v1to_dict unable to load json ' + str(xml_str))
2021-02-12 11:28:00 +00:00
return {}
2022-01-03 12:37:09 +00:00
max_bytes = max_feed_item_size_kb * 1024
if not feed_json.get('version'):
2021-02-12 11:28:00 +00:00
return {}
2022-01-03 12:37:09 +00:00
if not feed_json['version'].startswith('https://jsonfeed.org/version/1'):
2021-02-12 11:28:00 +00:00
return {}
2022-01-03 12:37:09 +00:00
if not feed_json.get('items'):
2021-02-12 11:28:00 +00:00
return {}
2022-01-03 12:37:09 +00:00
if not isinstance(feed_json['items'], list):
2021-02-12 11:28:00 +00:00
return {}
2022-01-03 12:37:09 +00:00
post_ctr = 0
2021-02-12 11:47:49 +00:00
result = {}
2022-01-03 12:37:09 +00:00
for json_feed_item in feed_json['items']:
if not json_feed_item:
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
if not isinstance(json_feed_item, dict):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
if not json_feed_item.get('url'):
2021-02-12 11:28:00 +00:00
continue
2023-12-09 14:18:24 +00:00
url_str = get_url_from_post(json_feed_item['url'])
if not url_str:
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
if not json_feed_item.get('date_published'):
if not json_feed_item.get('date_modified'):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
if not json_feed_item.get('content_text'):
if not json_feed_item.get('content_html'):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
if json_feed_item.get('content_html'):
if not isinstance(json_feed_item['content_html'], str):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
title = remove_html(json_feed_item['content_html'])
2021-02-12 11:28:00 +00:00
else:
2022-01-03 12:37:09 +00:00
if not isinstance(json_feed_item['content_text'], str):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
title = remove_html(json_feed_item['content_text'])
if len(title) > max_bytes:
2021-02-12 11:28:00 +00:00
print('WARN: json feed title is too long')
continue
description = ''
2022-01-03 12:37:09 +00:00
if json_feed_item.get('description'):
if not isinstance(json_feed_item['description'], str):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
description = remove_html(json_feed_item['description'])
if len(description) > max_bytes:
2021-02-12 11:28:00 +00:00
print('WARN: json feed description is too long')
continue
2022-01-03 12:37:09 +00:00
if json_feed_item.get('tags'):
if isinstance(json_feed_item['tags'], list):
for tag_name in json_feed_item['tags']:
if not isinstance(tag_name, str):
2021-02-12 12:09:16 +00:00
continue
2022-01-03 12:37:09 +00:00
if ' ' in tag_name:
2021-02-12 12:09:16 +00:00
continue
2022-01-03 12:37:09 +00:00
if not tag_name.startswith('#'):
tag_name = '#' + tag_name
if tag_name not in description:
description += ' ' + tag_name
2021-02-12 12:09:16 +00:00
2023-12-09 14:18:24 +00:00
link = remove_html(url_str)
2021-02-12 11:28:00 +00:00
if '://' not in link:
continue
2022-01-03 12:37:09 +00:00
if len(link) > max_bytes:
2021-02-12 11:28:00 +00:00
print('WARN: json feed link is too long')
continue
2022-01-03 12:37:09 +00:00
item_domain = link.split('://')[1]
if '/' in item_domain:
item_domain = item_domain.split('/')[0]
2024-02-11 13:42:15 +00:00
if is_blocked_domain(base_dir, item_domain, None, None):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
if json_feed_item.get('date_published'):
if not isinstance(json_feed_item['date_published'], str):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
pub_date = json_feed_item['date_published']
2021-02-12 11:28:00 +00:00
else:
2022-01-03 12:37:09 +00:00
if not isinstance(json_feed_item['date_modified'], str):
2021-02-12 11:28:00 +00:00
continue
2022-01-03 12:37:09 +00:00
pub_date = json_feed_item['date_modified']
2021-02-12 11:28:00 +00:00
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
2024-07-15 19:44:46 +00:00
if not pub_date_str:
continue
if not _valid_feed_date(pub_date_str):
continue
post_filename = ''
2024-12-23 17:45:20 +00:00
votes_status: list[str] = []
2024-07-15 19:44:46 +00:00
fediverse_handle = ''
2024-12-23 17:45:20 +00:00
extra_links: list[str] = []
2024-07-15 19:44:46 +00:00
_add_newswire_dict_entry(base_dir,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated,
mirrored, [], 32, session, debug,
None, system_language,
fediverse_handle, extra_links)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
2022-01-03 12:37:09 +00:00
if post_ctr > 0:
print('Added ' + str(post_ctr) +
2021-02-12 11:28:00 +00:00
' json feed items to newswire')
2020-10-10 12:24:14 +00:00
return result
2024-02-06 19:48:11 +00:00
def _atom_feed_yt_to_dict(base_dir: str, xml_str: str,
2021-12-29 21:55:09 +00:00
moderated: bool, mirrored: bool,
2022-01-03 12:37:09 +00:00
max_posts_per_source: int,
2021-12-29 21:55:09 +00:00
max_feed_item_size_kb: int,
2022-09-25 17:26:11 +00:00
session, debug: bool,
system_language: str) -> {}:
2020-11-22 10:34:42 +00:00
"""Converts an atom-style YouTube feed string to a dictionary
"""
2022-01-03 12:37:09 +00:00
if '<entry>' not in xml_str:
2020-11-22 10:34:42 +00:00
return {}
2024-02-11 13:42:15 +00:00
if is_blocked_domain(base_dir, 'www.youtube.com', None, None):
2020-11-22 10:34:42 +00:00
return {}
result = {}
2022-01-03 12:37:09 +00:00
atom_items = xml_str.split('<entry>')
post_ctr = 0
max_bytes = max_feed_item_size_kb * 1024
2022-01-13 22:57:16 +00:00
first_entry = True
2022-01-03 12:37:09 +00:00
for atom_item in atom_items:
2022-01-13 22:57:16 +00:00
if first_entry:
first_entry = False
continue
2022-01-03 12:37:09 +00:00
if not atom_item:
2020-11-27 22:43:34 +00:00
continue
2022-01-03 12:37:09 +00:00
if not atom_item.strip():
2020-11-27 22:43:34 +00:00
continue
2022-01-03 12:37:09 +00:00
if len(atom_item) > max_bytes:
2020-11-22 10:34:42 +00:00
print('WARN: atom feed item is too big')
continue
2022-01-03 12:37:09 +00:00
if '<title>' not in atom_item:
2020-11-22 10:34:42 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</title>' not in atom_item:
2020-11-22 10:34:42 +00:00
continue
2022-01-03 12:37:09 +00:00
if '<published>' not in atom_item:
2020-11-22 10:34:42 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</published>' not in atom_item:
2020-11-22 10:34:42 +00:00
continue
2022-01-03 12:37:09 +00:00
if '<yt:videoId>' not in atom_item:
2020-11-22 10:34:42 +00:00
continue
2022-01-03 12:37:09 +00:00
if '</yt:videoId>' not in atom_item:
2020-11-22 10:34:42 +00:00
continue
2022-01-03 12:37:09 +00:00
title = atom_item.split('<title>')[1]
2021-12-29 21:55:09 +00:00
title = _remove_cdata(title.split('</title>')[0])
title = remove_script(title, None, None, None)
2023-01-02 10:24:35 +00:00
title = unescaped_text(title)
2020-11-22 10:34:42 +00:00
description = ''
2022-01-03 12:37:09 +00:00
if '<media:description>' in atom_item and \
'</media:description>' in atom_item:
description = atom_item.split('<media:description>')[1]
2020-11-22 10:34:42 +00:00
description = description.split('</media:description>')[0]
2023-01-02 10:24:35 +00:00
description = unescaped_text(description)
description = remove_script(description, None, None, None)
2021-12-27 15:43:22 +00:00
description = remove_html(description)
2022-01-03 12:37:09 +00:00
elif '<summary>' in atom_item and '</summary>' in atom_item:
description = atom_item.split('<summary>')[1]
2020-11-22 10:34:42 +00:00
description = description.split('</summary>')[0]
2023-01-02 10:24:35 +00:00
description = unescaped_text(description)
description = remove_script(description, None, None, None)
2021-12-27 15:43:22 +00:00
description = remove_html(description)
2023-06-27 21:27:51 +00:00
elif '<content' in atom_item and '</content>' in atom_item:
description = atom_item.split('<content', 1)[1]
description = description.split('>', 1)[1]
description = description.split('</content>')[0]
description = unescaped_text(description)
description = remove_script(description, None, None, None)
2023-06-27 21:27:51 +00:00
description = remove_html(description)
2022-01-14 17:40:42 +00:00
link, _ = get_link_from_rss_item(atom_item, None, None)
2022-01-14 17:40:42 +00:00
if not link:
link = atom_item.split('<yt:videoId>')[1]
link = link.split('</yt:videoId>')[0]
link = 'https://www.youtube.com/watch?v=' + link.strip()
if not link:
continue
2022-01-03 12:37:09 +00:00
pub_date = atom_item.split('<published>')[1]
pub_date = pub_date.split('</published>')[0]
2020-11-22 18:14:40 +00:00
unique_string_identifier = title + ' ' + link
pub_date_str = parse_feed_date(pub_date, unique_string_identifier)
2024-07-15 19:44:46 +00:00
if not pub_date_str:
continue
if not _valid_feed_date(pub_date_str):
continue
post_filename = ''
2024-12-23 17:45:20 +00:00
votes_status: list[str] = []
2024-07-15 19:44:46 +00:00
podcast_properties = \
xml_podcast_to_dict(base_dir, atom_item, xml_str)
if podcast_properties:
podcast_properties['linkMimeType'] = 'video/youtube'
fediverse_handle = ''
2024-12-23 17:45:20 +00:00
extra_links: list[str] = []
2024-07-15 19:44:46 +00:00
_add_newswire_dict_entry(base_dir,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated, mirrored,
[], 32, session, debug,
podcast_properties, system_language,
fediverse_handle, extra_links)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
2022-01-03 12:37:09 +00:00
if post_ctr > 0:
print('Added ' + str(post_ctr) + ' YouTube feed items to newswire')
2020-11-22 10:34:42 +00:00
return result
2022-01-03 12:37:09 +00:00
def _xml_str_to_dict(base_dir: str, domain: str, xml_str: str,
2021-12-29 21:55:09 +00:00
moderated: bool, mirrored: bool,
2022-01-03 12:37:09 +00:00
max_posts_per_source: int,
2021-12-29 21:55:09 +00:00
max_feed_item_size_kb: int,
2022-06-09 14:58:47 +00:00
max_categories_feed_item_size_kb: int,
session, debug: bool,
2022-09-25 17:26:11 +00:00
preferred_podcast_formats: [],
system_language: str) -> {}:
2020-10-04 09:51:12 +00:00
"""Converts an xml string to a dictionary
"""
2022-01-03 12:37:09 +00:00
if '<yt:videoId>' in xml_str and '<yt:channelId>' in xml_str:
2020-11-22 16:10:58 +00:00
print('YouTube feed: reading')
2024-02-06 19:48:11 +00:00
return _atom_feed_yt_to_dict(base_dir,
2022-01-03 12:37:09 +00:00
xml_str, moderated, mirrored,
max_posts_per_source,
max_feed_item_size_kb,
2022-09-25 17:26:11 +00:00
session, debug,
system_language)
2022-01-10 22:30:06 +00:00
if 'rss version="2.0"' in xml_str:
2021-12-29 21:55:09 +00:00
return _xml2str_to_dict(base_dir, domain,
2022-01-03 12:37:09 +00:00
xml_str, moderated, mirrored,
max_posts_per_source, max_feed_item_size_kb,
2022-06-09 14:58:47 +00:00
max_categories_feed_item_size_kb,
session, debug,
2022-09-25 17:26:11 +00:00
preferred_podcast_formats,
system_language)
2022-01-10 22:30:06 +00:00
if '<?xml version="1.0"' in xml_str:
2021-12-29 21:55:09 +00:00
return _xml1str_to_dict(base_dir, domain,
2022-01-03 12:37:09 +00:00
xml_str, moderated, mirrored,
max_posts_per_source, max_feed_item_size_kb,
2022-06-09 14:58:47 +00:00
max_categories_feed_item_size_kb,
2022-09-25 17:26:11 +00:00
session, debug, preferred_podcast_formats,
system_language)
2022-01-10 22:30:06 +00:00
if 'xmlns="http://www.w3.org/2005/Atom"' in xml_str:
2021-12-29 21:55:09 +00:00
return _atom_feed_to_dict(base_dir, domain,
2022-01-03 12:37:09 +00:00
xml_str, moderated, mirrored,
max_posts_per_source, max_feed_item_size_kb,
2022-09-25 17:26:11 +00:00
session, debug, preferred_podcast_formats,
system_language)
2022-01-10 22:30:06 +00:00
if 'https://jsonfeed.org/version/1' in xml_str:
2024-02-19 14:26:45 +00:00
return _json_feed_v1to_dict(base_dir,
2022-01-03 12:37:09 +00:00
xml_str, moderated, mirrored,
max_posts_per_source,
max_feed_item_size_kb,
2022-09-25 17:26:11 +00:00
session, debug, system_language)
2020-10-04 09:51:12 +00:00
return {}
2022-01-03 12:37:09 +00:00
def _yt_channel_to_atom_feed(url: str) -> str:
"""Converts a YouTube channel url into an atom feed url
"""
if 'youtube.com/channel/' not in url:
return url
2022-01-03 12:37:09 +00:00
channel_id = url.split('youtube.com/channel/')[1].strip()
channel_url = \
'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id
print('YouTube feed: ' + channel_url)
return channel_url
2021-12-29 21:55:09 +00:00
def get_rss(base_dir: str, domain: str, session, url: str,
            moderated: bool, mirrored: bool,
            max_posts_per_source: int, max_feed_size_kb: int,
            max_feed_item_size_kb: int,
            max_categories_feed_item_size_kb: int, debug: bool,
            preferred_podcast_formats: [],
            timeout_sec: int, system_language: str) -> {}:
    """Returns an RSS url as a dict

    The feed is downloaded using the given session and is then
    converted by _xml_str_to_dict.
    None is returned if the url is not a string, if there is no session,
    if the download fails, or if the feed is max_feed_size_kb or larger
    or contains invalid characters.
    """
    if not isinstance(url, str):
        print('url: ' + str(url))
        print('ERROR: get_rss url should be a string')
        return None
    if not session:
        # without a session the download can't be attempted
        print('WARN: no session specified for get_rss')
        return None
    session_headers = {
        'Accept': 'text/xml, application/xml; charset=UTF-8',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'
    }
    session_params = {}
    url = _yt_channel_to_atom_feed(url)
    try:
        result = \
            session.get(url, headers=session_headers,
                        params=session_params,
                        timeout=timeout_sec,
                        allow_redirects=True)
        if not result:
            print('WARN: no result returned for feed ' + url)
        else:
            result_str = remove_zero_length_strings(result.text)
            if int(len(result_str) / 1024) >= max_feed_size_kb:
                print('WARN: feed is too large: ' + url)
            elif contains_invalid_chars(result_str):
                print('WARN: feed contains invalid characters: ' + url)
            else:
                return _xml_str_to_dict(base_dir, domain, result_str,
                                        moderated, mirrored,
                                        max_posts_per_source,
                                        max_feed_item_size_kb,
                                        max_categories_feed_item_size_kb,
                                        session, debug,
                                        preferred_podcast_formats,
                                        system_language)
    except (requests.exceptions.RequestException, ValueError) as ex:
        print('WARN: get_rss failed\nurl: ' + str(url) + ', ' +
              'headers: ' + str(session_headers) + ', ' +
              'params: ' + str(session_params) + ', ' + str(ex))
    except SocketError as ex:
        if ex.errno == errno.ECONNRESET:
            print('WARN: connection was reset during get_rss ' + str(ex))
        else:
            print('WARN: get_rss, ' + str(ex))
    return None
2024-03-02 21:54:00 +00:00
def get_rss_from_dict(newswire: {},
                      http_prefix: str, domain_full: str,
                      translate: {}) -> str:
    """Returns an rss feed from the current newswire dict.
    This allows other instances to subscribe to the same newswire

    The newswire keys are published date strings and the values are
    lists of fields, where field 0 is used as the title, field 1 as
    the link and field 4 as the text from which the description is made.
    Entries whose date can't be converted are skipped with a warning.
    An empty string is returned if the newswire is empty.
    """
    if not newswire:
        return ''
    rss_parts = [
        rss2header(http_prefix,
                   None, domain_full,
                   'Newswire', translate)
    ]
    for published, fields in newswire.items():
        pub_date = None
        try:
            if '+00:00' in published:
                published = published.replace('+00:00', 'Z').strip()
                published = published.replace(' ', 'T')
            else:
                published_with_offset = \
                    date_from_string_format(published,
                                            ["%Y-%m-%d %H:%M:%S%z"])
                if published_with_offset:
                    published = \
                        published_with_offset.strftime("%Y-%m-%dT%H:%M:%S%z")
            pub_date = date_from_string_format(published,
                                               ["%Y-%m-%dT%H:%M:%S%z"])
        except Exception as ex:
            # one entry with a bad date should not stop the whole feed
            print('WARN: Unable to convert date ' +
                  str(published) + ' ' + str(ex))
            continue
        if not pub_date:
            print('WARN: Unable to convert date ' + str(published))
            continue

        description = remove_html(first_paragraph_from_string(fields[4]))
        url = fields[1]
        # relative links are made absolute using the instance domain
        if '://' not in url:
            if domain_full not in url:
                url = http_prefix + '://' + domain_full + url
        rss_date_str = pub_date.strftime("%a, %d %b %Y %H:%M:%S UT")

        rss_parts.append('<item>\n' +
                         ' <title>' + escape_text(fields[0]) + '</title>\n')
        rss_parts.append(' <description>' + escape_text(description) +
                         '</description>\n')
        rss_parts.append(' <link>' + url + '</link>\n')
        rss_parts.append(' <pubDate>' + rss_date_str + '</pubDate>\n' +
                         '</item>\n')
    rss_parts.append(rss2footer())
    return ''.join(rss_parts)
2020-10-04 12:29:07 +00:00
2021-12-29 21:55:09 +00:00
def _is_newswire_blog_post(post_json_object: {}) -> bool:
"""Is the given object a blog post?
2020-10-25 10:47:39 +00:00
There isn't any difference between a blog post and a newswire blog post
but we may here need to check for different properties than
2021-12-28 13:49:44 +00:00
is_blog_post does
"""
2021-12-25 22:09:19 +00:00
if not post_json_object:
return False
2021-12-26 10:57:03 +00:00
if not has_object_dict(post_json_object):
return False
2021-12-25 22:09:19 +00:00
if post_json_object['object'].get('summary') and \
post_json_object['object'].get('url') and \
post_json_object['object'].get('content') and \
post_json_object['object'].get('published'):
2021-12-28 14:41:10 +00:00
return is_public_post(post_json_object)
return False
2021-12-29 21:55:09 +00:00
def _get_hashtags_from_post(post_json_object: {}) -> []:
    """Returns the names of the hashtags attached to a post,
    in the order in which they appear and without duplicates
    """
    if not has_object_dict(post_json_object):
        return []
    post_tags = post_json_object['object'].get('tag')
    if not post_tags or not isinstance(post_tags, list):
        return []
    hashtags: list[str] = []
    for tag_entry in post_tags:
        # only dict entries having both a name and a type are considered
        if not isinstance(tag_entry, dict):
            continue
        if not (tag_entry.get('name') and tag_entry.get('type')):
            continue
        if tag_entry['type'] != 'Hashtag':
            continue
        hashtag_name = tag_entry['name']
        if hashtag_name in hashtags:
            continue
        hashtags.append(hashtag_name)
    return hashtags
2021-12-29 21:55:09 +00:00
def _add_account_blogs_to_newswire(base_dir: str, nickname: str, domain: str,
                                   newswire: {},
                                   max_blogs_per_account: int,
                                   index_filename: str,
                                   max_tags: int, system_language: str,
                                   session, debug: bool) -> None:
    """Adds blogs for the given account to the newswire

    Reads post filenames from the given index file, one per line, and
    adds each one which is a newswire blog post to the newswire dict.
    Every index entry counts towards max_blogs_per_account, whether or
    not it resulted in a newswire entry. If the index can't be read
    then a message is printed and the function returns
    """
    if not os.path.isfile(index_filename):
        return

    # local blog entries are unmoderated by default, but they can
    # potentially be moderated if this file exists for the account
    account_path = acct_dir(base_dir, nickname, domain)
    moderated = os.path.isfile(account_path + '/.newswiremoderated')

    try:
        with open(index_filename, 'r', encoding='utf-8') as fp_index:
            for entries_seen, index_line in enumerate(fp_index, start=1):
                # if this is a full path then remove the directories
                post_filename = index_line.split('/')[-1]

                # filename of the post without any extension or path
                # This should also correspond to any index entry in
                # the posts cache
                post_url = \
                    remove_eol(post_filename).replace('.json', '').strip()

                # read the post from file
                full_post_filename = \
                    locate_post(base_dir, nickname,
                                domain, post_url, False)
                if not full_post_filename:
                    print('Unable to locate post for newswire ' + post_url)
                else:
                    post_json_object = load_json(full_post_filename)
                    if _is_newswire_blog_post(post_json_object):
                        post_obj = post_json_object['object']
                        published = post_obj['published']
                        published = published.replace('T', ' ')
                        published = published.replace('Z', '+00:00')

                        # any votes are stored alongside the post
                        votes: list[str] = []
                        votes_filename = full_post_filename + '.votes'
                        if os.path.isfile(votes_filename):
                            votes = load_json(votes_filename)

                        content = \
                            get_base_content_from_post(post_json_object,
                                                       system_language)
                        description = \
                            remove_html(first_paragraph_from_string(content))
                        tags_from_post = \
                            _get_hashtags_from_post(post_json_object)
                        summary = post_obj['summary']
                        url_str = get_url_from_post(post_obj['url'])
                        post_link = remove_html(url_str)
                        fediverse_handle = ''
                        extra_links: list[str] = []
                        _add_newswire_dict_entry(base_dir,
                                                 newswire, published,
                                                 summary, post_link,
                                                 votes, full_post_filename,
                                                 description, moderated,
                                                 False,
                                                 tags_from_post,
                                                 max_tags, session, debug,
                                                 None, system_language,
                                                 fediverse_handle,
                                                 extra_links)

                # stop once enough index entries have been examined
                if entries_seen >= max_blogs_per_account:
                    break
    except OSError as exc:
        print('EX: _add_account_blogs_to_newswire unable to read ' +
              index_filename + ' ' + str(exc))
2020-10-05 11:11:48 +00:00
2021-12-29 21:55:09 +00:00
def _add_blogs_to_newswire(base_dir: str, domain: str, newswire: {},
                           max_blogs_per_account: int,
                           max_tags: int, system_language: str,
                           session, debug: bool) -> None:
    """Adds blogs from each user account into the newswire

    Only the top level of the data directory is examined. Accounts
    which are suspended, or which have a .nonewswire file, are skipped.
    The domain used for each account is taken from its handle rather
    than from the domain argument.
    Afterwards the moderation queue file is saved, or removed if there
    is nothing to moderate
    """
    accounts_dir = data_dir(base_dir)

    # go through each account within the top level of the data directory
    top_level_dirs = next(os.walk(accounts_dir), ('', [], []))[1]
    for handle in top_level_dirs:
        if not is_account_dir(handle):
            continue

        nickname = handle.split('@')[0]

        # has this account been suspended?
        if is_suspended(base_dir, nickname):
            continue

        # has this account opted out of the newswire?
        opt_out_filename = acct_handle_dir(base_dir, handle) + '/.nonewswire'
        if os.path.isfile(opt_out_filename):
            continue

        # is there a blogs timeline for this account?
        blogs_index = os.path.join(accounts_dir, handle) + '/tlblogs.index'
        if not os.path.isfile(blogs_index):
            continue

        account_domain = handle.split('@')[1]
        _add_account_blogs_to_newswire(base_dir, nickname, account_domain,
                                       newswire, max_blogs_per_account,
                                       blogs_index, max_tags,
                                       system_language, session,
                                       debug)

    # NOTE: nothing within this function adds entries to the
    # moderation dict
    moderation_dict = {}

    # sort the moderation dict into chronological order, latest first
    moderation_items = sorted(moderation_dict.items(), reverse=True)
    sorted_moderation_dict = OrderedDict(moderation_items)

    # save the moderation queue details for later display
    newswire_moderation_filename = \
        data_dir(base_dir) + '/newswiremoderation.txt'
    if sorted_moderation_dict:
        save_json(sorted_moderation_dict, newswire_moderation_filename)
    elif os.path.isfile(newswire_moderation_filename):
        # remove the file if there is nothing to moderate
        try:
            os.remove(newswire_moderation_filename)
        except OSError:
            print('EX: _add_blogs_to_newswire unable to delete ' +
                  str(newswire_moderation_filename))
2020-10-05 11:11:48 +00:00
2021-12-29 21:55:09 +00:00
def get_dict_from_newswire(session, base_dir: str, domain: str,
                           max_posts_per_source: int, max_feed_size_kb: int,
                           max_tags: int, max_feed_item_size_kb: int,
                           max_newswire_posts: int,
                           max_categories_feed_item_size_kb: int,
                           system_language: str, debug: bool,
                           preferred_podcast_formats: [],
                           timeout_sec: int) -> {}:
    """Gets rss feeds as a dictionary from newswire file

    The newswire file is newswire.txt within the data directory, with
    one feed url per line. Lines without '://' or beginning with '#'
    are ignored. A '*' within a line marks the feed as moderated and
    a '!' marks it as mirrored, and both characters are removed from
    the url before it is fetched.
    Blogs from local accounts are then added, and the result is sorted
    by key in reverse order (latest first) and cut down to at most
    max_newswire_posts entries.
    max_posts_per_source limits the items taken from each feed and the
    blog index entries examined per account. If it is not a positive
    integer then 5 is used.
    Returns an empty dict if the newswire file does not exist
    """
    subscriptions_filename = data_dir(base_dir) + '/newswire.txt'
    if not os.path.isfile(subscriptions_filename):
        return {}

    # Use the limit supplied by the caller. Previously this was always
    # overwritten with 5, which is now only the fallback value
    if not isinstance(max_posts_per_source, int) or \
       max_posts_per_source < 1:
        max_posts_per_source = 5

    # read the list of rss feeds
    rss_feed: list[str] = []
    try:
        with open(subscriptions_filename, 'r', encoding='utf-8') as fp_sub:
            rss_feed = fp_sub.readlines()
    except OSError:
        # continue with no feeds, so that local blogs are still added
        print('EX: get_dict_from_newswire unable to read ' +
              subscriptions_filename)

    result = {}
    for url in rss_feed:
        url = url.strip()

        # Does this contain a url?
        if '://' not in url:
            continue

        # is this a comment?
        if url.startswith('#'):
            continue

        # should this feed be moderated?
        moderated = '*' in url
        if moderated:
            url = url.replace('*', '').strip()

        # should this feed content be mirrored?
        mirrored = '!' in url
        if mirrored:
            url = url.replace('!', '').strip()

        items_list = get_rss(base_dir, domain, session, url,
                             moderated, mirrored,
                             max_posts_per_source, max_feed_size_kb,
                             max_feed_item_size_kb,
                             max_categories_feed_item_size_kb, debug,
                             preferred_podcast_formats,
                             timeout_sec, system_language)
        if items_list:
            # later feeds overwrite earlier items having the same key
            result.update(items_list)
        # pause after each feed
        time.sleep(4)

    # add blogs from each user account
    _add_blogs_to_newswire(base_dir, domain, result,
                           max_posts_per_source, max_tags, system_language,
                           session, debug)

    # sort into chronological order, latest first
    sorted_result = OrderedDict(sorted(result.items(), reverse=True))

    # are there too many posts? If so then remove the oldest ones,
    # which are at the end of the ordered dict
    while sorted_result and len(sorted_result) > max_newswire_posts:
        sorted_result.popitem(last=True)
    return sorted_result