From 786d53622bc3ea8a6032acbbb50ad1f43023fd99 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 09:17:33 +0000
Subject: [PATCH 01/16] Extra characters removal
---
webapp_podcast.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/webapp_podcast.py b/webapp_podcast.py
index b4d3ae48e..66031a2e0 100644
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@@ -211,7 +211,7 @@ def html_podcast_episode(css_cache: {}, translate: {},
html.unescape(urllib.parse.unquote_plus(newswire_item[4]))
podcast_description = remove_html(podcast_description)
if podcast_description:
- remove_chars = ('Œ', 'â€', 'ğŸ', '�')
+ remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]')
for remchar in remove_chars:
podcast_description = podcast_description.replace(remchar, '')
podcast_str += '' + podcast_description + '
\n'
From bdf1c77408adede77e615556271acffce4af5a26 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 09:25:41 +0000
Subject: [PATCH 02/16] Regenerate links within podcast descriptions
---
webapp_podcast.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/webapp_podcast.py b/webapp_podcast.py
index 66031a2e0..8b12520d6 100644
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@@ -14,6 +14,7 @@ from shutil import copyfile
from utils import get_config_param
from utils import remove_html
from media import path_is_audio
+from content import add_web_links
from webapp_utils import get_broken_link_substitute
from webapp_utils import html_header_with_external_style
from webapp_utils import html_footer
@@ -209,11 +210,15 @@ def html_podcast_episode(css_cache: {}, translate: {},
if newswire_item[4]:
podcast_description = \
html.unescape(urllib.parse.unquote_plus(newswire_item[4]))
+ # Why remove html? Potentially podcast descriptions could contain
+ # arbitrary html with attack scripts, etc
podcast_description = remove_html(podcast_description)
if podcast_description:
remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]')
for remchar in remove_chars:
podcast_description = podcast_description.replace(remchar, '')
+ # recreate any url links safely
+ podcast_description = add_web_links(podcast_description)
podcast_str += '' + podcast_description + '
\n'
# donate button
From 9a0185ef3ca98d94c12dfc4ddd5ac78b5ef4dcd6 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 10:20:37 +0000
Subject: [PATCH 03/16] Unit test for safe html
---
content.py | 16 ++++++++++++++++
tests.py | 26 ++++++++++++++++++++++++++
webapp_podcast.py | 11 ++---------
3 files changed, 44 insertions(+), 9 deletions(-)
diff --git a/content.py b/content.py
index 2394e21a1..2135006b8 100644
--- a/content.py
+++ b/content.py
@@ -486,6 +486,22 @@ def add_web_links(content: str) -> str:
return content
+def safe_web_text(arbitrary_html: str) -> str:
+ """Turns arbitrary html into something safe.
+ So if the arbitrary html contains attack scripts those will be removed
+ """
+ # first remove the markup, so that we have something safe
+ safe_text = remove_html(arbitrary_html)
+ if not safe_text:
+ return ''
+ # remove any spurious characters found in podcast descriptions
+ remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]')
+ for remchar in remove_chars:
+ safe_text = safe_text.replace(remchar, '')
+ # recreate any url links safely
+ return add_web_links(safe_text)
+
+
def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
replace_hashtags: {}, post_hashtags: {}) -> bool:
"""Detects hashtags and adds them to the replacements dict
diff --git a/tests.py b/tests.py
index ea7349a6c..07baf487c 100644
--- a/tests.py
+++ b/tests.py
@@ -128,6 +128,7 @@ from inbox import json_post_allows_comments
from inbox import valid_inbox
from inbox import valid_inbox_filenames
from categories import guess_hashtag_category
+from content import safe_web_text
from content import words_similarity
from content import get_price_from_string
from content import limit_repeated_words
@@ -6488,6 +6489,30 @@ def _test_get_link_from_rss_item() -> None:
assert link.startswith('https://test.link/creativecommons')
+def _test_safe_webtext() -> None:
+ print('test_safe_webtext')
+ web_text = 'Some text including a link https://some.site/some-path
'
+ expected_text = 'Some text including a link ' + \
+ '' not in safe_text
+ assert '
' not in safe_text
+
+ web_text = 'Some text with '
+ expected_text = 'Some text with some script'
+ safe_text = safe_web_text(web_text)
+ if expected_text != safe_text:
+ print('Original html: ' + web_text)
+ print('Expected html: ' + expected_text)
+ print('Actual html: ' + safe_text)
+ assert expected_text == safe_text
+
+
def run_all_tests():
base_dir = os.getcwd()
print('Running tests...')
@@ -6504,6 +6529,7 @@ def run_all_tests():
'message_json', 'liked_post_json'])
_test_checkbox_names()
_test_functions()
+ _test_safe_webtext()
_test_get_link_from_rss_item()
_test_xml_podcast_dict()
_test_get_actor_from_in_reply_to()
diff --git a/webapp_podcast.py b/webapp_podcast.py
index 8b12520d6..435400772 100644
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@@ -14,7 +14,7 @@ from shutil import copyfile
from utils import get_config_param
from utils import remove_html
from media import path_is_audio
-from content import add_web_links
+from content import safe_web_text
from webapp_utils import get_broken_link_substitute
from webapp_utils import html_header_with_external_style
from webapp_utils import html_footer
@@ -210,15 +210,8 @@ def html_podcast_episode(css_cache: {}, translate: {},
if newswire_item[4]:
podcast_description = \
html.unescape(urllib.parse.unquote_plus(newswire_item[4]))
- # Why remove html? Potentially podcast descriptions could contain
- # arbitrary html with attack scripts, etc
- podcast_description = remove_html(podcast_description)
+ podcast_description = safe_web_text(podcast_description)
if podcast_description:
- remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]')
- for remchar in remove_chars:
- podcast_description = podcast_description.replace(remchar, '')
- # recreate any url links safely
- podcast_description = add_web_links(podcast_description)
podcast_str += '' + podcast_description + '
\n'
# donate button
From c395261523bef0dba18f40359af5839e40736283 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 12:53:29 +0000
Subject: [PATCH 04/16] Snake case
---
follow.py | 48 ++++++++++++++++++++++++------------------------
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/follow.py b/follow.py
index 70e7c0b71..d65636899 100644
--- a/follow.py
+++ b/follow.py
@@ -999,20 +999,20 @@ def send_follow_requestViaServer(base_dir: str, session,
# get the actor inbox for the To handle
origin_domain = from_domain
- (inboxUrl, _, _, fromPersonId, sharedInbox, avatarUrl,
- displayName, _) = get_person_box(signing_priv_key_pem, origin_domain,
- base_dir, session, wf_request,
- person_cache,
- project_version, http_prefix,
- from_nickname,
- from_domain, post_to_box, 52025)
+ (inbox_url, _, _, from_person_id, _, _,
+ _, _) = get_person_box(signing_priv_key_pem, origin_domain,
+ base_dir, session, wf_request,
+ person_cache,
+ project_version, http_prefix,
+ from_nickname,
+ from_domain, post_to_box, 52025)
- if not inboxUrl:
+ if not inbox_url:
if debug:
print('DEBUG: follow request no ' + post_to_box +
' was found for ' + handle)
return 3
- if not fromPersonId:
+ if not from_person_id:
if debug:
print('DEBUG: follow request no actor was found for ' + handle)
return 4
@@ -1026,10 +1026,10 @@ def send_follow_requestViaServer(base_dir: str, session,
}
post_result = \
post_json(http_prefix, from_domain_full,
- session, new_follow_json, [], inboxUrl, headers, 3, True)
+ session, new_follow_json, [], inbox_url, headers, 3, True)
if not post_result:
if debug:
- print('DEBUG: POST follow request failed for c2s to ' + inboxUrl)
+ print('DEBUG: POST follow request failed for c2s to ' + inbox_url)
return 5
if debug:
@@ -1095,22 +1095,22 @@ def send_unfollow_request_via_server(base_dir: str, session,
# get the actor inbox for the To handle
origin_domain = from_domain
- (inboxUrl, pubKeyId, pubKey, fromPersonId, sharedInbox, avatarUrl,
- displayName, _) = get_person_box(signing_priv_key_pem,
- origin_domain,
- base_dir, session,
- wf_request, person_cache,
- project_version, http_prefix,
- from_nickname,
- from_domain, post_to_box,
- 76536)
+ (inbox_url, _, _, from_person_id, _, _,
+ _, _) = get_person_box(signing_priv_key_pem,
+ origin_domain,
+ base_dir, session,
+ wf_request, person_cache,
+ project_version, http_prefix,
+ from_nickname,
+ from_domain, post_to_box,
+ 76536)
- if not inboxUrl:
+ if not inbox_url:
if debug:
print('DEBUG: unfollow no ' + post_to_box +
' was found for ' + handle)
return 3
- if not fromPersonId:
+ if not from_person_id:
if debug:
print('DEBUG: unfollow no actor was found for ' + handle)
return 4
@@ -1124,10 +1124,10 @@ def send_unfollow_request_via_server(base_dir: str, session,
}
post_result = \
post_json(http_prefix, from_domain_full,
- session, unfollow_json, [], inboxUrl, headers, 3, True)
+ session, unfollow_json, [], inbox_url, headers, 3, True)
if not post_result:
if debug:
- print('DEBUG: POST unfollow failed for c2s to ' + inboxUrl)
+ print('DEBUG: POST unfollow failed for c2s to ' + inbox_url)
return 5
if debug:
From 75a21345cc353240eedbccdfe87febc230073202 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 13:15:43 +0000
Subject: [PATCH 05/16] Lower case appears to be the standard
---
newswire.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/newswire.py b/newswire.py
index 425220c5b..717b706d5 100644
--- a/newswire.py
+++ b/newswire.py
@@ -229,8 +229,8 @@ def _add_newswire_dict_entry(base_dir: str, domain: str,
# Include tags from podcast categories
if podcast_properties:
if podcast_properties.get('explicit'):
- if '#NSFW' not in post_tags:
- post_tags.append('#NSFW')
+ if '#nsfw' not in post_tags:
+ post_tags.append('#nsfw')
post_tags += podcast_properties['categories']
From 6cceef2386d032eee613ed88eebd457ecca7f35b Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 17:40:42 +0000
Subject: [PATCH 06/16] Podcast processing for youtube feeds
---
newswire.py | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/newswire.py b/newswire.py
index 717b706d5..0bd032607 100644
--- a/newswire.py
+++ b/newswire.py
@@ -501,7 +501,7 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
# get the image for the podcast, if it exists
podcast_episode_image = None
- episode_image_tags = [' {}:
episode_image = episode_image.split('"')[0]
podcast_episode_image = episode_image
break
+ elif 'url="' in episode_image:
+ episode_image = episode_image.split('url="')[1]
+ if '"' in episode_image:
+ episode_image = episode_image.split('"')[0]
+ podcast_episode_image = episode_image
+ break
else:
if '>' in episode_image:
episode_image = episode_image.split('>')[1]
@@ -1019,9 +1025,15 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
description = atom_item.split('')[1]
description = description.split('')[0]
description = remove_html(description)
- link = atom_item.split('')[1]
- link = link.split('')[0]
- link = 'https://www.youtube.com/watch?v=' + link.strip()
+
+ link, link_mime_type = get_link_from_rss_item(atom_item)
+ if not link:
+ link = atom_item.split('')[1]
+ link = link.split('')[0]
+ link = 'https://www.youtube.com/watch?v=' + link.strip()
+ if not link:
+ continue
+
pub_date = atom_item.split('')[1]
pub_date = pub_date.split('')[0]
@@ -1030,13 +1042,16 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
if _valid_feed_date(pub_date_str):
post_filename = ''
votes_status = []
+ podcast_properties = xml_podcast_to_dict(atom_item, xml_str)
+ if podcast_properties:
+ podcast_properties['linkMimeType'] = link_mime_type
_add_newswire_dict_entry(base_dir, domain,
result, pub_date_str,
title, link,
votes_status, post_filename,
description, moderated, mirrored,
[], 32, session, debug,
- None)
+ podcast_properties)
post_ctr += 1
if post_ctr >= max_posts_per_source:
break
From e539d3afc09ab6129a159f33eec8a2ef2ba25d51 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 17:55:56 +0000
Subject: [PATCH 07/16] More precise obtaining of podcast image
---
newswire.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/newswire.py b/newswire.py
index 0bd032607..2aba075f3 100644
--- a/newswire.py
+++ b/newswire.py
@@ -510,6 +510,9 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
item_str = xml_str
episode_image = item_str.split(image_tag)[1]
+ if image_tag + ' ' in item_str and '>' in episode_image:
+ episode_image = episode_image.split('>')[0]
+
if 'href="' in episode_image:
episode_image = episode_image.split('href="')[1]
if '"' in episode_image:
From c05b569ce69aadbf1d9f12d5abecab3fae81be09 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 18:05:29 +0000
Subject: [PATCH 08/16] Tidying
---
newswire.py | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/newswire.py b/newswire.py
index 2aba075f3..87ac339b3 100644
--- a/newswire.py
+++ b/newswire.py
@@ -441,7 +441,8 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
"""
if ' {}:
episode_image = episode_image.split('"')[0]
podcast_episode_image = episode_image
break
- else:
- if '>' in episode_image:
- episode_image = episode_image.split('>')[1]
- if '<' in episode_image:
- episode_image = episode_image.split('<')[0]
- if '://' in episode_image and '.' in episode_image:
- podcast_episode_image = episode_image
- break
+ elif '>' in episode_image:
+ episode_image = episode_image.split('>')[1]
+ if '<' in episode_image:
+ episode_image = episode_image.split('<')[0]
+ if '://' in episode_image and '.' in episode_image:
+ podcast_episode_image = episode_image
+ break
# get categories if they exist. These can be turned into hashtags
podcast_categories = _get_podcast_categories(xml_item, xml_str)
From e3a702efe6740a1ce09afb351ff00052276b81dd Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Fri, 14 Jan 2022 18:48:43 +0000
Subject: [PATCH 09/16] Mime type for youtube videos
---
newswire.py | 4 ++--
webapp_media.py | 20 ++++++++++----------
webapp_podcast.py | 9 ++++++++-
3 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/newswire.py b/newswire.py
index 87ac339b3..e0a27f8a9 100644
--- a/newswire.py
+++ b/newswire.py
@@ -1029,7 +1029,7 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
description = description.split('')[0]
description = remove_html(description)
- link, link_mime_type = get_link_from_rss_item(atom_item)
+ link, _ = get_link_from_rss_item(atom_item)
if not link:
link = atom_item.split('')[1]
link = link.split('')[0]
@@ -1047,7 +1047,7 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
votes_status = []
podcast_properties = xml_podcast_to_dict(atom_item, xml_str)
if podcast_properties:
- podcast_properties['linkMimeType'] = link_mime_type
+ podcast_properties['linkMimeType'] = 'video/youtube'
_add_newswire_dict_entry(base_dir, domain,
result, pub_date_str,
title, link,
diff --git a/webapp_media.py b/webapp_media.py
index 3f019243e..03cb19faa 100644
--- a/webapp_media.py
+++ b/webapp_media.py
@@ -39,8 +39,8 @@ def _add_embedded_video_from_sites(translate: {}, content: str,
url = content.split('>vimeo.com/')[1]
if '<' in url:
url = url.split('<')[0]
- content = \
- content + "\n