From 7d125c25f1c8568f2a80cdfed22c0b6a23d5cb42 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Sun, 1 Jan 2023 22:28:13 +0000
Subject: [PATCH 1/9] Filter out posts containing zero width spaces

---
 utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils.py b/utils.py
index ce258e548..4c9ad21e6 100644
--- a/utils.py
+++ b/utils.py
@@ -36,7 +36,7 @@ VALID_HASHTAG_CHARS = \
 # both incoming and outgoing.
 # Could include dubious clacks or admin dogwhistles
 INVALID_CHARACTERS = (
-    '卐', '卍', '࿕', '࿖', '࿗', '࿘', 'ϟϟ', '🏳️‍🌈🚫', '⚡⚡'
+    '卐', '卍', '࿕', '࿖', '࿗', '࿘', 'ϟϟ', '🏳️‍🌈🚫', '⚡⚡', '​'
 )
 
 INVALID_ACTOR_URL_CHARACTERS = (

From ab0ca588c98eebda31e6ea1456320cac60acf95a Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Mon, 2 Jan 2023 09:55:41 +0000
Subject: [PATCH 2/9] Escape text within blog rss feed

---
 blog.py  |  4 +++-
 utils.py | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/blog.py b/blog.py
index acf875153..84379d48a 100644
--- a/blog.py
+++ b/blog.py
@@ -35,6 +35,7 @@ from utils import load_json
 from utils import first_paragraph_from_string
 from utils import get_actor_property_url
 from utils import acct_dir
+from utils import escape_text
 from posts import create_blogs_timeline
 from newswire import rss2header
 from newswire import rss2footer
@@ -375,12 +376,13 @@ def _html_blog_post_rss2(domain: str, post_json_object: {},
            post_json_object['object'].get('published'):
         published = post_json_object['object']['published']
         pub_date = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
-        title_str = post_json_object['object']['summary']
+        title_str = escape_text(post_json_object['object']['summary'])
         rss_date_str = pub_date.strftime("%a, %d %b %Y %H:%M:%S UT")
         content = \
             get_base_content_from_post(post_json_object, system_language)
         description = first_paragraph_from_string(content)
+        description = escape_text(description)
         rss_str = ' <item>'
         rss_str += ' <title>' + title_str + '</title>'
         rss_str += ' <link>' + message_link + '</link>'

diff --git a/utils.py b/utils.py
index 4c9ad21e6..d5abc8b3f 100644
--- a/utils.py
+++ b/utils.py
@@ -4226,3 +4226,18 @@ def license_link_from_name(license: str) -> str:
     else:
         value = 'https://creativecommons.org/publicdomain/zero/1.0'
     return value
+
+
+def escape_text(txt: str) -> str:
+    """Escape text for inclusion in xml/rss
+    """
+    replacements = {
+        "&": "&amp;",
+        "<": "&lt;",
+        ">": "&gt;",
+        '"': "&quot;",
+        "'": "&apos;"
+    }
+    for orig, replacement in replacements.items():
+        txt = txt.replace(orig, replacement)
+    return txt

From e371a4d65e46f0898905fb521cdf39c20d85fe0a Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Mon, 2 Jan 2023 10:24:35 +0000
Subject: [PATCH 3/9] Escape rss titles and descriptions

---
 feeds.py               |  7 +++++--
 newswire.py            | 24 +++++++++++++++++++++---
 utils.py               | 15 +++++++++++++++
 webapp_hashtagswarm.py |  6 ++++--
 webapp_search.py       |  4 +++-
 5 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/feeds.py b/feeds.py
index 6b91885c3..ea8589dec 100644
--- a/feeds.py
+++ b/feeds.py
@@ -8,6 +8,9 @@ __status__ = "Production"
 __module_group__ = "RSS Feeds"
 
 
+from utils import escape_text
+
+
 def rss2tag_header(hashtag: str, http_prefix: str, domain_full: str) -> str:
     """Header for rss 2
     """
     return \
         "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + \
         "<rss version=\"2.0\">" + \
         '<channel>' + \
-        ' <title>#' + hashtag + '</title>' + \
+        ' <title>#' + escape_text(hashtag) + '</title>' + \
         ' <link>' + http_prefix + '://' + domain_full + \
-        '/tags/rss2/' + hashtag + '</link>'
+        '/tags/rss2/' + escape_text(hashtag) + '</link>'
 
 
 def rss2tag_footer() -> str:
diff --git a/newswire.py b/newswire.py
index 56e667bc4..d80e7546b 100644
--- a/newswire.py
+++ b/newswire.py
@@ -39,6 +39,8 @@ from utils import remove_html
 from utils import is_account_dir
 from utils import acct_dir
 from utils import local_actor_url
+from utils import escape_text
+from utils import unescaped_text
 from blocking import is_blocked_domain
 from blocking import is_blocked_hashtag
 from filters import is_filtered
@@ -76,8 +78,9 @@ def rss2header(http_prefix: str,
             ' <link>' + http_prefix + '://' + domain_full + \
             '/blog/rss.xml' + '</link>'
     else:
+        title_str = escape_text(translate[title])
         rss_str += \
-            ' <title>' + translate[title] + '</title>' + \
+            ' <title>' + title_str + '</title>' + \
             ' <link>' + \
             local_actor_url(http_prefix, nickname, domain_full) + \
             '/rss.xml' + '</link>'
@@ -407,12 +410,14 @@ def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str,
             continue
         category_str = rss_item.split('<title>')[1]
         category_str = category_str.split('</title>')[0].strip()
+        category_str = unescaped_text(category_str)
         if not category_str:
             continue
         if 'CDATA' in category_str:
             continue
         hashtag_list_str = rss_item.split('<description>')[1]
         hashtag_list_str = hashtag_list_str.split('</description>')[0].strip()
+        hashtag_list_str = unescaped_text(hashtag_list_str)
         if not hashtag_list_str:
             continue
         if 'CDATA' in hashtag_list_str:
@@ -766,17 +771,20 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,
         title = rss_item.split('<title>')[1]
         title = _remove_cdata(title.split('</title>')[0])
+        title = unescaped_text(title)
         title = remove_html(title)
         description = ''
         if '<description>' in rss_item and '</description>' in rss_item:
             description = rss_item.split('<description>')[1]
             description = remove_html(description.split('</description>')[0])
+            description = unescaped_text(description)
         else:
             if '<media:description>' in rss_item and \
                '</media:description>' in rss_item:
                 description = rss_item.split('<media:description>')[1]
                 description = description.split('</media:description>')[0]
+                description = unescaped_text(description)
                 description = remove_html(description)
 
         proxy_type = None
@@ -874,16 +882,19 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
             continue
         title = rss_item.split('<title>')[1]
         title = _remove_cdata(title.split('</title>')[0])
+        title = unescaped_text(title)
         title = remove_html(title)
         description = ''
         if '<description>' in rss_item and '</description>' in rss_item:
             description = rss_item.split('<description>')[1]
             description = remove_html(description.split('</description>')[0])
+            description = unescaped_text(description)
         else:
             if '<media:description>' in rss_item and \
                '</media:description>' in rss_item:
                 description = rss_item.split('<media:description>')[1]
                 description = description.split('</media:description>')[0]
+                description = unescaped_text(description)
                 description = remove_html(description)
 
         proxy_type = None
@@ -969,16 +980,19 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
             continue
         title = atom_item.split('<title>')[1]
         title = _remove_cdata(title.split('</title>')[0])
+        title = unescaped_text(title)
         title = remove_html(title)
         description = ''
         if '<summary>' in atom_item and '</summary>' in atom_item:
             description = atom_item.split('<summary>')[1]
             description = remove_html(description.split('</summary>')[0])
+            description = unescaped_text(description)
         else:
             if '<media:description>' in atom_item and \
               '</media:description>' in atom_item:
                 description = atom_item.split('<media:description>')[1]
                 description = description.split('</media:description>')[0]
+                description = unescaped_text(description)
                 description = remove_html(description)
 
         proxy_type = None
@@ -1184,15 +1198,18 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
             continue
         title = atom_item.split('<title>')[1]
         title = _remove_cdata(title.split('</title>')[0])
+        title = unescaped_text(title)
         description = ''
         if '<media:description>' in atom_item and \
           '</media:description>' in atom_item:
             description = atom_item.split('<media:description>')[1]
             description = description.split('</media:description>')[0]
+            description = unescaped_text(description)
             description = remove_html(description)
         elif '<summary>' in atom_item and '</summary>' in atom_item:
             description = atom_item.split('<summary>')[1]
             description = description.split('</summary>')[0]
+            description = unescaped_text(description)
             description = remove_html(description)
         link, _ = get_link_from_rss_item(atom_item, None, None)
@@ -1382,9 +1399,10 @@ def get_rs_sfrom_dict(base_dir: str, newswire: {},
             continue
         rss_str += \
             '<item>\n' + \
-            ' <title>' + fields[0] + '</title>\n'
+            ' <title>' + escape_text(fields[0]) + '</title>\n'
         description = remove_html(first_paragraph_from_string(fields[4]))
-        rss_str += ' <description>' + description + '</description>\n'
+        rss_str += \
+            ' <description>' + escape_text(description) + '</description>\n'
         url = fields[1]
         if '://' not in url:
             if domain_full not in url:

diff --git a/utils.py b/utils.py
index d5abc8b3f..da8702791 100644
--- a/utils.py
+++ b/utils.py
@@ -4241,3 +4241,18 @@ def escape_text(txt: str) -> str:
     for orig, replacement in replacements.items():
         txt = txt.replace(orig, replacement)
     return txt
+
+
+def unescaped_text(txt: str) -> str:
+    """Removes xml/rss escaping from text
+    """
+    replacements = {
+        "&": "&amp;",
+        "<": "&lt;",
+        ">": "&gt;",
+        '"': "&quot;",
+        "'": "&apos;"
+    }
+    for orig, replacement in replacements.items():
+        txt = txt.replace(replacement, orig)
+    return txt

diff --git a/webapp_hashtagswarm.py b/webapp_hashtagswarm.py
index 9cc0aeeeb..2e6e95f4a 100644
--- a/webapp_hashtagswarm.py
+++ b/webapp_hashtagswarm.py
@@ -11,6 +11,7 @@ import os
 from datetime import datetime
 from utils import get_nickname_from_actor
 from utils import get_config_param
+from utils import escape_text
 from categories import get_hashtag_categories
 from categories import get_hashtag_category
 from webapp_utils import set_custom_background
@@ -41,7 +42,7 @@ def get_hashtag_categories_feed(base_dir: str,
     for category_str, hashtag_list in hashtag_categories.items():
         rss_str += \
             '<item>\n' + \
-            ' <title>' + category_str + '</title>\n'
+            ' <title>' + escape_text(category_str) + '</title>\n'
         list_str = ''
         for hashtag in hashtag_list:
             if ':' in hashtag:
                 continue
             list_str += hashtag + ' '
         rss_str += \
-            ' <description>' + list_str.strip() + '</description>\n' + \
+            ' <description>' + \
+            escape_text(list_str.strip()) + '</description>\n' + \
             ' <link/>\n' + \
             ' <pubDate>' + rss_date_str + '</pubDate>\n' + \
             '</item>\n'

diff --git a/webapp_search.py b/webapp_search.py
index 077cb99cb..41628549b 100644
--- a/webapp_search.py
+++ b/webapp_search.py
@@ -26,6 +26,7 @@ from utils import search_box_posts
 from utils import get_alt_path
 from utils import acct_dir
 from utils import local_actor_url
+from utils import escape_text
 from skills import no_of_actor_skills
 from skills import get_skills_from_list
 from categories import get_hashtag_category
@@ -1133,12 +1134,13 @@ def rss_hashtag_search(nickname: str, domain: str, port: int,
         if post_json_object['object'].get('summary'):
             hashtag_feed += \
                 ' <title>' + \
-                post_json_object['object']['summary'] + \
+                escape_text(post_json_object['object']['summary']) + \
                 '</title>'
         description = \
             get_base_content_from_post(post_json_object, system_language)
         description = first_paragraph_from_string(description)
+        description = escape_text(description)
         hashtag_feed += \
             ' <description>' + description + '</description>'
         hashtag_feed += \

From bca431c95d64d9f6a966c8e0d087bae092a60c05 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Mon, 2 Jan 2023 10:37:57 +0000
Subject: [PATCH 4/9] Don't apply first post to blogs

---
 person.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/person.py b/person.py
index 6badac74a..16abd7561 100644
--- a/person.py
+++ b/person.py
@@ -1071,7 +1071,7 @@ def person_box_json(recent_posts_cache: {},
     if boxname == 'tlblogs':
         return create_blogs_timeline(base_dir, nickname, domain, port,
                                      http_prefix, no_of_items, header_only,
-                                     page_number, first_post_id)
+                                     page_number)
     if boxname == 'outbox':
         return create_outbox(base_dir, nickname, domain, port, http_prefix,

From e930eb3aaf1ee62c10fbd1db2d6ed17bda6b057b Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Mon, 2 Jan 2023 10:41:42 +0000
Subject: [PATCH 5/9] Don't apply first post to blogs

---
 person.py | 2 +-
 posts.py  | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/person.py b/person.py
index 16abd7561..8372d573a 100644
--- a/person.py
+++ b/person.py
@@ -1071,7 +1071,7 @@ def person_box_json(recent_posts_cache: {},
     if boxname == 'tlblogs':
         return create_blogs_timeline(base_dir, nickname, domain, port,
                                      http_prefix, no_of_items, header_only,
-                                     page_number)
+                                     page_number, '')
     if boxname == 'outbox':
         return create_outbox(base_dir, nickname, domain, port, http_prefix,

diff --git a/posts.py b/posts.py
index 3d11250b2..82997dc9d 100644
--- a/posts.py
+++ b/posts.py
@@ -3626,12 +3626,11 @@ def create_replies_timeline(recent_posts_cache: {},
 def create_blogs_timeline(base_dir: str, nickname: str,
                           domain: str, port: int, http_prefix: str,
                           items_per_page: int,
-                          header_only: bool, page_number: int,
-                          first_post_id: str) -> {}:
+                          header_only: bool, page_number: int) -> {}:
     return _create_box_indexed({}, base_dir, 'tlblogs', nickname,
                                domain, port, http_prefix, items_per_page,
                                header_only, True,
-                               0, False, 0, page_number, first_post_id)
+                               0, False, 0, page_number)
 
 
 def create_features_timeline(base_dir: str,

From 71c291abd18d6c0a5b3f390628df60158be2e311 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Mon, 2 Jan 2023 10:44:49 +0000
Subject: [PATCH 6/9] Remove unused argument

---
 blog.py   | 6 +++---
 person.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/blog.py b/blog.py
index 84379d48a..168078764 100644
--- a/blog.py
+++ b/blog.py
@@ -544,7 +544,7 @@ def html_blog_page(authorized: bool, session,
     timeline_json = \
         create_blogs_timeline(base_dir, nickname, domain, port, http_prefix,
-                              no_of_items, False, page_number, '')
+                              no_of_items, False, page_number)
 
     if not timeline_json:
         return blog_str + html_footer()
@@ -633,7 +633,7 @@ def html_blog_page_rss2(base_dir: str, http_prefix: str, translate: {},
                               nickname, domain, port, http_prefix,
                               no_of_items, False,
-                              page_number, '')
+                              page_number)
 
     if not timeline_json:
         if include_header:
@@ -672,7 +672,7 @@ def html_blog_page_rss3(base_dir: str, http_prefix: str,
     timeline_json = \
         create_blogs_timeline(base_dir, nickname, domain, port, http_prefix,
-                              no_of_items, False, page_number, '')
+                              no_of_items, False, page_number)
 
     if not timeline_json:
         return blog_rss3

diff --git a/person.py b/person.py
index 8372d573a..16abd7561 100644
--- a/person.py
+++ b/person.py
@@ -1071,7 +1071,7 @@ def person_box_json(recent_posts_cache: {},
     if boxname == 'tlblogs':
         return create_blogs_timeline(base_dir, nickname, domain, port,
                                      http_prefix, no_of_items, header_only,
-                                     page_number, '')
+                                     page_number)
     if boxname == 'outbox':
         return create_outbox(base_dir, nickname, domain, port, http_prefix,

From 58971bb3b7d4886c090700e35dcd30662cd17fc6 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Mon, 2 Jan 2023 11:23:05 +0000
Subject: [PATCH 7/9] Check for single posts on timelines

---
 webapp_timeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp_timeline.py b/webapp_timeline.py
index 8208c7ecf..f4a929cee 100644
--- a/webapp_timeline.py
+++ b/webapp_timeline.py
@@ -1072,7 +1072,7 @@ def html_timeline(default_timeline: str,
     if item_ctr > 0:
         # if showing the page down icon then remove the last item so that
         # firstpost does not overlap on the next timeline
-        if last_item_str:
+        if last_item_str and first_post_id != last_post_id:
             tl_str = tl_str.replace(last_item_str, '')
     tl_str += text_mode_separator
     first_post = ''

From 5257ba4f55b33c277f4a04b3f9a102e9f74b8e41 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Mon, 2 Jan 2023 11:27:52 +0000
Subject: [PATCH 8/9] Only remove the last post if there are enough posts

---
 webapp_timeline.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/webapp_timeline.py b/webapp_timeline.py
index f4a929cee..e1ce92329 100644
--- a/webapp_timeline.py
+++ b/webapp_timeline.py
@@ -1073,7 +1073,8 @@ def html_timeline(default_timeline: str,
         # if showing the page down icon then remove the last item so that
         # firstpost does not overlap on the next timeline
         if last_item_str and first_post_id != last_post_id:
-            tl_str = tl_str.replace(last_item_str, '')
+            if item_ctr > items_per_page / 2:
+                tl_str = tl_str.replace(last_item_str, '')
     tl_str += text_mode_separator
     first_post = ''
     if last_post_id:

From 586c482bea38a75fe71bd616f7023fbd16f477b7 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Mon, 2 Jan 2023 11:41:48 +0000
Subject: [PATCH 9/9] Tidying

---
 utils.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/utils.py b/utils.py
index da8702791..dad2c098e 100644
--- a/utils.py
+++ b/utils.py
@@ -4228,17 +4228,22 @@ def license_link_from_name(license: str) -> str:
     return value
 
 
-def escape_text(txt: str) -> str:
-    """Escape text for inclusion in xml/rss
+def _get_escaped_chars() -> {}:
+    """Returns escaped characters
     """
-    replacements = {
+    return {
         "&": "&amp;",
         "<": "&lt;",
         ">": "&gt;",
         '"': "&quot;",
         "'": "&apos;"
     }
-    for orig, replacement in replacements.items():
+
+
+def escape_text(txt: str) -> str:
+    """Escape text for inclusion in xml/rss
+    """
+    for orig, replacement in _get_escaped_chars().items():
         txt = txt.replace(orig, replacement)
     return txt
 
@@ -4246,13 +4251,6 @@ def unescaped_text(txt: str) -> str:
     """Removes xml/rss escaping from text
     """
-    replacements = {
-        "&": "&amp;",
-        "<": "&lt;",
-        ">": "&gt;",
-        '"': "&quot;",
-        "'": "&apos;"
-    }
-    for orig, replacement in replacements.items():
+    for orig, replacement in _get_escaped_chars().items():
         txt = txt.replace(replacement, orig)
     return txt
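
A minimal usage sketch of the escape_text/unescaped_text helpers added in patches 2, 3 and 9, assuming the definitions shown above; the sample title string is hypothetical and not taken from the patches:

    from utils import escape_text, unescaped_text

    # escape_text substitutes each special character with its xml entity,
    # so a post summary can be embedded safely in an rss title or description
    title = 'Fish & Chips are <better> than "winkles"'
    escaped = escape_text(title)
    print(escaped)
    # Fish &amp; Chips are &lt;better&gt; than &quot;winkles&quot;

    # unescaped_text reverses the substitution when a feed is parsed back in
    assert unescaped_text(escaped) == title

Because '&' is the first entry in the escaped characters dict, it is substituted before the other entities, so the ampersands produced by the later '&lt;', '&gt;', '&quot;' and '&apos;' substitutions are not escaped twice.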