Escape rss titles and descriptions

2023-01-02 10:24:35 +00:00 · 2023-01-02 10:24:35 +00:00 · e371a4d65e
parent ab0ca588c9
commit e371a4d65e
5 changed files with 48 additions and 8 deletions
--- a/feeds.py
+++ b/feeds.py
@ -8,6 +8,9 @@ __status__ = "Production"
 __module_group__ = "RSS Feeds"


+from utils import escape_text
+
+
 def rss2tag_header(hashtag: str, http_prefix: str, domain_full: str) -> str:
    """Header for rss 2
    """
@ -15,9 +18,9 @@ def rss2tag_header(hashtag: str, http_prefix: str, domain_full: str) -> str:
        "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + \
        "<rss version=\"2.0\">" + \
        '<channel>' + \
-        '    <title>#' + hashtag + '</title>' + \
+        '    <title>#' + escape_text(hashtag) + '</title>' + \
        '    <link>' + http_prefix + '://' + domain_full + \
-        '/tags/rss2/' + hashtag + '</link>'
+        '/tags/rss2/' + escape_text(hashtag) + '</link>'


 def rss2tag_footer() -> str:
--- a/newswire.py
+++ b/newswire.py
@ -39,6 +39,8 @@ from utils import remove_html
 from utils import is_account_dir
 from utils import acct_dir
 from utils import local_actor_url
+from utils import escape_text
+from utils import unescaped_text
 from blocking import is_blocked_domain
 from blocking import is_blocked_hashtag
 from filters import is_filtered
@ -76,8 +78,9 @@ def rss2header(http_prefix: str,
            '    <link>' + http_prefix + '://' + domain_full + \
            '/blog/rss.xml' + '</link>'
    else:
+        title_str = escape_text(translate[title])
        rss_str += \
-            '    <title>' + translate[title] + '</title>' + \
+            '    <title>' + title_str + '</title>' + \
            '    <link>' + \
            local_actor_url(http_prefix, nickname, domain_full) + \
            '/rss.xml' + '</link>'
@ -407,12 +410,14 @@ def _xml2str_to_hashtag_categories(base_dir: str, xml_str: str,
            continue
        category_str = rss_item.split('<title>')[1]
        category_str = category_str.split('</title>')[0].strip()
+        category_str = unescaped_text(category_str)
        if not category_str:
            continue
        if 'CDATA' in category_str:
            continue
        hashtag_list_str = rss_item.split('<description>')[1]
        hashtag_list_str = hashtag_list_str.split('</description>')[0].strip()
+        hashtag_list_str = unescaped_text(hashtag_list_str)
        if not hashtag_list_str:
            continue
        if 'CDATA' in hashtag_list_str:
@ -766,17 +771,20 @@ def _xml2str_to_dict(base_dir: str, domain: str, xml_str: str,

        title = rss_item.split('<title>')[1]
        title = _remove_cdata(title.split('</title>')[0])
+        title = unescaped_text(title)
        title = remove_html(title)

        description = ''
        if '<description>' in rss_item and '</description>' in rss_item:
            description = rss_item.split('<description>')[1]
            description = remove_html(description.split('</description>')[0])
+            description = unescaped_text(description)
        else:
            if '<media:description>' in rss_item and \
               '</media:description>' in rss_item:
                description = rss_item.split('<media:description>')[1]
                description = description.split('</media:description>')[0]
+                description = unescaped_text(description)
                description = remove_html(description)

        proxy_type = None
@ -874,16 +882,19 @@ def _xml1str_to_dict(base_dir: str, domain: str, xml_str: str,
            continue
        title = rss_item.split('<title>')[1]
        title = _remove_cdata(title.split('</title>')[0])
+        title = unescaped_text(title)
        title = remove_html(title)
        description = ''
        if '<description>' in rss_item and '</description>' in rss_item:
            description = rss_item.split('<description>')[1]
            description = remove_html(description.split('</description>')[0])
+            description = unescaped_text(description)
        else:
            if '<media:description>' in rss_item and \
               '</media:description>' in rss_item:
                description = rss_item.split('<media:description>')[1]
                description = description.split('</media:description>')[0]
+                description = unescaped_text(description)
                description = remove_html(description)

        proxy_type = None
@ -969,16 +980,19 @@ def _atom_feed_to_dict(base_dir: str, domain: str, xml_str: str,
            continue
        title = atom_item.split('<title>')[1]
        title = _remove_cdata(title.split('</title>')[0])
+        title = unescaped_text(title)
        title = remove_html(title)
        description = ''
        if '<summary>' in atom_item and '</summary>' in atom_item:
            description = atom_item.split('<summary>')[1]
            description = remove_html(description.split('</summary>')[0])
+            description = unescaped_text(description)
        else:
            if '<media:description>' in atom_item and \
               '</media:description>' in atom_item:
                description = atom_item.split('<media:description>')[1]
                description = description.split('</media:description>')[0]
+                description = unescaped_text(description)
                description = remove_html(description)

        proxy_type = None
@ -1184,15 +1198,18 @@ def _atom_feed_yt_to_dict(base_dir: str, domain: str, xml_str: str,
            continue
        title = atom_item.split('<title>')[1]
        title = _remove_cdata(title.split('</title>')[0])
+        title = unescaped_text(title)
        description = ''
        if '<media:description>' in atom_item and \
           '</media:description>' in atom_item:
            description = atom_item.split('<media:description>')[1]
            description = description.split('</media:description>')[0]
+            description = unescaped_text(description)
            description = remove_html(description)
        elif '<summary>' in atom_item and '</summary>' in atom_item:
            description = atom_item.split('<summary>')[1]
            description = description.split('</summary>')[0]
+            description = unescaped_text(description)
            description = remove_html(description)

        link, _ = get_link_from_rss_item(atom_item, None, None)
@ -1382,9 +1399,10 @@ def get_rs_sfrom_dict(base_dir: str, newswire: {},
            continue
        rss_str += \
            '<item>\n' + \
-            '  <title>' + fields[0] + '</title>\n'
+            '  <title>' + escape_text(fields[0]) + '</title>\n'
        description = remove_html(first_paragraph_from_string(fields[4]))
-        rss_str += '  <description>' + description + '</description>\n'
+        rss_str += \
+            '  <description>' + escape_text(description) + '</description>\n'
        url = fields[1]
        if '://' not in url:
            if domain_full not in url:
--- a/utils.py
+++ b/utils.py
@ -4241,3 +4241,18 @@ def escape_text(txt: str) -> str:
    for orig, replacement in replacements.items():
        txt = txt.replace(orig, replacement)
    return txt
+
+
+def unescaped_text(txt: str) -> str:
+    """Unescape xml/rss entities in text, returning the plain characters
+    """
+    # "&amp;" is decoded last, otherwise "&amp;lt;" (an escaped literal
+    # "&lt;") would be decoded twice and come out as "<"
+    replacements = {
+        "<": "&lt;", ">": "&gt;",
+        '"': "&quot;", "'": "&apos;",
+        "&": "&amp;"
+    }
+    for orig, replacement in replacements.items():
+        txt = txt.replace(replacement, orig)
+    return txt
--- a/webapp_hashtagswarm.py
+++ b/webapp_hashtagswarm.py
@ -11,6 +11,7 @@ import os
 from datetime import datetime
 from utils import get_nickname_from_actor
 from utils import get_config_param
+from utils import escape_text
 from categories import get_hashtag_categories
 from categories import get_hashtag_category
 from webapp_utils import set_custom_background
@ -41,7 +42,7 @@ def get_hashtag_categories_feed(base_dir: str,
    for category_str, hashtag_list in hashtag_categories.items():
        rss_str += \
            '<item>\n' + \
-            '  <title>' + category_str + '</title>\n'
+            '  <title>' + escape_text(category_str) + '</title>\n'
        list_str = ''
        for hashtag in hashtag_list:
            if ':' in hashtag:
@ -50,7 +51,8 @@ def get_hashtag_categories_feed(base_dir: str,
                continue
            list_str += hashtag + ' '
        rss_str += \
-            '  <description>' + list_str.strip() + '</description>\n' + \
+            '  <description>' + \
+            escape_text(list_str.strip()) + '</description>\n' + \
            '  <link/>\n' + \
            '  <pubDate>' + rss_date_str + '</pubDate>\n' + \
            '</item>\n'
--- a/webapp_search.py
+++ b/webapp_search.py
@ -26,6 +26,7 @@ from utils import search_box_posts
 from utils import get_alt_path
 from utils import acct_dir
 from utils import local_actor_url
+from utils import escape_text
 from skills import no_of_actor_skills
 from skills import get_skills_from_list
 from categories import get_hashtag_category
@ -1133,12 +1134,13 @@ def rss_hashtag_search(nickname: str, domain: str, port: int,
                if post_json_object['object'].get('summary'):
                    hashtag_feed += \
                        '         <title>' + \
-                        post_json_object['object']['summary'] + \
+                        escape_text(post_json_object['object']['summary']) + \
                        '</title>'
                description = \
                    get_base_content_from_post(post_json_object,
                                               system_language)
                description = first_paragraph_from_string(description)
+                description = escape_text(description)
                hashtag_feed += \
                    '         <description>' + description + '</description>'
                hashtag_feed += \