Get categories from podcast feeds

2022-01-13 15:10:41 +00:00 · 2022-01-13 15:10:41 +00:00 · f9e33f2d35
parent 1ef492488b
commit f9e33f2d35
6 changed files with 65 additions and 31 deletions
--- a/content.py
+++ b/content.py
@@ -11,9 +11,9 @@ import os
 import email.parser
 import urllib.parse
 from shutil import copyfile
+from utils import valid_hash_tag
 from utils import dangerous_svg
 from utils import remove_domain_port
-from utils import is_valid_language
 from utils import get_image_extensions
 from utils import load_json
 from utils import save_json
@@ -33,17 +33,6 @@ MUSIC_SITES = ('soundcloud.com', 'bandcamp.com')

 MAX_LINK_LENGTH = 40

-VALID_HASHTAG_CHARS = \
-    set('0123456789' +
-        'abcdefghijklmnopqrstuvwxyz' +
-        'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
-        '¡¿ÄäÀàÁáÂâÃãÅåǍǎĄąĂăÆæĀā' +
-        'ÇçĆćĈĉČčĎđĐďðÈèÉéÊêËëĚěĘęĖėĒē' +
-        'ĜĝĢģĞğĤĥÌìÍíÎîÏïıĪīĮįĴĵĶķ' +
-        'ĹĺĻļŁłĽľĿŀÑñŃńŇňŅņÖöÒòÓóÔôÕõŐőØøŒœ' +
-        'ŔŕŘřẞßŚśŜŝŞşŠšȘșŤťŢţÞþȚțÜüÙùÚúÛûŰűŨũŲųŮůŪū' +
-        'ŴŵÝýŸÿŶŷŹźŽžŻż')
-
 REMOVE_MARKUP = (
    'b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
    'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5'
@@ -497,19 +486,6 @@ def add_web_links(content: str) -> str:
    return content


-def valid_hash_tag(hashtag: str) -> bool:
-    """Returns true if the give hashtag contains valid characters
-    """
-    # long hashtags are not valid
-    if len(hashtag) >= 32:
-        return False
-    if set(hashtag).issubset(VALID_HASHTAG_CHARS):
-        return True
-    if is_valid_language(hashtag):
-        return True
-    return False
-
-
 def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
                   replace_hashtags: {}, post_hashtags: {}) -> bool:
    """Detects hashtags and adds them to the replacements dict
--- a/inbox.py
+++ b/inbox.py
@@ -61,6 +61,7 @@ from utils import undo_reaction_collection_entry
 from utils import has_group_type
 from utils import local_actor_url
 from utils import has_object_stringType
+from utils import valid_hash_tag
 from categories import get_hashtag_categories
 from categories import set_hashtag_category
 from httpsig import get_digest_algorithm_from_headers
@@ -119,7 +120,6 @@ from announce import is_self_announce
 from announce import create_announce
 from notifyOnPost import notify_when_person_posts
 from conversation import update_conversation
-from content import valid_hash_tag
 from webapp_hashtagswarm import html_hash_tag_swarm
 from person import valid_sending_actor

--- a/newsdaemon.py
+++ b/newsdaemon.py
@@ -24,7 +24,7 @@ from newswire import get_dict_from_newswire
 # from posts import send_signed_json
 from posts import create_news_post
 from posts import archive_posts_for_person
-from content import valid_hash_tag
+from utils import valid_hash_tag
 from utils import get_base_content_from_post
 from utils import remove_html
 from utils import get_full_domain
--- a/newswire.py
+++ b/newswire.py
@@ -18,6 +18,7 @@ from datetime import timezone
 from collections import OrderedDict
 from utils import valid_post_date
 from categories import set_hashtag_category
+from utils import valid_hash_tag
 from utils import dangerous_svg
 from utils import get_fav_filename_from_url
 from utils import get_base_content_from_post
@@ -470,8 +471,41 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
                        podcast_episode_image = episode_image
                        break

+    # get categories if they exist. These can be turned into hashtags
+    podcast_categories = []
+    episode_category_tags = ['<itunes:category', '<category']
+    for category_tag in episode_category_tags:
+        item_str = xml_item
+        if category_tag not in xml_item:
+            if category_tag not in xml_str:
+                continue
+            item_str = xml_str
+
+        episode_category = item_str.split(category_tag)[1]
+        if 'text="' in episode_category:
+            episode_category = episode_category.split('text="')[1]
+            if '"' in episode_category:
+                episode_category = episode_category.split('"')[0]
+                episode_category = episode_category.lower().replace(' ', '')
+                if episode_category not in podcast_categories:
+                    if valid_hash_tag(episode_category):
+                        podcast_categories.append(episode_category)
+                continue
+        else:
+            if '>' in episode_category:
+                episode_category = episode_category.split('>')[1]
+                if '<' in episode_category:
+                    episode_category = episode_category.split('<')[0]
+                    episode_category = \
+                        episode_category.lower().replace(' ', '')
+                    if episode_category not in podcast_categories:
+                        if valid_hash_tag(episode_category):
+                            podcast_categories.append(episode_category)
+                    continue
+
    if podcast_episode_image:
        podcast_properties['image'] = podcast_episode_image
+        podcast_properties['categories'] = podcast_categories

        if '<itunes:explicit>Y' in xml_item or \
           '<itunes:explicit>T' in xml_item or \
--- a/tests.py
+++ b/tests.py
@@ -82,6 +82,7 @@ from utils import copytree
 from utils import load_json
 from utils import save_json
 from utils import get_status_number
+from utils import valid_hash_tag
 from utils import get_followers_of_person
 from utils import remove_html
 from utils import dangerous_markup
@@ -132,7 +133,6 @@ from content import get_price_from_string
 from content import limit_repeated_words
 from content import switch_words
 from content import extract_text_fields_in_post
-from content import valid_hash_tag
 from content import html_replace_email_quote
 from content import html_replace_quote_marks
 from content import dangerous_css
@@ -6428,7 +6428,7 @@ def _test_xml_podcast_dict() -> None:
        'address="someaddress2" split="99" />\n' + \
        '</podcast:value>\n' + \
        '</rss>'
-    podcast_properties = xml_podcast_to_dict(xml_str)
+    podcast_properties = xml_podcast_to_dict(xml_str, xml_str)
    assert podcast_properties
    # pprint(podcast_properties)
    assert podcast_properties.get('valueRecipients')
--- a/utils.py
+++ b/utils.py
@@ -20,6 +20,17 @@ from cryptography.hazmat.backends import default_backend
 from cryptography.hazmat.primitives import hashes
 from followingCalendar import add_person_to_calendar

+VALID_HASHTAG_CHARS = \
+    set('0123456789' +
+        'abcdefghijklmnopqrstuvwxyz' +
+        'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
+        '¡¿ÄäÀàÁáÂâÃãÅåǍǎĄąĂăÆæĀā' +
+        'ÇçĆćĈĉČčĎđĐďðÈèÉéÊêËëĚěĘęĖėĒē' +
+        'ĜĝĢģĞğĤĥÌìÍíÎîÏïıĪīĮįĴĵĶķ' +
+        'ĹĺĻļŁłĽľĿŀÑñŃńŇňŅņÖöÒòÓóÔôÕõŐőØøŒœ' +
+        'ŔŕŘřẞßŚśŜŝŞşŠšȘșŤťŢţÞþȚțÜüÙùÚúÛûŰűŨũŲųŮůŪū' +
+        'ŴŵÝýŸÿŶŷŹźŽžŻż')
+
 # posts containing these strings will always get screened out,
 # both incoming and outgoing.
 # Could include dubious clacks or admin dogwhistles
@@ -1798,7 +1809,7 @@ def delete_post(base_dir: str, http_prefix: str,
                  str(post_filename))


-def is_valid_language(text: str) -> bool:
+def _is_valid_language(text: str) -> bool:
    """Returns true if the given text contains a valid
    natural language string
    """
@@ -1900,7 +1911,7 @@ def valid_nickname(domain: str, nickname: str) -> bool:
        return False
    if len(nickname) > 30:
        return False
-    if not is_valid_language(nickname):
+    if not _is_valid_language(nickname):
        return False
    forbidden_chars = ('.', ' ', '/', '?', ':', ';', '@', '#', '!')
    for char in forbidden_chars:
@@ -3288,3 +3299,16 @@ def get_fav_filename_from_url(base_dir: str, favicon_url: str) -> str:
    if '/favicon.' in favicon_url:
        favicon_url = favicon_url.replace('/favicon.', '.')
    return base_dir + '/favicons/' + favicon_url.replace('/', '-')
+
+
+def valid_hash_tag(hashtag: str) -> bool:
+    """Returns true if the given hashtag contains valid characters
+    """
+    # long hashtags are not valid
+    if len(hashtag) >= 32:
+        return False
+    if set(hashtag).issubset(VALID_HASHTAG_CHARS):
+        return True
+    if _is_valid_language(hashtag):
+        return True
+    return False