mirror of https://gitlab.com/bashrc2/epicyon
Get categories from podcast feeds
parent
1ef492488b
commit
f9e33f2d35
26
content.py
26
content.py
|
@ -11,9 +11,9 @@ import os
|
||||||
import email.parser
|
import email.parser
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
from utils import valid_hash_tag
|
||||||
from utils import dangerous_svg
|
from utils import dangerous_svg
|
||||||
from utils import remove_domain_port
|
from utils import remove_domain_port
|
||||||
from utils import is_valid_language
|
|
||||||
from utils import get_image_extensions
|
from utils import get_image_extensions
|
||||||
from utils import load_json
|
from utils import load_json
|
||||||
from utils import save_json
|
from utils import save_json
|
||||||
|
@ -33,17 +33,6 @@ MUSIC_SITES = ('soundcloud.com', 'bandcamp.com')
|
||||||
|
|
||||||
MAX_LINK_LENGTH = 40
|
MAX_LINK_LENGTH = 40
|
||||||
|
|
||||||
VALID_HASHTAG_CHARS = \
|
|
||||||
set('0123456789' +
|
|
||||||
'abcdefghijklmnopqrstuvwxyz' +
|
|
||||||
'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
|
|
||||||
'¡¿ÄäÀàÁáÂâÃãÅåǍǎĄąĂăÆæĀā' +
|
|
||||||
'ÇçĆćĈĉČčĎđĐďðÈèÉéÊêËëĚěĘęĖėĒē' +
|
|
||||||
'ĜĝĢģĞğĤĥÌìÍíÎîÏïıĪīĮįĴĵĶķ' +
|
|
||||||
'ĹĺĻļŁłĽľĿŀÑñŃńŇňŅņÖöÒòÓóÔôÕõŐőØøŒœ' +
|
|
||||||
'ŔŕŘřẞߌśŜŝŞşŠšȘșŤťŢţÞþȚțÜüÙùÚúÛûŰűŨũŲųŮůŪū' +
|
|
||||||
'ŴŵÝýŸÿŶŷŹźŽžŻż')
|
|
||||||
|
|
||||||
REMOVE_MARKUP = (
|
REMOVE_MARKUP = (
|
||||||
'b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
|
'b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
|
||||||
'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5'
|
'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5'
|
||||||
|
@ -497,19 +486,6 @@ def add_web_links(content: str) -> str:
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
def valid_hash_tag(hashtag: str) -> bool:
    """Returns true if the given hashtag contains valid characters

    hashtag is the tag text to be checked.
    An empty tag, or one of 32 or more characters, is never valid.
    """
    # an empty string is not a hashtag. Without this check the
    # issubset test below would accept it, because the empty set is
    # a subset of every set
    if not hashtag:
        return False
    # long hashtags are not valid
    if len(hashtag) >= 32:
        return False
    # every character is within the permitted set
    if set(hashtag).issubset(VALID_HASHTAG_CHARS):
        return True
    # otherwise accept whatever is_valid_language accepts
    if is_valid_language(hashtag):
        return True
    return False
|
|
||||||
|
|
||||||
|
|
||||||
def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
|
def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
|
||||||
replace_hashtags: {}, post_hashtags: {}) -> bool:
|
replace_hashtags: {}, post_hashtags: {}) -> bool:
|
||||||
"""Detects hashtags and adds them to the replacements dict
|
"""Detects hashtags and adds them to the replacements dict
|
||||||
|
|
2
inbox.py
2
inbox.py
|
@ -61,6 +61,7 @@ from utils import undo_reaction_collection_entry
|
||||||
from utils import has_group_type
|
from utils import has_group_type
|
||||||
from utils import local_actor_url
|
from utils import local_actor_url
|
||||||
from utils import has_object_stringType
|
from utils import has_object_stringType
|
||||||
|
from utils import valid_hash_tag
|
||||||
from categories import get_hashtag_categories
|
from categories import get_hashtag_categories
|
||||||
from categories import set_hashtag_category
|
from categories import set_hashtag_category
|
||||||
from httpsig import get_digest_algorithm_from_headers
|
from httpsig import get_digest_algorithm_from_headers
|
||||||
|
@ -119,7 +120,6 @@ from announce import is_self_announce
|
||||||
from announce import create_announce
|
from announce import create_announce
|
||||||
from notifyOnPost import notify_when_person_posts
|
from notifyOnPost import notify_when_person_posts
|
||||||
from conversation import update_conversation
|
from conversation import update_conversation
|
||||||
from content import valid_hash_tag
|
|
||||||
from webapp_hashtagswarm import html_hash_tag_swarm
|
from webapp_hashtagswarm import html_hash_tag_swarm
|
||||||
from person import valid_sending_actor
|
from person import valid_sending_actor
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,7 @@ from newswire import get_dict_from_newswire
|
||||||
# from posts import send_signed_json
|
# from posts import send_signed_json
|
||||||
from posts import create_news_post
|
from posts import create_news_post
|
||||||
from posts import archive_posts_for_person
|
from posts import archive_posts_for_person
|
||||||
from content import valid_hash_tag
|
from utils import valid_hash_tag
|
||||||
from utils import get_base_content_from_post
|
from utils import get_base_content_from_post
|
||||||
from utils import remove_html
|
from utils import remove_html
|
||||||
from utils import get_full_domain
|
from utils import get_full_domain
|
||||||
|
|
34
newswire.py
34
newswire.py
|
@ -18,6 +18,7 @@ from datetime import timezone
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from utils import valid_post_date
|
from utils import valid_post_date
|
||||||
from categories import set_hashtag_category
|
from categories import set_hashtag_category
|
||||||
|
from utils import valid_hash_tag
|
||||||
from utils import dangerous_svg
|
from utils import dangerous_svg
|
||||||
from utils import get_fav_filename_from_url
|
from utils import get_fav_filename_from_url
|
||||||
from utils import get_base_content_from_post
|
from utils import get_base_content_from_post
|
||||||
|
@ -470,8 +471,41 @@ def xml_podcast_to_dict(xml_item: str, xml_str: str) -> {}:
|
||||||
podcast_episode_image = episode_image
|
podcast_episode_image = episode_image
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# get categories if they exist. These can be turned into hashtags
|
||||||
|
podcast_categories = []
|
||||||
|
episode_category_tags = ['<itunes:category', '<category']
|
||||||
|
for category_tag in episode_category_tags:
|
||||||
|
item_str = xml_item
|
||||||
|
if category_tag not in xml_item:
|
||||||
|
if category_tag not in xml_str:
|
||||||
|
continue
|
||||||
|
item_str = xml_str
|
||||||
|
|
||||||
|
episode_category = item_str.split(category_tag)[1]
|
||||||
|
if 'text="' in episode_category:
|
||||||
|
episode_category = episode_category.split('text="')[1]
|
||||||
|
if '"' in episode_category:
|
||||||
|
episode_category = episode_category.split('"')[0]
|
||||||
|
episode_category = episode_category.lower().replace(' ', '')
|
||||||
|
if episode_category not in podcast_categories:
|
||||||
|
if valid_hash_tag(episode_category):
|
||||||
|
podcast_categories.append(episode_category)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
if '>' in episode_category:
|
||||||
|
episode_category = episode_category.split('>')[1]
|
||||||
|
if '<' in episode_category:
|
||||||
|
episode_category = episode_category.split('<')[0]
|
||||||
|
episode_category = \
|
||||||
|
episode_category.lower().replace(' ', '')
|
||||||
|
if episode_category not in podcast_categories:
|
||||||
|
if valid_hash_tag(episode_category):
|
||||||
|
podcast_categories.append(episode_category)
|
||||||
|
continue
|
||||||
|
|
||||||
if podcast_episode_image:
|
if podcast_episode_image:
|
||||||
podcast_properties['image'] = podcast_episode_image
|
podcast_properties['image'] = podcast_episode_image
|
||||||
|
podcast_properties['categories'] = podcast_categories
|
||||||
|
|
||||||
if '<itunes:explicit>Y' in xml_item or \
|
if '<itunes:explicit>Y' in xml_item or \
|
||||||
'<itunes:explicit>T' in xml_item or \
|
'<itunes:explicit>T' in xml_item or \
|
||||||
|
|
4
tests.py
4
tests.py
|
@ -82,6 +82,7 @@ from utils import copytree
|
||||||
from utils import load_json
|
from utils import load_json
|
||||||
from utils import save_json
|
from utils import save_json
|
||||||
from utils import get_status_number
|
from utils import get_status_number
|
||||||
|
from utils import valid_hash_tag
|
||||||
from utils import get_followers_of_person
|
from utils import get_followers_of_person
|
||||||
from utils import remove_html
|
from utils import remove_html
|
||||||
from utils import dangerous_markup
|
from utils import dangerous_markup
|
||||||
|
@ -132,7 +133,6 @@ from content import get_price_from_string
|
||||||
from content import limit_repeated_words
|
from content import limit_repeated_words
|
||||||
from content import switch_words
|
from content import switch_words
|
||||||
from content import extract_text_fields_in_post
|
from content import extract_text_fields_in_post
|
||||||
from content import valid_hash_tag
|
|
||||||
from content import html_replace_email_quote
|
from content import html_replace_email_quote
|
||||||
from content import html_replace_quote_marks
|
from content import html_replace_quote_marks
|
||||||
from content import dangerous_css
|
from content import dangerous_css
|
||||||
|
@ -6428,7 +6428,7 @@ def _test_xml_podcast_dict() -> None:
|
||||||
'address="someaddress2" split="99" />\n' + \
|
'address="someaddress2" split="99" />\n' + \
|
||||||
'</podcast:value>\n' + \
|
'</podcast:value>\n' + \
|
||||||
'</rss>'
|
'</rss>'
|
||||||
podcast_properties = xml_podcast_to_dict(xml_str)
|
podcast_properties = xml_podcast_to_dict(xml_str, xml_str)
|
||||||
assert podcast_properties
|
assert podcast_properties
|
||||||
# pprint(podcast_properties)
|
# pprint(podcast_properties)
|
||||||
assert podcast_properties.get('valueRecipients')
|
assert podcast_properties.get('valueRecipients')
|
||||||
|
|
28
utils.py
28
utils.py
|
@ -20,6 +20,17 @@ from cryptography.hazmat.backends import default_backend
|
||||||
from cryptography.hazmat.primitives import hashes
|
from cryptography.hazmat.primitives import hashes
|
||||||
from followingCalendar import add_person_to_calendar
|
from followingCalendar import add_person_to_calendar
|
||||||
|
|
||||||
|
VALID_HASHTAG_CHARS = \
|
||||||
|
set('0123456789' +
|
||||||
|
'abcdefghijklmnopqrstuvwxyz' +
|
||||||
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
|
||||||
|
'¡¿ÄäÀàÁáÂâÃãÅåǍǎĄąĂăÆæĀā' +
|
||||||
|
'ÇçĆćĈĉČčĎđĐďðÈèÉéÊêËëĚěĘęĖėĒē' +
|
||||||
|
'ĜĝĢģĞğĤĥÌìÍíÎîÏïıĪīĮįĴĵĶķ' +
|
||||||
|
'ĹĺĻļŁłĽľĿŀÑñŃńŇňŅņÖöÒòÓóÔôÕõŐőØøŒœ' +
|
||||||
|
'ŔŕŘřẞߌśŜŝŞşŠšȘșŤťŢţÞþȚțÜüÙùÚúÛûŰűŨũŲųŮůŪū' +
|
||||||
|
'ŴŵÝýŸÿŶŷŹźŽžŻż')
|
||||||
|
|
||||||
# posts containing these strings will always get screened out,
|
# posts containing these strings will always get screened out,
|
||||||
# both incoming and outgoing.
|
# both incoming and outgoing.
|
||||||
# Could include dubious clacks or admin dogwhistles
|
# Could include dubious clacks or admin dogwhistles
|
||||||
|
@ -1798,7 +1809,7 @@ def delete_post(base_dir: str, http_prefix: str,
|
||||||
str(post_filename))
|
str(post_filename))
|
||||||
|
|
||||||
|
|
||||||
def is_valid_language(text: str) -> bool:
|
def _is_valid_language(text: str) -> bool:
|
||||||
"""Returns true if the given text contains a valid
|
"""Returns true if the given text contains a valid
|
||||||
natural language string
|
natural language string
|
||||||
"""
|
"""
|
||||||
|
@ -1900,7 +1911,7 @@ def valid_nickname(domain: str, nickname: str) -> bool:
|
||||||
return False
|
return False
|
||||||
if len(nickname) > 30:
|
if len(nickname) > 30:
|
||||||
return False
|
return False
|
||||||
if not is_valid_language(nickname):
|
if not _is_valid_language(nickname):
|
||||||
return False
|
return False
|
||||||
forbidden_chars = ('.', ' ', '/', '?', ':', ';', '@', '#', '!')
|
forbidden_chars = ('.', ' ', '/', '?', ':', ';', '@', '#', '!')
|
||||||
for char in forbidden_chars:
|
for char in forbidden_chars:
|
||||||
|
@ -3288,3 +3299,16 @@ def get_fav_filename_from_url(base_dir: str, favicon_url: str) -> str:
|
||||||
if '/favicon.' in favicon_url:
|
if '/favicon.' in favicon_url:
|
||||||
favicon_url = favicon_url.replace('/favicon.', '.')
|
favicon_url = favicon_url.replace('/favicon.', '.')
|
||||||
return base_dir + '/favicons/' + favicon_url.replace('/', '-')
|
return base_dir + '/favicons/' + favicon_url.replace('/', '-')
|
||||||
|
|
||||||
|
|
||||||
|
def valid_hash_tag(hashtag: str) -> bool:
    """Returns true if the given hashtag contains valid characters

    hashtag is the tag text to be checked.
    An empty tag, or one of 32 or more characters, is never valid.
    """
    # an empty string is not a hashtag. Without this check the
    # issubset test below would accept it, because the empty set is
    # a subset of every set
    if not hashtag:
        return False
    # long hashtags are not valid
    if len(hashtag) >= 32:
        return False
    # every character is within the permitted set
    if set(hashtag).issubset(VALID_HASHTAG_CHARS):
        return True
    # otherwise accept whatever _is_valid_language accepts
    if _is_valid_language(hashtag):
        return True
    return False
|
||||||
|
|
Loading…
Reference in New Issue