epicyon/content.py

1390 lines
50 KiB
Python
Raw Normal View History

2020-04-02 09:56:17 +00:00
__filename__ = "content.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2022-02-03 13:58:20 +00:00
__version__ = "1.3.0"
2020-04-02 09:56:17 +00:00
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
2020-04-02 09:56:17 +00:00
__status__ = "Production"
2021-06-25 16:10:09 +00:00
__module_group__ = "Core"
2019-07-15 14:11:31 +00:00
import math
2022-03-24 15:15:53 +00:00
import html
2019-07-15 14:11:31 +00:00
import os
2019-11-10 11:37:24 +00:00
import email.parser
import urllib.parse
2019-08-11 16:55:22 +00:00
from shutil import copyfile
2022-01-13 15:10:41 +00:00
from utils import valid_hash_tag
2021-12-27 21:44:48 +00:00
from utils import dangerous_svg
2021-12-26 18:17:37 +00:00
from utils import remove_domain_port
2021-12-26 14:26:16 +00:00
from utils import get_image_extensions
2021-12-26 15:13:34 +00:00
from utils import load_json
2021-12-26 14:47:21 +00:00
from utils import save_json
2021-12-28 14:01:37 +00:00
from utils import file_last_modified
2021-12-27 17:32:34 +00:00
from utils import get_link_prefixes
2021-12-27 21:42:08 +00:00
from utils import dangerous_markup
2021-12-26 19:15:36 +00:00
from utils import is_pgp_encrypted
from utils import contains_pgp_public_key
2021-12-26 12:02:29 +00:00
from utils import acct_dir
2021-12-26 18:03:39 +00:00
from utils import is_float
2021-12-26 17:29:09 +00:00
from utils import get_currencies
2021-12-27 15:43:22 +00:00
from utils import remove_html
2021-12-29 21:55:09 +00:00
from petnames import get_pet_name
from session import download_image
2019-07-15 14:11:31 +00:00
2021-12-30 20:24:05 +00:00
MUSIC_SITES = ('soundcloud.com', 'bandcamp.com')
MAX_LINK_LENGTH = 40
REMOVE_MARKUP = (
'b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5'
)
INVALID_CONTENT_STRINGS = (
'mute', 'unmute', 'editeventpost', 'notifypost',
'delete', 'options', 'page', 'repeat',
'bm', 'tl', 'actor', 'unrepeat', 'eventid',
'unannounce', 'like', 'unlike', 'bookmark',
'unbookmark', 'likedBy', 'time',
'year', 'month', 'day', 'editnewpost',
'graph', 'showshare', 'category', 'showwanted',
'rmshare', 'rmwanted', 'repeatprivate',
'unrepeatprivate', 'replyto',
'replyfollowers', 'replydm', 'replychat', 'editblogpost',
2021-12-30 20:24:05 +00:00
'handle', 'blockdomain'
)
def remove_html_tag(html_str: str, tag: str) -> str:
2020-10-11 09:33:31 +00:00
"""Removes a given tag from a html string
"""
2021-12-30 20:24:05 +00:00
tag_found = True
while tag_found:
match_str = ' ' + tag + '="'
if match_str not in html_str:
tag_found = False
2020-10-11 09:33:31 +00:00
break
2021-12-30 20:24:05 +00:00
sections = html_str.split(match_str, 1)
2020-10-11 09:33:31 +00:00
if '"' not in sections[1]:
2021-12-30 20:24:05 +00:00
tag_found = False
2020-10-11 09:33:31 +00:00
break
2021-12-30 20:24:05 +00:00
html_str = sections[0] + sections[1].split('"', 1)[1]
return html_str
2020-10-11 09:33:31 +00:00
2021-12-29 21:55:09 +00:00
def _remove_quotes_within_quotes(content: str) -> str:
2020-09-30 22:52:39 +00:00
"""Removes any blockquote inside blockquote
"""
if '<blockquote>' not in content:
return content
if '</blockquote>' not in content:
return content
ctr = 1
found = True
while found:
prefix = content.split('<blockquote>', ctr)[0] + '<blockquote>'
2021-12-30 20:24:05 +00:00
quoted_str = content.split('<blockquote>', ctr)[1]
if '</blockquote>' not in quoted_str:
2020-09-30 22:52:39 +00:00
found = False
else:
2021-12-30 20:24:05 +00:00
end_str = quoted_str.split('</blockquote>')[1]
quoted_str = quoted_str.split('</blockquote>')[0]
if '<blockquote>' not in end_str:
2020-09-30 22:52:39 +00:00
found = False
2021-12-30 20:24:05 +00:00
if '<blockquote>' in quoted_str:
quoted_str = quoted_str.replace('<blockquote>', '')
content = prefix + quoted_str + '</blockquote>' + end_str
2020-09-30 22:52:39 +00:00
ctr += 1
return content
2020-04-02 09:56:17 +00:00
2021-12-29 21:55:09 +00:00
def html_replace_email_quote(content: str) -> str:
"""Replaces an email style quote "> Some quote" with html blockquote
"""
2021-12-26 19:15:36 +00:00
if is_pgp_encrypted(content) or contains_pgp_public_key(content):
2021-03-11 17:15:32 +00:00
return content
2020-09-14 11:30:56 +00:00
# replace quote paragraph
if '<p>&quot;' in content:
if '&quot;</p>' in content:
2020-10-30 12:10:57 +00:00
if content.count('<p>&quot;') == content.count('&quot;</p>'):
content = content.replace('<p>&quot;', '<p><blockquote>')
content = content.replace('&quot;</p>', '</blockquote></p>')
2020-09-14 12:17:11 +00:00
if '>\u201c' in content:
if '\u201d<' in content:
2020-10-30 12:10:57 +00:00
if content.count('>\u201c') == content.count('\u201d<'):
2020-10-30 12:12:09 +00:00
content = content.replace('>\u201c', '><blockquote>')
content = content.replace('\u201d<', '</blockquote><')
2020-09-14 11:30:56 +00:00
# replace email style quote
if '>&gt; ' not in content:
return content
2021-12-30 20:24:05 +00:00
content_str = content.replace('<p>', '')
content_lines = content_str.split('</p>')
new_content = ''
for line_str in content_lines:
if not line_str:
continue
2021-12-30 20:24:05 +00:00
if '>&gt; ' not in line_str:
if line_str.startswith('&gt; '):
line_str = line_str.replace('&gt; ', '<blockquote>')
line_str = line_str.replace('&gt;', '<br>')
new_content += '<p>' + line_str + '</blockquote></p>'
2020-09-14 10:25:12 +00:00
else:
2021-12-30 20:24:05 +00:00
new_content += '<p>' + line_str + '</p>'
else:
2021-12-30 20:24:05 +00:00
line_str = line_str.replace('>&gt; ', '><blockquote>')
if line_str.startswith('&gt;'):
line_str = line_str.replace('&gt;', '<blockquote>', 1)
2020-09-30 22:52:39 +00:00
else:
2021-12-30 20:24:05 +00:00
line_str = line_str.replace('&gt;', '<br>')
new_content += '<p>' + line_str + '</blockquote></p>'
return _remove_quotes_within_quotes(new_content)
2021-12-29 21:55:09 +00:00
def html_replace_quote_marks(content: str) -> str:
2020-08-02 17:01:12 +00:00
"""Replaces quotes with html formatting
"hello" becomes <q>hello</q>
"""
2021-12-26 19:15:36 +00:00
if is_pgp_encrypted(content) or contains_pgp_public_key(content):
2021-03-11 17:15:32 +00:00
return content
2020-08-02 17:01:12 +00:00
if '"' not in content:
2020-08-03 17:03:30 +00:00
if '&quot;' not in content:
return content
# only if there are a few quote marks
if content.count('"') > 4:
return content
if content.count('&quot;') > 4:
return content
2020-08-02 17:01:12 +00:00
2021-12-30 20:24:05 +00:00
new_content = content
2020-08-03 17:03:30 +00:00
if '"' in content:
sections = content.split('"')
if len(sections) > 1:
2021-12-30 20:24:05 +00:00
new_content = ''
open_quote = True
2020-08-02 17:17:51 +00:00
markup = False
2021-12-30 20:24:05 +00:00
for char in content:
curr_char = char
if char == '<':
2020-08-03 17:03:30 +00:00
markup = True
2021-12-30 20:24:05 +00:00
elif char == '>':
2020-08-03 17:03:30 +00:00
markup = False
2021-12-30 20:24:05 +00:00
elif char == '"' and not markup:
if open_quote:
curr_char = ''
2020-08-03 17:03:30 +00:00
else:
2021-12-30 20:24:05 +00:00
curr_char = ''
open_quote = not open_quote
new_content += curr_char
if '&quot;' in new_content:
open_quote = True
content = new_content
new_content = ''
2020-08-02 19:16:22 +00:00
ctr = 0
sections = content.split('&quot;')
2021-12-30 20:24:05 +00:00
no_of_sections = len(sections)
for sec in sections:
new_content += sec
if ctr < no_of_sections - 1:
if open_quote:
new_content += ''
2020-08-02 19:16:22 +00:00
else:
2021-12-30 20:24:05 +00:00
new_content += ''
open_quote = not open_quote
2020-08-02 19:16:22 +00:00
ctr += 1
2021-12-30 20:24:05 +00:00
return new_content
2020-08-02 17:01:12 +00:00
2021-12-29 21:55:09 +00:00
def dangerous_css(filename: str, allow_local_network_access: bool) -> bool:
2020-11-15 11:01:05 +00:00
"""Returns true is the css file contains code which
can create security problems
"""
if not os.path.isfile(filename):
return False
2021-11-26 12:28:20 +00:00
content = None
try:
2021-12-30 20:24:05 +00:00
with open(filename, 'r') as css_file:
content = css_file.read().lower()
2021-11-26 12:28:20 +00:00
except OSError:
print('EX: unable to read css file ' + filename)
2021-12-30 20:24:05 +00:00
if not content:
return False
2020-11-15 11:01:05 +00:00
2021-12-30 20:24:05 +00:00
css_matches = (
'behavior:', ':expression', '?php', '.php',
'google', 'regexp', 'localhost',
'127.0.', '192.168', '10.0.', '@import'
)
for cssmatch in css_matches:
if cssmatch in content:
2020-11-15 11:01:05 +00:00
return True
2021-12-30 20:24:05 +00:00
# search for non-local web links
if 'url(' in content:
url_list = content.split('url(')
ctr = 0
for url_str in url_list:
if ctr > 0:
if ')' in url_str:
url_str = url_str.split(')')[0]
if 'http' in url_str:
print('ERROR: non-local web link in CSS ' +
filename)
return True
ctr += 1
# an attacker can include html inside of the css
# file as a comment and this may then be run from the html
if dangerous_markup(content, allow_local_network_access):
return True
2020-11-15 11:01:05 +00:00
return False
2021-12-29 21:55:09 +00:00
def switch_words(base_dir: str, nickname: str, domain: str, content: str,
rules: [] = []) -> str:
2020-02-19 18:51:08 +00:00
"""Performs word replacements. eg. Trump -> The Orange Menace
"""
2021-12-26 19:15:36 +00:00
if is_pgp_encrypted(content) or contains_pgp_public_key(content):
2021-03-11 17:15:32 +00:00
return content
2021-07-06 16:29:03 +00:00
if not rules:
2021-12-30 12:23:55 +00:00
switch_words_filename = \
2021-12-26 12:02:29 +00:00
acct_dir(base_dir, nickname, domain) + '/replacewords.txt'
2021-12-30 12:23:55 +00:00
if not os.path.isfile(switch_words_filename):
2021-07-06 16:29:03 +00:00
return content
2021-11-26 12:28:20 +00:00
try:
2021-12-30 20:24:05 +00:00
with open(switch_words_filename, 'r') as words_file:
rules = words_file.readlines()
2021-11-26 12:28:20 +00:00
except OSError:
2021-12-30 12:23:55 +00:00
print('EX: unable to read switches ' + switch_words_filename)
2021-07-06 16:29:03 +00:00
for line in rules:
2021-12-30 20:24:05 +00:00
replace_str = line.replace('\n', '').replace('\r', '')
2021-07-06 16:29:03 +00:00
splitters = ('->', ':', ',', ';', '-')
2021-12-30 20:24:05 +00:00
word_transform = None
for split_str in splitters:
if split_str in replace_str:
word_transform = replace_str.split(split_str)
2021-07-06 16:29:03 +00:00
break
2021-12-30 20:24:05 +00:00
if not word_transform:
2021-07-06 16:29:03 +00:00
continue
2021-12-30 20:24:05 +00:00
if len(word_transform) == 2:
replace_str1 = word_transform[0].strip().replace('"', '')
replace_str2 = word_transform[1].strip().replace('"', '')
content = content.replace(replace_str1, replace_str2)
2020-02-19 18:51:08 +00:00
return content
2020-04-02 09:56:17 +00:00
2021-12-29 21:55:09 +00:00
def _save_custom_emoji(session, base_dir: str, emojiName: str, url: str,
debug: bool) -> None:
2021-11-01 17:12:17 +00:00
"""Saves custom emoji to file
"""
if not session:
2021-11-01 17:50:38 +00:00
if debug:
2021-12-29 21:55:09 +00:00
print('EX: _save_custom_emoji no session')
2021-11-01 17:12:17 +00:00
return
if '.' not in url:
return
ext = url.split('.')[-1]
if ext != 'png':
2021-11-01 17:50:38 +00:00
if debug:
2021-11-01 18:33:32 +00:00
print('EX: Custom emoji is wrong format ' + url)
2021-11-01 17:12:17 +00:00
return
2021-11-01 20:12:04 +00:00
emojiName = emojiName.replace(':', '').strip().lower()
2021-12-30 20:24:05 +00:00
custom_emoji_dir = base_dir + '/emojicustom'
if not os.path.isdir(custom_emoji_dir):
os.mkdir(custom_emoji_dir)
emoji_image_filename = custom_emoji_dir + '/' + emojiName + '.' + ext
2021-12-29 21:55:09 +00:00
if not download_image(session, base_dir, url,
2021-12-30 20:24:05 +00:00
emoji_image_filename, debug, False):
2021-11-01 18:33:32 +00:00
if debug:
print('EX: custom emoji not downloaded ' + url)
2021-11-01 17:12:17 +00:00
return
2021-12-30 20:24:05 +00:00
emoji_json_filename = custom_emoji_dir + '/emoji.json'
emoji_json = {}
if os.path.isfile(emoji_json_filename):
emoji_json = load_json(emoji_json_filename, 0, 1)
if not emoji_json:
emoji_json = {}
if not emoji_json.get(emojiName):
emoji_json[emojiName] = emojiName
save_json(emoji_json, emoji_json_filename)
2021-11-01 17:50:38 +00:00
if debug:
2021-12-30 20:24:05 +00:00
print('EX: Saved custom emoji ' + emoji_json_filename)
2021-11-01 18:33:32 +00:00
elif debug:
print('EX: cusom emoji already saved')
2021-11-01 17:12:17 +00:00
2021-12-29 21:55:09 +00:00
def replace_emoji_from_tags(session, base_dir: str,
2021-12-30 20:24:05 +00:00
content: str, tag: [], message_type: str,
2021-12-29 21:55:09 +00:00
debug: bool) -> str:
2019-09-29 16:28:02 +00:00
"""Uses the tags to replace :emoji: with html image markup
"""
2021-12-30 20:24:05 +00:00
for tag_item in tag:
if not tag_item.get('type'):
2019-09-29 17:20:10 +00:00
continue
2021-12-30 20:24:05 +00:00
if tag_item['type'] != 'Emoji':
2019-09-29 17:20:10 +00:00
continue
2021-12-30 20:24:05 +00:00
if not tag_item.get('name'):
2019-09-29 16:28:02 +00:00
continue
2021-12-30 20:24:05 +00:00
if not tag_item.get('icon'):
2019-09-29 16:28:02 +00:00
continue
2021-12-30 20:24:05 +00:00
if not tag_item['icon'].get('url'):
2019-09-29 16:28:02 +00:00
continue
2021-12-30 20:24:05 +00:00
if '/' not in tag_item['icon']['url']:
2020-02-21 15:09:31 +00:00
continue
2021-12-30 20:24:05 +00:00
if tag_item['name'] not in content:
2019-09-29 16:28:02 +00:00
continue
2021-12-30 20:24:05 +00:00
icon_name = tag_item['icon']['url'].split('/')[-1]
if icon_name:
if len(icon_name) > 1:
if icon_name[0].isdigit():
if '.' in icon_name:
icon_name = icon_name.split('.')[0]
2020-04-02 09:56:17 +00:00
# see https://unicode.org/
# emoji/charts/full-emoji-list.html
2021-12-30 20:24:05 +00:00
if '-' not in icon_name:
2020-02-21 21:08:24 +00:00
# a single code
2021-11-01 17:23:39 +00:00
replaced = False
2020-02-21 21:08:24 +00:00
try:
2021-12-30 20:24:05 +00:00
replace_char = chr(int("0x" + icon_name, 16))
content = content.replace(tag_item['name'],
replace_char)
2021-11-01 17:12:17 +00:00
replaced = True
2020-04-02 09:56:17 +00:00
except BaseException:
2021-12-29 21:55:09 +00:00
print('EX: replace_emoji_from_tags 1 ' +
2021-11-01 11:35:15 +00:00
'no conversion of ' +
2021-12-30 20:24:05 +00:00
str(icon_name) + ' to chr ' +
tag_item['name'] + ' ' +
tag_item['icon']['url'])
2021-11-01 17:12:17 +00:00
if not replaced:
2021-12-29 21:55:09 +00:00
_save_custom_emoji(session, base_dir,
2021-12-30 20:24:05 +00:00
tag_item['name'],
tag_item['icon']['url'],
2021-12-29 21:55:09 +00:00
debug)
2020-02-21 21:08:24 +00:00
else:
# sequence of codes
2021-12-30 20:24:05 +00:00
icon_codes = icon_name.split('-')
icon_code_sequence = ''
for icode in icon_codes:
2021-11-01 17:23:39 +00:00
replaced = False
2020-02-21 21:08:24 +00:00
try:
2021-12-30 20:24:05 +00:00
icon_code_sequence += chr(int("0x" +
icode, 16))
2021-11-01 17:12:17 +00:00
replaced = True
2020-04-02 09:56:17 +00:00
except BaseException:
2021-12-30 20:24:05 +00:00
icon_code_sequence = ''
2021-12-29 21:55:09 +00:00
print('EX: replace_emoji_from_tags 2 ' +
2021-11-01 11:35:15 +00:00
'no conversion of ' +
str(icode) + ' to chr ' +
2021-12-30 20:24:05 +00:00
tag_item['name'] + ' ' +
tag_item['icon']['url'])
2021-11-01 17:12:17 +00:00
if not replaced:
2021-12-29 21:55:09 +00:00
_save_custom_emoji(session, base_dir,
2021-12-30 20:24:05 +00:00
tag_item['name'],
tag_item['icon']['url'],
2021-12-29 21:55:09 +00:00
debug)
2021-12-30 20:24:05 +00:00
if icon_code_sequence:
content = content.replace(tag_item['name'],
icon_code_sequence)
html_class = 'emoji'
if message_type == 'post header':
html_class = 'emojiheader'
if message_type == 'profile':
html_class = 'emojiprofile'
emoji_html = "<img src=\"" + tag_item['icon']['url'] + "\" alt=\"" + \
tag_item['name'].replace(':', '') + \
"\" align=\"middle\" class=\"" + html_class + "\"/>"
content = content.replace(tag_item['name'], emoji_html)
2019-09-29 16:28:02 +00:00
return content
2020-02-21 15:09:31 +00:00
2021-12-29 21:55:09 +00:00
def _add_music_tag(content: str, tag: str) -> str:
2020-03-29 09:59:54 +00:00
"""If a music link is found then ensure that the post is
tagged appropriately
2019-09-05 09:54:27 +00:00
"""
2020-10-11 09:50:17 +00:00
if '#podcast' in content or '#documentary' in content:
return content
2019-09-05 09:54:27 +00:00
if '#' not in tag:
2020-10-11 09:50:17 +00:00
tag = '#' + tag
2019-09-05 09:54:27 +00:00
if tag in content:
return content
2021-12-30 20:24:05 +00:00
music_site_found = False
for site in MUSIC_SITES:
2021-06-22 12:42:52 +00:00
if site + '/' in content:
2021-12-30 20:24:05 +00:00
music_site_found = True
2019-09-05 09:54:27 +00:00
break
2021-12-30 20:24:05 +00:00
if not music_site_found:
2019-09-05 09:54:27 +00:00
return content
2020-04-02 09:56:17 +00:00
return ':music: ' + content + ' ' + tag + ' '
2019-09-05 09:54:27 +00:00
2021-12-29 21:55:09 +00:00
def add_web_links(content: str) -> str:
2019-08-21 12:07:30 +00:00
"""Adds markup for web links
"""
2020-06-11 09:43:48 +00:00
if ':' not in content:
return content
2021-12-27 17:32:34 +00:00
prefixes = get_link_prefixes()
2020-06-11 11:56:08 +00:00
# do any of these prefixes exist within the content?
2021-12-30 20:24:05 +00:00
prefix_found = False
2020-06-11 11:56:08 +00:00
for prefix in prefixes:
if prefix in content:
2021-12-30 20:24:05 +00:00
prefix_found = True
2020-06-11 11:56:08 +00:00
break
# if there are no prefixes then just keep the content we have
2021-12-30 20:24:05 +00:00
if not prefix_found:
2019-08-21 12:07:30 +00:00
return content
2020-05-22 11:32:38 +00:00
content = content.replace('\r', '')
2020-04-02 09:56:17 +00:00
words = content.replace('\n', ' --linebreak-- ').split(' ')
2021-12-30 20:24:05 +00:00
replace_dict = {}
for wrd in words:
if ':' not in wrd:
2020-06-11 09:43:48 +00:00
continue
2020-06-11 11:56:08 +00:00
# does the word begin with a prefix?
2021-12-30 20:24:05 +00:00
prefix_found = False
2020-06-11 11:56:08 +00:00
for prefix in prefixes:
2021-12-30 20:24:05 +00:00
if wrd.startswith(prefix):
prefix_found = True
2020-06-11 11:56:08 +00:00
break
2021-12-30 20:24:05 +00:00
if not prefix_found:
2020-06-11 11:56:08 +00:00
continue
# the word contains a prefix
2021-12-30 20:24:05 +00:00
if wrd.endswith('.') or wrd.endswith(';'):
wrd = wrd[:-1]
markup = '<a href="' + wrd + \
2020-12-11 10:14:58 +00:00
'" rel="nofollow noopener noreferrer" target="_blank">'
2020-06-11 11:56:08 +00:00
for prefix in prefixes:
2021-12-30 20:24:05 +00:00
if wrd.startswith(prefix):
2020-06-11 11:56:08 +00:00
markup += '<span class="invisible">' + prefix + '</span>'
break
2021-12-30 20:24:05 +00:00
link_text = wrd
2020-06-11 11:56:08 +00:00
for prefix in prefixes:
2021-12-30 20:24:05 +00:00
link_text = link_text.replace(prefix, '')
2020-06-11 11:56:08 +00:00
# prevent links from becoming too long
2021-12-30 20:24:05 +00:00
if len(link_text) > MAX_LINK_LENGTH:
2020-06-11 11:56:08 +00:00
markup += '<span class="ellipsis">' + \
2021-12-30 20:24:05 +00:00
link_text[:MAX_LINK_LENGTH] + '</span>'
2020-06-11 11:56:08 +00:00
markup += '<span class="invisible">' + \
2021-12-30 20:24:05 +00:00
link_text[MAX_LINK_LENGTH:] + '</span></a>'
2020-06-11 11:56:08 +00:00
else:
2021-12-30 20:24:05 +00:00
markup += '<span class="ellipsis">' + link_text + '</span></a>'
replace_dict[wrd] = markup
2020-06-11 11:56:08 +00:00
# do the replacements
2021-12-30 20:24:05 +00:00
for url, markup in replace_dict.items():
2020-04-02 09:56:17 +00:00
content = content.replace(url, markup)
2020-06-11 11:56:08 +00:00
# replace any line breaks
2020-04-02 09:56:17 +00:00
content = content.replace(' --linebreak-- ', '<br>')
2020-06-11 11:56:08 +00:00
2019-08-21 12:07:30 +00:00
return content
2020-04-02 09:56:17 +00:00
2022-01-14 10:20:37 +00:00
def safe_web_text(arbitrary_html: str) -> str:
"""Turns arbitrary html into something safe.
So if the arbitrary html contains attack scripts those will be removed
"""
# first remove the markup, so that we have something safe
safe_text = remove_html(arbitrary_html)
if not safe_text:
return ''
# remove any spurious characters found in podcast descriptions
2022-01-14 19:05:26 +00:00
remove_chars = ('Œ', 'â€', 'ğŸ', '<EFBFBD>', ']]', '__')
2022-01-14 10:20:37 +00:00
for remchar in remove_chars:
safe_text = safe_text.replace(remchar, '')
# recreate any url links safely
return add_web_links(safe_text)
2021-12-30 20:24:05 +00:00
def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
replace_hashtags: {}, post_hashtags: {}) -> bool:
2019-08-09 11:12:08 +00:00
"""Detects hashtags and adds them to the replacements dict
Also updates the hashtags list to be added to the post
"""
2021-12-30 20:24:05 +00:00
if replace_hashtags.get(word_str):
2020-04-02 09:56:17 +00:00
return True
2021-12-30 20:24:05 +00:00
hashtag = word_str[1:]
2021-12-29 21:55:09 +00:00
if not valid_hash_tag(hashtag):
2019-08-09 11:12:08 +00:00
return False
2021-12-30 20:24:05 +00:00
hashtag_url = http_prefix + "://" + domain + "/tags/" + hashtag
post_hashtags[hashtag] = {
'href': hashtag_url,
2020-10-16 20:13:23 +00:00
'name': '#' + hashtag,
2019-08-09 11:12:08 +00:00
'type': 'Hashtag'
}
2021-12-30 20:24:05 +00:00
replace_hashtags[word_str] = "<a href=\"" + hashtag_url + \
2020-04-02 09:56:17 +00:00
"\" class=\"mention hashtag\" rel=\"tag\">#<span>" + \
hashtag + "</span></a>"
2019-08-09 11:12:08 +00:00
return True
2020-04-02 09:56:17 +00:00
2021-12-30 20:24:05 +00:00
def _add_emoji(base_dir: str, word_str: str,
2021-12-29 21:55:09 +00:00
http_prefix: str, domain: str,
2021-12-30 20:24:05 +00:00
replace_emoji: {}, post_tags: {},
emoji_dict: {}) -> bool:
2019-08-09 16:18:00 +00:00
"""Detects Emoji and adds them to the replacements dict
Also updates the tags list to be added to the post
"""
2021-12-30 20:24:05 +00:00
if not word_str.startswith(':'):
2019-08-09 16:18:00 +00:00
return False
2021-12-30 20:24:05 +00:00
if not word_str.endswith(':'):
2019-08-09 16:18:00 +00:00
return False
2021-12-30 20:24:05 +00:00
if len(word_str) < 3:
2019-08-09 16:18:00 +00:00
return False
2021-12-30 20:24:05 +00:00
if replace_emoji.get(word_str):
2020-04-02 09:56:17 +00:00
return True
2019-09-23 11:11:13 +00:00
# remove leading and trailing : characters
2021-12-30 20:24:05 +00:00
emoji = word_str[1:]
2020-04-02 09:56:17 +00:00
emoji = emoji[:-1]
2019-09-23 11:11:13 +00:00
# is the text of the emoji valid?
2021-12-29 21:55:09 +00:00
if not valid_hash_tag(emoji):
2019-08-09 16:18:00 +00:00
return False
2021-12-30 20:24:05 +00:00
if not emoji_dict.get(emoji):
2019-08-09 16:18:00 +00:00
return False
2021-12-30 20:24:05 +00:00
emoji_filename = base_dir + '/emoji/' + emoji_dict[emoji] + '.png'
if not os.path.isfile(emoji_filename):
2019-08-09 16:18:00 +00:00
return False
2021-12-30 20:24:05 +00:00
emoji_url = http_prefix + "://" + domain + \
"/emoji/" + emoji_dict[emoji] + '.png'
post_tags[emoji] = {
2019-08-19 13:35:55 +00:00
'icon': {
'mediaType': 'image/png',
'type': 'Image',
2021-12-30 20:24:05 +00:00
'url': emoji_url
2019-08-19 13:35:55 +00:00
},
2021-06-22 12:42:52 +00:00
'name': ':' + emoji + ':',
2021-12-30 20:24:05 +00:00
"updated": file_last_modified(emoji_filename),
"id": emoji_url.replace('.png', ''),
2019-08-09 16:18:00 +00:00
'type': 'Emoji'
}
return True
2020-04-02 09:56:17 +00:00
2021-12-29 21:55:09 +00:00
def post_tag_exists(tagType: str, tagName: str, tags: {}) -> bool:
2020-12-13 20:07:45 +00:00
"""Returns true if a tag exists in the given dict
"""
for tag in tags:
if tag['name'] == tagName and tag['type'] == tagType:
return True
return False
2021-12-30 20:24:05 +00:00
def _add_mention(word_str: str, http_prefix: str, following: str,
petnames: str, replace_mentions: {},
recipients: [], tags: {}) -> bool:
2020-03-29 09:59:54 +00:00
"""Detects mentions and adds them to the replacements dict and
recipients list
2019-08-09 09:09:21 +00:00
"""
2021-12-30 20:24:05 +00:00
possible_handle = word_str[1:]
2019-08-19 10:05:50 +00:00
# @nick
2021-12-30 20:24:05 +00:00
if following and '@' not in possible_handle:
2019-08-09 09:48:51 +00:00
# fall back to a best effort match against the following list
# if no domain was specified. eg. @nick
2021-12-30 20:24:05 +00:00
possible_nickname = possible_handle
2019-08-09 09:48:51 +00:00
for follow in following:
2021-01-29 21:33:23 +00:00
if '@' not in follow:
continue
2021-12-30 20:24:05 +00:00
follow_nick = follow.split('@')[0]
if possible_nickname == follow_nick:
follow_str = follow.replace('\n', '').replace('\r', '')
replace_domain = follow_str.split('@')[1]
recipient_actor = http_prefix + "://" + \
replace_domain + "/@" + possible_nickname
if recipient_actor not in recipients:
recipients.append(recipient_actor)
tags[word_str] = {
'href': recipient_actor,
'name': word_str,
2019-08-19 12:13:18 +00:00
'type': 'Mention'
}
2021-12-30 20:24:05 +00:00
replace_mentions[word_str] = \
2021-12-25 17:09:22 +00:00
"<span class=\"h-card\"><a href=\"" + http_prefix + \
2021-12-30 20:24:05 +00:00
"://" + replace_domain + "/@" + possible_nickname + \
"\" class=\"u-url mention\">@<span>" + \
possible_nickname + "</span></a></span>"
2019-08-09 09:48:51 +00:00
return True
2021-01-29 21:33:23 +00:00
# try replacing petnames with mentions
2021-12-30 20:24:05 +00:00
follow_ctr = 0
2021-01-29 21:33:23 +00:00
for follow in following:
if '@' not in follow:
2021-12-30 20:24:05 +00:00
follow_ctr += 1
2021-01-29 21:33:23 +00:00
continue
2021-12-30 20:24:05 +00:00
pet = petnames[follow_ctr].replace('\n', '')
2021-01-29 21:33:23 +00:00
if pet:
2021-12-30 20:24:05 +00:00
if possible_nickname == pet:
follow_str = follow.replace('\n', '').replace('\r', '')
replace_nickname = follow_str.split('@')[0]
replace_domain = follow_str.split('@')[1]
recipient_actor = http_prefix + "://" + \
replace_domain + "/@" + replace_nickname
if recipient_actor not in recipients:
recipients.append(recipient_actor)
tags[word_str] = {
'href': recipient_actor,
'name': word_str,
2021-01-29 21:33:23 +00:00
'type': 'Mention'
}
2021-12-30 20:24:05 +00:00
replace_mentions[word_str] = \
2021-12-25 17:09:22 +00:00
"<span class=\"h-card\"><a href=\"" + http_prefix + \
2021-12-30 20:24:05 +00:00
"://" + replace_domain + "/@" + replace_nickname + \
2021-01-29 21:33:23 +00:00
"\" class=\"u-url mention\">@<span>" + \
2021-12-30 20:24:05 +00:00
replace_nickname + "</span></a></span>"
2021-01-29 21:33:23 +00:00
return True
2021-12-30 20:24:05 +00:00
follow_ctr += 1
2019-08-09 09:48:51 +00:00
return False
2021-12-30 20:24:05 +00:00
possible_nickname = None
possible_domain = None
if '@' not in possible_handle:
2019-10-29 20:15:21 +00:00
return False
2021-12-30 20:24:05 +00:00
possible_nickname = possible_handle.split('@')[0]
if not possible_nickname:
2019-10-29 20:15:21 +00:00
return False
2021-12-30 20:24:05 +00:00
possible_domain = \
possible_handle.split('@')[1].strip('\n').strip('\r')
if not possible_domain:
2019-10-29 20:15:21 +00:00
return False
2019-08-19 11:41:15 +00:00
if following:
for follow in following:
2021-12-30 20:24:05 +00:00
if follow.replace('\n', '').replace('\r', '') != possible_handle:
2019-08-19 11:41:15 +00:00
continue
2021-12-30 20:24:05 +00:00
recipient_actor = http_prefix + "://" + \
possible_domain + "/@" + possible_nickname
if recipient_actor not in recipients:
recipients.append(recipient_actor)
tags[word_str] = {
'href': recipient_actor,
'name': word_str,
2019-08-19 12:13:18 +00:00
'type': 'Mention'
}
2021-12-30 20:24:05 +00:00
replace_mentions[word_str] = \
2021-12-25 17:09:22 +00:00
"<span class=\"h-card\"><a href=\"" + http_prefix + \
2021-12-30 20:24:05 +00:00
"://" + possible_domain + "/@" + possible_nickname + \
"\" class=\"u-url mention\">@<span>" + possible_nickname + \
2020-03-29 09:59:54 +00:00
"</span></a></span>"
2019-08-19 11:41:15 +00:00
return True
2019-08-19 10:05:50 +00:00
# @nick@domain
2021-12-30 20:24:05 +00:00
if not (possible_domain == 'localhost' or '.' in possible_domain):
2020-03-22 21:16:02 +00:00
return False
2021-12-30 20:24:05 +00:00
recipient_actor = http_prefix + "://" + \
possible_domain + "/@" + possible_nickname
if recipient_actor not in recipients:
recipients.append(recipient_actor)
tags[word_str] = {
'href': recipient_actor,
'name': word_str,
2019-10-29 20:15:21 +00:00
'type': 'Mention'
}
2021-12-30 20:24:05 +00:00
replace_mentions[word_str] = \
2021-12-25 17:09:22 +00:00
"<span class=\"h-card\"><a href=\"" + http_prefix + \
2021-12-30 20:24:05 +00:00
"://" + possible_domain + "/@" + possible_nickname + \
"\" class=\"u-url mention\">@<span>" + possible_nickname + \
2020-03-29 09:59:54 +00:00
"</span></a></span>"
2019-10-29 20:15:21 +00:00
return True
2019-08-09 09:09:21 +00:00
2020-04-02 09:56:17 +00:00
2021-12-29 21:55:09 +00:00
def replace_content_duplicates(content: str) -> str:
2020-05-12 09:34:58 +00:00
"""Replaces invalid duplicates within content
"""
2021-12-26 19:15:36 +00:00
if is_pgp_encrypted(content) or contains_pgp_public_key(content):
2021-03-11 17:15:32 +00:00
return content
2020-05-12 09:34:58 +00:00
while '<<' in content:
content = content.replace('<<', '<')
while '>>' in content:
content = content.replace('>>', '>')
content = content.replace('<\\p>', '')
2020-05-12 09:34:58 +00:00
return content
2022-03-24 14:40:28 +00:00
def remove_text_formatting(content: str, bold_reading: bool) -> str:
"""Removes markup for bold, italics, etc
"""
2021-12-26 19:15:36 +00:00
if is_pgp_encrypted(content) or contains_pgp_public_key(content):
2021-03-11 17:15:32 +00:00
return content
if '<' not in content:
return content
2021-12-30 20:24:05 +00:00
for markup in REMOVE_MARKUP:
2022-03-24 14:40:28 +00:00
if bold_reading:
if markup == 'b':
continue
2020-06-14 13:39:03 +00:00
content = content.replace('<' + markup + '>', '')
content = content.replace('</' + markup + '>', '')
content = content.replace('<' + markup.upper() + '>', '')
content = content.replace('</' + markup.upper() + '>', '')
return content
2021-12-30 20:24:05 +00:00
def remove_long_words(content: str, max_word_length: int,
long_words_list: []) -> str:
2020-03-29 09:59:54 +00:00
"""Breaks up long words so that on mobile screens this doesn't
disrupt the layout
2019-10-09 12:19:17 +00:00
"""
2021-12-26 19:15:36 +00:00
if is_pgp_encrypted(content) or contains_pgp_public_key(content):
2021-03-11 17:15:32 +00:00
return content
2021-12-29 21:55:09 +00:00
content = replace_content_duplicates(content)
if ' ' not in content:
# handle a single very long string with no spaces
2021-12-30 20:24:05 +00:00
content_str = content.replace('<p>', '').replace(r'<\p>', '')
if '://' not in content_str:
if len(content_str) > max_word_length:
if '<p>' in content:
2021-12-30 20:24:05 +00:00
content = '<p>' + content_str[:max_word_length] + r'<\p>'
else:
2021-12-30 20:24:05 +00:00
content = content[:max_word_length]
return content
2020-04-02 09:56:17 +00:00
words = content.split(' ')
2021-12-30 20:24:05 +00:00
if not long_words_list:
long_words_list = []
for word_str in words:
if len(word_str) > max_word_length:
if word_str not in long_words_list:
long_words_list.append(word_str)
for word_str in long_words_list:
if word_str.startswith('<p>'):
word_str = word_str.replace('<p>', '')
if word_str.startswith('<'):
2019-10-18 12:24:31 +00:00
continue
2021-12-30 20:24:05 +00:00
if len(word_str) == 76:
if word_str.upper() == word_str:
2020-03-22 14:29:34 +00:00
# tox address
continue
2021-12-30 20:24:05 +00:00
if '=\"' in word_str:
2019-11-04 21:08:43 +00:00
continue
2021-12-30 20:24:05 +00:00
if '@' in word_str:
if '@@' not in word_str:
2019-11-04 21:11:09 +00:00
continue
2021-12-30 20:24:05 +00:00
if '=.ed25519' in word_str:
2020-01-25 10:49:59 +00:00
continue
2021-12-30 20:24:05 +00:00
if '.onion' in word_str:
2020-01-25 10:49:59 +00:00
continue
2021-12-30 20:24:05 +00:00
if '.i2p' in word_str:
2020-01-25 10:49:59 +00:00
continue
2021-12-30 20:24:05 +00:00
if 'https:' in word_str:
2019-10-25 18:27:32 +00:00
continue
2021-12-30 20:24:05 +00:00
elif 'http:' in word_str:
2019-11-04 20:39:14 +00:00
continue
2021-12-30 20:24:05 +00:00
elif 'i2p:' in word_str:
2020-02-17 17:18:21 +00:00
continue
2021-12-30 20:24:05 +00:00
elif 'gnunet:' in word_str:
2020-06-09 11:51:51 +00:00
continue
2021-12-30 20:24:05 +00:00
elif 'dat:' in word_str:
2019-11-04 20:39:14 +00:00
continue
2021-12-30 20:24:05 +00:00
elif 'rad:' in word_str:
2020-12-06 10:18:41 +00:00
continue
2021-12-30 20:24:05 +00:00
elif 'hyper:' in word_str:
2020-05-17 09:37:59 +00:00
continue
2021-12-30 20:24:05 +00:00
elif 'briar:' in word_str:
2020-05-17 09:37:59 +00:00
continue
2021-12-30 20:24:05 +00:00
if '<' in word_str:
replace_word = word_str.split('<', 1)[0]
# if len(replace_word) > max_word_length:
# replace_word = replace_word[:max_word_length]
content = content.replace(word_str, replace_word)
word_str = replace_word
if '/' in word_str:
2019-10-25 18:27:32 +00:00
continue
2021-12-30 20:24:05 +00:00
if len(word_str[max_word_length:]) < max_word_length:
content = content.replace(word_str,
word_str[:max_word_length] + '\n' +
word_str[max_word_length:])
2019-10-18 12:24:31 +00:00
else:
2021-12-30 20:24:05 +00:00
content = content.replace(word_str,
word_str[:max_word_length])
2020-01-24 11:27:12 +00:00
if content.startswith('<p>'):
if not content.endswith('</p>'):
2020-10-31 23:10:38 +00:00
content = content.strip() + '</p>'
2019-10-09 12:19:17 +00:00
return content
2020-04-02 09:56:17 +00:00
2021-12-29 21:55:09 +00:00
def _load_auto_tags(base_dir: str, nickname: str, domain: str) -> []:
2020-09-13 14:42:17 +00:00
"""Loads automatic tags file and returns a list containing
the lines of the file
"""
2021-12-26 12:02:29 +00:00
filename = acct_dir(base_dir, nickname, domain) + '/autotags.txt'
2020-09-13 14:42:17 +00:00
if not os.path.isfile(filename):
return []
2021-11-26 12:28:20 +00:00
try:
2021-12-30 20:24:05 +00:00
with open(filename, 'r') as tags_file:
return tags_file.readlines()
2021-11-26 12:28:20 +00:00
except OSError:
print('EX: unable to read auto tags ' + filename)
2020-09-13 14:42:17 +00:00
return []
2021-12-29 21:55:09 +00:00
def _auto_tag(base_dir: str, nickname: str, domain: str,
2021-12-30 20:24:05 +00:00
word_str: str, auto_tag_list: [],
append_tags: []):
2020-09-13 14:42:17 +00:00
"""Generates a list of tags to be automatically appended to the content
"""
2021-12-30 20:24:05 +00:00
for tag_rule in auto_tag_list:
if word_str not in tag_rule:
2020-09-13 14:42:17 +00:00
continue
2021-12-30 20:24:05 +00:00
if '->' not in tag_rule:
2020-09-13 14:42:17 +00:00
continue
2021-12-30 20:24:05 +00:00
rulematch = tag_rule.split('->')[0].strip()
if rulematch != word_str:
2020-09-13 14:42:17 +00:00
continue
2021-12-30 20:24:05 +00:00
tag_name = tag_rule.split('->')[1].strip()
if tag_name.startswith('#'):
if tag_name not in append_tags:
append_tags.append(tag_name)
2020-09-13 14:42:17 +00:00
else:
2021-12-30 20:24:05 +00:00
if '#' + tag_name not in append_tags:
append_tags.append('#' + tag_name)
2020-09-13 14:42:17 +00:00
2021-12-29 21:55:09 +00:00
def add_html_tags(base_dir: str, http_prefix: str,
nickname: str, domain: str, content: str,
recipients: [], hashtags: {},
2021-12-30 20:24:05 +00:00
is_json_content: bool = False) -> str:
2019-07-15 14:11:31 +00:00
""" Replaces plaintext mentions such as @nick@domain into html
by matching against known following accounts
"""
if content.startswith('<p>'):
2021-12-29 21:55:09 +00:00
content = html_replace_email_quote(content)
return html_replace_quote_marks(content)
2021-12-30 20:24:05 +00:00
max_word_length = 40
2020-05-22 11:32:38 +00:00
content = content.replace('\r', '')
2020-04-02 09:56:17 +00:00
content = content.replace('\n', ' --linebreak-- ')
2021-12-29 21:55:09 +00:00
content = _add_music_tag(content, 'nowplaying')
2021-12-30 20:24:05 +00:00
content_simplified = \
2020-10-16 19:49:34 +00:00
content.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
2021-12-30 20:24:05 +00:00
content_simplified = content_simplified.replace('. ', ' ').strip()
if content_simplified.endswith('.'):
content_simplified = content_simplified[:len(content_simplified)-1]
words = content_simplified.split(' ')
2020-03-22 21:16:02 +00:00
# remove . for words which are not mentions
2021-12-30 20:24:05 +00:00
new_words = []
for word_index in range(0, len(words)):
word_str = words[word_index]
if word_str.endswith('.'):
if not word_str.startswith('@'):
word_str = word_str[:-1]
if word_str.startswith('.'):
word_str = word_str[1:]
new_words.append(word_str)
words = new_words
replace_mentions = {}
replace_hashtags = {}
replace_emoji = {}
emoji_dict = {}
original_domain = domain
2021-12-26 18:17:37 +00:00
domain = remove_domain_port(domain)
2021-12-30 20:24:05 +00:00
following_filename = \
acct_dir(base_dir, nickname, domain) + '/following.txt'
2019-08-09 09:09:21 +00:00
# read the following list so that we can detect just @nick
# in addition to @nick@domain
2020-04-02 09:56:17 +00:00
following = None
2021-01-29 21:33:23 +00:00
petnames = None
2019-10-18 12:24:31 +00:00
if '@' in words:
2021-12-30 20:24:05 +00:00
if os.path.isfile(following_filename):
2021-11-26 12:28:20 +00:00
following = []
try:
2021-12-30 20:24:05 +00:00
with open(following_filename, 'r') as foll_file:
following = foll_file.readlines()
2021-11-26 12:28:20 +00:00
except OSError:
2021-12-30 20:24:05 +00:00
print('EX: unable to read ' + following_filename)
2021-11-26 12:28:20 +00:00
for handle in following:
2021-12-29 21:55:09 +00:00
pet = get_pet_name(base_dir, nickname, domain, handle)
2021-11-26 12:28:20 +00:00
if pet:
petnames.append(pet + '\n')
2019-08-09 09:09:21 +00:00
# extract mentions and tags from words
2021-12-30 20:24:05 +00:00
long_words_list = []
prev_word_str = ''
auto_tags_list = _load_auto_tags(base_dir, nickname, domain)
append_tags = []
for word_str in words:
word_len = len(word_str)
if word_len > 2:
if word_len > max_word_length:
long_words_list.append(word_str)
first_char = word_str[0]
if first_char == '@':
if _add_mention(word_str, http_prefix, following, petnames,
replace_mentions, recipients, hashtags):
prev_word_str = ''
2019-10-18 12:24:31 +00:00
continue
2021-12-30 20:24:05 +00:00
elif first_char == '#':
2021-02-13 12:12:06 +00:00
# remove any endings from the hashtag
2021-12-30 20:24:05 +00:00
hash_tag_endings = ('.', ':', ';', '-', '\n')
for ending in hash_tag_endings:
if word_str.endswith(ending):
word_str = word_str[:len(word_str) - 1]
2021-02-13 12:26:25 +00:00
break
2021-02-13 12:12:06 +00:00
2021-12-30 20:24:05 +00:00
if _add_hash_tags(word_str, http_prefix, original_domain,
replace_hashtags, hashtags):
prev_word_str = ''
2019-10-18 12:24:31 +00:00
continue
2021-12-30 20:24:05 +00:00
elif ':' in word_str:
word_str2 = word_str.split(':')[1]
# print('TAG: emoji located - ' + word_str)
if not emoji_dict:
2020-03-29 09:59:54 +00:00
# emoji.json is generated so that it can be customized and
# the changes will be retained even if default_emoji.json
# is subsequently updated
2021-12-25 16:17:53 +00:00
if not os.path.isfile(base_dir + '/emoji/emoji.json'):
copyfile(base_dir + '/emoji/default_emoji.json',
base_dir + '/emoji/emoji.json')
2021-12-30 20:24:05 +00:00
emoji_dict = load_json(base_dir + '/emoji/emoji.json')
2020-04-02 09:56:17 +00:00
2021-11-01 22:45:57 +00:00
# append custom emoji to the dict
2021-12-25 16:17:53 +00:00
if os.path.isfile(base_dir + '/emojicustom/emoji.json'):
2021-12-30 20:24:05 +00:00
custom_emoji_dict = \
2021-12-26 15:13:34 +00:00
load_json(base_dir + '/emojicustom/emoji.json')
2021-12-30 20:24:05 +00:00
if custom_emoji_dict:
2022-01-30 18:07:20 +00:00
emojis_combined = True
2022-01-30 18:03:55 +00:00
try:
emoji_dict = dict(emoji_dict, **custom_emoji_dict)
except BaseException:
2022-01-30 18:07:20 +00:00
emojis_combined = False
if not emojis_combined:
# combine emoji dicts one by one
2022-01-30 18:08:37 +00:00
for ename, eitem in custom_emoji_dict.items():
2022-01-30 18:13:02 +00:00
if ename and eitem:
if not emoji_dict.get(ename):
emoji_dict[ename] = eitem
2021-11-01 22:45:57 +00:00
2021-12-30 20:24:05 +00:00
# print('TAG: looking up emoji for :' + word_str2 + ':')
_add_emoji(base_dir, ':' + word_str2 + ':', http_prefix,
original_domain, replace_emoji, hashtags,
emoji_dict)
2020-09-13 14:42:17 +00:00
else:
2021-12-30 20:24:05 +00:00
if _auto_tag(base_dir, nickname, domain, word_str,
auto_tags_list, append_tags):
prev_word_str = ''
2020-09-13 14:42:17 +00:00
continue
2021-12-30 20:24:05 +00:00
if prev_word_str:
2021-12-29 21:55:09 +00:00
if _auto_tag(base_dir, nickname, domain,
2021-12-30 20:24:05 +00:00
prev_word_str + ' ' + word_str,
auto_tags_list, append_tags):
prev_word_str = ''
2020-09-13 14:42:17 +00:00
continue
2021-12-30 20:24:05 +00:00
prev_word_str = word_str
2020-09-13 14:42:17 +00:00
# add any auto generated tags
2021-12-30 20:24:05 +00:00
for appended in append_tags:
2020-09-13 14:42:17 +00:00
content = content + ' ' + appended
2021-12-30 20:24:05 +00:00
_add_hash_tags(appended, http_prefix, original_domain,
replace_hashtags, hashtags)
2019-08-09 09:09:21 +00:00
# replace words with their html versions
2021-12-30 20:24:05 +00:00
for word_str, replace_str in replace_mentions.items():
content = content.replace(word_str, replace_str)
for word_str, replace_str in replace_hashtags.items():
content = content.replace(word_str, replace_str)
if not is_json_content:
for word_str, replace_str in replace_emoji.items():
content = content.replace(word_str, replace_str)
2021-12-29 21:55:09 +00:00
content = add_web_links(content)
2021-12-30 20:24:05 +00:00
if long_words_list:
content = remove_long_words(content, max_word_length, long_words_list)
2021-12-29 21:55:09 +00:00
content = limit_repeated_words(content, 6)
2020-04-02 09:56:17 +00:00
content = content.replace(' --linebreak-- ', '</p><p>')
2021-12-29 21:55:09 +00:00
content = html_replace_email_quote(content)
return '<p>' + html_replace_quote_marks(content) + '</p>'
2020-03-22 21:16:02 +00:00
2020-04-02 09:56:17 +00:00
2021-12-30 20:24:05 +00:00
def get_mentions_from_html(html_text: str, match_str: str) -> []:
2019-08-05 19:13:15 +00:00
"""Extracts mentioned actors from the given html content string
"""
2020-04-02 09:56:17 +00:00
mentions = []
2021-12-30 20:24:05 +00:00
if match_str not in html_text:
2019-08-05 19:13:15 +00:00
return mentions
2021-12-30 20:24:05 +00:00
mentions_list = html_text.split(match_str)
for mention_str in mentions_list:
if '"' not in mention_str:
2019-08-05 19:13:15 +00:00
continue
2021-12-30 20:24:05 +00:00
actor_str = mention_str.split('"')[0]
if actor_str.startswith('http') or \
actor_str.startswith('gnunet') or \
actor_str.startswith('i2p') or \
actor_str.startswith('hyper') or \
actor_str.startswith('dat:'):
if actor_str not in mentions:
mentions.append(actor_str)
2019-08-05 19:13:15 +00:00
return mentions
2019-11-10 11:37:24 +00:00
2020-04-02 09:56:17 +00:00
2021-12-30 20:24:05 +00:00
def extract_media_in_form_post(post_bytes, boundary, name: str):
2020-03-29 09:59:54 +00:00
"""Extracts the binary encoding for image/video/audio within a http
form POST
2019-11-10 11:37:24 +00:00
Returns the media bytes and the remaining bytes
"""
2021-12-30 20:24:05 +00:00
image_start_boundary = b'Content-Disposition: form-data; name="' + \
2020-04-02 09:56:17 +00:00
name.encode('utf8', 'ignore') + b'";'
2021-12-30 20:24:05 +00:00
image_start_location = post_bytes.find(image_start_boundary)
if image_start_location == -1:
return None, post_bytes
2019-11-10 11:37:24 +00:00
# bytes after the start boundary appears
2021-12-30 20:24:05 +00:00
media_bytes = post_bytes[image_start_location:]
2019-11-10 11:37:24 +00:00
# look for the next boundary
2021-12-30 20:24:05 +00:00
image_end_boundary = boundary.encode('utf8', 'ignore')
image_end_location = media_bytes.find(image_end_boundary)
if image_end_location == -1:
2019-11-10 11:37:24 +00:00
# no ending boundary
2021-12-30 20:24:05 +00:00
return media_bytes, post_bytes[:image_start_location]
2019-11-10 11:37:24 +00:00
# remaining bytes after the end of the image
2021-12-30 20:24:05 +00:00
remainder = media_bytes[image_end_location:]
2019-11-10 11:37:24 +00:00
# remove bytes after the end boundary
2021-12-30 20:24:05 +00:00
media_bytes = media_bytes[:image_end_location]
2019-11-10 11:37:24 +00:00
# return the media and the before+after bytes
2021-12-30 20:24:05 +00:00
return media_bytes, post_bytes[:image_start_location] + remainder
2019-11-10 11:37:24 +00:00
2020-04-02 09:56:17 +00:00
2021-12-30 20:24:05 +00:00
def save_media_in_form_post(media_bytes, debug: bool,
filename_base: str = None) -> (str, str):
2019-11-10 11:37:24 +00:00
"""Saves the given media bytes extracted from http form POST
Returns the filename and attachment type
"""
2021-12-30 20:24:05 +00:00
if not media_bytes:
if filename_base:
# remove any existing files
2021-12-30 20:24:05 +00:00
extension_types = get_image_extensions()
for ex in extension_types:
possible_other_format = filename_base + '.' + ex
if os.path.isfile(possible_other_format):
try:
2021-12-30 20:24:05 +00:00
os.remove(possible_other_format)
2021-11-25 18:42:38 +00:00
except OSError:
2021-10-29 16:31:20 +00:00
if debug:
2021-12-29 21:55:09 +00:00
print('EX: save_media_in_form_post ' +
2021-10-29 16:31:20 +00:00
'unable to delete other ' +
2021-12-30 20:24:05 +00:00
str(possible_other_format))
if os.path.isfile(filename_base):
try:
2021-12-30 20:24:05 +00:00
os.remove(filename_base)
2021-11-25 18:42:38 +00:00
except OSError:
2021-10-29 16:31:20 +00:00
if debug:
2021-12-29 21:55:09 +00:00
print('EX: save_media_in_form_post ' +
2021-10-29 16:31:20 +00:00
'unable to delete ' +
2021-12-30 20:24:05 +00:00
str(filename_base))
2019-11-10 11:37:24 +00:00
if debug:
print('DEBUG: No media found within POST')
2020-04-02 09:56:17 +00:00
return None, None
2019-11-10 11:37:24 +00:00
2021-12-30 20:24:05 +00:00
media_location = -1
search_str = ''
2020-04-02 09:56:17 +00:00
filename = None
2020-03-22 21:16:02 +00:00
2019-11-10 11:37:24 +00:00
# directly search the binary array for the beginning
# of an image
2021-12-30 20:24:05 +00:00
extension_list = {
2019-11-10 11:37:24 +00:00
'png': 'image/png',
'jpeg': 'image/jpeg',
2022-02-06 11:04:49 +00:00
'jxl': 'image/jxl',
2019-11-10 11:37:24 +00:00
'gif': 'image/gif',
2021-01-11 22:27:57 +00:00
'svg': 'image/svg+xml',
2019-11-14 13:30:54 +00:00
'webp': 'image/webp',
'avif': 'image/avif',
2019-11-10 11:37:24 +00:00
'mp4': 'video/mp4',
'ogv': 'video/ogv',
'mp3': 'audio/mpeg',
2021-05-29 11:04:03 +00:00
'ogg': 'audio/ogg',
2021-08-03 09:09:04 +00:00
'flac': 'audio/flac',
2021-05-29 11:04:03 +00:00
'zip': 'application/zip'
2019-11-10 11:37:24 +00:00
}
2021-12-30 20:24:05 +00:00
detected_extension = None
for extension, content_type in extension_list.items():
search_str = b'Content-Type: ' + content_type.encode('utf8', 'ignore')
media_location = media_bytes.find(search_str)
if media_location > -1:
2020-05-26 19:05:03 +00:00
# image/video/audio binaries
2020-04-02 09:56:17 +00:00
if extension == 'jpeg':
extension = 'jpg'
elif extension == 'mpeg':
extension = 'mp3'
2021-12-30 20:24:05 +00:00
if filename_base:
filename = filename_base + '.' + extension
attachment_media_type = \
search_str.decode().split('/')[0].replace('Content-Type: ', '')
detected_extension = extension
2019-11-10 11:37:24 +00:00
break
if not filename:
2020-04-02 09:56:17 +00:00
return None, None
2019-11-10 11:37:24 +00:00
2020-05-26 19:29:15 +00:00
# locate the beginning of the image, after any
# carriage returns
2021-12-30 20:24:05 +00:00
start_pos = media_location + len(search_str)
2020-05-26 19:29:15 +00:00
for offset in range(1, 8):
2021-12-30 20:24:05 +00:00
if media_bytes[start_pos+offset] != 10:
if media_bytes[start_pos+offset] != 13:
start_pos += offset
2020-05-26 19:29:15 +00:00
break
2019-11-10 11:37:24 +00:00
2019-11-14 13:30:54 +00:00
# remove any existing image files with a different format
2021-12-30 20:24:05 +00:00
if detected_extension != 'zip':
extension_types = get_image_extensions()
for ex in extension_types:
if ex == detected_extension:
2021-05-29 11:04:03 +00:00
continue
2021-12-30 20:24:05 +00:00
possible_other_format = \
2021-05-29 11:04:03 +00:00
filename.replace('.temp', '').replace('.' +
2021-12-30 20:24:05 +00:00
detected_extension, '.' +
2021-05-29 11:04:03 +00:00
ex)
2021-12-30 20:24:05 +00:00
if os.path.isfile(possible_other_format):
try:
2021-12-30 20:24:05 +00:00
os.remove(possible_other_format)
2021-11-25 18:42:38 +00:00
except OSError:
2021-10-29 16:31:20 +00:00
if debug:
2021-12-29 21:55:09 +00:00
print('EX: save_media_in_form_post ' +
2021-10-29 16:31:20 +00:00
'unable to delete other 2 ' +
2021-12-30 20:24:05 +00:00
str(possible_other_format))
2019-11-14 13:30:54 +00:00
# don't allow scripts within svg files
2021-12-30 20:24:05 +00:00
if detected_extension == 'svg':
svg_str = media_bytes[start_pos:]
svg_str = svg_str.decode()
if dangerous_svg(svg_str, False):
return None, None
2021-11-25 18:42:38 +00:00
try:
2021-12-30 20:24:05 +00:00
with open(filename, 'wb') as fp_media:
fp_media.write(media_bytes[start_pos:])
2021-11-25 18:42:38 +00:00
except OSError:
2021-11-25 22:22:54 +00:00
print('EX: unable to write media')
2019-12-04 18:52:27 +00:00
2021-03-06 23:16:54 +00:00
if not os.path.isfile(filename):
print('WARN: Media file could not be written to file: ' + filename)
return None, None
2021-03-06 23:19:03 +00:00
print('Uploaded media file written: ' + filename)
2021-03-06 23:16:54 +00:00
2021-12-30 20:24:05 +00:00
return filename, attachment_media_type
2020-04-02 09:56:17 +00:00
2019-11-10 11:37:24 +00:00
2021-12-30 20:24:05 +00:00
def extract_text_fields_in_post(post_bytes, boundary: str, debug: bool,
2021-12-29 21:55:09 +00:00
unit_testData: str = None) -> {}:
2019-11-10 11:37:24 +00:00
"""Returns a dictionary containing the text fields of a http form POST
The boundary argument comes from the http header
2020-03-22 21:16:02 +00:00
"""
2021-12-25 21:32:15 +00:00
if not unit_testData:
2021-12-30 20:24:05 +00:00
msg_bytes = email.parser.BytesParser().parsebytes(post_bytes)
message_fields = msg_bytes.get_payload(decode=True).decode('utf-8')
else:
2021-12-30 20:24:05 +00:00
message_fields = unit_testData
2019-11-10 11:54:45 +00:00
if debug:
2021-12-30 20:24:05 +00:00
print('DEBUG: POST arriving ' + message_fields)
2021-12-30 20:24:05 +00:00
message_fields = message_fields.split(boundary)
2020-04-02 09:56:17 +00:00
fields = {}
2021-12-30 20:24:05 +00:00
fields_with_semicolon_allowed = (
2021-03-01 12:19:49 +00:00
'message', 'bio', 'autoCW', 'password', 'passwordconfirm',
'instanceDescription', 'instanceDescriptionShort',
'subject', 'location', 'imageDescription'
2021-03-01 12:15:06 +00:00
)
2019-11-10 11:37:24 +00:00
# examine each section of the POST, separated by the boundary
2021-12-30 20:24:05 +00:00
for fld in message_fields:
if fld == '--':
2019-11-10 11:37:24 +00:00
continue
2021-12-30 20:24:05 +00:00
if ' name="' not in fld:
2020-03-22 21:16:02 +00:00
continue
2021-12-30 20:24:05 +00:00
post_str = fld.split(' name="', 1)[1]
if '"' not in post_str:
2019-11-10 11:37:24 +00:00
continue
2021-12-30 20:24:05 +00:00
post_key = post_str.split('"', 1)[0]
post_value_str = post_str.split('"', 1)[1]
if ';' in post_value_str:
if post_key not in fields_with_semicolon_allowed and \
not post_key.startswith('edited'):
continue
2021-12-30 20:24:05 +00:00
if '\r\n' not in post_value_str:
2019-11-10 11:37:24 +00:00
continue
2021-12-30 20:24:05 +00:00
post_lines = post_value_str.split('\r\n')
post_value = ''
if len(post_lines) > 2:
for line in range(2, len(post_lines)-1):
2020-04-02 09:56:17 +00:00
if line > 2:
2021-12-30 20:24:05 +00:00
post_value += '\n'
post_value += post_lines[line]
fields[post_key] = urllib.parse.unquote(post_value)
2019-11-10 11:37:24 +00:00
return fields
2021-12-30 20:24:05 +00:00
def limit_repeated_words(text: str, max_repeats: int) -> str:
"""Removes words which are repeated many times
"""
words = text.replace('\n', ' ').split(' ')
2021-12-30 20:24:05 +00:00
repeat_ctr = 0
repeated_text = ''
replacements = {}
2021-12-30 20:24:05 +00:00
prev_word = ''
for word in words:
2021-12-30 20:24:05 +00:00
if word == prev_word:
repeat_ctr += 1
if repeated_text:
repeated_text += ' ' + word
else:
2021-12-30 20:24:05 +00:00
repeated_text = word + ' ' + word
else:
2021-12-30 20:24:05 +00:00
if repeat_ctr > max_repeats:
new_text = ((prev_word + ' ') * max_repeats).strip()
replacements[prev_word] = [repeated_text, new_text]
repeat_ctr = 0
repeated_text = ''
prev_word = word
2021-12-30 20:24:05 +00:00
if repeat_ctr > max_repeats:
new_text = ((prev_word + ' ') * max_repeats).strip()
replacements[prev_word] = [repeated_text, new_text]
for word, item in replacements.items():
text = text.replace(item[0], item[1])
return text
2021-08-07 17:03:41 +00:00
2021-12-29 21:55:09 +00:00
def get_price_from_string(priceStr: str) -> (str, str):
2021-08-07 17:03:41 +00:00
"""Returns the item price and currency
"""
2021-12-26 17:29:09 +00:00
currencies = get_currencies()
2021-08-07 17:03:41 +00:00
for symbol, name in currencies.items():
if symbol in priceStr:
price = priceStr.replace(symbol, '')
2021-12-26 18:03:39 +00:00
if is_float(price):
2021-08-07 17:03:41 +00:00
return price, name
elif name in priceStr:
price = priceStr.replace(name, '')
2021-12-26 18:03:39 +00:00
if is_float(price):
2021-08-07 17:03:41 +00:00
return price, name
2021-12-26 18:03:39 +00:00
if is_float(priceStr):
2021-08-07 17:03:41 +00:00
return priceStr, "EUR"
return "0.00", "EUR"
2021-10-14 15:12:35 +00:00
2021-12-29 21:55:09 +00:00
def _words_similarity_histogram(words: []) -> {}:
2021-10-14 15:40:19 +00:00
"""Returns a histogram for word combinations
"""
histogram = {}
for index in range(1, len(words)):
2021-12-30 20:24:05 +00:00
combined_words = words[index - 1] + words[index]
if histogram.get(combined_words):
histogram[combined_words] += 1
2021-10-14 15:40:19 +00:00
else:
2021-12-30 20:24:05 +00:00
histogram[combined_words] = 1
2021-10-14 15:40:19 +00:00
return histogram
2021-12-29 21:55:09 +00:00
def _words_similarity_words_list(content: str) -> []:
2021-10-14 15:53:04 +00:00
"""Returns a list of words for the given content
"""
2021-12-30 20:24:05 +00:00
remove_punctuation = ('.', ',', ';', '-', ':', '"')
2021-12-27 15:43:22 +00:00
content = remove_html(content).lower()
2021-12-30 20:24:05 +00:00
for punc in remove_punctuation:
content = content.replace(punc, ' ')
2021-10-14 15:53:04 +00:00
content = content.replace(' ', ' ')
return content.split(' ')
2021-12-30 20:24:05 +00:00
def words_similarity(content1: str, content2: str, min_words: int) -> int:
2021-10-14 15:12:35 +00:00
"""Returns percentage similarity
"""
if content1 == content2:
return 100
2021-10-14 15:40:19 +00:00
2021-12-29 21:55:09 +00:00
words1 = _words_similarity_words_list(content1)
2021-12-30 20:24:05 +00:00
if len(words1) < min_words:
2021-10-14 15:12:35 +00:00
return 0
2021-10-14 15:40:19 +00:00
2021-12-29 21:55:09 +00:00
words2 = _words_similarity_words_list(content2)
2021-12-30 20:24:05 +00:00
if len(words2) < min_words:
2021-10-14 15:12:35 +00:00
return 0
2021-12-29 21:55:09 +00:00
histogram1 = _words_similarity_histogram(words1)
histogram2 = _words_similarity_histogram(words2)
2021-10-14 15:12:35 +00:00
diff = 0
2021-12-30 20:24:05 +00:00
for combined_words, _ in histogram1.items():
if not histogram2.get(combined_words):
2021-10-14 15:12:35 +00:00
diff += 1
else:
2021-12-30 20:24:05 +00:00
diff += \
abs(histogram2[combined_words] - histogram1[combined_words])
2021-10-14 15:12:35 +00:00
return 100 - int(diff * 100 / len(histogram1.items()))
2021-12-29 21:55:09 +00:00
def contains_invalid_local_links(content: str) -> bool:
"""Returns true if the given content has invalid links
"""
2021-12-30 20:24:05 +00:00
for inv_str in INVALID_CONTENT_STRINGS:
if '?' + inv_str + '=' in content:
return True
return False
2022-03-24 13:14:41 +00:00
def bold_reading_string(text: str) -> str:
"""Returns bold reading formatted text
"""
2022-03-24 15:15:53 +00:00
text = html.unescape(text)
2022-03-24 13:14:41 +00:00
add_paragraph_markup = False
if '<p>' in text:
text = text.replace('</p>', '\n').replace('<p>', '')
add_paragraph_markup = True
paragraphs = text.split('\n')
parag_ctr = 0
2022-03-24 14:40:28 +00:00
new_text = ''
2022-03-24 13:14:41 +00:00
for parag in paragraphs:
words = parag.split(' ')
new_parag = ''
2022-03-24 14:08:07 +00:00
reading_markup = False
2022-03-24 13:14:41 +00:00
for wrd in words:
if '<' in wrd:
2022-03-24 14:08:07 +00:00
reading_markup = True
if reading_markup and '>' in wrd:
2022-03-24 14:08:07 +00:00
reading_markup = False
wrd_len = len(wrd)
if not reading_markup and wrd_len > 1 and \
2022-03-24 13:38:10 +00:00
'<' not in wrd and '>' not in wrd and \
'&' not in wrd and '=' not in wrd and \
not wrd.startswith(':'):
2022-03-24 13:45:55 +00:00
prefix = ''
postfix = ''
if wrd.startswith('"'):
prefix = '"'
wrd = wrd[1:]
if wrd.endswith('"'):
postfix = '"'
wrd = wrd[:wrd_len - 1]
2022-03-24 13:45:55 +00:00
initial_chars = int(math.ceil(wrd_len / 2.0))
2022-03-24 13:14:41 +00:00
new_parag += \
2022-03-24 13:45:55 +00:00
prefix + '<b>' + wrd[:initial_chars] + '</b>' + \
wrd[initial_chars:] + postfix + ' '
2022-03-24 13:14:41 +00:00
else:
new_parag += wrd + ' '
parag_ctr += 1
new_parag = new_parag.strip()
if not new_parag:
continue
2022-03-24 13:14:41 +00:00
if parag_ctr < len(paragraphs):
if not add_paragraph_markup:
new_text += new_parag + '\n'
else:
new_text += '<p>' + new_parag + '</p>'
else:
if not add_paragraph_markup:
new_text += new_parag
else:
new_text += '<p>' + new_parag + '</p>'
return new_text