2020-04-02 09:56:17 +00:00
|
|
|
|
__filename__ = "content.py"
|
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
|
__license__ = "AGPL3+"
|
2022-02-03 13:58:20 +00:00
|
|
|
|
__version__ = "1.3.0"
|
2020-04-02 09:56:17 +00:00
|
|
|
|
__maintainer__ = "Bob Mottram"
|
2021-09-10 16:14:50 +00:00
|
|
|
|
__email__ = "bob@libreserver.org"
|
2020-04-02 09:56:17 +00:00
|
|
|
|
__status__ = "Production"
|
2021-06-25 16:10:09 +00:00
|
|
|
|
__module_group__ = "Core"
|
2019-07-15 14:11:31 +00:00
|
|
|
|
|
2022-04-10 19:19:40 +00:00
|
|
|
|
import difflib
|
2022-03-24 16:16:36 +00:00
|
|
|
|
import math
|
2022-03-24 15:15:53 +00:00
|
|
|
|
import html
|
2019-07-15 14:11:31 +00:00
|
|
|
|
import os
|
2019-11-10 11:37:24 +00:00
|
|
|
|
import email.parser
|
2020-12-03 14:59:07 +00:00
|
|
|
|
import urllib.parse
|
2019-08-11 16:55:22 +00:00
|
|
|
|
from shutil import copyfile
|
2022-04-10 22:50:44 +00:00
|
|
|
|
from dateutil.parser import parse
|
2022-09-03 17:09:00 +00:00
|
|
|
|
from utils import get_user_paths
|
2022-04-10 22:50:44 +00:00
|
|
|
|
from utils import convert_published_to_local_timezone
|
|
|
|
|
from utils import has_object_dict
|
2022-01-13 15:10:41 +00:00
|
|
|
|
from utils import valid_hash_tag
|
2021-12-27 21:44:48 +00:00
|
|
|
|
from utils import dangerous_svg
|
2021-12-26 18:17:37 +00:00
|
|
|
|
from utils import remove_domain_port
|
2021-12-26 14:26:16 +00:00
|
|
|
|
from utils import get_image_extensions
|
2021-12-26 15:13:34 +00:00
|
|
|
|
from utils import load_json
|
2021-12-26 14:47:21 +00:00
|
|
|
|
from utils import save_json
|
2021-12-28 14:01:37 +00:00
|
|
|
|
from utils import file_last_modified
|
2021-12-27 17:32:34 +00:00
|
|
|
|
from utils import get_link_prefixes
|
2021-12-27 21:42:08 +00:00
|
|
|
|
from utils import dangerous_markup
|
2021-12-26 19:15:36 +00:00
|
|
|
|
from utils import is_pgp_encrypted
|
|
|
|
|
from utils import contains_pgp_public_key
|
2021-12-26 12:02:29 +00:00
|
|
|
|
from utils import acct_dir
|
2021-12-26 18:03:39 +00:00
|
|
|
|
from utils import is_float
|
2021-12-26 17:29:09 +00:00
|
|
|
|
from utils import get_currencies
|
2021-12-27 15:43:22 +00:00
|
|
|
|
from utils import remove_html
|
2022-06-21 11:58:50 +00:00
|
|
|
|
from utils import remove_eol
|
2021-12-29 21:55:09 +00:00
|
|
|
|
from petnames import get_pet_name
|
|
|
|
|
from session import download_image
|
2019-07-15 14:11:31 +00:00
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
MUSIC_SITES = ('soundcloud.com', 'bandcamp.com')
|
|
|
|
|
|
|
|
|
|
MAX_LINK_LENGTH = 40
|
|
|
|
|
|
|
|
|
|
REMOVE_MARKUP = (
|
|
|
|
|
'b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
|
|
|
|
|
'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5'
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
INVALID_CONTENT_STRINGS = (
|
|
|
|
|
'mute', 'unmute', 'editeventpost', 'notifypost',
|
|
|
|
|
'delete', 'options', 'page', 'repeat',
|
|
|
|
|
'bm', 'tl', 'actor', 'unrepeat', 'eventid',
|
|
|
|
|
'unannounce', 'like', 'unlike', 'bookmark',
|
|
|
|
|
'unbookmark', 'likedBy', 'time',
|
|
|
|
|
'year', 'month', 'day', 'editnewpost',
|
|
|
|
|
'graph', 'showshare', 'category', 'showwanted',
|
|
|
|
|
'rmshare', 'rmwanted', 'repeatprivate',
|
|
|
|
|
'unrepeatprivate', 'replyto',
|
2022-02-08 10:52:03 +00:00
|
|
|
|
'replyfollowers', 'replydm', 'replychat', 'editblogpost',
|
2021-12-30 20:24:05 +00:00
|
|
|
|
'handle', 'blockdomain'
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
2022-05-17 11:40:05 +00:00
|
|
|
|
def valid_url_lengths(content: str, max_url_length: int) -> bool:
    """Returns true if none of the quoted urls within the given content
    are longer than max_url_length, and false if any of them is too long

    A url is taken to be the text between '://' and the next double
    quote, so its length does not include the protocol prefix.
    Candidates containing '<' or '>' are ignored, as is any text after
    '://' which is not followed by a double quote
    """
    if '://' not in content:
        return True
    # the first section comes before any '://' and so contains no url
    for text in content.split('://')[1:]:
        if '"' not in text:
            continue
        url = text.split('"')[0]
        if '<' in url or '>' in url:
            # not a plain url, probably markup
            continue
        if len(url) > max_url_length:
            return False
    return True
|
|
|
|
|
|
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def remove_html_tag(html_str: str, tag: str) -> str:
    """Removes every occurrence of ' tag="value"' from a html string

    What gets removed is the leading space, the name, the equals sign
    and the double quoted value. If the closing double quote is missing
    then that occurrence and anything after it is left unaltered
    """
    marker = ' ' + tag + '="'
    while True:
        before, marker_found, after = html_str.partition(marker)
        if not marker_found:
            return html_str
        _, closing_quote, remainder = after.partition('"')
        if not closing_quote:
            return html_str
        html_str = before + remainder
|
2020-10-11 09:33:31 +00:00
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def _remove_quotes_within_quotes(content: str) -> str:
|
2020-09-30 22:52:39 +00:00
|
|
|
|
"""Removes any blockquote inside blockquote
|
|
|
|
|
"""
|
|
|
|
|
if '<blockquote>' not in content:
|
|
|
|
|
return content
|
|
|
|
|
if '</blockquote>' not in content:
|
|
|
|
|
return content
|
|
|
|
|
ctr = 1
|
|
|
|
|
found = True
|
|
|
|
|
while found:
|
|
|
|
|
prefix = content.split('<blockquote>', ctr)[0] + '<blockquote>'
|
2021-12-30 20:24:05 +00:00
|
|
|
|
quoted_str = content.split('<blockquote>', ctr)[1]
|
|
|
|
|
if '</blockquote>' not in quoted_str:
|
2020-09-30 22:52:39 +00:00
|
|
|
|
found = False
|
|
|
|
|
else:
|
2021-12-30 20:24:05 +00:00
|
|
|
|
end_str = quoted_str.split('</blockquote>')[1]
|
|
|
|
|
quoted_str = quoted_str.split('</blockquote>')[0]
|
|
|
|
|
if '<blockquote>' not in end_str:
|
2020-09-30 22:52:39 +00:00
|
|
|
|
found = False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if '<blockquote>' in quoted_str:
|
|
|
|
|
quoted_str = quoted_str.replace('<blockquote>', '')
|
|
|
|
|
content = prefix + quoted_str + '</blockquote>' + end_str
|
2020-09-30 22:52:39 +00:00
|
|
|
|
ctr += 1
|
|
|
|
|
return content
|
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def html_replace_email_quote(content: str) -> str:
    """Replaces an email style quote "> Some quote" with html blockquote

    Within html content the quote marker arrives escaped as '&gt;', so
    it is the escaped form which is matched. Matching a bare '>' would
    also match the end of every html tag, including the blockquote tags
    which this function inserts.
    A paragraph which is entirely enclosed within quote marks is also
    turned into a blockquote.
    PGP encrypted content and PGP public keys are returned unaltered
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content

    # replace quote paragraph, with the quote marks being either
    # literal or escaped
    for quote_mark in ('"', '&quot;'):
        para_start = '<p>' + quote_mark
        para_end = quote_mark + '</p>'
        if para_start not in content:
            continue
        # only when every opening has a matching closing
        if content.count(para_start) == content.count(para_end):
            content = content.replace(para_start, '<p><blockquote>')
            content = content.replace(para_end, '</blockquote></p>')

    # typographic quote marks enclosing the text of an element
    if '>\u201c' in content:
        if content.count('>\u201c') == content.count('\u201d<'):
            content = content.replace('>\u201c', '><blockquote>')
            content = content.replace('\u201d<', '</blockquote><')

    # replace email style quote
    if '>&gt; ' not in content:
        return content
    paragraphs = []
    for line_str in content.replace('<p>', '').split('</p>'):
        if not line_str:
            continue
        if '>&gt; ' in line_str:
            # a quote marker directly following some markup
            line_str = line_str.replace('>&gt; ', '><blockquote>')
            if line_str.startswith('&gt;'):
                line_str = line_str.replace('&gt;', '<blockquote>', 1)
            else:
                line_str = line_str.replace('&gt;', '<br>')
            paragraphs.append('<p>' + line_str + '</blockquote></p>')
        elif line_str.startswith('&gt; '):
            # the paragraph begins with a quote marker
            line_str = line_str.replace('&gt; ', '<blockquote>')
            line_str = line_str.replace('&gt;', '<br>')
            paragraphs.append('<p>' + line_str + '</blockquote></p>')
        else:
            # not a quote
            paragraphs.append('<p>' + line_str + '</p>')
    return _remove_quotes_within_quotes(''.join(paragraphs))
|
2020-09-14 09:33:42 +00:00
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def html_replace_quote_marks(content: str) -> str:
    """Replaces straight double quotes with typographic quote marks

    "hello" becomes \u201chello\u201d

    Both literal double quotes and the escaped form '&quot;' are
    replaced, alternating between opening and closing marks.
    Literal double quotes inside of html tags are left alone, so that
    attribute values remain valid.
    Content containing more than four quote marks of either form, PGP
    encrypted content and PGP public keys are returned unaltered
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    if '"' not in content and '&quot;' not in content:
        return content

    # only if there are a few quote marks
    if content.count('"') > 4 or content.count('&quot;') > 4:
        return content

    new_content = content
    if '"' in content:
        converted = []
        open_quote = True
        markup = False
        for char in content:
            curr_char = char
            if char == '<':
                markup = True
            elif char == '>':
                markup = False
            elif char == '"' and not markup:
                curr_char = '“' if open_quote else '”'
                open_quote = not open_quote
            converted.append(curr_char)
        new_content = ''.join(converted)

    # escaped quote marks never delimit attribute values, so every
    # one of them can be replaced
    if '&quot;' in new_content:
        sections = new_content.split('&quot;')
        converted = [sections[0]]
        open_quote = True
        for sec in sections[1:]:
            converted.append('“' if open_quote else '”')
            open_quote = not open_quote
            converted.append(sec)
        new_content = ''.join(converted)
    return new_content
|
2020-08-02 17:01:12 +00:00
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def dangerous_css(filename: str, allow_local_network_access: bool) -> bool:
    """Returns true if the css file contains code which
    can create security problems

    The file content is lower cased and then checked for a list of
    suspicious strings, for url() values which refer to http, ipfs or
    ipns, and finally for dangerous markup.
    A file which does not exist, can't be read or is empty is
    reported as not being dangerous
    """
    if not os.path.isfile(filename):
        return False

    content = None
    try:
        with open(filename, 'r', encoding='utf-8') as css_file:
            content = css_file.read().lower()
    except OSError:
        print('EX: unable to read css file ' + filename)

    if not content:
        return False

    css_matches = (
        'behavior:', ':expression', '?php', '.php',
        'google', 'regexp', 'localhost',
        '127.0.', '192.168', '10.0.', '@import'
    )
    if any(cssmatch in content for cssmatch in css_matches):
        return True

    # search for non-local web links
    remote_link_types = ('http', 'ipfs', 'ipns')
    # the first section comes before any url( and so is skipped
    for url_str in content.split('url(')[1:]:
        # A url( without any closing bracket is also checked, because
        # at the end of a stylesheet it still yields a url
        url_str = url_str.split(')')[0]
        if any(link_type in url_str for link_type in remote_link_types):
            print('ERROR: non-local web link in CSS ' +
                  filename)
            return True

    # an attacker can include html inside of the css
    # file as a comment and this may then be run from the html
    if dangerous_markup(content, allow_local_network_access):
        return True
    return False
|
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def switch_words(base_dir: str, nickname: str, domain: str, content: str,
                 rules: [] = None) -> str:
    """Performs word replacements. eg. Trump -> The Orange Menace

    If no rules are given then they are read from replacewords.txt
    within the account directory, one rule per line. A rule is a pair
    of strings separated by the first of '->', ':', ',', ';' or '-'
    which it contains. Double quotes and surrounding spaces are
    removed from both strings. Rules which do not split into exactly
    two parts, or which have nothing to search for, are ignored.
    PGP encrypted content and PGP public keys are returned unaltered
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content

    if not rules:
        switch_words_filename = \
            acct_dir(base_dir, nickname, domain) + '/replacewords.txt'
        if not os.path.isfile(switch_words_filename):
            return content
        # if the file can't be read then there are no rules to apply
        rules = []
        try:
            with open(switch_words_filename, 'r',
                      encoding='utf-8') as words_file:
                rules = words_file.readlines()
        except OSError:
            print('EX: unable to read switches ' + switch_words_filename)

    splitters = ('->', ':', ',', ';', '-')
    for line in rules:
        replace_str = remove_eol(line)
        word_transform = None
        for split_str in splitters:
            if split_str in replace_str:
                word_transform = replace_str.split(split_str)
                break
        if not word_transform:
            continue
        if len(word_transform) != 2:
            continue
        replace_str1 = word_transform[0].strip().replace('"', '')
        replace_str2 = word_transform[1].strip().replace('"', '')
        if not replace_str1:
            # replacing an empty string would insert the replacement
            # between every character of the content
            continue
        content = content.replace(replace_str1, replace_str2)
    return content
|
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def _save_custom_emoji(session, base_dir: str, emojiName: str, url: str,
|
|
|
|
|
debug: bool) -> None:
|
2021-11-01 17:12:17 +00:00
|
|
|
|
"""Saves custom emoji to file
|
|
|
|
|
"""
|
|
|
|
|
if not session:
|
2021-11-01 17:50:38 +00:00
|
|
|
|
if debug:
|
2021-12-29 21:55:09 +00:00
|
|
|
|
print('EX: _save_custom_emoji no session')
|
2021-11-01 17:12:17 +00:00
|
|
|
|
return
|
|
|
|
|
if '.' not in url:
|
|
|
|
|
return
|
|
|
|
|
ext = url.split('.')[-1]
|
|
|
|
|
if ext != 'png':
|
2021-11-01 17:50:38 +00:00
|
|
|
|
if debug:
|
2021-11-01 18:33:32 +00:00
|
|
|
|
print('EX: Custom emoji is wrong format ' + url)
|
2021-11-01 17:12:17 +00:00
|
|
|
|
return
|
2021-11-01 20:12:04 +00:00
|
|
|
|
emojiName = emojiName.replace(':', '').strip().lower()
|
2021-12-30 20:24:05 +00:00
|
|
|
|
custom_emoji_dir = base_dir + '/emojicustom'
|
|
|
|
|
if not os.path.isdir(custom_emoji_dir):
|
|
|
|
|
os.mkdir(custom_emoji_dir)
|
|
|
|
|
emoji_image_filename = custom_emoji_dir + '/' + emojiName + '.' + ext
|
2022-06-14 10:24:29 +00:00
|
|
|
|
if not download_image(session, url,
|
2021-12-30 20:24:05 +00:00
|
|
|
|
emoji_image_filename, debug, False):
|
2021-11-01 18:33:32 +00:00
|
|
|
|
if debug:
|
|
|
|
|
print('EX: custom emoji not downloaded ' + url)
|
2021-11-01 17:12:17 +00:00
|
|
|
|
return
|
2021-12-30 20:24:05 +00:00
|
|
|
|
emoji_json_filename = custom_emoji_dir + '/emoji.json'
|
|
|
|
|
emoji_json = {}
|
|
|
|
|
if os.path.isfile(emoji_json_filename):
|
|
|
|
|
emoji_json = load_json(emoji_json_filename, 0, 1)
|
|
|
|
|
if not emoji_json:
|
|
|
|
|
emoji_json = {}
|
|
|
|
|
if not emoji_json.get(emojiName):
|
|
|
|
|
emoji_json[emojiName] = emojiName
|
|
|
|
|
save_json(emoji_json, emoji_json_filename)
|
2021-11-01 17:50:38 +00:00
|
|
|
|
if debug:
|
2021-12-30 20:24:05 +00:00
|
|
|
|
print('EX: Saved custom emoji ' + emoji_json_filename)
|
2021-11-01 18:33:32 +00:00
|
|
|
|
elif debug:
|
|
|
|
|
print('EX: cusom emoji already saved')
|
2021-11-01 17:12:17 +00:00
|
|
|
|
|
|
|
|
|
|
2022-04-19 12:54:37 +00:00
|
|
|
|
def _get_emoji_name_from_code(base_dir: str, emoji_code: str) -> str:
|
|
|
|
|
"""Returns the emoji name from its code
|
|
|
|
|
"""
|
|
|
|
|
emojis_filename = base_dir + '/emoji/emoji.json'
|
|
|
|
|
if not os.path.isfile(emojis_filename):
|
|
|
|
|
emojis_filename = base_dir + '/emoji/default_emoji.json'
|
|
|
|
|
if not os.path.isfile(emojis_filename):
|
|
|
|
|
return None
|
|
|
|
|
emojis_json = load_json(emojis_filename)
|
|
|
|
|
if not emojis_json:
|
|
|
|
|
return None
|
|
|
|
|
for emoji_name, code in emojis_json.items():
|
|
|
|
|
if code == emoji_code:
|
|
|
|
|
return emoji_name
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
2022-04-18 21:02:03 +00:00
|
|
|
|
def _update_common_emoji(base_dir: str, emoji_content: str) -> None:
|
|
|
|
|
"""Updates the list of commonly used emoji
|
|
|
|
|
"""
|
2022-04-18 21:13:08 +00:00
|
|
|
|
if '.' in emoji_content:
|
|
|
|
|
emoji_content = emoji_content.split('.')[0]
|
2022-04-18 21:02:03 +00:00
|
|
|
|
emoji_content = emoji_content.replace(':', '')
|
2022-04-19 12:54:37 +00:00
|
|
|
|
if emoji_content.startswith('0x'):
|
|
|
|
|
# lookup the name for an emoji code
|
|
|
|
|
emoji_code = emoji_content[2:]
|
|
|
|
|
emoji_content = _get_emoji_name_from_code(base_dir, emoji_code)
|
|
|
|
|
if not emoji_content:
|
|
|
|
|
return
|
2022-04-18 21:02:03 +00:00
|
|
|
|
common_emoji_filename = base_dir + '/accounts/common_emoji.txt'
|
|
|
|
|
common_emoji = None
|
|
|
|
|
if os.path.isfile(common_emoji_filename):
|
|
|
|
|
try:
|
2022-06-09 14:46:30 +00:00
|
|
|
|
with open(common_emoji_filename, 'r',
|
|
|
|
|
encoding='utf-8') as fp_emoji:
|
2022-04-18 21:02:03 +00:00
|
|
|
|
common_emoji = fp_emoji.readlines()
|
|
|
|
|
except OSError:
|
|
|
|
|
print('EX: unable to load common emoji file')
|
|
|
|
|
if common_emoji:
|
|
|
|
|
new_common_emoji = []
|
|
|
|
|
emoji_found = False
|
|
|
|
|
for line in common_emoji:
|
|
|
|
|
if ' ' + emoji_content in line:
|
|
|
|
|
if not emoji_found:
|
|
|
|
|
emoji_found = True
|
|
|
|
|
counter = 1
|
|
|
|
|
count_str = line.split(' ')[0]
|
|
|
|
|
if count_str.isdigit():
|
|
|
|
|
counter = int(count_str) + 1
|
|
|
|
|
count_str = str(counter).zfill(16)
|
|
|
|
|
line = count_str + ' ' + emoji_content
|
|
|
|
|
new_common_emoji.append(line)
|
|
|
|
|
else:
|
2022-06-21 11:58:50 +00:00
|
|
|
|
line1 = remove_eol(line)
|
|
|
|
|
new_common_emoji.append(line1)
|
2022-04-18 21:02:03 +00:00
|
|
|
|
if not emoji_found:
|
|
|
|
|
new_common_emoji.append(str(1).zfill(16) + ' ' + emoji_content)
|
|
|
|
|
new_common_emoji.sort(reverse=True)
|
|
|
|
|
try:
|
2022-06-09 14:46:30 +00:00
|
|
|
|
with open(common_emoji_filename, 'w+',
|
|
|
|
|
encoding='utf-8') as fp_emoji:
|
2022-04-18 21:02:03 +00:00
|
|
|
|
for line in new_common_emoji:
|
|
|
|
|
fp_emoji.write(line + '\n')
|
|
|
|
|
except OSError:
|
|
|
|
|
print('EX: error writing common emoji 1')
|
|
|
|
|
return
|
|
|
|
|
else:
|
|
|
|
|
line = str(1).zfill(16) + ' ' + emoji_content + '\n'
|
|
|
|
|
try:
|
2022-06-09 14:46:30 +00:00
|
|
|
|
with open(common_emoji_filename, 'w+',
|
|
|
|
|
encoding='utf-8') as fp_emoji:
|
2022-04-18 21:02:03 +00:00
|
|
|
|
fp_emoji.write(line)
|
|
|
|
|
except OSError:
|
|
|
|
|
print('EX: error writing common emoji 2')
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def replace_emoji_from_tags(session, base_dir: str,
                            content: str, tag: [], message_type: str,
                            debug: bool, screen_readable: bool) -> str:
    """Uses the tags to replace :emoji: with html image markup

    Only tags of type Emoji which have a name appearing within the
    content, and an icon url containing a '/', are used.
    If the icon filename begins with a digit and has an extension then
    it is treated as one or more hex unicode codes separated by '-',
    and the emoji name is replaced by the corresponding characters.
    Otherwise, or if that conversion fails, the name is replaced by an
    img element which refers to the icon url.
    The css class of the image depends upon message_type. When
    screen_readable is false unicode emoji are wrapped in an aria-hidden
    span and images are given empty alt text
    """
    for tag_item in tag:
        if not tag_item.get('type'):
            continue
        if tag_item['type'] != 'Emoji':
            continue
        if not tag_item.get('name'):
            continue
        if not tag_item.get('icon'):
            continue
        if not tag_item['icon'].get('url'):
            continue
        if '/' not in tag_item['icon']['url']:
            continue
        if tag_item['name'] not in content:
            continue
        icon_name = tag_item['icon']['url'].split('/')[-1]
        if len(icon_name) > 1 and icon_name[0].isdigit() and \
           '.' in icon_name:
            icon_name = icon_name.split('.')[0]
            # see https://unicode.org/
            # emoji/charts/full-emoji-list.html
            if '-' not in icon_name:
                # a single code
                replaced = False
                try:
                    replace_char = chr(int("0x" + icon_name, 16))
                    if not screen_readable:
                        replace_char = \
                            '<span aria-hidden="true">' + \
                            replace_char + '</span>'
                    content = \
                        content.replace(tag_item['name'],
                                        replace_char)
                    replaced = True
                except (ValueError, OverflowError):
                    # not a hex number, or not a valid code point
                    if debug:
                        print('EX: replace_emoji_from_tags 1 ' +
                              'no conversion of ' +
                              str(icon_name) + ' to chr ' +
                              tag_item['name'] + ' ' +
                              tag_item['icon']['url'])
                if not replaced:
                    _save_custom_emoji(session, base_dir,
                                       tag_item['name'],
                                       tag_item['icon']['url'],
                                       debug)
                    _update_common_emoji(base_dir,
                                         icon_name)
                else:
                    _update_common_emoji(base_dir,
                                         "0x" + icon_name)
            else:
                # sequence of codes
                icon_codes = icon_name.split('-')
                icon_code_sequence = ''
                for icode in icon_codes:
                    replaced = False
                    try:
                        icon_code_sequence += chr(int("0x" +
                                                      icode, 16))
                        replaced = True
                    except (ValueError, OverflowError):
                        # not a hex number, or not a valid code point
                        icon_code_sequence = ''
                        if debug:
                            print('EX: ' +
                                  'replace_emoji_from_tags 2 ' +
                                  'no conversion of ' +
                                  str(icode) + ' to chr ' +
                                  tag_item['name'] + ' ' +
                                  tag_item['icon']['url'])
                    # NOTE(review): this happens for every code within
                    # the sequence, so the usage count increases by the
                    # number of codes - confirm that this is intended
                    if not replaced:
                        _save_custom_emoji(session, base_dir,
                                           tag_item['name'],
                                           tag_item['icon']['url'],
                                           debug)
                        _update_common_emoji(base_dir,
                                             icon_name)
                    else:
                        _update_common_emoji(base_dir,
                                             "0x" + icon_name)
                if icon_code_sequence:
                    if not screen_readable:
                        icon_code_sequence = \
                            '<span aria-hidden="true">' + \
                            icon_code_sequence + '</span>'
                    content = content.replace(tag_item['name'],
                                              icon_code_sequence)

        # Anything not replaced above becomes an image. If the name was
        # already replaced then it is no longer within the content and
        # so this changes nothing
        html_class = 'emoji'
        if message_type == 'post header':
            html_class = 'emojiheader'
        if message_type == 'profile':
            html_class = 'emojiprofile'
        if screen_readable:
            emoji_tag_name = tag_item['name'].replace(':', '')
        else:
            emoji_tag_name = ''
        emoji_html = "<img src=\"" + tag_item['icon']['url'] + "\" alt=\"" + \
            emoji_tag_name + \
            "\" align=\"middle\" class=\"" + html_class + "\"/>"
        content = content.replace(tag_item['name'], emoji_html)
    return content
|
|
|
|
|
|
2020-02-21 15:09:31 +00:00
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def _add_music_tag(content: str, tag: str) -> str:
|
2020-03-29 09:59:54 +00:00
|
|
|
|
"""If a music link is found then ensure that the post is
|
|
|
|
|
tagged appropriately
|
2019-09-05 09:54:27 +00:00
|
|
|
|
"""
|
2020-10-11 09:50:17 +00:00
|
|
|
|
if '#podcast' in content or '#documentary' in content:
|
|
|
|
|
return content
|
2019-09-05 09:54:27 +00:00
|
|
|
|
if '#' not in tag:
|
2020-10-11 09:50:17 +00:00
|
|
|
|
tag = '#' + tag
|
2019-09-05 09:54:27 +00:00
|
|
|
|
if tag in content:
|
|
|
|
|
return content
|
2021-12-30 20:24:05 +00:00
|
|
|
|
music_site_found = False
|
|
|
|
|
for site in MUSIC_SITES:
|
2021-06-22 12:42:52 +00:00
|
|
|
|
if site + '/' in content:
|
2021-12-30 20:24:05 +00:00
|
|
|
|
music_site_found = True
|
2019-09-05 09:54:27 +00:00
|
|
|
|
break
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if not music_site_found:
|
2019-09-05 09:54:27 +00:00
|
|
|
|
return content
|
2020-04-02 09:56:17 +00:00
|
|
|
|
return ':music: ' + content + ' ' + tag + ' '
|
|
|
|
|
|
2019-09-05 09:54:27 +00:00
|
|
|
|
|
2022-05-04 11:34:33 +00:00
|
|
|
|
def _shorten_linked_urls(content: str) -> str:
|
|
|
|
|
"""If content comes with a web link included then make sure
|
|
|
|
|
that it is short enough
|
|
|
|
|
"""
|
|
|
|
|
if 'href=' not in content:
|
|
|
|
|
return content
|
|
|
|
|
if '>' not in content:
|
|
|
|
|
return content
|
|
|
|
|
if '<' not in content:
|
|
|
|
|
return content
|
|
|
|
|
sections = content.split('>')
|
|
|
|
|
ctr = 0
|
|
|
|
|
for section_text in sections:
|
|
|
|
|
if ctr == 0:
|
|
|
|
|
ctr += 1
|
|
|
|
|
continue
|
|
|
|
|
if '<' not in section_text:
|
|
|
|
|
ctr += 1
|
|
|
|
|
continue
|
|
|
|
|
section_text = section_text.split('<')[0]
|
|
|
|
|
if ' ' in section_text:
|
|
|
|
|
continue
|
|
|
|
|
if len(section_text) > MAX_LINK_LENGTH:
|
|
|
|
|
content = content.replace('>' + section_text + '<',
|
|
|
|
|
'>' +
|
|
|
|
|
section_text[:MAX_LINK_LENGTH-1] + '<')
|
|
|
|
|
ctr += 1
|
|
|
|
|
return content
|
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def add_web_links(content: str) -> str:
    """Turns any urls within the content into html links

    A url is a space separated word which begins with one of the known
    link prefixes. A trailing '.' or ';' is not part of the link.
    Within the link the prefix is hidden, and any link text beyond
    MAX_LINK_LENGTH characters is also hidden. Carriage returns are
    removed, and the text of existing links is shortened beforehand
    """
    content = _shorten_linked_urls(content)

    if ':' not in content:
        return content

    prefixes = get_link_prefixes()

    # if there are no prefixes then just keep the content we have
    if not any(prefix in content for prefix in prefixes):
        return content

    content = content.replace('\r', '')
    replace_dict = {}
    for wrd in content.replace('\n', ' --linebreak-- ').split(' '):
        if ':' not in wrd:
            continue
        # only words beginning with a prefix are links
        if not any(wrd.startswith(prefix) for prefix in prefixes):
            continue
        # punctuation following the url is not part of it
        if wrd.endswith(('.', ';')):
            wrd = wrd[:-1]

        markup_parts = [
            '<a href="' + wrd + '" tabindex="10" ' +
            'rel="nofollow noopener noreferrer" target="_blank">'
        ]
        for prefix in prefixes:
            if wrd.startswith(prefix):
                markup_parts.append('<span class="invisible">' +
                                    prefix + '</span>')
                break

        link_text = wrd
        for prefix in prefixes:
            link_text = link_text.replace(prefix, '')

        # prevent links from becoming too long
        if len(link_text) > MAX_LINK_LENGTH:
            markup_parts.append('<span class="ellipsis">' +
                                link_text[:MAX_LINK_LENGTH] + '</span>')
            markup_parts.append('<span class="invisible">' +
                                link_text[MAX_LINK_LENGTH:] +
                                '</span></a>')
        else:
            markup_parts.append('<span class="ellipsis">' +
                                link_text + '</span></a>')
        replace_dict[wrd] = ''.join(markup_parts)

    # do the replacements
    for url, markup in replace_dict.items():
        content = content.replace(url, markup)

    # replace any line breaks
    return content.replace(' --linebreak-- ', '<br>')
|
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2022-01-14 10:20:37 +00:00
|
|
|
|
def safe_web_text(arbitrary_html: str) -> str:
    """Turns arbitrary html into something safe.
    So if the arbitrary html contains attack scripts those will be removed

    All markup is removed, some spurious character sequences are
    deleted and then links are recreated for any urls within the text.
    An empty string is returned if nothing remains after the markup
    has been removed
    """
    # first remove the markup, so that we have something safe
    safe_text = remove_html(arbitrary_html)
    if not safe_text:
        return ''
    # remove any spurious characters found in podcast descriptions
    # \ufffd is the unicode replacement character, which is what badly
    # decoded text contains. The bytes of its utf-8 encoding are
    # EF BF BD, and the literal of that form is kept for compatibility
    remove_chars = ('Œ', 'â€', 'ğŸ', '\ufffd', '<EFBFBD>', ']]', '__')
    for remchar in remove_chars:
        safe_text = safe_text.replace(remchar, '')
    # recreate any url links safely
    return add_web_links(safe_text)
|
|
|
|
|
|
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
|
|
|
|
|
replace_hashtags: {}, post_hashtags: {}) -> bool:
|
2019-08-09 11:12:08 +00:00
|
|
|
|
"""Detects hashtags and adds them to the replacements dict
|
|
|
|
|
Also updates the hashtags list to be added to the post
|
|
|
|
|
"""
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if replace_hashtags.get(word_str):
|
2020-04-02 09:56:17 +00:00
|
|
|
|
return True
|
2021-12-30 20:24:05 +00:00
|
|
|
|
hashtag = word_str[1:]
|
2021-12-29 21:55:09 +00:00
|
|
|
|
if not valid_hash_tag(hashtag):
|
2019-08-09 11:12:08 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
hashtag_url = http_prefix + "://" + domain + "/tags/" + hashtag
|
|
|
|
|
post_hashtags[hashtag] = {
|
|
|
|
|
'href': hashtag_url,
|
2020-10-16 20:13:23 +00:00
|
|
|
|
'name': '#' + hashtag,
|
2019-08-09 11:12:08 +00:00
|
|
|
|
'type': 'Hashtag'
|
|
|
|
|
}
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_hashtags[word_str] = "<a href=\"" + hashtag_url + \
|
2022-05-25 13:20:49 +00:00
|
|
|
|
"\" class=\"mention hashtag\" rel=\"tag\" tabindex=\"10\">#<span>" + \
|
2020-04-02 09:56:17 +00:00
|
|
|
|
hashtag + "</span></a>"
|
2019-08-09 11:12:08 +00:00
|
|
|
|
return True
|
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def _add_emoji(base_dir: str, word_str: str,
|
2021-12-29 21:55:09 +00:00
|
|
|
|
http_prefix: str, domain: str,
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_emoji: {}, post_tags: {},
|
|
|
|
|
emoji_dict: {}) -> bool:
|
2019-08-09 16:18:00 +00:00
|
|
|
|
"""Detects Emoji and adds them to the replacements dict
|
|
|
|
|
Also updates the tags list to be added to the post
|
|
|
|
|
"""
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if not word_str.startswith(':'):
|
2019-08-09 16:18:00 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if not word_str.endswith(':'):
|
2019-08-09 16:18:00 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if len(word_str) < 3:
|
2019-08-09 16:18:00 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if replace_emoji.get(word_str):
|
2020-04-02 09:56:17 +00:00
|
|
|
|
return True
|
2019-09-23 11:11:13 +00:00
|
|
|
|
# remove leading and trailing : characters
|
2021-12-30 20:24:05 +00:00
|
|
|
|
emoji = word_str[1:]
|
2020-04-02 09:56:17 +00:00
|
|
|
|
emoji = emoji[:-1]
|
2019-09-23 11:11:13 +00:00
|
|
|
|
# is the text of the emoji valid?
|
2021-12-29 21:55:09 +00:00
|
|
|
|
if not valid_hash_tag(emoji):
|
2019-08-09 16:18:00 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if not emoji_dict.get(emoji):
|
2019-08-09 16:18:00 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
emoji_filename = base_dir + '/emoji/' + emoji_dict[emoji] + '.png'
|
|
|
|
|
if not os.path.isfile(emoji_filename):
|
2022-03-29 21:10:09 +00:00
|
|
|
|
emoji_filename = \
|
|
|
|
|
base_dir + '/emojicustom/' + emoji_dict[emoji] + '.png'
|
|
|
|
|
if not os.path.isfile(emoji_filename):
|
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
emoji_url = http_prefix + "://" + domain + \
|
|
|
|
|
"/emoji/" + emoji_dict[emoji] + '.png'
|
|
|
|
|
post_tags[emoji] = {
|
2019-08-19 13:35:55 +00:00
|
|
|
|
'icon': {
|
|
|
|
|
'mediaType': 'image/png',
|
|
|
|
|
'type': 'Image',
|
2021-12-30 20:24:05 +00:00
|
|
|
|
'url': emoji_url
|
2019-08-19 13:35:55 +00:00
|
|
|
|
},
|
2021-06-22 12:42:52 +00:00
|
|
|
|
'name': ':' + emoji + ':',
|
2021-12-30 20:24:05 +00:00
|
|
|
|
"updated": file_last_modified(emoji_filename),
|
|
|
|
|
"id": emoji_url.replace('.png', ''),
|
2019-08-09 16:18:00 +00:00
|
|
|
|
'type': 'Emoji'
|
|
|
|
|
}
|
|
|
|
|
return True
|
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def post_tag_exists(tagType: str, tagName: str, tags: {}) -> bool:
    """Returns true if the given tags contain an entry whose 'name'
    and 'type' fields are equal to tagName and tagType
    """
    return any(entry['name'] == tagName and entry['type'] == tagType
               for entry in tags)
|
|
|
|
|
|
|
|
|
|
|
2022-09-03 17:09:00 +00:00
|
|
|
|
def _mention_to_url(base_dir: str, http_prefix: str,
                    domain: str, nickname: str) -> str:
    """Convert https://somedomain/@somenick to
    https://somedomain/users/somenick
    Each of the paths returned by get_user_paths is tried against the
    actors cache directory, and the first one having a cached actor
    file for this nickname and domain is used to build the url.
    If there is no cached actor then the /users/ path is assumed
    """
    actor_prefix = http_prefix + '://' + domain
    cache_prefix = base_dir + '/cache/actors/' + http_prefix + ':##' + domain
    for users_path in get_user_paths():
        # cached actor filenames have # in place of /
        hashed_path = users_path.replace('/', '#')
        cache_entry = cache_prefix + hashed_path + nickname + '.json'
        if os.path.isfile(cache_entry):
            return actor_prefix + hashed_path.replace('#', '/') + nickname
    return actor_prefix + '/users/' + nickname
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _add_mention(base_dir: str, word_str: str, http_prefix: str,
|
|
|
|
|
following: [], petnames: [], replace_mentions: {},
|
2021-12-30 20:24:05 +00:00
|
|
|
|
recipients: [], tags: {}) -> bool:
|
2020-03-29 09:59:54 +00:00
|
|
|
|
"""Detects mentions and adds them to the replacements dict and
|
|
|
|
|
recipients list
|
2019-08-09 09:09:21 +00:00
|
|
|
|
"""
|
2021-12-30 20:24:05 +00:00
|
|
|
|
possible_handle = word_str[1:]
|
2019-08-19 10:05:50 +00:00
|
|
|
|
# @nick
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if following and '@' not in possible_handle:
|
2019-08-09 09:48:51 +00:00
|
|
|
|
# fall back to a best effort match against the following list
|
|
|
|
|
# if no domain was specified. eg. @nick
|
2021-12-30 20:24:05 +00:00
|
|
|
|
possible_nickname = possible_handle
|
2019-08-09 09:48:51 +00:00
|
|
|
|
for follow in following:
|
2021-01-29 21:33:23 +00:00
|
|
|
|
if '@' not in follow:
|
|
|
|
|
continue
|
2021-12-30 20:24:05 +00:00
|
|
|
|
follow_nick = follow.split('@')[0]
|
|
|
|
|
if possible_nickname == follow_nick:
|
2022-06-21 11:58:50 +00:00
|
|
|
|
follow_str = remove_eol(follow)
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_domain = follow_str.split('@')[1]
|
2022-09-03 17:09:00 +00:00
|
|
|
|
recipient_actor = \
|
|
|
|
|
_mention_to_url(base_dir, http_prefix,
|
|
|
|
|
replace_domain, possible_nickname)
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if recipient_actor not in recipients:
|
|
|
|
|
recipients.append(recipient_actor)
|
|
|
|
|
tags[word_str] = {
|
|
|
|
|
'href': recipient_actor,
|
|
|
|
|
'name': word_str,
|
2019-08-19 12:13:18 +00:00
|
|
|
|
'type': 'Mention'
|
|
|
|
|
}
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_mentions[word_str] = \
|
2022-09-03 16:19:09 +00:00
|
|
|
|
"<span class=\"h-card\"><a href=\"" + recipient_actor + \
|
2022-05-25 12:57:31 +00:00
|
|
|
|
"\" tabindex=\"10\" class=\"u-url mention\">@<span>" + \
|
2021-12-30 20:24:05 +00:00
|
|
|
|
possible_nickname + "</span></a></span>"
|
2019-08-09 09:48:51 +00:00
|
|
|
|
return True
|
2021-01-29 21:33:23 +00:00
|
|
|
|
# try replacing petnames with mentions
|
2021-12-30 20:24:05 +00:00
|
|
|
|
follow_ctr = 0
|
2021-01-29 21:33:23 +00:00
|
|
|
|
for follow in following:
|
|
|
|
|
if '@' not in follow:
|
2021-12-30 20:24:05 +00:00
|
|
|
|
follow_ctr += 1
|
2021-01-29 21:33:23 +00:00
|
|
|
|
continue
|
2022-06-21 11:58:50 +00:00
|
|
|
|
pet = remove_eol(petnames[follow_ctr])
|
2021-01-29 21:33:23 +00:00
|
|
|
|
if pet:
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if possible_nickname == pet:
|
2022-06-21 11:58:50 +00:00
|
|
|
|
follow_str = remove_eol(follow)
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_nickname = follow_str.split('@')[0]
|
|
|
|
|
replace_domain = follow_str.split('@')[1]
|
2022-09-03 17:09:00 +00:00
|
|
|
|
recipient_actor = \
|
|
|
|
|
_mention_to_url(base_dir, http_prefix,
|
|
|
|
|
replace_domain, replace_nickname)
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if recipient_actor not in recipients:
|
|
|
|
|
recipients.append(recipient_actor)
|
|
|
|
|
tags[word_str] = {
|
|
|
|
|
'href': recipient_actor,
|
|
|
|
|
'name': word_str,
|
2021-01-29 21:33:23 +00:00
|
|
|
|
'type': 'Mention'
|
|
|
|
|
}
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_mentions[word_str] = \
|
2022-09-03 16:19:09 +00:00
|
|
|
|
"<span class=\"h-card\"><a href=\"" + \
|
|
|
|
|
recipient_actor + "\" tabindex=\"10\" " + \
|
2022-05-25 12:57:31 +00:00
|
|
|
|
"class=\"u-url mention\">@<span>" + \
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_nickname + "</span></a></span>"
|
2021-01-29 21:33:23 +00:00
|
|
|
|
return True
|
2021-12-30 20:24:05 +00:00
|
|
|
|
follow_ctr += 1
|
2019-08-09 09:48:51 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
possible_nickname = None
|
|
|
|
|
possible_domain = None
|
|
|
|
|
if '@' not in possible_handle:
|
2019-10-29 20:15:21 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
possible_nickname = possible_handle.split('@')[0]
|
|
|
|
|
if not possible_nickname:
|
2019-10-29 20:15:21 +00:00
|
|
|
|
return False
|
2021-12-30 20:24:05 +00:00
|
|
|
|
possible_domain = \
|
|
|
|
|
possible_handle.split('@')[1].strip('\n').strip('\r')
|
|
|
|
|
if not possible_domain:
|
2019-10-29 20:15:21 +00:00
|
|
|
|
return False
|
2019-08-19 11:41:15 +00:00
|
|
|
|
if following:
|
|
|
|
|
for follow in following:
|
2022-06-21 11:58:50 +00:00
|
|
|
|
if remove_eol(follow) != possible_handle:
|
2019-08-19 11:41:15 +00:00
|
|
|
|
continue
|
2022-09-03 17:09:00 +00:00
|
|
|
|
recipient_actor = \
|
|
|
|
|
_mention_to_url(base_dir, http_prefix,
|
|
|
|
|
possible_domain, possible_nickname)
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if recipient_actor not in recipients:
|
|
|
|
|
recipients.append(recipient_actor)
|
|
|
|
|
tags[word_str] = {
|
|
|
|
|
'href': recipient_actor,
|
|
|
|
|
'name': word_str,
|
2019-08-19 12:13:18 +00:00
|
|
|
|
'type': 'Mention'
|
|
|
|
|
}
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_mentions[word_str] = \
|
2022-09-03 16:22:56 +00:00
|
|
|
|
"<span class=\"h-card\"><a href=\"" + recipient_actor + \
|
2022-05-25 12:57:31 +00:00
|
|
|
|
"\" tabindex=\"10\" class=\"u-url mention\">@<span>" + \
|
|
|
|
|
possible_nickname + "</span></a></span>"
|
2019-08-19 11:41:15 +00:00
|
|
|
|
return True
|
2019-08-19 10:05:50 +00:00
|
|
|
|
# @nick@domain
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if not (possible_domain == 'localhost' or '.' in possible_domain):
|
2020-03-22 21:16:02 +00:00
|
|
|
|
return False
|
2022-09-03 17:09:00 +00:00
|
|
|
|
recipient_actor = \
|
|
|
|
|
_mention_to_url(base_dir, http_prefix,
|
|
|
|
|
possible_domain, possible_nickname)
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if recipient_actor not in recipients:
|
|
|
|
|
recipients.append(recipient_actor)
|
|
|
|
|
tags[word_str] = {
|
|
|
|
|
'href': recipient_actor,
|
|
|
|
|
'name': word_str,
|
2019-10-29 20:15:21 +00:00
|
|
|
|
'type': 'Mention'
|
|
|
|
|
}
|
2021-12-30 20:24:05 +00:00
|
|
|
|
replace_mentions[word_str] = \
|
2022-09-03 16:22:56 +00:00
|
|
|
|
"<span class=\"h-card\"><a href=\"" + recipient_actor + \
|
2022-05-25 12:57:31 +00:00
|
|
|
|
"\" tabindex=\"10\" class=\"u-url mention\">@<span>" + \
|
|
|
|
|
possible_nickname + "</span></a></span>"
|
2019-10-29 20:15:21 +00:00
|
|
|
|
return True
|
2019-08-09 09:09:21 +00:00
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def replace_content_duplicates(content: str) -> str:
    """Replaces invalid duplicates within content

    Runs of < or > are collapsed to a single character and any <\\p>
    sequences are removed. Content which is PGP encrypted, or which
    contains a PGP public key, is returned unaltered.
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    for doubled, single in (('<<', '<'), ('>>', '>')):
        # repeat until no doubled characters remain
        while doubled in content:
            content = content.replace(doubled, single)
    return content.replace('<\\p>', '')
|
|
|
|
|
|
|
|
|
|
|
2022-03-24 14:40:28 +00:00
|
|
|
|
def remove_text_formatting(content: str, bold_reading: bool) -> str:
    """Removes markup for bold, italics, etc

    The opening and closing tags for each entry of REMOVE_MARKUP are
    deleted, in both lower and upper case. If bold_reading is set then
    the 'b' entry is left in place. Content which is PGP encrypted, or
    which contains a PGP public key, is returned unaltered.
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    if '<' not in content:
        return content
    for tag in REMOVE_MARKUP:
        if bold_reading and tag == 'b':
            continue
        for tag_name in (tag, tag.upper()):
            content = content.replace('<' + tag_name + '>', '')
            content = content.replace('</' + tag_name + '>', '')
    return content
|
|
|
|
|
|
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def remove_long_words(content: str, max_word_length: int,
                      long_words_list: []) -> str:
    """Breaks up long words so that on mobile screens this doesn't
    disrupt the layout

    content is the text to be altered, max_word_length is the number of
    characters beyond which a word is considered to be long, and
    long_words_list is the list of long words to be processed. If that
    list is empty then it is built from the space separated words
    within content.

    Long words which look like markup, handles, addresses or links are
    left as they are. Any other long word has a line break inserted
    after max_word_length characters when the remainder is shorter than
    max_word_length, and is otherwise truncated to max_word_length.

    Content which is PGP encrypted, or which contains a PGP public key,
    is returned unaltered.
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    content = replace_content_duplicates(content)
    if ' ' not in content:
        # handle a single very long string with no spaces
        # NOTE(review): r'<\p>' is a backslash sequence rather than the
        # closing tag '</p>' - confirm that this is intended
        content_str = content.replace('<p>', '').replace(r'<\p>', '')
        if '://' not in content_str:
            if len(content_str) > max_word_length:
                if '<p>' in content:
                    content = '<p>' + content_str[:max_word_length] + r'<\p>'
                else:
                    content = content[:max_word_length]
                return content
    # temporarily add a space so that an empty paragraph is split into
    # two words. This is undone before returning
    content = content.replace('<p></p>', '<p> </p>')
    words = content.split(' ')
    if not long_words_list:
        # no list was supplied, so find the long words here
        long_words_list = []
        for word_str in words:
            if len(word_str) > max_word_length:
                if word_str not in long_words_list:
                    long_words_list.append(word_str)
    for word_str in long_words_list:
        if word_str.startswith('<p>'):
            word_str = word_str.replace('<p>', '')
        # skip words beginning with markup
        if word_str.startswith('<'):
            continue
        if len(word_str) == 76:
            if word_str.upper() == word_str:
                # tox address
                continue
        # skip markup attributes
        if '=\"' in word_str:
            continue
        # skip anything containing a single @, such as a handle
        if '@' in word_str:
            if '@@' not in word_str:
                continue
        if '=.ed25519' in word_str:
            continue
        if '.onion' in word_str:
            continue
        if '.i2p' in word_str:
            continue
        # skip links of various kinds
        if 'https:' in word_str:
            continue
        if 'http:' in word_str:
            continue
        if 'i2p:' in word_str:
            continue
        if 'gnunet:' in word_str:
            continue
        if 'dat:' in word_str:
            continue
        if 'rad:' in word_str:
            continue
        if 'hyper:' in word_str:
            continue
        if 'briar:' in word_str:
            continue
        if '<' in word_str:
            # only keep the text which is before any markup
            replace_word = word_str.split('<', 1)[0]
            # if len(replace_word) > max_word_length:
            #     replace_word = replace_word[:max_word_length]
            content = content.replace(word_str, replace_word)
            word_str = replace_word
        if '/' in word_str:
            continue
        if len(word_str[max_word_length:]) < max_word_length:
            # the remainder is short enough to go on a second line
            content = content.replace(word_str,
                                      word_str[:max_word_length] + '\n' +
                                      word_str[max_word_length:])
        else:
            # otherwise truncate the word
            content = content.replace(word_str,
                                      word_str[:max_word_length])
    # ensure that an opening paragraph has a closing tag
    if content.startswith('<p>'):
        if not content.endswith('</p>'):
            content = content.strip() + '</p>'
    content = content.replace('<p> </p>', '<p></p>')
    return content
|
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def _load_auto_tags(base_dir: str, nickname: str, domain: str) -> []:
    """Returns the lines of the autotags.txt file within the account
    directory for the given nickname and domain.
    An empty list is returned if the file doesn't exist or can't be read
    """
    autotags_filename = acct_dir(base_dir, nickname, domain) + '/autotags.txt'
    if os.path.isfile(autotags_filename):
        try:
            with open(autotags_filename, 'r', encoding='utf-8') as fp_tags:
                return fp_tags.readlines()
        except OSError:
            print('EX: unable to read auto tags ' + autotags_filename)
    return []
|
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def _auto_tag(base_dir: str, nickname: str, domain: str,
|
2021-12-30 20:24:05 +00:00
|
|
|
|
word_str: str, auto_tag_list: [],
|
|
|
|
|
append_tags: []):
|
2020-09-13 14:42:17 +00:00
|
|
|
|
"""Generates a list of tags to be automatically appended to the content
|
|
|
|
|
"""
|
2021-12-30 20:24:05 +00:00
|
|
|
|
for tag_rule in auto_tag_list:
|
|
|
|
|
if word_str not in tag_rule:
|
2020-09-13 14:42:17 +00:00
|
|
|
|
continue
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if '->' not in tag_rule:
|
2020-09-13 14:42:17 +00:00
|
|
|
|
continue
|
2021-12-30 20:24:05 +00:00
|
|
|
|
rulematch = tag_rule.split('->')[0].strip()
|
|
|
|
|
if rulematch != word_str:
|
2020-09-13 14:42:17 +00:00
|
|
|
|
continue
|
2021-12-30 20:24:05 +00:00
|
|
|
|
tag_name = tag_rule.split('->')[1].strip()
|
|
|
|
|
if tag_name.startswith('#'):
|
|
|
|
|
if tag_name not in append_tags:
|
|
|
|
|
append_tags.append(tag_name)
|
2020-09-13 14:42:17 +00:00
|
|
|
|
else:
|
2021-12-30 20:24:05 +00:00
|
|
|
|
if '#' + tag_name not in append_tags:
|
|
|
|
|
append_tags.append('#' + tag_name)
|
2020-09-13 14:42:17 +00:00
|
|
|
|
|
|
|
|
|
|
2022-07-05 11:12:07 +00:00
|
|
|
|
def _get_simplified_content(content: str) -> str:
|
2022-07-05 11:12:58 +00:00
|
|
|
|
"""Returns a simplified version of the content suitable for
|
|
|
|
|
splitting up into individual words
|
2022-07-05 11:12:07 +00:00
|
|
|
|
"""
|
|
|
|
|
content_simplified = \
|
|
|
|
|
content.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
|
|
|
|
|
content_simplified = content_simplified.replace('. ', ' ').strip()
|
|
|
|
|
if content_simplified.endswith('.'):
|
|
|
|
|
content_simplified = content_simplified[:len(content_simplified)-1]
|
2022-07-05 11:15:26 +00:00
|
|
|
|
return content_simplified
|
2022-07-05 11:12:07 +00:00
|
|
|
|
|
|
|
|
|
|
2022-07-05 12:30:21 +00:00
|
|
|
|
def detect_dogwhistles(content: str, dogwhistles: {}) -> {}:
    """Returns a dict containing any detected dogwhistle words

    dogwhistles maps each pattern to its category, and patterns having
    no category are ignored. Matching is done on lower case text with
    html removed, against each word and each pair of adjacent words.

    A pattern beginning with x-, *, ~ or - matches the end of a word.
    A pattern ending with -x, *, ~ or - matches the beginning of a word.
    A pattern with * elsewhere matches words which begin with the text
    before the * and end with the text after it.
    Any other pattern must match exactly.

    The returned dict is keyed on the pattern, without any leading or
    trailing marker, and each value contains "count" and "category"
    """
    plain_text = remove_html(content).lower()
    word_list = _get_simplified_content(plain_text).split(' ')
    detected = {}

    def _tally(key: str, label, is_match) -> None:
        """Counts the words, or pairs of adjacent words, accepted by
        is_match and accumulates the total within detected[key]
        """
        previous = ''
        for current in word_list:
            pair = (previous + ' ' + current).strip()
            if is_match(current) or is_match(pair):
                if detected.get(key):
                    detected[key]['count'] += 1
                else:
                    detected[key] = {
                        "count": 1,
                        "category": label
                    }
            previous = current

    for pattern, category in dogwhistles.items():
        if not category:
            continue
        pattern = pattern.lower()

        if pattern.startswith('x-') or \
           pattern.startswith(('*', '~', '-')):
            # match the end of a word
            marker_len = 2 if pattern.startswith('x-') else 1
            tail = pattern[marker_len:]
            _tally(tail, category,
                   lambda txt, tail=tail: txt.endswith(tail))
        elif pattern.endswith('-x') or \
                pattern.endswith(('*', '~', '-')):
            # match the beginning of a word
            marker_len = 2 if pattern.endswith('-x') else 1
            head = pattern[:len(pattern) - marker_len]
            _tally(head, category,
                   lambda txt, head=head: txt.startswith(head))
        elif '*' in pattern:
            # match both the beginning and the end of a word
            head, tail = pattern.split('*', 1)
            _tally(pattern, category,
                   lambda txt, head=head, tail=tail:
                   txt.startswith(head) and txt.endswith(tail))
        else:
            # exact match
            _tally(pattern, category,
                   lambda txt, pattern=pattern: txt == pattern)
    return detected
|
|
|
|
|
|
|
|
|
|
|
2022-07-05 12:30:21 +00:00
|
|
|
|
def load_dogwhistles(filename: str) -> {}:
    """Loads a list of dogwhistles from file

    Blank lines and lines beginning with # are ignored. On each other
    line the first of the separators -> => , ; | = which is present
    divides the dogwhistle from its category. A line without any
    separator, or with nothing before the separator, is used in its
    entirety as the dogwhistle.

    Returns a dict of dogwhistle to category, where the category is
    None if the line had no separator. An empty dict is returned if
    the file doesn't exist or can't be read
    """
    if not os.path.isfile(filename):
        return {}
    try:
        with open(filename, 'r', encoding='utf-8') as fp_dogwhistles:
            raw_lines = fp_dogwhistles.readlines()
    except OSError:
        print('EX: unable to load dogwhistles from ' + filename)
        return {}

    separators = ('->', '=>', ',', ';', '|', '=')
    loaded = {}
    for raw_line in raw_lines:
        entry = remove_eol(raw_line).strip()
        if not entry or entry.startswith('#'):
            continue
        name, label = None, None
        # separators are tried in order of priority
        found_sep = next((sep for sep in separators if sep in entry), None)
        if found_sep is not None:
            before, _, after = entry.partition(found_sep)
            name, label = before.strip(), after.strip()
        loaded[name or entry] = label
    return loaded
|
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def add_html_tags(base_dir: str, http_prefix: str,
                  nickname: str, domain: str, content: str,
                  recipients: [], hashtags: {}, translate: {},
                  is_json_content: bool = False) -> str:
    """ Replaces plaintext mentions such as @nick@domain into html
    by matching against known following accounts

    Hashtags, emoji and auto generated tags are handled in the same
    pass. Their tag entries are added to the hashtags dict and the
    actors which were mentioned are appended to recipients.
    Emoji text is only replaced if is_json_content is False.

    Content which already begins with <p> only has its email quotes
    and quote marks replaced.

    Returns the content as html
    """
    if content.startswith('<p>'):
        content = html_replace_email_quote(content)
        return html_replace_quote_marks(content)
    max_word_length = 40
    content = content.replace('\r', '')
    # line breaks are kept as a placeholder word until the end
    content = content.replace('\n', ' --linebreak-- ')
    now_playing_str = 'NowPlaying'
    if translate.get(now_playing_str):
        now_playing_str = translate[now_playing_str]
    now_playing_lower_str = 'nowplaying'
    if translate.get(now_playing_lower_str):
        now_playing_lower_str = translate[now_playing_lower_str]
    if '#' + now_playing_lower_str in content:
        content = content.replace('#' + now_playing_lower_str,
                                  '#' + now_playing_str)
    content = _add_music_tag(content, now_playing_str)
    words = _get_simplified_content(content).split(' ')

    # remove . for words which are not mentions
    new_words = []
    for word_str in words:
        if word_str.endswith('.'):
            if not word_str.startswith('@'):
                word_str = word_str[:-1]
        if word_str.startswith('.'):
            word_str = word_str[1:]
        new_words.append(word_str)
    words = new_words

    replace_mentions = {}
    replace_hashtags = {}
    replace_emoji = {}
    emoji_dict = {}
    original_domain = domain
    domain = remove_domain_port(domain)
    following_filename = \
        acct_dir(base_dir, nickname, domain) + '/following.txt'

    # read the following list so that we can detect just @nick
    # in addition to @nick@domain
    following = None
    petnames = None
    # NOTE(review): this is only true if one of the words is exactly
    # '@' - confirm whether words beginning with '@' were intended
    if '@' in words:
        if os.path.isfile(following_filename):
            following = []
            try:
                with open(following_filename, 'r',
                          encoding='utf-8') as foll_file:
                    following = foll_file.readlines()
            except OSError:
                print('EX: unable to read ' + following_filename)
            # one petname entry for each following entry, so that
            # _add_mention can index the two lists in parallel.
            # An empty entry means that there is no petname
            petnames = []
            for handle in following:
                pet = get_pet_name(base_dir, nickname, domain, handle)
                if pet:
                    petnames.append(pet + '\n')
                else:
                    petnames.append('')

    # extract mentions and tags from words
    long_words_list = []
    prev_word_str = ''
    auto_tags_list = _load_auto_tags(base_dir, nickname, domain)
    append_tags = []
    for word_str in words:
        word_len = len(word_str)
        if word_len > 2:
            if word_len > max_word_length:
                long_words_list.append(word_str)
            first_char = word_str[0]
            if first_char == '@':
                if _add_mention(base_dir, word_str, http_prefix, following,
                                petnames, replace_mentions, recipients,
                                hashtags):
                    prev_word_str = ''
                    continue
            elif first_char == '#':
                # remove any endings from the hashtag
                hash_tag_endings = ('.', ':', ';', '-', '\n')
                for ending in hash_tag_endings:
                    if word_str.endswith(ending):
                        word_str = word_str[:-1]
                        break

                if _add_hash_tags(word_str, http_prefix, original_domain,
                                  replace_hashtags, hashtags):
                    prev_word_str = ''
                    continue
            elif ':' in word_str:
                word_str2 = word_str.split(':')[1]
                # print('TAG: emoji located - ' + word_str)
                if not emoji_dict:
                    # emoji.json is generated so that it can be customized and
                    # the changes will be retained even if default_emoji.json
                    # is subsequently updated
                    if not os.path.isfile(base_dir + '/emoji/emoji.json'):
                        copyfile(base_dir + '/emoji/default_emoji.json',
                                 base_dir + '/emoji/emoji.json')
                emoji_dict = load_json(base_dir + '/emoji/emoji.json')

                # append custom emoji to the dict
                custom_emoji_filename = base_dir + '/emojicustom/emoji.json'
                if os.path.isfile(custom_emoji_filename):
                    custom_emoji_dict = load_json(custom_emoji_filename)
                    if custom_emoji_dict:
                        # combine emoji dicts one by one
                        for ename, eitem in custom_emoji_dict.items():
                            if ename and eitem:
                                if not emoji_dict.get(ename):
                                    emoji_dict[ename] = eitem

                # print('TAG: looking up emoji for :' + word_str2 + ':')
                _add_emoji(base_dir, ':' + word_str2 + ':', http_prefix,
                           original_domain, replace_emoji, hashtags,
                           emoji_dict)
            else:
                # try the word by itself, then together with the
                # previous word, against the auto tag rules
                if _auto_tag(base_dir, nickname, domain, word_str,
                             auto_tags_list, append_tags):
                    prev_word_str = ''
                    continue
                if prev_word_str:
                    if _auto_tag(base_dir, nickname, domain,
                                 prev_word_str + ' ' + word_str,
                                 auto_tags_list, append_tags):
                        prev_word_str = ''
                        continue
            prev_word_str = word_str

    # add any auto generated tags
    for appended in append_tags:
        content = content + ' ' + appended
        _add_hash_tags(appended, http_prefix, original_domain,
                       replace_hashtags, hashtags)

    # replace words with their html versions
    for word_str, replace_str in replace_mentions.items():
        content = content.replace(word_str, replace_str)
    for word_str, replace_str in replace_hashtags.items():
        content = content.replace(word_str, replace_str)
    if not is_json_content:
        for word_str, replace_str in replace_emoji.items():
            content = content.replace(word_str, replace_str)

    content = add_web_links(content)
    if long_words_list:
        content = remove_long_words(content, max_word_length, long_words_list)
    content = limit_repeated_words(content, 6)
    # each line of the original text becomes a paragraph
    content = content.replace(' --linebreak-- ', '</p><p>')
    content = html_replace_email_quote(content)
    return '<p>' + html_replace_quote_marks(content) + '</p>'
|
2020-03-22 21:16:02 +00:00
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def get_mentions_from_html(html_text: str, match_str: str) -> []:
    """Extracts mentioned actors from the given html content string

    html_text is split on match_str. For every piece which follows an
    occurrence of match_str the text up to the next double quote is taken
    as the candidate actor. Candidates are kept only if they begin with
    one of the recognised prefixes, and each actor is returned once, in
    order of first appearance.
    """
    mentions = []
    if match_str not in html_text:
        return mentions
    actor_prefixes = (
        'http', 'gnunet', 'i2p', 'ipfs', 'ipns', 'hyper', 'dat:'
    )
    # The first piece is whatever precedes the first match_str, so it can
    # never hold an actor and is skipped
    for mention_str in html_text.split(match_str)[1:]:
        if '"' not in mention_str:
            continue
        actor_str = mention_str.split('"')[0]
        if not actor_str.startswith(actor_prefixes):
            continue
        if actor_str not in mentions:
            mentions.append(actor_str)
    return mentions
|
2019-11-10 11:37:24 +00:00
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def extract_media_in_form_post(post_bytes, boundary, name: str):
    """Separates the media section with the given field name from the
    bytes of a http form POST

    Returns a tuple of (media bytes, remaining bytes). The media bytes
    start at the Content-Disposition header for the field and stop just
    before the next occurrence of the boundary. If the field is absent
    then the media bytes are None and the POST bytes are returned
    unaltered.
    """
    field_header = b''.join((
        b'Content-Disposition: form-data; name="',
        name.encode('utf8', 'ignore'),
        b'";'
    ))
    begin = post_bytes.find(field_header)
    if begin < 0:
        return None, post_bytes

    # everything before the field header, and everything from it onwards
    leading = post_bytes[:begin]
    section = post_bytes[begin:]

    # the media runs up to the next boundary, if there is one
    end = section.find(boundary.encode('utf8', 'ignore'))
    if end < 0:
        # no ending boundary
        return section, leading

    # the media, then the bytes before it plus the bytes after it
    return section[:end], leading + section[end:]
|
2019-11-10 11:37:24 +00:00
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2022-07-21 10:16:14 +00:00
|
|
|
|
def _valid_follows_csv(content: str) -> bool:
    """Does the given content look like a csv file of imported follows?
    It must contain a comma and the 'Account address,' column heading
    """
    return ',' in content and 'Account address,' in content
|
|
|
|
|
|
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def save_media_in_form_post(media_bytes, debug: bool,
                            filename_base: str = None) -> (str, str):
    """Saves the given media bytes extracted from http form POST
    Returns the filename and attachment type

    The media type is detected from the first known Content-Type header
    found within media_bytes, and the payload following that header is
    written to filename_base plus the detected extension.
    (None, None) is returned if there is no media, no filename_base, no
    recognised Content-Type, no payload after the header, if an svg is
    judged dangerous, if a csv is not a follows list, if svg/csv content
    cannot be decoded as text, or if the file could not be written.
    If media_bytes is empty then any existing files for filename_base
    are removed.
    """
    if not media_bytes:
        if filename_base:
            # remove any existing files
            for ex in get_image_extensions():
                possible_other_format = filename_base + '.' + ex
                if os.path.isfile(possible_other_format):
                    try:
                        os.remove(possible_other_format)
                    except OSError:
                        if debug:
                            print('EX: save_media_in_form_post ' +
                                  'unable to delete other ' +
                                  str(possible_other_format))
            if os.path.isfile(filename_base):
                try:
                    os.remove(filename_base)
                except OSError:
                    if debug:
                        print('EX: save_media_in_form_post ' +
                              'unable to delete ' +
                              str(filename_base))

        if debug:
            print('DEBUG: No media found within POST')
        return None, None

    media_location = -1
    search_str = b''
    filename = None
    attachment_media_type = None

    # directly search the binary array for the beginning
    # of an image, zip or csv.
    # The order matters: the first content type found wins
    extension_list = {
        'png': 'image/png',
        'jpeg': 'image/jpeg',
        'jxl': 'image/jxl',
        'gif': 'image/gif',
        'svg': 'image/svg+xml',
        'webp': 'image/webp',
        'avif': 'image/avif',
        'mp4': 'video/mp4',
        'ogv': 'video/ogv',
        'mp3': 'audio/mpeg',
        'ogg': 'audio/ogg',
        'wav': 'audio/vnd.wave',
        'wav2': 'audio/wav',
        'wav3': 'audio/x-wav',
        'wav4': 'audio/x-pn-wave',
        'opus': 'audio/opus',
        'spx': 'audio/speex',
        'flac': 'audio/flac',
        'zip': 'application/zip',
        'csv': 'text/csv',
        'csv2': 'text/plain'
    }
    # keys which only exist to list an alternative content type,
    # mapped to the file extension which is actually used
    extension_aliases = {
        'jpeg': 'jpg',
        'csv2': 'csv',
        'wav2': 'wav',
        'wav3': 'wav',
        'wav4': 'wav'
    }
    detected_extension = None
    for extension, content_type in extension_list.items():
        search_str = b'Content-Type: ' + content_type.encode('utf8', 'ignore')
        media_location = media_bytes.find(search_str)
        if media_location == -1:
            continue
        # image/video/audio binaries
        detected_extension = extension_aliases.get(extension, extension)
        if filename_base:
            filename = filename_base + '.' + detected_extension
        # the part of the content type before the slash, eg. "image"
        attachment_media_type = content_type.split('/', maxsplit=1)[0]
        break

    if not filename:
        return None, None

    # locate the beginning of the image, after any
    # carriage returns
    start_pos = media_location + len(search_str)
    for offset in range(1, 8):
        if start_pos + offset >= len(media_bytes):
            # the upload ends before any payload byte was found
            if debug:
                print('DEBUG: save_media_in_form_post ' +
                      'no media content after Content-Type')
            return None, None
        if media_bytes[start_pos + offset] not in (10, 13):
            start_pos += offset
            break

    # remove any existing image files with a different format
    if detected_extension != 'zip':
        for ex in get_image_extensions():
            if ex == detected_extension:
                continue
            possible_other_format = \
                filename.replace('.temp', '').replace('.' +
                                                      detected_extension,
                                                      '.' + ex)
            if os.path.isfile(possible_other_format):
                try:
                    os.remove(possible_other_format)
                except OSError:
                    if debug:
                        print('EX: save_media_in_form_post ' +
                              'unable to delete other 2 ' +
                              str(possible_other_format))

    if detected_extension in ('svg', 'csv'):
        # these are inspected as text before being accepted
        try:
            text_content = media_bytes[start_pos:].decode()
        except UnicodeDecodeError:
            if debug:
                print('EX: save_media_in_form_post ' +
                      'unable to decode ' + detected_extension)
            return None, None
        if detected_extension == 'svg':
            # don't allow scripts within svg files
            if dangerous_svg(text_content, False):
                return None, None
        elif not _valid_follows_csv(text_content):
            return None, None

    try:
        with open(filename, 'wb') as fp_media:
            fp_media.write(media_bytes[start_pos:])
    except OSError:
        print('EX: unable to write media')

    if not os.path.isfile(filename):
        print('WARN: Media file could not be written to file: ' + filename)
        return None, None
    print('Uploaded media file written: ' + filename)

    return filename, attachment_media_type
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
2019-11-10 11:37:24 +00:00
|
|
|
|
|
2022-07-11 14:45:40 +00:00
|
|
|
|
def combine_textarea_lines(text: str) -> str:
    """Combines separate lines

    Paragraphs, which are separated by a blank line, are joined together
    with '</p><p>'. Within a paragraph line breaks become single spaces,
    except for lines beginning with '* ' or '- ' (optionally preceded by
    one space), which stay on their own line.
    """
    bullet_marker = '***BULLET POINT*** '
    dash_marker = '***DASH POINT*** '
    double_space = ' ' * 2
    combined = []
    for para in text.split('\n\n'):
        # protect list items before the line breaks are flattened
        para = para.replace('\n* ', bullet_marker)
        para = para.replace('\n * ', bullet_marker)
        para = para.replace('\n- ', dash_marker)
        para = para.replace('\n - ', dash_marker)
        para = para.replace('\n', ' ')
        # joining lines can leave runs of spaces, so collapse them
        while double_space in para:
            para = para.replace(double_space, ' ')
        # restore the list items
        para = para.replace(bullet_marker, '\n* ')
        para = para.replace(dash_marker, '\n- ')
        combined.append(para)
    return '</p><p>'.join(combined)
|
|
|
|
|
|
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def extract_text_fields_in_post(post_bytes, boundary: str, debug: bool,
                                unit_test_data: str = None) -> {}:
    """Returns a dictionary containing the text fields of a http form POST
    The boundary argument comes from the http header

    post_bytes is parsed unless unit_test_data is supplied, in which case
    that string is used as the message instead.
    Sections containing a semicolon are ignored unless the field is one
    of those where semicolons are permitted or its name begins with
    'edited'. For a POST with the 'LYNX' boundary the lines of the
    'message' and 'bio' fields are combined.
    When debug is set nothing is printed which could contain the value
    of a password field.
    """
    if boundary == 'LYNX':
        if debug:
            print('POST from lynx browser')
        boundary = '--LYNX'

    if not unit_test_data:
        msg_bytes = email.parser.BytesParser().parsebytes(post_bytes)
        message_fields = msg_bytes.get_payload(decode=True).decode('utf-8')
    else:
        message_fields = unit_test_data

    # This has to be decided on the whole message text. After splitting
    # into sections a membership test would only match a section which
    # is exactly 'password', and so passwords would get printed
    contains_password = 'password' in message_fields

    if debug and not contains_password:
        print('DEBUG: POST arriving ' + message_fields)

    message_fields = message_fields.split(boundary)
    fields = {}
    fields_with_semicolon_allowed = (
        'message', 'bio', 'autoCW', 'password', 'passwordconfirm',
        'instanceDescription', 'instanceDescriptionShort',
        'subject', 'location', 'imageDescription'
    )
    if debug and not contains_password:
        print('DEBUG: POST message_fields: ' + str(message_fields))
    lynx_content_type = 'Content-Type: text/plain; charset=utf-8\r\n'
    # examine each section of the POST, separated by the boundary
    for fld in message_fields:
        if fld == '--':
            continue
        if ' name="' not in fld:
            continue
        post_str = fld.split(' name="', 1)[1]
        if '"' not in post_str:
            continue
        post_key, post_value_str = post_str.split('"', 1)
        if debug:
            print('post_key: ' + post_key)
        if boundary == '--LYNX':
            post_value_str = \
                post_value_str.replace(lynx_content_type, '')
        show_value = debug and 'password' not in post_key
        if show_value:
            print('boundary: ' + boundary)
            print('post_value_str1: ' + post_value_str)
        if ';' in post_value_str:
            if post_key not in fields_with_semicolon_allowed and \
               not post_key.startswith('edited'):
                if debug:
                    print('extract_text_fields_in_post exit 1')
                continue
        if show_value:
            print('post_value_str2: ' + post_value_str)
        if '\r\n' not in post_value_str:
            if debug:
                print('extract_text_fields_in_post exit 2')
            continue
        post_lines = post_value_str.split('\r\n')
        if show_value:
            print('post_lines: ' + str(post_lines))
        # the first two lines are the remainder of the section header
        # and the last line is what precedes the next boundary
        post_value = '\n'.join(post_lines[2:-1])
        fields[post_key] = urllib.parse.unquote(post_value)
        if boundary == '--LYNX' and post_key in ('message', 'bio'):
            fields[post_key] = combine_textarea_lines(fields[post_key])
    return fields
|
2021-07-10 09:38:59 +00:00
|
|
|
|
|
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def limit_repeated_words(text: str, max_repeats: int) -> str:
    """Shortens runs of a word which is repeated many times in a row

    A run of the same space separated word having more than max_repeats
    repetitions is replaced by max_repeats copies of that word.
    """
    substitutions = {}

    def _note_run(run_word: str, run_count: int, run_text: str) -> None:
        """Records a replacement if the run was too long
        """
        if run_count > max_repeats:
            shortened = ((run_word + ' ') * max_repeats).strip()
            substitutions[run_word] = [run_text, shortened]

    previous = ''
    count = 0
    run = ''
    for token in text.replace('\n', ' ').split(' '):
        if token == previous:
            count += 1
            run = run + ' ' + token if run else token + ' ' + token
        else:
            # the run of the previous word has ended
            _note_run(previous, count, run)
            count = 0
            run = ''
        previous = token

    # a run may reach the end of the text
    _note_run(previous, count, run)

    for run_text, shortened in substitutions.values():
        text = text.replace(run_text, shortened)
    return text
|
2021-08-07 17:03:41 +00:00
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def get_price_from_string(priceStr: str) -> (str, str):
    """Returns the item price and currency

    Each known currency is tried in turn, looking for its symbol and
    otherwise its name within priceStr. If what remains after removing
    it is a number then that and the currency name are returned.
    A bare number is returned with "EUR", and anything else results
    in "0.00", "EUR".
    """
    for symbol, name in get_currencies().items():
        if symbol in priceStr:
            marker = symbol
        elif name in priceStr:
            marker = name
        else:
            continue
        amount = priceStr.replace(marker, '')
        if is_float(amount):
            return amount, name
    if is_float(priceStr):
        return priceStr, "EUR"
    return "0.00", "EUR"
|
2021-10-14 15:12:35 +00:00
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def _words_similarity_histogram(words: []) -> {}:
    """Counts how many times each pair of adjacent words occurs
    The dictionary keys are the two words joined together
    """
    histogram = {}
    for first, second in zip(words, words[1:]):
        pair = first + second
        histogram[pair] = histogram.get(pair, 0) + 1
    return histogram
|
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def _words_similarity_words_list(content: str) -> []:
    """Returns a list of words for the given content

    Html is removed, the text is converted to lower case and some
    punctuation is replaced by spaces before splitting on spaces.
    """
    remove_punctuation = ('.', ',', ';', '-', ':', '"')
    content = remove_html(content).lower()
    for punc in remove_punctuation:
        content = content.replace(punc, ' ')
    # Replacing punctuation creates runs of spaces. Splitting those
    # produces empty strings, which are not words
    return [word for word in content.split(' ') if word]
|
|
|
|
|
|
|
|
|
|
|
2021-12-30 20:24:05 +00:00
|
|
|
|
def words_similarity(content1: str, content2: str, min_words: int) -> int:
    """Returns percentage similarity

    The comparison is made using counts of adjacent word pairs.
    Identical content is 100. Zero is returned if either content has
    fewer than min_words words, or if the first content has no word
    pairs to compare. The result is never negative.
    """
    if content1 == content2:
        return 100

    words1 = _words_similarity_words_list(content1)
    if len(words1) < min_words:
        return 0

    words2 = _words_similarity_words_list(content2)
    if len(words2) < min_words:
        return 0

    histogram1 = _words_similarity_histogram(words1)
    if not histogram1:
        # fewer than two words, so there is nothing to divide by
        return 0
    histogram2 = _words_similarity_histogram(words2)

    diff = 0
    for combined_words, count1 in histogram1.items():
        count2 = histogram2.get(combined_words)
        if not count2:
            diff += 1
        else:
            diff += abs(count2 - count1)
    # large differences in counts could otherwise go below zero
    return max(0, 100 - int(diff * 100 / len(histogram1)))
|
2021-10-26 16:06:22 +00:00
|
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
|
def contains_invalid_local_links(content: str) -> bool:
    """Returns true if the given content has invalid links
    These are links which include '?name=' for any of the names
    within INVALID_CONTENT_STRINGS
    """
    return any('?' + inv_str + '=' in content
               for inv_str in INVALID_CONTENT_STRINGS)
|
2022-03-24 13:14:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def bold_reading_string(text: str) -> str:
    """Returns bold reading formatted text

    The first half (rounded up) of each word is put inside <b> tags.
    Single characters, markup, words containing '&' or '=' and words
    beginning with ':' are left as they are. A double quote at the
    beginning or end of a word is kept outside of the bold markup.
    If the text contains <p> markup then each non-empty paragraph of
    the result is wrapped in <p></p>, otherwise paragraphs are
    separated by newlines.
    """
    text = html.unescape(text)
    add_paragraph_markup = False
    if '<p>' in text:
        text = text.replace('</p>', '\n').replace('<p>', '')
        add_paragraph_markup = True
    paragraphs = text.split('\n')
    parag_ctr = 0
    new_text = ''
    for parag in paragraphs:
        new_words = []
        reading_markup = False
        for wrd in parag.split(' '):
            # track whether we are inside of a tag which spans words
            if '<' in wrd:
                reading_markup = True
            if reading_markup and '>' in wrd:
                reading_markup = False

            is_plain_word = \
                not reading_markup and len(wrd) > 1 and \
                '<' not in wrd and '>' not in wrd and \
                '&' not in wrd and '=' not in wrd and \
                not wrd.startswith(':')
            if not is_plain_word:
                new_words.append(wrd)
                continue

            prefix = ''
            postfix = ''
            if wrd.startswith('"'):
                prefix = '"'
                wrd = wrd[1:]
            if wrd.endswith('"'):
                postfix = '"'
                # slice relative to the current word, because a leading
                # quote may already have been removed
                wrd = wrd[:-1]
            if not wrd:
                # there was nothing other than quotes
                new_words.append(prefix + postfix)
                continue

            # the quotes are not counted as part of the word
            initial_chars = int(math.ceil(len(wrd) / 2.0))
            new_words.append(prefix + '<b>' + wrd[:initial_chars] +
                             '</b>' + wrd[initial_chars:] + postfix)
        parag_ctr += 1
        new_parag = ' '.join(new_words).strip()
        if not new_parag:
            continue
        if add_paragraph_markup:
            new_text += '<p>' + new_parag + '</p>'
        elif parag_ctr < len(paragraphs):
            new_text += new_parag + '\n'
        else:
            new_text += new_parag

    return new_text
|
2022-03-29 19:34:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def import_emoji(base_dir: str, import_filename: str, session) -> None:
    """Imports emoji from the given filename
    Each line should be [emoji url], :emojiname:

    An emoji is skipped if its name is already within the default emoji
    dictionary or if its image file already exists. Lines which don't
    have the expected format are ignored. The dictionary is saved and
    the number of emoji added is printed.
    """
    if not os.path.isfile(import_filename):
        return
    emoji_dict_filename = base_dir + '/emoji/default_emoji.json'
    emoji_dict = load_json(emoji_dict_filename, 0, 1)
    if emoji_dict is None:
        # NOTE(review): assumes load_json returns None on failure -
        # confirm. Continuing with an empty dictionary would overwrite
        # the existing emoji when saving
        print('EX: import_emoji unable to load ' + emoji_dict_filename)
        return
    try:
        with open(import_filename, "r", encoding='utf-8') as fp_emoji:
            lines = fp_emoji.readlines()
    except OSError:
        print('EX: import_emoji unable to read ' + import_filename)
        return
    added = 0
    for line in lines:
        sections = line.split(', ')
        if len(sections) < 2:
            # blank line or no separator
            continue
        url = sections[0]
        tag_sections = sections[1].strip().split(':')
        if len(tag_sections) < 2:
            # the name is not of the form :emojiname:
            continue
        tag = tag_sections[1]
        if not tag or '/' in tag:
            # the name is used to make a filename
            continue
        if emoji_dict.get(tag):
            continue
        emoji_image_filename = base_dir + '/emoji/' + tag + '.png'
        if os.path.isfile(emoji_image_filename):
            continue
        if download_image(session, url,
                          emoji_image_filename, True, False):
            emoji_dict[tag] = tag
            added += 1
    save_json(emoji_dict, emoji_dict_filename)
    print(str(added) + ' custom emoji added')
|
2022-04-10 19:19:40 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def content_diff(content: str, prev_content: str) -> str:
    """Returns html showing the difference between content and
    prev_content

    Both are broken into sentences, by line and then by full stop.
    Sentences only within content are shown as additions and those
    only within prev_content as removals. An empty string is returned
    if there are no differences.
    """
    def _sentences(text: str) -> []:
        """Returns the stripped sentences within the given text
        """
        return [part.strip()
                for text_line in text.splitlines()
                for part in text_line.split('.')]

    # opening markup for each type of diff line
    markup = {
        '- ': '<label class="diff_add">+ ',
        '+ ': '<label class="diff_remove">- '
    }
    changes = difflib.Differ().compare(_sentences(content),
                                       _sentences(prev_content))
    entries = []
    for line in changes:
        opening = markup.get(line[:2])
        if opening:
            entries.append(opening + line[2:] + '</label>')
    if not entries:
        return ''
    return '<p>' + '<br>'.join(entries) + '</p>'
|
2022-04-10 22:50:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_edits_html(edits_json: {}, post_json_object: {},
                      translate: {}, timezone: str,
                      system_language: str) -> str:
    """ Creates html showing historical edits made to a post

    edits_json maps modified dates to previous versions of the post.
    Working back from the most recent, each version which differs from
    the one after it gets a dated diff. The diffs are returned inside
    a <details> section, or an empty string if there is nothing to show.
    """
    def _text_for_language(obj: {}) -> str:
        """Content for the system language, otherwise the content field
        """
        text = None
        if obj.get('contentMap'):
            if obj['contentMap'].get(system_language):
                text = obj['contentMap'][system_language]
        if not text and obj.get('content'):
            text = obj['content']
        return text

    if not edits_json:
        return ''
    if not has_object_dict(post_json_object):
        return ''
    post_obj = post_json_object['object']
    if not post_obj.get('content') and not post_obj.get('contentMap'):
        return ''

    # most recent edit first
    edit_dates_list = sorted(edits_json, reverse=True)

    content = _text_for_language(post_obj)
    if not content:
        return ''
    content = remove_html(content)

    edits_str = ''
    for modified in edit_dates_list:
        prev_json = edits_json[modified]
        if not has_object_dict(prev_json):
            continue
        prev_content = _text_for_language(prev_json['object'])
        if not prev_content:
            continue
        prev_content = remove_html(prev_content)
        if content == prev_content:
            continue
        diff = content_diff(content, prev_content)
        if not diff:
            continue
        diff = diff.replace('\n', '</p><p>')
        # convert to local time
        local_time = \
            convert_published_to_local_timezone(parse(modified), timezone)
        modified_str = local_time.strftime("%a %b %d, %H:%M")
        edits_str += '<p><b>' + modified_str + '</b></p>' + diff
        # the next diff is relative to this earlier version
        content = prev_content

    if not edits_str:
        return ''
    return '<details><summary class="cw" tabindex="10">' + \
        translate['SHOW EDITS'] + '</summary>' + \
        edits_str + '</details>'
|
2022-05-26 09:08:02 +00:00
|
|
|
|
|
|
|
|
|
|
2022-05-26 12:17:56 +00:00
|
|
|
|
def remove_script(content: str, log_filename: str,
                  actor: str, url: str) -> str:
    """Removes <script> from some content

    Both the literal and the html escaped forms of the tag are handled.
    A script is removed if it has a closing script tag or otherwise a
    self closing '/>' ending. If log_filename and actor are given then
    each script found is appended to the log file, together with the
    actor and url.
    Returns the content without the scripts.
    """
    # NOTE(review): matching is case sensitive, so an upper case
    # SCRIPT tag would not be detected - confirm whether callers
    # convert to lower case beforehand
    separators = [['<', '>'], ['&lt;', '&gt;']]
    for sep in separators:
        prefix = sep[0] + 'script'
        ending = '/script' + sep[1]
        short_ending = '/' + sep[1]
        if prefix not in content:
            continue
        # the first section precedes any script, so it is skipped
        for text in content.split(prefix)[1:]:
            if ending in text:
                text = prefix + text.split(ending)[0] + ending
            elif short_ending in text:
                text = prefix + text.split(short_ending)[0] + short_ending
            else:
                # unterminated, so the extent of the script is unknown
                continue
            if log_filename and actor:
                # write the detected script to a log file
                log_str = actor + ' ' + url + ' ' + text + '\n'
                try:
                    # append mode creates the file if it doesn't exist
                    # and keeps any earlier entries if it does
                    with open(log_filename, 'a',
                              encoding='utf-8') as fp_log:
                        fp_log.write(log_str)
                except OSError:
                    print('EX: cannot append to svg script log')
            content = content.replace(text, '')
    return content
|
2022-07-27 09:13:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def reject_twitter_summary(base_dir: str, nickname: str, domain: str,
                           summary: str) -> bool:
    """Should a post be rejected because its summary mentions twitter?
    This only applies if the account has a .removeTwitter file
    """
    if not summary:
        return False
    flag_filename = acct_dir(base_dir, nickname, domain) + '/.removeTwitter'
    if not os.path.isfile(flag_filename):
        return False
    lowered = summary.lower()
    return any(term in lowered for term in ('twitter', 'birdsite'))
|