__filename__ = "content.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.6.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"

import difflib
import math
import html
import os
import email.parser
import urllib.parse
from shutil import copyfile
from dateutil.parser import parse
from flags import is_pgp_encrypted
from flags import contains_pgp_public_key
from flags import is_float
from flags import is_right_to_left_text
from utils import replace_strings
from utils import data_dir
from utils import remove_link_tracking
from utils import string_contains
from utils import string_ends_with
from utils import is_account_dir
from utils import get_url_from_post
from utils import language_right_to_left
from utils import binary_is_image
from utils import get_content_from_post
from utils import get_full_domain
from utils import get_user_paths
from utils import convert_published_to_local_timezone
from utils import has_object_dict
from utils import valid_hash_tag
from utils import dangerous_svg
from utils import remove_domain_port
from utils import get_image_extensions
from utils import load_json
from utils import save_json
from utils import file_last_modified
from utils import get_link_prefixes
from utils import dangerous_markup
from utils import acct_dir
from utils import get_currencies
from utils import remove_html
from utils import remove_eol
from petnames import get_pet_name
from session import download_image

MUSIC_SITES = ('soundcloud.com', 'bandcamp.com', 'resonate.coop')

MAX_LINK_LENGTH = 40

REMOVE_MARKUP = (
    'b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
    'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5'
)

INVALID_CONTENT_STRINGS = (
    'mute', 'unmute', 'editeventpost', 'notifypost',
    'delete', 'options', 'page', 'repeat',
    'bm', 'tl', 'actor', 'unrepeat', 'eventid',
    'unannounce', 'like', 'unlike', 'bookmark',
    'unbookmark', 'likedBy', 'time',
    'year', 'month', 'day', 'editnewpost',
    'graph', 'showshare', 'category', 'showwanted',
    'rmshare', 'rmwanted', 'repeatprivate',
    'unrepeatprivate', 'replyto',
    'replyfollowers', 'replydm', 'replychat', 'editblogpost',
    'handle', 'blockdomain'
)

def valid_url_lengths(content: str, max_url_length: int) -> bool:
    """Returns true if the urls within the given content are all
    within the maximum length
    """
    if '://' not in content:
        return True
    sections = content.split('://')
    ctr = 0
    for text in sections:
        if ctr == 0:
            ctr += 1
            continue
        if '"' not in text:
            continue
        url = text.split('"')[0]
        if '<' not in url and '>' not in url:
            if len(url) > max_url_length:
                return False
    return True
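
# Example of the check above (illustrative values, not part of the module):
# a quoted url longer than max_url_length makes the whole content invalid.
#   long_html = '<a href="https://example.com/' + ('a' * 500) + '">link</a>'
#   valid_url_lengths(long_html, 100)                  # -> False
#   valid_url_lengths('plain text with no urls', 100)  # -> True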
					
						
def remove_html_tag(html_str: str, tag: str) -> str:
    """Removes a given attribute, such as width="...", from a html string
    """
    tag_found = True
    while tag_found:
        match_str = ' ' + tag + '="'
        if match_str not in html_str:
            tag_found = False
            break
        sections = html_str.split(match_str, 1)
        if '"' not in sections[1]:
            tag_found = False
            break
        html_str = sections[0] + sections[1].split('"', 1)[1]
    return html_str
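
# A small sketch of remove_html_tag() (assumed example markup):
#   remove_html_tag('<img width="100" src="pic.png">', 'width')
#   returns '<img src="pic.png">', stripping the width="..." attribute.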
					
						
def _remove_quotes_within_quotes(content: str) -> str:
    """Removes any blockquote inside blockquote
    """
    if '<blockquote>' not in content:
        return content
    if '</blockquote>' not in content:
        return content
    ctr = 1
    found = True
    while found:
        prefix = content.split('<blockquote>', ctr)[0] + '<blockquote>'
        quoted_str = content.split('<blockquote>', ctr)[1]
        if '</blockquote>' not in quoted_str:
            found = False
            continue

        end_str = quoted_str.split('</blockquote>')[1]
        quoted_str = quoted_str.split('</blockquote>')[0]
        if '<blockquote>' not in end_str:
            found = False
        if '<blockquote>' in quoted_str:
            quoted_str = quoted_str.replace('<blockquote>', '')
            content = prefix + quoted_str + '</blockquote>' + end_str
        ctr += 1
    return content
					
						
def html_replace_email_quote(content: str) -> str:
    """Replaces an email style quote "> Some quote" with html blockquote
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    # replace quote paragraph
    if '<p>&quot;' in content:
        if '&quot;</p>' in content:
            if content.count('<p>&quot;') == content.count('&quot;</p>'):
                replacements = {
                    '<p>&quot;': '<p><blockquote>',
                    '&quot;</p>': '</blockquote></p>'
                }
                content = replace_strings(content, replacements)
    if '>\u201c' in content:
        if '\u201d<' in content:
            if content.count('>\u201c') == content.count('\u201d<'):
                replacements = {
                    '>\u201c': '><blockquote>',
                    '\u201d<': '</blockquote><'
                }
                content = replace_strings(content, replacements)
    # replace email style quote
    if '&gt;&gt; ' not in content:
        return content
    content_str = content.replace('<p>', '')
    content_lines = content_str.split('</p>')
    new_content = ''
    for line_str in content_lines:
        if not line_str:
            continue
        if '&gt;&gt; ' not in line_str:
            if line_str.startswith('&gt; '):
                replacements = {
                    '&gt; ': '<blockquote>',
                    '&gt;': '<br>'
                }
                line_str = replace_strings(line_str, replacements)
                new_content += '<p>' + line_str + '</blockquote></p>'
            else:
                new_content += '<p>' + line_str + '</p>'
            continue

        line_str = line_str.replace('&gt;&gt; ', '&gt;<blockquote>')
        if line_str.startswith('&gt;'):
            line_str = line_str.replace('&gt;', '<blockquote>', 1)
        else:
            line_str = line_str.replace('&gt;', '<br>')
        new_content += '<p>' + line_str + '</blockquote></p>'
    return _remove_quotes_within_quotes(new_content)
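
# Sketch of the transformation above, assuming content arrives with '>'
# escaped as '&gt;' (as in federated HTML posts); values are illustrative:
#   html_replace_email_quote('<p>&gt;&gt; earlier</p><p>&gt; quoted reply</p>')
#   wraps the quoted lines in <blockquote> markup and then strips any
#   nested blockquotes via _remove_quotes_within_quotes().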
					
						
def html_replace_quote_marks(content: str) -> str:
    """Replaces quotes with html formatting
    "hello" becomes “hello”
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    if '"' not in content:
        if '&quot;' not in content:
            return content

    # only if there are a few quote marks
    if content.count('"') > 4:
        return content
    if content.count('&quot;') > 4:
        return content

    new_content = content
    if '"' in content:
        sections = content.split('"')
        if len(sections) > 1:
            new_content = ''
            open_quote = True
            markup = False
            for char in content:
                curr_char = char
                if char == '<':
                    markup = True
                elif char == '>':
                    markup = False
                elif char == '"' and not markup:
                    if open_quote:
                        curr_char = '“'
                    else:
                        curr_char = '”'
                    open_quote = not open_quote
                new_content += curr_char

    if '&quot;' in new_content:
        open_quote = True
        content = new_content
        new_content = ''
        ctr = 0
        sections = content.split('&quot;')
        no_of_sections = len(sections)
        for sec in sections:
            new_content += sec
            if ctr < no_of_sections - 1:
                if open_quote:
                    new_content += '“'
                else:
                    new_content += '”'
                open_quote = not open_quote
            ctr += 1
    return new_content
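
# Sketch of the quote substitution above (example string assumed):
#   html_replace_quote_marks('she said "hello" to me')
#   returns 'she said “hello” to me'; quote marks appearing inside
#   markup tags are left untouched.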
					
						
def dangerous_css(filename: str, allow_local_network_access: bool) -> bool:
    """Returns true if the css file contains code which
    can create security problems
    """
    if not os.path.isfile(filename):
        return False

    content = None
    try:
        with open(filename, 'r', encoding='utf-8') as fp_css:
            content = fp_css.read().lower()
    except OSError:
        print('EX: unable to read css file ' + filename)

    if not content:
        return False

    css_matches = (
        'behavior:', ':expression', '?php', '.php',
        'google', 'regexp', 'localhost',
        '127.0.', '192.168', '10.0.', '@import'
    )
    for cssmatch in css_matches:
        if cssmatch in content:
            return True

    # search for non-local web links
    if 'url(' in content:
        url_list = content.split('url(')
        ctr = 0
        for url_str in url_list:
            if ctr == 0:
                ctr = 1
                continue
            if ')' in url_str:
                url_str = url_str.split(')')[0]
                if string_contains(url_str, ('http', 'ipfs', 'ipns')):
                    print('ERROR: non-local web link in CSS ' +
                          filename)
                    return True
            ctr += 1

    # an attacker can include html inside of the css
    # file as a comment and this may then be run from the html
    if dangerous_markup(content, allow_local_network_access, []):
        return True
    return False
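
# Minimal usage sketch (hypothetical filename): the check returns True when
# the stylesheet contains expressions, php, @import, non-local url()
# references or embedded html.
#   if dangerous_css('theme.css', allow_local_network_access=False):
#       print('EX: unsafe css detected')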
					
						
def switch_words(base_dir: str, nickname: str, domain: str, content: str,
                 rules: list[str] = []) -> str:
    """Performs word replacements. eg. Trump -> The Orange Menace
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content

    if not rules:
        switch_words_filename = \
            acct_dir(base_dir, nickname, domain) + '/replacewords.txt'
        if not os.path.isfile(switch_words_filename):
            return content
        try:
            with open(switch_words_filename, 'r',
                      encoding='utf-8') as fp_words:
                rules = fp_words.readlines()
        except OSError:
            print('EX: unable to read switches ' + switch_words_filename)

    for line in rules:
        replace_str = remove_eol(line)
        splitters = ('->', ':', ',', ';', '-')
        word_transform = None
        for split_str in splitters:
            if split_str in replace_str:
                word_transform = replace_str.split(split_str)
                break
        if not word_transform:
            continue
        if len(word_transform) == 2:
            replace_str1 = word_transform[0].strip().replace('"', '')
            replace_str2 = word_transform[1].strip().replace('"', '')
            content = content.replace(replace_str1, replace_str2)
    return content
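
# Example rules as they might appear in replacewords.txt (illustrative):
#   OldWord -> NewWord
#   "some phrase" : "another phrase"
# Each rule is split on whichever of '->', ':', ',', ';' or '-' is found
# first in that order, and both sides are stripped of whitespace and
# double quote characters before the replacement is applied.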
					
						
def _save_custom_emoji(session, base_dir: str, emoji_name: str, url: str,
                       debug: bool) -> None:
    """Saves custom emoji to file
    """
    if not session:
        if debug:
            print('EX: _save_custom_emoji no session')
        return
    if '.' not in url:
        return
    ext = url.split('.')[-1]
    if ext != 'png':
        if debug:
            print('EX: Custom emoji is wrong format ' + url)
        return
    emoji_name = emoji_name.replace(':', '').strip().lower()
    custom_emoji_dir = base_dir + '/emojicustom'
    if not os.path.isdir(custom_emoji_dir):
        os.mkdir(custom_emoji_dir)
    emoji_image_filename = custom_emoji_dir + '/' + emoji_name + '.' + ext
    if not download_image(session, url,
                          emoji_image_filename, debug, False):
        if debug:
            print('EX: custom emoji not downloaded ' + url)
        return
    emoji_json_filename = custom_emoji_dir + '/emoji.json'
    emoji_json = {}
    if os.path.isfile(emoji_json_filename):
        emoji_json = load_json(emoji_json_filename)
        if not emoji_json:
            emoji_json = {}
    if not emoji_json.get(emoji_name):
        emoji_json[emoji_name] = emoji_name
        save_json(emoji_json, emoji_json_filename)
        if debug:
            print('EX: Saved custom emoji ' + emoji_json_filename)
    elif debug:
        print('EX: custom emoji already saved')


def _get_emoji_name_from_code(base_dir: str, emoji_code: str) -> str:
    """Returns the emoji name from its code
    """
    emojis_filename = base_dir + '/emoji/emoji.json'
    if not os.path.isfile(emojis_filename):
        emojis_filename = base_dir + '/emoji/default_emoji.json'
        if not os.path.isfile(emojis_filename):
            return None
    emojis_json = load_json(emojis_filename)
    if not emojis_json:
        return None
    for emoji_name, code in emojis_json.items():
        if code == emoji_code:
            return emoji_name
    return None
					
						
def _update_common_emoji(base_dir: str, emoji_content: str) -> None:
    """Updates the list of commonly used emoji
    """
    if '.' in emoji_content:
        emoji_content = emoji_content.split('.')[0]
    emoji_content = emoji_content.replace(':', '')
    if emoji_content.startswith('0x'):
        # lookup the name for an emoji code
        emoji_code = emoji_content[2:]
        emoji_content = _get_emoji_name_from_code(base_dir, emoji_code)
        if not emoji_content:
            return
    common_emoji_filename = data_dir(base_dir) + '/common_emoji.txt'
    common_emoji = None
    if os.path.isfile(common_emoji_filename):
        try:
            with open(common_emoji_filename, 'r',
                      encoding='utf-8') as fp_emoji:
                common_emoji = fp_emoji.readlines()
        except OSError:
            print('EX: unable to load common emoji file')
    if common_emoji:
        new_common_emoji: list[str] = []
        emoji_found = False
        for line in common_emoji:
            if ' ' + emoji_content in line:
                if not emoji_found:
                    emoji_found = True
                    counter = 1
                    count_str = line.split(' ')[0]
                    if count_str.isdigit():
                        counter = int(count_str) + 1
                    count_str = str(counter).zfill(16)
                    line = count_str + ' ' + emoji_content
                    new_common_emoji.append(line)
            else:
                line1 = remove_eol(line)
                new_common_emoji.append(line1)
        if not emoji_found:
            new_common_emoji.append(str(1).zfill(16) + ' ' + emoji_content)
        new_common_emoji.sort(reverse=True)
        try:
            with open(common_emoji_filename, 'w+',
                      encoding='utf-8') as fp_emoji:
                for line in new_common_emoji:
                    fp_emoji.write(line + '\n')
        except OSError:
            print('EX: error writing common emoji 1')
            return
    else:
        line = str(1).zfill(16) + ' ' + emoji_content + '\n'
        try:
            with open(common_emoji_filename, 'w+',
                      encoding='utf-8') as fp_emoji:
                fp_emoji.write(line)
        except OSError:
            print('EX: error writing common emoji 2')
            return
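
# Each line of common_emoji.txt pairs a zero-padded usage count with an
# emoji name, so that the reverse sort above ranks the most frequently
# used emoji first. Illustrative contents:
#   0000000000000012 smile
#   0000000000000003 rocket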
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |  | def replace_emoji_from_tags(session, base_dir: str, | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |                             content: str, tag: [], message_type: str, | 
					
						
							| 
									
										
										
										
											2022-04-21 13:03:40 +00:00
										 |  |  |  |                             debug: bool, screen_readable: bool) -> str: | 
					
						
							| 
									
										
										
										
											2019-09-29 16:28:02 +00:00
										 |  |  |  |     """Uses the tags to replace :emoji: with html image markup
 | 
					
						
							|  |  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |     for tag_item in tag: | 
					
						
							| 
									
										
										
										
											2023-09-19 12:34:30 +00:00
										 |  |  |  |         if not isinstance(tag_item, dict): | 
					
						
							|  |  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |         if not tag_item.get('type'): | 
					
						
							| 
									
										
										
										
											2019-09-29 17:20:10 +00:00
										 |  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |         if tag_item['type'] != 'Emoji': | 
					
						
							| 
									
										
										
										
											2019-09-29 17:20:10 +00:00
										 |  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |         if not tag_item.get('name'): | 
					
						
							| 
									
										
										
										
											2019-09-29 16:28:02 +00:00
										 |  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |         if not tag_item.get('icon'): | 
					
						
							| 
									
										
										
										
											2019-09-29 16:28:02 +00:00
										 |  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |         if not tag_item['icon'].get('url'): | 
					
						
							| 
									
										
										
										
											2019-09-29 16:28:02 +00:00
										 |  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2023-12-09 14:18:24 +00:00
										 |  |  |  |         url_str = get_url_from_post(tag_item['icon']['url']) | 
					
						
							|  |  |  |  |         if '/' not in url_str: | 
					
						
							| 
									
										
										
										
											2020-02-21 15:09:31 +00:00
										 |  |  |  |             continue | 
					
						
							| 
									
										
										
										
        if tag_item['name'] not in content:
            continue
        tag_url = remove_html(url_str)
        if not tag_url:
            continue
        icon_name = tag_url.split('/')[-1]
        if len(icon_name) <= 1:
            continue
        if '.' not in icon_name:
            continue
        icon_name = icon_name.split('.')[0]
        # see https://unicode.org/
        # emoji/charts/full-emoji-list.html
        if '-' not in icon_name:
            # a single code
            replaced = False
            try:
                replace_char = chr(int("0x" + icon_name, 16))
                if not screen_readable:
                    replace_char = \
                        '<span aria-hidden="true">' + \
                        replace_char + '</span>'
                content = \
                    content.replace(tag_item['name'],
                                    replace_char)
                replaced = True
            except BaseException:
                if debug:
                    print('EX: replace_emoji_from_tags 1 ' +
                          'no conversion of ' +
                          str(icon_name) + ' to chr ' +
                          tag_item['name'] + ' ' +
                          tag_url)
            if not replaced:
                _save_custom_emoji(session, base_dir,
                                   tag_item['name'],
                                   tag_url, debug)
                _update_common_emoji(base_dir, icon_name)
            else:
                _update_common_emoji(base_dir,
                                     "0x" + icon_name)
        else:
            # sequence of codes
            icon_codes = icon_name.split('-')
            icon_code_sequence = ''
            for icode in icon_codes:
                replaced = False
                try:
                    icon_code_sequence += chr(int("0x" +
                                                  icode, 16))
                    replaced = True
                except BaseException:
                    icon_code_sequence = ''
                    if debug:
                        print('EX: ' +
                              'replace_emoji_from_tags 2 ' +
                              'no conversion of ' +
                              str(icode) + ' to chr ' +
                              tag_item['name'] + ' ' +
                              tag_url)
                if not replaced:
                    _save_custom_emoji(session, base_dir,
                                       tag_item['name'],
                                       tag_url, debug)
                    _update_common_emoji(base_dir,
                                         icon_name)
                else:
                    _update_common_emoji(base_dir,
                                         "0x" + icon_name)
            if icon_code_sequence:
                if not screen_readable:
                    icon_code_sequence = \
                        '<span aria-hidden="true">' + \
                        icon_code_sequence + '</span>'
                content = content.replace(tag_item['name'],
                                          icon_code_sequence)

        html_class = 'emoji'
        if message_type == 'post header':
            html_class = 'emojiheader'
        if message_type == 'profile':
            html_class = 'emojiprofile'
        if screen_readable:
            emoji_tag_name = tag_item['name'].replace(':', '')
        else:
            emoji_tag_name = ''
        url_str = get_url_from_post(tag_item['icon']['url'])
        tag_url = remove_html(url_str)
        emoji_html = "<img src=\"" + tag_url + "\" alt=\"" + \
            emoji_tag_name + \
            "\" align=\"middle\" class=\"" + html_class + "\"/>"
        content = content.replace(tag_item['name'], emoji_html)
    return content


def _add_music_tag(content: str, tag: str) -> str:
    """If a music link is found then ensure that the post is
    tagged appropriately
    """
    if '#podcast' in content or '#documentary' in content:
        return content
    if '#' not in tag:
        tag = '#' + tag
    if tag in content:
        return content
    music_site_found = False
    for site in MUSIC_SITES:
        if site + '/' in content:
            music_site_found = True
            break
    if not music_site_found:
        return content
    return ':music: ' + content + ' ' + tag + ' '


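# Illustrative example for _add_music_tag (not part of the original source).
# Assuming MUSIC_SITES contains 'bandcamp.com', a post linking to a track
# gains a music emoji shortcode and the supplied hashtag:
#   _add_music_tag('new album https://bandcamp.com/band/album', 'music')
#   -> ':music: new album https://bandcamp.com/band/album #music '

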
def _shorten_linked_urls(content: str) -> str:
    """If content comes with a web link included then make sure
    that it is short enough
    """
    if 'href=' not in content:
        return content
    if '>' not in content:
        return content
    if '<' not in content:
        return content
    sections = content.split('>')
    ctr = 0
    for section_text in sections:
        if ctr == 0:
            ctr += 1
            continue
        if '<' not in section_text:
            ctr += 1
            continue
        section_text = section_text.split('<')[0]
        if ' ' in section_text:
            continue
        if len(section_text) > MAX_LINK_LENGTH:
            content = content.replace('>' + section_text + '<',
                                      '>' +
                                      section_text[:MAX_LINK_LENGTH-1] + '<')
        ctr += 1
    return content


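# Illustrative example for _shorten_linked_urls (not part of the original
# source). For a linked url whose visible text exceeds MAX_LINK_LENGTH
# characters, only the text between > and < is truncated; the href itself
# is left intact. e.g. '<a href="...">very-long-visible-url</a>' keeps its
# href but the visible text is cut to MAX_LINK_LENGTH-1 characters.

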
def _contains_doi_reference(wrd: str, replace_dict: {}) -> bool:
    """Handle DOI scientific references
    """
    if not wrd.startswith('doi:') and \
       not wrd.startswith('DOI:'):
        return False

    doi_ref_str = wrd.split(':', 1)[1]
    doi_site = 'https://sci-hub.ru'
    markup = '<a href="' + doi_site + '/' + \
        doi_ref_str + '" tabindex="10" ' + \
        'rel="nofollow noopener noreferrer" ' + \
        'target="_blank">' + \
        '<span class="ellipsis">doi:' + doi_ref_str + \
        '</span></a>'
    replace_dict[wrd] = markup
    return True


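# Illustrative example for _contains_doi_reference (not part of the original
# source):
#   replace_dict = {}
#   _contains_doi_reference('doi:10.1234/example', replace_dict)  # -> True
#   replace_dict['doi:10.1234/example'] now holds an anchor linking to
#   https://sci-hub.ru/10.1234/example with the visible text
#   'doi:10.1234/example'.

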
def _contains_arxiv_reference(wrd: str, replace_dict: {}) -> bool:
    """Handle arxiv scientific references
    """
    if not wrd.startswith('arXiv:') and \
       not wrd.startswith('arx:') and \
       not wrd.startswith('arxiv:'):
        return False

    arxiv_ref_str = wrd.split(':', 1)[1].lower()
    if '.' in arxiv_ref_str:
        arxiv_ref = arxiv_ref_str.split('.')
    elif ':' in arxiv_ref_str:
        arxiv_ref = arxiv_ref_str.split(':')
    else:
        return False
    if len(arxiv_ref) != 2:
        return False
    if not arxiv_ref[0].isdigit():
        return False
    arxiv_day = arxiv_ref[1]
    if 'v' in arxiv_day:
        arxiv_day = arxiv_day.split('v')[0]
    if not arxiv_day.isdigit():
        return False
    ref_str = arxiv_ref[0] + '.' + arxiv_ref[1]
    markup = '<a href="https://arxiv.org/abs/' + \
        ref_str + '" tabindex="10" ' + \
        'rel="nofollow noopener noreferrer" ' + \
        'target="_blank">' + \
        '<span class="ellipsis">arXiv:' + ref_str + \
        '</span></a>'
    replace_dict[wrd] = markup
    return True


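# Illustrative example for _contains_arxiv_reference (not part of the
# original source):
#   replace_dict = {}
#   _contains_arxiv_reference('arXiv:2001.12345v2', replace_dict)  # -> True
#   replace_dict['arXiv:2001.12345v2'] now holds an anchor linking to
#   https://arxiv.org/abs/2001.12345v2 with the visible text
#   'arXiv:2001.12345v2'.

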
def _contains_academic_references(content: str) -> bool:
    """Does the given content contain academic references
    """
    prefixes = (
        'arXiv:', 'arx:', 'arxiv:',
        'doi:', 'DOI:'
    )
    for reference in prefixes:
        if reference in content:
            return True
    return False


def remove_link_trackers_from_content(content: str) -> str:
    """ Removes any link trackers from urls within the content
    """
    if '?utm_' not in content:
        return content
    sections = content.split('?utm_')
    ctr = 0
    new_content = ''
    for section_str in sections:
        if ctr == 0:
            new_content = section_str
            ctr = 1
            continue
        if '"' in section_str:
            new_content += '"' + section_str.split('"', 1)[1]
        else:
            new_content += section_str
        ctr += 1
    return new_content


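# Illustrative example for remove_link_trackers_from_content (not part of
# the original source). Tracking parameters are stripped from urls which
# appear inside quoted href attributes:
#   remove_link_trackers_from_content(
#       '<a href="https://example.com/page?utm_source=feed">link</a>')
#   -> '<a href="https://example.com/page">link</a>'

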
def add_web_links(content: str) -> str:
    """Adds markup for web links
    """
    content = _shorten_linked_urls(content)

    if ':' not in content:
        return content

    prefixes = get_link_prefixes()

    # do any of these prefixes exist within the content?
    prefix_found = False
    for prefix in prefixes:
        if prefix in content:
            prefix_found = True
            break

    # if there are no prefixes then just keep the content we have
    if not prefix_found:
        if _contains_academic_references(content):
            prefix_found = True
        else:
            return content

    content = content.replace('\r', '')
    words = content.replace('\n', ' --linebreak-- ').split(' ')
    replace_dict = {}
    for wrd in words:
        if ':' not in wrd:
            continue
        if _contains_arxiv_reference(wrd, replace_dict):
            continue
        if _contains_doi_reference(wrd, replace_dict):
            continue
        # does the word begin with a link prefix?
        prefix_found = False
        for prefix in prefixes:
            if wrd.startswith(prefix):
                prefix_found = True
                break
        if not prefix_found:
            continue
        # the word contains a link prefix
        url = wrd
        if url.endswith('.') or wrd.endswith(';'):
            url = url[:-1]
        url = remove_link_tracking(url)
        markup = '<a href="' + url + '" tabindex="10" ' + \
            'rel="nofollow noopener noreferrer" target="_blank">'
        for prefix in prefixes:
            if url.startswith(prefix):
                markup += '<span class="invisible">' + prefix + '</span>'
                break
        link_text = url
        for prefix in prefixes:
            link_text = link_text.replace(prefix, '')
        # prevent links from becoming too long
        if len(link_text) > MAX_LINK_LENGTH:
            markup += '<span class="ellipsis">' + \
                link_text[:MAX_LINK_LENGTH] + '</span>'
            markup += '<span class="invisible">' + \
                link_text[MAX_LINK_LENGTH:] + '</span></a>'
        else:
            markup += '<span class="ellipsis">' + link_text + '</span></a>'
        replace_dict[url] = markup

    # do the replacements
    for url, markup in replace_dict.items():
        content = content.replace(url, markup)

    # replace any line breaks
    content = content.replace(' --linebreak-- ', '<br>')

    return content


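# Illustrative example for add_web_links (not part of the original source).
# Assuming get_link_prefixes() includes 'https://', a bare url such as
# 'https://example.com/notes' within the post text becomes an anchor tag:
# the 'https://' prefix is wrapped in an invisible span, the remainder in
# an ellipsis span, and any link text beyond MAX_LINK_LENGTH characters is
# hidden rather than removed.

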
def safe_web_text(arbitrary_html: str) -> str:
    """Turns arbitrary html into something safe.
    So if the arbitrary html contains attack scripts those will be removed
    """
    # first remove the markup, so that we have something safe
    safe_text = remove_html(arbitrary_html)
    if not safe_text:
        return ''
    # remove any spurious characters found in podcast descriptions
    remove_chars = ('Œ', 'â€', 'ğŸ', '�', ']]', '__')
    for remchar in remove_chars:
        safe_text = safe_text.replace(remchar, '')
    # recreate any url links safely
    return add_web_links(safe_text)


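# Illustrative example for safe_web_text (not part of the original source):
#   safe_web_text('<p onload="evil()">Visit https://example.com</p>')
#   removes the html markup (and with it any attack scripts) and then
#   re-adds safe anchor markup for the https://example.com link.

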
def _add_hash_tags(word_str: str, http_prefix: str, domain: str,
                   replace_hashtags: {}, post_hashtags: {}) -> bool:
    """Detects hashtags and adds them to the replacements dict
    Also updates the hashtags list to be added to the post
    """
    if replace_hashtags.get(word_str):
        return True
    hashtag = word_str[1:]
    if not valid_hash_tag(hashtag):
        return False
    hashtag_url = http_prefix + "://" + domain + "/tags/" + hashtag
    post_hashtags[hashtag] = {
        'href': hashtag_url,
        'name': '#' + hashtag,
        'type': 'Hashtag'
    }
    replace_hashtags[word_str] = "<a href=\"" + hashtag_url + \
        "\" class=\"mention hashtag\" rel=\"tag\" tabindex=\"10\">" + \
        "<span aria-hidden=\"true\">#</span><span>" + \
        hashtag + "</span></a>"
    return True


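# Illustrative example for _add_hash_tags (not part of the original source):
#   replace_hashtags, post_hashtags = {}, {}
#   _add_hash_tags('#fediverse', 'https', 'example.org',
#                  replace_hashtags, post_hashtags)  # -> True
#   post_hashtags['fediverse'] references https://example.org/tags/fediverse
#   and replace_hashtags['#fediverse'] holds the corresponding anchor markup.

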
def replace_remote_hashtags(content: str,
                            nickname: str, domain: str) -> str:
    """Replaces remote hashtags with a local version
    """
    if not domain:
        return content

    if ' href="' not in content:
        return content

    sections = content.split(' href="')
    ctr = 0
    replacements = {}
    for section in sections:
        if ctr == 0:
            ctr += 1
            continue
        if '"' not in section:
            ctr += 1
            continue
        link = section.split('"')[0]
        if '://' not in link:
            continue
        if '?remotetag=' in link:
            ctr += 1
            continue
        if '/tags/' not in link:
            ctr += 1
            continue
        if '/' + domain not in link:
            new_link = '/users/' + nickname + \
                '?remotetag=' + link.replace('/', '--')
            replacements[link] = new_link
        ctr += 1
    if not replacements:
        return content
    for old_link, new_link in replacements.items():
        content = content.replace('"' + old_link + '"',
                                  '"' + new_link + '"')
    return content


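# Illustrative example for replace_remote_hashtags (not part of the original
# source). A hashtag link pointing at another instance is rewritten so that
# it opens via the local account instead:
#   replace_remote_hashtags(
#       '<a href="https://other.social/tags/cats">#cats</a>',
#       'alice', 'example.org')
#   -> '<a href="/users/alice?remotetag=https:----other.social--tags--cats">#cats</a>'

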
def _add_emoji(base_dir: str, word_str: str,
               http_prefix: str, domain: str,
               replace_emoji: {}, post_tags: {},
               emoji_dict: {}) -> bool:
    """Detects Emoji and adds them to the replacements dict
    Also updates the tags list to be added to the post
    """
    if not word_str.startswith(':'):
        return False
    if not word_str.endswith(':'):
        return False
    if len(word_str) < 3:
        return False
    if replace_emoji.get(word_str):
        return True
    # remove leading and trailing : characters
    emoji = word_str[1:]
    emoji = emoji[:-1]
    # is the text of the emoji valid?
    if not valid_hash_tag(emoji):
        return False
    if not emoji_dict.get(emoji):
        return False
    emoji_filename = base_dir + '/emoji/' + emoji_dict[emoji] + '.png'
    if not os.path.isfile(emoji_filename):
        emoji_filename = \
            base_dir + '/emojicustom/' + emoji_dict[emoji] + '.png'
        if not os.path.isfile(emoji_filename):
            return False
    emoji_url = http_prefix + "://" + domain + \
        "/emoji/" + emoji_dict[emoji] + '.png'
    post_tags[emoji] = {
        'icon': {
            'mediaType': 'image/png',
            'type': 'Image',
            'url': emoji_url
        },
        'name': ':' + emoji + ':',
        "updated": file_last_modified(emoji_filename),
        "id": emoji_url.replace('.png', ''),
        'type': 'Emoji'
    }
    return True


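# Illustrative example for _add_emoji (not part of the original source).
# Assuming emoji_dict maps 'smile' to an image filename which exists under
# base_dir + '/emoji/', a shortcode in a post is registered as an Emoji tag:
#   post_tags, replace_emoji = {}, {}
#   _add_emoji(base_dir, ':smile:', 'https', 'example.org',
#              replace_emoji, post_tags, emoji_dict)  # -> True
#   post_tags['smile'] then contains the icon url and an 'Emoji' type tag.

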
def post_tag_exists(tag_type: str, tag_name: str, tags: {}) -> bool:
    """Returns true if a tag exists in the given dict
    """
    for tag in tags:
        if tag['name'] == tag_name and tag['type'] == tag_type:
            return True
    return False


def _mention_to_url(base_dir: str, http_prefix: str,
                    domain: str, nickname: str) -> str:
    """Convert https://somedomain/@somenick to
    https://somedomain/users/somenick
    This uses the hack of trying the cache directory to see if
    there is a matching actor
    """
    possible_paths = get_user_paths()
    cache_dir = base_dir + '/cache/actors'
    cache_path_start = cache_dir + '/' + http_prefix + ':##' + domain
    for users_path in possible_paths:
        users_path = users_path.replace('/', '#')
        possible_cache_entry = \
            cache_path_start + users_path + nickname + '.json'
        if os.path.isfile(possible_cache_entry):
            return http_prefix + '://' + \
                domain + users_path.replace('#', '/') + nickname
    possible_cache_entry = \
        cache_path_start + '#' + nickname + '.json'
    if os.path.isfile(possible_cache_entry):
        return http_prefix + '://' + domain + '/' + nickname
    return http_prefix + '://' + domain + '/users/' + nickname


							|  |  |  |  | def _add_mention(base_dir: str, word_str: str, http_prefix: str, | 
					
						
							|  |  |  |  |                  following: [], petnames: [], replace_mentions: {}, | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |                  recipients: [], tags: {}) -> bool: | 
					
						
							| 
									
										
										
										
											2020-03-29 09:59:54 +00:00
										 |  |  |  |     """Detects mentions and adds them to the replacements dict and
 | 
					
						
							|  |  |  |  |     recipients list | 
					
						
							| 
									
										
										
										
											2019-08-09 09:09:21 +00:00
										 |  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |     possible_handle = word_str[1:] | 
					
						
							| 
									
										
										
										
											2019-08-19 10:05:50 +00:00
										 |  |  |  |     # @nick | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |     if following and '@' not in possible_handle: | 
					
						
							| 
									
										
										
										
											2019-08-09 09:48:51 +00:00
										 |  |  |  |         # fall back to a best effort match against the following list | 
					
						
							|  |  |  |  |         # if no domain was specified. eg. @nick | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |         possible_nickname = possible_handle | 
					
						
							| 
									
										
										
										
											2019-08-09 09:48:51 +00:00
										 |  |  |  |         for follow in following: | 
					
						
							| 
									
										
										
										
											2021-01-29 21:33:23 +00:00
										 |  |  |  |             if '@' not in follow: | 
					
						
							|  |  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |             follow_nick = follow.split('@')[0] | 
					
						
							| 
									
										
										
										
											2024-08-30 12:25:23 +00:00
										 |  |  |  |             if possible_nickname != follow_nick: | 
					
						
							|  |  |  |  |                 continue | 
					
						
							|  |  |  |  |             follow_str = remove_eol(follow) | 
					
						
							|  |  |  |  |             replace_domain = follow_str.split('@')[1] | 
					
						
							|  |  |  |  |             recipient_actor = \ | 
					
						
							|  |  |  |  |                 _mention_to_url(base_dir, http_prefix, | 
					
						
							|  |  |  |  |                                 replace_domain, possible_nickname) | 
					
						
							|  |  |  |  |             if recipient_actor not in recipients: | 
					
						
							|  |  |  |  |                 recipients.append(recipient_actor) | 
					
						
							|  |  |  |  |             tags[word_str] = { | 
					
						
							|  |  |  |  |                 'href': recipient_actor, | 
					
						
							|  |  |  |  |                 'name': word_str, | 
					
						
							|  |  |  |  |                 'type': 'Mention' | 
					
						
							|  |  |  |  |             } | 
					
						
							|  |  |  |  |             replace_mentions[word_str] = \ | 
					
						
							|  |  |  |  |                 "<span class=\"h-card\"><a href=\"" + recipient_actor + \ | 
					
						
							|  |  |  |  |                 "\" tabindex=\"10\" class=\"u-url mention\">@<span>" + \ | 
					
						
							|  |  |  |  |                 possible_nickname + "</span></a></span>" | 
					
						
							|  |  |  |  |             return True | 
					
						
							|  |  |  |  |         # try replacing petnames with mentions | 
					
						
							|  |  |  |  |         follow_ctr = 0 | 
					
						
							|  |  |  |  |         for follow in following: | 
					
						
							|  |  |  |  |             if '@' not in follow: | 
					
						
							|  |  |  |  |                 follow_ctr += 1 | 
					
						
							|  |  |  |  |                 continue | 
					
						
							|  |  |  |  |             pet = remove_eol(petnames[follow_ctr]) | 
					
						
							|  |  |  |  |             if pet: | 
					
						
							|  |  |  |  |                 if possible_nickname != pet: | 
					
						
							|  |  |  |  |                     follow_ctr += 1 | 
					
						
							|  |  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2022-06-21 11:58:50 +00:00
										 |  |  |  |                 follow_str = remove_eol(follow) | 
					
						
							| 
									
										
										
										
											2024-08-30 12:25:23 +00:00
										 |  |  |  |                 replace_nickname = follow_str.split('@')[0] | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |                 replace_domain = follow_str.split('@')[1] | 
					
						
							| 
									
										
										
										
											2022-09-03 17:09:00 +00:00
										 |  |  |  |                 recipient_actor = \ | 
					
						
							|  |  |  |  |                     _mention_to_url(base_dir, http_prefix, | 
					
						
							| 
									
										
										
										
											2024-08-30 12:25:23 +00:00
										 |  |  |  |                                     replace_domain, replace_nickname) | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |                 if recipient_actor not in recipients: | 
					
						
							|  |  |  |  |                     recipients.append(recipient_actor) | 
					
						
							|  |  |  |  |                 tags[word_str] = { | 
					
						
							|  |  |  |  |                     'href': recipient_actor, | 
					
						
							|  |  |  |  |                     'name': word_str, | 
					
						
							| 
									
										
										
										
											2019-08-19 12:13:18 +00:00
										 |  |  |  |                     'type': 'Mention' | 
					
						
							|  |  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |                 replace_mentions[word_str] = \ | 
					
						
							| 
									
										
										
										
											2024-08-30 12:25:23 +00:00
										 |  |  |  |                     "<span class=\"h-card\"><a href=\"" + \ | 
					
						
							|  |  |  |  |                     recipient_actor + "\" tabindex=\"10\" " + \ | 
					
						
							|  |  |  |  |                     "class=\"u-url mention\">@<span>" + \ | 
					
						
							|  |  |  |  |                     replace_nickname + "</span></a></span>" | 
					
						
							| 
									
										
										
										
											2019-08-09 09:48:51 +00:00
										 |  |  |  |                 return True | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |             follow_ctr += 1 | 
					
						
							| 
									
										
										
										
											2019-08-09 09:48:51 +00:00
										 |  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |     possible_nickname = None | 
					
						
							|  |  |  |  |     possible_domain = None | 
					
						
							|  |  |  |  |     if '@' not in possible_handle: | 
					
						
							| 
									
										
										
										
											2019-10-29 20:15:21 +00:00
										 |  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |     possible_nickname = possible_handle.split('@')[0] | 
					
						
							|  |  |  |  |     if not possible_nickname: | 
					
						
							| 
									
										
										
										
											2019-10-29 20:15:21 +00:00
										 |  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |     possible_domain = \ | 
					
						
							|  |  |  |  |         possible_handle.split('@')[1].strip('\n').strip('\r') | 
					
						
							|  |  |  |  |     if not possible_domain: | 
					
						
							| 
									
										
										
										
											2019-10-29 20:15:21 +00:00
										 |  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2019-08-19 11:41:15 +00:00
										 |  |  |  |     if following: | 
					
						
							|  |  |  |  |         for follow in following: | 
					
						
							| 
									
										
										
										
											2022-06-21 11:58:50 +00:00
										 |  |  |  |             if remove_eol(follow) != possible_handle: | 
					
						
							| 
									
										
										
										
											2019-08-19 11:41:15 +00:00
										 |  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2022-09-03 17:09:00 +00:00
										 |  |  |  |             recipient_actor = \ | 
					
						
							|  |  |  |  |                 _mention_to_url(base_dir, http_prefix, | 
					
						
							|  |  |  |  |                                 possible_domain, possible_nickname) | 
					
						
							| 
									
										
										
										
											2021-12-30 20:24:05 +00:00
										 |  |  |  |             if recipient_actor not in recipients: | 
					
						
							|  |  |  |  |                 recipients.append(recipient_actor) | 
					
						
							|  |  |  |  |             tags[word_str] = { | 
					
						
                'href': recipient_actor,
                'name': word_str,
                'type': 'Mention'
            }
            replace_mentions[word_str] = \
                "<span class=\"h-card\"><a href=\"" + recipient_actor + \
                "\" tabindex=\"10\" class=\"u-url mention\">@<span>" + \
                possible_nickname + "</span></a></span>"
            return True
    # @nick@domain
    if not (possible_domain == 'localhost' or '.' in possible_domain):
        return False
    recipient_actor = \
        _mention_to_url(base_dir, http_prefix,
                        possible_domain, possible_nickname)
    if recipient_actor not in recipients:
        recipients.append(recipient_actor)
    tags[word_str] = {
        'href': recipient_actor,
        'name': word_str,
        'type': 'Mention'
    }
    replace_mentions[word_str] = \
        "<span class=\"h-card\"><a href=\"" + recipient_actor + \
        "\" tabindex=\"10\" class=\"u-url mention\">@<span>" + \
        possible_nickname + "</span></a></span>"
    return True

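# Illustrative example of the markup produced by _add_mention above: assuming
# _mention_to_url() resolves @alice@example.com to an actor url such as
# https://example.com/users/alice (hypothetical), the replacement html is
#   <span class="h-card"><a href="https://example.com/users/alice"
#   tabindex="10" class="u-url mention">@<span>alice</span></a></span>
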
def replace_content_duplicates(content: str) -> str:
    """Replaces invalid duplicates within content
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    while '<<' in content:
        content = content.replace('<<', '<')
    while '>>' in content:
        content = content.replace('>>', '>')
    content = content.replace('<\\p>', '')
    return content

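# Illustrative example for replace_content_duplicates above:
#   replace_content_duplicates('<<p>>Hello<\\p>') returns '<p>Hello'
# since repeated angle brackets are collapsed and stray '<\\p>' is removed.
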
def remove_text_formatting(content: str, bold_reading: bool) -> str:
    """Removes markup for bold, italics, etc
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    if '<' not in content:
        return content
    for markup in REMOVE_MARKUP:
        if bold_reading:
            if markup == 'b':
                continue
        content = content.replace('<' + markup + '>', '')
        content = content.replace('</' + markup + '>', '')
        content = content.replace('<' + markup.upper() + '>', '')
        content = content.replace('</' + markup.upper() + '>', '')
    return content

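# Illustrative example for remove_text_formatting above, assuming the
# module-level REMOVE_MARKUP tuple includes 'i' and 'b':
#   remove_text_formatting('<b>bold</b> <i>italic</i>', False) would return
#   'bold italic', while passing bold_reading=True keeps the <b></b> markup.
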
def remove_long_words(content: str, max_word_length: int,
                      long_words_list: []) -> str:
    """Breaks up long words so that on mobile screens this doesn't
    disrupt the layout
    """
    if is_pgp_encrypted(content) or contains_pgp_public_key(content):
        return content
    content = replace_content_duplicates(content)

    non_html_list = False
    if '\n\n' in content and '<p>' not in content:
        content = '<p>' + content.replace('\n\n', '</p> <p>') + '</p>'
        non_html_list = True
    non_html_list2 = False
    if '\n' in content and '<p>' not in content:
        content = '<p>' + content.replace('\n', '</p> <p>') + '</p>'
        non_html_list2 = True

    if ' ' not in content and '</p><p>' not in content:
        # handle a single very long string with no spaces
        content_str = content.replace('<p>', '').replace(r'<\p>', '')
        if '://' not in content_str:
            if len(content_str) > max_word_length:
                if '<p>' in content:
                    content = '<p>' + content_str[:max_word_length] + r'<\p>'
                else:
                    content = content[:max_word_length]
                return content
    content = content.replace('<p></p>', '<p> </p>')
    words = content.split(' ')
    if not long_words_list:
        long_words_list: list[str] = []
        for word_str in words:
            if len(word_str) > max_word_length:
                if word_str not in long_words_list:
                    long_words_list.append(word_str)
    for word_str in long_words_list:
        original_word_str = word_str
        if word_str.startswith('<p>'):
            word_str = word_str.replace('<p>', '')
        if word_str.startswith('<'):
            continue
        if len(word_str) == 76:
            if word_str.upper() == word_str:
                # tox address
                continue
        if '=\"' in word_str:
            continue
        if '@' in word_str:
            if '@@' not in word_str:
                continue
        if '=.ed25519' in word_str:
            continue
        if '.onion' in word_str:
            continue
        if '.i2p' in word_str:
            continue
        if 'https:' in word_str:
            continue
        if 'http:' in word_str:
            continue
        if 'i2p:' in word_str:
            continue
        if 'gnunet:' in word_str:
            continue
        if 'dat:' in word_str:
            continue
        if 'rad:' in word_str:
            continue
        if 'hyper:' in word_str:
            continue
        if 'briar:' in word_str:
            continue
        if '<' in word_str:
            replace_word = word_str.split('<', 1)[0]
            # if len(replace_word) > max_word_length:
            #     replace_word = replace_word[:max_word_length]
            content = content.replace(word_str, replace_word)
            word_str = replace_word
        if '/' in word_str:
            continue
        if len(word_str[max_word_length:]) < max_word_length:
            end_of_line_char = '\n'
            if '<br>' in original_word_str:
                end_of_line_char = ''
            content = content.replace(word_str,
                                      word_str[:max_word_length] +
                                      end_of_line_char +
                                      word_str[max_word_length:])
        else:
            content = content.replace(word_str,
                                      word_str[:max_word_length])
    if content.startswith('<p>'):
        if not content.endswith('</p>'):
            content = content.strip() + '</p>'
    content = content.replace('<p> </p>', '<p></p>')
    if non_html_list:
        content = content.replace('</p> <p>', '\n\n')
        content = content.replace('<p>', '')
        content = content.replace('</p>', '')
    if non_html_list2:
        content = content.replace('</p> <p>', '\n')
        content = content.replace('<p>', '')
        content = content.replace('</p>', '')
    content = content.replace('</p> <p>', '</p><p>')

    return content

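# Illustrative example for remove_long_words above:
#   remove_long_words('hello ' + 'a' * 50, 40, []) returns
#   'hello ' + 'a' * 40 + '\n' + 'a' * 10, breaking the long word so that it
#   wraps on narrow screens; urls, mentions and tox/onion style addresses
#   are deliberately left untouched.
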
def _load_auto_tags(base_dir: str, nickname: str, domain: str) -> []:
    """Loads automatic tags file and returns a list containing
    the lines of the file
    """
    filename = acct_dir(base_dir, nickname, domain) + '/autotags.txt'
    if not os.path.isfile(filename):
        return []
    try:
        with open(filename, 'r', encoding='utf-8') as fp_tags:
            return fp_tags.readlines()
    except OSError:
        print('EX: unable to read auto tags ' + filename)
    return []

def _auto_tag(word_str: str, auto_tag_list: [], append_tags: []) -> None:
    """Generates a list of tags to be automatically appended to the content
    """
    for tag_rule in auto_tag_list:
        if word_str not in tag_rule:
            continue
        if '->' not in tag_rule:
            continue
        rulematch = tag_rule.split('->')[0].strip()
        if rulematch != word_str:
            continue
        tag_name = tag_rule.split('->')[1].strip()
        if tag_name.startswith('#'):
            if tag_name not in append_tags:
                append_tags.append(tag_name)
        else:
            if '#' + tag_name not in append_tags:
                append_tags.append('#' + tag_name)

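# Illustrative example for _auto_tag above: each line of autotags.txt is a
# rule of the form "word -> tagname". With the hypothetical rule
# 'cats -> felines', encountering the word 'cats' appends '#felines' to
# append_tags; a rule such as 'dogs -> #canines' appends '#canines' as-is.
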
def _get_simplified_content(content: str) -> str:
    """Returns a simplified version of the content suitable for
    splitting up into individual words
    """
    replacements = {
        ',': ' ',
        ';': ' ',
        '- ': ' '
    }
    content_simplified = replace_strings(content, replacements)
    content_simplified = content_simplified.replace('. ', ' ').strip()
    if content_simplified.endswith('.'):
        content_simplified = content_simplified[:len(content_simplified)-1]
    return content_simplified

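# Illustrative example for _get_simplified_content above:
#   _get_simplified_content('one,two;three.') returns 'one two three',
# with separators turned into spaces and the trailing full stop removed.
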
def detect_dogwhistles(content: str, dogwhistles: {}) -> {}:
    """Returns a dict containing any detected dogwhistle words
    """
    content = remove_html(content).lower()
    result = {}
    words = _get_simplified_content(content).split(' ')
    for whistle, category in dogwhistles.items():
        if not category:
            continue
        ending = False
        starting = False
        whistle = whistle.lower()

        if whistle.startswith('x-'):
            whistle = whistle[2:]
            ending = True
        elif string_ends_with(whistle, ('*', '~', '-')):
            whistle = whistle[1:]
            ending = True

        if ending:
            prev_wrd = ''
            for wrd in words:
                wrd2 = (prev_wrd + ' ' + wrd).strip()
                if wrd.endswith(whistle) or wrd2.endswith(whistle):
                    if not result.get(whistle):
                        result[whistle] = {
                            "count": 1,
                            "category": category
                        }
                    else:
                        result[whistle]['count'] += 1
                prev_wrd = wrd
            continue

        if whistle.lower().endswith('-x'):
            whistle = whistle[:len(whistle)-2]
            starting = True
        elif string_ends_with(whistle, ('*', '~', '-')):
            whistle = whistle[:len(whistle)-1]
            starting = True

        if starting:
            prev_wrd = ''
            for wrd in words:
                wrd2 = (prev_wrd + ' ' + wrd).strip()
                if wrd.startswith(whistle) or wrd2.startswith(whistle):
                    if not result.get(whistle):
                        result[whistle] = {
                            "count": 1,
                            "category": category
                        }
                    else:
                        result[whistle]['count'] += 1
                prev_wrd = wrd
            continue

        if '*' in whistle:
            whistle_start = whistle.split('*', 1)[0]
            whistle_end = whistle.split('*', 1)[1]
            prev_wrd = ''
            for wrd in words:
                wrd2 = (prev_wrd + ' ' + wrd).strip()
                if ((wrd.startswith(whistle_start) and
                     wrd.endswith(whistle_end)) or
                    (wrd2.startswith(whistle_start) and
                     wrd2.endswith(whistle_end))):
                    if not result.get(whistle):
                        result[whistle] = {
                            "count": 1,
                            "category": category
                        }
                    else:
                        result[whistle]['count'] += 1
                prev_wrd = wrd
            continue

        prev_wrd = ''
        for wrd in words:
            wrd2 = (prev_wrd + ' ' + wrd).strip()
            if whistle in (wrd, wrd2):
                if not result.get(whistle):
                    result[whistle] = {
                        "count": 1,
                        "category": category
                    }
                else:
                    result[whistle]['count'] += 1
            prev_wrd = wrd
    return result

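# Illustrative example for detect_dogwhistles above, using a hypothetical
# entry: with dogwhistles = {'felines': 'cat propaganda'}, calling
#   detect_dogwhistles('I like felines', dogwhistles)
# returns {'felines': {'count': 1, 'category': 'cat propaganda'}}, since the
# content is lowercased, stripped of html and matched word by word.
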
def load_dogwhistles(filename: str) -> {}:
    """Loads a list of dogwhistles from file
    """
    if not os.path.isfile(filename):
        return {}
    dogwhistle_lines: list[str] = []
    try:
        with open(filename, 'r', encoding='utf-8') as fp_dogwhistles:
            dogwhistle_lines = fp_dogwhistles.readlines()
    except OSError:
        print('EX: unable to load dogwhistles from ' + filename)
        return {}
    separators = ('->', '=>', ',', ';', '|', '=')
    dogwhistles = {}
    for line in dogwhistle_lines:
        line = remove_eol(line).strip()
        if not line:
            continue
        if line.startswith('#'):
            continue
        whistle = None
        category = None
        for sep in separators:
            if sep in line:
                whistle = line.split(sep, 1)[0].strip()
                category = line.split(sep, 1)[1].strip()
                break
        if not whistle:
            whistle = line
        dogwhistles[whistle] = category
    return dogwhistles

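# Illustrative example for load_dogwhistles above: each non-comment line of
# the dogwhistles file pairs a whistle with a category via one of the
# separators, e.g. a file containing the hypothetical line
#   snowflake -> insult
# loads as {'snowflake': 'insult'}; a line with no separator is stored with
# a category of None.
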
def add_html_tags(base_dir: str, http_prefix: str,
                  nickname: str, domain: str, content: str,
                  recipients: [], hashtags: {}, translate: {},
                  is_json_content: bool = False) -> str:
    """ Replaces plaintext mentions such as @nick@domain into html
    by matching against known following accounts
    """
    if content.startswith('<p>'):
        content = html_replace_email_quote(content)
        return html_replace_quote_marks(content)
    max_word_length = 40
    replacements = {
        '\r': '',
        '\n': ' --linebreak-- '
    }
    content = replace_strings(content, replacements)
    now_playing_str = 'NowPlaying'
    if translate.get(now_playing_str):
        now_playing_str = translate[now_playing_str]
    now_playing_lower_str = 'nowplaying'
    if translate.get(now_playing_lower_str):
        now_playing_lower_str = translate[now_playing_lower_str]
    if '#' + now_playing_lower_str in content:
        content = content.replace('#' + now_playing_lower_str,
                                  '#' + now_playing_str)
    content = _add_music_tag(content, now_playing_str)
    words = _get_simplified_content(content).split(' ')

    # remove . for words which are not mentions
    new_words: list[str] = []
    for _, word_str in enumerate(words):
        if word_str.endswith('.'):
            if not word_str.startswith('@'):
                word_str = word_str[:-1]
        if word_str.startswith('.'):
            word_str = word_str[1:]
        new_words.append(word_str)
    words = new_words

    replace_mentions = {}
    replace_hashtags = {}
    replace_emoji = {}
    emoji_dict = {}
    original_domain = domain
    domain = remove_domain_port(domain)
    following_filename = \
        acct_dir(base_dir, nickname, domain) + '/following.txt'

    # read the following list so that we can detect just @nick
    # in addition to @nick@domain
    following = None
    petnames = None
    if '@' in words:
        if os.path.isfile(following_filename):
            following: list[str] = []
            try:
                with open(following_filename, 'r',
                          encoding='utf-8') as fp_foll:
                    following = fp_foll.readlines()
            except OSError:
                print('EX: add_html_tags unable to read ' +
                      following_filename)
            for handle in following:
                pet = get_pet_name(base_dir, nickname, domain, handle)
                if pet:
                    petnames.append(pet + '\n')

    # extract mentions and tags from words
    long_words_list: list[str] = []
    prev_word_str = ''
    auto_tags_list = _load_auto_tags(base_dir, nickname, domain)
    append_tags = []
    for word_str in words:
        word_len = len(word_str)
        if word_len <= 2:
            continue
        if word_len > max_word_length:
            long_words_list.append(word_str)
        first_char = word_str[0]
        if first_char == '@':
            if _add_mention(base_dir, word_str, http_prefix, following,
                            petnames, replace_mentions, recipients,
                            hashtags):
                prev_word_str = ''
                continue
        elif first_char == '#':
            # remove any endings from the hashtag
            hash_tag_endings = ('.', ':', ';', '-', '\n')
            for ending in hash_tag_endings:
                if word_str.endswith(ending):
                    word_str = word_str[:len(word_str) - 1]
                    break

            if _add_hash_tags(word_str, http_prefix, original_domain,
                              replace_hashtags, hashtags):
                prev_word_str = ''
                continue
        elif ':' in word_str:
            word_str2 = word_str.split(':')[1]
            if not emoji_dict:
                # emoji.json is generated so that it can be customized and
                # the changes will be retained even if default_emoji.json
                # is subsequently updated
                if not os.path.isfile(base_dir + '/emoji/emoji.json'):
                    copyfile(base_dir + '/emoji/default_emoji.json',
                             base_dir + '/emoji/emoji.json')
            emoji_dict = load_json(base_dir + '/emoji/emoji.json')

            # append custom emoji to the dict
            custom_emoji_filename = base_dir + '/emojicustom/emoji.json'
            if os.path.isfile(custom_emoji_filename):
                custom_emoji_dict = load_json(custom_emoji_filename)
                if custom_emoji_dict:
                    # combine emoji dicts one by one
                    for ename, eitem in custom_emoji_dict.items():
                        if ename and eitem:
                            if not emoji_dict.get(ename):
                                emoji_dict[ename] = eitem

            _add_emoji(base_dir, ':' + word_str2 + ':', http_prefix,
                       original_domain, replace_emoji, hashtags,
                       emoji_dict)
        else:
            if _auto_tag(word_str, auto_tags_list, append_tags):
                prev_word_str = ''
                continue
            if prev_word_str:
                if _auto_tag(prev_word_str + ' ' + word_str,
                             auto_tags_list, append_tags):
                    prev_word_str = ''
                    continue
        prev_word_str = word_str

    # add any auto generated tags
    for appended in append_tags:
        content = content + ' ' + appended
        _add_hash_tags(appended, http_prefix, original_domain,
                       replace_hashtags, hashtags)

    # replace words with their html versions
    for word_str, replace_str in replace_mentions.items():
        content = content.replace(word_str, replace_str)
    for word_str, replace_str in replace_hashtags.items():
        content = content.replace(word_str, replace_str)
    if not is_json_content:
        for word_str, replace_str in replace_emoji.items():
            content = content.replace(word_str, replace_str)

    content = add_web_links(content)
    if long_words_list:
        content = remove_long_words(content, max_word_length, long_words_list)
    content = limit_repeated_words(content, 6)
    content = content.replace(' --linebreak-- ', '</p><p>')
    content = html_replace_email_quote(content)
    return '<p>' + html_replace_quote_marks(content) + '</p>'

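# Illustrative sketch of calling add_html_tags above (handles, domains and
# surrounding variables here are hypothetical; base_dir and translate are
# assumed to already be set up by the caller):
#   hashtags = {}
#   recipients = []
#   html_content = add_html_tags(base_dir, 'https', 'bob', 'example.net',
#                                'hello @alice@example.com #cats',
#                                recipients, hashtags, translate)
# The mention and hashtag are replaced by their html markup, web links are
# added, overly long words are broken up, and the result is returned wrapped
# in <p>...</p>; recipients and hashtags are populated as side effects.
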
def _string_starts_with_url_prefix(text: str) -> bool:
    """ Does the given text begin with one of the url prefixes?
    """
    url_prefixes = ('http', 'gnunet', 'i2p', 'ipfs', 'ipns', 'hyper', 'dat:')
    for possible_prefix in url_prefixes:
        if text.startswith(possible_prefix):
            return True
    return False



def get_mentions_from_html(html_text: str, match_str: str) -> []:
    """Extracts mentioned actors from the given html content string
    """
    mentions: list[str] = []
    if match_str not in html_text:
        return mentions
    mentions_list = html_text.split(match_str)
    for mention_str in mentions_list:
        if '"' not in mention_str:
            continue
        actor_str = mention_str.split('"')[0]
        if _string_starts_with_url_prefix(actor_str):
            if actor_str not in mentions:
                mentions.append(actor_str)
    return mentions
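
# Sketch of the mention extraction above. The html snippet and match string
# are illustrative assumptions, not values taken from elsewhere in the code:
#   get_mentions_from_html(
#       '<a href="https://example.net/users/alice">@alice</a>',
#       'href="')
#   # -> ['https://example.net/users/alice']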


def extract_media_in_form_post(post_bytes, boundary, name: str):
    """Extracts the binary encoding for image/video/audio within an HTTP
    form POST
    Returns the media bytes and the remaining bytes
    """
    image_start_boundary = b'Content-Disposition: form-data; name="' + \
        name.encode('utf8', 'ignore') + b'";'
    image_start_location = post_bytes.find(image_start_boundary)
    if image_start_location == -1:
        return None, post_bytes

    # bytes after the start boundary appears
    media_bytes = post_bytes[image_start_location:]

    # look for the next boundary
    image_end_boundary = boundary.encode('utf8', 'ignore')
    image_end_location = media_bytes.find(image_end_boundary)
    if image_end_location == -1:
        # no ending boundary
        return media_bytes, post_bytes[:image_start_location]

    # remaining bytes after the end of the image
    remainder = media_bytes[image_end_location:]

    # remove bytes after the end boundary
    media_bytes = media_bytes[:image_end_location]

    # return the media and the before+after bytes
    return media_bytes, post_bytes[:image_start_location] + remainder
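
# Rough shape of the multipart body walked over above; the boundary string
# and field name are invented for illustration:
#   post_bytes = (b'--BOUND\r\n'
#                 b'Content-Disposition: form-data; name="attachpic";'
#                 b' filename="cat.png"\r\n'
#                 b'Content-Type: image/png\r\n\r\n'
#                 b'PNGDATA\r\n--BOUND--\r\n')
#   media, remainder = \
#       extract_media_in_form_post(post_bytes, 'BOUND', 'attachpic')
#   # media begins at the Content-Disposition header for "attachpic" and
#   # ends just before the next occurrence of b'BOUND'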


def _valid_follows_csv(content: str) -> bool:
    """Is the given content a valid CSV file containing imported follows?
    """
    if ',' not in content:
        return False
    if 'Account address,' not in content:
        return False
    return True
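
# The header checked for above matches a Mastodon-style follows export,
# e.g. (sample rows are invented):
#   _valid_follows_csv('Account address,Show boosts\n'
#                      'alice@example.net,true\n')   # True
#   _valid_follows_csv('just,some,other,csv')        # False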


def save_media_in_form_post(media_bytes, debug: bool,
                            filename_base: str) -> (str, str):
    """Saves the given media bytes extracted from an HTTP form POST
    Returns the filename and attachment type
    """
    if not media_bytes:
        if filename_base:
            # remove any existing files
            extension_types = get_image_extensions()
            for ex in extension_types:
                possible_other_format = filename_base + '.' + ex
                if os.path.isfile(possible_other_format):
                    try:
                        os.remove(possible_other_format)
                    except OSError:
                        if debug:
                            print('EX: save_media_in_form_post ' +
                                  'unable to delete other ' +
                                  str(possible_other_format))
            if os.path.isfile(filename_base):
                try:
                    os.remove(filename_base)
                except OSError:
                    if debug:
                        print('EX: save_media_in_form_post ' +
                              'unable to delete ' +
                              str(filename_base))

        if debug:
            print('DEBUG: No media found within POST')
        return None, None

    media_location = -1
    search_str = ''
    filename = None

    # directly search the binary array for the beginning
    # of an image, zip or csv
    extension_list = {
        'png': 'image/png',
        'jpeg': 'image/jpeg',
        'jxl': 'image/jxl',
        'gif': 'image/gif',
        'svg': 'image/svg+xml',
        'webp': 'image/webp',
        'avif': 'image/avif',
        'heic': 'image/heic',
        'mp4': 'video/mp4',
        'ogv': 'video/ogv',
        'mp3': 'audio/mpeg',
        'ogg': 'audio/ogg',
        'wav': 'audio/vnd.wave',
        'wav2': 'audio/wav',
        'wav3': 'audio/x-wav',
        'wav4': 'audio/x-pn-wave',
        'opus': 'audio/opus',
        'spx': 'audio/speex',
        'flac': 'audio/flac',
        'zip': 'application/zip',
        'csv': 'text/csv',
        'csv2': 'text/plain'
    }
    detected_extension = None
    for extension, content_type in extension_list.items():
        search_str = b'Content-Type: ' + content_type.encode('utf8', 'ignore')
        media_location = media_bytes.find(search_str)
        if media_location > -1:
            # image/video/audio binaries
            if extension == 'jpeg':
                extension = 'jpg'
            elif extension == 'mpeg':
                extension = 'mp3'
            elif extension == 'csv2':
                extension = 'csv'
            elif extension == 'wav2':
                extension = 'wav'
            elif extension == 'wav3':
                extension = 'wav'
            elif extension == 'wav4':
                extension = 'wav'
            if filename_base:
                if not filename_base.endswith('.' + extension):
                    filename = filename_base + '.' + extension
                else:
                    # already has the extension
                    filename = filename_base
            search_lst = search_str.decode().split('/', maxsplit=1)
            attachment_media_type = \
                search_lst[0].replace('Content-Type: ', '')
            detected_extension = extension
            break

    if not filename:
        return None, None

    # locate the beginning of the image, after any
    # carriage returns
    start_pos = media_location + len(search_str)
    for offset in range(1, 8):
        if media_bytes[start_pos+offset] != 10:
            if media_bytes[start_pos+offset] != 13:
                start_pos += offset
                break

    # remove any existing image files with a different format
    if detected_extension != 'zip':
        extension_types = get_image_extensions()
        for ex in extension_types:
            if ex == detected_extension:
                continue
            possible_other_format = \
                filename.replace('.temp', '').replace('.' +
                                                      detected_extension, '.' +
                                                      ex)
            if os.path.isfile(possible_other_format):
                try:
                    os.remove(possible_other_format)
                except OSError:
                    if debug:
                        print('EX: save_media_in_form_post ' +
                              'unable to delete other 2 ' +
                              str(possible_other_format))

    # don't allow scripts within svg files
    if detected_extension == 'svg':
        svg_str = media_bytes[start_pos:]
        svg_str = svg_str.decode()
        if dangerous_svg(svg_str, False):
            return None, None
    elif detected_extension == 'csv':
        csv_str = media_bytes[start_pos:]
        csv_str = csv_str.decode()
        if not _valid_follows_csv(csv_str):
            return None, None

    # if this is an image then check that the binary looks like an image
    image_extension_types = get_image_extensions()
    if detected_extension in image_extension_types:
        if not binary_is_image(filename, media_bytes[start_pos:]):
            print('WARN: save_media_in_form_post ' +
                  'image binary not recognized ' + filename)
            return None, None

    try:
        with open(filename, 'wb') as fp_media:
            fp_media.write(media_bytes[start_pos:])
    except OSError:
        print('EX: save_media_in_form_post unable to write media')

    if not os.path.isfile(filename):
        print('WARN: Media file could not be written to file: ' + filename)
        return None, None
    print('Uploaded media file written: ' + filename)

    return filename, attachment_media_type
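
# A possible calling pattern (names and paths are hypothetical): media_bytes
# would typically be the first value returned by extract_media_in_form_post()
# and filename_base a path without an extension; the extension is then chosen
# from the declared Content-Type:
#   filename, media_type = save_media_in_form_post(
#       media_bytes, False, 'accounts/alice@example.net/upload')
#   # -> ('accounts/alice@example.net/upload.png', 'image') for a png upload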


def combine_textarea_lines(text: str) -> str:
    """Combines separate textarea lines into paragraphs,
    preserving bullet and dash points
    """
    result = ''
    ctr = 0
    paragraphs = text.split('\n\n')
    replacements = {
        '\n* ': '***BULLET POINT*** ',
        '\n * ': '***BULLET POINT*** ',
        '\n- ': '***DASH POINT*** ',
        '\n - ': '***DASH POINT*** ',
        '\n': ' ',
        '  ': ' ',
        '***BULLET POINT*** ': '\n* ',
        '***DASH POINT*** ': '\n- '
    }
    for para in paragraphs:
        para = replace_strings(para, replacements)
        if ctr > 0:
            result += '</p><p>'
        result += para
        ctr += 1
    return result
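
# Example of the joining above (input invented), assuming replace_strings
# applies the replacements in dictionary order:
#   combine_textarea_lines('A sentence\nsplit over two lines\n\nNext para')
#   # -> 'A sentence split over two lines</p><p>Next para'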


def extract_text_fields_in_post(post_bytes, boundary: str, debug: bool,
                                unit_test_data: str) -> {}:
    """Returns a dictionary containing the text fields of an HTTP form POST
    The boundary argument comes from the HTTP header
    """
    if boundary == 'LYNX':
        if debug:
            print('POST from lynx browser')
        boundary = '--LYNX'

    if not unit_test_data:
        msg_bytes = email.parser.BytesParser().parsebytes(post_bytes)
        message_fields = msg_bytes.get_payload(decode=True).decode('utf-8')
    else:
        message_fields = unit_test_data

    if debug:
        if 'password' not in message_fields:
            print('DEBUG: POST arriving ' + message_fields)

    message_fields = message_fields.split(boundary)
    fields = {}
    fields_with_semicolon_allowed = (
        'message', 'bio', 'autoCW', 'password', 'passwordconfirm',
        'instanceDescription', 'instanceDescriptionShort',
        'subject', 'location', 'imageDescription', 'importBlocks',
        'importFollows', 'importTheme'
    )
    if debug:
        if 'password' not in message_fields:
            print('DEBUG: POST message_fields: ' + str(message_fields))
    lynx_content_type = 'Content-Type: text/plain; charset=utf-8\r\n'
    # examine each section of the POST, separated by the boundary
    for fld in message_fields:
        if fld == '--':
            continue
        if ' name="' not in fld:
            continue
        post_str = fld.split(' name="', 1)[1]
        if '"' not in post_str:
            continue
        post_key = post_str.split('"', 1)[0]
        if debug:
            print('post_key: ' + post_key)
        post_value_str = post_str.split('"', 1)[1]
        if boundary == '--LYNX':
            post_value_str = \
                post_value_str.replace(lynx_content_type, '')
        if debug and 'password' not in post_key:
            print('boundary: ' + boundary)
            print('post_value_str1: ' + post_value_str)
        if ';' in post_value_str:
            if post_key not in fields_with_semicolon_allowed and \
               not post_key.startswith('edited'):
                if debug:
                    print('extract_text_fields_in_post exit 1')
                continue
        if debug and 'password' not in post_key:
            print('post_value_str2: ' + post_value_str)
        if '\r\n' not in post_value_str:
            if debug:
                print('extract_text_fields_in_post exit 2')
            continue
        post_lines = post_value_str.split('\r\n')
        if debug and 'password' not in post_key:
            print('post_lines: ' + str(post_lines))
        post_value = ''
        if len(post_lines) > 2:
            for line in range(2, len(post_lines)-1):
                if line > 2:
                    post_value += '\n'
                post_value += post_lines[line]
        fields[post_key] = urllib.parse.unquote(post_value)
        if boundary == '--LYNX' and post_key in ('message', 'bio'):
            fields[post_key] = combine_textarea_lines(fields[post_key])
    return fields
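
# Sketch of the field extraction above, using the unit_test_data path so the
# email parser is skipped; boundary and field name are invented:
#   form = ('--BOUND\r\n'
#           'Content-Disposition: form-data; name="subject"\r\n'
#           '\r\n'
#           'Hello world\r\n'
#           '--BOUND--')
#   extract_text_fields_in_post(None, 'BOUND', False, form)
#   # -> {'subject': 'Hello world'}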


def limit_repeated_words(text: str, max_repeats: int) -> str:
    """Limits the number of times that a word can be consecutively repeated
    """
    words = text.replace('\n', ' ').split(' ')
    repeat_ctr = 0
    repeated_text = ''
    replacements = {}
    prev_word = ''
    for word in words:
        if word == prev_word:
            repeat_ctr += 1
            if repeated_text:
                repeated_text += ' ' + word
            else:
                repeated_text = word + ' ' + word
        else:
            if repeat_ctr > max_repeats:
                new_text = ((prev_word + ' ') * max_repeats).strip()
                replacements[prev_word] = [repeated_text, new_text]
            repeat_ctr = 0
            repeated_text = ''
        prev_word = word

    if repeat_ctr > max_repeats:
        new_text = ((prev_word + ' ') * max_repeats).strip()
        replacements[prev_word] = [repeated_text, new_text]

    for word, item in replacements.items():
        text = text.replace(item[0], item[1])
    return text
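
# Example of the repeat limiting above (input invented): eight consecutive
# repeats are reduced to the maximum of six:
#   limit_repeated_words('no no no no no no no no way', 6)
#   # -> 'no no no no no no way'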


def get_price_from_string(price_str: str) -> (str, str):
    """Returns the item price and currency
    """
    currencies = get_currencies()
    for symbol, name in currencies.items():
        if symbol in price_str:
            price = price_str.replace(symbol, '')
            if is_float(price):
                return price, name
        elif name in price_str:
            price = price_str.replace(name, '')
            if is_float(price):
                return price, name
    if is_float(price_str):
        return price_str, "EUR"
    return "0.00", "EUR"
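
# Example of the price parsing above, assuming get_currencies() maps the
# symbol '£' to 'GBP' (illustrative):
#   get_price_from_string('£5.23')   # -> ('5.23', 'GBP')
#   get_price_from_string('5.23')    # -> ('5.23', 'EUR')
#   get_price_from_string('gratis')  # -> ('0.00', 'EUR')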


def _words_similarity_histogram(words: []) -> {}:
    """Returns a histogram for word combinations
    """
    histogram = {}
    for index in range(1, len(words)):
        combined_words = words[index - 1] + words[index]
        if histogram.get(combined_words):
            histogram[combined_words] += 1
        else:
            histogram[combined_words] = 1
    return histogram


def _words_similarity_words_list(content: str) -> []:
    """Returns a list of words for the given content
    """
    remove_punctuation = ('.', ',', ';', '-', ':', '"')
    content = remove_html(content).lower()
    for punc in remove_punctuation:
        content = content.replace(punc, ' ')
        content = content.replace('  ', ' ')
    return content.split(' ')


def words_similarity(content1: str, content2: str, min_words: int) -> int:
    """Returns percentage similarity
    """
    if content1 == content2:
        return 100

    words1 = _words_similarity_words_list(content1)
    if len(words1) < min_words:
        return 0

    words2 = _words_similarity_words_list(content2)
    if len(words2) < min_words:
        return 0

    histogram1 = _words_similarity_histogram(words1)
    histogram2 = _words_similarity_histogram(words2)

    diff = 0
    for combined_words, histogram1_value in histogram1.items():
        if not histogram2.get(combined_words):
            diff += 1
        else:
            diff += \
                abs(histogram2[combined_words] - histogram1_value)
    return 100 - int(diff * 100 / len(histogram1.items()))
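
# Sketch of the similarity measure above: identical content scores 100, and
# content with no shared word pairs scores 0. Inputs are invented:
#   words_similarity('the cat sat on the mat today',
#                    'the cat sat on the mat today', 3)
#   # -> 100
#   words_similarity('the cat sat on the mat today',
#                    'completely different text entirely here', 3)
#   # -> 0 (no shared word pairs)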


def contains_invalid_local_links(domain_full: str,
                                 onion_domain: str, i2p_domain: str,
                                 content: str) -> bool:
    """Returns true if the given content contains invalid links
    which refer back to the local, onion or i2p domain
    """
    for inv_str in INVALID_CONTENT_STRINGS:
        match_str = '?' + inv_str + '='
        if match_str not in content:
            continue
        # extract the urls and check whether they are for the local domain
        ctr = 0
        sections = content.split(match_str)
        final_section_index = len(sections) - 1
        for section_str in sections:
            if ctr == final_section_index:
                continue
            if '://' in section_str:
                url = section_str.split('://')[-1]
                if domain_full in url:
                    return True
                if onion_domain:
                    if onion_domain in url:
                        return True
                if i2p_domain:
                    if i2p_domain in url:
                        return True
            ctr += 1
    return False
def bold_reading_string(text: str) -> str:
    """Returns bold reading formatted text,
    with the first half of each word emboldened
    """
    text = html.unescape(text)
    add_paragraph_markup = False
    if '<p>' in text:
        text = text.replace('</p>', '\n').replace('<p>', '')
        add_paragraph_markup = True
    paragraphs = text.split('\n')
    parag_ctr = 0
    new_text = ''
    for parag in paragraphs:
        words = parag.split(' ')
        new_parag = ''
        reading_markup = False
        for wrd in words:
            if '<' in wrd:
                reading_markup = True
            if reading_markup and '>' in wrd:
                reading_markup = False
            wrd_len = len(wrd)
            if not reading_markup and wrd_len > 1 and \
               '<' not in wrd and '>' not in wrd and \
               '&' not in wrd and '=' not in wrd and \
               not wrd.startswith(':'):

                prefix = ''
                postfix = ''
                if wrd.startswith('"'):
                    prefix = '"'
                    wrd = wrd[1:]
                if wrd.endswith('"'):
                    postfix = '"'
                    # strip the trailing quote even if a leading quote
                    # was already removed
                    wrd = wrd[:-1]

                initial_chars = int(math.ceil(wrd_len / 2.0))
                new_parag += \
                    prefix + '<b>' + wrd[:initial_chars] + '</b>' + \
                    wrd[initial_chars:] + postfix + ' '
            else:
                new_parag += wrd + ' '
        parag_ctr += 1
        new_parag = new_parag.strip()
        if not new_parag:
            continue
        if parag_ctr < len(paragraphs):
            if not add_paragraph_markup:
                new_text += new_parag + '\n'
            else:
                new_text += '<p>' + new_parag + '</p>'
        else:
            if not add_paragraph_markup:
                new_text += new_parag
            else:
                new_text += '<p>' + new_parag + '</p>'

    return new_text
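
# Illustrative usage (comment only, not called in this module):
#   bold_reading_string('<p>hello world</p>')
# returns '<p><b>hel</b>lo <b>wor</b>ld</p>'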
					
						
def import_emoji(base_dir: str, import_filename: str, session) -> None:
    """Imports emoji from the given filename
    Each line should be [emoji url], :emojiname:
    """
    if not os.path.isfile(import_filename):
        return
    emoji_dict = load_json(base_dir + '/emoji/default_emoji.json')
    added = 0
    with open(import_filename, "r", encoding='utf-8') as fp_emoji:
        lines = fp_emoji.readlines()
        for line in lines:
            if ', ' not in line:
                continue
            url = line.split(', ')[0]
            tag = line.split(', ')[1].strip()
            if ':' not in tag:
                continue
            tag = tag.split(':')[1]
            if emoji_dict.get(tag):
                continue
            emoji_image_filename = base_dir + '/emoji/' + tag + '.png'
            if os.path.isfile(emoji_image_filename):
                continue
            if download_image(session, url,
                              emoji_image_filename, True, False):
                emoji_dict[tag] = tag
                added += 1
    save_json(emoji_dict, base_dir + '/emoji/default_emoji.json')
    print(str(added) + ' custom emoji added')
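
# Illustrative import file line (the url is hypothetical):
#   https://example.com/emoji/partyparrot.png, :partyparrot:
# which downloads partyparrot.png into the emoji directory and adds
# 'partyparrot' to default_emoji.json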
					
						
def content_diff(content: str, prev_content: str) -> str:
    """Returns an html diff of the given content
    compared against the previous content
    """
    cdiff = difflib.Differ()
    text1_lines = content.splitlines()
    text1_sentences: list[str] = []
    for line in text1_lines:
        sentences = line.split('.')
        for sentence in sentences:
            text1_sentences.append(sentence.strip())

    text2_lines = prev_content.splitlines()
    text2_sentences: list[str] = []
    for line in text2_lines:
        sentences = line.split('.')
        for sentence in sentences:
            text2_sentences.append(sentence.strip())

    diff = cdiff.compare(text1_sentences, text2_sentences)

    diff_text = ''
    for line in diff:
        # Differ prefixes sentences unique to the current content with '- '
        # and sentences unique to the previous content with '+ ', so the
        # labels below are swapped to show additions and removals
        if line.startswith('- '):
            if not diff_text:
                diff_text = '<p>'
            else:
                diff_text += '<br>'
            diff_text += '<label class="diff_add">+ ' + line[2:] + '</label>'
        elif line.startswith('+ '):
            if not diff_text:
                diff_text = '<p>'
            else:
                diff_text += '<br>'
            diff_text += \
                '<label class="diff_remove">- ' + line[2:] + '</label>'
    if diff_text:
        diff_text += '</p>'
    return diff_text
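
# Illustrative example (comment only, not called in this module):
#   content_diff('Hello world. New sentence.', 'Hello world.')
# returns '<p><label class="diff_add">+ New sentence</label></p>'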
					
						
def create_edits_html(edits_json: {}, post_json_object: {},
                      translate: {}, timezone: str,
                      system_language: str,
                      languages_understood: []) -> str:
    """ Creates html showing historical edits made to a post
    """
    if not edits_json:
        return ''
    if not has_object_dict(post_json_object):
        return ''
    if 'content' not in post_json_object['object']:
        if 'contentMap' not in post_json_object['object']:
            return ''
    edit_dates_list: list[str] = []
    for modified, _ in edits_json.items():
        edit_dates_list.append(modified)
    edit_dates_list.sort(reverse=True)
    edits_str = ''
    content = get_content_from_post(post_json_object, system_language,
                                    languages_understood, "content")
    if not content:
        return ''
    content = remove_html(content)
    for modified in edit_dates_list:
        prev_json = edits_json[modified]
        if not has_object_dict(prev_json):
            continue
        prev_content = get_content_from_post(prev_json, system_language,
                                             languages_understood, "content")
        if not prev_content:
            continue
        prev_content = remove_html(prev_content)
        if content == prev_content:
            continue
        diff = content_diff(content, prev_content)
        if not diff:
            continue
        diff = diff.replace('\n', '</p><p>')
        # convert to local time
        datetime_object = parse(modified)
        datetime_object = \
            convert_published_to_local_timezone(datetime_object, timezone)
        modified_str = datetime_object.strftime("%a %b %d, %H:%M")
        diff = '<p><b>' + modified_str + '</b></p>' + diff
        edits_str += diff
        content = prev_content
    if not edits_str:
        return ''
    return '<details><summary class="cw" tabindex="10">' + \
        translate['SHOW EDITS'] + '</summary>' + \
        edits_str + '</details>'
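
# Assumed shape of edits_json (illustrative): previous versions of the post
# keyed by their modified timestamp, for example
#   {"2022-04-10T22:50:44Z": {"object": {"content": "...", ...}, ...}}

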
def remove_script(content: str, log_filename: str,
                  actor: str, url: str) -> str:
    """Removes <script> from some content
    """
    # handle both raw and html-escaped script tags
    separators = [['<', '>'], ['&lt;', '&gt;']]
    for sep in separators:
        prefix = sep[0] + 'script'
        ending = '/script' + sep[1]
        if prefix not in content:
            continue
        sections = content.split(prefix)
        ctr = 0
        for text in sections:
            if ctr == 0:
                ctr += 1
                continue
            if ending not in text:
                if '/' + sep[1] not in text:
                    continue
            if ending in text:
                text = prefix + text.split(ending)[0] + ending
            else:
                text = prefix + text.split('/' + sep[1])[0] + '/' + sep[1]
                if log_filename and actor:
                    # write the detected script to a log file
                    log_str = actor + ' ' + url + ' ' + text + '\n'
                    write_type = 'a+'
                    if os.path.isfile(log_filename):
                        write_type = 'w+'
                    try:
                        with open(log_filename, write_type,
                                  encoding='utf-8') as fp_log:
                            fp_log.write(log_str)
                    except OSError:
                        print('EX: cannot append to svg script log')
            content = content.replace(text, '')
    return content
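
# Illustrative example (comment only, not called in this module):
#   remove_script('<p>hi</p><script>alert(1)</script>', None, None, None)
# returns '<p>hi</p>'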
					
						
def reject_twitter_summary(base_dir: str, nickname: str, domain: str,
                           summary: str) -> bool:
    """Returns true if the post should be rejected because twitter
    is mentioned within the summary
    """
    if not summary:
        return False
    remove_twitter = \
        acct_dir(base_dir, nickname, domain) + '/.removeTwitter'
    if not os.path.isfile(remove_twitter):
        return False
    summary_lower = summary.lower()
    twitter_strings = ('twitter', '/x.com', ' x.com', 'birdsite')
    if string_contains(summary_lower, twitter_strings):
        return True
    return False
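
# Illustrative behaviour (assumption): if the account has opted in by
# creating a .removeTwitter flag file then a summary such as
# "reposted from twitter" causes the post to be rejected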
					
						
def add_name_emojis_to_tags(base_dir: str, http_prefix: str,
                            domain: str, port: int,
                            actor_json: {}) -> None:
    """Add any custom emojis within the name of an actor to
    the tag list
    """
    if not actor_json.get('name'):
        return
    name = actor_json['name']

    # does the name contain an emoji?
    if ':' not in name:
        return
    if ':' not in name.split(':', 1)[1]:
        return

    # get emojis from the actor name
    words = name.split(' ')
    emojis: list[str] = []
    for wrd in words:
        if wrd.startswith(':') and wrd.endswith(':'):
            if wrd not in emojis:
                emojis.append(wrd)
    if not emojis:
        return

    actor_tags: list[dict] = []
    if actor_json.get('tag'):
        actor_tags = actor_json['tag']
    else:
        # ensure that there is a tag list to append to
        actor_json['tag'] = actor_tags

    # is the emoji already in the tag list?
    for tag_dict in actor_tags:
        if not tag_dict.get('type'):
            continue
        if tag_dict['type'] != 'Emoji':
            continue
        if not tag_dict.get('name'):
            continue
        if not tag_dict['name'].startswith(':'):
            continue
        if not tag_dict['name'].endswith(':'):
            continue
        if tag_dict['name'] in emojis:
            emojis.remove(tag_dict['name'])
    if not emojis:
        return

    domain_full = get_full_domain(domain, port)
    for emoji_tag_name in emojis:
        emoji_name = emoji_tag_name.replace(':', '')
        emoji_id = \
            http_prefix + '://' + domain_full + '/emoji/' + \
            emoji_name
        url = emoji_id + '.png'
        emoji_filename = base_dir + '/emoji/' + emoji_name + '.png'
        updated = None
        if os.path.isfile(emoji_filename):
            updated = file_last_modified(emoji_filename)
        new_tag = {
            'icon': {
                'mediaType': 'image/png',
                'type': 'Image',
                'url': url
            },
            'id': emoji_id,
            'name': emoji_tag_name,
            'type': 'Emoji',
            'updated': '2022-11-15T23:45:42Z'
        }
        if updated:
            new_tag['updated'] = updated
        actor_json['tag'].append(new_tag)
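
# Illustrative example (assumption): an actor name such as "Alice :wave:"
# gains a tag entry of type 'Emoji' named ':wave:' whose icon url points
# to /emoji/wave.png on this instance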
					
						
def format_mixed_right_to_left(content: str,
                               language: str) -> str:
    """Adds RTL direction formatting to right-to-left paragraphs
    within a post whose main language is not RTL,
    eg. where some paragraphs are English and others are Arabic
    """
    # if the main language is already RTL then no extra formatting is needed
    if language_right_to_left(language):
        return content
    result = ''
    changed = False
    paragraphs = content.split('<p>')
    for text_html in paragraphs:
        if '</p>' not in text_html:
            continue
        text_html = '<p>' + text_html
        text_plain = remove_html(text_html)
        if is_right_to_left_text(text_plain):
            text_html = text_html.replace('<p>', '<p><div dir="rtl">', 1)
            text_html = text_html.replace('</p>', '</div></p>', 1)
            changed = True
        result += text_html
    if not changed:
        result = ''
        prev_distilled = ''
        distilled = content
        while prev_distilled != distilled:
            prev_distilled = distilled
            distilled = distilled.replace('<br><br><br>', '<br><br>')
        paragraphs = distilled.split('<br><br>')
        ctr = 0
        for text_html in paragraphs:
            ctr += 1
            if ctr < len(paragraphs):
                text_html += '<br><br>'
            text_plain = remove_html(text_html)
            if is_right_to_left_text(text_plain):
                text_html = '<div dir="rtl">' + text_html
                if ctr < len(paragraphs):
                    text_html = \
                        text_html.replace('<br><br>', '</div><br><br>', 1)
                else:
                    text_html += '</div>'
                changed = True
            result += text_html
    if not changed:
        return content
    return result
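
# Illustrative example (comment only, assuming is_right_to_left_text
# recognises the Arabic paragraph): with language 'en',
#   '<p>Hello</p><p>مرحبا بالعالم</p>'
# becomes
#   '<p>Hello</p><p><div dir="rtl">مرحبا بالعالم</div></p>'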
					
						
def _load_auto_cw(base_dir: str, nickname: str, domain: str) -> []:
    """Loads the automatic CWs file and returns a list containing
    the lines of the file
    """
    auto_cw_filename = acct_dir(base_dir, nickname, domain) + '/autocw.txt'
    if not os.path.isfile(auto_cw_filename):
        return []
    try:
        with open(auto_cw_filename, 'r', encoding='utf-8') as fp_auto:
            return fp_auto.read().split('\n')
    except OSError:
        print('EX: unable to load auto cw file ' + auto_cw_filename)
    return []


def load_auto_cw_cache(base_dir: str) -> {}:
    """Returns a dictionary containing the automatic content warning lists
    for each account
    """
    auto_cw_cache = {}
    dir_str = data_dir(base_dir)
    for _, dirs, _ in os.walk(dir_str):
        for handle in dirs:
            if not is_account_dir(handle):
                continue
            nickname = handle.split('@')[0]
            domain = handle.split('@')[1]
            auto_cw_cache[nickname] = _load_auto_cw(base_dir, nickname, domain)
        break
    return auto_cw_cache


def add_auto_cw(base_dir: str, nickname: str, domain: str,
                subject: str, content: str,
                auto_cw_cache: {}) -> str:
    """Appends any automatic content warnings to the subject line
    and returns the new subject line
    """
    new_subject = subject
    if auto_cw_cache.get(nickname):
        auto_cw_list = auto_cw_cache[nickname]
    else:
        auto_cw_list = _load_auto_cw(base_dir, nickname, domain)
        auto_cw_cache[nickname] = auto_cw_list
    for cw_rule in auto_cw_list:
        if '->' not in cw_rule:
            continue
        sections = cw_rule.split('->')
        rulematch = sections[0].strip()
        if rulematch not in content:
            continue
        cw_str = sections[1].strip()
        if not cw_str:
            continue
        if new_subject:
            if cw_str not in new_subject and \
               cw_str.title() not in new_subject:
                new_subject += ', ' + cw_str
        else:
            new_subject = cw_str
    return new_subject
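
# Illustrative autocw.txt rule format (assumption based on the parsing
# in add_auto_cw above): each line is "match text -> content warning", eg.
#   spider -> arachnophobia
# so a post containing "spider" gains an "arachnophobia" content warning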