# epicyon/markdown.py

__filename__ = "markdown.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.5.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Web Interface"


def _markdown_get_sections(markdown: str) -> []:
    """Returns a list of sections for markdown
    """
    if '<code>' not in markdown:
        return [markdown]
    lines = markdown.split('\n')
    sections = []
    section_text = ''
    section_active = False
    ctr = 0
    for line in lines:
        if ctr > 0:
            section_text += '\n'

        if not section_active:
            if '<code>' in line:
                section_active = True
                sections.append(section_text)
                section_text = ''
        else:
            if '</code>' in line:
                section_active = False
                sections.append(section_text)
                section_text = ''

        section_text += line
        ctr += 1
    if section_text.strip():
        sections.append(section_text)
    return sections


def _markdown_emphasis_html(markdown: str) -> str:
    """Add italics and bold html markup to the given markdown

    Emphasis markers are recognised by the character next to them:
    '**' becomes <b>, '*' becomes <i>, '_' becomes <u> and a backtick
    becomes <em>. A marker preceded by a space, newline, '(' or '>'
    opens a tag and a marker followed by a space, newline or
    punctuation closes it.

    Sections containing '<code>' are copied through unchanged.
    Sections which contain only whitespace are dropped.
    Returns the markdown with the emphasis markers replaced by html.
    """
    # The order matters: the two character '**' patterns must be
    # replaced before the single '*' patterns, otherwise bold markers
    # would be consumed as a pair of italic markers
    replacements = (
        (' **', ' <b>'),
        ('** ', '</b> '),
        ('**.', '</b>.'),
        ('**:', '</b>:'),
        ('**;', '</b>;'),
        ('?**', '?</b>'),
        ('\n**', '\n<b>'),
        ('**,', '</b>,'),
        ('**\n', '</b>\n'),
        ('(**', '(<b>'),
        ('**)', '</b>)'),
        ('>**', '><b>'),
        ('**<', '</b><'),
        ('>*', '><i>'),
        ('*<', '</i><'),
        (' *', ' <i>'),
        ('* ', '</i> '),
        ('?*', '?</i>'),
        ('\n*', '\n<i>'),
        ('*.', '</i>.'),
        ('*:', '</i>:'),
        ('*;', '</i>;'),
        ('(*', '(<i>'),
        ('*)', '</i>)'),
        ('*,', '</i>,'),
        ('*\n', '</i>\n'),
        ('(_', '(<u>'),
        ('_)', '</u>)'),
        (' _', ' <u>'),
        ('_ ', '</u> '),
        ('_.', '</u>.'),
        ('_:', '</u>:'),
        ('_;', '</u>;'),
        ('_,', '</u>,'),
        ('_\n', '</u>\n'),
        (' `', ' <em>'),
        ('`.', '</em>.'),
        ('`:', '</em>:'),
        ("`'", "</em>'"),
        ("(`", "(<em>"),
        ("`)", "</em>)"),
        ('`;', '</em>;'),
        ('`,', '</em>,'),
        ('`\n', '</em>\n'),
        ('` ', '</em> ')
    )

    sections = _markdown_get_sections(markdown)
    markdown = ''
    for section_text in sections:
        if '<code>' in section_text:
            # don't alter anything within code sections
            markdown += section_text
            continue
        for md_str, html in replacements:
            section_text = section_text.replace(md_str, html)

        # A marker at the very start of the section has no preceding
        # character for the patterns above to match, so open the tag here.
        # The tag goes in front of the text, replacing the marker
        if section_text.startswith('**'):
            section_text = '<b>' + section_text[2:]
        elif section_text.startswith('*'):
            section_text = '<i>' + section_text[1:]
        elif section_text.startswith('_'):
            section_text = '<u>' + section_text[1:]

        # Likewise a marker at the very end has no following character
        if section_text.endswith('**'):
            section_text = section_text[:len(section_text) - 2] + '</b>'
        elif section_text.endswith('*'):
            section_text = section_text[:len(section_text) - 1] + '</i>'
        elif section_text.endswith('_'):
            section_text = section_text[:len(section_text) - 1] + '</u>'

        if section_text.strip():
            markdown += section_text
    return markdown


def _markdown_replace_quotes(markdown: str) -> str:
    """Replaces > quotes with html blockquote
    """
    if '> ' not in markdown:
        return markdown
    lines = markdown.split('\n')
    result = ''
    prev_quote_line = None
    code_section = False
    for line in lines:
        # avoid code sections
        if not code_section:
            if '<code>' in line:
                code_section = True
        else:
            if '</code>' in line:
                code_section = False
        if code_section:
            result += line + '\n'
            continue

        if '> ' not in line:
            result += line + '\n'
            prev_quote_line = None
            continue
        line_str = line.strip()
        if not line_str.startswith('> '):
            result += line + '\n'
            prev_quote_line = None
            continue
        line_str = line_str.replace('> ', '', 1).strip()
        if prev_quote_line:
            new_prev_line = prev_quote_line.replace('</i></blockquote>\n', '')
            result = result.replace(prev_quote_line, new_prev_line) + ' '
            line_str += '</i></blockquote>\n'
        else:
            line_str = '<blockquote><i>' + line_str + '</i></blockquote>\n'
        result += line_str
        prev_quote_line = line_str

    if '</blockquote>\n' in result:
        result = result.replace('</blockquote>\n', '</blockquote>')

    if result.endswith('\n') and \
       not markdown.endswith('\n'):
        result = result[:len(result) - 1]
    return result


def _markdown_replace_links(markdown: str) -> str:
    """Converts markdown links of the form [text](url) into html anchors,
    and image links of the form ![text](url) into img tags.

    Links whose url begins with '#' are bookmarks within the page.
    All other links open in a new tab with nofollow, noopener and
    noreferrer set. Backticks are removed from the link text.
    Sections containing '<code>' are left unchanged.
    The link text and url are inserted into the html as given.
    """
    html = ''
    for section in _markdown_get_sections(markdown):
        if '<code>' in section or '](' not in section:
            html += section
            continue

        # each '](' joins the end of a link text to the start of its url
        fragments = section.split('](')
        for before, after in zip(fragments, fragments[1:]):
            if '[' not in before or ')' not in after:
                continue
            link_text = before.split('[')[-1]
            link_url = after.split(')')[0]
            md_link = '[' + link_text + '](' + link_url + ')'
            link_text = link_text.replace('`', '')

            # images are replaced first, since they contain the link syntax
            md_image = '!' + md_link
            if md_image in section:
                img_html = '<img class="markdownImage" src="' + link_url
                img_html += '" alt="' + link_text + '" />'
                section = section.replace(md_image, img_html)

            if md_link not in section:
                continue
            if link_url.startswith('#'):
                # bookmark
                anchor_html = \
                    '<a href="' + link_url + '">' + link_text + '</a>'
            else:
                # external link
                anchor_html = '<a href="' + link_url + '" target="_blank" '
                anchor_html += 'rel="nofollow noopener noreferrer">'
                anchor_html += link_text + '</a>'
            section = section.replace(md_link, anchor_html)
        html += section
    return html


def _markdown_replace_bullet_points(markdown: str) -> str:
    """Replaces bullet points
    """
    lines = markdown.split('\n')
    bullet_style = ('* ', ' * ', '- ', ' - ')
    bullet_matched = ''
    start_line = -1
    line_ctr = 0
    changed = False
    code_section = False
    for line in lines:
        if not line.strip():
            # skip blank lines
            line_ctr += 1
            continue

        # skip over code sections
        if not code_section:
            if '<code>' in line:
                code_section = True
        else:
            if '</code>' in line:
                code_section = False
        if code_section:
            line_ctr += 1
            continue

        if not bullet_matched:
            for test_str in bullet_style:
                if line.startswith(test_str):
                    bullet_matched = test_str
                    start_line = line_ctr
                    break
        else:
            if not line.startswith(bullet_matched):
                for index in range(start_line, line_ctr):
                    line_text = lines[index].replace(bullet_matched, '', 1)
                    if index == start_line:
                        lines[index] = \
                            '<ul class="md_list">\n<li>' + line_text + '</li>'
                    elif index == line_ctr - 1:
                        lines[index] = '<li>' + line_text + '</li>\n</ul>'
                    else:
                        lines[index] = '<li>' + line_text + '</li>'
                changed = True
                start_line = -1
                bullet_matched = ''
        line_ctr += 1

    if not changed:
        return markdown

    markdown = ''
    for line in lines:
        markdown += line + '\n'
    return markdown


def _markdown_replace_code(markdown: str) -> str:
    """Replaces code sections within markdown
    """
    lines = markdown.split('\n')
    start_line = -1
    line_ctr = 0
    changed = False
    section_active = False
    url_encode = False
    html_escape_table = {
        "&": "&amp;",
        '"': "&quot;",
        "'": "&apos;",
        ">": "&gt;",
        "<": "&lt;"
    }
    for line in lines:
        if not line.strip():
            # skip blank lines
            line_ctr += 1
            continue
        if line.startswith('```'):
            if not section_active:
                if 'html' in line or 'xml' in line or 'rdf' in line:
                    url_encode = True
                start_line = line_ctr
                section_active = True
            else:
                lines[start_line] = '<code>'
                lines[line_ctr] = '</code>'
                if url_encode:
                    lines[start_line] = '<pre>\n<code>'
                    lines[line_ctr] = '</code>\n</pre>'
                    for line_num in range(start_line + 1, line_ctr):
                        lines[line_num] = \
                            "".join(html_escape_table.get(char, char)
                                    for char in lines[line_num])
                section_active = False
                changed = True
                url_encode = False
        line_ctr += 1

    if not changed:
        return markdown

    markdown = ''
    for line in lines:
        markdown += line + '\n'
    return markdown


def markdown_example_numbers(markdown: str) -> str:
    """Renumbers the example headings within the ActivityPub
    specification document so that they are sequential, starting from 1.

    An example heading is a line which begins with '##' and contains
    '## Example '. Everything on the line from ' Example ' onwards is
    replaced by ' Example ' followed by the next number.
    Returns the resulting lines, each followed by a newline.
    """
    renumbered = []
    next_number = 1
    for line in markdown.split('\n'):
        if line.startswith('##') and '## Example ' in line:
            # keep the leading hashes and discard the old number
            heading_prefix = line.split(' Example ')[0]
            line = heading_prefix + ' Example ' + str(next_number)
            next_number += 1
        renumbered.append(line)

    return '\n'.join(renumbered) + '\n'


def markdown_to_html(markdown: str) -> str:
    """Converts markdown formatted text to html

    Code blocks, bullet points, quotes, emphasis and links are
    converted first, in that order, then lines beginning with between
    one and six '#' characters are converted into h1 to h6 headers.
    Each header is given an id, which is its text in lower case with
    spaces replaced by hyphens, so that it can be linked to as a
    bookmark. A line which only consists of '#' characters is not
    treated as a header.

    Lines are separated by <br>, except within code sections and
    after headers, list tags and code tags.
    """
    markdown = _markdown_replace_code(markdown)
    markdown = _markdown_replace_bullet_points(markdown)
    markdown = _markdown_replace_quotes(markdown)
    markdown = _markdown_emphasis_html(markdown)
    markdown = _markdown_replace_links(markdown)

    # replace headers
    lines_list = markdown.split('\n')
    html_str = ''
    ctr = 0
    code_section = False
    # ordered from the longest marker to the shortest, so that
    # '###' is not mistaken for '#'
    titles = {
        "h6": '######',
        "h5": '#####',
        "h4": '####',
        "h3": '###',
        "h2": '##',
        "h1": '#'
    }
    for line in lines_list:
        if ctr > 0:
            if not code_section:
                html_str += '<br>\n'
            else:
                html_str += '\n'

        # avoid code sections
        if not code_section:
            if '<code>' in line:
                code_section = True
        else:
            if '</code>' in line:
                code_section = False
        if code_section:
            html_str += line
            ctr += 1
            continue

        for hsh, hashes in titles.items():
            if not line.startswith(hashes):
                continue
            # only remove the leading marker, so that any '#' characters
            # within the header text itself are retained
            title_text = line[len(hashes):].strip()
            if title_text:
                bookmark_str = title_text.lower().replace(' ', '-')
                line = '<' + hsh + ' id="' + bookmark_str + '">' + \
                    title_text + '</' + hsh + '>\n'
                # the header ends with its own newline, so no <br>
                # is needed before the next line
                ctr = -1
            break
        html_str += line
        ctr += 1

    replacements = (
        ('<code><br>', '<code>'),
        ('</code><br>', '</code>'),
        ('<ul class="md_list"><br>', '<ul class="md_list">'),
        ('</li><br>', '</li>')
    )
    for pair in replacements:
        html_str = html_str.replace(pair[0], pair[1])

    return html_str