epicyon/content.py

999 lines
35 KiB
Python
Raw Normal View History

2020-04-02 09:56:17 +00:00
__filename__ = "content.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.1.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
2019-07-15 14:11:31 +00:00
import os
2019-11-10 11:37:24 +00:00
import email.parser
import urllib.parse
2019-08-11 16:55:22 +00:00
from shutil import copyfile
2020-11-21 11:54:29 +00:00
from utils import getImageExtensions
from utils import loadJson
2020-02-21 10:19:02 +00:00
from utils import fileLastModified
2020-06-11 12:26:15 +00:00
from utils import getLinkPrefixes
2019-07-15 14:11:31 +00:00
2020-09-30 22:55:53 +00:00
2020-10-11 09:33:31 +00:00
def removeHtmlTag(htmlStr: str, tag: str) -> str:
    """Removes every occurrence of a html attribute, eg. tag="value",
    from the given html string and returns the stripped result
    """
    attrStart = ' ' + tag + '="'
    while attrStart in htmlStr:
        before, after = htmlStr.split(attrStart, 1)
        # an attribute without a closing quote cannot be removed
        if '"' not in after:
            break
        htmlStr = before + after.split('"', 1)[1]
    return htmlStr
2020-09-30 22:52:39 +00:00
def removeQuotesWithinQuotes(content: str) -> str:
    """Removes any blockquote inside blockquote

    Walks each <blockquote>...</blockquote> section in turn and strips
    any nested opening <blockquote> tags found inside it, so the output
    never contains a blockquote directly inside another.
    """
    # nothing to do unless at least one complete blockquote exists
    if '<blockquote>' not in content:
        return content
    if '</blockquote>' not in content:
        return content
    # ctr is the split count, advancing one blockquote per iteration
    ctr = 1
    found = True
    while found:
        # text up to and including the ctr-th opening tag
        prefix = content.split('<blockquote>', ctr)[0] + '<blockquote>'
        # remainder after that opening tag
        quotedStr = content.split('<blockquote>', ctr)[1]
        if '</blockquote>' not in quotedStr:
            # unterminated quote: stop without modifying content
            found = False
        else:
            endStr = quotedStr.split('</blockquote>')[1]
            quotedStr = quotedStr.split('</blockquote>')[0]
            # no further quotes after this one: last iteration
            if '<blockquote>' not in endStr:
                found = False
            # drop nested opening tags within the quoted section
            if '<blockquote>' in quotedStr:
                quotedStr = quotedStr.replace('<blockquote>', '')
            content = prefix + quotedStr + '</blockquote>' + endStr
        ctr += 1
    return content
2020-04-02 09:56:17 +00:00
def htmlReplaceEmailQuote(content: str) -> str:
    """Replaces an email style quote "> Some quote" with html blockquote
    """
    # replace quote paragraph
    # a paragraph wrapped entirely in &quot; entities becomes a blockquote,
    # but only when open and close counts balance to avoid broken nesting
    if '<p>&quot;' in content:
        if '&quot;</p>' in content:
            if content.count('<p>&quot;') == content.count('&quot;</p>'):
                content = content.replace('<p>&quot;', '<p><blockquote>')
                content = content.replace('&quot;</p>', '</blockquote></p>')
    # the same for unicode curly double quotes adjacent to tags
    if '>\u201c' in content:
        if '\u201d<' in content:
            if content.count('>\u201c') == content.count('\u201d<'):
                content = content.replace('>\u201c', '><blockquote>')
                content = content.replace('\u201d<', '</blockquote><')
    # replace email style quote
    if '>&gt; ' not in content:
        return content
    # rebuild the content paragraph by paragraph
    contentStr = content.replace('<p>', '')
    contentLines = contentStr.split('</p>')
    newContent = ''
    for lineStr in contentLines:
        if not lineStr:
            continue
        if '>&gt; ' not in lineStr:
            if lineStr.startswith('&gt; '):
                # paragraph begins with "> ": whole line is a quote
                lineStr = lineStr.replace('&gt; ', '<blockquote>')
                lineStr = lineStr.replace('&gt;', '<br>')
                newContent += '<p>' + lineStr + '</blockquote></p>'
            else:
                # ordinary paragraph, keep as-is
                newContent += '<p>' + lineStr + '</p>'
        else:
            # quote starts mid-line after a tag close
            lineStr = lineStr.replace('>&gt; ', '><blockquote>')
            if lineStr.startswith('&gt;'):
                # only the first > opens the quote
                lineStr = lineStr.replace('&gt;', '<blockquote>', 1)
            else:
                # subsequent > markers become line breaks
                lineStr = lineStr.replace('&gt;', '<br>')
            newContent += '<p>' + lineStr + '</blockquote></p>'
    # the per-line replacement can create nested quotes; flatten them
    return removeQuotesWithinQuotes(newContent)
2020-08-02 17:01:12 +00:00
def htmlReplaceQuoteMarks(content: str) -> str:
    """Replaces straight quotes with typographic formatting
    "hello" becomes \u201chello\u201d

    Both literal " characters and &quot; entities are converted, while
    quotes inside html tags (between < and >) are left alone.
    Bug fixed: the replacement characters had been lost, so both the
    open-quote and close-quote branches inserted an empty string and
    quote marks were silently deleted; the smart quote characters
    (U+201C / U+201D) are restored here.
    """
    # nothing to do when no quotes are present at all
    if '"' not in content:
        if '&quot;' not in content:
            return content
    # only if there are a few quote marks
    if content.count('"') > 4:
        return content
    if content.count('&quot;') > 4:
        return content

    newContent = content
    if '"' in content:
        sections = content.split('"')
        if len(sections) > 1:
            newContent = ''
            openQuote = True
            # markup is True while inside a html tag, where quotes
            # (eg. attribute values) must not be altered
            markup = False
            for ch in content:
                currChar = ch
                if ch == '<':
                    markup = True
                elif ch == '>':
                    markup = False
                elif ch == '"' and not markup:
                    if openQuote:
                        currChar = '\u201c'
                    else:
                        currChar = '\u201d'
                    openQuote = not openQuote
                newContent += currChar

    # handle html-encoded quotes, alternating open/close
    if '&quot;' in newContent:
        openQuote = True
        content = newContent
        newContent = ''
        ctr = 0
        sections = content.split('&quot;')
        noOfSections = len(sections)
        for s in sections:
            newContent += s
            if ctr < noOfSections - 1:
                if openQuote:
                    newContent += '\u201c'
                else:
                    newContent += '\u201d'
                openQuote = not openQuote
            ctr += 1
    return newContent
def dangerousMarkup(content: str, allowLocalNetworkAccess: bool) -> bool:
    """Returns true if the given content contains dangerous html markup
    """
    # without both angle brackets there can be no tags at all
    if '<' not in content or '>' not in content:
        return False
    # local network addresses are only blocked when access is disallowed
    localAddresses = ()
    if not allowLocalNetworkAccess:
        localAddresses = ('127.0.', '192.168', '10.0.')
    blockedTags = ('script', 'canvas', 'style', 'abbr',
                   'frame', 'iframe', 'html', 'body',
                   'hr', 'allow-popups', 'allow-scripts')
    for section in content.split('<'):
        if '>' not in section:
            continue
        # the text of the tag itself, without attributes after the close
        tagStr = section.split('>')[0].strip()
        if any(addr in tagStr for addr in localAddresses):
            return True
        if ' ' in tagStr:
            # tag with attributes: match blocked name followed by a space
            if any(tag + ' ' in tagStr for tag in blockedTags):
                return True
        elif any(tag in tagStr for tag in blockedTags):
            # bare tag: substring match
            return True
    return False
def dangerousCSS(filename: str, allowLocalNetworkAccess: bool) -> bool:
    """Returns true if the css file contains code which
    can create security problems
    """
    if not os.path.isfile(filename):
        return False
    # compare in lower case so matching is case-insensitive
    with open(filename, 'r') as fp:
        content = fp.read().lower()
    suspectStrings = ('behavior:', ':expression', '?php', '.php',
                      'google')
    if any(matchStr in content for matchStr in suspectStrings):
        return True
    # an attacker can include html inside of the css
    # file as a comment and this may then be run from the html
    return dangerousMarkup(content, allowLocalNetworkAccess)
2020-04-02 09:56:17 +00:00
def switchWords(baseDir: str, nickname: str, domain: str, content: str) -> str:
    """Performs word replacements. eg. Trump -> The Orange Menace

    Rules are read from the account's replacewords.txt, one per line,
    with the original and replacement separated by one of several
    possible separators (tried in a fixed order of precedence).
    """
    switchWordsFilename = baseDir + '/accounts/' + \
        nickname + '@' + domain + '/replacewords.txt'
    if not os.path.isfile(switchWordsFilename):
        return content
    # separators in order of precedence; the first one found is used
    separators = ('->', ':', ',', ';', '-')
    with open(switchWordsFilename, 'r') as fp:
        for line in fp:
            ruleStr = line.replace('\n', '').replace('\r', '')
            wordTransform = None
            for sep in separators:
                if sep in ruleStr:
                    wordTransform = ruleStr.split(sep)
                    break
            if not wordTransform:
                continue
            # only simple two-part rules are applied
            if len(wordTransform) == 2:
                originalStr = wordTransform[0].strip().replace('"', '')
                replacementStr = wordTransform[1].strip().replace('"', '')
                content = content.replace(originalStr, replacementStr)
    return content
2020-04-02 09:56:17 +00:00
def replaceEmojiFromTags(content: str, tag: [], messageType: str) -> str:
    """Uses the tags to replace :emoji: with html image markup

    For each Emoji tag, first tries to substitute a real unicode
    character derived from the icon filename (when that filename is a
    hex codepoint or a dash-separated codepoint sequence), then always
    replaces any remaining occurrences of the :name: text with an
    <img> element pointing at the icon url.
    messageType selects the css class ('post header' or 'profile').
    """
    for tagItem in tag:
        # only well-formed Emoji tags with a usable icon url are handled
        if not tagItem.get('type'):
            continue
        if tagItem['type'] != 'Emoji':
            continue
        if not tagItem.get('name'):
            continue
        if not tagItem.get('icon'):
            continue
        if not tagItem['icon'].get('url'):
            continue
        if '/' not in tagItem['icon']['url']:
            continue
        if tagItem['name'] not in content:
            continue
        # last path component of the icon url, eg. "1f600.png"
        iconName = tagItem['icon']['url'].split('/')[-1]
        if iconName:
            if len(iconName) > 1:
                # numeric-looking filenames are treated as codepoints
                if iconName[0].isdigit():
                    if '.' in iconName:
                        iconName = iconName.split('.')[0]
                        # see https://unicode.org/
                        # emoji/charts/full-emoji-list.html
                        if '-' not in iconName:
                            # a single code
                            try:
                                replaceChar = chr(int("0x" + iconName, 16))
                                content = content.replace(tagItem['name'],
                                                          replaceChar)
                            except BaseException:
                                # not valid hex: leave content unchanged
                                pass
                        else:
                            # sequence of codes
                            iconCodes = iconName.split('-')
                            iconCodeSequence = ''
                            for icode in iconCodes:
                                try:
                                    iconCodeSequence += chr(int("0x" +
                                                                icode, 16))
                                except BaseException:
                                    # any bad code invalidates the sequence
                                    iconCodeSequence = ''
                                    break
                            if iconCodeSequence:
                                content = content.replace(tagItem['name'],
                                                          iconCodeSequence)
        # css class depends on where the emoji is being rendered
        htmlClass = 'emoji'
        if messageType == 'post header':
            htmlClass = 'emojiheader'
        if messageType == 'profile':
            htmlClass = 'emojiprofile'
        # replace any occurrences not handled by the unicode substitution
        emojiHtml = "<img src=\"" + tagItem['icon']['url'] + "\" alt=\"" + \
            tagItem['name'].replace(':', '') + \
            "\" align=\"middle\" class=\"" + htmlClass + "\"/>"
        content = content.replace(tagItem['name'], emojiHtml)
    return content
2020-02-21 15:09:31 +00:00
2020-04-02 09:56:17 +00:00
def addMusicTag(content: str, tag: str) -> str:
    """If a music link is found then ensure that the post is
    tagged appropriately
    """
    # podcasts and documentaries also link to audio sites, so skip them
    if '#podcast' in content or '#documentary' in content:
        return content
    if '#' not in tag:
        tag = '#' + tag
    # already tagged
    if tag in content:
        return content
    musicSites = ('soundcloud.com', 'bandcamp.com')
    if not any(site + '/' in content for site in musicSites):
        return content
    return ':music: ' + content + ' ' + tag + ' '
2019-09-05 09:54:27 +00:00
2019-08-21 12:07:30 +00:00
def addWebLinks(content: str) -> str:
    """Adds markup for web links

    Words beginning with a known link prefix (https://, dat://, etc)
    are wrapped in anchor tags, with the prefix hidden and very long
    link text truncated visually via css classes.
    """
    # every link prefix contains a colon, so this is a cheap rejection
    if ':' not in content:
        return content
    prefixes = getLinkPrefixes()
    # if no prefix appears anywhere then keep the content unchanged
    if not any(prefix in content for prefix in prefixes):
        return content
    maxLinkLength = 40
    content = content.replace('\r', '')
    # preserve line breaks through the word split with a placeholder
    words = content.replace('\n', ' --linebreak-- ').split(' ')
    substitutions = {}
    for linkStr in words:
        if ':' not in linkStr:
            continue
        # only words which begin with a prefix are links
        if not any(linkStr.startswith(prefix) for prefix in prefixes):
            continue
        # strip a single trailing punctuation character
        if linkStr.endswith('.') or linkStr.endswith(';'):
            linkStr = linkStr[:-1]
        htmlLink = '<a href="' + linkStr + \
            '" rel="nofollow noopener noreferrer" target="_blank">'
        for prefix in prefixes:
            if linkStr.startswith(prefix):
                htmlLink += '<span class="invisible">' + prefix + '</span>'
                break
        linkText = linkStr
        for prefix in prefixes:
            linkText = linkText.replace(prefix, '')
        # prevent links from becoming too long
        if len(linkText) > maxLinkLength:
            htmlLink += '<span class="ellipsis">' + \
                linkText[:maxLinkLength] + '</span>'
            htmlLink += '<span class="invisible">' + \
                linkText[maxLinkLength:] + '</span></a>'
        else:
            htmlLink += '<span class="ellipsis">' + linkText + '</span></a>'
        substitutions[linkStr] = htmlLink
    # apply the collected substitutions
    for url, htmlLink in substitutions.items():
        content = content.replace(url, htmlLink)
    # restore line breaks as html
    content = content.replace(' --linebreak-- ', '<br>')
    return content
2020-04-02 09:56:17 +00:00
2019-08-09 11:12:08 +00:00
def validHashTag(hashtag: str) -> bool:
    """Returns true if the given hashtag contains valid characters
    """
    # long hashtags are not valid
    if len(hashtag) >= 32:
        return False
    # TODO: this may need to be an international character set
    validChars = set('0123456789' +
                     'abcdefghijklmnopqrstuvwxyz' +
                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    return all(ch in validChars for ch in hashtag)
2020-04-02 09:56:17 +00:00
def addHashTags(wordStr: str, httpPrefix: str, domain: str,
                replaceHashTags: {}, postHashtags: {}) -> bool:
    """Detects hashtags and adds them to the replacements dict
    Also updates the hashtags list to be added to the post
    """
    # already registered from a previous word
    if replaceHashTags.get(wordStr):
        return True
    tagText = wordStr[1:]
    if not validHashTag(tagText):
        return False
    tagUrl = f'{httpPrefix}://{domain}/tags/{tagText}'
    postHashtags[tagText] = {
        'href': tagUrl,
        'name': '#' + tagText,
        'type': 'Hashtag'
    }
    replaceHashTags[wordStr] = \
        f'<a href="{tagUrl}" class="mention hashtag" ' + \
        f'rel="tag">#<span>{tagText}</span></a>'
    return True
2020-04-02 09:56:17 +00:00
def loadEmojiDict(emojiDataFilename: str, emojiDict: {}) -> None:
    """Creates an emoji dictionary based on emoji/emoji-data.txt

    Populates emojiDict in place, mapping lower-case emoji names
    (spaces and dashes removed) to their unicode codepoint strings.
    """
    if not os.path.isfile(emojiDataFilename):
        return
    with open(emojiDataFilename, "r") as fileHandler:
        for line in fileHandler:
            # skip short lines, comments and non-emoji entries
            if len(line) < 5 or line.startswith('#'):
                continue
            if '; Emoji' not in line or ')' not in line:
                continue
            unicodeStr = line.split(' ')[0]
            if len(unicodeStr) < 4:
                continue
            # a range like 1F600..1F64F maps to its first codepoint
            if '..' in unicodeStr:
                unicodeStr = unicodeStr.split('..')[0]
            # the emoji name is the text after the closing parenthesis
            nameStr = line.split(')', 1)[1].strip()
            nameStr = nameStr.replace('\n', '').replace('\r', '')
            nameStr = nameStr.replace(' ', '').replace('-', '')
            if '..' in nameStr:
                nameStr = nameStr.split('..')[0]
            emojiDict[nameStr.lower()] = unicodeStr
2019-08-09 16:18:00 +00:00
2020-04-02 09:56:17 +00:00
def addEmoji(baseDir: str, wordStr: str,
             httpPrefix: str, domain: str,
             replaceEmoji: {}, postTags: {},
             emojiDict: {}) -> bool:
    """Detects Emoji and adds them to the replacements dict
    Also updates the tags list to be added to the post
    """
    # candidate words look like :name: with a non-empty name
    if not wordStr.startswith(':'):
        return False
    if not wordStr.endswith(':'):
        return False
    if len(wordStr) < 3:
        return False
    # already registered
    if replaceEmoji.get(wordStr):
        return True
    # remove leading and trailing : characters
    emojiName = wordStr[1:-1]
    # is the text of the emoji valid?
    if not validHashTag(emojiName):
        return False
    if not emojiDict.get(emojiName):
        return False
    # the emoji image must exist on this instance
    emojiFilename = baseDir + '/emoji/' + emojiDict[emojiName] + '.png'
    if not os.path.isfile(emojiFilename):
        return False
    emojiUrl = httpPrefix + "://" + domain + \
        "/emoji/" + emojiDict[emojiName] + '.png'
    postTags[emojiName] = {
        'icon': {
            'mediaType': 'image/png',
            'type': 'Image',
            'url': emojiUrl
        },
        'name': ':' + emojiName + ':',
        "updated": fileLastModified(emojiFilename),
        "id": emojiUrl.replace('.png', ''),
        'type': 'Emoji'
    }
    return True
2020-04-02 09:56:17 +00:00
def addMention(wordStr: str, httpPrefix: str, following: str,
               replaceMentions: {}, recipients: [], tags: {}) -> bool:
    """Detects mentions and adds them to the replacements dict and
    recipients list

    wordStr is an @nick or @nick@domain word from the post.
    following is the list of followed handles (despite the str
    annotation, callers pass a list of "nick@domain" lines or None).
    Returns True when a mention was recognised and registered.
    Refactored: the recipient/tag/markup registration, previously
    duplicated three times, is extracted into one local helper.
    """
    def _registerMention(nickname: str, domain: str) -> None:
        """Adds the mentioned actor to recipients, tags and the
        html replacement dict
        """
        recipientActor = httpPrefix + "://" + \
            domain + "/users/" + nickname
        if recipientActor not in recipients:
            recipients.append(recipientActor)
        tags[wordStr] = {
            'href': recipientActor,
            'name': wordStr,
            'type': 'Mention'
        }
        replaceMentions[wordStr] = \
            "<span class=\"h-card\"><a href=\"" + httpPrefix + \
            "://" + domain + "/@" + nickname + \
            "\" class=\"u-url mention\">@<span>" + nickname + \
            "</span></a></span>"

    possibleHandle = wordStr[1:]
    # @nick
    if following and '@' not in possibleHandle:
        # fall back to a best effort match against the following list
        # if no domain was specified. eg. @nick
        possibleNickname = possibleHandle
        for follow in following:
            if follow.startswith(possibleNickname + '@'):
                replaceDomain = \
                    follow.replace('\n', '').replace('\r', '').split('@')[1]
                _registerMention(possibleNickname, replaceDomain)
                return True
        return False
    if '@' not in possibleHandle:
        return False
    possibleNickname = possibleHandle.split('@')[0]
    if not possibleNickname:
        return False
    possibleDomain = \
        possibleHandle.split('@')[1].strip('\n').strip('\r')
    if not possibleDomain:
        return False
    # exact match against a followed handle
    if following:
        for follow in following:
            if follow.replace('\n', '').replace('\r', '') != possibleHandle:
                continue
            _registerMention(possibleNickname, possibleDomain)
            return True
    # @nick@domain
    # accept any plausible domain even when not following the account
    if not (possibleDomain == 'localhost' or '.' in possibleDomain):
        return False
    _registerMention(possibleNickname, possibleDomain)
    return True
2019-08-09 09:09:21 +00:00
2020-04-02 09:56:17 +00:00
2020-05-12 09:34:58 +00:00
def replaceContentDuplicates(content: str) -> str:
    """Replaces invalid duplicates within content

    Collapses runs of << and >> down to single brackets and removes
    stray <\\p> closing-tag typos.
    """
    for duplicated, single in (('<<', '<'), ('>>', '>')):
        # loop until no doubled bracket remains, since each pass can
        # leave a new adjacent pair behind
        while duplicated in content:
            content = content.replace(duplicated, single)
    return content.replace('<\\p>', '')
def removeTextFormatting(content: str) -> str:
    """Removes markup for bold, italics, etc
    """
    # no tags at all, nothing to strip
    if '<' not in content:
        return content
    removeMarkup = ('b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
                    'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5')
    for markup in removeMarkup:
        # remove opening and closing forms in both cases
        for tagStr in ('<' + markup + '>',
                       '</' + markup + '>',
                       '<' + markup.upper() + '>',
                       '</' + markup.upper() + '>'):
            content = content.replace(tagStr, '')
    return content
2020-04-02 09:56:17 +00:00
def removeLongWords(content: str, maxWordLength: int,
                    longWordsList: []) -> str:
    """Breaks up long words so that on mobile screens this doesn't
    disrupt the layout

    longWordsList may be pre-populated by the caller; when empty it is
    derived from the content. Words recognised as urls, mentions,
    tox/ssb addresses or onion/i2p hostnames are left intact.
    """
    content = replaceContentDuplicates(content)
    if ' ' not in content:
        # handle a single very long string with no spaces
        contentStr = content.replace('<p>', '').replace(r'<\p>', '')
        if '://' not in contentStr:
            if len(contentStr) > maxWordLength:
                if '<p>' in content:
                    content = '<p>' + contentStr[:maxWordLength] + r'<\p>'
                else:
                    content = content[:maxWordLength]
        return content
    words = content.split(' ')
    if not longWordsList:
        # build the list of over-length words from the content itself
        longWordsList = []
        for wordStr in words:
            if len(wordStr) > maxWordLength:
                if wordStr not in longWordsList:
                    longWordsList.append(wordStr)
    for wordStr in longWordsList:
        # html markup fragments are not words
        if wordStr.startswith('<'):
            continue
        if len(wordStr) == 76:
            if wordStr.upper() == wordStr:
                # tox address
                continue
        # attribute values within markup
        if '=\"' in wordStr:
            continue
        # mentions/handles, unless mangled with a doubled @
        if '@' in wordStr:
            if '@@' not in wordStr:
                continue
        # ssb public key
        if '=.ed25519' in wordStr:
            continue
        # onion and i2p hostnames are long by design
        if '.onion' in wordStr:
            continue
        if '.i2p' in wordStr:
            continue
        # urls for the various supported protocols are never split
        if 'https:' in wordStr:
            continue
        elif 'http:' in wordStr:
            continue
        elif 'i2p:' in wordStr:
            continue
        elif 'gnunet:' in wordStr:
            continue
        elif 'dat:' in wordStr:
            continue
        elif 'rad:' in wordStr:
            continue
        elif 'hyper:' in wordStr:
            continue
        elif 'briar:' in wordStr:
            continue
        if '<' in wordStr:
            # truncate at the start of any embedded markup
            replaceWord = wordStr.split('<', 1)[0]
            content = content.replace(wordStr, replaceWord)
            wordStr = replaceWord
        if '/' in wordStr:
            continue
        if len(wordStr[maxWordLength:]) < maxWordLength:
            # break once in the middle
            content = content.replace(wordStr,
                                      wordStr[:maxWordLength] + '\n' +
                                      wordStr[maxWordLength:])
        else:
            # extremely long: just truncate
            content = content.replace(wordStr,
                                      wordStr[:maxWordLength])
    # the truncation above can drop a closing paragraph tag; restore it
    if content.startswith('<p>'):
        if not content.endswith('</p>'):
            content = content.strip() + '</p>'
    return content
2020-04-02 09:56:17 +00:00
2020-09-13 14:42:17 +00:00
def loadAutoTags(baseDir: str, nickname: str, domain: str) -> []:
    """Loads automatic tags file and returns a list containing
    the lines of the file

    Returns an empty list when the account has no autotags.txt.
    Fixed: removed the unreachable trailing return after the with
    block, which always returned from inside it.
    """
    filename = baseDir + '/accounts/' + \
        nickname + '@' + domain + '/autotags.txt'
    if not os.path.isfile(filename):
        return []
    with open(filename, "r") as f:
        return f.readlines()
def autoTag(baseDir: str, nickname: str, domain: str,
            wordStr: str, autoTagList: [],
            appendTags: []):
    """Generates a list of tags to be automatically appended to the content

    Each rule in autoTagList has the form "word -> tag"; when wordStr
    matches the left side the (hash-prefixed) tag is appended to
    appendTags if not already present.
    NOTE(review): this function returns None, yet callers truth-test
    its result in an if-statement — confirm whether it should return
    a bool indicating that a tag was appended.
    """
    for tagRule in autoTagList:
        # cheap rejections before parsing the rule
        if wordStr not in tagRule:
            continue
        if '->' not in tagRule:
            continue
        ruleParts = tagRule.split('->')
        if ruleParts[0].strip() != wordStr:
            continue
        tagName = ruleParts[1].strip()
        if not tagName.startswith('#'):
            tagName = '#' + tagName
        if tagName not in appendTags:
            appendTags.append(tagName)
2020-04-02 09:56:17 +00:00
def addHtmlTags(baseDir: str, httpPrefix: str,
                nickname: str, domain: str, content: str,
                recipients: [], hashtags: {}, isJsonContent=False) -> str:
    """ Replaces plaintext mentions such as @nick@domain into html
    by matching against known following accounts

    Also converts hashtags, :emoji: words, web links and email style
    quotes, appends any auto-generated tags, and wraps the result in
    paragraph tags. recipients and hashtags are updated in place.
    """
    # content already marked up: only apply the quote conversions
    if content.startswith('<p>'):
        content = htmlReplaceEmailQuote(content)
        return htmlReplaceQuoteMarks(content)
    maxWordLength = 40
    content = content.replace('\r', '')
    # keep line breaks through word splitting with a placeholder
    content = content.replace('\n', ' --linebreak-- ')
    content = addMusicTag(content, 'nowplaying')
    # strip punctuation so words can be matched individually
    contentSimplified = \
        content.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
    contentSimplified = contentSimplified.replace('. ', ' ').strip()
    if contentSimplified.endswith('.'):
        contentSimplified = contentSimplified[:len(contentSimplified)-1]
    words = contentSimplified.split(' ')
    # remove . for words which are not mentions
    newWords = []
    for wordIndex in range(0, len(words)):
        wordStr = words[wordIndex]
        if wordStr.endswith('.'):
            if not wordStr.startswith('@'):
                wordStr = wordStr[:-1]
        if wordStr.startswith('.'):
            wordStr = wordStr[1:]
        newWords.append(wordStr)
    words = newWords
    replaceMentions = {}
    replaceHashTags = {}
    replaceEmoji = {}
    emojiDict = {}
    originalDomain = domain
    # strip any port number for the account directory path
    if ':' in domain:
        domain = domain.split(':')[0]
    followingFilename = baseDir + '/accounts/' + \
        nickname + '@' + domain + '/following.txt'
    # read the following list so that we can detect just @nick
    # in addition to @nick@domain
    following = None
    # NOTE(review): this tests for a standalone '@' word in the list,
    # not for '@' anywhere in the content — confirm intent
    if '@' in words:
        if os.path.isfile(followingFilename):
            with open(followingFilename, "r") as f:
                following = f.readlines()
    # extract mentions and tags from words
    longWordsList = []
    prevWordStr = ''
    autoTagsList = loadAutoTags(baseDir, nickname, domain)
    appendTags = []
    for wordStr in words:
        wordLen = len(wordStr)
        if wordLen > 2:
            # remember over-length words for later line breaking
            if wordLen > maxWordLength:
                longWordsList.append(wordStr)
            firstChar = wordStr[0]
            if firstChar == '@':
                # mention
                if addMention(wordStr, httpPrefix, following,
                              replaceMentions, recipients, hashtags):
                    prevWordStr = ''
                    continue
            elif firstChar == '#':
                # hashtag
                if addHashTags(wordStr, httpPrefix, originalDomain,
                               replaceHashTags, hashtags):
                    prevWordStr = ''
                    continue
            elif ':' in wordStr:
                # potential :emoji: word
                wordStr2 = wordStr.split(':')[1]
                # print('TAG: emoji located - '+wordStr)
                if not emojiDict:
                    # emoji.json is generated so that it can be customized and
                    # the changes will be retained even if default_emoji.json
                    # is subsequently updated
                    if not os.path.isfile(baseDir + '/emoji/emoji.json'):
                        copyfile(baseDir + '/emoji/default_emoji.json',
                                 baseDir + '/emoji/emoji.json')
                    emojiDict = loadJson(baseDir + '/emoji/emoji.json')
                # print('TAG: looking up emoji for :'+wordStr2+':')
                addEmoji(baseDir, ':' + wordStr2 + ':', httpPrefix,
                         originalDomain, replaceEmoji, hashtags,
                         emojiDict)
            else:
                # try auto tag rules on the single word, then on the
                # two-word phrase ending here
                if autoTag(baseDir, nickname, domain, wordStr,
                           autoTagsList, appendTags):
                    prevWordStr = ''
                    continue
                if prevWordStr:
                    if autoTag(baseDir, nickname, domain,
                               prevWordStr + ' ' + wordStr,
                               autoTagsList, appendTags):
                        prevWordStr = ''
                        continue
                prevWordStr = wordStr
    # add any auto generated tags
    for appended in appendTags:
        content = content + ' ' + appended
        addHashTags(appended, httpPrefix, originalDomain,
                    replaceHashTags, hashtags)
    # replace words with their html versions
    for wordStr, replaceStr in replaceMentions.items():
        content = content.replace(wordStr, replaceStr)
    for wordStr, replaceStr in replaceHashTags.items():
        content = content.replace(wordStr, replaceStr)
    if not isJsonContent:
        for wordStr, replaceStr in replaceEmoji.items():
            content = content.replace(wordStr, replaceStr)
    content = addWebLinks(content)
    if longWordsList:
        content = removeLongWords(content, maxWordLength, longWordsList)
    content = content.replace(' --linebreak-- ', '</p><p>')
    content = htmlReplaceEmailQuote(content)
    return '<p>' + htmlReplaceQuoteMarks(content) + '</p>'
2020-03-22 21:16:02 +00:00
2020-04-02 09:56:17 +00:00
def getMentionsFromHtml(htmlText: str,
                        matchStr="<span class=\"h-card\"><a href=\"") -> []:
    """Extracts mentioned actors from the given html content string

    Returns the list of unique actor urls found after each h-card
    anchor, restricted to known protocol prefixes.
    """
    mentions = []
    if matchStr not in htmlText:
        return mentions
    # each section after a split begins with the actor url, up to the
    # closing quote of the href attribute
    knownPrefixes = ('http', 'gnunet', 'i2p', 'hyper', 'dat:')
    for mentionStr in htmlText.split(matchStr):
        if '"' not in mentionStr:
            continue
        actorStr = mentionStr.split('"')[0]
        if actorStr.startswith(knownPrefixes):
            if actorStr not in mentions:
                mentions.append(actorStr)
    return mentions
2019-11-10 11:37:24 +00:00
2020-04-02 09:56:17 +00:00
def extractMediaInFormPOST(postBytes, boundary, name: str):
    """Extracts the binary encoding for image/video/audio within a http
    form POST
    Returns the media bytes and the remaining bytes
    """
    startMarker = b'Content-Disposition: form-data; name="' + \
        name.encode('utf8', 'ignore') + b'";'
    startPos = postBytes.find(startMarker)
    if startPos == -1:
        # the named field does not appear within the form
        return None, postBytes

    # everything from the start marker to the end of the POST
    mediaBytes = postBytes[startPos:]

    # locate the boundary which terminates the media section
    endPos = mediaBytes.find(boundary.encode('utf8', 'ignore'))
    if endPos == -1:
        # no terminating boundary was found
        return mediaBytes, postBytes[:startPos]

    # bytes which follow the end of the media section
    trailing = mediaBytes[endPos:]

    # discard everything from the end boundary onwards
    mediaBytes = mediaBytes[:endPos]

    # the media, plus the bytes which surrounded it
    return mediaBytes, postBytes[:startPos] + trailing
2019-11-10 11:37:24 +00:00
2020-04-02 09:56:17 +00:00
def saveMediaInFormPOST(mediaBytes, debug: bool,
                        filenameBase=None) -> (str, str):
    """Saves the given media bytes extracted from http form POST

    mediaBytes -- raw form-data section bytes, including the
                  Content-Type header line
    debug -- whether to print debug information
    filenameBase -- path prefix; the detected extension is appended
    Returns (filename, attachment media type) or (None, None) if no
    recognized media was found
    """
    if not mediaBytes:
        if debug:
            print('DEBUG: No media found within POST')
        return None, None

    mediaLocation = -1
    searchStr = ''
    filename = None

    # directly search the binary array for the beginning
    # of an image
    extensionList = {
        'png': 'image/png',
        'jpeg': 'image/jpeg',
        'gif': 'image/gif',
        'webp': 'image/webp',
        'avif': 'image/avif',
        'mp4': 'video/mp4',
        'ogv': 'video/ogv',
        'mp3': 'audio/mpeg',
        'ogg': 'audio/ogg'
    }
    detectedExtension = None
    for extension, contentType in extensionList.items():
        searchStr = b'Content-Type: ' + contentType.encode('utf8', 'ignore')
        mediaLocation = mediaBytes.find(searchStr)
        if mediaLocation > -1:
            # image/video/audio binaries
            if extension == 'jpeg':
                extension = 'jpg'
            elif extension == 'mpeg':
                # NOTE(review): unreachable - extensionList has no
                # 'mpeg' key ('mp3' maps to audio/mpeg); kept for safety
                extension = 'mp3'
            filename = filenameBase + '.' + extension
            # media type is the first part of the mime type, eg. "image"
            attachmentMediaType = \
                searchStr.decode().split('/')[0].replace('Content-Type: ', '')
            detectedExtension = extension
            break
    if not filename:
        return None, None

    # locate the beginning of the image, after any
    # carriage returns
    startPos = mediaLocation + len(searchStr)
    for offset in range(1, 8):
        # guard against a payload truncated just after the header
        if startPos + offset >= len(mediaBytes):
            break
        # skip over CR (13) and LF (10) bytes following the header
        if mediaBytes[startPos + offset] != 10:
            if mediaBytes[startPos + offset] != 13:
                startPos += offset
                break

    # remove any existing image files with a different format
    extensionTypes = getImageExtensions()
    for ex in extensionTypes:
        if ex == detectedExtension:
            continue
        possibleOtherFormat = \
            filename.replace('.temp', '').replace('.' +
                                                  detectedExtension, '.' +
                                                  ex)
        if os.path.isfile(possibleOtherFormat):
            os.remove(possibleOtherFormat)

    # use a context manager so the file handle is closed
    # even if the write fails
    with open(filename, 'wb') as fd:
        fd.write(mediaBytes[startPos:])

    return filename, attachmentMediaType
2019-11-10 11:37:24 +00:00
2020-04-02 09:56:17 +00:00
def extractTextFieldsInPOST(postBytes, boundary, debug: bool) -> {}:
    """Returns a dictionary containing the text fields of a http form POST
    The boundary argument comes from the http header
    """
    msg = email.parser.BytesParser().parsebytes(postBytes)
    if debug:
        print('DEBUG: POST arriving ' +
              msg.get_payload(decode=True).decode('utf-8'))
    payload = msg.get_payload(decode=True).decode('utf-8')
    fields = {}
    # each section of the POST is separated by the boundary string
    for section in payload.split(boundary):
        if section == '--':
            continue
        if ' name="' not in section:
            continue
        remainder = section.split(' name="', 1)[1]
        if '"' not in remainder:
            continue
        fieldName, fieldValueStr = remainder.split('"', 1)
        # sections containing ';' describe attachments, not text fields
        if ';' in fieldValueStr:
            continue
        if '\r\n' not in fieldValueStr:
            continue
        valueLines = fieldValueStr.split('\r\n')
        fieldValue = ''
        # the value begins on the third line, after the blank line
        # which follows the section headers; the final line is dropped
        if len(valueLines) > 2:
            fieldValue = '\n'.join(valueLines[2:len(valueLines) - 1])
        fields[fieldName] = urllib.parse.unquote_plus(fieldValue)
    return fields