__filename__ = "content.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.1.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
import os
import email.parser
from shutil import copyfile
from utils import loadJson
from utils import fileLastModified
from utils import getLinkPrefixes
def dangerousMarkup(content: str) -> bool:
"""Returns true if the given content contains dangerous html markup
"""
if '<' not in content:
return False
if '>' not in content:
return False
contentSections = content.split('<')
invalidStrings = ('script', 'canvas', 'style', 'abbr',
'frame', 'iframe', 'html', 'body',
'hr')
for markup in contentSections:
if '>' not in markup:
continue
markup = markup.split('>')[0].strip()
if ' ' not in markup:
for badStr in invalidStrings:
if badStr in markup:
return True
else:
for badStr in invalidStrings:
if badStr + ' ' in markup:
return True
return False
def switchWords(baseDir: str, nickname: str, domain: str, content: str) -> str:
"""Performs word replacements. eg. Trump -> The Orange Menace
"""
switchWordsFilename = baseDir + '/accounts/' + \
nickname + '@' + domain + '/replacewords.txt'
if not os.path.isfile(switchWordsFilename):
return content
with open(switchWordsFilename, 'r') as fp:
for line in fp:
replaceStr = line.replace('\n', '').replace('\r', '')
wordTransform = None
if '->' in replaceStr:
wordTransform = replaceStr.split('->')
elif ':' in replaceStr:
wordTransform = replaceStr.split(':')
elif ',' in replaceStr:
wordTransform = replaceStr.split(',')
elif ';' in replaceStr:
wordTransform = replaceStr.split(';')
elif '-' in replaceStr:
wordTransform = replaceStr.split('-')
if not wordTransform:
continue
if len(wordTransform) == 2:
replaceStr1 = wordTransform[0].strip().replace('"', '')
replaceStr2 = wordTransform[1].strip().replace('"', '')
content = content.replace(replaceStr1, replaceStr2)
return content
def replaceEmojiFromTags(content: str, tag: [], messageType: str) -> str:
"""Uses the tags to replace :emoji: with html image markup
"""
for tagItem in tag:
if not tagItem.get('type'):
continue
if tagItem['type'] != 'Emoji':
continue
if not tagItem.get('name'):
continue
if not tagItem.get('icon'):
continue
if not tagItem['icon'].get('url'):
continue
if '/' not in tagItem['icon']['url']:
continue
if tagItem['name'] not in content:
continue
iconName = tagItem['icon']['url'].split('/')[-1]
if iconName:
if len(iconName) > 1:
if iconName[0].isdigit():
if '.' in iconName:
iconName = iconName.split('.')[0]
# see https://unicode.org/
# emoji/charts/full-emoji-list.html
if '-' not in iconName:
# a single code
try:
replaceChar = chr(int("0x" + iconName, 16))
content = content.replace(tagItem['name'],
replaceChar)
except BaseException:
pass
else:
# sequence of codes
iconCodes = iconName.split('-')
iconCodeSequence = ''
for icode in iconCodes:
try:
iconCodeSequence += chr(int("0x" +
icode, 16))
except BaseException:
iconCodeSequence = ''
break
if iconCodeSequence:
content = content.replace(tagItem['name'],
iconCodeSequence)
htmlClass = 'emoji'
if messageType == 'post header':
htmlClass = 'emojiheader'
if messageType == 'profile':
htmlClass = 'emojiprofile'
emojiHtml = ""
content = content.replace(tagItem['name'], emojiHtml)
return content
def addMusicTag(content: str, tag: str) -> str:
"""If a music link is found then ensure that the post is
tagged appropriately
"""
if '#' not in tag:
tag = '#'+tag
if tag in content:
return content
musicSites = ('soundcloud.com', 'bandcamp.com')
musicSiteFound = False
for site in musicSites:
if site+'/' in content:
musicSiteFound = True
break
if not musicSiteFound:
return content
return ':music: ' + content + ' ' + tag + ' '
def addWebLinks(content: str) -> str:
"""Adds markup for web links
"""
if ':' not in content:
return content
prefixes = getLinkPrefixes()
# do any of these prefixes exist within the content?
prefixFound = False
for prefix in prefixes:
if prefix in content:
prefixFound = True
break
# if there are no prefixes then just keep the content we have
if not prefixFound:
return content
maxLinkLength = 40
content = content.replace('\r', '')
words = content.replace('\n', ' --linebreak-- ').split(' ')
replaceDict = {}
for w in words:
if ':' not in w:
continue
# does the word begin with a prefix?
prefixFound = False
for prefix in prefixes:
if w.startswith(prefix):
prefixFound = True
break
if not prefixFound:
continue
# the word contains a prefix
if w.endswith('.') or w.endswith(';'):
w = w[:-1]
markup = ''
for prefix in prefixes:
if w.startswith(prefix):
markup += '' + prefix + ''
break
linkText = w
for prefix in prefixes:
linkText = linkText.replace(prefix, '')
# prevent links from becoming too long
if len(linkText) > maxLinkLength:
markup += '' + \
linkText[:maxLinkLength] + ''
markup += '' + \
linkText[maxLinkLength:] + ''
else:
markup += '' + linkText + ''
replaceDict[w] = markup
# do the replacements
for url, markup in replaceDict.items():
content = content.replace(url, markup)
# replace any line breaks
content = content.replace(' --linebreak-- ', '
')
return content
def validHashTag(hashtag: str) -> bool:
"""Returns true if the give hashtag contains valid characters
"""
validChars = set('0123456789' +
'abcdefghijklmnopqrstuvwxyz' +
'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
if set(hashtag).issubset(validChars):
return True
return False
def addHashTags(wordStr: str, httpPrefix: str, domain: str,
replaceHashTags: {}, postHashtags: {}) -> bool:
"""Detects hashtags and adds them to the replacements dict
Also updates the hashtags list to be added to the post
"""
if replaceHashTags.get(wordStr):
return True
hashtag = wordStr[1:]
if not validHashTag(hashtag):
return False
hashtagUrl = httpPrefix + "://" + domain + "/tags/" + hashtag
postHashtags[hashtag] = {
'href': hashtagUrl,
'name': '#'+hashtag,
'type': 'Hashtag'
}
replaceHashTags[wordStr] = "#" + \
hashtag + ""
return True
def loadEmojiDict(emojiDataFilename: str, emojiDict: {}) -> None:
"""Creates an emoji dictionary based on emoji/emoji-data.txt
"""
if not os.path.isfile(emojiDataFilename):
return
with open(emojiDataFilename, "r") as fileHandler:
for line in fileHandler:
if len(line) < 5:
continue
if line.startswith('#'):
continue
if '; Emoji' not in line:
continue
if ')' not in line:
continue
emojiUnicode = line.split(' ')[0]
if len(emojiUnicode) < 4:
continue
if '..' in emojiUnicode:
emojiUnicode = emojiUnicode.split('..')[0]
emojiName = line.split(')', 1)[1].strip()
emojiName = emojiName.replace('\n', '').replace('\r', '')
emojiName = emojiName.replace(' ', '').replace('-', '')
if '..' in emojiName:
emojiName = emojiName.split('..')[0]
emojiDict[emojiName.lower()] = emojiUnicode
def addEmoji(baseDir: str, wordStr: str,
httpPrefix: str, domain: str,
replaceEmoji: {}, postTags: {},
emojiDict: {}) -> bool:
"""Detects Emoji and adds them to the replacements dict
Also updates the tags list to be added to the post
"""
if not wordStr.startswith(':'):
return False
if not wordStr.endswith(':'):
return False
if len(wordStr) < 3:
return False
if replaceEmoji.get(wordStr):
return True
# remove leading and trailing : characters
emoji = wordStr[1:]
emoji = emoji[:-1]
# is the text of the emoji valid?
if not validHashTag(emoji):
return False
if not emojiDict.get(emoji):
return False
emojiFilename = baseDir + '/emoji/' + emojiDict[emoji] + '.png'
if not os.path.isfile(emojiFilename):
return False
emojiUrl = httpPrefix + "://" + domain + \
"/emoji/" + emojiDict[emoji] + '.png'
postTags[emoji] = {
'icon': {
'mediaType': 'image/png',
'type': 'Image',
'url': emojiUrl
},
'name': ':'+emoji+':',
"updated": fileLastModified(emojiFilename),
"id": emojiUrl.replace('.png', ''),
'type': 'Emoji'
}
return True
def addMention(wordStr: str, httpPrefix: str, following: str,
replaceMentions: {}, recipients: [], tags: {}) -> bool:
"""Detects mentions and adds them to the replacements dict and
recipients list
"""
possibleHandle = wordStr[1:]
# @nick
if following and '@' not in possibleHandle:
# fall back to a best effort match against the following list
# if no domain was specified. eg. @nick
possibleNickname = possibleHandle
for follow in following:
if follow.startswith(possibleNickname + '@'):
replaceDomain = \
follow.replace('\n', '').replace('\r', '').split('@')[1]
recipientActor = httpPrefix + "://" + \
replaceDomain + "/users/" + possibleNickname
if recipientActor not in recipients:
recipients.append(recipientActor)
tags[wordStr] = {
'href': recipientActor,
'name': wordStr,
'type': 'Mention'
}
replaceMentions[wordStr] = \
"@" + possibleNickname + \
""
return True
return False
possibleNickname = None
possibleDomain = None
if '@' not in possibleHandle:
return False
possibleNickname = possibleHandle.split('@')[0]
if not possibleNickname:
return False
possibleDomain = \
possibleHandle.split('@')[1].strip('\n').strip('\r')
if not possibleDomain:
return False
if following:
for follow in following:
if follow.replace('\n', '').replace('\r', '') != possibleHandle:
continue
recipientActor = httpPrefix + "://" + \
possibleDomain + "/users/" + possibleNickname
if recipientActor not in recipients:
recipients.append(recipientActor)
tags[wordStr] = {
'href': recipientActor,
'name': wordStr,
'type': 'Mention'
}
replaceMentions[wordStr] = \
"@" + possibleNickname + \
""
return True
# @nick@domain
if not (possibleDomain == 'localhost' or '.' in possibleDomain):
return False
recipientActor = httpPrefix + "://" + \
possibleDomain + "/users/" + possibleNickname
if recipientActor not in recipients:
recipients.append(recipientActor)
tags[wordStr] = {
'href': recipientActor,
'name': wordStr,
'type': 'Mention'
}
replaceMentions[wordStr] = \
"@" + possibleNickname + \
""
return True
def replaceContentDuplicates(content: str) -> str:
"""Replaces invalid duplicates within content
"""
while '<<' in content:
content = content.replace('<<', '<')
while '>>' in content:
content = content.replace('>>', '>')
content = content.replace('<\\p>', '')
return content
def removeTextFormatting(content: str) -> str:
"""Removes markup for bold, italics, etc
"""
if '<' not in content:
return content
removeMarkup = ('b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5')
for markup in removeMarkup:
content = content.replace('<' + markup + '>', '')
content = content.replace('' + markup + '>', '')
content = content.replace('<' + markup.upper() + '>', '')
content = content.replace('' + markup.upper() + '>', '')
return content
def removeHtml(content: str) -> str:
"""Removes html links from the given content.
Used to ensure that profile descriptions don't contain dubious content
"""
if '<' not in content:
return content
removing = False
result = ''
for ch in content:
if ch == '<':
removing = True
elif ch == '>':
removing = False
elif not removing:
result += ch
return result
def removeLongWords(content: str, maxWordLength: int,
longWordsList: []) -> str:
"""Breaks up long words so that on mobile screens this doesn't
disrupt the layout
"""
content = replaceContentDuplicates(content)
if ' ' not in content:
# handle a single very long string with no spaces
contentStr = content.replace('
', '').replace(r'<\p>', '') if '://' not in contentStr: if len(contentStr) > maxWordLength: if '
' in content: content = '
' + contentStr[:maxWordLength] + r'<\p>' else: content = content[:maxWordLength] return content words = content.split(' ') if not longWordsList: longWordsList = [] for wordStr in words: if len(wordStr) > maxWordLength: if wordStr not in longWordsList: longWordsList.append(wordStr) for wordStr in longWordsList: if wordStr.startswith('<'): continue if len(wordStr) == 76: if wordStr.upper() == wordStr: # tox address continue if '=\"' in wordStr: continue if '@' in wordStr: if '@@' not in wordStr: continue if '=.ed25519' in wordStr: continue if '.onion' in wordStr: continue if '.i2p' in wordStr: continue if 'https:' in wordStr: continue elif 'http:' in wordStr: continue elif 'i2p:' in wordStr: continue elif 'gnunet:' in wordStr: continue elif 'dat:' in wordStr: continue elif 'hyper:' in wordStr: continue elif 'briar:' in wordStr: continue if '<' in wordStr: replaceWord = wordStr.split('<', 1)[0] content = content.replace(wordStr, replaceWord) wordStr = replaceWord if '/' in wordStr: continue if len(wordStr[maxWordLength:]) < maxWordLength: content = content.replace(wordStr, wordStr[:maxWordLength] + '\n' + wordStr[maxWordLength:]) else: content = content.replace(wordStr, wordStr[:maxWordLength]) if content.startswith('
'): if not content.endswith('
'): content = content.strip()+'' return content def addHtmlTags(baseDir: str, httpPrefix: str, nickname: str, domain: str, content: str, recipients: [], hashtags: {}, isJsonContent=False) -> str: """ Replaces plaintext mentions such as @nick@domain into html by matching against known following accounts """ if content.startswith(''): return content maxWordLength = 40 content = content.replace('\r', '') content = content.replace('\n', ' --linebreak-- ') content = addMusicTag(content, 'nowplaying') words = content.replace(',', ' ').replace(';', ' ').split(' ') # remove . for words which are not mentions newWords = [] for wordIndex in range(0, len(words)): wordStr = words[wordIndex] if wordStr.endswith('.'): if not wordStr.startswith('@'): wordStr = wordStr[:-1] if wordStr.startswith('.'): wordStr = wordStr[1:] newWords.append(wordStr) words = newWords replaceMentions = {} replaceHashTags = {} replaceEmoji = {} emojiDict = {} originalDomain = domain if ':' in domain: domain = domain.split(':')[0] followingFilename = baseDir + '/accounts/' + \ nickname + '@' + domain + '/following.txt' # read the following list so that we can detect just @nick # in addition to @nick@domain following = None if '@' in words: if os.path.isfile(followingFilename): with open(followingFilename, "r") as f: following = f.readlines() # extract mentions and tags from words longWordsList = [] for wordStr in words: wordLen = len(wordStr) if wordLen > 2: if wordLen > maxWordLength: longWordsList.append(wordStr) firstChar = wordStr[0] if firstChar == '@': if addMention(wordStr, httpPrefix, following, replaceMentions, recipients, hashtags): continue elif firstChar == '#': if addHashTags(wordStr, httpPrefix, originalDomain, replaceHashTags, hashtags): continue elif ':' in wordStr: wordStr2 = wordStr.split(':')[1] # print('TAG: emoji located - '+wordStr) if not emojiDict: # emoji.json is generated so that it can be customized and # the changes will be retained even if default_emoji.json # is subsequently updated if not os.path.isfile(baseDir + '/emoji/emoji.json'): copyfile(baseDir + '/emoji/default_emoji.json', baseDir + '/emoji/emoji.json') emojiDict = loadJson(baseDir + '/emoji/emoji.json') # print('TAG: looking up emoji for :'+wordStr2+':') addEmoji(baseDir, ':' + wordStr2 + ':', httpPrefix, originalDomain, replaceEmoji, hashtags, emojiDict) # replace words with their html versions for wordStr, replaceStr in replaceMentions.items(): content = content.replace(wordStr, replaceStr) for wordStr, replaceStr in replaceHashTags.items(): content = content.replace(wordStr, replaceStr) if not isJsonContent: for wordStr, replaceStr in replaceEmoji.items(): content = content.replace(wordStr, replaceStr) content = addWebLinks(content) if longWordsList: content = removeLongWords(content, maxWordLength, longWordsList) content = content.replace(' --linebreak-- ', '
') return '
' + content + '
' def getMentionsFromHtml(htmlText: str, matchStr=" []: """Extracts mentioned actors from the given html content string """ mentions = [] if matchStr not in htmlText: return mentions mentionsList = htmlText.split(matchStr) for mentionStr in mentionsList: if '"' not in mentionStr: continue actorStr = mentionStr.split('"')[0] if actorStr.startswith('http') or \ actorStr.startswith('gnunet') or \ actorStr.startswith('i2p') or \ actorStr.startswith('hyper') or \ actorStr.startswith('dat:'): if actorStr not in mentions: mentions.append(actorStr) return mentions def extractMediaInFormPOST(postBytes, boundary, name: str): """Extracts the binary encoding for image/video/audio within a http form POST Returns the media bytes and the remaining bytes """ imageStartBoundary = b'Content-Disposition: form-data; name="' + \ name.encode('utf8', 'ignore') + b'";' imageStartLocation = postBytes.find(imageStartBoundary) if imageStartLocation == -1: return None, postBytes # bytes after the start boundary appears mediaBytes = postBytes[imageStartLocation:] # look for the next boundary imageEndBoundary = boundary.encode('utf8', 'ignore') imageEndLocation = mediaBytes.find(imageEndBoundary) if imageEndLocation == -1: # no ending boundary return mediaBytes, postBytes[:imageStartLocation] # remaining bytes after the end of the image remainder = mediaBytes[imageEndLocation:] # remove bytes after the end boundary mediaBytes = mediaBytes[:imageEndLocation] # return the media and the before+after bytes return mediaBytes, postBytes[:imageStartLocation] + remainder def saveMediaInFormPOST(mediaBytes, debug: bool, filenameBase=None) -> (str, str): """Saves the given media bytes extracted from http form POST Returns the filename and attachment type """ if not mediaBytes: if debug: print('DEBUG: No media found within POST') return None, None mediaLocation = -1 searchStr = '' filename = None # directly search the binary array for the beginning # of an image extensionList = { 'png': 'image/png', 'jpeg': 'image/jpeg', 'gif': 'image/gif', 'webp': 'image/webp', 'mp4': 'video/mp4', 'ogv': 'video/ogv', 'mp3': 'audio/mpeg', 'ogg': 'audio/ogg' } detectedExtension = None for extension, contentType in extensionList.items(): searchStr = b'Content-Type: ' + contentType.encode('utf8', 'ignore') mediaLocation = mediaBytes.find(searchStr) if mediaLocation > -1: # image/video/audio binaries if extension == 'jpeg': extension = 'jpg' elif extension == 'mpeg': extension = 'mp3' filename = filenameBase + '.' + extension attachmentMediaType = \ searchStr.decode().split('/')[0].replace('Content-Type: ', '') detectedExtension = extension break if not filename: return None, None # locate the beginning of the image, after any # carriage returns startPos = mediaLocation + len(searchStr) for offset in range(1, 8): if mediaBytes[startPos+offset] != 10: if mediaBytes[startPos+offset] != 13: startPos += offset break # remove any existing image files with a different format extensionTypes = ('png', 'jpg', 'jpeg', 'gif', 'webp') for ex in extensionTypes: if ex == detectedExtension: continue possibleOtherFormat = \ filename.replace('.temp', '').replace('.' + detectedExtension, '.' + ex) if os.path.isfile(possibleOtherFormat): os.remove(possibleOtherFormat) fd = open(filename, 'wb') fd.write(mediaBytes[startPos:]) fd.close() return filename, attachmentMediaType def extractTextFieldsInPOST(postBytes, boundary, debug: bool) -> {}: """Returns a dictionary containing the text fields of a http form POST The boundary argument comes from the http header """ msg = email.parser.BytesParser().parsebytes(postBytes) if debug: print('DEBUG: POST arriving ' + msg.get_payload(decode=True).decode('utf-8')) messageFields = msg.get_payload(decode=True) messageFields = messageFields.decode('utf-8').split(boundary) fields = {} # examine each section of the POST, separated by the boundary for f in messageFields: if f == '--': continue if ' name="' not in f: continue postStr = f.split(' name="', 1)[1] if '"' not in postStr: continue postKey = postStr.split('"', 1)[0] postValueStr = postStr.split('"', 1)[1] if ';' in postValueStr: continue if '\r\n' not in postValueStr: continue postLines = postValueStr.split('\r\n') postValue = '' if len(postLines) > 2: for line in range(2, len(postLines)-1): if line > 2: postValue += '\n' postValue += postLines[line] fields[postKey] = postValue return fields