epicyon/content.py

517 lines
19 KiB
Python
Raw Normal View History

2019-07-15 14:11:31 +00:00
__filename__ = "content.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2019-08-29 13:35:29 +00:00
__version__ = "1.0.0"
2019-07-15 14:11:31 +00:00
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
import os
2019-09-29 14:48:17 +00:00
import time
2019-07-15 14:11:31 +00:00
import commentjson
2019-11-10 11:37:24 +00:00
import email.parser
2019-08-11 16:55:22 +00:00
from shutil import copyfile
2019-07-15 14:11:31 +00:00
2019-09-29 17:20:10 +00:00
def replaceEmojiFromTags(content: str, tag: [], messageType: str) -> str:
    """Uses the tags to replace :emoji: with html image markup

    content is the text within which emoji names are replaced.
    tag is a list of tag dictionaries. Only items with type 'Emoji' which
    have a name and an icon url are used.
    messageType selects the css class of the image: 'post header' gives
    'emojiheader', 'profile' gives 'emojiprofile' and anything else
    gives 'emoji'.
    Returns the content with every occurrence of each emoji name replaced
    by an <img> element.
    """
    # imported here so that the function does not depend on a
    # module level import being present
    from html import escape

    if messageType == 'profile':
        htmlClass = 'emojiprofile'
    elif messageType == 'post header':
        htmlClass = 'emojiheader'
    else:
        htmlClass = 'emoji'

    for tagItem in tag:
        if tagItem.get('type') != 'Emoji':
            continue
        emojiName = tagItem.get('name')
        if not emojiName or not isinstance(emojiName, str):
            continue
        icon = tagItem.get('icon')
        if not icon or not isinstance(icon, dict):
            continue
        emojiUrl = icon.get('url')
        if not emojiUrl or not isinstance(emojiUrl, str):
            continue
        if emojiName not in content:
            continue
        # the url and name are placed inside html attributes, so they are
        # escaped to stop a quote character from closing the attribute
        emojiHtml = \
            "<img src=\"" + escape(emojiUrl, quote=True) + \
            "\" alt=\"" + escape(emojiName.replace(':', ''), quote=True) + \
            "\" align=\"middle\" class=\"" + htmlClass + "\"/>"
        content = content.replace(emojiName, emojiHtml)
    return content
2019-09-05 09:54:27 +00:00
def addMusicTag(content: str, tag: str) -> str:
    """Tags the post as music if it links to a known music site

    A '#' is prepended to the tag if it does not already contain one.
    The content is returned unchanged if it already contains the tag or
    if it does not contain 'soundcloud.com/' or 'bandcamp.com/'.
    Otherwise ':music: ' is prepended and the tag is appended.
    """
    hashtag = tag if '#' in tag else '#' + tag
    if hashtag in content:
        return content
    knownSites = ('soundcloud.com', 'bandcamp.com')
    if not any(siteDomain + '/' in content for siteDomain in knownSites):
        return content
    return ':music: ' + content + ' ' + hashtag + ' '
2019-09-05 09:54:27 +00:00
2019-08-21 12:07:30 +00:00
def addWebLinks(content: str) -> str:
    """Adds markup for web links

    Every space separated word which begins with https://, http:// or
    dat:// is wrapped in an anchor element. The scheme is placed within
    an 'invisible' span, and link text longer than 40 characters is split
    so that the characters beyond that limit are also 'invisible'.
    A single trailing '.' or ';' is not included in the link.
    Any ' --linebreak-- ' marker within the content is replaced by <br>.
    Returns the content unchanged if it contains none of the link schemes.
    """
    linkPrefixes = ('https://', 'http://', 'dat://')
    # dat:// is included here so that content which only has dat links
    # still gets marked up
    if not any(prefix in content for prefix in linkPrefixes):
        return content

    maxLinkLength = 40
    words = content.replace('\n', ' --linebreak-- ').split(' ')
    replaceDict = {}
    for w in words:
        if not w.startswith(linkPrefixes):
            continue
        # punctuation ending a sentence is not part of the link
        if w.endswith(('.', ';')):
            w = w[:-1]
        markup = '<a href="' + w + '" rel="nofollow noopener" target="_blank">'
        for prefix in linkPrefixes:
            if w.startswith(prefix):
                markup += '<span class="invisible">' + prefix + '</span>'
                break
        linkText = \
            w.replace('https://', '').replace('http://', '').replace('dat://', '')
        # prevent links from becoming too long
        if len(linkText) > maxLinkLength:
            markup += \
                '<span class="ellipsis">' + \
                linkText[:maxLinkLength] + '</span>'
            markup += \
                '<span class="invisible">' + \
                linkText[maxLinkLength:] + '</span></a>'
        else:
            markup += '<span class="ellipsis">' + linkText + '</span></a>'
        replaceDict[w] = markup

    for url, markup in replaceDict.items():
        content = content.replace(url, markup)
    content = content.replace(' --linebreak-- ', '<br>')
    return content
2019-08-09 11:12:08 +00:00
def validHashTag(hashtag: str) -> bool:
    """Returns true if the given hashtag contains valid characters

    Valid characters are the digits 0-9 and the ascii letters a-z and A-Z.
    An empty string is considered to be valid.
    """
    allowedChars = \
        frozenset('0123456789'
                  'abcdefghijklmnopqrstuvwxyz'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    return all(ch in allowedChars for ch in hashtag)
def addHashTags(wordStr: str, httpPrefix: str, domain: str,
                replaceHashTags: {}, postHashtags: {}) -> bool:
    """Detects hashtags and adds them to the replacements dict
    Also updates the hashtags list to be added to the post

    wordStr is the word including its leading character, which is removed
    to obtain the hashtag text.
    Returns True if the word already has a replacement or if a new one
    was added, and False if the hashtag text contains invalid characters.
    """
    if replaceHashTags.get(wordStr):
        # this word was handled previously
        return True

    tagText = wordStr[1:]
    if not validHashTag(tagText):
        return False

    tagUrl = ''.join([httpPrefix, "://", domain, "/tags/", tagText])
    # tag to be attached to the post
    postHashtags[tagText] = {
        'href': tagUrl,
        'name': '#' + tagText,
        'type': 'Hashtag'
    }
    # html which replaces the word within the content
    replaceHashTags[wordStr] = ''.join([
        "<a href=\"", tagUrl,
        "\" class=\"mention hashtag\" rel=\"tag\">#<span>",
        tagText, "</span></a>"
    ])
    return True
2019-08-09 16:18:00 +00:00
def loadEmojiDict(emojiDataFilename: str, emojiDict: {}) -> None:
    """Creates an emoji dictionary based on emoji/emoji-data.txt

    Entries are added to the given emojiDict, keyed by the lower case
    emoji name with spaces and hyphens removed. The value is the text
    before the first space on the line, which for a range such as
    AAAA..BBBB is its first part.
    Nothing happens if the file does not exist.
    """
    if not os.path.isfile(emojiDataFilename):
        return
    with open(emojiDataFilename, "r") as dataFile:
        for entry in dataFile:
            # skip short lines and comments
            if len(entry) < 5 or entry.startswith('#'):
                continue
            # only lines for emoji which have a bracketed section
            if '; Emoji' not in entry or ')' not in entry:
                continue
            codeStr = entry.split(' ')[0]
            if len(codeStr) < 4:
                continue
            # for a range keep only the first part
            codeStr = codeStr.split('..')[0]
            # the name is whatever follows the first closing bracket
            nameStr = entry.split(')', 1)[1].strip()
            for unwanted in ('\n', ' ', '-'):
                nameStr = nameStr.replace(unwanted, '')
            nameStr = nameStr.split('..')[0]
            emojiDict[nameStr.lower()] = codeStr
2019-09-23 11:36:54 +00:00
def addEmoji(baseDir: str, wordStr: str, httpPrefix: str, domain: str,
             replaceEmoji: {}, postTags: {}, emojiDict: {}) -> bool:
    """Detects Emoji and adds them to the replacements dict
    Also updates the tags list to be added to the post

    wordStr should have the form :name: where the name is a key within
    emojiDict, and the image baseDir/emoji/<emojiDict value>.png needs to
    exist. An Emoji tag is then stored within postTags under the name.
    replaceEmoji is only read here: True is returned straight away if it
    already has an entry for the word.
    Returns False if the word is not a usable emoji.
    """
    isDelimited = wordStr.startswith(':') and wordStr.endswith(':')
    if not isDelimited or len(wordStr) < 3:
        return False
    if replaceEmoji.get(wordStr):
        return True

    # the name without the leading and trailing : characters
    emojiName = wordStr[1:-1]
    # is the text of the emoji valid?
    if not validHashTag(emojiName):
        return False
    if not emojiDict.get(emojiName):
        return False

    imageName = emojiDict[emojiName] + '.png'
    if not os.path.isfile(baseDir + '/emoji/' + imageName):
        return False

    postTags[emojiName] = {
        'icon': {
            'mediaType': 'image/png',
            'type': 'Image',
            'url': httpPrefix + "://" + domain + "/emoji/" + imageName
        },
        'name': ':' + emojiName + ':',
        'type': 'Emoji'
    }
    return True
2019-08-19 12:13:18 +00:00
def _storeMention(wordStr: str, httpPrefix: str,
                  mentionNickname: str, mentionDomain: str,
                  replaceMentions: {}, recipients: [], tags: {}) -> None:
    """Stores the recipient, Mention tag and html replacement for a mention
    """
    recipientActor = \
        httpPrefix + "://" + mentionDomain + "/users/" + mentionNickname
    if recipientActor not in recipients:
        recipients.append(recipientActor)
    tags[wordStr] = {
        'href': recipientActor,
        'name': wordStr,
        'type': 'Mention'
    }
    replaceMentions[wordStr] = \
        "<span class=\"h-card\"><a href=\"" + httpPrefix + "://" + \
        mentionDomain + "/@" + mentionNickname + \
        "\" class=\"u-url mention\">@<span>" + mentionNickname + \
        "</span></a></span>"


def addMention(wordStr: str, httpPrefix: str, following: [],
               replaceMentions: {}, recipients: [], tags: {}) -> bool:
    """Detects mentions and adds them to the replacements dict and recipients list

    wordStr is the mention including its leading character, either in the
    short form @nick or the full form @nick@domain.
    following is a list of nick@domain handles, each of which may end with
    a newline, or None if no following list is available.
    When a mention is accepted its actor url is appended to recipients (if
    not already there), a Mention tag is stored in tags and the html for
    the mention is stored in replaceMentions, both keyed by wordStr.
    Returns True if the mention was accepted, otherwise False.
    """
    possibleHandle = wordStr[1:]

    # @nick
    if following and '@' not in possibleHandle:
        # fall back to a best effort match against the following list
        # if no domain was specified. eg. @nick
        for follow in following:
            if follow.startswith(possibleHandle + '@'):
                replaceDomain = follow.replace('\n', '').split('@')[1]
                _storeMention(wordStr, httpPrefix,
                              possibleHandle, replaceDomain,
                              replaceMentions, recipients, tags)
                return True
        return False

    # without a following list a domain is needed
    if '@' not in possibleHandle:
        return False
    possibleNickname = possibleHandle.split('@')[0]
    if not possibleNickname:
        return False
    possibleDomain = possibleHandle.split('@')[1].strip('\n')
    if not possibleDomain:
        return False

    # a handle which is being followed is accepted whatever its domain
    if following:
        for follow in following:
            if follow.replace('\n', '') == possibleHandle:
                _storeMention(wordStr, httpPrefix,
                              possibleNickname, possibleDomain,
                              replaceMentions, recipients, tags)
                return True

    # @nick@domain
    if not (possibleDomain == 'localhost' or '.' in possibleDomain):
        return False
    _storeMention(wordStr, httpPrefix,
                  possibleNickname, possibleDomain,
                  replaceMentions, recipients, tags)
    return True
2019-08-09 09:09:21 +00:00
2019-10-18 12:24:31 +00:00
def removeLongWords(content: str, maxWordLength: int,
                    longWordsList: []) -> str:
    """Breaks up long words so that on mobile screens this doesn't disrupt the layout

    longWordsList gives the words to be checked. If it is empty or None
    then the space separated words within the content which are longer
    than maxWordLength are used instead.
    Words which look like markup, handles or links are left alone.
    A word shorter than twice maxWordLength has a newline inserted after
    maxWordLength characters. Longer words are truncated to maxWordLength
    characters.
    Returns the altered content.
    """
    if not longWordsList:
        longWordsList = []
        for wordStr in content.split(' '):
            if len(wordStr) > maxWordLength:
                if wordStr not in longWordsList:
                    longWordsList.append(wordStr)

    for wordStr in longWordsList:
        # html markup
        if wordStr.startswith('<'):
            continue
        if '=\"' in wordStr:
            continue
        # handles, unless they contain a double @
        if '@' in wordStr and '@@' not in wordStr:
            continue
        # links
        if 'https:' in wordStr or \
           'http:' in wordStr or \
           'dat:' in wordStr:
            continue
        if '<' in wordStr:
            # only consider the text before any markup
            wordStr = wordStr.split('<', 1)[0]
        if '/' in wordStr:
            continue
        # the text before markup can be short, and breaking it would add
        # a newline after every occurrence of it within the content
        if len(wordStr) <= maxWordLength:
            continue
        if len(wordStr[maxWordLength:]) < maxWordLength:
            content = \
                content.replace(wordStr,
                                wordStr[:maxWordLength] + '\n' +
                                wordStr[maxWordLength:])
        else:
            content = content.replace(wordStr, wordStr[:maxWordLength])
    return content
2019-08-09 09:09:21 +00:00
def _loadEmojiJson(baseDir: str) -> {}:
    """Loads emoji/emoji.json, making up to five attempts
    Returns an empty dictionary if every attempt fails
    """
    emojiFilename = baseDir + '/emoji/emoji.json'
    # emoji.json is generated so that it can be customized and the changes
    # will be retained even if default_emoji.json is subsequently updated
    if not os.path.isfile(emojiFilename):
        copyfile(baseDir + '/emoji/default_emoji.json', emojiFilename)
    for tries in range(5):
        if tries > 0:
            print('Retry emoji load ' + emojiFilename)
        try:
            with open(emojiFilename, 'r') as fp:
                loadedDict = commentjson.load(fp)
        except Exception:
            # best effort: wait a moment and then try again
            print('WARN: commentjson exception addHtmlTags')
            print('Failed to load emoji (try ' + str(tries) + '): ' +
                  emojiFilename)
            time.sleep(1)
            continue
        if tries > 0:
            print('emojiDict loaded on try ' + str(tries))
        return loadedDict
    return {}


def addHtmlTags(baseDir: str, httpPrefix: str,
                nickname: str, domain: str, content: str,
                recipients: [], hashtags: {}, isJsonContent=False) -> str:
    """ Replaces plaintext mentions such as @nick@domain into html
    by matching against known following accounts

    Hashtags and web links within the content are also turned into html,
    long words are broken up and the result is wrapped in <p></p>.
    Content which already begins with <p> is returned unchanged.
    recipients has the actor of each detected mention appended to it.
    hashtags receives the tags which are detected, which are the mention
    and emoji tags as well as the hashtags.
    isJsonContent, when True, stops any emoji replacements from being
    applied to the content.
    """
    if content.startswith('<p>'):
        return content
    maxWordLength = 40
    content = content.replace('\n', ' --linebreak-- ')
    content = addMusicTag(content, 'nowplaying')

    # remove . for words which are not mentions
    words = []
    for wordStr in content.replace(',', ' ').replace(';', ' ').split(' '):
        if wordStr.endswith('.') and not wordStr.startswith('@'):
            wordStr = wordStr[:-1]
        if wordStr.startswith('.'):
            wordStr = wordStr[1:]
        words.append(wordStr)

    replaceMentions = {}
    replaceHashTags = {}
    replaceEmoji = {}
    emojiDict = {}
    originalDomain = domain
    if ':' in domain:
        # the account directory name does not include the port number
        domain = domain.split(':')[0]
    followingFilename = \
        baseDir + '/accounts/' + nickname + '@' + domain + '/following.txt'

    # read the following list so that we can detect just @nick
    # in addition to @nick@domain
    # The file is only needed if some word could be a mention. words is
    # a list, so each word is checked rather than testing '@' in words,
    # which would only match a word consisting of a single @
    following = None
    if any(wordStr.startswith('@') for wordStr in words):
        if os.path.isfile(followingFilename):
            with open(followingFilename, "r") as f:
                following = f.readlines()

    # extract mentions and tags from words
    longWordsList = []
    for wordStr in words:
        wordLen = len(wordStr)
        if wordLen <= 2:
            continue
        if wordLen > maxWordLength:
            longWordsList.append(wordStr)
        firstChar = wordStr[0]
        if firstChar == '@':
            addMention(wordStr, httpPrefix, following,
                       replaceMentions, recipients, hashtags)
        elif firstChar == '#':
            addHashTags(wordStr, httpPrefix, originalDomain,
                        replaceHashTags, hashtags)
        elif ':' in wordStr:
            # possibly an emoji, with its name after the first :
            wordStr2 = wordStr.split(':')[1]
            if not emojiDict:
                emojiDict = _loadEmojiJson(baseDir)
            addEmoji(baseDir, ':' + wordStr2 + ':', httpPrefix,
                     originalDomain, replaceEmoji, hashtags, emojiDict)

    # replace words with their html versions
    for wordStr, replaceStr in replaceMentions.items():
        content = content.replace(wordStr, replaceStr)
    for wordStr, replaceStr in replaceHashTags.items():
        content = content.replace(wordStr, replaceStr)
    if not isJsonContent:
        for wordStr, replaceStr in replaceEmoji.items():
            content = content.replace(wordStr, replaceStr)
    content = addWebLinks(content)
    if longWordsList:
        content = removeLongWords(content, maxWordLength, longWordsList)
    # any line break markers which remain become paragraph breaks
    content = content.replace(' --linebreak-- ', '</p><p>')
    return '<p>' + content + '</p>'
2019-08-05 19:13:15 +00:00
def getMentionsFromHtml(htmlText: str,
                        matchStr="<span class=\"h-card\"><a href=\"") -> []:
    """Extracts mentioned actors from the given html content string

    matchStr is the markup which comes immediately before the url of
    each mention. The url is the text between that markup and the next
    double quote, and is only used if it begins with http or dat:
    Returns a list of the urls without duplicates, in the order in which
    they first appear.
    """
    mentions = []
    if matchStr not in htmlText:
        return mentions
    # the first section is the text before the first match, which
    # is not a mention and so is skipped
    for mentionStr in htmlText.split(matchStr)[1:]:
        if '"' not in mentionStr:
            continue
        actorStr = mentionStr.split('"')[0]
        if not actorStr.startswith(('http', 'dat:')):
            continue
        if actorStr not in mentions:
            mentions.append(actorStr)
    return mentions
2019-11-10 11:37:24 +00:00
def extractMediaInFormPOST(postBytes, boundary, name: str):
    """Extracts the binary encoding for image/video/audio within a http form POST
    Returns the media bytes and the remaining bytes

    The media begins at the Content-Disposition header of the form field
    with the given name and ends just before the next occurrence of the
    boundary, or at the end of the bytes if there is no further boundary.
    If there is no such field then None is returned together with the
    unaltered bytes.
    """
    startMarker = b''.join([
        b'Content-Disposition: form-data; name="',
        name.encode('utf8', 'ignore'),
        b'";'
    ])
    startIndex = postBytes.find(startMarker)
    if startIndex < 0:
        return None, postBytes

    # split into the bytes before the media and those from its start
    beforeMedia = postBytes[:startIndex]
    fromMedia = postBytes[startIndex:]

    # look for the next boundary
    endIndex = fromMedia.find(boundary.encode('utf8', 'ignore'))
    if endIndex < 0:
        # no ending boundary
        return fromMedia, beforeMedia

    # return the media and the before+after bytes
    return fromMedia[:endIndex], beforeMedia + fromMedia[endIndex:]
def saveMediaInFormPOST(mediaBytes, baseDir: str,
                        nickname: str, domain: str, debug: bool,
                        filenameBase=None) -> (str, str):
    """Saves the given media bytes extracted from http form POST
    Returns the filename and attachment type

    The media type is found by searching the bytes for a Content-Type
    header of one of the supported image, video or audio types.
    The file is written to filenameBase plus an extension for that type.
    If no filenameBase is given then
    baseDir/accounts/nickname@domain/upload is used.
    The attachment type is the first part of the media type, such as
    image, video or audio.
    None,None is returned if there are no media bytes or if none of the
    supported types is found.
    """
    if not mediaBytes:
        if debug:
            print('DEBUG: No media found within POST')
        return None, None

    if not filenameBase:
        filenameBase = \
            baseDir + '/accounts/' + \
            nickname + '@' + domain + '/upload'

    # directly search the binary array for the beginning
    # of an image
    extensionList = {
        'png': 'image/png',
        'jpeg': 'image/jpeg',
        'gif': 'image/gif',
        'mp4': 'video/mp4',
        'ogv': 'video/ogv',
        'mp3': 'audio/mpeg',
        'ogg': 'audio/ogg'
    }
    filename = None
    attachmentMediaType = None
    mediaLocation = -1
    searchStr = b''
    for extension, contentType in extensionList.items():
        searchStr = b'Content-Type: ' + contentType.encode('utf8', 'ignore')
        mediaLocation = mediaBytes.find(searchStr)
        if mediaLocation == -1:
            continue
        if extension == 'jpeg':
            extension = 'jpg'
        filename = filenameBase + '.' + extension
        attachmentMediaType = contentType.split('/')[0]
        break
    if not filename:
        return None, None

    # locate the beginning of the image, after any
    # carriage returns
    startPos = mediaLocation + len(searchStr)
    for offset in range(1, 8):
        if startPos + offset >= len(mediaBytes):
            # the bytes end soon after the header
            break
        if mediaBytes[startPos + offset] not in (10, 13):
            startPos += offset
            break

    # the file gets closed even if the write fails
    with open(filename, 'wb') as fd:
        fd.write(mediaBytes[startPos:])
    return filename, attachmentMediaType
2019-11-10 11:54:45 +00:00
def extractTextFieldsInPOST(postBytes, boundary, debug: bool) -> {}:
    """Returns a dictionary containing the text fields of a http form POST
    The boundary argument comes from the http header

    Each section of the POST between boundaries which has a name="..."
    parameter becomes an entry keyed by that name. The value is made from
    the lines after the blank line which follows the header, joined by
    newlines, with the last line of the section left out.
    Sections whose header line has further parameters after the name,
    such as a filename, are skipped.
    """
    msg = email.parser.BytesParser().parsebytes(postBytes)
    payloadStr = msg.get_payload(decode=True).decode('utf-8')
    if debug:
        print('DEBUG: POST arriving ' + payloadStr)
    fields = {}
    # examine each section of the POST, separated by the boundary
    for section in payloadStr.split(boundary):
        if section == '--':
            continue
        if ' name="' not in section:
            continue
        postStr = section.split(' name="', 1)[1]
        if '"' not in postStr:
            continue
        postKey, postValueStr = postStr.split('"', 1)
        if '\r\n' not in postValueStr:
            continue
        postLines = postValueStr.split('\r\n')
        # only the rest of the header line is checked for extra
        # parameters, so that a ; within the text of the field does not
        # cause the field to be dropped
        if ';' in postLines[0]:
            continue
        # the first two lines are the end of the header and the blank
        # line after it, and the last line is left out
        fields[postKey] = '\n'.join(postLines[2:-1])
    return fields