2020-04-02 09:56:17 +00:00
|
|
|
__filename__ = "content.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
2021-01-26 10:07:42 +00:00
|
|
|
__version__ = "1.2.0"
|
2020-04-02 09:56:17 +00:00
|
|
|
__maintainer__ = "Bob Mottram"
|
2021-09-10 16:14:50 +00:00
|
|
|
__email__ = "bob@libreserver.org"
|
2020-04-02 09:56:17 +00:00
|
|
|
__status__ = "Production"
|
2021-06-25 16:10:09 +00:00
|
|
|
__module_group__ = "Core"
|
2019-07-15 14:11:31 +00:00
|
|
|
|
|
|
|
import os
|
2019-11-10 11:37:24 +00:00
|
|
|
import email.parser
|
2020-12-03 14:59:07 +00:00
|
|
|
import urllib.parse
|
2019-08-11 16:55:22 +00:00
|
|
|
from shutil import copyfile
|
2021-09-13 17:51:33 +00:00
|
|
|
from utils import dangerousSVG
|
2021-06-26 14:21:24 +00:00
|
|
|
from utils import removeDomainPort
|
2021-02-09 14:41:32 +00:00
|
|
|
from utils import isValidLanguage
|
2020-11-21 11:54:29 +00:00
|
|
|
from utils import getImageExtensions
|
2019-11-23 10:08:00 +00:00
|
|
|
from utils import loadJson
|
2020-02-21 10:19:02 +00:00
|
|
|
from utils import fileLastModified
|
2020-06-11 12:26:15 +00:00
|
|
|
from utils import getLinkPrefixes
|
2021-01-31 11:05:17 +00:00
|
|
|
from utils import dangerousMarkup
|
2021-03-12 12:04:34 +00:00
|
|
|
from utils import isPGPEncrypted
|
|
|
|
from utils import containsPGPPublicKey
|
2021-07-13 21:59:53 +00:00
|
|
|
from utils import acctDir
|
2021-08-07 17:03:41 +00:00
|
|
|
from utils import isfloat
|
2021-08-07 17:44:25 +00:00
|
|
|
from utils import getCurrencies
|
2021-10-14 15:12:35 +00:00
|
|
|
from utils import removeHtml
|
2021-01-29 21:33:23 +00:00
|
|
|
from petnames import getPetName
|
2019-07-15 14:11:31 +00:00
|
|
|
|
2020-09-30 22:55:53 +00:00
|
|
|
|
2020-10-11 09:33:31 +00:00
|
|
|
def removeHtmlTag(htmlStr: str, tag: str) -> str:
    """Strips every occurrence of the attribute called tag, together
    with its double quoted value, from a html string.
    eg. with tag 'class' the text ' class="abc"' is removed
    If an attribute value has no closing quote then the remaining
    text is left unaltered
    """
    marker = ' ' + tag + '="'
    while marker in htmlStr:
        before, after = htmlStr.split(marker, 1)
        if '"' not in after:
            # unterminated attribute value
            break
        # skip over the value and its closing quote
        htmlStr = before + after.split('"', 1)[1]
    return htmlStr
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _removeQuotesWithinQuotes(content: str) -> str:
    """Flattens nested quotes by removing any opening blockquote tag
    which occurs inside of another blockquote
    Content without both an opening and a closing blockquote tag is
    returned unaltered
    """
    openTag = '<blockquote>'
    closeTag = '</blockquote>'
    if openTag not in content or closeTag not in content:
        return content

    maxSplits = 1
    while True:
        pieces = content.split(openTag, maxSplits)
        maxSplits += 1
        quoted = pieces[1]
        if closeTag not in quoted:
            break
        closeSections = quoted.split(closeTag)
        inner = closeSections[0]
        remainder = closeSections[1]
        if openTag in inner:
            # drop the nested opening tags and rebuild the content
            content = \
                pieces[0] + openTag + inner.replace(openTag, '') + \
                closeTag + remainder
        if openTag not in remainder:
            # no further quotes follow this one
            break
    return content
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
2020-09-14 09:33:42 +00:00
|
|
|
def htmlReplaceEmailQuote(content: str) -> str:
    """Replaces an email style quote "> Some quote" with html blockquote
    The content is html, so the quote marker is matched in its escaped
    form &gt; rather than as a bare > character. Matching the bare
    character would also match the end of every html tag and so would
    damage the blockquote markup which this function inserts.
    Paragraphs which are entirely enclosed within quote marks are also
    converted into blockquotes.
    PGP encrypted content or public keys are returned unaltered.
    """
    if isPGPEncrypted(content) or containsPGPPublicKey(content):
        return content
    # replace quote paragraph
    if '<p>"' in content:
        if '"</p>' in content:
            # only when the opening and closing marks are balanced
            if content.count('<p>"') == content.count('"</p>'):
                content = content.replace('<p>"', '<p><blockquote>')
                content = content.replace('"</p>', '</blockquote></p>')
    if '>\u201c' in content:
        if '\u201d<' in content:
            if content.count('>\u201c') == content.count('\u201d<'):
                content = content.replace('>\u201c', '><blockquote>')
                content = content.replace('\u201d<', '</blockquote><')
    # replace email style quote
    quoteMark = '&gt;'
    if '>' + quoteMark + ' ' not in content:
        return content
    contentStr = content.replace('<p>', '')
    contentLines = contentStr.split('</p>')
    newContent = ''
    for lineStr in contentLines:
        if not lineStr:
            continue
        if '>' + quoteMark + ' ' not in lineStr:
            if lineStr.startswith(quoteMark + ' '):
                # the paragraph begins with a quote marker
                lineStr = lineStr.replace(quoteMark + ' ', '<blockquote>')
                # any remaining markers become line breaks
                lineStr = lineStr.replace(quoteMark, '<br>')
                newContent += '<p>' + lineStr + '</blockquote></p>'
            else:
                newContent += '<p>' + lineStr + '</p>'
        else:
            # a quote marker directly follows a html tag
            lineStr = \
                lineStr.replace('>' + quoteMark + ' ', '><blockquote>')
            if lineStr.startswith(quoteMark):
                lineStr = lineStr.replace(quoteMark, '<blockquote>', 1)
            else:
                lineStr = lineStr.replace(quoteMark, '<br>')
            newContent += '<p>' + lineStr + '</blockquote></p>'
    return _removeQuotesWithinQuotes(newContent)
|
2020-09-14 09:33:42 +00:00
|
|
|
|
|
|
|
|
2020-08-02 17:01:12 +00:00
|
|
|
def htmlReplaceQuoteMarks(content: str) -> str:
    """Replaces straight double quotes with typographic quote marks
    "hello" becomes \u201chello\u201d
    Both the literal " character and its html escaped form &quot;
    are handled. Literal quotes located inside of html tags, such as
    those around attribute values, are left as they are.
    Nothing is changed if there are more than four quote marks of
    either form, or if the content is PGP encrypted or is a public key.
    """
    if isPGPEncrypted(content) or containsPGPPublicKey(content):
        return content
    escapedQuote = '&quot;'
    if '"' not in content and escapedQuote not in content:
        return content

    # only if there are a few quote marks
    if content.count('"') > 4:
        return content
    if content.count(escapedQuote) > 4:
        return content

    openMark = '\u201c'
    closeMark = '\u201d'

    newContent = content
    if '"' in content:
        openQuote = True
        markup = False
        converted = []
        for ch in content:
            currChar = ch
            if ch == '<':
                markup = True
            elif ch == '>':
                markup = False
            elif ch == '"' and not markup:
                # alternate between opening and closing marks
                currChar = openMark if openQuote else closeMark
                openQuote = not openQuote
            converted.append(currChar)
        newContent = ''.join(converted)

    if escapedQuote in newContent:
        # escaped quotes can't be part of tag syntax, so they can be
        # replaced without tracking whether we are inside of markup
        openQuote = True
        sections = newContent.split(escapedQuote)
        rebuilt = [sections[0]]
        for section in sections[1:]:
            rebuilt.append(openMark if openQuote else closeMark)
            openQuote = not openQuote
            rebuilt.append(section)
        newContent = ''.join(rebuilt)
    return newContent
|
|
|
|
|
|
|
|
|
2020-11-20 10:58:49 +00:00
|
|
|
def dangerousCSS(filename: str, allowLocalNetworkAccess: bool) -> bool:
    """Returns true if the css file contains code which
    can create security problems
    The file is considered dangerous if it contains any of a list of
    suspicious strings, if any url() refers to a remote location
    (including protocol-relative links such as //example.com/x.png)
    or if dangerousMarkup detects anything within its text.
    Returns false if the file does not exist.
    """
    if not os.path.isfile(filename):
        return False

    # comparisons are made against lower case text
    with open(filename, 'r') as fp:
        content = fp.read().lower()

    cssMatches = ('behavior:', ':expression', '?php', '.php',
                  'google', 'regexp', 'localhost',
                  '127.0.', '192.168', '10.0.', '@import')
    for cssmatch in cssMatches:
        if cssmatch in content:
            return True

    # search for non-local web links
    # the first section precedes any url( and so is skipped
    for urlStr in content.split('url(')[1:]:
        urlStr = urlStr.split(')')[0]
        # remove whitespace and quotes surrounding the link
        linkStr = urlStr.strip().strip('"\'').strip()
        # links beginning with // inherit the protocol of the page
        # and are just as remote as an explicit http link
        if 'http' in urlStr or linkStr.startswith('//'):
            print('ERROR: non-local web link in CSS ' +
                  filename)
            return True

    # an attacker can include html inside of the css
    # file as a comment and this may then be run from the html
    if dangerousMarkup(content, allowLocalNetworkAccess):
        return True
    return False
|
|
|
|
|
|
|
|
|
2021-07-06 16:29:03 +00:00
|
|
|
def switchWords(baseDir: str, nickname: str, domain: str, content: str,
                rules: [] = None) -> str:
    """Performs word replacements. eg. Trump -> The Orange Menace
    If no rules are supplied then they are read from the file
    replacewords.txt within the account directory, and if that file
    does not exist the content is returned unaltered.
    Each rule is a line containing a word and its replacement,
    separated by one of: -> : , ; -
    Rules which don't split into exactly two parts, or which have
    nothing to search for, are ignored.
    PGP encrypted content or public keys are returned unaltered.
    """
    if isPGPEncrypted(content) or containsPGPPublicKey(content):
        return content

    if not rules:
        switchWordsFilename = \
            acctDir(baseDir, nickname, domain) + '/replacewords.txt'
        if not os.path.isfile(switchWordsFilename):
            return content
        with open(switchWordsFilename, 'r') as fp:
            rules = fp.readlines()

    # in order of precedence, so that -> is tried before -
    splitters = ('->', ':', ',', ';', '-')
    for line in rules:
        replaceStr = line.replace('\n', '').replace('\r', '')
        wordTransform = None
        for splitStr in splitters:
            if splitStr in replaceStr:
                wordTransform = replaceStr.split(splitStr)
                break
        if not wordTransform:
            continue
        if len(wordTransform) != 2:
            continue
        replaceStr1 = wordTransform[0].strip().replace('"', '')
        replaceStr2 = wordTransform[1].strip().replace('"', '')
        if not replaceStr1:
            # replacing an empty string would insert the replacement
            # between every character of the content
            continue
        content = content.replace(replaceStr1, replaceStr2)
    return content
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
|
|
def replaceEmojiFromTags(content: str, tag: [], messageType: str) -> str:
    """Uses the tags to replace :emoji: with html image markup
    tag is a list of dicts. Only items of type Emoji which have a
    name occurring within the content and an icon url containing a /
    are used.
    If the icon filename begins with a digit and has an extension
    then it is treated as one or more hexadecimal unicode code points
    separated by - and the emoji name is replaced by those characters.
    Otherwise, or if the code points are not valid, the name is
    replaced by an img tag whose class depends upon messageType.
    """
    for tagItem in tag:
        if not tagItem.get('type'):
            continue
        if tagItem['type'] != 'Emoji':
            continue
        if not tagItem.get('name'):
            continue
        icon = tagItem.get('icon')
        # skip malformed icons rather than failing on them
        if not isinstance(icon, dict):
            continue
        if not icon.get('url'):
            continue
        if '/' not in icon['url']:
            continue
        if tagItem['name'] not in content:
            continue
        iconName = icon['url'].split('/')[-1]
        if len(iconName) > 1 and iconName[0].isdigit() and '.' in iconName:
            iconName = iconName.split('.')[0]
            # see https://unicode.org/
            # emoji/charts/full-emoji-list.html
            # either a single code or a sequence of codes
            iconCodes = iconName.split('-')
            exType = 'name' if len(iconCodes) == 1 else 'code'
            iconCodeSequence = ''
            for icode in iconCodes:
                try:
                    iconCodeSequence += chr(int("0x" + icode, 16))
                except (ValueError, OverflowError):
                    # not hexadecimal, or not a valid code point
                    iconCodeSequence = ''
                    print('EX: replaceEmojiFromTags ' + exType + ' ' +
                          str(icode))
                    break
            if iconCodeSequence:
                content = content.replace(tagItem['name'],
                                          iconCodeSequence)

        htmlClass = 'emoji'
        if messageType == 'post header':
            htmlClass = 'emojiheader'
        if messageType == 'profile':
            htmlClass = 'emojiprofile'
        emojiHtml = "<img src=\"" + icon['url'] + "\" alt=\"" + \
            tagItem['name'].replace(':', '') + \
            "\" align=\"middle\" class=\"" + htmlClass + "\"/>"
        # this has no effect if the name was already replaced above
        content = content.replace(tagItem['name'], emojiHtml)
    return content
|
|
|
|
|
2020-02-21 15:09:31 +00:00
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _addMusicTag(content: str, tag: str) -> str:
    """Ensures that a post linking to a known music site carries
    the given hashtag
    The content is returned unaltered if it is already tagged as a
    podcast or documentary, if it already contains the hashtag, or
    if it doesn't link to one of the music sites. Otherwise it is
    prefixed with :music: and the hashtag is appended.
    """
    if '#podcast' in content or '#documentary' in content:
        return content
    hashTag = tag if '#' in tag else '#' + tag
    if hashTag in content:
        return content
    knownSites = ('soundcloud.com', 'bandcamp.com')
    if not any(siteDomain + '/' in content for siteDomain in knownSites):
        return content
    return ':music: ' + content + ' ' + hashTag + ' '
|
|
|
|
|
2019-09-05 09:54:27 +00:00
|
|
|
|
2019-08-21 12:07:30 +00:00
|
|
|
def _webLinkMarkup(url: str, prefixes: [], maxLinkLength: int) -> str:
    """Returns the html anchor markup for a single url
    The link prefix is placed within an invisible span and link text
    longer than maxLinkLength is split into a visible part and an
    invisible remainder
    """
    markup = '<a href="' + url + \
        '" rel="nofollow noopener noreferrer" target="_blank">'
    for prefix in prefixes:
        if url.startswith(prefix):
            markup += '<span class="invisible">' + prefix + '</span>'
            break
    linkText = url
    for prefix in prefixes:
        linkText = linkText.replace(prefix, '')
    # prevent links from becoming too long
    if len(linkText) > maxLinkLength:
        markup += '<span class="ellipsis">' + \
            linkText[:maxLinkLength] + '</span>'
        markup += '<span class="invisible">' + \
            linkText[maxLinkLength:] + '</span></a>'
    else:
        markup += '<span class="ellipsis">' + linkText + '</span></a>'
    return markup


def addWebLinks(content: str) -> str:
    """Adds markup for web links
    Any word, delimited by spaces or line breaks, which begins with
    one of the prefixes returned by getLinkPrefixes is turned into a
    html link. A single trailing . or ; is kept outside of the link.
    Each word is converted where it stands, rather than by replacing
    the url text throughout the content, so that a url which is the
    beginning of another url, or which also appears inside existing
    markup, can't corrupt the result.
    Carriage returns are removed and any ' --linebreak-- ' markers are
    converted to <br>.
    """
    if ':' not in content:
        return content

    prefixes = getLinkPrefixes()

    # do any of these prefixes exist within the content?
    # if there are no prefixes then just keep the content we have
    if not any(prefix in content for prefix in prefixes):
        return content

    maxLinkLength = 40
    content = content.replace('\r', '')
    # markup for each distinct url is only generated once
    linksMarkup = {}
    newLines = []
    for line in content.split('\n'):
        newWords = []
        for w in line.split(' '):
            # does the word begin with a prefix?
            isLink = False
            if ':' in w:
                isLink = any(w.startswith(prefix) for prefix in prefixes)
            if not isLink:
                newWords.append(w)
                continue
            # punctuation following the link is not part of it
            trailing = ''
            if w.endswith('.') or w.endswith(';'):
                trailing = w[-1]
                w = w[:-1]
            if w not in linksMarkup:
                linksMarkup[w] = \
                    _webLinkMarkup(w, prefixes, maxLinkLength)
            newWords.append(linksMarkup[w] + trailing)
        newLines.append(' '.join(newWords))
    content = '\n'.join(newLines)

    # replace any line breaks
    content = content.replace(' --linebreak-- ', '<br>')

    return content
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
2019-08-09 11:12:08 +00:00
|
|
|
def validHashTag(hashtag: str) -> bool:
    """Returns true if the given hashtag contains valid characters
    Hashtags of 32 or more characters are never valid. Otherwise the
    hashtag is valid if it only contains digits and the latin letters
    listed below, or if isValidLanguage accepts it
    """
    # long hashtags are not valid
    if len(hashtag) >= 32:
        return False
    permittedChars = frozenset(
        '0123456789'
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '¡¿ÄäÀàÁáÂâÃãÅåǍǎĄąĂăÆæĀā'
        'ÇçĆćĈĉČčĎđĐďðÈèÉéÊêËëĚěĘęĖėĒē'
        'ĜĝĢģĞğĤĥÌìÍíÎîÏïıĪīĮįĴĵĶķ'
        'ĹĺĻļŁłĽľĿŀÑñŃńŇňŅņÖöÒòÓóÔôÕõŐőØøŒœ'
        'ŔŕŘřẞߌśŜŝŞşŠšȘșŤťŢţÞþȚțÜüÙùÚúÛûŰűŨũŲųŮůŪū'
        'ŴŵÝýŸÿŶŷŹźŽžŻż')
    if all(ch in permittedChars for ch in hashtag):
        return True
    # fall back to the language check
    if isValidLanguage(hashtag):
        return True
    return False
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _addHashTags(wordStr: str, httpPrefix: str, domain: str,
                 replaceHashTags: {}, postHashtags: {}) -> bool:
    """Checks whether a word is a hashtag and if so records it
    The html link for the hashtag is stored within replaceHashTags,
    keyed by the word, and its tag is stored within postHashtags,
    keyed by the hashtag text without the first character.
    Returns true if the word was already recorded or is a valid
    hashtag
    """
    if replaceHashTags.get(wordStr):
        # already known
        return True
    # remove the leading character
    tagText = wordStr[1:]
    if not validHashTag(tagText):
        return False
    tagUrl = httpPrefix + "://" + domain + "/tags/" + tagText
    tagJson = {
        'href': tagUrl,
        'name': '#' + tagText,
        'type': 'Hashtag'
    }
    postHashtags[tagText] = tagJson
    tagLink = "<a href=\"" + tagUrl + \
        "\" class=\"mention hashtag\" rel=\"tag\">#<span>" + \
        tagText + "</span></a>"
    replaceHashTags[wordStr] = tagLink
    return True
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _addEmoji(baseDir: str, wordStr: str,
              httpPrefix: str, domain: str,
              replaceEmoji: {}, postTags: {},
              emojiDict: {}) -> bool:
    """Checks whether a word is an emoji of the form :name: and if so
    adds an Emoji tag for it to postTags, keyed by the emoji name
    The name must be valid, must exist within emojiDict and the
    corresponding png image must exist within the emoji directory.
    Returns true if the word is within replaceEmoji or if a tag
    was added
    """
    isDelimited = wordStr.startswith(':') and wordStr.endswith(':')
    if not isDelimited or len(wordStr) < 3:
        return False
    if replaceEmoji.get(wordStr):
        return True
    # remove leading and trailing : characters
    emojiName = wordStr[1:-1]
    # is the text of the emoji valid?
    if not validHashTag(emojiName):
        return False
    if not emojiDict.get(emojiName):
        return False
    imagePath = '/emoji/' + emojiDict[emojiName] + '.png'
    imageFilename = baseDir + imagePath
    if not os.path.isfile(imageFilename):
        return False
    imageUrl = httpPrefix + "://" + domain + imagePath
    iconJson = {
        'mediaType': 'image/png',
        'type': 'Image',
        'url': imageUrl
    }
    postTags[emojiName] = {
        'icon': iconJson,
        'name': ':' + emojiName + ':',
        "updated": fileLastModified(imageFilename),
        "id": imageUrl.replace('.png', ''),
        'type': 'Emoji'
    }
    return True
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
2020-12-13 20:07:45 +00:00
|
|
|
def tagExists(tagType: str, tagName: str, tags: {}) -> bool:
    """Returns true if a tag with the given name and type exists
    within the given collection of tags
    Each item of tags is expected to have 'name' and 'type' fields
    """
    return any(item['name'] == tagName and item['type'] == tagType
               for item in tags)
|
|
|
|
|
|
|
|
|
2021-01-29 21:33:23 +00:00
|
|
|
def _registerMention(wordStr: str, httpPrefix: str,
                     nickname: str, domain: str,
                     replaceMentions: {}, recipients: [],
                     tags: {}) -> None:
    """Records a mention of nickname@domain for the given word
    The actor url is appended to recipients if it isn't already
    there, a Mention tag is stored within tags and the html markup
    for the mention is stored within replaceMentions, both keyed by
    the word
    """
    recipientActor = httpPrefix + "://" + domain + "/@" + nickname
    if recipientActor not in recipients:
        recipients.append(recipientActor)
    tags[wordStr] = {
        'href': recipientActor,
        'name': wordStr,
        'type': 'Mention'
    }
    replaceMentions[wordStr] = \
        "<span class=\"h-card\"><a href=\"" + recipientActor + \
        "\" class=\"u-url mention\">@<span>" + nickname + \
        "</span></a></span>"


def _addMention(wordStr: str, httpPrefix: str, following: str, petnames: str,
                replaceMentions: {}, recipients: [], tags: {}) -> bool:
    """Detects mentions and adds them to the replacements dict and
    recipients list
    following is iterated as a sequence of nick@domain handles, which
    may have line endings, and petnames is indexed by the same
    position as following.
    A word of the form @nick is matched against the nicknames within
    following and then against the petnames. A word of the form
    @nick@domain is accepted if it is within following, or if the
    domain is localhost or contains a dot.
    Returns true if a mention was added
    """
    possibleHandle = wordStr[1:]
    # @nick
    if following and '@' not in possibleHandle:
        # fall back to a best effort match against the following list
        # if no domain was specified. eg. @nick
        possibleNickname = possibleHandle
        for follow in following:
            if '@' not in follow:
                continue
            if possibleNickname != follow.split('@')[0]:
                continue
            followStr = follow.replace('\n', '').replace('\r', '')
            _registerMention(wordStr, httpPrefix,
                             possibleNickname, followStr.split('@')[1],
                             replaceMentions, recipients, tags)
            return True
        # try replacing petnames with mentions
        if not petnames:
            return False
        for followCtr, follow in enumerate(following):
            if followCtr >= len(petnames):
                # there are no petnames for the remaining handles
                break
            if '@' not in follow:
                continue
            pet = petnames[followCtr].replace('\n', '')
            if not pet or possibleNickname != pet:
                continue
            followStr = follow.replace('\n', '').replace('\r', '')
            _registerMention(wordStr, httpPrefix,
                             followStr.split('@')[0],
                             followStr.split('@')[1],
                             replaceMentions, recipients, tags)
            return True
        return False

    if '@' not in possibleHandle:
        return False
    possibleNickname = possibleHandle.split('@')[0]
    if not possibleNickname:
        return False
    possibleDomain = \
        possibleHandle.split('@')[1].strip('\n').strip('\r')
    if not possibleDomain:
        return False

    # handles which are being followed are accepted as they are
    isFollowed = False
    if following:
        for follow in following:
            if follow.replace('\n', '').replace('\r', '') == possibleHandle:
                isFollowed = True
                break

    # @nick@domain
    if not isFollowed:
        if not (possibleDomain == 'localhost' or '.' in possibleDomain):
            return False
    _registerMention(wordStr, httpPrefix,
                     possibleNickname, possibleDomain,
                     replaceMentions, recipients, tags)
    return True
|
2019-08-09 09:09:21 +00:00
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
2020-05-12 09:34:58 +00:00
|
|
|
def replaceContentDuplicates(content: str) -> str:
    """Removes invalid duplicates from the content
    Runs of < or > characters are reduced to a single character and
    any malformed <\\p> tags are removed
    PGP encrypted content or public keys are returned unaltered
    """
    if isPGPEncrypted(content) or containsPGPPublicKey(content):
        return content
    for doubled, single in (('<<', '<'), ('>>', '>')):
        # repeat until no doubled characters remain
        while doubled in content:
            content = content.replace(doubled, single)
    return content.replace('<\\p>', '')
|
|
|
|
|
|
|
|
|
2020-06-14 13:25:38 +00:00
|
|
|
def removeTextFormatting(content: str) -> str:
    """Removes the opening and closing tags for bold, italics, lists,
    emphasis, blockquotes and headings, in both lower and upper case
    Only tags without attributes are removed
    PGP encrypted content or public keys are returned unaltered
    """
    if isPGPEncrypted(content) or containsPGPPublicKey(content):
        return content
    if '<' not in content:
        # no markup
        return content
    formattingTags = ('b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
                      'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5')
    for tagName in formattingTags:
        for tagVariant in (tagName, tagName.upper()):
            for opening in ('<', '</'):
                content = content.replace(opening + tagVariant + '>', '')
    return content
|
|
|
|
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
def removeLongWords(content: str, maxWordLength: int,
                    longWordsList: []) -> str:
    """Breaks up long words so that on mobile screens this doesn't
    disrupt the layout
    Words are the space separated sections of the content. Words
    longer than maxWordLength are either split with a line break or
    truncated, other than those which look like markup, addresses
    or links.
    If longWordsList is supplied and isn't empty then it is used as
    the list of words to be shortened, rather than searching the
    content for them.
    PGP encrypted content or public keys are returned unaltered.
    """
    if isPGPEncrypted(content) or containsPGPPublicKey(content):
        return content
    content = replaceContentDuplicates(content)
    if ' ' not in content:
        # handle a single very long string with no spaces
        # NOTE(review): r'<\p>' is not a closing paragraph tag. It looks
        # as if '</p>' may have been intended here and below - confirm
        # before changing, since the truncated result is returned with
        # this marker appended
        contentStr = content.replace('<p>', '').replace(r'<\p>', '')
        if '://' not in contentStr:
            if len(contentStr) > maxWordLength:
                if '<p>' in content:
                    content = '<p>' + contentStr[:maxWordLength] + r'<\p>'
                else:
                    content = content[:maxWordLength]
                return content
    words = content.split(' ')
    if not longWordsList:
        # an empty list supplied by the caller is replaced by a new
        # list here, so the caller's list is not updated
        longWordsList = []
        for wordStr in words:
            if len(wordStr) > maxWordLength:
                if wordStr not in longWordsList:
                    longWordsList.append(wordStr)
    for wordStr in longWordsList:
        if wordStr.startswith('<p>'):
            wordStr = wordStr.replace('<p>', '')
        # words beginning with other markup are left alone
        if wordStr.startswith('<'):
            continue
        if len(wordStr) == 76:
            if wordStr.upper() == wordStr:
                # tox address
                continue
        # html attribute
        if '=\"' in wordStr:
            continue
        # contains a single @ - presumably a handle or email address
        if '@' in wordStr:
            if '@@' not in wordStr:
                continue
        if '=.ed25519' in wordStr:
            continue
        if '.onion' in wordStr:
            continue
        if '.i2p' in wordStr:
            continue
        # links using various protocols are not shortened
        if 'https:' in wordStr:
            continue
        elif 'http:' in wordStr:
            continue
        elif 'i2p:' in wordStr:
            continue
        elif 'gnunet:' in wordStr:
            continue
        elif 'dat:' in wordStr:
            continue
        elif 'rad:' in wordStr:
            continue
        elif 'hyper:' in wordStr:
            continue
        elif 'briar:' in wordStr:
            continue
        if '<' in wordStr:
            # only keep the text prior to any markup within the word.
            # Note that this removes the trailing markup from the
            # content, such as a closing paragraph tag
            replaceWord = wordStr.split('<', 1)[0]
            # if len(replaceWord) > maxWordLength:
            #     replaceWord = replaceWord[:maxWordLength]
            content = content.replace(wordStr, replaceWord)
            wordStr = replaceWord
        if '/' in wordStr:
            continue
        if len(wordStr[maxWordLength:]) < maxWordLength:
            # less than twice the maximum length, so split the word
            # into two lines
            content = content.replace(wordStr,
                                      wordStr[:maxWordLength] + '\n' +
                                      wordStr[maxWordLength:])
        else:
            # otherwise truncate the word
            content = content.replace(wordStr,
                                      wordStr[:maxWordLength])
    # restore the closing paragraph tag if it was removed above
    if content.startswith('<p>'):
        if not content.endswith('</p>'):
            content = content.strip() + '</p>'
    return content
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _loadAutoTags(baseDir: str, nickname: str, domain: str) -> []:
    """Loads automatic tags file and returns a list containing
    the lines of the file

    The file is autotags.txt within the account directory given by
    acctDir(baseDir, nickname, domain). Lines are returned exactly as
    readlines() supplies them, so they keep their line endings.
    An empty list is returned if the file does not exist or if it
    cannot be read.
    """
    filename = acctDir(baseDir, nickname, domain) + '/autotags.txt'
    if not os.path.isfile(filename):
        return []
    try:
        with open(filename, 'r') as f:
            return f.readlines()
    except OSError:
        # the file can vanish or be unreadable after the isfile check;
        # auto tags are optional so fall back to having none
        print('EX: _loadAutoTags unable to read ' + filename)
    return []
|
|
|
|
|
|
|
|
|
2020-12-22 18:06:23 +00:00
|
|
|
def _autoTag(baseDir: str, nickname: str, domain: str,
|
|
|
|
wordStr: str, autoTagList: [],
|
|
|
|
appendTags: []):
|
2020-09-13 14:42:17 +00:00
|
|
|
"""Generates a list of tags to be automatically appended to the content
|
|
|
|
"""
|
|
|
|
for tagRule in autoTagList:
|
|
|
|
if wordStr not in tagRule:
|
|
|
|
continue
|
|
|
|
if '->' not in tagRule:
|
|
|
|
continue
|
2021-10-07 19:03:01 +00:00
|
|
|
rulematch = tagRule.split('->')[0].strip()
|
|
|
|
if rulematch != wordStr:
|
2020-09-13 14:42:17 +00:00
|
|
|
continue
|
|
|
|
tagName = tagRule.split('->')[1].strip()
|
|
|
|
if tagName.startswith('#'):
|
|
|
|
if tagName not in appendTags:
|
|
|
|
appendTags.append(tagName)
|
|
|
|
else:
|
|
|
|
if '#' + tagName not in appendTags:
|
|
|
|
appendTags.append('#' + tagName)
|
|
|
|
|
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
def addHtmlTags(baseDir: str, httpPrefix: str,
                nickname: str, domain: str, content: str,
                recipients: [], hashtags: {},
                isJsonContent: bool = False) -> str:
    """ Replaces plaintext mentions such as @nick@domain into html
    by matching against known following accounts

    Content which already begins with '<p>' is only passed through
    htmlReplaceEmailQuote and htmlReplaceQuoteMarks. Otherwise the
    content is split into words and each word longer than two
    characters is examined: words beginning with '@' go to _addMention,
    words beginning with '#' go to _addHashTags, other words containing
    ':' go to _addEmoji, and the rest are checked against the rules
    from _loadAutoTags. The replacements gathered are then applied to
    the content, which is returned wrapped in '<p>' ... '</p>' with
    line breaks turned into paragraph boundaries.

    recipients and hashtags are handed to the mention, hashtag and
    emoji helpers (presumably updated by them - verify against those
    helpers). If isJsonContent is True then emoji replacements are
    gathered but not applied to the content.
    """
    if content.startswith('<p>'):
        content = htmlReplaceEmailQuote(content)
        return htmlReplaceQuoteMarks(content)
    maxWordLength = 40
    content = content.replace('\r', '')
    # keep line breaks as a separate word so that they survive the
    # splitting into words and can be restored at the end
    content = content.replace('\n', ' --linebreak-- ')
    content = _addMusicTag(content, 'nowplaying')
    contentSimplified = \
        content.replace(',', ' ').replace(';', ' ').replace('- ', ' ')
    contentSimplified = contentSimplified.replace('. ', ' ').strip()
    if contentSimplified.endswith('.'):
        contentSimplified = contentSimplified[:-1]

    # remove . for words which are not mentions
    words = []
    for wordStr in contentSimplified.split(' '):
        if wordStr.endswith('.'):
            if not wordStr.startswith('@'):
                wordStr = wordStr[:-1]
        if wordStr.startswith('.'):
            wordStr = wordStr[1:]
        words.append(wordStr)

    replaceMentions = {}
    replaceHashTags = {}
    replaceEmoji = {}
    emojiDict = {}
    originalDomain = domain
    domain = removeDomainPort(domain)
    followingFilename = acctDir(baseDir, nickname, domain) + '/following.txt'

    # read the following list so that we can detect just @nick
    # in addition to @nick@domain
    # NOTE(review): this only tests for a word which is exactly '@',
    # not for words containing '@' - confirm that this is intended
    following = None
    petnames = None
    if '@' in words:
        if os.path.isfile(followingFilename):
            with open(followingFilename, 'r') as f:
                following = f.readlines()
            # petnames has to be a list before anything is appended,
            # otherwise the first petname raises AttributeError
            petnames = []
            for handle in following:
                pet = getPetName(baseDir, nickname, domain, handle)
                if pet:
                    petnames.append(pet + '\n')

    # extract mentions and tags from words
    longWordsList = []
    prevWordStr = ''
    autoTagsList = _loadAutoTags(baseDir, nickname, domain)
    appendTags = []
    for wordStr in words:
        wordLen = len(wordStr)
        if wordLen > 2:
            if wordLen > maxWordLength:
                longWordsList.append(wordStr)
            firstChar = wordStr[0]
            if firstChar == '@':
                if _addMention(wordStr, httpPrefix, following, petnames,
                               replaceMentions, recipients, hashtags):
                    prevWordStr = ''
                    continue
            elif firstChar == '#':
                # remove any endings from the hashtag
                hashTagEndings = ('.', ':', ';', '-', '\n')
                for ending in hashTagEndings:
                    if wordStr.endswith(ending):
                        wordStr = wordStr[:len(wordStr) - 1]
                        break

                if _addHashTags(wordStr, httpPrefix, originalDomain,
                                replaceHashTags, hashtags):
                    prevWordStr = ''
                    continue
            elif ':' in wordStr:
                wordStr2 = wordStr.split(':')[1]
                # print('TAG: emoji located - ' + wordStr)
                if not emojiDict:
                    # emoji.json is generated so that it can be customized and
                    # the changes will be retained even if default_emoji.json
                    # is subsequently updated
                    if not os.path.isfile(baseDir + '/emoji/emoji.json'):
                        copyfile(baseDir + '/emoji/default_emoji.json',
                                 baseDir + '/emoji/emoji.json')
                    emojiDict = loadJson(baseDir + '/emoji/emoji.json')

                # print('TAG: looking up emoji for :' + wordStr2 + ':')
                _addEmoji(baseDir, ':' + wordStr2 + ':', httpPrefix,
                          originalDomain, replaceEmoji, hashtags,
                          emojiDict)
            else:
                # try the word by itself, then together with the
                # previous word, against the automatic tag rules
                if _autoTag(baseDir, nickname, domain, wordStr,
                            autoTagsList, appendTags):
                    prevWordStr = ''
                    continue
                if prevWordStr:
                    if _autoTag(baseDir, nickname, domain,
                                prevWordStr + ' ' + wordStr,
                                autoTagsList, appendTags):
                        prevWordStr = ''
                        continue
            prevWordStr = wordStr

    # add any auto generated tags
    for appended in appendTags:
        content = content + ' ' + appended
        _addHashTags(appended, httpPrefix, originalDomain,
                     replaceHashTags, hashtags)

    # replace words with their html versions
    for wordStr, replaceStr in replaceMentions.items():
        content = content.replace(wordStr, replaceStr)
    for wordStr, replaceStr in replaceHashTags.items():
        content = content.replace(wordStr, replaceStr)
    if not isJsonContent:
        for wordStr, replaceStr in replaceEmoji.items():
            content = content.replace(wordStr, replaceStr)

    content = addWebLinks(content)
    if longWordsList:
        content = removeLongWords(content, maxWordLength, longWordsList)
    content = limitRepeatedWords(content, 6)
    content = content.replace(' --linebreak-- ', '</p><p>')
    content = htmlReplaceEmailQuote(content)
    return '<p>' + htmlReplaceQuoteMarks(content) + '</p>'
|
2020-03-22 21:16:02 +00:00
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
|
|
def getMentionsFromHtml(htmlText: str,
                        matchStr="<span class=\"h-card\"><a href=\"") -> []:
    """Extracts mentioned actors from the given html content string

    htmlText is searched for each occurrence of matchStr, and the text
    from there up to the next double quote is taken to be the actor url.
    Only urls beginning with one of the recognised prefixes are kept,
    and each actor is returned once, in order of first appearance.
    An empty list is returned if matchStr does not occur.
    """
    mentions = []
    if matchStr not in htmlText:
        return mentions
    actorPrefixes = ('http', 'gnunet', 'i2p', 'hyper', 'dat:')
    # the first section is whatever precedes the first match, so it is
    # never the value of a mention link and has to be skipped
    for mentionStr in htmlText.split(matchStr)[1:]:
        if '"' not in mentionStr:
            continue
        actorStr = mentionStr.split('"')[0]
        if actorStr.startswith(actorPrefixes):
            if actorStr not in mentions:
                mentions.append(actorStr)
    return mentions
|
2019-11-10 11:37:24 +00:00
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
|
|
def extractMediaInFormPOST(postBytes, boundary, name: str):
    """Pulls the section for the named form field out of the bytes of
    a http form POST

    The section begins at the Content-Disposition header for the given
    field name and runs up to, but does not include, the next
    occurrence of the boundary.
    Returns the section bytes together with the POST bytes which remain
    once that section is removed. If the field is not present then None
    is returned with the POST bytes unaltered. If no boundary follows
    the field then everything from the field onwards is the section.
    """
    fieldMarker = b'Content-Disposition: form-data; name="' + \
        name.encode('utf8', 'ignore') + b'";'
    fieldStart = postBytes.find(fieldMarker)
    if fieldStart < 0:
        return None, postBytes

    # split into what comes before the field and the field onwards
    beforeField = postBytes[:fieldStart]
    fromField = postBytes[fieldStart:]

    # the section ends at the next boundary, if there is one
    sectionEnd = fromField.find(boundary.encode('utf8', 'ignore'))
    if sectionEnd < 0:
        return fromField, beforeField

    # the boundary itself stays with the remaining bytes
    return fromField[:sectionEnd], beforeField + fromField[sectionEnd:]
|
2019-11-10 11:37:24 +00:00
|
|
|
|
2020-04-02 09:56:17 +00:00
|
|
|
|
|
|
|
def saveMediaInFormPOST(mediaBytes, debug: bool,
                        filenameBase: str = None) -> (str, str):
    """Saves the given media bytes extracted from http form POST
    Returns the filename and attachment type

    The media format is detected from the first recognised Content-Type
    header within mediaBytes, and the bytes following that header are
    written to filenameBase plus the matching extension. The attachment
    type is the part of the content type before the '/', for example
    'image' or 'audio'.

    If mediaBytes is empty then any existing files for filenameBase are
    removed. None, None is returned if there is no media, no recognised
    content type, no filenameBase, no bytes after the header, an svg
    which cannot be decoded or is considered dangerous, or if the file
    could not be written.
    """
    if not mediaBytes:
        if filenameBase:
            # remove any existing files
            for ex in getImageExtensions():
                _removeExistingMedia(filenameBase + '.' + ex, debug,
                                     'unable to delete other ')
            _removeExistingMedia(filenameBase, debug,
                                 'unable to delete ')

        if debug:
            print('DEBUG: No media found within POST')
        return None, None

    mediaLocation = -1
    searchStr = ''
    filename = None

    # directly search the binary array for the beginning
    # of an image
    extensionList = {
        'png': 'image/png',
        'jpeg': 'image/jpeg',
        'gif': 'image/gif',
        'svg': 'image/svg+xml',
        'webp': 'image/webp',
        'avif': 'image/avif',
        'mp4': 'video/mp4',
        'ogv': 'video/ogv',
        'mp3': 'audio/mpeg',
        'ogg': 'audio/ogg',
        'flac': 'audio/flac',
        'zip': 'application/zip'
    }
    detectedExtension = None
    attachmentMediaType = None
    for extension, contentType in extensionList.items():
        searchStr = b'Content-Type: ' + contentType.encode('utf8', 'ignore')
        mediaLocation = mediaBytes.find(searchStr)
        if mediaLocation > -1:
            # image/video/audio binaries
            if extension == 'jpeg':
                extension = 'jpg'
            if filenameBase:
                filename = filenameBase + '.' + extension
            attachmentMediaType = contentType.split('/')[0]
            detectedExtension = extension
            break

    if not filename:
        return None, None

    # locate the beginning of the image, after any
    # carriage returns
    startPos = mediaLocation + len(searchStr)
    for offset in range(1, 8):
        if startPos + offset >= len(mediaBytes):
            # the bytes end before any media content begins
            break
        if mediaBytes[startPos + offset] not in (10, 13):
            startPos += offset
            break

    if startPos >= len(mediaBytes):
        # there is a content type header but nothing after it
        if debug:
            print('DEBUG: No media content found within POST')
        return None, None

    # remove any existing image files with a different format
    if detectedExtension != 'zip':
        for ex in getImageExtensions():
            if ex == detectedExtension:
                continue
            possibleOtherFormat = \
                filename.replace('.temp', '').replace('.' +
                                                      detectedExtension, '.' +
                                                      ex)
            _removeExistingMedia(possibleOtherFormat, debug,
                                 'unable to delete other 2 ')

    # don't allow scripts within svg files
    if detectedExtension == 'svg':
        try:
            svgStr = mediaBytes[startPos:].decode()
        except UnicodeDecodeError:
            # an svg which cannot be decoded cannot be checked
            print('WARN: svg media could not be decoded: ' + filename)
            return None, None
        if dangerousSVG(svgStr, False):
            return None, None

    try:
        with open(filename, 'wb') as fp:
            fp.write(mediaBytes[startPos:])
    except OSError:
        print('WARN: Media file could not be written to file: ' + filename)
        return None, None

    if not os.path.isfile(filename):
        print('WARN: Media file could not be written to file: ' + filename)
        return None, None
    print('Uploaded media file written: ' + filename)

    return filename, attachmentMediaType


def _removeExistingMedia(filename: str, debug: bool,
                         description: str) -> None:
    """Deletes the given file if it exists. A failure to delete is
    only reported, prefixed by the description, when debug is set
    """
    if not os.path.isfile(filename):
        return
    try:
        os.remove(filename)
    except OSError:
        if debug:
            print('EX: saveMediaInFormPOST ' +
                  description +
                  str(filename))
|
|
|
|
|
2019-11-10 11:37:24 +00:00
|
|
|
|
2021-06-20 11:28:35 +00:00
|
|
|
def extractTextFieldsInPOST(postBytes, boundary: str, debug: bool,
                            unitTestData: str = None) -> {}:
    """Returns a dictionary of the text fields within a http form POST,
    keyed by field name
    The boundary argument comes from the http header

    If unitTestData is given then it is used as the form text in place
    of decoding postBytes. The form text is divided up by the boundary
    and, within each section, the field value is taken from the third
    line onwards, leaving out the final line, with the lines joined by
    newlines and then url-unquoted.
    A section whose content contains a semicolon is skipped unless its
    field is one of those where semicolons are allowed or its name
    begins with 'edited'.
    """
    if unitTestData:
        formText = unitTestData
    else:
        parsedPost = email.parser.BytesParser().parsebytes(postBytes)
        formText = parsedPost.get_payload(decode=True).decode('utf-8')

    if debug:
        print('DEBUG: POST arriving ' + formText)

    semicolonPermitted = (
        'message', 'bio', 'autoCW', 'password', 'passwordconfirm',
        'instanceDescription', 'instanceDescriptionShort',
        'subject', 'location', 'imageDescription'
    )
    fields = {}
    # examine each section of the POST, separated by the boundary
    for section in formText.split(boundary):
        if section == '--' or ' name="' not in section:
            continue
        afterName = section.split(' name="', 1)[1]
        if '"' not in afterName:
            continue
        postKey, postValueStr = afterName.split('"', 1)
        if ';' in postValueStr:
            semicolonOk = \
                postKey in semicolonPermitted or postKey.startswith('edited')
            if not semicolonOk:
                continue
        if '\r\n' not in postValueStr:
            continue
        # the first two lines are the rest of the header line and a
        # blank line, and the last line precedes the next boundary
        valueLines = postValueStr.split('\r\n')[2:-1]
        fields[postKey] = urllib.parse.unquote('\n'.join(valueLines))
    return fields
|
2021-07-10 09:38:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
def limitRepeatedWords(text: str, maxRepeats: int) -> str:
    """Shortens runs of the same word repeated many times in succession

    The text is divided into words at spaces and newlines. Wherever a
    word is followed by more than maxRepeats further copies of itself,
    the run, written with single spaces between the words, is replaced
    within the text by maxRepeats copies of the word.
    One replacement is remembered per distinct word, the last such run
    found being the one which is used.
    """
    runWord = ''
    extraCopies = 0
    runText = ''
    substitutions = {}
    # None never equals a word, so it closes any run still in progress
    # when the end of the text is reached
    for token in text.replace('\n', ' ').split(' ') + [None]:
        if token == runWord:
            extraCopies += 1
            if runText:
                runText = runText + ' ' + token
            else:
                runText = token + ' ' + token
        else:
            if extraCopies > maxRepeats:
                limitedText = ((runWord + ' ') * maxRepeats).strip()
                substitutions[runWord] = [runText, limitedText]
            extraCopies = 0
            runText = ''
        runWord = token

    for longRun, limitedText in substitutions.values():
        text = text.replace(longRun, limitedText)
    return text
|
2021-08-07 17:03:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
def getPriceFromString(priceStr: str) -> (str, str):
    """Splits a price string into its amount and its currency name

    Each symbol and name pair from getCurrencies() is tried in turn.
    If the symbol, or failing that the name, occurs within priceStr
    then it is removed, and if what is left passes isfloat() then that
    remainder is returned along with the currency name.
    If no currency produces a number then priceStr is returned with
    "EUR" when it passes isfloat(), and otherwise "0.00" with "EUR".
    """
    for symbol, name in getCurrencies().items():
        if symbol in priceStr:
            marker = symbol
        elif name in priceStr:
            marker = name
        else:
            continue
        amount = priceStr.replace(marker, '')
        if isfloat(amount):
            return amount, name
    if not isfloat(priceStr):
        return "0.00", "EUR"
    return priceStr, "EUR"
|
2021-10-14 15:12:35 +00:00
|
|
|
|
|
|
|
|
2021-10-14 15:40:19 +00:00
|
|
|
def _wordsSimilarityHistogram(words: []) -> {}:
|
|
|
|
"""Returns a histogram for word combinations
|
|
|
|
"""
|
|
|
|
histogram = {}
|
|
|
|
for index in range(1, len(words)):
|
|
|
|
combinedWords = words[index - 1] + words[index]
|
|
|
|
if histogram.get(combinedWords):
|
|
|
|
histogram[combinedWords] += 1
|
|
|
|
else:
|
|
|
|
histogram[combinedWords] = 1
|
|
|
|
return histogram
|
|
|
|
|
|
|
|
|
2021-10-14 15:53:04 +00:00
|
|
|
def _wordsSimilarityWordsList(content: str) -> []:
    """Returns a list of words for the given content

    The content is passed through removeHtml() and converted to lower
    case, then the punctuation characters listed below are replaced
    by spaces before the text is divided into words.
    """
    removePunctuation = ('.', ',', ';', '-', ':', '"')
    content = removeHtml(content).lower()
    for p in removePunctuation:
        content = content.replace(p, ' ')
    # split() without an argument treats any run of whitespace as one
    # separator, so that removed punctuation does not leave behind
    # empty strings within the list of words
    return content.split()
|
|
|
|
|
|
|
|
|
2021-10-14 15:12:35 +00:00
|
|
|
def wordsSimilarity(content1: str, content2: str, minWords: int) -> int:
    """Returns percentage similarity

    Identical content is 100. Otherwise both contents are turned into
    lists of words, and if either has fewer than minWords words then
    the result is 0. The pairs of adjacent words within content1 are
    then compared with those of content2: a pair absent from content2
    counts as one difference, and a pair present in both counts as the
    difference between its number of occurrences. The total is scaled
    by the number of distinct pairs within content1.
    The result is never below 0, and is 0 if content1 has no word pairs.
    """
    if content1 == content2:
        return 100

    words1 = _wordsSimilarityWordsList(content1)
    if len(words1) < minWords:
        return 0

    words2 = _wordsSimilarityWordsList(content2)
    if len(words2) < minWords:
        return 0

    histogram1 = _wordsSimilarityHistogram(words1)
    if not histogram1:
        # with fewer than two words there are no pairs to compare,
        # and the scaling below would divide by zero
        return 0
    histogram2 = _wordsSimilarityHistogram(words2)

    diff = 0
    for combinedWords, hits1 in histogram1.items():
        hits2 = histogram2.get(combinedWords)
        if not hits2:
            diff += 1
        else:
            diff += abs(hits2 - hits1)
    # large differences in the number of occurrences can push the
    # total beyond 100 percent, so don't return a negative similarity
    return max(0, 100 - int(diff * 100 / len(histogram1)))
|
2021-10-26 16:06:22 +00:00
|
|
|
|
|
|
|
|
|
|
|
def containsInvalidLocalLinks(content: str) -> bool:
    """Returns true if the content contains any of the listed names
    in the form of a url parameter directly after a '?', such as
    ?mute= or ?delete=
    """
    disallowedParams = (
        'mute', 'unmute', 'editeventpost', 'notifypost',
        'delete', 'options', 'page', 'repeat',
        'bm', 'tl', 'actor', 'unrepeat',
        'unannounce', 'like', 'unlike', 'bookmark',
        'unbookmark', 'likedBy', 'id', 'time',
        'year', 'month', 'day', 'editnewpost',
        'graph', 'showshare', 'category', 'showwanted',
        'rmshare', 'rmwanted', 'repeatprivate',
        'unrepeatprivate', 'replyto',
        'replyfollowers', 'replydm', 'editblogpost',
        'handle', 'blockdomain'
    )
    return any('?' + param + '=' in content
               for param in disallowedParams)
|