def dangerousMarkup(content: str) -> bool:
    """Returns true if the given content contains dangerous html markup

    Each '<...>' sequence found in the content is treated as a tag.
    The tag name is the first token after the '<', ignoring a leading
    '/' on closing tags, and it ends at whitespace, '/' or '>'.
    The content is considered dangerous if any tag name contains one
    of the disallowed strings.

    Matching is case-insensitive, so '<SCRIPT>' is rejected just like
    '<script>'. Only the tag name is examined: attribute values such
    as href="https://example.com/body building" do not cause the
    content to be rejected.

    content: the html or plain text of a post
    Returns True if a disallowed tag was found, otherwise False
    """
    if '<' not in content or '>' not in content:
        return False

    invalidStrings = ('script', 'canvas', 'style', 'abbr',
                      'frame', 'iframe', 'html', 'body',
                      'hr', 'br')

    # The first section is whatever precedes the first '<', so it can
    # never be a tag and is skipped
    for section in content.split('<')[1:]:
        if '>' not in section:
            # no closing bracket, so this is not a complete tag
            continue
        markup = section.split('>', 1)[0].lower()
        # A tag name can be terminated by any whitespace character or
        # by '/', not only by a space. Replacing '/' also removes the
        # leading slash of closing tags such as '/script'
        tokens = markup.replace('/', ' ').split()
        if not tokens:
            continue
        tagName = tokens[0]
        # Substring matching on the tag name is deliberate, so that
        # names such as 'frameset' or 'noscript' are also rejected
        if any(badStr in tagName for badStr in invalidStrings):
            return True
    return False
Trump -> The Orange Menace """ diff --git a/inbox.py b/inbox.py index 5164365a..20496828 100644 --- a/inbox.py +++ b/inbox.py @@ -63,6 +63,7 @@ from media import replaceYouTube from git import isGitPatch from git import receiveGitPatch from followingCalendar import receivingCalendarEvents +from content import dangerousMarkup def storeHashTags(baseDir: str, nickname: str, postJsonObject: {}) -> None: @@ -1599,22 +1600,20 @@ def validPostContent(baseDir: str, nickname: str, domain: str, return False if 'Z' not in messageJson['object']['published']: return False + if isGitPatch(baseDir, nickname, domain, messageJson['object']['type'], messageJson['object']['summary'], messageJson['object']['content']): return True - # check for bad html - invalidStrings = ('', '', - '', '', - '', '', '
', '
') - for badStr in invalidStrings: - if badStr in messageJson['object']['content']: - if messageJson['object'].get('id'): - print('REJECT ARBITRARY HTML: ' + messageJson['object']['id']) - print('REJECT ARBITRARY HTML: bad string in post - ' + - messageJson['object']['content']) - return False + + if dangerousMarkup(messageJson['object']['content']): + if messageJson['object'].get('id'): + print('REJECT ARBITRARY HTML: ' + messageJson['object']['id']) + print('REJECT ARBITRARY HTML: bad string in post - ' + + messageJson['object']['content']) + return False + # check (rough) number of mentions mentionsEst = estimateNumberOfMentions(messageJson['object']['content']) if mentionsEst > maxMentions: diff --git a/tests.py b/tests.py index 55b0f758..12faa463 100644 --- a/tests.py +++ b/tests.py @@ -64,6 +64,7 @@ from media import getAttachmentMediaType from delete import sendDeleteViaServer from inbox import validInbox from inbox import validInboxFilenames +from content import dangerousMarkup from content import removeHtml from content import addWebLinks from content import replaceEmojiFromTags @@ -1882,8 +1883,40 @@ def testRemoveHtml(): assert(removeHtml(testStr) == 'This string has html.') +def testDangerousMarkup(): + print('testDangerousMarkup') + content = '

This is a valid message

' + assert(not dangerousMarkup(content)) + content = 'This is a valid message without markup' + assert(not dangerousMarkup(content)) + content = '

This is a valid-looking message. But wait... ' + \ + '

' + assert(dangerousMarkup(content)) + content = '

This is a valid-looking message. But wait... ' + \ + '