def _print_rejected_post_id(reason: str, post_object: {}) -> None:
    """Prints the id of a rejected post, prefixed by the rejection reason.
    Nothing is printed if the post has no id. The id is converted with
    str() so that a malformed (non-string) id cannot raise an exception
    while the rejection is being logged
    """
    post_id = post_object.get('id')
    if post_id:
        print(reason + ': ' + str(post_id))


def _valid_post_timestamp(post_object: {}, field_name: str, debug: bool,
                          report_non_zulu: bool) -> bool:
    """Checks the date within the given field of a post object.
    The value must be a string containing both 'T' and 'Z'. Any fractional
    seconds are removed and the shortened value is written back into
    post_object before it is passed to valid_post_date.
    Returns False if the date is rejected
    """
    timestamp = post_object[field_name]
    # the value arrives from a remote server, so it may not be a string
    if not isinstance(timestamp, str):
        return False
    if 'T' not in timestamp:
        return False
    if 'Z' not in timestamp:
        if report_non_zulu:
            print('REJECT inbox post does not use Zulu time format. ' +
                  timestamp)
        return False
    if '.' in timestamp:
        # converts 2022-03-30T17:37:58.734Z into 2022-03-30T17:37:58Z
        timestamp = timestamp.split('.')[0] + 'Z'
        post_object[field_name] = timestamp
    # NOTE(review): 90 is presumably a maximum age in days - confirm
    # against valid_post_date
    if not valid_post_date(timestamp, 90, debug):
        return False
    return True


def _valid_post_content(base_dir: str, nickname: str, domain: str,
                        message_json: {}, max_mentions: int, max_emoji: int,
                        allow_local_network_access: bool, debug: bool,
                        system_language: str,
                        http_prefix: str, domain_full: str,
                        person_cache: {},
                        max_hashtags: int) -> bool:
    """Is the content of a received post valid?
    Check the published and updated dates
    Check for bad html within the content warning and the content
    Check if it's a git patch
    Check for hellthreads, and excessive emoji, hashtags or tags
    Check that the language is understood
    Check url lengths, filtered words, whether replies are permitted
    and for malformed ciphertext

    Returns True if the post is accepted. Posts without an object
    dictionary, or whose object has no content, are accepted without any
    further checks, as are posts recognised as git patches.
    Returns False if any check rejects the post.

    Note that message_json may be altered: fractional seconds are removed
    from the published and updated dates, and a tag field which is not a
    list is replaced by an empty list
    """
    if not has_object_dict(message_json):
        return True
    post_object = message_json['object']
    if 'content' not in post_object:
        return True

    if not post_object.get('published'):
        return False
    if not _valid_post_timestamp(post_object, 'published', debug, True):
        return False

    # if the post has been edited then check its edit date
    if post_object.get('updated'):
        if not _valid_post_timestamp(post_object, 'updated', debug, False):
            return False

    summary = None
    if post_object.get('summary'):
        summary = post_object['summary']
        if not isinstance(summary, str):
            print('WARN: content warning is not a string')
            return False
        if summary != valid_content_warning(summary):
            print('WARN: invalid content warning ' + summary)
            return False
        if dangerous_markup(summary, allow_local_network_access):
            _print_rejected_post_id('REJECT ARBITRARY HTML', post_object)
            print('REJECT ARBITRARY HTML: bad string in summary - ' +
                  summary)
            return False

    # check for patches before dangerous_markup, which excludes code
    if is_git_patch(base_dir, nickname, domain,
                    post_object['type'],
                    summary,
                    post_object['content']):
        return True

    if is_question(message_json):
        if is_question_filtered(base_dir, nickname, domain,
                                system_language, message_json):
            print('REJECT: incoming question options filter')
            return False
        if dangerous_question(message_json, allow_local_network_access):
            print('REJECT: incoming question markup filter')
            return False

    content_str = get_base_content_from_post(message_json, system_language)
    if dangerous_markup(content_str, allow_local_network_access):
        _print_rejected_post_id('REJECT ARBITRARY HTML', post_object)
        print('REJECT ARBITRARY HTML: bad string in post - ' +
              content_str)
        return False

    # check (rough) number of mentions
    mentions_est = _estimate_number_of_mentions(content_str)
    if mentions_est > max_mentions:
        _print_rejected_post_id('REJECT HELLTHREAD', post_object)
        print('REJECT HELLTHREAD: Too many mentions in post - ' +
              content_str)
        return False
    if _estimate_number_of_emoji(content_str) > max_emoji:
        _print_rejected_post_id('REJECT EMOJI OVERLOAD', post_object)
        print('REJECT EMOJI OVERLOAD: Too many emoji in post - ' +
              content_str)
        return False
    if _estimate_number_of_hashtags(content_str) > max_hashtags:
        _print_rejected_post_id('REJECT HASHTAG OVERLOAD', post_object)
        print('REJECT HASHTAG OVERLOAD: Too many hashtags in post - ' +
              content_str)
        return False

    # check number of tags
    if post_object.get('tag'):
        if not isinstance(post_object['tag'], list):
            post_object['tag'] = []
        elif len(post_object['tag']) > int(max_mentions * 2):
            _print_rejected_post_id('REJECT', post_object)
            # the tags are a list, so convert them before concatenating.
            # Without str() this line raised a TypeError rather than
            # rejecting the post
            print('REJECT: Too many tags in post - ' +
                  str(post_object['tag']))
            return False

    # check that the post is in a language suitable for this account
    if not understood_post_language(base_dir, nickname,
                                    message_json, system_language,
                                    http_prefix, domain_full,
                                    person_cache):
        return False

    # check for urls which are too long
    if not valid_url_lengths(content_str, 2048):
        print('REJECT: url within content too long')
        return False

    # check for filtered content
    media_descriptions = get_media_descriptions_from_post(message_json)
    content_all = content_str
    if summary:
        content_all = summary + ' ' + content_str + ' ' + media_descriptions
    elif media_descriptions:
        # media descriptions are also filtered when there is no content
        # warning, otherwise filtered words within them would get through
        content_all = content_str + ' ' + media_descriptions
    if is_filtered(base_dir, nickname, domain, content_all,
                   system_language):
        print('REJECT: content filtered')
        return False

    # reject replies to local posts which do not allow comments
    reply_to = post_object.get('inReplyTo')
    if reply_to and isinstance(reply_to, str):
        post_post_filename = locate_post(base_dir, nickname, domain,
                                         reply_to)
        if post_post_filename:
            if not _post_allow_comments(post_post_filename):
                print('REJECT: reply to post which does not ' +
                      'allow comments: ' + reply_to)
                return False

    if invalid_ciphertext(post_object['content']):
        # the id is optional, so don't assume that it exists
        print('REJECT: malformed ciphertext in content ' +
              str(post_object.get('id', '')) + ' ' +
              str(post_object['content']))
        return False

    if debug:
        print('ACCEPT: post content is valid')
    return True