diff --git a/daemon.py b/daemon.py index a9ecabc9d..4f25f5e7e 100644 --- a/daemon.py +++ b/daemon.py @@ -837,6 +837,12 @@ def run_daemon(accounts_data_dir: str, # cache for automatic content warnings httpd.auto_cw_cache = load_auto_cw_cache(base_dir) + # loads a catalog of http header fields + headers_catalog_fieldname = data_dir(base_dir) + '/headers_catalog.json' + httpd.headers_catalog = {} + if os.path.isfile(headers_catalog_fieldname): + httpd.headers_catalog = load_json(headers_catalog_fieldname) + # list of websites which are currently down httpd.sites_unavailable = load_unavailable_sites(base_dir) diff --git a/daemon_get.py b/daemon_get.py index 13e71d766..cfbbb8f97 100644 --- a/daemon_get.py +++ b/daemon_get.py @@ -67,6 +67,7 @@ from daemon_utils import has_accept from daemon_utils import show_person_options from daemon_utils import is_authorized from daemon_utils import get_user_agent +from httpheaders import update_headers_catalog from httpheaders import set_headers_etag from httpheaders import login_headers from httpheaders import redirect_headers @@ -243,6 +244,11 @@ def daemon_http_get(self) -> None: calling_domain = self.server.domain_full + # record header fields encountered + update_headers_catalog(self.server.base_dir, + self.server.headers_catalog, + self.headers) + if self.headers.get('Server'): if self.headers['Server'] in corp_servers(): print('GET HTTP Corporate leech bounced: ' + diff --git a/daemon_post.py b/daemon_post.py index c9a0929e0..14aae2c9b 100644 --- a/daemon_post.py +++ b/daemon_post.py @@ -34,6 +34,7 @@ from httpcodes import http_400 from httpcodes import http_402 from httpcodes import http_404 from httpcodes import http_503 +from httpheaders import update_headers_catalog from httpheaders import redirect_headers from daemon_utils import get_user_agent from daemon_utils import post_to_outbox @@ -87,6 +88,10 @@ def daemon_http_post(self) -> None: ' path: ' + self.path + ' busy: ' + str(self.server.postreq_busy)) + 
def update_headers_catalog(base_dir: str, headers_catalog: {},
                           headers: {},
                           max_entries: int = 1024) -> None:
    """Records http header fields which have not been seen before
    This allows us to spot anything unexpected for later investigation

    base_dir: instance base directory, used to locate the data directory
    headers_catalog: dict of field name -> first value seen for that
        field. It is updated in place. If it is not a dict then
        nothing is done
    headers: the header fields of the current request. Anything
        providing items() which yields (name, value) pairs
    max_entries: upper limit on the number of catalog entries

    The catalog is only saved to file when a new field was added
    """
    if not isinstance(headers_catalog, dict) or not headers:
        # there is nothing to update, or nowhere to record it
        return

    # the values of these fields are credentials, so only the field
    # name is recorded and the value is never written to file
    sensitive_fields = ('authorization', 'proxy-authorization', 'cookie')

    # lowercased names of the known fields, created on first use so
    # that requests containing only known fields stay cheap
    known_lower = None
    changed = False
    for fieldname, fieldvalue in headers.items():
        # the common case: this exact field name is already known
        if fieldname in headers_catalog:
            continue
        # http field names are case-insensitive, so Host and host
        # are the same field and should only be recorded once
        if known_lower is None:
            known_lower = {str(name).lower() for name in headers_catalog}
        fieldname_lower = str(fieldname).lower()
        if fieldname_lower in known_lower:
            continue
        # field names are chosen by the client, so without a limit
        # the catalog could be grown indefinitely
        if len(headers_catalog) >= max_entries:
            break
        if fieldname_lower in sensitive_fields:
            fieldvalue = '[redacted]'
        headers_catalog[fieldname] = fieldvalue
        known_lower.add(fieldname_lower)
        changed = True

    if changed:
        # the filename is only needed if there is something to save
        headers_catalog_filename = \
            data_dir(base_dir) + '/headers_catalog.json'
        save_json(headers_catalog, headers_catalog_filename)