From e200f822d32f40c3ca646852fada6eb3865877be Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Tue, 20 Aug 2024 10:08:04 +0100 Subject: [PATCH] Return 404 for images requested by LLM crawlers --- daemon_get.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/daemon_get.py b/daemon_get.py index 27c7c1187..d2daaf235 100644 --- a/daemon_get.py +++ b/daemon_get.py @@ -273,6 +273,9 @@ def daemon_http_get(self) -> None: # oai-host-hash requests come from Microsoft Corporation, # which has a long term partnership with OpenAI if 'oai-host-hash' in self.headers: + if is_image_file(self.path): + http_404(self) + return print('GET HTTP LLM scraper poisoned: ' + str(self.headers)) msg = html_poisoned(self.server.dictionary, self.server.twograms) @@ -346,6 +349,9 @@ def daemon_http_get(self) -> None: self.path, self.server.block_military) if block: if llm: + if is_image_file(self.path): + http_404(self) + return # if this is an LLM crawler then feed it some trash print('GET HTTP LLM scraper poisoned: ' + str(self.headers)) msg = html_poisoned(self.server.dictionary,