From 9ccc3af807988eb6725df8a8cdeb5fb0fc979d4a Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Thu, 12 Oct 2023 15:34:49 +0100
Subject: [PATCH] Block corporate LLM scrapers

---
Review notes (ignored by git am):
- Line structure and indentation were restored from a whitespace-collapsed
  copy of this patch; context-line indentation is reconstructed, so use
  `git apply --ignore-whitespace` if a hunk fails to match.
- _402 now sends the reason phrase "Payment Required" (402) instead of
  "Unauthorized" (401), and falls back to the untranslated text when the
  translations have no entry, rather than raising KeyError.
- corp_servers is annotated and documented as the tuple it returns.
- The post-image blob hashes on the index lines predate these fixes.

 daemon.py | 17 +++++++++++++++++
 utils.py  |  7 +++++++
 2 files changed, 24 insertions(+)

diff --git a/daemon.py b/daemon.py
index 0087db701..58604edba 100644
--- a/daemon.py
+++ b/daemon.py
@@ -300,6 +300,7 @@ from languages import set_actor_languages
 from languages import get_understood_languages
 from like import update_likes_collection
 from reaction import update_reaction_collection
+from utils import corp_servers
 from utils import get_attributed_to
 from utils import get_memorials
 from utils import set_memorials
@@ -1243,6 +1244,15 @@ class PubServer(BaseHTTPRequestHandler):
             self._http_return_code(401, 'Unauthorized',
                                    post_msg, None)
 
+    def _402(self, post_msg: str) -> None:
+        # 402 is "Payment Required" (RFC 9110), not "Unauthorized"
+        reason = 'Payment Required'
+        if self.server.translate:
+            # no translations ship with this patch, so fall back to English
+            reason = self.server.translate.get(reason, reason)
+            post_msg = self.server.translate.get(post_msg, post_msg)
+        self._http_return_code(402, reason, post_msg, None)
+
     def _201(self, etag: str) -> None:
         if self.server.translate:
             done_str = self.server.translate['It is done']
@@ -17233,6 +17243,13 @@ class PubServer(BaseHTTPRequestHandler):
 
         calling_domain = self.server.domain_full
 
+        if self.headers.get('Server'):
+            if self.headers['Server'] in corp_servers():
+                self._402("If you are a BigTech corp trying to steal " +
+                          "data then it's time to see the color of " +
+                          "your money")
+                return
+
         if self.headers.get('Host'):
             calling_domain = decoded_host(self.headers['Host'])
             if self.server.onion_domain:
diff --git a/utils.py b/utils.py
index 011933bee..dad83c6d6 100644
--- a/utils.py
+++ b/utils.py
@@ -4657,3 +4657,10 @@ def lines_in_file(filename: str) -> int:
         except OSError:
             print('EX: lines_in_file error reading ' + filename)
     return 0
+
+
+def corp_servers() -> tuple:
+    """Returns a tuple of despised corporate thieves
+    """
+    return ('GitHub.com', 'github.com', 'cloudflare', 'microsoft.com',
+            'google.com')