From 9d3bad77e1b6cebce1589087e282b95e24d9c831 Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Sat, 17 Aug 2024 21:15:08 +0100
Subject: [PATCH] Poison some other LLM scrapers

---
 crawlers.py    | 46 +++++++++++++++++++++++++++++++---------------
 daemon_get.py  | 11 ++++++++++-
 daemon_post.py |  2 +-
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/crawlers.py b/crawlers.py
index 10478791f..fa34efed0 100644
--- a/crawlers.py
+++ b/crawlers.py
@@ -110,32 +110,49 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
                        blocked_cache_update_secs: int,
                        crawlers_allowed: [],
                        known_bots: [], path: str,
-                       block_military: {}) -> (bool,):
+                       block_military: {}):
     """Should a GET or POST be blocked based upon its user agent?
     """
     if not agent_str:
-        return True, blocked_cache_last_updated
+        return True, blocked_cache_last_updated, False
 
     agent_str_lower = agent_str.lower()
     for ua_block in default_user_agent_blocks:
         if ua_block in agent_str_lower:
             print('BLOCK: Blocked User agent 1: ' + ua_block)
-            return True, blocked_cache_last_updated
+            return True, blocked_cache_last_updated, False
 
     agent_domain = None
     if agent_str:
+        contains_bot_string = False
+        llm = False
+
+        # is this an LLM crawler?
+        llm_bot_strings = (
+            'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt',
+            'anthropic', 'mlbot'
+        )
+        for bot_str in llm_bot_strings:
+            if bot_str in agent_str_lower:
+                if '://bot' not in agent_str_lower and \
+                   '://robot' not in agent_str_lower and \
+                   '://spider' not in agent_str_lower and \
+                   'pixelfedbot/' not in agent_str_lower:
+                    contains_bot_string = True
+                    llm = True
+                    break
+
         # is this a web crawler? If so then block it by default
         # unless this is a news instance or if it is in the allowed list
         bot_strings = (
            'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
-           '/robot', 'gptbot', '-ai/', ' ai/', '-ai ',
-           ' ai ', 'spider/', 'spider.ht', '/spider.', '-spider',
-           'externalhit/', 'chatgpt', 'google', 'anthropic',
+           '/robot', 'spider/', 'spider.ht', '/spider.', '-spider',
+           'externalhit/', 'google',
            'facebook', 'slurp', 'crawler', 'crawling', ' crawl ',
            'gigablast', 'archive.org', 'httrack',
            'spider-', ' spider ', 'findlink', 'ips-agent',
-           'woriobot', 'mlbot', 'webbot', 'webcrawl',
+           'woriobot', 'webbot', 'webcrawl',
            'voilabot', 'rank/', 'ezooms', 'heritrix', 'indeedbot',
            'woobot', 'infobot', 'viewbot', 'swimgbot', 'eright',
            'apercite', 'bot (', 'summify', 'ccbot', 'linkfind',
@@ -143,7 +160,6 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
            'drupact', 'searchengine', 'coccoc', 'explorer/',
            'explorer;', 'crystalsemantics', 'scraper/',
            ' scraper ', ' scrape ', 'scraping')
-        contains_bot_string = False
        for bot_str in bot_strings:
            if bot_str in agent_str_lower:
                if '://bot' not in agent_str_lower and \
@@ -160,18 +176,18 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
             # if this is a news instance then we want it
             # to be indexed by search engines
             if news_instance:
-                return False, blocked_cache_last_updated
+                return False, blocked_cache_last_updated, llm
             # is this crawler allowed?
             for crawler in crawlers_allowed:
                 if crawler.lower() in agent_str_lower:
-                    return False, blocked_cache_last_updated
+                    return False, blocked_cache_last_updated, llm
             print('BLOCK: Blocked Crawler: ' + agent_str)
-            return True, blocked_cache_last_updated
+            return True, blocked_cache_last_updated, llm
         # get domain name from User-Agent
         agent_domain = user_agent_domain(agent_str, debug)
     else:
         # no User-Agent header is present
-        return True, blocked_cache_last_updated
+        return True, blocked_cache_last_updated, False
 
     # is the User-Agent type blocked? eg. "Mastodon"
     if user_agents_blocked:
@@ -181,10 +197,10 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
                 blocked_ua = True
                 break
         if blocked_ua:
-            return True, blocked_cache_last_updated
+            return True, blocked_cache_last_updated, False
 
     if not agent_domain:
-        return False, blocked_cache_last_updated
+        return False, blocked_cache_last_updated, False
 
     # is the User-Agent domain blocked
     blocked_ua = False
@@ -225,4 +241,4 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
                                   agent_domain)
             break
 
-    return blocked_ua, blocked_cache_last_updated
+    return blocked_ua, blocked_cache_last_updated, False
diff --git a/daemon_get.py b/daemon_get.py
index 94ff6a7e1..35e121060 100644
--- a/daemon_get.py
+++ b/daemon_get.py
@@ -320,7 +320,7 @@ def daemon_http_get(self) -> None:
     ua_str = get_user_agent(self)
 
     if not _permitted_crawler_path(self.path):
-        block, self.server.blocked_cache_last_updated = \
+        block, self.server.blocked_cache_last_updated, llm = \
             blocked_user_agent(calling_domain, ua_str,
                                self.server.news_instance,
                                self.server.debug,
@@ -334,6 +334,15 @@ def daemon_http_get(self) -> None:
                                self.server.known_bots, self.path,
                                self.server.block_military)
         if block:
+            if llm:
+                msg = html_poisoned(self.server.dictionary)
+                msg = msg.encode('utf-8')
+                msglen = len(msg)
+                set_headers(self, 'text/html', msglen,
+                            '', calling_domain, False)
+                write2(self, msg)
+                print('GET HTTP LLM scraper poisoned: ' + str(self.headers))
+                return
             http_400(self)
             return
 
diff --git a/daemon_post.py b/daemon_post.py
index f55dc47a4..f18bb1144 100644
--- a/daemon_post.py
+++ b/daemon_post.py
@@ -163,7 +163,7 @@ def daemon_http_post(self) -> None:
     ua_str = get_user_agent(self)
 
-    block, self.server.blocked_cache_last_updated = \
+    block, self.server.blocked_cache_last_updated, _ = \
         blocked_user_agent(calling_domain, ua_str,
                            self.server.news_instance,
                            self.server.debug,
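
Note (not part of the patch): the poisoned response in daemon_get.py relies on an
html_poisoned helper and the server's word dictionary, neither of which is defined
in this diff. Purely as a hypothetical sketch of the general idea, assuming a plain
list of words, a generator of that kind could assemble random nonsense paragraphs;
this is not Epicyon's actual html_poisoned implementation:

    import random

    def poisoned_html_sketch(dictionary: list) -> str:
        # Illustrative only: build plausible-looking nonsense paragraphs
        # from a word list so that an LLM scraper ingests junk text.
        # This is NOT the html_poisoned function referenced by the patch.
        paragraphs = []
        for _ in range(random.randint(5, 10)):
            sentences = []
            for _ in range(random.randint(3, 8)):
                words = random.choices(dictionary, k=random.randint(6, 14))
                sentences.append(' '.join(words).capitalize() + '.')
            paragraphs.append('<p>' + ' '.join(sentences) + '</p>')
        return ('<!DOCTYPE html>\n<html><head><meta charset="utf-8">'
                '<title>posts</title></head><body>\n' +
                '\n'.join(paragraphs) + '\n</body></html>')

With the daemon_get.py change, only user agents matching llm_bot_strings receive the
poisoned text/html body via set_headers and write2; every other blocked request still
gets http_400. In daemon_post.py the llm flag is discarded ("_"), so POST requests
from LLM crawlers are simply blocked rather than poisoned.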