From 224dfc346f65039fed86c55f3e8b6d5ab6637ea9 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 4 Jan 2026 12:08:42 +0000 Subject: [PATCH] More llm crawlers --- crawlers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/crawlers.py b/crawlers.py index 3fa3b6d4a..47d216461 100644 --- a/crawlers.py +++ b/crawlers.py @@ -137,7 +137,15 @@ def blocked_user_agent(calling_domain: str, agent_str: str, # is this an LLM crawler? # https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.txt llm_bot_strings = ( - 'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt', + 'addsearchbot', 'aihitbot', 'andibot', 'channel3bot', + 'chatglm', 'cloudflare-autorag', 'crawl4ai', + 'facebookbot', 'facebookexternalhit', + 'google-cloudvertexbot', 'google-extended', 'googleother', + 'google-firebase', 'notebooklm', 'googleagent', + 'iaskbot', 'imagespider', 'kunato', 'laion', 'lcc', + 'linkupbot', 'manus-', 'poggio', 'tavily', 'twinagent', + 'wrtnbot', 'yandexadditional', 'zanista', + 'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'aibot', 'chatgpt', 'anthropic', 'mlbot', 'claude-', 'claudebot', 'ccbot', 'piplbot', 'oai-search', 'meta-external', 'diffbot', 'perplexitybot', 'perplexity‑', 'novaact', 'operator', @@ -155,7 +163,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str, '-autorag', 'cloudvertexbot', 'cotoyogi', 'deepseekbot', 'devin', 'echoboxbot', 'factset_spyderbot', 'firecrawlagent', 'iboubot', 'linerbot', 'linguee bot', 'meta-externalagent', - 'meta-externalfetcher', 'meta-webindexer', 'mycentralaiscraperbot', + 'meta-externalfetcher', 'meta-webindexer', 'aiscraper', 'openai', 'panscient', 'phindbot', 'qualifiedbot', 'quillbot', 'sbIntuitionsbot', 'semrushbot', 'shapbot', 'terracotta', 'velenpublicwebcrawler', 'wpbot', 'yak'