From d206d6390d391f4f6281f2dfdb39685c35663f90 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Tue, 21 Jan 2025 18:35:29 +0000 Subject: [PATCH] More LLM crawlers --- crawlers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crawlers.py b/crawlers.py index f7bc605f8..c59a263ec 100644 --- a/crawlers.py +++ b/crawlers.py @@ -144,9 +144,11 @@ def blocked_user_agent(calling_domain: str, agent_str: str, 'omgili', 'imagesiftbot', 'bytespider', 'amazonbot', 'youbot', 'petalbot', 'ai2bot', 'allenai', 'firecrawl', 'friendlycrawler', 'googleother', 'icc-crawler', 'scrapy', 'timpibot', - 'velenpublic', 'webzio-extended', 'cohere-ai', 'facebookexternal', + 'velenpublic', 'webzio-extended', 'cohere-ai', + 'cohere-train', 'crawlspace', 'facebookexternal', 'img2dataset', 'isscyberriskcrawler', 'sidetrade', 'kangaroo.ai', - 'kangaroo bot', 'iaskspider', 'duckassistbot' + 'kangaroo bot', 'iaskspider', 'duckassistbot', 'pangubot', + 'semrush' ) for bot_str in llm_bot_strings: if bot_str in agent_str_lower: