diff --git a/crawlers.py b/crawlers.py index 19815cfa8..c1f5d4c18 100644 --- a/crawlers.py +++ b/crawlers.py @@ -139,14 +139,13 @@ def blocked_user_agent(calling_domain: str, agent_str: str, llm_bot_strings = ( 'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt', 'anthropic', 'mlbot', 'claude-', 'claudebot', 'ccbot', - 'facebookbot', 'facebookexternalhit', 'google-extended', - 'piplbot', 'oai-search', 'applebot', 'meta-external', + 'piplbot', 'oai-search', 'meta-external', 'diffbot', 'perplexitybot', 'perplexity‑', 'novaact', 'operator', 'omgili', 'imagesiftbot', 'bytespider', 'amazonbot', 'youbot', 'petalbot', 'ai2bot', 'allenai', 'firecrawl', 'friendlycrawler', - 'googleother', 'icc-crawler', 'scrapy', 'timpibot', + 'icc-crawler', 'scrapy', 'timpibot', 'velenpublic', 'webzio-extended', 'cohere-ai', - 'cohere-train', 'crawlspace', 'facebookexternal', + 'cohere-train', 'crawlspace', 'img2dataset', 'imgproxy', 'isscyberriskcrawler', 'sidetrade', 'kangaroo.ai', 'kangaroo bot', 'iaskspider', 'duckassistbot', 'pangubot', 'semrush', 'poseidon research', 'awario', @@ -156,14 +155,12 @@ def blocked_user_agent(calling_domain: str, agent_str: str, 'bedrockbot', 'bigsur', 'bravebot', 'brightbot', 'buddybot', '-autorag', 'cloudvertexbot', 'cotoyogi', 'deepseekbot', 'devin', 'echoboxbot', 'factset_spyderbot', 'firecrawlagent', - 'google-cloudvertexbot', 'google-firebase', - 'google-Notebooklm', 'googleagent-mariner', - 'google-', 'iboubot', 'linerbot', 'linguee bot', + 'iboubot', 'linerbot', 'linguee bot', 'meta-externalagent', 'meta-externalfetcher', 'meta-webindexer', 'mycentralaiscraperbot', 'openai', 'panscient', 'phindbot', 'qualifiedbot', 'quillbot', 'sbIntuitionsbot', 'semrushbot', 'shapbot', 'terracotta', - 'velenpublicwebcrawler', 'wpbot', 'yak', 'yandex' + 'velenpublicwebcrawler', 'wpbot', 'yak' ) for bot_str in llm_bot_strings: if bot_str in agent_str_lower: @@ -180,7 +177,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str, bot_strings = ( 'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ', '/robot', 'spider/', 'spider.ht', '/spider.', '-spider', - 'externalhit/', 'google', + 'externalhit/', 'google', 'applebot', 'yandex', 'facebook', 'slurp', 'crawler', 'crawling', ' crawl ', 'gigablast', 'archive.org', 'httrack', 'spider-', ' spider ', 'findlink', 'ips-agent',