More LLM crawlers

main
Bob Mottram 2026-01-04 12:08:42 +00:00
parent a71a7891e3
commit 224dfc346f
1 changed file with 10 additions and 2 deletions

View File

@ -137,7 +137,15 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
# is this an LLM crawler?
# https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.txt
llm_bot_strings = (
'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt',
'addsearchbot', 'aihitbot', 'andibot', 'channel3bot',
'chatglm', 'cloudflare-autorag', 'crawl4ai',
'facebookbot', 'facebookexternalhit',
'google-cloudvertexbot', 'google-extended', 'googleother',
'google-firebase', 'notebooklm', 'googleagent',
'iaskbot', 'imagespider', 'kunato', 'laion', 'lcc',
'linkupbot', 'manus-', 'poggio', 'tavily', 'twinagent',
'wrtnbot', 'yandexadditional', 'zanista',
'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'aibot', 'chatgpt',
'anthropic', 'mlbot', 'claude-', 'claudebot', 'ccbot',
'piplbot', 'oai-search', 'meta-external', 'diffbot',
'perplexitybot', 'perplexity', 'novaact', 'operator',
@ -155,7 +163,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
'-autorag', 'cloudvertexbot', 'cotoyogi', 'deepseekbot',
'devin', 'echoboxbot', 'factset_spyderbot', 'firecrawlagent',
'iboubot', 'linerbot', 'linguee bot', 'meta-externalagent',
'meta-externalfetcher', 'meta-webindexer', 'mycentralaiscraperbot',
'meta-externalfetcher', 'meta-webindexer', 'aiscraper',
'openai', 'panscient', 'phindbot', 'qualifiedbot', 'quillbot',
'sbIntuitionsbot', 'semrushbot', 'shapbot', 'terracotta',
'velenpublicwebcrawler', 'wpbot', 'yak'