mirror of https://gitlab.com/bashrc2/epicyon
More LLM crawlers
parent
a71a7891e3
commit
224dfc346f
12
crawlers.py
12
crawlers.py
|
|
@ -137,7 +137,15 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
|||
# is this an LLM crawler?
|
||||
# https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.txt
|
||||
llm_bot_strings = (
|
||||
'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt',
|
||||
'addsearchbot', 'aihitbot', 'andibot', 'channel3bot',
|
||||
'chatglm', 'cloudflare-autorag', 'crawl4ai',
|
||||
'facebookbot', 'facebookexternalhit',
|
||||
'google-cloudvertexbot', 'google-extended', 'googleother',
|
||||
'google-firebase', 'notebooklm', 'googleagent',
|
||||
'iaskbot', 'imagespider', 'kunato', 'laion', 'lcc',
|
||||
'linkupbot', 'manus-', 'poggio', 'tavily', 'twinagent',
|
||||
'wrtnbot', 'yandexadditional', 'zanista',
|
||||
'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'aibot', 'chatgpt',
|
||||
'anthropic', 'mlbot', 'claude-', 'claudebot', 'ccbot',
|
||||
'piplbot', 'oai-search', 'meta-external', 'diffbot',
|
||||
'perplexitybot', 'perplexity‑', 'novaact', 'operator',
|
||||
|
|
@ -155,7 +163,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
|||
'-autorag', 'cloudvertexbot', 'cotoyogi', 'deepseekbot',
|
||||
'devin', 'echoboxbot', 'factset_spyderbot', 'firecrawlagent',
|
||||
'iboubot', 'linerbot', 'linguee bot', 'meta-externalagent',
|
||||
'meta-externalfetcher', 'meta-webindexer', 'mycentralaiscraperbot',
|
||||
'meta-externalfetcher', 'meta-webindexer', 'aiscraper',
|
||||
'openai', 'panscient', 'phindbot', 'qualifiedbot', 'quillbot',
|
||||
'sbIntuitionsbot', 'semrushbot', 'shapbot', 'terracotta',
|
||||
'velenpublicwebcrawler', 'wpbot', 'yak'
|
||||
|
|
|
|||
Loading…
Reference in New Issue