mirror of https://gitlab.com/bashrc2/epicyon
Distinguish more between LLM crawlers and web crawlers (increasingly difficult)
parent
54604dbf27
commit
b02d542b62
15
crawlers.py
15
crawlers.py
|
|
@ -139,14 +139,13 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
llm_bot_strings = (
|
llm_bot_strings = (
|
||||||
'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt',
|
'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt',
|
||||||
'anthropic', 'mlbot', 'claude-', 'claudebot', 'ccbot',
|
'anthropic', 'mlbot', 'claude-', 'claudebot', 'ccbot',
|
||||||
'facebookbot', 'facebookexternalhit', 'google-extended',
|
'piplbot', 'oai-search', 'meta-external',
|
||||||
'piplbot', 'oai-search', 'applebot', 'meta-external',
|
|
||||||
'diffbot', 'perplexitybot', 'perplexity‑', 'novaact', 'operator',
|
'diffbot', 'perplexitybot', 'perplexity‑', 'novaact', 'operator',
|
||||||
'omgili', 'imagesiftbot', 'bytespider', 'amazonbot', 'youbot',
|
'omgili', 'imagesiftbot', 'bytespider', 'amazonbot', 'youbot',
|
||||||
'petalbot', 'ai2bot', 'allenai', 'firecrawl', 'friendlycrawler',
|
'petalbot', 'ai2bot', 'allenai', 'firecrawl', 'friendlycrawler',
|
||||||
'googleother', 'icc-crawler', 'scrapy', 'timpibot',
|
'icc-crawler', 'scrapy', 'timpibot',
|
||||||
'velenpublic', 'webzio-extended', 'cohere-ai',
|
'velenpublic', 'webzio-extended', 'cohere-ai',
|
||||||
'cohere-train', 'crawlspace', 'facebookexternal',
|
'cohere-train', 'crawlspace',
|
||||||
'img2dataset', 'imgproxy', 'isscyberriskcrawler', 'sidetrade',
|
'img2dataset', 'imgproxy', 'isscyberriskcrawler', 'sidetrade',
|
||||||
'kangaroo.ai', 'kangaroo bot', 'iaskspider', 'duckassistbot',
|
'kangaroo.ai', 'kangaroo bot', 'iaskspider', 'duckassistbot',
|
||||||
'pangubot', 'semrush', 'poseidon research', 'awario',
|
'pangubot', 'semrush', 'poseidon research', 'awario',
|
||||||
|
|
@ -156,14 +155,12 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
'bedrockbot', 'bigsur', 'bravebot', 'brightbot', 'buddybot',
|
'bedrockbot', 'bigsur', 'bravebot', 'brightbot', 'buddybot',
|
||||||
'-autorag', 'cloudvertexbot', 'cotoyogi', 'deepseekbot',
|
'-autorag', 'cloudvertexbot', 'cotoyogi', 'deepseekbot',
|
||||||
'devin', 'echoboxbot', 'factset_spyderbot', 'firecrawlagent',
|
'devin', 'echoboxbot', 'factset_spyderbot', 'firecrawlagent',
|
||||||
'google-cloudvertexbot', 'google-firebase',
|
'iboubot', 'linerbot', 'linguee bot',
|
||||||
'google-Notebooklm', 'googleagent-mariner',
|
|
||||||
'google-', 'iboubot', 'linerbot', 'linguee bot',
|
|
||||||
'meta-externalagent', 'meta-externalfetcher', 'meta-webindexer',
|
'meta-externalagent', 'meta-externalfetcher', 'meta-webindexer',
|
||||||
'mycentralaiscraperbot',
|
'mycentralaiscraperbot',
|
||||||
'openai', 'panscient', 'phindbot', 'qualifiedbot', 'quillbot',
|
'openai', 'panscient', 'phindbot', 'qualifiedbot', 'quillbot',
|
||||||
'sbIntuitionsbot', 'semrushbot', 'shapbot', 'terracotta',
|
'sbIntuitionsbot', 'semrushbot', 'shapbot', 'terracotta',
|
||||||
'velenpublicwebcrawler', 'wpbot', 'yak', 'yandex'
|
'velenpublicwebcrawler', 'wpbot', 'yak'
|
||||||
)
|
)
|
||||||
for bot_str in llm_bot_strings:
|
for bot_str in llm_bot_strings:
|
||||||
if bot_str in agent_str_lower:
|
if bot_str in agent_str_lower:
|
||||||
|
|
@ -180,7 +177,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
bot_strings = (
|
bot_strings = (
|
||||||
'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
|
'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
|
||||||
'/robot', 'spider/', 'spider.ht', '/spider.', '-spider',
|
'/robot', 'spider/', 'spider.ht', '/spider.', '-spider',
|
||||||
'externalhit/', 'google',
|
'externalhit/', 'google', 'applebot', 'yandex',
|
||||||
'facebook', 'slurp', 'crawler', 'crawling', ' crawl ',
|
'facebook', 'slurp', 'crawler', 'crawling', ' crawl ',
|
||||||
'gigablast', 'archive.org', 'httrack',
|
'gigablast', 'archive.org', 'httrack',
|
||||||
'spider-', ' spider ', 'findlink', 'ips-agent',
|
'spider-', ' spider ', 'findlink', 'ips-agent',
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue