diff --git a/crawlers.py b/crawlers.py index 54383bc0d..529563b98 100644 --- a/crawlers.py +++ b/crawlers.py @@ -126,21 +126,22 @@ def blocked_user_agent(calling_domain: str, agent_str: str, if agent_str: # is this a web crawler? If so then block it by default # unless this is a news instance or if it is in the allowed list - bot_strings = ('bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ', - '/robot', 'gptbot', '-ai/', ' ai/', '-ai ', - ' ai ', 'spider/', 'spider.ht', '/spider.', '-spider', - 'externalhit/', 'chatgpt', 'google', 'anthropic', - 'facebook', 'slurp', 'crawler', 'crawling', ' crawl ', - 'gigablast', 'archive.org', 'httrack', - 'spider-', ' spider ', 'findlink', 'ips-agent', - 'woriobot', 'mlbot', 'webbot', 'webcrawl', - 'voilabot', 'rank/', 'ezooms', 'heritrix', 'indeedbot', - 'woobot', 'infobot', 'viewbot', 'swimgbot', 'eright', - 'apercite', 'bot (', 'summify', 'ccbot', 'linkfind', - 'linkanalyze', 'analyzer', 'wotbox', 'ichiro', - 'drupact', 'searchengine', 'coccoc', - 'explorer/', 'explorer;', 'crystalsemantics', - 'scraper/', ' scraper ', ' scrape ', 'scraping') + bot_strings = ( + 'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ', + '/robot', 'gptbot', '-ai/', ' ai/', '-ai ', + ' ai ', 'spider/', 'spider.ht', '/spider.', '-spider', + 'externalhit/', 'chatgpt', 'google', 'anthropic', + 'facebook', 'slurp', 'crawler', 'crawling', ' crawl ', + 'gigablast', 'archive.org', 'httrack', + 'spider-', ' spider ', 'findlink', 'ips-agent', + 'woriobot', 'mlbot', 'webbot', 'webcrawl', + 'voilabot', 'rank/', 'ezooms', 'heritrix', 'indeedbot', + 'woobot', 'infobot', 'viewbot', 'swimgbot', 'eright', + 'apercite', 'bot (', 'summify', 'ccbot', 'linkfind', + 'linkanalyze', 'analyzer', 'wotbox', 'ichiro', + 'drupact', 'searchengine', 'coccoc', + 'explorer/', 'explorer;', 'crystalsemantics', + 'scraper/', ' scraper ', ' scrape ', 'scraping') contains_bot_string = False for bot_str in bot_strings: if bot_str in agent_str_lower: