mirror of https://gitlab.com/bashrc2/epicyon
More crawlers

parent 2ea94354da
commit 18508fab89
crawlers.py | 21
@@ -126,16 +126,27 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
     if agent_str:
         # is this a web crawler? If so then block it by default
         # unless this is a news instance or if it is in the allowed list
-        bot_strings = ('bot/', 'bot-', '/bot', '/robot', 'gptbot',
-                       '-ai/', ' ai/', '-ai ', ' ai ', 'spider/',
-                       'externalhit/', 'chatgpt', 'google',
-                       'anthropic', 'facebook', 'slurp', 'crawler',
-                       'crawling', 'gigablast', 'archive.org')
+        bot_strings = ('bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
+                       '/robot', 'gptbot', '-ai/', ' ai/', '-ai ',
+                       ' ai ', 'spider/', 'spider.ht', '/spider.', '-spider',
+                       'externalhit/', 'chatgpt', 'google', 'anthropic',
+                       'facebook', 'slurp', 'crawler', 'crawling', ' crawl ',
+                       'gigablast', 'archive.org', 'httrack', 'spider/',
+                       'spider-', ' spider ', 'findlink', 'ips-agent',
+                       'woriobot', 'mlbot', 'webbot', 'webcrawl',
+                       'voilabot', 'rank/', 'ezooms', 'heritrix', 'indeedbot',
+                       'woobot', 'infobot', 'viewbot', 'swimgbot', 'eright',
+                       'apercite', 'bot (', 'summify', 'ccbot', 'linkfind',
+                       'linkanalyze', 'analyzer', 'wotbox', 'ichiro',
+                       'drupact', 'searchengine', 'coccoc',
+                       'explorer/', 'explorer;', 'crystalsemantics',
+                       'scraper/', ' scraper ', ' scrape ', 'scraping')
         contains_bot_string = False
         for bot_str in bot_strings:
             if bot_str in agent_str_lower:
                 if '://bot' not in agent_str_lower and \
                    '://robot' not in agent_str_lower and \
                    '://spider' not in agent_str_lower and \
                    'pixelfedbot/' not in agent_str_lower:
                     contains_bot_string = True
                     break
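The commit extends the case-insensitive substring blocklist used by blocked_user_agent() in crawlers.py: a user agent counts as a crawler when any entry of bot_strings appears in its lowercased form, unless the match only comes from a URL (://bot, ://robot, ://spider) or from a Pixelfed federation bot. Below is a minimal standalone sketch of that check; the helper name looks_like_crawler, the abbreviated tuple, and the sample user agents are illustrative assumptions, and the real function also applies the news-instance and allowed-list handling mentioned in the comments around this hunk.

# Minimal sketch of the substring check, assuming an abbreviated subset of
# the bot_strings tuple shown in the diff above. Not the actual Epicyon
# function: blocked_user_agent() also consults the news-instance setting
# and the allowed crawler list.
BOT_STRINGS = ('bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
               '/robot', 'gptbot', 'spider/', 'crawler', 'crawling',
               'scraper/', ' scrape ', 'scraping')


def looks_like_crawler(agent_str: str) -> bool:
    """Return True when the user agent matches a crawler substring,
    unless the match is only part of a URL (://bot, ://robot, ://spider)
    or the agent is a Pixelfed federation bot."""
    agent_str_lower = agent_str.lower()
    for bot_str in BOT_STRINGS:
        if bot_str in agent_str_lower:
            if '://bot' not in agent_str_lower and \
               '://robot' not in agent_str_lower and \
               '://spider' not in agent_str_lower and \
               'pixelfedbot/' not in agent_str_lower:
                return True
    return False


if __name__ == '__main__':
    # 'gptbot' and 'bot/' both match GPTBot; the Pixelfed agent is exempted.
    print(looks_like_crawler('Mozilla/5.0 (compatible; GPTBot/1.0)'))     # True
    print(looks_like_crawler('pixelfedbot/1.0 (+https://pixelfed.org)'))  # False
    print(looks_like_crawler('Mozilla/5.0 (X11; Linux x86_64) Firefox'))  # False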