mirror of https://gitlab.com/bashrc2/epicyon
__filename__ = "crawlers.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.6.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"

import os
import time
from utils import data_dir
from utils import save_json
from utils import user_agent_domain
from utils import remove_eol
from blocking import get_mil_domains_list
from blocking import get_gov_domains_list
from blocking import get_bsky_domains_list
from blocking import get_nostr_domains_list
from blocking import update_blocked_cache
from blocking import is_blocked_domain

default_user_agent_blocks = [
    'fedilist', 'ncsc scan', 'fedifetcher'
]


def update_known_crawlers(ua_str: str,
                          base_dir: str, known_crawlers: {},
                          last_known_crawler: int) -> int:
    """Updates a dictionary of known crawlers accessing nodeinfo
    or the masto API
    """
    if not ua_str:
        return None

    curr_time = int(time.time())
    if known_crawlers.get(ua_str):
        known_crawlers[ua_str]['hits'] += 1
        known_crawlers[ua_str]['lastseen'] = curr_time
    else:
        known_crawlers[ua_str] = {
            "lastseen": curr_time,
            "hits": 1
        }

    if curr_time - last_known_crawler >= 30:
        # remove any old observations
        remove_crawlers: list[str] = []
        for uagent, item in known_crawlers.items():
            if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30:
                remove_crawlers.append(uagent)
        for uagent in remove_crawlers:
            del known_crawlers[uagent]
        # save the list of crawlers
        dir_str = data_dir(base_dir)
        save_json(known_crawlers, dir_str + '/knownCrawlers.json')
    return curr_time
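
# Example of a hypothetical caller: a request handler serving nodeinfo
# could update the crawler state like this (the `server` object and its
# attribute names are illustrative assumptions, not part of this module):
#
#     curr_time = update_known_crawlers(ua_str, base_dir,
#                                       server.known_crawlers,
#                                       server.last_known_crawler)
#     if curr_time is not None:
#         server.last_known_crawler = curr_time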


def load_known_web_bots(base_dir: str) -> []:
    """Returns a list of known web bots
    """
    known_bots_filename = data_dir(base_dir) + '/knownBots.txt'
    if not os.path.isfile(known_bots_filename):
        return []
    crawlers_str = None
    try:
        with open(known_bots_filename, 'r', encoding='utf-8') as fp_crawlers:
            crawlers_str = fp_crawlers.read()
    except OSError:
        print('EX: unable to load web bots from ' +
              known_bots_filename)
    if not crawlers_str:
        return []
    known_bots: list[str] = []
    crawlers_list = crawlers_str.split('\n')
    for crawler in crawlers_list:
        if not crawler:
            continue
        crawler = remove_eol(crawler).strip()
        if not crawler:
            continue
        if crawler not in known_bots:
            known_bots.append(crawler)
    return known_bots


def _save_known_web_bots(base_dir: str, known_bots: []) -> bool:
    """Saves a list of known web bots
    """
    known_bots_filename = data_dir(base_dir) + '/knownBots.txt'
    known_bots_str = ''
    for crawler in known_bots:
        known_bots_str += crawler.strip() + '\n'
    try:
        with open(known_bots_filename, 'w+', encoding='utf-8') as fp_crawlers:
            fp_crawlers.write(known_bots_str)
    except OSError:
        print("EX: unable to save known web bots to " +
              known_bots_filename)
        return False
    return True
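
# The knownBots.txt file read and written above is plain text with one
# lowercased User-Agent string per line, e.g. (made-up values):
#
#     mozilla/5.0 (compatible; examplebot/1.0)
#     somecrawler/2.3
#
# load_known_web_bots() deduplicates entries, and blocked_user_agent()
# below keeps the list sorted as new bots are observed.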


def blocked_user_agent(calling_domain: str, agent_str: str,
                       news_instance: bool, debug: bool,
                       user_agents_blocked: [],
                       blocked_cache_last_updated,
                       base_dir: str,
                       blocked_cache: [],
                       block_federated: [],
                       blocked_cache_update_secs: int,
                       crawlers_allowed: [],
                       known_bots: [], path: str,
                       block_military: {},
                       block_government: {},
                       block_bluesky: {},
                       block_nostr: {}):
    """Should a GET or POST be blocked based upon its user agent?
    """
    if not agent_str:
        return True, blocked_cache_last_updated, False

    agent_str_lower = agent_str.lower()
    for ua_block in default_user_agent_blocks:
        if ua_block in agent_str_lower:
            print('BLOCK: Blocked User agent 1: ' + ua_block)
            return True, blocked_cache_last_updated, False

    agent_domain = None

    if agent_str:
        contains_bot_string = False
        llm = False

        # is this an LLM crawler?
        # https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.txt
        llm_bot_strings = (
            'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt',
            'anthropic', 'mlbot', 'claude-web', 'claudebot', 'ccbot',
            'facebookbot', 'google-extended', 'piplbot', 'oai-search',
            'applebot', 'meta-external', 'diffbot', 'perplexitybot',
            'perplexity-', 'novaact', 'operator',
            'omgili', 'imagesiftbot', 'bytespider', 'amazonbot', 'youbot',
            'petalbot', 'ai2bot', 'allenai', 'firecrawl', 'friendlycrawler',
            'googleother', 'icc-crawler', 'scrapy', 'timpibot',
            'velenpublic', 'webzio-extended', 'cohere-ai',
            'cohere-train', 'crawlspace', 'facebookexternal',
            'img2dataset', 'imgproxy', 'isscyberriskcrawler', 'sidetrade',
            'kangaroo.ai', 'kangaroo bot', 'iaskspider', 'duckassistbot',
            'pangubot', 'semrush'
        )
        for bot_str in llm_bot_strings:
            if bot_str in agent_str_lower:
                if '://bot' not in agent_str_lower and \
                   '://robot' not in agent_str_lower and \
                   '://spider' not in agent_str_lower and \
                   'pixelfedbot/' not in agent_str_lower:
                    contains_bot_string = True
                    llm = True
                    break

        # is this a web crawler? If so then block it by default,
        # unless this is a news instance or it is in the allowed list
        bot_strings = (
            'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
            '/robot', 'spider/', 'spider.ht', '/spider.', '-spider',
            'externalhit/', 'google',
            'facebook', 'slurp', 'crawler', 'crawling', ' crawl ',
            'gigablast', 'archive.org', 'httrack',
            'spider-', ' spider ', 'findlink', 'ips-agent',
            'woriobot', 'webbot', 'webcrawl',
            'voilabot', 'rank/', 'ezooms', 'heritrix', 'indeedbot',
            'woobot', 'infobot', 'viewbot', 'swimgbot', 'eright',
            'apercite', 'bot (', 'summify', 'linkfind',
            'linkanalyze', 'analyzer', 'wotbox', 'ichiro',
            'drupact', 'searchengine', 'coccoc',
            'explorer/', 'explorer;', 'crystalsemantics',
            'scraper/', ' scraper ', ' scrape ', 'scraping')
        for bot_str in bot_strings:
            if bot_str in agent_str_lower:
                if '://bot' not in agent_str_lower and \
                   '://robot' not in agent_str_lower and \
                   '://spider' not in agent_str_lower and \
                   'pixelfedbot/' not in agent_str_lower:
                    contains_bot_string = True
                    break
        if contains_bot_string:
            if agent_str_lower not in known_bots:
                known_bots.append(agent_str_lower)
                known_bots.sort()
                _save_known_web_bots(base_dir, known_bots)
            # if this is a news instance then we want it
            # to be indexed by search engines
            if news_instance:
                return False, blocked_cache_last_updated, llm
            # is this crawler allowed?
            for crawler in crawlers_allowed:
                if crawler.lower() in agent_str_lower:
                    return False, blocked_cache_last_updated, llm
            print('BLOCK: Blocked Crawler: ' + agent_str)
            return True, blocked_cache_last_updated, llm
        # get domain name from User-Agent
        agent_domain = user_agent_domain(agent_str, debug)
    else:
        # no User-Agent header is present
        return True, blocked_cache_last_updated, False

    # is the User-Agent type blocked? e.g. "Mastodon"
    if user_agents_blocked:
        blocked_ua = False
        for agent_name in user_agents_blocked:
            if agent_name in agent_str:
                blocked_ua = True
                break
        if blocked_ua:
            return True, blocked_cache_last_updated, False

    if not agent_domain:
        return False, blocked_cache_last_updated, False

    # is the User-Agent domain blocked?
    blocked_ua = False
    if not agent_domain.startswith(calling_domain):
        blocked_cache_last_updated = \
            update_blocked_cache(base_dir, blocked_cache,
                                 blocked_cache_last_updated,
                                 blocked_cache_update_secs)

        blocked_ua = \
            is_blocked_domain(base_dir, agent_domain,
                              blocked_cache, block_federated)
        if blocked_ua:
            print('BLOCK: Blocked User agent 2: ' + agent_domain)

    block_dicts = {
        "military": block_military,
        "government": block_government,
        "bluesky": block_bluesky,
        "nostr": block_nostr
    }
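    # each of these dicts maps an account nickname to a truthy value when
    # that account has enabled the block category, e.g. (hypothetical
    # account name) block_military = {'alice': True} would block military
    # domains for the account 'alice'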
    for block_type, block_dict in block_dicts.items():
        if blocked_ua or not block_dict:
            continue
        if '/users/' not in path:
            continue
        # which account is this?
        nickname = path.split('/users/')[1]
        if '/' in nickname:
            nickname = nickname.split('/')[0]
        # does this account have this block category enabled?
        if not block_dict.get(nickname):
            continue
        if block_type == "military":
            blk_domains = get_mil_domains_list()
        elif block_type == "government":
            blk_domains = get_gov_domains_list()
        elif block_type == "nostr":
            blk_domains = get_nostr_domains_list()
        else:
            blk_domains = get_bsky_domains_list()
        for domain_str in blk_domains:
            if '.' not in domain_str:
                tld = domain_str
                if agent_domain.endswith('.' + tld):
                    blocked_ua = True
                    print('BLOCK: Blocked ' + block_type +
                          ' tld user agent: ' + agent_domain)
                    break
            elif agent_domain.endswith(domain_str):
                blocked_ua = True
                print('BLOCK: Blocked ' + block_type +
                      ' user agent: ' + agent_domain)
                break

    return blocked_ua, blocked_cache_last_updated, False
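

# Example of a hypothetical caller unpacking the three return values
# (whether to block, the refreshed cache timestamp, and whether the agent
# looked like an LLM crawler); the `server` attributes are illustrative
# assumptions, not part of this module:
#
#     blocked, server.blocked_cache_last_updated, llm_crawler = \
#         blocked_user_agent(calling_domain, ua_str,
#                            server.news_instance, debug,
#                            server.user_agents_blocked,
#                            server.blocked_cache_last_updated,
#                            base_dir, server.blocked_cache,
#                            server.block_federated,
#                            server.blocked_cache_update_secs,
#                            server.crawlers_allowed,
#                            server.known_bots, path,
#                            server.block_military,
#                            server.block_government,
#                            server.block_bluesky,
#                            server.block_nostr)
#     if blocked:
#         # reject the request, e.g. with HTTP 400
#         ...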