__filename__ = "crawlers.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.6.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"

import os
import time
from utils import data_dir
from utils import save_json
from utils import user_agent_domain
from utils import remove_eol
from blocking import get_mil_domains_list
from blocking import get_gov_domains_list
from blocking import get_bsky_domains_list
from blocking import get_nostr_domains_list
from blocking import update_blocked_cache
from blocking import is_blocked_domain

# substrings of user agents which are always blocked,
# matched case-insensitively
default_user_agent_blocks = [
    'fedilist', 'ncsc scan', 'fedifetcher'
]


def update_known_crawlers(ua_str: str,
                          base_dir: str, known_crawlers: {},
                          last_known_crawler: int) -> int:
    """Updates a dictionary of known crawlers accessing nodeinfo
    or the masto API
    """
    if not ua_str:
        return None

    curr_time = int(time.time())
    if known_crawlers.get(ua_str):
        known_crawlers[ua_str]['hits'] += 1
        known_crawlers[ua_str]['lastseen'] = curr_time
    else:
        known_crawlers[ua_str] = {
            "lastseen": curr_time,
            "hits": 1
        }

    # save the dictionary no more often than once every 30 seconds
    if curr_time - last_known_crawler >= 30:
        # remove any old observations not seen for over a month
        remove_crawlers: list[str] = []
        for uagent, item in known_crawlers.items():
            if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30:
                remove_crawlers.append(uagent)
        for uagent in remove_crawlers:
            del known_crawlers[uagent]
        # save the list of crawlers
        dir_str = data_dir(base_dir)
        save_json(known_crawlers, dir_str + '/knownCrawlers.json')
    return curr_time
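
# Minimal usage sketch (illustrative only: the daemon keeps known_crawlers
# and the returned timestamp between requests, and the user agent shown
# here is a hypothetical example):
#
#   known_crawlers = {}
#   last_known_crawler = 0
#   last_known_crawler = \
#       update_known_crawlers('http.rb/4.4.1 (Mastodon/4.2)', base_dir,
#                             known_crawlers, last_known_crawler)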
					
						
def load_known_web_bots(base_dir: str) -> []:
    """Returns a list of known web bots
    """
    known_bots_filename = data_dir(base_dir) + '/knownBots.txt'
    if not os.path.isfile(known_bots_filename):
        return []
    crawlers_str = None
    try:
        with open(known_bots_filename, 'r', encoding='utf-8') as fp_crawlers:
            crawlers_str = fp_crawlers.read()
    except OSError:
        print('EX: unable to load web bots from ' +
              known_bots_filename)
    if not crawlers_str:
        return []
    known_bots: list[str] = []
    crawlers_list = crawlers_str.split('\n')
    for crawler in crawlers_list:
        if not crawler:
            continue
        crawler = remove_eol(crawler).strip()
        if not crawler:
            continue
        if crawler not in known_bots:
            known_bots.append(crawler)
    return known_bots
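
# Usage sketch (assumes base_dir is the instance base directory; the
# knownBots.txt file within data_dir(base_dir) is created by
# _save_known_web_bots below):
#
#   known_bots = load_known_web_bots(base_dir)
#   for bot in known_bots:
#       print(bot)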
					
						
def _save_known_web_bots(base_dir: str, known_bots: []) -> bool:
    """Saves a list of known web bots
    """
    known_bots_filename = data_dir(base_dir) + '/knownBots.txt'
    known_bots_str = ''
    for crawler in known_bots:
        known_bots_str += crawler.strip() + '\n'
    try:
        with open(known_bots_filename, 'w+', encoding='utf-8') as fp_crawlers:
            fp_crawlers.write(known_bots_str)
    except OSError:
        print("EX: unable to save known web bots to " +
              known_bots_filename)
        return False
    return True
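
# Round-trip sketch (illustrative: _save_known_web_bots is normally only
# called from blocked_user_agent when a new bot user agent is first seen,
# and 'examplebot/1.0' is a hypothetical agent string):
#
#   bots = load_known_web_bots(base_dir)
#   if 'examplebot/1.0' not in bots:
#       bots.append('examplebot/1.0')
#       bots.sort()
#       _save_known_web_bots(base_dir, bots)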
					
						
def blocked_user_agent(calling_domain: str, agent_str: str,
                       news_instance: bool, debug: bool,
                       user_agents_blocked: [],
                       blocked_cache_last_updated,
                       base_dir: str,
                       blocked_cache: [],
                       block_federated: [],
                       blocked_cache_update_secs: int,
                       crawlers_allowed: [],
                       known_bots: [], path: str,
                       block_military: {},
                       block_government: {},
                       block_bluesky: {},
                       block_nostr: {}):
    """Should a GET or POST be blocked based upon its user agent?
    Returns a tuple of (blocked, blocked_cache_last_updated, llm_crawler)
    """
    if not agent_str:
        return True, blocked_cache_last_updated, False

    agent_str_lower = agent_str.lower()
    for ua_block in default_user_agent_blocks:
        if ua_block in agent_str_lower:
            print('BLOCK: Blocked User agent 1: ' + ua_block)
            return True, blocked_cache_last_updated, False

    agent_domain = None

    if agent_str:
        contains_bot_string = False
        llm = False

        # is this an LLM crawler?
        # https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.txt
        llm_bot_strings = (
            'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt',
            'anthropic', 'mlbot', 'claude-web', 'claudebot', 'ccbot',
            'facebookbot', 'google-extended', 'piplbot', 'oai-search',
            'applebot', 'meta-external', 'diffbot', 'perplexitybot',
            'omgili', 'imagesiftbot', 'bytespider', 'amazonbot', 'youbot',
            'petalbot', 'ai2bot', 'allenai', 'firecrawl', 'friendlycrawler',
            'googleother', 'icc-crawler', 'scrapy', 'timpibot',
            'velenpublic', 'webzio-extended', 'cohere-ai',
            'cohere-train', 'crawlspace', 'facebookexternal',
            'img2dataset', 'isscyberriskcrawler', 'sidetrade', 'kangaroo.ai',
            'kangaroo bot', 'iaskspider', 'duckassistbot', 'pangubot',
            'semrush'
        )
        for bot_str in llm_bot_strings:
            if bot_str in agent_str_lower:
                # ignore cases where the bot substring only appears within
                # a url in the agent string, and allow pixelfedbot
                if '://bot' not in agent_str_lower and \
                   '://robot' not in agent_str_lower and \
                   '://spider' not in agent_str_lower and \
                   'pixelfedbot/' not in agent_str_lower:
                    contains_bot_string = True
                    llm = True
                    break

        # is this a web crawler? If so then block it by default
        # unless this is a news instance or it is in the allowed list
        bot_strings = (
            'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
            '/robot', 'spider/', 'spider.ht', '/spider.', '-spider',
            'externalhit/', 'google',
            'facebook', 'slurp', 'crawler', 'crawling', ' crawl ',
            'gigablast', 'archive.org', 'httrack',
            'spider-', ' spider ', 'findlink', 'ips-agent',
            'woriobot', 'webbot', 'webcrawl',
            'voilabot', 'rank/', 'ezooms', 'heritrix', 'indeedbot',
            'woobot', 'infobot', 'viewbot', 'swimgbot', 'eright',
            'apercite', 'bot (', 'summify', 'linkfind',
            'linkanalyze', 'analyzer', 'wotbox', 'ichiro',
            'drupact', 'searchengine', 'coccoc',
            'explorer/', 'explorer;', 'crystalsemantics',
            'scraper/', ' scraper ', ' scrape ', 'scraping')
        for bot_str in bot_strings:
            if bot_str in agent_str_lower:
                if '://bot' not in agent_str_lower and \
                   '://robot' not in agent_str_lower and \
                   '://spider' not in agent_str_lower and \
                   'pixelfedbot/' not in agent_str_lower:
                    contains_bot_string = True
                    break
        if contains_bot_string:
            # remember this bot user agent
            if agent_str_lower not in known_bots:
                known_bots.append(agent_str_lower)
                known_bots.sort()
                _save_known_web_bots(base_dir, known_bots)
            # if this is a news instance then we want it
            # to be indexed by search engines
            if news_instance:
                return False, blocked_cache_last_updated, llm
            # is this crawler allowed?
            for crawler in crawlers_allowed:
                if crawler.lower() in agent_str_lower:
                    return False, blocked_cache_last_updated, llm
            print('BLOCK: Blocked Crawler: ' + agent_str)
            return True, blocked_cache_last_updated, llm
        # get domain name from User-Agent
        agent_domain = user_agent_domain(agent_str, debug)
    else:
        # no User-Agent header is present
        return True, blocked_cache_last_updated, False
    # is the User-Agent type blocked? eg. "Mastodon"
    if user_agents_blocked:
        blocked_ua = False
        for agent_name in user_agents_blocked:
            if agent_name in agent_str:
                blocked_ua = True
                break
        if blocked_ua:
            return True, blocked_cache_last_updated, False

    if not agent_domain:
        return False, blocked_cache_last_updated, False

    # is the User-Agent domain blocked?
    blocked_ua = False
    if not agent_domain.startswith(calling_domain):
        blocked_cache_last_updated = \
            update_blocked_cache(base_dir, blocked_cache,
                                 blocked_cache_last_updated,
                                 blocked_cache_update_secs)

        blocked_ua = \
            is_blocked_domain(base_dir, agent_domain,
                              blocked_cache, block_federated)
        if blocked_ua:
            print('BLOCK: Blocked User agent 2: ' + agent_domain)
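
    # per-account blocking of military, government, bluesky or nostr
    # domains: if the request path refers to an account which has switched
    # on one of these options then block the request when the user agent
    # domain matches that category's domain list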
					
						
    block_dicts = {
        "military": block_military,
        "government": block_government,
        "bluesky": block_bluesky,
        "nostr": block_nostr
    }
    for block_type, block_dict in block_dicts.items():
        if blocked_ua or not block_dict:
            continue
        if '/users/' not in path:
            continue
        # which account is this?
        nickname = path.split('/users/')[1]
        if '/' in nickname:
            nickname = nickname.split('/')[0]
        # does this account block?
        if not block_dict.get(nickname):
            continue
        if block_type == "military":
            blk_domains = get_mil_domains_list()
        elif block_type == "government":
            blk_domains = get_gov_domains_list()
        elif block_type == "nostr":
            blk_domains = get_nostr_domains_list()
        else:
            blk_domains = get_bsky_domains_list()
        for domain_str in blk_domains:
            if '.' not in domain_str:
                # entries without a dot are treated as top level domains
                tld = domain_str
                if agent_domain.endswith('.' + tld):
                    blocked_ua = True
                    print('BLOCK: Blocked ' + block_type +
                          ' tld user agent: ' + agent_domain)
                    break
            elif agent_domain.endswith(domain_str):
                blocked_ua = True
                print('BLOCK: Blocked ' + block_type +
                      ' user agent: ' + agent_domain)
                break

    return blocked_ua, blocked_cache_last_updated, False
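
# Calling sketch (illustrative only: the daemon supplies these values from
# its own per-request state; `headers` and the literal settings below are
# assumptions rather than part of this module):
#
#   agent = headers.get('User-Agent', '')
#   blocked, blocked_cache_last_updated, llm_crawler = \
#       blocked_user_agent(calling_domain, agent,
#                          news_instance=False, debug=False,
#                          user_agents_blocked=[],
#                          blocked_cache_last_updated=0,
#                          base_dir=base_dir,
#                          blocked_cache=[], block_federated=[],
#                          blocked_cache_update_secs=120,
#                          crawlers_allowed=[], known_bots=known_bots,
#                          path=path, block_military={},
#                          block_government={}, block_bluesky={},
#                          block_nostr={})
#   if blocked:
#       # reject the GET or POST, optionally logging llm_crawler
#       ...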