From e227c58ef4b2c0dd903bd96bcf5f2869e55d0e66 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Tue, 18 Jul 2023 15:55:26 +0100 Subject: [PATCH] Optionally block military user agents --- blocking.py | 8 +++++++- crawlers.py | 28 ++++++++++++++++++++++++---- daemon.py | 6 ++++-- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/blocking.py b/blocking.py index 2a3874b0f..e9e5f23a7 100644 --- a/blocking.py +++ b/blocking.py @@ -1792,10 +1792,16 @@ def save_blocked_military(base_dir: str, block_military: {}) -> None: print('EX: error while saving block military file') +def get_mil_domains_list() -> []: + """returns a list of military top level domains + """ + return ('army', 'navy', 'airforce', 'mil') + + def contains_military_domain(message_str: str) -> bool: """Returns true if the given string contains a military domain """ - mil_domains = ('army', 'navy', 'airforce') + mil_domains = get_mil_domains_list() for tld in mil_domains: if '.' + tld + '"' in message_str or \ '.' + tld + '/' in message_str: diff --git a/crawlers.py b/crawlers.py index 5df7f2b63..b7629ec12 100644 --- a/crawlers.py +++ b/crawlers.py @@ -12,6 +12,7 @@ import time from utils import save_json from utils import user_agent_domain from utils import remove_eol +from blocking import get_mil_domains_list from blocking import update_blocked_cache from blocking import is_blocked_domain @@ -106,7 +107,8 @@ def blocked_user_agent(calling_domain: str, agent_str: str, blocked_cache: [], blocked_cache_update_secs: int, crawlers_allowed: [], - known_bots: []): + known_bots: [], path: str, + block_military: {}): """Should a GET or POST be blocked based upon its user agent? """ if not agent_str: @@ -115,7 +117,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str, agent_str_lower = agent_str.lower() for ua_block in default_user_agent_blocks: if ua_block in agent_str_lower: - print('Blocked User agent 1: ' + ua_block) + print('BLOCK: Blocked User agent 1: ' + ua_block) return True, blocked_cache_last_updated agent_domain = None @@ -144,7 +146,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str, for crawler in crawlers_allowed: if crawler.lower() in agent_str_lower: return False, blocked_cache_last_updated - print('Blocked Crawler: ' + agent_str) + print('BLOCK: Blocked Crawler: ' + agent_str) return True, blocked_cache_last_updated # get domain name from User-Agent agent_domain = user_agent_domain(agent_str, debug) @@ -177,5 +179,23 @@ def blocked_user_agent(calling_domain: str, agent_str: str, is_blocked_domain(base_dir, agent_domain, blocked_cache) # if self.server.debug: if blocked_ua: - print('Blocked User agent 2: ' + agent_domain) + print('BLOCK: Blocked User agent 2: ' + agent_domain) + + # optionally block military domains on a per account basis + if not blocked_ua and block_military: + if '/users/' in path: + # which accounts is this? + nickname = path.split('/users/')[1] + if '/' in nickname: + nickname = nickname.split('/')[0] + # does this account block military domains? + if block_military.get(nickname): + mil_domains = get_mil_domains_list() + for tld in mil_domains: + if agent_domain.endswith('.' + tld): + blocked_ua = True + print('BLOCK: Blocked military user agent: ' + + agent_domain) + break + return blocked_ua, blocked_cache_last_updated diff --git a/daemon.py b/daemon.py index d957871c0..8365dbb22 100644 --- a/daemon.py +++ b/daemon.py @@ -17006,7 +17006,8 @@ class PubServer(BaseHTTPRequestHandler): self.server.blocked_cache, self.server.blocked_cache_update_secs, self.server.crawlers_allowed, - self.server.known_bots) + self.server.known_bots, + self.path, self.server.block_military) if block: self._400() return @@ -22606,7 +22607,8 @@ class PubServer(BaseHTTPRequestHandler): self.server.blocked_cache, self.server.blocked_cache_update_secs, self.server.crawlers_allowed, - self.server.known_bots) + self.server.known_bots, + self.path, self.server.block_military) if block: self._400() self.server.postreq_busy = False