Optionally block military user agents

main
Bob Mottram 2023-07-18 15:55:26 +01:00
parent 5c4ee4b96c
commit e227c58ef4
3 changed files with 35 additions and 7 deletions

View File

@ -1792,10 +1792,16 @@ def save_blocked_military(base_dir: str, block_military: {}) -> None:
print('EX: error while saving block military file') print('EX: error while saving block military file')
def get_mil_domains_list() -> tuple:
    """Returns the military top level domains as a tuple of strings

    Each entry is a bare TLD without a leading dot (eg. 'mil', not '.mil'),
    so callers prepend the '.' themselves when matching against a domain.
    An immutable tuple is returned so that callers cannot accidentally
    alter the shared set of domains.
    """
    return ('army', 'navy', 'airforce', 'mil')
def contains_military_domain(message_str: str) -> bool: def contains_military_domain(message_str: str) -> bool:
"""Returns true if the given string contains a military domain """Returns true if the given string contains a military domain
""" """
mil_domains = ('army', 'navy', 'airforce') mil_domains = get_mil_domains_list()
for tld in mil_domains: for tld in mil_domains:
if '.' + tld + '"' in message_str or \ if '.' + tld + '"' in message_str or \
'.' + tld + '/' in message_str: '.' + tld + '/' in message_str:

View File

@ -12,6 +12,7 @@ import time
from utils import save_json from utils import save_json
from utils import user_agent_domain from utils import user_agent_domain
from utils import remove_eol from utils import remove_eol
from blocking import get_mil_domains_list
from blocking import update_blocked_cache from blocking import update_blocked_cache
from blocking import is_blocked_domain from blocking import is_blocked_domain
@ -106,7 +107,8 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
blocked_cache: [], blocked_cache: [],
blocked_cache_update_secs: int, blocked_cache_update_secs: int,
crawlers_allowed: [], crawlers_allowed: [],
known_bots: []): known_bots: [], path: str,
block_military: {}):
"""Should a GET or POST be blocked based upon its user agent? """Should a GET or POST be blocked based upon its user agent?
""" """
if not agent_str: if not agent_str:
@ -115,7 +117,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
agent_str_lower = agent_str.lower() agent_str_lower = agent_str.lower()
for ua_block in default_user_agent_blocks: for ua_block in default_user_agent_blocks:
if ua_block in agent_str_lower: if ua_block in agent_str_lower:
print('Blocked User agent 1: ' + ua_block) print('BLOCK: Blocked User agent 1: ' + ua_block)
return True, blocked_cache_last_updated return True, blocked_cache_last_updated
agent_domain = None agent_domain = None
@ -144,7 +146,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
for crawler in crawlers_allowed: for crawler in crawlers_allowed:
if crawler.lower() in agent_str_lower: if crawler.lower() in agent_str_lower:
return False, blocked_cache_last_updated return False, blocked_cache_last_updated
print('Blocked Crawler: ' + agent_str) print('BLOCK: Blocked Crawler: ' + agent_str)
return True, blocked_cache_last_updated return True, blocked_cache_last_updated
# get domain name from User-Agent # get domain name from User-Agent
agent_domain = user_agent_domain(agent_str, debug) agent_domain = user_agent_domain(agent_str, debug)
@ -177,5 +179,23 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
is_blocked_domain(base_dir, agent_domain, blocked_cache) is_blocked_domain(base_dir, agent_domain, blocked_cache)
# if self.server.debug: # if self.server.debug:
if blocked_ua: if blocked_ua:
print('Blocked User agent 2: ' + agent_domain) print('BLOCK: Blocked User agent 2: ' + agent_domain)
# optionally block military domains on a per account basis
if not blocked_ua and block_military:
if '/users/' in path:
# which account is this?
nickname = path.split('/users/')[1]
if '/' in nickname:
nickname = nickname.split('/')[0]
# does this account block military domains?
if block_military.get(nickname):
mil_domains = get_mil_domains_list()
for tld in mil_domains:
if agent_domain.endswith('.' + tld):
blocked_ua = True
print('BLOCK: Blocked military user agent: ' +
agent_domain)
break
return blocked_ua, blocked_cache_last_updated return blocked_ua, blocked_cache_last_updated

View File

@ -17006,7 +17006,8 @@ class PubServer(BaseHTTPRequestHandler):
self.server.blocked_cache, self.server.blocked_cache,
self.server.blocked_cache_update_secs, self.server.blocked_cache_update_secs,
self.server.crawlers_allowed, self.server.crawlers_allowed,
self.server.known_bots) self.server.known_bots,
self.path, self.server.block_military)
if block: if block:
self._400() self._400()
return return
@ -22606,7 +22607,8 @@ class PubServer(BaseHTTPRequestHandler):
self.server.blocked_cache, self.server.blocked_cache,
self.server.blocked_cache_update_secs, self.server.blocked_cache_update_secs,
self.server.crawlers_allowed, self.server.crawlers_allowed,
self.server.known_bots) self.server.known_bots,
self.path, self.server.block_military)
if block: if block:
self._400() self._400()
self.server.postreq_busy = False self.server.postreq_busy = False