mirror of https://gitlab.com/bashrc2/epicyon
Optionally block military user agents
parent
5c4ee4b96c
commit
e227c58ef4
|
@ -1792,10 +1792,16 @@ def save_blocked_military(base_dir: str, block_military: {}) -> None:
|
||||||
print('EX: error while saving block military file')
|
print('EX: error while saving block military file')
|
||||||
|
|
||||||
|
|
||||||
|
def get_mil_domains_list() -> tuple:
    """Returns the military top level domains

    The result is a tuple of TLD strings without the leading dot,
    eg. 'mil' rather than '.mil'. Callers prepend the dot themselves
    when matching against a domain or message text
    """
    return ('army', 'navy', 'airforce', 'mil')
|
||||||
|
|
||||||
|
|
||||||
def contains_military_domain(message_str: str) -> bool:
|
def contains_military_domain(message_str: str) -> bool:
|
||||||
"""Returns true if the given string contains a military domain
|
"""Returns true if the given string contains a military domain
|
||||||
"""
|
"""
|
||||||
mil_domains = ('army', 'navy', 'airforce')
|
mil_domains = get_mil_domains_list()
|
||||||
for tld in mil_domains:
|
for tld in mil_domains:
|
||||||
if '.' + tld + '"' in message_str or \
|
if '.' + tld + '"' in message_str or \
|
||||||
'.' + tld + '/' in message_str:
|
'.' + tld + '/' in message_str:
|
||||||
|
|
28
crawlers.py
28
crawlers.py
|
@ -12,6 +12,7 @@ import time
|
||||||
from utils import save_json
|
from utils import save_json
|
||||||
from utils import user_agent_domain
|
from utils import user_agent_domain
|
||||||
from utils import remove_eol
|
from utils import remove_eol
|
||||||
|
from blocking import get_mil_domains_list
|
||||||
from blocking import update_blocked_cache
|
from blocking import update_blocked_cache
|
||||||
from blocking import is_blocked_domain
|
from blocking import is_blocked_domain
|
||||||
|
|
||||||
|
@ -106,7 +107,8 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
blocked_cache: [],
|
blocked_cache: [],
|
||||||
blocked_cache_update_secs: int,
|
blocked_cache_update_secs: int,
|
||||||
crawlers_allowed: [],
|
crawlers_allowed: [],
|
||||||
known_bots: []):
|
known_bots: [], path: str,
|
||||||
|
block_military: {}):
|
||||||
"""Should a GET or POST be blocked based upon its user agent?
|
"""Should a GET or POST be blocked based upon its user agent?
|
||||||
"""
|
"""
|
||||||
if not agent_str:
|
if not agent_str:
|
||||||
|
@ -115,7 +117,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
agent_str_lower = agent_str.lower()
|
agent_str_lower = agent_str.lower()
|
||||||
for ua_block in default_user_agent_blocks:
|
for ua_block in default_user_agent_blocks:
|
||||||
if ua_block in agent_str_lower:
|
if ua_block in agent_str_lower:
|
||||||
print('Blocked User agent 1: ' + ua_block)
|
print('BLOCK: Blocked User agent 1: ' + ua_block)
|
||||||
return True, blocked_cache_last_updated
|
return True, blocked_cache_last_updated
|
||||||
|
|
||||||
agent_domain = None
|
agent_domain = None
|
||||||
|
@ -144,7 +146,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
for crawler in crawlers_allowed:
|
for crawler in crawlers_allowed:
|
||||||
if crawler.lower() in agent_str_lower:
|
if crawler.lower() in agent_str_lower:
|
||||||
return False, blocked_cache_last_updated
|
return False, blocked_cache_last_updated
|
||||||
print('Blocked Crawler: ' + agent_str)
|
print('BLOCK: Blocked Crawler: ' + agent_str)
|
||||||
return True, blocked_cache_last_updated
|
return True, blocked_cache_last_updated
|
||||||
# get domain name from User-Agent
|
# get domain name from User-Agent
|
||||||
agent_domain = user_agent_domain(agent_str, debug)
|
agent_domain = user_agent_domain(agent_str, debug)
|
||||||
|
@ -177,5 +179,23 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
is_blocked_domain(base_dir, agent_domain, blocked_cache)
|
is_blocked_domain(base_dir, agent_domain, blocked_cache)
|
||||||
# if self.server.debug:
|
# if self.server.debug:
|
||||||
if blocked_ua:
|
if blocked_ua:
|
||||||
print('Blocked User agent 2: ' + agent_domain)
|
print('BLOCK: Blocked User agent 2: ' + agent_domain)
|
||||||
|
|
||||||
|
# optionally block military domains on a per account basis
|
||||||
|
if not blocked_ua and block_military:
|
||||||
|
if '/users/' in path:
|
||||||
|
# which account is this?
|
||||||
|
nickname = path.split('/users/')[1]
|
||||||
|
if '/' in nickname:
|
||||||
|
nickname = nickname.split('/')[0]
|
||||||
|
# does this account block military domains?
|
||||||
|
if block_military.get(nickname):
|
||||||
|
mil_domains = get_mil_domains_list()
|
||||||
|
for tld in mil_domains:
|
||||||
|
if agent_domain.endswith('.' + tld):
|
||||||
|
blocked_ua = True
|
||||||
|
print('BLOCK: Blocked military user agent: ' +
|
||||||
|
agent_domain)
|
||||||
|
break
|
||||||
|
|
||||||
return blocked_ua, blocked_cache_last_updated
|
return blocked_ua, blocked_cache_last_updated
|
||||||
|
|
|
@ -17006,7 +17006,8 @@ class PubServer(BaseHTTPRequestHandler):
|
||||||
self.server.blocked_cache,
|
self.server.blocked_cache,
|
||||||
self.server.blocked_cache_update_secs,
|
self.server.blocked_cache_update_secs,
|
||||||
self.server.crawlers_allowed,
|
self.server.crawlers_allowed,
|
||||||
self.server.known_bots)
|
self.server.known_bots,
|
||||||
|
self.path, self.server.block_military)
|
||||||
if block:
|
if block:
|
||||||
self._400()
|
self._400()
|
||||||
return
|
return
|
||||||
|
@ -22606,7 +22607,8 @@ class PubServer(BaseHTTPRequestHandler):
|
||||||
self.server.blocked_cache,
|
self.server.blocked_cache,
|
||||||
self.server.blocked_cache_update_secs,
|
self.server.blocked_cache_update_secs,
|
||||||
self.server.crawlers_allowed,
|
self.server.crawlers_allowed,
|
||||||
self.server.known_bots)
|
self.server.known_bots,
|
||||||
|
self.path, self.server.block_military)
|
||||||
if block:
|
if block:
|
||||||
self._400()
|
self._400()
|
||||||
self.server.postreq_busy = False
|
self.server.postreq_busy = False
|
||||||
|
|
Loading…
Reference in New Issue