From 92b9942183ee66b5335dd01295f31b9819ef634f Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 31 Mar 2022 16:12:30 +0100 Subject: [PATCH 1/4] Improve blocking of user agent domains --- crawlers.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/crawlers.py b/crawlers.py index 92a8246ae..e870ac320 100644 --- a/crawlers.py +++ b/crawlers.py @@ -113,8 +113,14 @@ def blocked_user_agent(calling_domain: str, agent_str: str, agent_str_lower = agent_str.lower() for ua_block in default_user_agent_blocks: - if ua_block in agent_str_lower: - print('Blocked User agent: ' + ua_block) + if agent_str_lower.endswith(ua_block) or \ + ua_block + "/" in agent_str_lower or \ + ua_block + ")" in agent_str_lower or \ + ua_block + ";" in agent_str_lower or \ + ua_block + ">" in agent_str_lower or \ + ua_block + "<" in agent_str_lower or \ + ua_block + " " in agent_str_lower: + print('Blocked User agent 1: ' + ua_block) return True, blocked_cache_last_updated agent_domain = None @@ -176,5 +182,5 @@ def blocked_user_agent(calling_domain: str, agent_str: str, is_blocked_domain(base_dir, agent_domain, blocked_cache) # if self.server.debug: if blocked_ua: - print('Blocked User agent: ' + agent_domain) + print('Blocked User agent 2: ' + agent_domain) return blocked_ua, blocked_cache_last_updated From c9cd6ea9abb379d3f39611107ceb5cecc67595cc Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 31 Mar 2022 16:18:42 +0100 Subject: [PATCH 2/4] Improve blocking of user agent domains --- crawlers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawlers.py b/crawlers.py index e870ac320..7ba56d222 100644 --- a/crawlers.py +++ b/crawlers.py @@ -117,6 +117,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str, ua_block + "/" in agent_str_lower or \ ua_block + ")" in agent_str_lower or \ ua_block + ";" in agent_str_lower or \ + ua_block + "," in agent_str_lower or \ ua_block + ">" in agent_str_lower or \ ua_block + "<" in agent_str_lower or \ ua_block + " " in agent_str_lower: From c395688c185865386d071124afdf54b56fa09e38 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 31 Mar 2022 17:14:19 +0100 Subject: [PATCH 3/4] Improve user agent blocking --- blocking.py | 8 ++++---- crawlers.py | 9 +-------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/blocking.py b/blocking.py index 7fcde830a..d0e24173f 100644 --- a/blocking.py +++ b/blocking.py @@ -327,10 +327,10 @@ def is_blocked_domain(base_dir: str, domain: str, if not broch_mode_is_active(base_dir): if blocked_cache: for blocked_str in blocked_cache: - if '*@' + domain in blocked_str: + if blocked_str == '*@' + domain: return True if short_domain: - if '*@' + short_domain in blocked_str: + if blocked_str == '*@' + short_domain: return True else: # instance block list @@ -339,10 +339,10 @@ def is_blocked_domain(base_dir: str, domain: str, try: with open(global_blocking_filename, 'r') as fp_blocked: blocked_str = fp_blocked.read() - if '*@' + domain in blocked_str: + if '*@' + domain + '\n' in blocked_str: return True if short_domain: - if '*@' + short_domain in blocked_str: + if '*@' + short_domain + '\n' in blocked_str: return True except OSError as ex: print('EX: unable to read ' + global_blocking_filename + diff --git a/crawlers.py b/crawlers.py index 7ba56d222..fcca4d056 100644 --- a/crawlers.py +++ b/crawlers.py @@ -113,14 +113,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str, agent_str_lower = agent_str.lower() for ua_block in default_user_agent_blocks: - if agent_str_lower.endswith(ua_block) or \ - ua_block + "/" in agent_str_lower or \ - ua_block + ")" in agent_str_lower or \ - ua_block + ";" in agent_str_lower or \ - ua_block + "," in agent_str_lower or \ - ua_block + ">" in agent_str_lower or \ - ua_block + "<" in agent_str_lower or \ - ua_block + " " in agent_str_lower: + if ua_block in agent_str_lower: print('Blocked User agent 1: ' + ua_block) return True, blocked_cache_last_updated From 92f3bbb763da9d8d7b65c7ceae6bdbe0bd975f2e Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Thu, 31 Mar 2022 17:33:21 +0100 Subject: [PATCH 4/4] Improve user agent blocking --- blocking.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/blocking.py b/blocking.py index d0e24173f..d2e5f7d64 100644 --- a/blocking.py +++ b/blocking.py @@ -379,7 +379,7 @@ def is_blocked(base_dir: str, nickname: str, domain: str, if '*@' + domain in blocked_str: return True if block_handle: - if block_handle in blocked_str: + if blocked_str == block_handle: return True else: global_blocks_filename = base_dir + '/accounts/blocking.txt' @@ -387,33 +387,34 @@ def is_blocked(base_dir: str, nickname: str, domain: str, if '*@' + block_domain in open(global_blocks_filename).read(): return True if block_handle: - if block_handle in open(global_blocks_filename).read(): + block_str = block_handle + '\n' + if block_str in open(global_blocks_filename).read(): return True else: # instance allow list allow_filename = base_dir + '/accounts/allowedinstances.txt' short_domain = _get_short_domain(block_domain) if not short_domain: - if block_domain not in open(allow_filename).read(): + if block_domain + '\n' not in open(allow_filename).read(): return True else: - if short_domain not in open(allow_filename).read(): + if short_domain + '\n' not in open(allow_filename).read(): return True # account level allow list account_dir = acct_dir(base_dir, nickname, domain) allow_filename = account_dir + '/allowedinstances.txt' if os.path.isfile(allow_filename): - if block_domain not in open(allow_filename).read(): + if block_domain + '\n' not in open(allow_filename).read(): return True # account level block list blocking_filename = account_dir + '/blocking.txt' if os.path.isfile(blocking_filename): - if '*@' + block_domain in open(blocking_filename).read(): + if '*@' + block_domain + '\n' in open(blocking_filename).read(): return True if block_handle: - if block_handle in open(blocking_filename).read(): + if block_handle + '\n' in open(blocking_filename).read(): return True return False