2022-03-06 12:56:26 +00:00
|
|
|
__filename__ = "crawlers.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
2023-01-21 23:03:30 +00:00
|
|
|
__version__ = "1.4.0"
|
2022-03-06 12:56:26 +00:00
|
|
|
__maintainer__ = "Bob Mottram"
|
|
|
|
__email__ = "bob@libreserver.org"
|
|
|
|
__status__ = "Production"
|
|
|
|
__module_group__ = "Core"
|
|
|
|
|
2022-03-06 14:02:26 +00:00
|
|
|
import os
|
2022-03-06 12:56:26 +00:00
|
|
|
import time
|
|
|
|
from utils import save_json
|
|
|
|
from utils import user_agent_domain
|
2022-06-21 11:58:50 +00:00
|
|
|
from utils import remove_eol
|
2023-07-18 14:55:26 +00:00
|
|
|
from blocking import get_mil_domains_list
|
2022-03-06 12:56:26 +00:00
|
|
|
from blocking import update_blocked_cache
|
|
|
|
from blocking import is_blocked_domain
|
|
|
|
|
|
|
|
# Substrings which always cause a request to be rejected. They are
# matched against the lowercased User-Agent header, so entries
# should be given in lower case
default_user_agent_blocks = ['fedilist', 'ncsc scan']
|
|
|
|
|
|
|
|
|
|
|
|
def update_known_crawlers(ua_str: str,
                          base_dir: str, known_crawlers: {},
                          last_known_crawler: int):
    """Records a hit from the given user agent within the dictionary
    of known crawlers accessing nodeinfo or the masto API.
    The dictionary is updated in place. If at least 30 seconds have
    elapsed since last_known_crawler then entries not seen for 30 days
    are removed and the dictionary is saved to
    accounts/knownCrawlers.json
    Returns the current time in seconds, or None if there is
    no user agent
    """
    if not ua_str:
        return None

    now = int(time.time())
    entry = known_crawlers.get(ua_str)
    if entry:
        # seen before, so bump its statistics
        entry['hits'] += 1
        entry['lastseen'] = now
    else:
        known_crawlers[ua_str] = {
            "lastseen": now,
            "hits": 1
        }

    # only prune and save occasionally
    if now - last_known_crawler < 30:
        return now

    # remove any old observations. The keys are gathered first
    # so that the dictionary is not altered while iterating over it
    max_age_secs = 60 * 60 * 24 * 30
    expired = [uagent for uagent, item in known_crawlers.items()
               if now - item['lastseen'] >= max_age_secs]
    for uagent in expired:
        del known_crawlers[uagent]

    # save the list of crawlers
    save_json(known_crawlers,
              base_dir + '/accounts/knownCrawlers.json')
    return now
|
|
|
|
|
|
|
|
|
2022-03-06 14:20:25 +00:00
|
|
|
def load_known_web_bots(base_dir: str) -> []:
    """Reads accounts/knownBots.txt, which has one bot per line, and
    returns the bots as a list without duplicates or blank entries,
    in the order in which they first appear.
    An empty list is returned if the file is missing, empty
    or cannot be read
    """
    bots_filename = base_dir + '/accounts/knownBots.txt'
    if not os.path.isfile(bots_filename):
        return []

    file_text = None
    try:
        with open(bots_filename, 'r', encoding='utf-8') as fp_bots:
            file_text = fp_bots.read()
    except OSError:
        print('EX: unable to load web bots from ' +
              bots_filename)
    if not file_text:
        return []

    # dictionary keys keep their insertion order, so this removes
    # duplicates while retaining the first occurrence of each bot
    unique_bots = {}
    for raw_line in file_text.split('\n'):
        if not raw_line:
            continue
        bot_name = remove_eol(raw_line).strip()
        if bot_name:
            unique_bots.setdefault(bot_name, None)
    return list(unique_bots)
|
2022-03-06 14:02:26 +00:00
|
|
|
|
|
|
|
|
2022-03-06 14:20:25 +00:00
|
|
|
def _save_known_web_bots(base_dir: str, known_bots: []) -> bool:
    """Writes the given list of known web bots to
    accounts/knownBots.txt, one stripped entry per line.
    Returns True on success, or False if the file
    could not be written
    """
    bots_filename = base_dir + '/accounts/knownBots.txt'
    # every entry is terminated by a newline, including the last
    file_text = ''.join(bot.strip() + '\n' for bot in known_bots)
    try:
        with open(bots_filename, 'w+', encoding='utf-8') as fp_bots:
            fp_bots.write(file_text)
    except OSError:
        print("EX: unable to save known web bots to " +
              bots_filename)
        return False
    return True
|
|
|
|
|
|
|
|
|
2022-03-06 12:56:26 +00:00
|
|
|
def blocked_user_agent(calling_domain: str, agent_str: str,
                       news_instance: bool, debug: bool,
                       user_agents_blocked: [],
                       blocked_cache_last_updated,
                       base_dir: str,
                       blocked_cache: [],
                       blocked_cache_update_secs: int,
                       crawlers_allowed: [],
                       known_bots: [], path: str,
                       block_military: {}):
    """Should a GET or POST be blocked based upon its user agent?

    Returns a tuple of (blocked, blocked_cache_last_updated), where
    blocked is True if the request should be rejected and
    blocked_cache_last_updated is either the value passed in or the
    value returned by update_blocked_cache.

    The checks are made in this order:
      1. a missing User-Agent is always blocked
      2. agents containing any of default_user_agent_blocks are blocked
      3. web crawlers are blocked unless this is a news instance or
         the crawler matches an entry within crawlers_allowed
      4. agents containing any of user_agents_blocked are blocked
      5. the domain extracted from the agent is checked against the
         blocked cache, unless it begins with calling_domain
      6. if the account within path has opted to block military
         domains then the agent domain is checked against those

    Side effects: a newly seen crawler is appended to known_bots,
    which is sorted in place and then saved via _save_known_web_bots
    """
    if not agent_str:
        # no User-Agent header is present
        return True, blocked_cache_last_updated

    agent_str_lower = agent_str.lower()
    for ua_block in default_user_agent_blocks:
        if ua_block in agent_str_lower:
            print('BLOCK: Blocked User agent 1: ' + ua_block)
            return True, blocked_cache_last_updated

    # is this a web crawler? If so then block it by default
    # unless this is a news instance or if it is in the allowed list
    bot_strings = ('bot/', 'bot-', '/bot', '/robot', 'gptbot')
    # agents containing any of these are not treated as crawlers,
    # even if one of the bot strings matches
    not_bot_strings = ('://bot', '://robot', 'pixelfedbot/')
    contains_bot_string = \
        any(bot_str in agent_str_lower for bot_str in bot_strings) and \
        not any(excl in agent_str_lower for excl in not_bot_strings)

    if contains_bot_string:
        # keep a record of every crawler which has been seen
        if agent_str_lower not in known_bots:
            known_bots.append(agent_str_lower)
            known_bots.sort()
            _save_known_web_bots(base_dir, known_bots)
        # if this is a news instance then we want it
        # to be indexed by search engines
        if news_instance:
            return False, blocked_cache_last_updated
        # is this crawler allowed?
        for crawler in crawlers_allowed:
            if crawler.lower() in agent_str_lower:
                return False, blocked_cache_last_updated
        print('BLOCK: Blocked Crawler: ' + agent_str)
        return True, blocked_cache_last_updated

    # get domain name from User-Agent
    agent_domain = user_agent_domain(agent_str, debug)

    # is the User-Agent type blocked? eg. "Mastodon"
    # Note that this match is case sensitive
    if user_agents_blocked:
        if any(agent_name in agent_str
               for agent_name in user_agents_blocked):
            return True, blocked_cache_last_updated

    if not agent_domain:
        return False, blocked_cache_last_updated

    # is the User-Agent domain blocked
    blocked_ua = False
    if not agent_domain.startswith(calling_domain):
        blocked_cache_last_updated = \
            update_blocked_cache(base_dir, blocked_cache,
                                 blocked_cache_last_updated,
                                 blocked_cache_update_secs)

        blocked_ua = \
            is_blocked_domain(base_dir, agent_domain, blocked_cache)
        if blocked_ua:
            print('BLOCK: Blocked User agent 2: ' + agent_domain)

    # optionally block military domains on a per account basis
    if not blocked_ua and block_military:
        if '/users/' in path:
            # which account is this?
            nickname = path.split('/users/')[1]
            if '/' in nickname:
                nickname = nickname.split('/')[0]
            # does this account block military domains?
            if block_military.get(nickname):
                mil_domains = get_mil_domains_list()
                for domain_str in mil_domains:
                    if '.' not in domain_str:
                        # an entry without a dot is a top level domain
                        if agent_domain.endswith('.' + domain_str):
                            blocked_ua = True
                            print('BLOCK: Blocked military tld user agent: ' +
                                  agent_domain)
                            break
                    else:
                        # NOTE(review): this is a plain suffix match, so an
                        # entry "example.org" also matches
                        # "notexample.org" - confirm that this is intended
                        if agent_domain.endswith(domain_str):
                            blocked_ua = True
                            print('BLOCK: Blocked military user agent: ' +
                                  agent_domain)
                            break

    return blocked_ua, blocked_cache_last_updated
|