mirror of https://gitlab.com/bashrc2/epicyon
Save a list of known web crawlers
parent 885a2b82c9
commit 9be61c2b38
53  crawlers.py
@@ -7,6 +7,7 @@ __email__ = "bob@libreserver.org"
 __status__ = "Production"
 __module_group__ = "Core"
 
+import os
 import time
 from utils import save_json
 from utils import user_agent_domain
@@ -51,6 +52,51 @@ def update_known_crawlers(ua_str: str,
     return curr_time
 
 
+def load_known_web_crawlers(base_dir: str) -> []:
+    """Returns a list of known web crawlers
+    """
+    known_crawlers_filename = base_dir + '/accounts/known_crawlers.txt'
+    if not os.path.isfile(known_crawlers_filename):
+        return []
+    crawlers_str = None
+    try:
+        with open(known_crawlers_filename, 'r') as fp_crawlers:
+            crawlers_str = fp_crawlers.read()
+    except OSError:
+        print('EX: unable to load web crawlers from ' +
+              known_crawlers_filename)
+    if not crawlers_str:
+        return []
+    known_crawlers = []
+    crawlers_list = crawlers_str.split('\n')
+    for crawler in crawlers_list:
+        if not crawler:
+            continue
+        crawler = crawler.replace('\n', '').strip()
+        if not crawler:
+            continue
+        if crawler not in known_crawlers:
+            known_crawlers.append(crawler)
+    return known_crawlers
+
+
+def _save_known_web_crawlers(base_dir: str, known_crawlers: []) -> bool:
+    """Saves a list of known web crawlers
+    """
+    known_crawlers_filename = base_dir + '/accounts/known_crawlers.txt'
+    known_crawlers_str = ''
+    for crawler in known_crawlers:
+        known_crawlers_str += crawler.strip() + '\n'
+    try:
+        with open(known_crawlers_filename, 'w+') as fp_crawlers:
+            fp_crawlers.write(known_crawlers_str)
+    except OSError:
+        print("EX: unable to save known web crawlers to " +
+              known_crawlers_filename)
+        return False
+    return True
+
+
 def blocked_user_agent(calling_domain: str, agent_str: str,
                        news_instance: bool, debug: bool,
                        user_agents_blocked: [],
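
The two new helpers above give accounts/known_crawlers.txt a simple one-user-agent-per-line format. As a rough round-trip sketch (not part of the commit; it assumes crawlers.py is importable and creates a throwaway base_dir with the accounts/ subdirectory the helpers expect):

# hypothetical usage, not from the repository
import os
import tempfile
from crawlers import load_known_web_crawlers, _save_known_web_crawlers

base_dir = tempfile.mkdtemp()        # stands in for the Epicyon data directory
os.mkdir(base_dir + '/accounts')     # the helpers expect accounts/ to exist

_save_known_web_crawlers(base_dir, ['examplebot/1.0', 'otherbot-fetcher'])
print(load_known_web_crawlers(base_dir))
# ['examplebot/1.0', 'otherbot-fetcher']
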
@@ -58,7 +104,8 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
                        base_dir: str,
                        blocked_cache: [],
                        blocked_cache_update_secs: int,
-                       crawlers_allowed: []):
+                       crawlers_allowed: [],
+                       known_crawlers: []):
     """Should a GET or POST be blocked based upon its user agent?
     """
     if not agent_str:
@@ -76,6 +123,10 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
     # is this a web crawler? If so then block it by default
     # unless this is a news instance or if it is in the allowed list
     if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
+        if agent_str_lower not in known_crawlers:
+            known_crawlers.append(agent_str_lower)
+            known_crawlers.sort()
+            _save_known_web_crawlers(base_dir, known_crawlers)
         # if this is a news instance then we want it
         # to be indexed by search engines
         if news_instance:
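
In effect, any User-Agent containing 'bot/' or 'bot-' now gets recorded (lowercased, deduplicated and sorted) before the usual allow/deny decision. A standalone sketch of just that detection step, using an invented user agent string and leaving out the file write:

# illustration only; the user agent string is made up
agent_str = 'Mozilla/5.0 (compatible; ExampleBot/2.1; +https://example.com/bot)'
agent_str_lower = agent_str.lower()
known_crawlers = ['otherbot/1.0']

if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
    if agent_str_lower not in known_crawlers:
        known_crawlers.append(agent_str_lower)
        known_crawlers.sort()
        # the real code calls _save_known_web_crawlers(base_dir, known_crawlers) here

print(known_crawlers)
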
10  daemon.py
@@ -380,6 +380,7 @@ from siteactive import referer_is_active
 from webapp_likers import html_likers_of_post
 from crawlers import update_known_crawlers
 from crawlers import blocked_user_agent
+from crawlers import load_known_web_crawlers
 import os
 
 
@@ -14008,7 +14009,8 @@ class PubServer(BaseHTTPRequestHandler):
                                self.server.base_dir,
                                self.server.blocked_cache,
                                self.server.blocked_cache_update_secs,
-                               self.server.crawlers_allowed)
+                               self.server.crawlers_allowed,
+                               self.server.known_crawlers)
         if block:
             self._400()
             return
@@ -18550,7 +18552,8 @@ class PubServer(BaseHTTPRequestHandler):
                                self.server.base_dir,
                                self.server.blocked_cache,
                                self.server.blocked_cache_update_secs,
-                               self.server.crawlers_allowed)
+                               self.server.crawlers_allowed,
+                               self.server.known_crawlers)
         if block:
             self._400()
             self.server.postreq_busy = False
@@ -19666,6 +19669,9 @@ def run_daemon(crawlers_allowed: [],
     # list of crawler bots permitted within the User-Agent header
     httpd.crawlers_allowed = crawlers_allowed
 
+    # list of web crawlers known to the system
+    httpd.known_crawlers = load_known_web_crawlers(base_dir)
+
     httpd.unit_test = unit_test
     httpd.allow_local_network_access = allow_local_network_access
     if unit_test:
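
So the server loads the previously seen crawlers once at startup and then passes the same list into blocked_user_agent on each request, which appends to it and re-saves it as new crawlers appear. A minimal startup sketch, assuming crawlers.py is importable and base_dir points at an Epicyon data directory (the path below is an assumption, not taken from the diff):

# hypothetical wiring, mirroring run_daemon above
from crawlers import load_known_web_crawlers

base_dir = '/var/lib/epicyon'    # assumed path
known_crawlers = load_known_web_crawlers(base_dir)
print('%d web crawlers previously seen' % len(known_crawlers))
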