mirror of https://gitlab.com/bashrc2/epicyon
Change variable name to avoid confusion
parent 5f1f973d85
commit 5b218919b3

crawlers.py · 52 changed lines
@@ -52,22 +52,22 @@ def update_known_crawlers(ua_str: str,
     return curr_time


-def load_known_web_crawlers(base_dir: str) -> []:
-    """Returns a list of known web crawlers
+def load_known_web_bots(base_dir: str) -> []:
+    """Returns a list of known web bots
     """
-    known_crawlers_filename = base_dir + '/accounts/known_web_bots.txt'
-    if not os.path.isfile(known_crawlers_filename):
+    known_bots_filename = base_dir + '/accounts/known_web_bots.txt'
+    if not os.path.isfile(known_bots_filename):
         return []
     crawlers_str = None
     try:
-        with open(known_crawlers_filename, 'r') as fp_crawlers:
+        with open(known_bots_filename, 'r') as fp_crawlers:
             crawlers_str = fp_crawlers.read()
     except OSError:
-        print('EX: unable to load web crawlers from ' +
-              known_crawlers_filename)
+        print('EX: unable to load web bots from ' +
+              known_bots_filename)
     if not crawlers_str:
         return []
-    known_crawlers = []
+    known_bots = []
     crawlers_list = crawlers_str.split('\n')
     for crawler in crawlers_list:
         if not crawler:
@@ -75,24 +75,24 @@ def load_known_web_crawlers(base_dir: str) -> []:
         crawler = crawler.replace('\n', '').strip()
         if not crawler:
             continue
-        if crawler not in known_crawlers:
-            known_crawlers.append(crawler)
-    return known_crawlers
+        if crawler not in known_bots:
+            known_bots.append(crawler)
+    return known_bots


-def _save_known_web_crawlers(base_dir: str, known_crawlers: []) -> bool:
-    """Saves a list of known web crawlers
+def _save_known_web_bots(base_dir: str, known_bots: []) -> bool:
+    """Saves a list of known web bots
     """
-    known_crawlers_filename = base_dir + '/accounts/known_web_bots.txt'
-    known_crawlers_str = ''
-    for crawler in known_crawlers:
-        known_crawlers_str += crawler.strip() + '\n'
+    known_bots_filename = base_dir + '/accounts/known_web_bots.txt'
+    known_bots_str = ''
+    for crawler in known_bots:
+        known_bots_str += crawler.strip() + '\n'
     try:
-        with open(known_crawlers_filename, 'w+') as fp_crawlers:
-            fp_crawlers.write(known_crawlers_str)
+        with open(known_bots_filename, 'w+') as fp_crawlers:
+            fp_crawlers.write(known_bots_str)
     except OSError:
-        print("EX: unable to save known web crawlers to " +
-              known_crawlers_filename)
+        print("EX: unable to save known web bots to " +
+              known_bots_filename)
         return False
     return True
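Taken together, the renamed pair round-trips a plain newline-separated list under accounts/known_web_bots.txt. A minimal standalone sketch of that load/save pattern, assuming nothing beyond the file layout shown in the diff (save_bot_list and load_bot_list are illustrative names, not Epicyon functions):

import os
import tempfile

def save_bot_list(filename: str, bots: []) -> bool:
    """One user agent per line, as in _save_known_web_bots"""
    try:
        with open(filename, 'w+') as fp_bots:
            fp_bots.write(''.join(bot.strip() + '\n' for bot in bots))
    except OSError:
        return False
    return True

def load_bot_list(filename: str) -> []:
    """Read the list back, skipping blanks and duplicates,
    as in load_known_web_bots"""
    if not os.path.isfile(filename):
        return []
    try:
        with open(filename, 'r') as fp_bots:
            lines = fp_bots.read().split('\n')
    except OSError:
        return []
    bots = []
    for line in lines:
        line = line.strip()
        if line and line not in bots:
            bots.append(line)
    return bots

# round trip: what was saved is what load returns, in file order
path = os.path.join(tempfile.gettempdir(), 'known_web_bots.txt')
save_bot_list(path, ['googlebot/2.1', 'bingbot/2.0'])
assert load_bot_list(path) == ['googlebot/2.1', 'bingbot/2.0']

Note that neither direction sorts the list; ordering is imposed later, in blocked_user_agent.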
@@ -105,7 +105,7 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
                        blocked_cache: [],
                        blocked_cache_update_secs: int,
                        crawlers_allowed: [],
-                       known_crawlers: []):
+                       known_bots: []):
     """Should a GET or POST be blocked based upon its user agent?
     """
     if not agent_str:
@@ -123,10 +123,10 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
     # is this a web crawler? If so then block it by default
     # unless this is a news instance or if it is in the allowed list
     if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
-        if agent_str_lower not in known_crawlers:
-            known_crawlers.append(agent_str_lower)
-            known_crawlers.sort()
-            _save_known_web_crawlers(base_dir, known_crawlers)
+        if agent_str_lower not in known_bots:
+            known_bots.append(agent_str_lower)
+            known_bots.sort()
+            _save_known_web_bots(base_dir, known_bots)
         # if this is a news instance then we want it
         # to be indexed by search engines
         if news_instance:
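The rule above is deliberately coarse: any user agent containing 'bot/' or 'bot-' is treated as a crawler, remembered, and blocked unless the instance is a news instance or the agent is on the allowed list. A simplified sketch of that decision, assuming an exact-match allowed list (is_blocked_bot is an illustrative name; the real blocked_user_agent also consults the blocked cache, persists via _save_known_web_bots, and returns more than a boolean):

def is_blocked_bot(agent_str: str, news_instance: bool,
                   crawlers_allowed: [], known_bots: []) -> bool:
    """Simplified form of the crawler check in blocked_user_agent"""
    agent_str_lower = agent_str.lower()
    if 'bot/' not in agent_str_lower and 'bot-' not in agent_str_lower:
        return False  # does not self-identify as a crawler
    # remember every crawler seen, kept sorted as in the diff
    # (the real code also saves the list to disk at this point)
    if agent_str_lower not in known_bots:
        known_bots.append(agent_str_lower)
        known_bots.sort()
    # news instances want to be indexed by search engines
    if news_instance:
        return False
    return agent_str_lower not in crawlers_allowed

known = []
print(is_blocked_bot('ExampleBot/1.0', False, [], known))  # True, and recorded
print(is_blocked_bot('ExampleBot/1.0', True, [], known))   # False on a news instance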
daemon.py

@@ -380,7 +380,7 @@ from siteactive import referer_is_active
 from webapp_likers import html_likers_of_post
 from crawlers import update_known_crawlers
 from crawlers import blocked_user_agent
-from crawlers import load_known_web_crawlers
+from crawlers import load_known_web_bots
 import os

@@ -14010,7 +14010,7 @@ class PubServer(BaseHTTPRequestHandler):
                                self.server.blocked_cache,
                                self.server.blocked_cache_update_secs,
                                self.server.crawlers_allowed,
-                               self.server.known_crawlers)
+                               self.server.known_bots)
         if block:
             self._400()
             return
@@ -18553,7 +18553,7 @@ class PubServer(BaseHTTPRequestHandler):
                                self.server.blocked_cache,
                                self.server.blocked_cache_update_secs,
                                self.server.crawlers_allowed,
-                               self.server.known_crawlers)
+                               self.server.known_bots)
         if block:
             self._400()
             self.server.postreq_busy = False
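Both call sites follow the same guard pattern: ask blocked_user_agent for a verdict, answer 400 to refused crawlers, and stop processing. A sketch of that shape using the standard library, reusing the is_blocked_bot sketch above (GuardedHandler is illustrative; Epicyon's PubServer does far more per request):

from http.server import BaseHTTPRequestHandler, HTTPServer

class GuardedHandler(BaseHTTPRequestHandler):
    """Shape of the GET/POST guard, not the real PubServer"""

    def do_GET(self):
        agent_str = self.headers.get('User-Agent', '')
        # self.server.known_bots mirrors httpd.known_bots set up in run_daemon
        if is_blocked_bot(agent_str, self.server.news_instance,
                          self.server.crawlers_allowed,
                          self.server.known_bots):
            self.send_response(400)  # corresponds to self._400() in the diff
            self.end_headers()
            return
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b'ok')

httpd = HTTPServer(('127.0.0.1', 8080), GuardedHandler)
httpd.news_instance = False
httpd.crawlers_allowed = []
httpd.known_bots = []
# httpd.serve_forever()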
@@ -19670,7 +19670,7 @@ def run_daemon(crawlers_allowed: [],
     httpd.crawlers_allowed = crawlers_allowed

     # list of web crawlers known to the system
-    httpd.known_crawlers = load_known_web_crawlers(base_dir)
+    httpd.known_bots = load_known_web_bots(base_dir)

     httpd.unit_test = unit_test
     httpd.allow_local_network_access = allow_local_network_access
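This last hunk is the initialisation point: the persisted list is loaded once at startup and attached to the server object, so every request handler shares the same mutable list. A short sketch of that start-up wiring, with load_bot_list from the earlier sketch standing in for load_known_web_bots:

def wire_known_bots(httpd, base_dir: str, crawlers_allowed: []) -> None:
    """Mirrors the run_daemon wiring in the hunk above"""
    httpd.crawlers_allowed = crawlers_allowed
    # list of web crawlers known to the system, persisted across restarts
    httpd.known_bots = load_bot_list(base_dir + '/accounts/known_web_bots.txt')

Because known_bots is shared by reference, the append/sort/save in blocked_user_agent both updates the copy that later requests see and persists newly observed crawlers for the next restart.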