Option to allow some crawlers

merge-requests/30/head
Bob Mottram 2022-03-06 12:31:58 +00:00
parent 3aab054c04
commit 35883119be
3 changed files with 41 additions and 10 deletions

View File

@@ -13983,7 +13983,8 @@ class PubServer(BaseHTTPRequestHandler):
self.server.blocked_cache_last_updated,
self.server.base_dir,
self.server.blocked_cache,
self.server.blocked_cache_update_secs)
self.server.blocked_cache_update_secs,
self.server.crawlers_allowed)
if block:
self._400()
return
@@ -18524,7 +18525,8 @@ class PubServer(BaseHTTPRequestHandler):
self.server.blocked_cache_last_updated,
self.server.base_dir,
self.server.blocked_cache,
self.server.blocked_cache_update_secs)
self.server.blocked_cache_update_secs,
self.server.crawlers_allowed)
if block:
self._400()
self.server.postreq_busy = False
@@ -19457,7 +19459,8 @@ def load_tokens(base_dir: str, tokens_dict: {}, tokens_lookup: {}) -> None:
break
def run_daemon(dyslexic_font: bool,
def run_daemon(crawlers_allowed: [],
dyslexic_font: bool,
content_license_url: str,
lists_enabled: str,
default_reply_interval_hrs: int,
@@ -19636,6 +19639,9 @@ def run_daemon(dyslexic_font: bool,
# list of blocked user agent types within the User-Agent header
httpd.user_agents_blocked = user_agents_blocked
# list of crawler bots permitted within the User-Agent header
httpd.crawlers_allowed = crawlers_allowed
httpd.unit_test = unit_test
httpd.allow_local_network_access = allow_local_network_access
if unit_test:

View File

@@ -141,6 +141,10 @@ parser.add_argument('--lists_enabled', type=str,
parser.add_argument('--userAgentBlocks', type=str,
default=None,
help='List of blocked user agents, separated by commas')
parser.add_argument('--crawlersAllowed', type=str,
default=None,
help='List of permitted web crawler user agents, ' +
'separated by commas')
parser.add_argument('--libretranslate', dest='libretranslateUrl', type=str,
default=None,
help='URL for LibreTranslate service')
@@ -3301,8 +3305,20 @@ else:
get_config_param(base_dir, 'userAgentsBlocked')
if user_agents_blocked_str:
agent_blocks_list = user_agents_blocked_str.split(',')
for agentBlockStr in agent_blocks_list:
user_agents_blocked.append(agentBlockStr.strip())
for user_agents_blocked_str2 in agent_blocks_list:
user_agents_blocked.append(user_agents_blocked_str2.strip())
crawlers_allowed = []
if args.crawlersAllowed:
crawlers_allowed_str = args.crawlersAllowed
set_config_param(base_dir, 'crawlersAllowed', crawlers_allowed_str)
else:
crawlers_allowed_str = \
get_config_param(base_dir, 'crawlersAllowed')
if crawlers_allowed_str:
crawlers_allowed_list = crawlers_allowed_str.split(',')
for crawlers_allowed_str2 in crawlers_allowed_list:
crawlers_allowed.append(crawlers_allowed_str2.strip())
lists_enabled = ''
if args.lists_enabled:
@@ -3365,7 +3381,8 @@ if args.defaultCurrency:
print('Default currency set to ' + args.defaultCurrency)
if __name__ == "__main__":
run_daemon(args.dyslexic_font,
run_daemon(crawlers_allowed,
args.dyslexic_font,
content_license_url,
lists_enabled,
args.default_reply_interval_hrs,

View File

@@ -822,8 +822,10 @@ def create_server_alice(path: str, domain: str, port: int,
lists_enabled = ''
content_license_url = 'https://creativecommons.org/licenses/by/4.0'
dyslexic_font = False
crawlers_allowed = []
print('Server running: Alice')
run_daemon(dyslexic_font,
run_daemon(crawlers_allowed,
dyslexic_font,
content_license_url,
lists_enabled, default_reply_interval_hrs,
low_bandwidth, max_like_count,
@@ -975,8 +977,10 @@ def create_server_bob(path: str, domain: str, port: int,
lists_enabled = ''
content_license_url = 'https://creativecommons.org/licenses/by/4.0'
dyslexic_font = False
crawlers_allowed = []
print('Server running: Bob')
run_daemon(dyslexic_font,
run_daemon(crawlers_allowed,
dyslexic_font,
content_license_url,
lists_enabled, default_reply_interval_hrs,
low_bandwidth, max_like_count,
@@ -1051,8 +1055,10 @@ def create_server_eve(path: str, domain: str, port: int, federation_list: [],
lists_enabled = ''
content_license_url = 'https://creativecommons.org/licenses/by/4.0'
dyslexic_font = False
crawlers_allowed = []
print('Server running: Eve')
run_daemon(dyslexic_font,
run_daemon(crawlers_allowed,
dyslexic_font,
content_license_url,
lists_enabled, default_reply_interval_hrs,
low_bandwidth, max_like_count,
@@ -1129,8 +1135,10 @@ def create_server_group(path: str, domain: str, port: int,
lists_enabled = ''
content_license_url = 'https://creativecommons.org/licenses/by/4.0'
dyslexic_font = False
crawlers_allowed = []
print('Server running: Group')
run_daemon(dyslexic_font,
run_daemon(crawlers_allowed,
dyslexic_font,
content_license_url,
lists_enabled, default_reply_interval_hrs,
low_bandwidth, max_like_count,