Mirror of https://gitlab.com/bashrc2/epicyon
Poison some other LLM scrapers
parent: 544a5644b1
commit: 9d3bad77e1
46
crawlers.py
46
crawlers.py
|
@ -110,32 +110,49 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
blocked_cache_update_secs: int,
|
blocked_cache_update_secs: int,
|
||||||
crawlers_allowed: [],
|
crawlers_allowed: [],
|
||||||
known_bots: [], path: str,
|
known_bots: [], path: str,
|
||||||
block_military: {}) -> (bool,):
|
block_military: {}):
|
||||||
"""Should a GET or POST be blocked based upon its user agent?
|
"""Should a GET or POST be blocked based upon its user agent?
|
||||||
"""
|
"""
|
||||||
if not agent_str:
|
if not agent_str:
|
||||||
return True, blocked_cache_last_updated
|
return True, blocked_cache_last_updated, False
|
||||||
|
|
||||||
agent_str_lower = agent_str.lower()
|
agent_str_lower = agent_str.lower()
|
||||||
for ua_block in default_user_agent_blocks:
|
for ua_block in default_user_agent_blocks:
|
||||||
if ua_block in agent_str_lower:
|
if ua_block in agent_str_lower:
|
||||||
print('BLOCK: Blocked User agent 1: ' + ua_block)
|
print('BLOCK: Blocked User agent 1: ' + ua_block)
|
||||||
return True, blocked_cache_last_updated
|
return True, blocked_cache_last_updated, False
|
||||||
|
|
||||||
agent_domain = None
|
agent_domain = None
|
||||||
|
|
||||||
if agent_str:
|
if agent_str:
|
||||||
|
contains_bot_string = False
|
||||||
|
llm = False
|
||||||
|
|
||||||
|
# is this an LLM crawler?
|
||||||
|
llm_bot_strings = (
|
||||||
|
'gptbot', '-ai/', ' ai/', '-ai ', ' ai ', 'chatgpt',
|
||||||
|
'anthropic', 'mlbot'
|
||||||
|
)
|
||||||
|
for bot_str in llm_bot_strings:
|
||||||
|
if bot_str in agent_str_lower:
|
||||||
|
if '://bot' not in agent_str_lower and \
|
||||||
|
'://robot' not in agent_str_lower and \
|
||||||
|
'://spider' not in agent_str_lower and \
|
||||||
|
'pixelfedbot/' not in agent_str_lower:
|
||||||
|
contains_bot_string = True
|
||||||
|
llm = True
|
||||||
|
break
|
||||||
|
|
||||||
# is this a web crawler? If so then block it by default
|
# is this a web crawler? If so then block it by default
|
||||||
# unless this is a news instance or if it is in the allowed list
|
# unless this is a news instance or if it is in the allowed list
|
||||||
bot_strings = (
|
bot_strings = (
|
||||||
'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
|
'bot/', 'bot-', '/bot', '_bot', 'bot_', 'bot;', ' bot ',
|
||||||
'/robot', 'gptbot', '-ai/', ' ai/', '-ai ',
|
'/robot', 'spider/', 'spider.ht', '/spider.', '-spider',
|
||||||
' ai ', 'spider/', 'spider.ht', '/spider.', '-spider',
|
'externalhit/', 'google',
|
||||||
'externalhit/', 'chatgpt', 'google', 'anthropic',
|
|
||||||
'facebook', 'slurp', 'crawler', 'crawling', ' crawl ',
|
'facebook', 'slurp', 'crawler', 'crawling', ' crawl ',
|
||||||
'gigablast', 'archive.org', 'httrack',
|
'gigablast', 'archive.org', 'httrack',
|
||||||
'spider-', ' spider ', 'findlink', 'ips-agent',
|
'spider-', ' spider ', 'findlink', 'ips-agent',
|
||||||
'woriobot', 'mlbot', 'webbot', 'webcrawl',
|
'woriobot', 'webbot', 'webcrawl',
|
||||||
'voilabot', 'rank/', 'ezooms', 'heritrix', 'indeedbot',
|
'voilabot', 'rank/', 'ezooms', 'heritrix', 'indeedbot',
|
||||||
'woobot', 'infobot', 'viewbot', 'swimgbot', 'eright',
|
'woobot', 'infobot', 'viewbot', 'swimgbot', 'eright',
|
||||||
'apercite', 'bot (', 'summify', 'ccbot', 'linkfind',
|
'apercite', 'bot (', 'summify', 'ccbot', 'linkfind',
|
||||||
|
@ -143,7 +160,6 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
'drupact', 'searchengine', 'coccoc',
|
'drupact', 'searchengine', 'coccoc',
|
||||||
'explorer/', 'explorer;', 'crystalsemantics',
|
'explorer/', 'explorer;', 'crystalsemantics',
|
||||||
'scraper/', ' scraper ', ' scrape ', 'scraping')
|
'scraper/', ' scraper ', ' scrape ', 'scraping')
|
||||||
contains_bot_string = False
|
|
||||||
for bot_str in bot_strings:
|
for bot_str in bot_strings:
|
||||||
if bot_str in agent_str_lower:
|
if bot_str in agent_str_lower:
|
||||||
if '://bot' not in agent_str_lower and \
|
if '://bot' not in agent_str_lower and \
|
||||||
|
@ -160,18 +176,18 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
# if this is a news instance then we want it
|
# if this is a news instance then we want it
|
||||||
# to be indexed by search engines
|
# to be indexed by search engines
|
||||||
if news_instance:
|
if news_instance:
|
||||||
return False, blocked_cache_last_updated
|
return False, blocked_cache_last_updated, llm
|
||||||
# is this crawler allowed?
|
# is this crawler allowed?
|
||||||
for crawler in crawlers_allowed:
|
for crawler in crawlers_allowed:
|
||||||
if crawler.lower() in agent_str_lower:
|
if crawler.lower() in agent_str_lower:
|
||||||
return False, blocked_cache_last_updated
|
return False, blocked_cache_last_updated, llm
|
||||||
print('BLOCK: Blocked Crawler: ' + agent_str)
|
print('BLOCK: Blocked Crawler: ' + agent_str)
|
||||||
return True, blocked_cache_last_updated
|
return True, blocked_cache_last_updated, llm
|
||||||
# get domain name from User-Agent
|
# get domain name from User-Agent
|
||||||
agent_domain = user_agent_domain(agent_str, debug)
|
agent_domain = user_agent_domain(agent_str, debug)
|
||||||
else:
|
else:
|
||||||
# no User-Agent header is present
|
# no User-Agent header is present
|
||||||
return True, blocked_cache_last_updated
|
return True, blocked_cache_last_updated, False
|
||||||
|
|
||||||
# is the User-Agent type blocked? eg. "Mastodon"
|
# is the User-Agent type blocked? eg. "Mastodon"
|
||||||
if user_agents_blocked:
|
if user_agents_blocked:
|
||||||
|
@ -181,10 +197,10 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
blocked_ua = True
|
blocked_ua = True
|
||||||
break
|
break
|
||||||
if blocked_ua:
|
if blocked_ua:
|
||||||
return True, blocked_cache_last_updated
|
return True, blocked_cache_last_updated, False
|
||||||
|
|
||||||
if not agent_domain:
|
if not agent_domain:
|
||||||
return False, blocked_cache_last_updated
|
return False, blocked_cache_last_updated, False
|
||||||
|
|
||||||
# is the User-Agent domain blocked
|
# is the User-Agent domain blocked
|
||||||
blocked_ua = False
|
blocked_ua = False
|
||||||
|
@ -225,4 +241,4 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
||||||
agent_domain)
|
agent_domain)
|
||||||
break
|
break
|
||||||
|
|
||||||
return blocked_ua, blocked_cache_last_updated
|
return blocked_ua, blocked_cache_last_updated, False
|
||||||
|
|
|
@ -320,7 +320,7 @@ def daemon_http_get(self) -> None:
|
||||||
ua_str = get_user_agent(self)
|
ua_str = get_user_agent(self)
|
||||||
|
|
||||||
if not _permitted_crawler_path(self.path):
|
if not _permitted_crawler_path(self.path):
|
||||||
block, self.server.blocked_cache_last_updated = \
|
block, self.server.blocked_cache_last_updated, llm = \
|
||||||
blocked_user_agent(calling_domain, ua_str,
|
blocked_user_agent(calling_domain, ua_str,
|
||||||
self.server.news_instance,
|
self.server.news_instance,
|
||||||
self.server.debug,
|
self.server.debug,
|
||||||
|
@ -334,6 +334,15 @@ def daemon_http_get(self) -> None:
|
||||||
self.server.known_bots,
|
self.server.known_bots,
|
||||||
self.path, self.server.block_military)
|
self.path, self.server.block_military)
|
||||||
if block:
|
if block:
|
||||||
|
if llm:
|
||||||
|
msg = html_poisoned(self.server.dictionary)
|
||||||
|
msg = msg.encode('utf-8')
|
||||||
|
msglen = len(msg)
|
||||||
|
set_headers(self, 'text/html', msglen,
|
||||||
|
'', calling_domain, False)
|
||||||
|
write2(self, msg)
|
||||||
|
print('GET HTTP LLM scraper poisoned: ' + str(self.headers))
|
||||||
|
return
|
||||||
http_400(self)
|
http_400(self)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -163,7 +163,7 @@ def daemon_http_post(self) -> None:
|
||||||
|
|
||||||
ua_str = get_user_agent(self)
|
ua_str = get_user_agent(self)
|
||||||
|
|
||||||
block, self.server.blocked_cache_last_updated = \
|
block, self.server.blocked_cache_last_updated, _ = \
|
||||||
blocked_user_agent(calling_domain, ua_str,
|
blocked_user_agent(calling_domain, ua_str,
|
||||||
self.server.news_instance,
|
self.server.news_instance,
|
||||||
self.server.debug,
|
self.server.debug,
|
||||||
|
|
Loading…
Reference in New Issue