mirror of https://gitlab.com/bashrc2/epicyon
Check if LLM crawlers are encountered too frequently
parent 60c15d2358
commit fc7feaf1cd
|
@ -687,6 +687,9 @@ def run_daemon(accounts_data_dir: str,
|
||||||
|
|
||||||
httpd.starting_daemon = True
|
httpd.starting_daemon = True
|
||||||
|
|
||||||
|
# the last time when an LLM scraper was replied to
|
||||||
|
httpd.last_llm_time = None
|
||||||
|
|
||||||
# width, position and opacity of watermark applied to attached images
|
# width, position and opacity of watermark applied to attached images
|
||||||
# as a percentage of the attached image width
|
# as a percentage of the attached image width
|
||||||
httpd.watermark_width_percent = watermark_width_percent
|
httpd.watermark_width_percent = watermark_width_percent
|
||||||
|
|
|
@ -87,6 +87,7 @@ from httpcodes import http_304
|
||||||
from httpcodes import http_400
|
from httpcodes import http_400
|
||||||
from httpcodes import http_503
|
from httpcodes import http_503
|
||||||
from httpcodes import write2
|
from httpcodes import write2
|
||||||
|
from utils import date_utcnow
|
||||||
from utils import replace_strings
|
from utils import replace_strings
|
||||||
from utils import contains_invalid_chars
|
from utils import contains_invalid_chars
|
||||||
from utils import save_json
|
from utils import save_json
|
||||||
|
@ -349,8 +350,16 @@ def daemon_http_get(self) -> None:
|
||||||
self.path, self.server.block_military)
|
self.path, self.server.block_military)
|
||||||
if block:
|
if block:
|
||||||
if llm:
|
if llm:
|
||||||
|
# check if LLM is too frequent
|
||||||
|
if self.server.last_llm_time:
|
||||||
|
curr_date = date_utcnow()
|
||||||
|
time_diff = curr_date - self.server.last_llm_time
|
||||||
|
diff_secs = time_diff.total_seconds()
|
||||||
|
if diff_secs < 60:
|
||||||
|
http_402(self)
|
||||||
|
return
|
||||||
if is_image_file(self.path):
|
if is_image_file(self.path):
|
||||||
http_404(self, 723)
|
http_402(self)
|
||||||
return
|
return
|
||||||
# if this is an LLM crawler then feed it some trash
|
# if this is an LLM crawler then feed it some trash
|
||||||
print('GET HTTP LLM scraper poisoned: ' + str(self.headers))
|
print('GET HTTP LLM scraper poisoned: ' + str(self.headers))
|
||||||
|
@ -361,6 +370,7 @@ def daemon_http_get(self) -> None:
|
||||||
set_headers(self, 'text/html', msglen,
|
set_headers(self, 'text/html', msglen,
|
||||||
'', calling_domain, False)
|
'', calling_domain, False)
|
||||||
write2(self, msg)
|
write2(self, msg)
|
||||||
|
self.server.last_llm_time = date_utcnow()
|
||||||
return
|
return
|
||||||
http_400(self)
|
http_400(self)
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue