mirror of https://gitlab.com/bashrc2/epicyon
Merge branch 'main' of gitlab.com:bashrc2/epicyon
commit aff09b378e

crawlers.py | 56 changed lines
@@ -7,6 +7,7 @@ __email__ = "bob@libreserver.org"
 __status__ = "Production"
 __module_group__ = "Core"

+import os
 import time
 from utils import save_json
 from utils import user_agent_domain
@@ -51,6 +52,51 @@ def update_known_crawlers(ua_str: str,
     return curr_time


+def load_known_web_bots(base_dir: str) -> []:
+    """Returns a list of known web bots
+    """
+    known_bots_filename = base_dir + '/accounts/knownBots.txt'
+    if not os.path.isfile(known_bots_filename):
+        return []
+    crawlers_str = None
+    try:
+        with open(known_bots_filename, 'r') as fp_crawlers:
+            crawlers_str = fp_crawlers.read()
+    except OSError:
+        print('EX: unable to load web bots from ' +
+              known_bots_filename)
+    if not crawlers_str:
+        return []
+    known_bots = []
+    crawlers_list = crawlers_str.split('\n')
+    for crawler in crawlers_list:
+        if not crawler:
+            continue
+        crawler = crawler.replace('\n', '').strip()
+        if not crawler:
+            continue
+        if crawler not in known_bots:
+            known_bots.append(crawler)
+    return known_bots
+
+
+def _save_known_web_bots(base_dir: str, known_bots: []) -> bool:
+    """Saves a list of known web bots
+    """
+    known_bots_filename = base_dir + '/accounts/knownBots.txt'
+    known_bots_str = ''
+    for crawler in known_bots:
+        known_bots_str += crawler.strip() + '\n'
+    try:
+        with open(known_bots_filename, 'w+') as fp_crawlers:
+            fp_crawlers.write(known_bots_str)
+    except OSError:
+        print("EX: unable to save known web bots to " +
+              known_bots_filename)
+        return False
+    return True
+
+
 def blocked_user_agent(calling_domain: str, agent_str: str,
                        news_instance: bool, debug: bool,
                        user_agents_blocked: [],
@@ -58,7 +104,8 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
                        base_dir: str,
                        blocked_cache: [],
                        blocked_cache_update_secs: int,
-                       crawlers_allowed: []):
+                       crawlers_allowed: [],
+                       known_bots: []):
     """Should a GET or POST be blocked based upon its user agent?
     """
     if not agent_str:
@@ -73,8 +120,13 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
     agent_domain = None

     if agent_str:
-        # is this a web crawler? If so the block it
+        # is this a web crawler? If so then block it by default
+        # unless this is a news instance or if it is in the allowed list
         if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
+            if agent_str_lower not in known_bots:
+                known_bots.append(agent_str_lower)
+                known_bots.sort()
+                _save_known_web_bots(base_dir, known_bots)
             # if this is a news instance then we want it
             # to be indexed by search engines
             if news_instance:
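Taken together, the crawlers.py changes add a small persistence layer: any user agent containing 'bot/' or 'bot-' is lowercased, added to the in-memory known_bots list, and written back to accounts/knownBots.txt with one agent string per line. Below is a minimal self-contained sketch of that behaviour; it re-implements the logic rather than importing the module, and the function names and the /tmp directory are illustrative only.

import os


def looks_like_web_bot(agent_str: str) -> bool:
    """Same substring heuristic used in blocked_user_agent"""
    agent_str_lower = agent_str.lower()
    return 'bot/' in agent_str_lower or 'bot-' in agent_str_lower


def record_web_bot(base_dir: str, known_bots: list, agent_str: str) -> None:
    """Mirrors the append / sort / save sequence added to blocked_user_agent"""
    agent_str_lower = agent_str.lower()
    if agent_str_lower in known_bots:
        return
    known_bots.append(agent_str_lower)
    known_bots.sort()
    # knownBots.txt holds one user agent string per line
    with open(base_dir + '/accounts/knownBots.txt', 'w+') as fp_bots:
        fp_bots.write('\n'.join(known_bots) + '\n')


# usage with a throwaway directory standing in for an Epicyon base_dir
base_dir = '/tmp/epicyon-bots-sketch'
os.makedirs(base_dir + '/accounts', exist_ok=True)
known_bots = []
for user_agent in ('Mozilla/5.0 (X11; Linux x86_64)',
                   'ExampleBot/2.1 (+https://example.com/bot)'):
    if looks_like_web_bot(user_agent):
        record_web_bot(base_dir, known_bots, user_agent)
print(known_bots)  # ['examplebot/2.1 (+https://example.com/bot)']

The file is only rewritten when a previously unseen agent appears, so knownBots.txt grows slowly and doubles as a log of which search bots have visited the instance.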
daemon.py | 28 changed lines
@@ -380,6 +380,7 @@ from siteactive import referer_is_active
 from webapp_likers import html_likers_of_post
 from crawlers import update_known_crawlers
 from crawlers import blocked_user_agent
+from crawlers import load_known_web_bots
 import os

@@ -14008,7 +14009,8 @@ class PubServer(BaseHTTPRequestHandler):
                                    self.server.base_dir,
                                    self.server.blocked_cache,
                                    self.server.blocked_cache_update_secs,
-                                   self.server.crawlers_allowed)
+                                   self.server.crawlers_allowed,
+                                   self.server.known_bots)
         if block:
             self._400()
             return
@@ -14185,6 +14187,24 @@ class PubServer(BaseHTTPRequestHandler):
                             '_GET', 'isAuthorized',
                             self.server.debug)

+        if authorized and self.path.endswith('/bots.txt'):
+            known_bots_str = ''
+            for bot_name in self.server.known_bots:
+                known_bots_str += bot_name + '\n'
+            # TODO
+            msg = known_bots_str.encode('utf-8')
+            msglen = len(msg)
+            self._set_headers('text/plain; charset=utf-8',
+                              msglen, None, calling_domain, True)
+            self._write(msg)
+            if self.server.debug:
+                print('Sent known bots: ' +
+                      self.server.path + ' ' + calling_domain)
+            fitness_performance(getreq_start_time, self.server.fitness,
+                                '_GET', 'get_known_bots',
+                                self.server.debug)
+            return
+
         # shared items catalog for this instance
         # this is only accessible to instance members or to
         # other instances which present an authorization token
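With this handler in place, an authorized GET to /users/<nickname>/bots.txt returns the accumulated bot list as plain text, one user agent per line. The following is a rough client-side illustration only: the instance URL, nickname, password and the use of HTTP Basic auth are assumptions made for the sketch, not something this diff establishes.

import base64
import urllib.request

# placeholder values for illustration
instance = 'https://social.example'
nickname = 'admin'
password = 'examplepassword'

url = instance + '/users/' + nickname + '/bots.txt'
credentials = base64.b64encode(
    (nickname + ':' + password).encode('utf-8')).decode('ascii')
request = urllib.request.Request(
    url, headers={'Authorization': 'Basic ' + credentials})

# expect a text/plain body listing one known bot user agent per line
with urllib.request.urlopen(request) as response:
    for line in response.read().decode('utf-8').splitlines():
        if line:
            print(line)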
@@ -18550,7 +18570,8 @@ class PubServer(BaseHTTPRequestHandler):
                                    self.server.base_dir,
                                    self.server.blocked_cache,
                                    self.server.blocked_cache_update_secs,
-                                   self.server.crawlers_allowed)
+                                   self.server.crawlers_allowed,
+                                   self.server.known_bots)
         if block:
             self._400()
             self.server.postreq_busy = False
@@ -19666,6 +19687,9 @@ def run_daemon(crawlers_allowed: [],
     # list of crawler bots permitted within the User-Agent header
     httpd.crawlers_allowed = crawlers_allowed

+    # list of web crawlers known to the system
+    httpd.known_bots = load_known_web_bots(base_dir)
+
     httpd.unit_test = unit_test
     httpd.allow_local_network_access = allow_local_network_access
     if unit_test:
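Design-wise, run_daemon loads the bot list once at startup and hangs it off the server object, so both the GET and POST paths pass the same httpd.known_bots list into blocked_user_agent; newly seen bots then accumulate in memory and in accounts/knownBots.txt without the file ever being re-read. A compressed sketch of that wiring follows, assuming it runs inside the Epicyon source tree so that the crawlers module is importable; FakeServer and the paths here are stand-ins, not Epicyon's actual HTTP server.

# assumes the Epicyon source tree is on the path so crawlers is importable
from crawlers import load_known_web_bots


class FakeServer:
    """Stand-in for the attributes run_daemon sets on httpd"""
    def __init__(self, base_dir: str, crawlers_allowed: list):
        self.base_dir = base_dir
        # crawler bots permitted within the User-Agent header
        self.crawlers_allowed = crawlers_allowed
        # web crawlers known to the system, loaded once at startup
        self.known_bots = load_known_web_bots(base_dir)


httpd = FakeServer('/var/lib/epicyon', ['duckduckbot'])
print(str(len(httpd.known_bots)) + ' known bots at startup')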
translation files (one hunk per language):

@@ -516,5 +516,6 @@
     "Show who repeated this post": "أظهر من كرر هذا المنصب",
     "Repeated by": "يتكرر بواسطة",
     "Register": "يسجل",
-    "Web Crawlers Allowed": "برامج زحف الويب المسموح بها"
+    "Web Bots Allowed": "مسموح روبوتات الويب",
+    "Known Search Bots": "روبوتات بحث الويب المعروفة"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Mostra qui ha repetit aquesta publicació",
     "Repeated by": "Repetit per",
     "Register": "Registra't",
-    "Web Crawlers Allowed": "Es permeten rastrejadors web"
+    "Web Bots Allowed": "Bots web permesos",
+    "Known Search Bots": "Bots de cerca web coneguts"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn",
     "Repeated by": "Ailadrodd gan",
     "Register": "Cofrestrwch",
-    "Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe"
+    "Web Bots Allowed": "Web Bots a Ganiateir",
+    "Known Search Bots": "Bots Chwilio Gwe Hysbys"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat",
     "Repeated by": "Wiederholt von",
     "Register": "Registrieren",
-    "Web Crawlers Allowed": "Webcrawler erlaubt"
+    "Web Bots Allowed": "Webbots erlaubt",
+    "Known Search Bots": "Bekannte Bots für die Websuche"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Show who repeated this post",
     "Repeated by": "Repeated by",
     "Register": "Register",
-    "Web Crawlers Allowed": "Web Crawlers Allowed"
+    "Web Bots Allowed": "Web Search Bots Allowed",
+    "Known Search Bots": "Known Web Search Bots"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Mostrar quién repitió esta publicación",
     "Repeated by": "Repetido por",
     "Register": "Registrarse",
-    "Web Crawlers Allowed": "Rastreadores web permitidos"
+    "Web Bots Allowed": "Bots web permitidos",
+    "Known Search Bots": "Bots de búsqueda web conocidos"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Montrer qui a répété ce post",
     "Repeated by": "Répété par",
     "Register": "S'inscrire",
-    "Web Crawlers Allowed": "Robots d'exploration Web autorisés"
+    "Web Bots Allowed": "Robots Web autorisés",
+    "Known Search Bots": "Robots de recherche Web connus"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís",
     "Repeated by": "Arís agus arís eile ag",
     "Register": "Clár",
-    "Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe"
+    "Web Bots Allowed": "Róbónna Gréasáin Ceadaithe",
+    "Known Search Bots": "Róbónna Cuardach Gréasáin Aitheanta"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया",
     "Repeated by": "द्वारा दोहराया गया",
     "Register": "रजिस्टर करें",
-    "Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है"
+    "Web Bots Allowed": "वेब बॉट्स की अनुमति है",
+    "Known Search Bots": "ज्ञात वेब खोज बॉट्स"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Mostra chi ha ripetuto questo post",
     "Repeated by": "Ripetuto da",
     "Register": "Registrati",
-    "Web Crawlers Allowed": "Web crawler consentiti"
+    "Web Bots Allowed": "Web bot consentiti",
+    "Known Search Bots": "Bot di ricerca Web noti"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "この投稿を繰り返した人を表示する",
     "Repeated by": "によって繰り返される",
     "Register": "登録",
-    "Web Crawlers Allowed": "許可されるWebクローラー"
+    "Web Bots Allowed": "許可されたWebボット",
+    "Known Search Bots": "既知のWeb検索ボット"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "이 포스트를 반복한 사람 표시",
     "Repeated by": "반복한 사람",
     "Register": "등록",
-    "Web Crawlers Allowed": "웹 크롤러 허용"
+    "Web Bots Allowed": "웹 봇 허용",
+    "Known Search Bots": "알려진 웹 검색 봇"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Nîşan bide kê ev post dubare kiriye",
     "Repeated by": "Ji hêla dubare kirin",
     "Register": "Fêhrist",
-    "Web Crawlers Allowed": "Crawlers Web Destûrdar in"
+    "Web Bots Allowed": "Web Bots Destûrdar in",
+    "Known Search Bots": "Botên Lêgerîna Webê yên naskirî"
 }

@@ -512,5 +512,6 @@
     "Show who repeated this post": "Show who repeated this post",
     "Repeated by": "Repeated by",
     "Register": "Register",
-    "Web Crawlers Allowed": "Web Crawlers Allowed"
+    "Web Bots Allowed": "Web Search Bots Allowed",
+    "Known Search Bots": "Known Web Search Bots"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Pokaż, kto powtórzył ten post",
     "Repeated by": "Powtórzone przez",
     "Register": "Zarejestrować",
-    "Web Crawlers Allowed": "Dozwolone roboty sieciowe"
+    "Web Bots Allowed": "Dozwolone boty internetowe",
+    "Known Search Bots": "Znane boty wyszukiwania w sieci"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Mostrar quem repetiu esta postagem",
     "Repeated by": "Repetido por",
     "Register": "Registro",
-    "Web Crawlers Allowed": "Rastreadores da Web permitidos"
+    "Web Bots Allowed": "Webbots permitidos",
+    "Known Search Bots": "Bots de pesquisa na Web conhecidos"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Показать, кто повторил этот пост",
     "Repeated by": "Повторено",
     "Register": "регистр",
-    "Web Crawlers Allowed": "Веб-сканеры разрешены"
+    "Web Bots Allowed": "Веб-боты разрешены",
+    "Known Search Bots": "Известные боты веб-поиска"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili",
     "Repeated by": "Imerudiwa na",
     "Register": "Sajili",
-    "Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa"
+    "Web Bots Allowed": "Mtandao wa Boti Unaruhusiwa",
+    "Known Search Bots": "Vijibu vya Utafutaji wa Wavuti vinavyojulikana"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Покажіть, хто повторив цей пост",
     "Repeated by": "Повторюється за",
     "Register": "Реєстрація",
-    "Web Crawlers Allowed": "Веб-сканери дозволені"
+    "Web Bots Allowed": "Веб-боти дозволені",
+    "Known Search Bots": "Відомі пошукові роботи в Інтернеті"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "显示谁重复了这篇文章",
     "Repeated by": "重复",
     "Register": "登记",
-    "Web Crawlers Allowed": "允许网络爬虫"
+    "Web Bots Allowed": "允许网络机器人",
+    "Known Search Bots": "已知的网络搜索机器人"
 }
@@ -1808,13 +1808,17 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                        'userAgentsBlockedStr', user_agents_blocked_str,
                        200, '', False)

+    edit_profile_form += \
+        '<a href="/users/' + nickname + '/bots.txt">' + \
+        translate['Known Search Bots'] + '</a><br>\n'
+
     crawlers_allowed_str = ''
     for uagent in crawlers_allowed:
         if crawlers_allowed_str:
            crawlers_allowed_str += '\n'
         crawlers_allowed_str += uagent
     edit_profile_form += \
-        edit_text_area(translate['Web Crawlers Allowed'],
+        edit_text_area(translate['Web Bots Allowed'],
                        'crawlersAllowedStr', crawlers_allowed_str,
                        200, '', False)
