Merge branch 'main' of gitlab.com:bashrc2/epicyon

merge-requests/30/head
Bob Mottram 2022-03-06 15:45:55 +00:00
commit aff09b378e
23 changed files with 125 additions and 25 deletions


@ -7,6 +7,7 @@ __email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
import os
import time
from utils import save_json
from utils import user_agent_domain
@ -51,6 +52,51 @@ def update_known_crawlers(ua_str: str,
    return curr_time


def load_known_web_bots(base_dir: str) -> []:
    """Returns a list of known web bots
    """
    known_bots_filename = base_dir + '/accounts/knownBots.txt'
    if not os.path.isfile(known_bots_filename):
        return []
    crawlers_str = None
    try:
        with open(known_bots_filename, 'r') as fp_crawlers:
            crawlers_str = fp_crawlers.read()
    except OSError:
        print('EX: unable to load web bots from ' +
              known_bots_filename)
    if not crawlers_str:
        return []
    known_bots = []
    crawlers_list = crawlers_str.split('\n')
    for crawler in crawlers_list:
        if not crawler:
            continue
        crawler = crawler.replace('\n', '').strip()
        if not crawler:
            continue
        if crawler not in known_bots:
            known_bots.append(crawler)
    return known_bots


def _save_known_web_bots(base_dir: str, known_bots: []) -> bool:
    """Saves a list of known web bots
    """
    known_bots_filename = base_dir + '/accounts/knownBots.txt'
    known_bots_str = ''
    for crawler in known_bots:
        known_bots_str += crawler.strip() + '\n'
    try:
        with open(known_bots_filename, 'w+') as fp_crawlers:
            fp_crawlers.write(known_bots_str)
    except OSError:
        print("EX: unable to save known web bots to " +
              known_bots_filename)
        return False
    return True
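
A minimal usage sketch of the two helpers above (not part of the commit): it assumes it is run from an Epicyon checkout so that crawlers.py and its own utils imports resolve, and 'examplebot/1.0' is a made-up user agent used purely as an example.

import os
from crawlers import load_known_web_bots, _save_known_web_bots

# sketch only: round-trip the accounts/knownBots.txt store
base_dir = os.getcwd()
os.makedirs(base_dir + '/accounts', exist_ok=True)

bots = load_known_web_bots(base_dir)      # [] while knownBots.txt is absent
if 'examplebot/1.0' not in bots:          # made-up user agent
    bots.append('examplebot/1.0')
    bots.sort()
    _save_known_web_bots(base_dir, bots)
print(load_known_web_bots(base_dir))      # ['examplebot/1.0']
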
def blocked_user_agent(calling_domain: str, agent_str: str,
                       news_instance: bool, debug: bool,
                       user_agents_blocked: [],
@ -58,7 +104,8 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
                       base_dir: str,
                       blocked_cache: [],
                       blocked_cache_update_secs: int,
                       crawlers_allowed: []):
                       crawlers_allowed: [],
                       known_bots: []):
    """Should a GET or POST be blocked based upon its user agent?
    """
    if not agent_str:
@ -73,8 +120,13 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
    agent_domain = None
    if agent_str:
        # is this a web crawler? If so the block it
        # is this a web crawler? If so then block it by default
        # unless this is a news instance or if it is in the allowed list
        if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
            if agent_str_lower not in known_bots:
                known_bots.append(agent_str_lower)
                known_bots.sort()
                _save_known_web_bots(base_dir, known_bots)
            # if this is a news instance then we want it
            # to be indexed by search engines
            if news_instance:
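
The block decision above rests on a plain substring test against the lowercased User-Agent header. A standalone sketch of that rule, illustrative only and not the project's code:

def looks_like_web_bot(agent_str: str) -> bool:
    """True when a User-Agent string matches the 'bot/' or 'bot-'
    markers used by blocked_user_agent above (illustrative re-statement)"""
    if not agent_str:
        return False
    agent_str_lower = agent_str.lower()
    return 'bot/' in agent_str_lower or 'bot-' in agent_str_lower


assert looks_like_web_bot('Mozilla/5.0 (compatible; Googlebot/2.1)')
assert not looks_like_web_bot('Mozilla/5.0 (X11; Linux x86_64; rv:97.0)')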


@ -380,6 +380,7 @@ from siteactive import referer_is_active
from webapp_likers import html_likers_of_post
from crawlers import update_known_crawlers
from crawlers import blocked_user_agent
from crawlers import load_known_web_bots
import os
@ -14008,7 +14009,8 @@ class PubServer(BaseHTTPRequestHandler):
                               self.server.base_dir,
                               self.server.blocked_cache,
                               self.server.blocked_cache_update_secs,
                               self.server.crawlers_allowed,
                               self.server.known_bots)
        if block:
            self._400()
            return
@ -14185,6 +14187,24 @@ class PubServer(BaseHTTPRequestHandler):
                            '_GET', 'isAuthorized',
                            self.server.debug)
        if authorized and self.path.endswith('/bots.txt'):
            known_bots_str = ''
            for bot_name in self.server.known_bots:
                known_bots_str += bot_name + '\n'
            # TODO
            msg = known_bots_str.encode('utf-8')
            msglen = len(msg)
            self._set_headers('text/plain; charset=utf-8',
                              msglen, None, calling_domain, True)
            self._write(msg)
            if self.server.debug:
                print('Sent known bots: ' +
                      self.server.path + ' ' + calling_domain)
            fitness_performance(getreq_start_time, self.server.fitness,
                                '_GET', 'get_known_bots',
                                self.server.debug)
            return

        # shared items catalog for this instance
        # this is only accessible to instance members or to
        # other instances which present an authorization token
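
The new endpoint returns the collected bot names as newline-separated text/plain, and only for an authorized GET. A hypothetical client sketch follows; the hostname, nickname and password are invented, and whether HTTP Basic auth is accepted depends on the instance, so treat this as an illustration rather than the supported API.

import base64
import urllib.request

# hypothetical instance, user and password
url = 'https://example.net/users/alice/bots.txt'
token = base64.b64encode(b'alice:secret').decode('ascii')
req = urllib.request.Request(url, headers={'Authorization': 'Basic ' + token})
with urllib.request.urlopen(req) as resp:
    bots_txt = resp.read().decode('utf-8')
print([bot for bot in bots_txt.split('\n') if bot])
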
@ -18550,7 +18570,8 @@ class PubServer(BaseHTTPRequestHandler):
                               self.server.base_dir,
                               self.server.blocked_cache,
                               self.server.blocked_cache_update_secs,
                               self.server.crawlers_allowed,
                               self.server.known_bots)
        if block:
            self._400()
            self.server.postreq_busy = False
@ -19666,6 +19687,9 @@ def run_daemon(crawlers_allowed: [],
    # list of crawler bots permitted within the User-Agent header
    httpd.crawlers_allowed = crawlers_allowed

    # list of web crawlers known to the system
    httpd.known_bots = load_known_web_bots(base_dir)

    httpd.unit_test = unit_test
    httpd.allow_local_network_access = allow_local_network_access
    if unit_test:
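
run_daemon loads the bot list once at startup and hangs it off the server object, so every request handler can reach it as self.server.known_bots. A minimal illustration of that standard http.server pattern (the names here are illustrative, not Epicyon's):

from http.server import BaseHTTPRequestHandler, HTTPServer


class BotsHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # handlers see attributes attached to the server via self.server
        body = '\n'.join(self.server.known_bots).encode('utf-8')
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)


httpd = HTTPServer(('127.0.0.1', 8080), BotsHandler)
httpd.known_bots = ['examplebot/1.0']   # shared state, as in run_daemon above
# httpd.serve_forever()  # left commented so the sketch does not block
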


@ -516,5 +516,6 @@
"Show who repeated this post": "أظهر من كرر هذا المنصب",
"Repeated by": "يتكرر بواسطة",
"Register": "يسجل",
"Web Crawlers Allowed": "برامج زحف الويب المسموح بها"
"Web Bots Allowed": "مسموح روبوتات الويب",
"Known Search Bots": "روبوتات بحث الويب المعروفة"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Mostra qui ha repetit aquesta publicació",
"Repeated by": "Repetit per",
"Register": "Registra't",
"Web Crawlers Allowed": "Es permeten rastrejadors web"
"Web Bots Allowed": "Bots web permesos",
"Known Search Bots": "Bots de cerca web coneguts"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn",
"Repeated by": "Ailadrodd gan",
"Register": "Cofrestrwch",
"Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe"
"Web Bots Allowed": "Web Bots a Ganiateir",
"Known Search Bots": "Bots Chwilio Gwe Hysbys"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat",
"Repeated by": "Wiederholt von",
"Register": "Registrieren",
"Web Crawlers Allowed": "Webcrawler erlaubt"
"Web Bots Allowed": "Webbots erlaubt",
"Known Search Bots": "Bekannte Bots für die Websuche"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Show who repeated this post",
"Repeated by": "Repeated by",
"Register": "Register",
"Web Crawlers Allowed": "Web Crawlers Allowed"
"Web Bots Allowed": "Web Search Bots Allowed",
"Known Search Bots": "Known Web Search Bots"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Mostrar quién repitió esta publicación",
"Repeated by": "Repetido por",
"Register": "Registrarse",
"Web Crawlers Allowed": "Rastreadores web permitidos"
"Web Bots Allowed": "Bots web permitidos",
"Known Search Bots": "Bots de búsqueda web conocidos"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Montrer qui a répété ce post",
"Repeated by": "Répété par",
"Register": "S'inscrire",
"Web Crawlers Allowed": "Robots d'exploration Web autorisés"
"Web Bots Allowed": "Robots Web autorisés",
"Known Search Bots": "Robots de recherche Web connus"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís",
"Repeated by": "Arís agus arís eile ag",
"Register": "Clár",
"Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe"
"Web Bots Allowed": "Róbónna Gréasáin Ceadaithe",
"Known Search Bots": "Róbónna Cuardach Gréasáin Aitheanta"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया",
"Repeated by": "द्वारा दोहराया गया",
"Register": "रजिस्टर करें",
"Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है"
"Web Bots Allowed": "वेब बॉट्स की अनुमति है",
"Known Search Bots": "ज्ञात वेब खोज बॉट्स"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Mostra chi ha ripetuto questo post",
"Repeated by": "Ripetuto da",
"Register": "Registrati",
"Web Crawlers Allowed": "Web crawler consentiti"
"Web Bots Allowed": "Web bot consentiti",
"Known Search Bots": "Bot di ricerca Web noti"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "この投稿を繰り返した人を表示する",
"Repeated by": "によって繰り返される",
"Register": "登録",
"Web Crawlers Allowed": "許可されるWebクローラー"
"Web Bots Allowed": "許可されたWebボット",
"Known Search Bots": "既知のWeb検索ボット"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "이 포스트를 반복한 사람 표시",
"Repeated by": "반복한 사람",
"Register": "등록",
"Web Crawlers Allowed": "웹 크롤러 허용"
"Web Bots Allowed": "웹 봇 허용",
"Known Search Bots": "알려진 웹 검색 봇"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Nîşan bide kê ev post dubare kiriye",
"Repeated by": "Ji hêla dubare kirin",
"Register": "Fêhrist",
"Web Crawlers Allowed": "Crawlers Web Destûrdar in"
"Web Bots Allowed": "Web Bots Destûrdar in",
"Known Search Bots": "Botên Lêgerîna Webê yên naskirî"
}


@ -512,5 +512,6 @@
"Show who repeated this post": "Show who repeated this post",
"Repeated by": "Repeated by",
"Register": "Register",
"Web Crawlers Allowed": "Web Crawlers Allowed"
"Web Bots Allowed": "Web Search Bots Allowed",
"Known Search Bots": "Known Web Search Bots"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Pokaż, kto powtórzył ten post",
"Repeated by": "Powtórzone przez",
"Register": "Zarejestrować",
"Web Crawlers Allowed": "Dozwolone roboty sieciowe"
"Web Bots Allowed": "Dozwolone boty internetowe",
"Known Search Bots": "Znane boty wyszukiwania w sieci"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Mostrar quem repetiu esta postagem",
"Repeated by": "Repetido por",
"Register": "Registro",
"Web Crawlers Allowed": "Rastreadores da Web permitidos"
"Web Bots Allowed": "Webbots permitidos",
"Known Search Bots": "Bots de pesquisa na Web conhecidos"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Показать, кто повторил этот пост",
"Repeated by": "Повторено",
"Register": "регистр",
"Web Crawlers Allowed": "Веб-сканеры разрешены"
"Web Bots Allowed": "Веб-боты разрешены",
"Known Search Bots": "Известные боты веб-поиска"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili",
"Repeated by": "Imerudiwa na",
"Register": "Sajili",
"Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa"
"Web Bots Allowed": "Mtandao wa Boti Unaruhusiwa",
"Known Search Bots": "Vijibu vya Utafutaji wa Wavuti vinavyojulikana"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "Покажіть, хто повторив цей пост",
"Repeated by": "Повторюється за",
"Register": "Реєстрація",
"Web Crawlers Allowed": "Веб-сканери дозволені"
"Web Bots Allowed": "Веб-боти дозволені",
"Known Search Bots": "Відомі пошукові роботи в Інтернеті"
}


@ -516,5 +516,6 @@
"Show who repeated this post": "显示谁重复了这篇文章",
"Repeated by": "重复",
"Register": "登记",
"Web Crawlers Allowed": "允许网络爬虫"
"Web Bots Allowed": "允许网络机器人",
"Known Search Bots": "已知的网络搜索机器人"
}


@ -1808,13 +1808,17 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                       'userAgentsBlockedStr', user_agents_blocked_str,
                       200, '', False)
    edit_profile_form += \
        '<a href="/users/' + nickname + '/bots.txt">' + \
        translate['Known Search Bots'] + '</a><br>\n'

    crawlers_allowed_str = ''
    for uagent in crawlers_allowed:
        if crawlers_allowed_str:
            crawlers_allowed_str += '\n'
        crawlers_allowed_str += uagent
    edit_profile_form += \
        edit_text_area(translate['Web Crawlers Allowed'],
        edit_text_area(translate['Web Bots Allowed'],
                       'crawlersAllowedStr', crawlers_allowed_str,
                       200, '', False)
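
For illustration only, this is roughly what the new anchor produces for a hypothetical nickname 'alice', using the English string added to en.json above:

nickname = 'alice'
translate = {'Known Search Bots': 'Known Web Search Bots'}
link = '<a href="/users/' + nickname + '/bots.txt">' + \
    translate['Known Search Bots'] + '</a><br>\n'
print(link)  # <a href="/users/alice/bots.txt">Known Web Search Bots</a><br>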