mirror of https://gitlab.com/bashrc2/epicyon
Merge branch 'main' of gitlab.com:bashrc2/epicyon
commit aff09b378e

crawlers.py | 56 changed lines
@@ -7,6 +7,7 @@ __email__ = "bob@libreserver.org"
 __status__ = "Production"
 __module_group__ = "Core"

+import os
 import time
 from utils import save_json
 from utils import user_agent_domain
@@ -51,6 +52,51 @@ def update_known_crawlers(ua_str: str,
     return curr_time


+def load_known_web_bots(base_dir: str) -> []:
+    """Returns a list of known web bots
+    """
+    known_bots_filename = base_dir + '/accounts/knownBots.txt'
+    if not os.path.isfile(known_bots_filename):
+        return []
+    crawlers_str = None
+    try:
+        with open(known_bots_filename, 'r') as fp_crawlers:
+            crawlers_str = fp_crawlers.read()
+    except OSError:
+        print('EX: unable to load web bots from ' +
+              known_bots_filename)
+    if not crawlers_str:
+        return []
+    known_bots = []
+    crawlers_list = crawlers_str.split('\n')
+    for crawler in crawlers_list:
+        if not crawler:
+            continue
+        crawler = crawler.replace('\n', '').strip()
+        if not crawler:
+            continue
+        if crawler not in known_bots:
+            known_bots.append(crawler)
+    return known_bots
+
+
+def _save_known_web_bots(base_dir: str, known_bots: []) -> bool:
+    """Saves a list of known web bots
+    """
+    known_bots_filename = base_dir + '/accounts/knownBots.txt'
+    known_bots_str = ''
+    for crawler in known_bots:
+        known_bots_str += crawler.strip() + '\n'
+    try:
+        with open(known_bots_filename, 'w+') as fp_crawlers:
+            fp_crawlers.write(known_bots_str)
+    except OSError:
+        print("EX: unable to save known web bots to " +
+              known_bots_filename)
+        return False
+    return True
+
+
 def blocked_user_agent(calling_domain: str, agent_str: str,
                        news_instance: bool, debug: bool,
                        user_agents_blocked: [],
@@ -58,7 +104,8 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
                        base_dir: str,
                        blocked_cache: [],
                        blocked_cache_update_secs: int,
-                       crawlers_allowed: []):
+                       crawlers_allowed: [],
+                       known_bots: []):
     """Should a GET or POST be blocked based upon its user agent?
     """
     if not agent_str:
@@ -73,8 +120,13 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
     agent_domain = None

     if agent_str:
-        # is this a web crawler? If so the block it
+        # is this a web crawler? If so then block it by default
+        # unless this is a news instance or if it is in the allowed list
         if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
+            if agent_str_lower not in known_bots:
+                known_bots.append(agent_str_lower)
+                known_bots.sort()
+                _save_known_web_bots(base_dir, known_bots)
             # if this is a news instance then we want it
             # to be indexed by search engines
             if news_instance:
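Taken together, the crawlers.py changes add a small persistence layer: any user agent containing 'bot/' or 'bot-' is lowercased, added to the in-memory known_bots list, and written back to accounts/knownBots.txt with one agent string per line. Below is a minimal self-contained sketch of that behaviour; it re-implements the logic rather than importing the module, and the function names and the /tmp directory are illustrative only.

import os


def looks_like_web_bot(agent_str: str) -> bool:
    """Same substring heuristic used in blocked_user_agent"""
    agent_str_lower = agent_str.lower()
    return 'bot/' in agent_str_lower or 'bot-' in agent_str_lower


def record_web_bot(base_dir: str, known_bots: list, agent_str: str) -> None:
    """Mirrors the append / sort / save sequence added to blocked_user_agent"""
    agent_str_lower = agent_str.lower()
    if agent_str_lower in known_bots:
        return
    known_bots.append(agent_str_lower)
    known_bots.sort()
    # knownBots.txt holds one user agent string per line
    with open(base_dir + '/accounts/knownBots.txt', 'w+') as fp_bots:
        fp_bots.write('\n'.join(known_bots) + '\n')


# usage with a throwaway directory standing in for an Epicyon base_dir
base_dir = '/tmp/epicyon-bots-sketch'
os.makedirs(base_dir + '/accounts', exist_ok=True)
known_bots = []
for user_agent in ('Mozilla/5.0 (X11; Linux x86_64)',
                   'ExampleBot/2.1 (+https://example.com/bot)'):
    if looks_like_web_bot(user_agent):
        record_web_bot(base_dir, known_bots, user_agent)
print(known_bots)  # ['examplebot/2.1 (+https://example.com/bot)']

The file is only rewritten when a previously unseen agent appears, so knownBots.txt grows slowly and doubles as a log of which search bots have visited the instance.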
daemon.py | 28 changed lines
@@ -380,6 +380,7 @@ from siteactive import referer_is_active
 from webapp_likers import html_likers_of_post
 from crawlers import update_known_crawlers
 from crawlers import blocked_user_agent
+from crawlers import load_known_web_bots
 import os

@@ -14008,7 +14009,8 @@ class PubServer(BaseHTTPRequestHandler):
                                    self.server.base_dir,
                                    self.server.blocked_cache,
                                    self.server.blocked_cache_update_secs,
-                                   self.server.crawlers_allowed)
+                                   self.server.crawlers_allowed,
+                                   self.server.known_bots)
         if block:
             self._400()
             return
@@ -14185,6 +14187,24 @@ class PubServer(BaseHTTPRequestHandler):
                             '_GET', 'isAuthorized',
                             self.server.debug)

+        if authorized and self.path.endswith('/bots.txt'):
+            known_bots_str = ''
+            for bot_name in self.server.known_bots:
+                known_bots_str += bot_name + '\n'
+            # TODO
+            msg = known_bots_str.encode('utf-8')
+            msglen = len(msg)
+            self._set_headers('text/plain; charset=utf-8',
+                              msglen, None, calling_domain, True)
+            self._write(msg)
+            if self.server.debug:
+                print('Sent known bots: ' +
+                      self.server.path + ' ' + calling_domain)
+            fitness_performance(getreq_start_time, self.server.fitness,
+                                '_GET', 'get_known_bots',
+                                self.server.debug)
+            return
+
         # shared items catalog for this instance
         # this is only accessible to instance members or to
         # other instances which present an authorization token
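With this handler in place, an authorized GET to /users/<nickname>/bots.txt returns the accumulated bot list as plain text, one user agent per line. The following is a rough client-side illustration only: the instance URL, nickname, password and the use of HTTP Basic auth are assumptions made for the sketch, not something this diff establishes.

import base64
import urllib.request

# placeholder values for illustration
instance = 'https://social.example'
nickname = 'admin'
password = 'examplepassword'

url = instance + '/users/' + nickname + '/bots.txt'
credentials = base64.b64encode(
    (nickname + ':' + password).encode('utf-8')).decode('ascii')
request = urllib.request.Request(
    url, headers={'Authorization': 'Basic ' + credentials})

# expect a text/plain body listing one known bot user agent per line
with urllib.request.urlopen(request) as response:
    for line in response.read().decode('utf-8').splitlines():
        if line:
            print(line)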
@@ -18550,7 +18570,8 @@ class PubServer(BaseHTTPRequestHandler):
                                    self.server.base_dir,
                                    self.server.blocked_cache,
                                    self.server.blocked_cache_update_secs,
-                                   self.server.crawlers_allowed)
+                                   self.server.crawlers_allowed,
+                                   self.server.known_bots)
         if block:
             self._400()
             self.server.postreq_busy = False
@@ -19666,6 +19687,9 @@ def run_daemon(crawlers_allowed: [],
     # list of crawler bots permitted within the User-Agent header
     httpd.crawlers_allowed = crawlers_allowed

+    # list of web crawlers known to the system
+    httpd.known_bots = load_known_web_bots(base_dir)
+
     httpd.unit_test = unit_test
     httpd.allow_local_network_access = allow_local_network_access
     if unit_test:
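Design-wise, run_daemon loads the bot list once at startup and hangs it off the server object, so both the GET and POST paths pass the same httpd.known_bots list into blocked_user_agent; newly seen bots then accumulate in memory and in accounts/knownBots.txt without the file ever being re-read. A compressed sketch of that wiring follows, assuming it runs inside the Epicyon source tree so that the crawlers module is importable; FakeServer and the paths here are stand-ins, not Epicyon's actual HTTP server.

# assumes the Epicyon source tree is on the path so crawlers is importable
from crawlers import load_known_web_bots


class FakeServer:
    """Stand-in for the attributes run_daemon sets on httpd"""
    def __init__(self, base_dir: str, crawlers_allowed: list):
        self.base_dir = base_dir
        # crawler bots permitted within the User-Agent header
        self.crawlers_allowed = crawlers_allowed
        # web crawlers known to the system, loaded once at startup
        self.known_bots = load_known_web_bots(base_dir)


httpd = FakeServer('/var/lib/epicyon', ['duckduckbot'])
print(str(len(httpd.known_bots)) + ' known bots at startup')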
translation files (one hunk per language):

@@ -516,5 +516,6 @@
     "Show who repeated this post": "أظهر من كرر هذا المنصب",
     "Repeated by": "يتكرر بواسطة",
     "Register": "يسجل",
-    "Web Crawlers Allowed": "برامج زحف الويب المسموح بها"
+    "Web Bots Allowed": "مسموح روبوتات الويب",
+    "Known Search Bots": "روبوتات بحث الويب المعروفة"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Mostra qui ha repetit aquesta publicació",
     "Repeated by": "Repetit per",
     "Register": "Registra't",
-    "Web Crawlers Allowed": "Es permeten rastrejadors web"
+    "Web Bots Allowed": "Bots web permesos",
+    "Known Search Bots": "Bots de cerca web coneguts"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn",
     "Repeated by": "Ailadrodd gan",
     "Register": "Cofrestrwch",
-    "Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe"
+    "Web Bots Allowed": "Web Bots a Ganiateir",
+    "Known Search Bots": "Bots Chwilio Gwe Hysbys"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat",
     "Repeated by": "Wiederholt von",
     "Register": "Registrieren",
-    "Web Crawlers Allowed": "Webcrawler erlaubt"
+    "Web Bots Allowed": "Webbots erlaubt",
+    "Known Search Bots": "Bekannte Bots für die Websuche"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Show who repeated this post",
     "Repeated by": "Repeated by",
     "Register": "Register",
-    "Web Crawlers Allowed": "Web Crawlers Allowed"
+    "Web Bots Allowed": "Web Search Bots Allowed",
+    "Known Search Bots": "Known Web Search Bots"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Mostrar quién repitió esta publicación",
     "Repeated by": "Repetido por",
     "Register": "Registrarse",
-    "Web Crawlers Allowed": "Rastreadores web permitidos"
+    "Web Bots Allowed": "Bots web permitidos",
+    "Known Search Bots": "Bots de búsqueda web conocidos"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Montrer qui a répété ce post",
     "Repeated by": "Répété par",
     "Register": "S'inscrire",
-    "Web Crawlers Allowed": "Robots d'exploration Web autorisés"
+    "Web Bots Allowed": "Robots Web autorisés",
+    "Known Search Bots": "Robots de recherche Web connus"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís",
     "Repeated by": "Arís agus arís eile ag",
     "Register": "Clár",
-    "Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe"
+    "Web Bots Allowed": "Róbónna Gréasáin Ceadaithe",
+    "Known Search Bots": "Róbónna Cuardach Gréasáin Aitheanta"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया",
     "Repeated by": "द्वारा दोहराया गया",
     "Register": "रजिस्टर करें",
-    "Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है"
+    "Web Bots Allowed": "वेब बॉट्स की अनुमति है",
+    "Known Search Bots": "ज्ञात वेब खोज बॉट्स"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Mostra chi ha ripetuto questo post",
     "Repeated by": "Ripetuto da",
     "Register": "Registrati",
-    "Web Crawlers Allowed": "Web crawler consentiti"
+    "Web Bots Allowed": "Web bot consentiti",
+    "Known Search Bots": "Bot di ricerca Web noti"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "この投稿を繰り返した人を表示する",
     "Repeated by": "によって繰り返される",
     "Register": "登録",
-    "Web Crawlers Allowed": "許可されるWebクローラー"
+    "Web Bots Allowed": "許可されたWebボット",
+    "Known Search Bots": "既知のWeb検索ボット"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "이 포스트를 반복한 사람 표시",
     "Repeated by": "반복한 사람",
     "Register": "등록",
-    "Web Crawlers Allowed": "웹 크롤러 허용"
+    "Web Bots Allowed": "웹 봇 허용",
+    "Known Search Bots": "알려진 웹 검색 봇"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Nîşan bide kê ev post dubare kiriye",
     "Repeated by": "Ji hêla dubare kirin",
     "Register": "Fêhrist",
-    "Web Crawlers Allowed": "Crawlers Web Destûrdar in"
+    "Web Bots Allowed": "Web Bots Destûrdar in",
+    "Known Search Bots": "Botên Lêgerîna Webê yên naskirî"
 }

@@ -512,5 +512,6 @@
     "Show who repeated this post": "Show who repeated this post",
     "Repeated by": "Repeated by",
     "Register": "Register",
-    "Web Crawlers Allowed": "Web Crawlers Allowed"
+    "Web Bots Allowed": "Web Search Bots Allowed",
+    "Known Search Bots": "Known Web Search Bots"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Pokaż, kto powtórzył ten post",
     "Repeated by": "Powtórzone przez",
     "Register": "Zarejestrować",
-    "Web Crawlers Allowed": "Dozwolone roboty sieciowe"
+    "Web Bots Allowed": "Dozwolone boty internetowe",
+    "Known Search Bots": "Znane boty wyszukiwania w sieci"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Mostrar quem repetiu esta postagem",
     "Repeated by": "Repetido por",
     "Register": "Registro",
-    "Web Crawlers Allowed": "Rastreadores da Web permitidos"
+    "Web Bots Allowed": "Webbots permitidos",
+    "Known Search Bots": "Bots de pesquisa na Web conhecidos"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Показать, кто повторил этот пост",
     "Repeated by": "Повторено",
     "Register": "регистр",
-    "Web Crawlers Allowed": "Веб-сканеры разрешены"
+    "Web Bots Allowed": "Веб-боты разрешены",
+    "Known Search Bots": "Известные боты веб-поиска"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili",
     "Repeated by": "Imerudiwa na",
     "Register": "Sajili",
-    "Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa"
+    "Web Bots Allowed": "Mtandao wa Boti Unaruhusiwa",
+    "Known Search Bots": "Vijibu vya Utafutaji wa Wavuti vinavyojulikana"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "Покажіть, хто повторив цей пост",
     "Repeated by": "Повторюється за",
     "Register": "Реєстрація",
-    "Web Crawlers Allowed": "Веб-сканери дозволені"
+    "Web Bots Allowed": "Веб-боти дозволені",
+    "Known Search Bots": "Відомі пошукові роботи в Інтернеті"
 }

@@ -516,5 +516,6 @@
     "Show who repeated this post": "显示谁重复了这篇文章",
     "Repeated by": "重复",
     "Register": "登记",
-    "Web Crawlers Allowed": "允许网络爬虫"
+    "Web Bots Allowed": "允许网络机器人",
+    "Known Search Bots": "已知的网络搜索机器人"
 }
@@ -1808,13 +1808,17 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                        'userAgentsBlockedStr', user_agents_blocked_str,
                        200, '', False)

+    edit_profile_form += \
+        '<a href="/users/' + nickname + '/bots.txt">' + \
+        translate['Known Search Bots'] + '</a><br>\n'
+
     crawlers_allowed_str = ''
     for uagent in crawlers_allowed:
         if crawlers_allowed_str:
            crawlers_allowed_str += '\n'
         crawlers_allowed_str += uagent
     edit_profile_form += \
-        edit_text_area(translate['Web Crawlers Allowed'],
+        edit_text_area(translate['Web Bots Allowed'],
                        'crawlersAllowedStr', crawlers_allowed_str,
                        200, '', False)
