Add crawlers module

main
Bob Mottram 2022-03-06 12:56:26 +00:00
parent 35883119be
commit f4fc143b3a
23 changed files with 199 additions and 23 deletions

crawlers.py 100644 (+120 lines)

@@ -0,0 +1,120 @@
__filename__ = "crawlers.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.3.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"

import time
from utils import save_json
from utils import user_agent_domain
from blocking import update_blocked_cache
from blocking import is_blocked_domain

default_user_agent_blocks = [
    'fedilist'
]


def update_known_crawlers(ua_str: str,
                          base_dir: str, known_crawlers: {},
                          last_known_crawler: int):
    """Updates a dictionary of known crawlers accessing nodeinfo
    or the masto API
    """
    if not ua_str:
        return None

    curr_time = int(time.time())
    if known_crawlers.get(ua_str):
        known_crawlers[ua_str]['hits'] += 1
        known_crawlers[ua_str]['lastseen'] = curr_time
    else:
        known_crawlers[ua_str] = {
            "lastseen": curr_time,
            "hits": 1
        }

    if curr_time - last_known_crawler >= 30:
        # remove any old observations
        remove_crawlers = []
        for uagent, item in known_crawlers.items():
            if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30:
                remove_crawlers.append(uagent)
        for uagent in remove_crawlers:
            del known_crawlers[uagent]
        # save the list of crawlers
        save_json(known_crawlers,
                  base_dir + '/accounts/knownCrawlers.json')
    return curr_time


def blocked_user_agent(calling_domain: str, agent_str: str,
                       news_instance: bool, debug: bool,
                       user_agents_blocked: [],
                       blocked_cache_last_updated,
                       base_dir: str,
                       blocked_cache: [],
                       blocked_cache_update_secs: int,
                       crawlers_allowed: []):
    """Should a GET or POST be blocked based upon its user agent?
    """
    if not agent_str:
        return False, blocked_cache_last_updated

    agent_str_lower = agent_str.lower()
    for ua_block in default_user_agent_blocks:
        if ua_block in agent_str_lower:
            print('Blocked User agent: ' + ua_block)
            return True, blocked_cache_last_updated

    agent_domain = None

    if agent_str:
        # is this a web crawler? If so then block it
        if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
            # if this is a news instance then we want it
            # to be indexed by search engines
            if news_instance:
                return False, blocked_cache_last_updated
            # is this crawler allowed?
            for crawler in crawlers_allowed:
                if crawler.lower() in agent_str_lower:
                    return False, blocked_cache_last_updated
            print('Blocked Crawler: ' + agent_str)
            return True, blocked_cache_last_updated
        # get the domain name from the User-Agent
        agent_domain = user_agent_domain(agent_str, debug)
    else:
        # no User-Agent header is present
        return True, blocked_cache_last_updated

    # is the User-Agent type blocked? e.g. "Mastodon"
    if user_agents_blocked:
        blocked_ua = False
        for agent_name in user_agents_blocked:
            if agent_name in agent_str:
                blocked_ua = True
                break
        if blocked_ua:
            return True, blocked_cache_last_updated

    if not agent_domain:
        return False, blocked_cache_last_updated

    # is the User-Agent domain blocked?
    blocked_ua = False
    if not agent_domain.startswith(calling_domain):
        blocked_cache_last_updated = \
            update_blocked_cache(base_dir, blocked_cache,
                                 blocked_cache_last_updated,
                                 blocked_cache_update_secs)
        blocked_ua = \
            is_blocked_domain(base_dir, agent_domain, blocked_cache)
        # if self.server.debug:
        if blocked_ua:
            print('Blocked User agent: ' + agent_domain)
    return blocked_ua, blocked_cache_last_updated

daemon.py

@@ -6689,6 +6689,29 @@ class PubServer(BaseHTTPRequestHandler):
                     set_config_param(base_dir, 'userAgentsBlocked',
                                      user_agents_blocked_str)
 
+                # save allowed web crawlers
+                crawlers_allowed = []
+                if fields.get('crawlersAllowedStr'):
+                    crawlers_allowed_str = \
+                        fields['crawlersAllowedStr']
+                    crawlers_allowed_list = \
+                        crawlers_allowed_str.split('\n')
+                    for uagent in crawlers_allowed_list:
+                        if uagent in crawlers_allowed:
+                            continue
+                        crawlers_allowed.append(uagent.strip())
+                if str(self.server.crawlers_allowed) != \
+                        str(crawlers_allowed):
+                    self.server.crawlers_allowed = \
+                        crawlers_allowed
+                    crawlers_allowed_str = ''
+                    for uagent in crawlers_allowed:
+                        if crawlers_allowed_str:
+                            crawlers_allowed_str += ','
+                        crawlers_allowed_str += uagent
+                    set_config_param(base_dir, 'crawlersAllowed',
+                                     crawlers_allowed_str)
+
                 # save peertube instances list
                 peertube_instances_file = \
                     base_dir + '/accounts/peertube.txt'
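
Note the asymmetry in the crawlersAllowedStr handling above: the form posts one crawler name per line, while the stored config value is comma separated. A small illustrative round-trip with hypothetical values (unlike the handler above, this trims each line before checking for duplicates):

# illustrative sketch, not part of the commit
form_value = 'googlebot\n bingbot \ngooglebot'

crawlers_allowed = []
for uagent in form_value.split('\n'):
    uagent = uagent.strip()
    if uagent and uagent not in crawlers_allowed:
        crawlers_allowed.append(uagent)

# the value that set_config_param stores under 'crawlersAllowed'
assert ','.join(crawlers_allowed) == 'googlebot,bingbot'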
@@ -13733,6 +13756,7 @@ class PubServer(BaseHTTPRequestHandler):
                                     self.server.text_mode_banner,
                                     city,
                                     self.server.user_agents_blocked,
+                                    self.server.crawlers_allowed,
                                     access_keys,
                                     default_reply_interval_hrs,
                                     self.server.cw_lists,

translations/ar.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "أظهر من أحب هذا المنشور",
     "Show who repeated this post": "أظهر من كرر هذا المنصب",
     "Repeated by": "يتكرر بواسطة",
-    "Register": "يسجل"
+    "Register": "يسجل",
+    "Web Crawlers Allowed": "برامج زحف الويب المسموح بها"
 }

translations/ca.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Mostra a qui li agrada aquesta publicació",
     "Show who repeated this post": "Mostra qui ha repetit aquesta publicació",
     "Repeated by": "Repetit per",
-    "Register": "Registra't"
+    "Register": "Registra't",
+    "Web Crawlers Allowed": "Es permeten rastrejadors web"
 }

translations/cy.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Dangoswch pwy oedd yn hoffi'r post hwn",
     "Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn",
     "Repeated by": "Ailadrodd gan",
-    "Register": "Cofrestrwch"
+    "Register": "Cofrestrwch",
+    "Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe"
 }

translations/de.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Zeigen, wem dieser Beitrag gefallen hat",
     "Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat",
     "Repeated by": "Wiederholt von",
-    "Register": "Registrieren"
+    "Register": "Registrieren",
+    "Web Crawlers Allowed": "Webcrawler erlaubt"
 }

translations/en.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Show who liked this post",
     "Show who repeated this post": "Show who repeated this post",
     "Repeated by": "Repeated by",
-    "Register": "Register"
+    "Register": "Register",
+    "Web Crawlers Allowed": "Web Crawlers Allowed"
 }

translations/es.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Mostrar a quién le gustó esta publicación",
     "Show who repeated this post": "Mostrar quién repitió esta publicación",
     "Repeated by": "Repetido por",
-    "Register": "Registrarse"
+    "Register": "Registrarse",
+    "Web Crawlers Allowed": "Rastreadores web permitidos"
 }

translations/fr.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Montrer qui a aimé ce post",
     "Show who repeated this post": "Montrer qui a répété ce post",
     "Repeated by": "Répété par",
-    "Register": "S'inscrire"
+    "Register": "S'inscrire",
+    "Web Crawlers Allowed": "Robots d'exploration Web autorisés"
 }

translations/ga.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Taispeáin cé a thaitin an postáil seo",
     "Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís",
     "Repeated by": "Arís agus arís eile ag",
-    "Register": "Clár"
+    "Register": "Clár",
+    "Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe"
 }

translations/hi.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "दिखाएँ कि इस पोस्ट को किसने पसंद किया",
     "Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया",
     "Repeated by": "द्वारा दोहराया गया",
-    "Register": "रजिस्टर करें"
+    "Register": "रजिस्टर करें",
+    "Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है"
 }

translations/it.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Mostra a chi è piaciuto questo post",
     "Show who repeated this post": "Mostra chi ha ripetuto questo post",
     "Repeated by": "Ripetuto da",
-    "Register": "Registrati"
+    "Register": "Registrati",
+    "Web Crawlers Allowed": "Web crawler consentiti"
 }

translations/ja.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "この投稿を高く評価した人を表示する",
     "Show who repeated this post": "この投稿を繰り返した人を表示する",
     "Repeated by": "によって繰り返される",
-    "Register": "登録"
+    "Register": "登録",
+    "Web Crawlers Allowed": "許可されるWebクローラー"
 }

translations/ko.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "이 포스트를 좋아한 사람 표시",
     "Show who repeated this post": "이 포스트를 반복한 사람 표시",
     "Repeated by": "반복한 사람",
-    "Register": "등록"
+    "Register": "등록",
+    "Web Crawlers Allowed": "웹 크롤러 허용"
 }

translations/ku.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Nîşan bide kê ev post eciband",
     "Show who repeated this post": "Nîşan bide kê ev post dubare kiriye",
     "Repeated by": "Ji hêla dubare kirin",
-    "Register": "Fêhrist"
+    "Register": "Fêhrist",
+    "Web Crawlers Allowed": "Crawlers Web Destûrdar in"
 }

translations/oc.json

@@ -511,5 +511,6 @@
     "Show who liked this post": "Show who liked this post",
     "Show who repeated this post": "Show who repeated this post",
     "Repeated by": "Repeated by",
-    "Register": "Register"
+    "Register": "Register",
+    "Web Crawlers Allowed": "Web Crawlers Allowed"
 }

translations/pl.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Pokaż, kto polubił ten post",
     "Show who repeated this post": "Pokaż, kto powtórzył ten post",
     "Repeated by": "Powtórzone przez",
-    "Register": "Zarejestrować"
+    "Register": "Zarejestrować",
+    "Web Crawlers Allowed": "Dozwolone roboty sieciowe"
 }

translations/pt.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Mostrar quem gostou deste post",
     "Show who repeated this post": "Mostrar quem repetiu esta postagem",
     "Repeated by": "Repetido por",
-    "Register": "Registro"
+    "Register": "Registro",
+    "Web Crawlers Allowed": "Rastreadores da Web permitidos"
 }

translations/ru.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Показать, кому понравился этот пост",
     "Show who repeated this post": "Показать, кто повторил этот пост",
     "Repeated by": "Повторено",
-    "Register": "регистр"
+    "Register": "регистр",
+    "Web Crawlers Allowed": "Веб-сканеры разрешены"
 }

translations/sw.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Onyesha ni nani aliyependa chapisho hili",
     "Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili",
     "Repeated by": "Imerudiwa na",
-    "Register": "Sajili"
+    "Register": "Sajili",
+    "Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa"
 }

translations/uk.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "Покажіть, кому сподобався цей пост",
     "Show who repeated this post": "Покажіть, хто повторив цей пост",
     "Repeated by": "Повторюється за",
-    "Register": "Реєстрація"
+    "Register": "Реєстрація",
+    "Web Crawlers Allowed": "Веб-сканери дозволені"
 }

translations/zh.json

@@ -515,5 +515,6 @@
     "Show who liked this post": "显示谁喜欢这篇文章",
     "Show who repeated this post": "显示谁重复了这篇文章",
     "Repeated by": "重复",
-    "Register": "登记"
+    "Register": "登记",
+    "Web Crawlers Allowed": "允许网络爬虫"
 }

webapp_profile.py

@@ -1631,6 +1631,7 @@ def _html_edit_profile_shared_items(base_dir: str, nickname: str, domain: str,
 def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                                  user_agents_blocked: str,
+                                 crawlers_allowed: str,
                                  translate: {}, reply_interval_hours: int,
                                  cw_lists: {}, lists_enabled: str) -> str:
     """Filtering and blocking section of edit profile screen
@@ -1807,6 +1808,16 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                        'userAgentsBlockedStr', user_agents_blocked_str,
                        200, '', False)
 
+    crawlers_allowed_str = ''
+    for uagent in crawlers_allowed:
+        if crawlers_allowed_str:
+            crawlers_allowed_str += '\n'
+        crawlers_allowed_str += uagent
+    edit_profile_form += \
+        edit_text_area(translate['Web Crawlers Allowed'],
+                       'crawlersAllowedStr', crawlers_allowed_str,
+                       200, '', False)
+
     cw_lists_str = ''
     for name, _ in cw_lists.items():
         variablename = get_cw_list_variable(name)
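
The accumulation loop above amounts to a newline join, matching what the daemon later splits on '\n'. A short equivalent with hypothetical values:

# illustrative equivalent of the loop above
crawlers_allowed = ['googlebot', 'bingbot']
crawlers_allowed_str = '\n'.join(crawlers_allowed)
assert crawlers_allowed_str == 'googlebot\nbingbot'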
@@ -2137,7 +2148,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str,
                       default_timeline: str, theme: str,
                       peertube_instances: [],
                       text_mode_banner: str, city: str,
-                      user_agents_blocked: str,
+                      user_agents_blocked: [],
+                      crawlers_allowed: [],
                       access_keys: {},
                       default_reply_interval_hrs: int,
                       cw_lists: {}, lists_enabled: str) -> str:
@@ -2354,8 +2366,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str,
                                           default_reply_interval_hrs)
     edit_profile_form += \
         _html_edit_profile_filtering(base_dir, nickname, domain,
-                                     user_agents_blocked, translate,
-                                     reply_interval_hours,
+                                     user_agents_blocked, crawlers_allowed,
+                                     translate, reply_interval_hours,
                                      cw_lists, lists_enabled)
     # git projects section