From 7f0d8972997bf9bf20c2f2d88202995ace507e4a Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 6 Mar 2022 11:04:19 +0000 Subject: [PATCH 1/5] Tidying --- daemon.py | 58 +++++++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/daemon.py b/daemon.py index 8013a49bb..839b6ca20 100644 --- a/daemon.py +++ b/daemon.py @@ -378,6 +378,7 @@ from fitnessFunctions import sorted_watch_points from fitnessFunctions import html_watch_points_graph from siteactive import referer_is_active from webapp_likers import html_likers_of_post +from crawlers import update_known_crawlers import os @@ -418,36 +419,6 @@ def save_domain_qrcode(base_dir: str, http_prefix: str, class PubServer(BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' - def _update_known_crawlers(self, ua_str: str) -> None: - """Updates a dictionary of known crawlers accessing nodeinfo - or the masto API - """ - if not ua_str: - return - - curr_time = int(time.time()) - if self.server.known_crawlers.get(ua_str): - self.server.known_crawlers[ua_str]['hits'] += 1 - self.server.known_crawlers[ua_str]['lastseen'] = curr_time - else: - self.server.known_crawlers[ua_str] = { - "lastseen": curr_time, - "hits": 1 - } - - if curr_time - self.server.last_known_crawler >= 30: - # remove any old observations - remove_crawlers = [] - for uagent, item in self.server.known_crawlers.items(): - if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30: - remove_crawlers.append(uagent) - for uagent in remove_crawlers: - del self.server.known_crawlers[uagent] - # save the list of crawlers - save_json(self.server.known_crawlers, - self.server.base_dir + '/accounts/knownCrawlers.json') - self.server.last_known_crawler = curr_time - def _get_instance_url(self, calling_domain: str) -> str: """Returns the URL for this instance """ @@ -1115,7 +1086,8 @@ class PubServer(BaseHTTPRequestHandler): show_node_info_accounts: bool, referer_domain: str, debug: bool, - calling_site_timeout: int) -> bool: + calling_site_timeout: int, + known_crawlers: {}) -> bool: """This is a vestigil mastodon API for the purpose of returning an empty result to sites like https://mastopeek.app-dist.eu @@ -1171,7 +1143,12 @@ class PubServer(BaseHTTPRequestHandler): print('mastodon api v1: authorized ' + str(authorized)) print('mastodon api v1: nickname ' + str(nickname)) print('mastodon api v1: referer ' + referer_domain) - self._update_known_crawlers(ua_str) + crawl_time = \ + update_known_crawlers(ua_str, base_dir, + self.server.known_crawlers, + self.server.last_known_crawler) + if crawl_time is not None: + self.server.last_known_crawler = crawl_time broch_mode = broch_mode_is_active(base_dir) send_json, send_json_str = \ @@ -1229,14 +1206,16 @@ class PubServer(BaseHTTPRequestHandler): project_version: str, custom_emoji: [], show_node_info_accounts: bool, - referer_domain: str, debug: bool) -> bool: + referer_domain: str, debug: bool, + known_crawlers: {}) -> bool: return self._masto_api_v1(path, calling_domain, ua_str, authorized, http_prefix, base_dir, nickname, domain, domain_full, onion_domain, i2p_domain, translate, registration, system_language, project_version, custom_emoji, show_node_info_accounts, - referer_domain, debug, 5) + referer_domain, debug, 5, + known_crawlers) def _show_vcard(self, base_dir: str, path: str, calling_domain: str, referer_domain: str, domain: str, debug: bool) -> bool: @@ -1349,7 +1328,13 @@ class PubServer(BaseHTTPRequestHandler): return True if self.server.debug: print('DEBUG: nodeinfo ' + 
self.path) - self._update_known_crawlers(ua_str) + crawl_time = \ + update_known_crawlers(ua_str, + self.server.base_dir, + self.server.known_crawlers, + self.server.last_known_crawler) + if crawl_time is not None: + self.server.last_known_crawler = crawl_time # If we are in broch mode then don't show potentially # sensitive metadata. @@ -14430,7 +14415,8 @@ class PubServer(BaseHTTPRequestHandler): self.server.custom_emoji, self.server.show_node_info_accounts, referer_domain, - self.server.debug): + self.server.debug, + self.server.known_crawlers): return fitness_performance(getreq_start_time, self.server.fitness, From 3aab054c04fc1c29879228f24c6185b5a75cab8c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 6 Mar 2022 11:59:30 +0000 Subject: [PATCH 2/5] Tidying of user agent blocks --- daemon.py | 82 ++++++++++++++----------------------------------------- 1 file changed, 21 insertions(+), 61 deletions(-) diff --git a/daemon.py b/daemon.py index 839b6ca20..5d5a1b72c 100644 --- a/daemon.py +++ b/daemon.py @@ -379,6 +379,7 @@ from fitnessFunctions import html_watch_points_graph from siteactive import referer_is_active from webapp_likers import html_likers_of_post from crawlers import update_known_crawlers +from crawlers import blocked_user_agent import os @@ -560,65 +561,6 @@ class PubServer(BaseHTTPRequestHandler): else: print('ERROR: unable to create vote') - def _blocked_user_agent(self, calling_domain: str, agent_str: str) -> bool: - """Should a GET or POST be blocked based upon its user agent? - """ - if not agent_str: - return False - - agent_str_lower = agent_str.lower() - default_agent_blocks = [ - 'fedilist' - ] - for ua_block in default_agent_blocks: - if ua_block in agent_str_lower: - print('Blocked User agent: ' + ua_block) - return True - - agent_domain = None - - if agent_str: - # is this a web crawler? If so the block it - if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower: - if self.server.news_instance: - return False - print('Blocked Crawler: ' + agent_str) - return True - # get domain name from User-Agent - agent_domain = user_agent_domain(agent_str, self.server.debug) - else: - # no User-Agent header is present - return True - - # is the User-Agent type blocked? eg. "Mastodon" - if self.server.user_agents_blocked: - blocked_ua = False - for agent_name in self.server.user_agents_blocked: - if agent_name in agent_str: - blocked_ua = True - break - if blocked_ua: - return True - - if not agent_domain: - return False - - # is the User-Agent domain blocked - blocked_ua = False - if not agent_domain.startswith(calling_domain): - self.server.blocked_cache_last_updated = \ - update_blocked_cache(self.server.base_dir, - self.server.blocked_cache, - self.server.blocked_cache_last_updated, - self.server.blocked_cache_update_secs) - - blocked_ua = is_blocked_domain(self.server.base_dir, agent_domain, - self.server.blocked_cache) - # if self.server.debug: - if blocked_ua: - print('Blocked User agent: ' + agent_domain) - return blocked_ua - def _request_csv(self) -> bool: """Should a csv response be given? 
""" @@ -14033,7 +13975,16 @@ class PubServer(BaseHTTPRequestHandler): ua_str = self._get_user_agent() if not self._permitted_crawler_path(self.path): - if self._blocked_user_agent(calling_domain, ua_str): + block, self.server.blocked_cache_last_updated = \ + blocked_user_agent(calling_domain, ua_str, + self.server.news_instance, + self.server.debug, + self.server.user_agents_blocked, + self.server.blocked_cache_last_updated, + self.server.base_dir, + self.server.blocked_cache, + self.server.blocked_cache_update_secs) + if block: self._400() return @@ -18565,7 +18516,16 @@ class PubServer(BaseHTTPRequestHandler): ua_str = self._get_user_agent() - if self._blocked_user_agent(calling_domain, ua_str): + block, self.server.blocked_cache_last_updated = \ + blocked_user_agent(calling_domain, ua_str, + self.server.news_instance, + self.server.debug, + self.server.user_agents_blocked, + self.server.blocked_cache_last_updated, + self.server.base_dir, + self.server.blocked_cache, + self.server.blocked_cache_update_secs) + if block: self._400() self.server.postreq_busy = False return From 35883119bee3a51c4d994799f9e5ea56f7f25a7e Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 6 Mar 2022 12:31:58 +0000 Subject: [PATCH 3/5] Option to allow some crawlers --- daemon.py | 12 +++++++++--- epicyon.py | 23 ++++++++++++++++++++--- tests.py | 16 ++++++++++++---- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/daemon.py b/daemon.py index 5d5a1b72c..87a0730d4 100644 --- a/daemon.py +++ b/daemon.py @@ -13983,7 +13983,8 @@ class PubServer(BaseHTTPRequestHandler): self.server.blocked_cache_last_updated, self.server.base_dir, self.server.blocked_cache, - self.server.blocked_cache_update_secs) + self.server.blocked_cache_update_secs, + self.server.crawlers_allowed) if block: self._400() return @@ -18524,7 +18525,8 @@ class PubServer(BaseHTTPRequestHandler): self.server.blocked_cache_last_updated, self.server.base_dir, self.server.blocked_cache, - self.server.blocked_cache_update_secs) + self.server.blocked_cache_update_secs, + self.server.crawlers_allowed) if block: self._400() self.server.postreq_busy = False @@ -19457,7 +19459,8 @@ def load_tokens(base_dir: str, tokens_dict: {}, tokens_lookup: {}) -> None: break -def run_daemon(dyslexic_font: bool, +def run_daemon(crawlers_allowed: [], + dyslexic_font: bool, content_license_url: str, lists_enabled: str, default_reply_interval_hrs: int, @@ -19636,6 +19639,9 @@ def run_daemon(dyslexic_font: bool, # list of blocked user agent types within the User-Agent header httpd.user_agents_blocked = user_agents_blocked + # list of crawler bots permitted within the User-Agent header + httpd.crawlers_allowed = crawlers_allowed + httpd.unit_test = unit_test httpd.allow_local_network_access = allow_local_network_access if unit_test: diff --git a/epicyon.py b/epicyon.py index b38ca64fc..87bcbbaf6 100644 --- a/epicyon.py +++ b/epicyon.py @@ -141,6 +141,10 @@ parser.add_argument('--lists_enabled', type=str, parser.add_argument('--userAgentBlocks', type=str, default=None, help='List of blocked user agents, separated by commas') +parser.add_argument('--crawlersAllowed', type=str, + default=None, + help='List of permitted web crawler user agents, ' + + 'separated by commas') parser.add_argument('--libretranslate', dest='libretranslateUrl', type=str, default=None, help='URL for LibreTranslate service') @@ -3301,8 +3305,20 @@ else: get_config_param(base_dir, 'userAgentsBlocked') if user_agents_blocked_str: agent_blocks_list = user_agents_blocked_str.split(',') - for 
agentBlockStr in agent_blocks_list: - user_agents_blocked.append(agentBlockStr.strip()) + for user_agents_blocked_str2 in agent_blocks_list: + user_agents_blocked.append(user_agents_blocked_str2.strip()) + +crawlers_allowed = [] +if args.crawlersAllowed: + crawlers_allowed_str = args.crawlersAllowed + set_config_param(base_dir, 'crawlersAllowed', crawlers_allowed_str) +else: + crawlers_allowed_str = \ + get_config_param(base_dir, 'crawlersAllowed') +if crawlers_allowed_str: + crawlers_allowed_list = crawlers_allowed_str.split(',') + for crawlers_allowed_str2 in crawlers_allowed_list: + crawlers_allowed.append(crawlers_allowed_str2.strip()) lists_enabled = '' if args.lists_enabled: @@ -3365,7 +3381,8 @@ if args.defaultCurrency: print('Default currency set to ' + args.defaultCurrency) if __name__ == "__main__": - run_daemon(args.dyslexic_font, + run_daemon(crawlers_allowed, + args.dyslexic_font, content_license_url, lists_enabled, args.default_reply_interval_hrs, diff --git a/tests.py b/tests.py index 14734a87b..f4121d76a 100644 --- a/tests.py +++ b/tests.py @@ -822,8 +822,10 @@ def create_server_alice(path: str, domain: str, port: int, lists_enabled = '' content_license_url = 'https://creativecommons.org/licenses/by/4.0' dyslexic_font = False + crawlers_allowed = [] print('Server running: Alice') - run_daemon(dyslexic_font, + run_daemon(crawlers_allowed, + dyslexic_font, content_license_url, lists_enabled, default_reply_interval_hrs, low_bandwidth, max_like_count, @@ -975,8 +977,10 @@ def create_server_bob(path: str, domain: str, port: int, lists_enabled = '' content_license_url = 'https://creativecommons.org/licenses/by/4.0' dyslexic_font = False + crawlers_allowed = [] print('Server running: Bob') - run_daemon(dyslexic_font, + run_daemon(crawlers_allowed, + dyslexic_font, content_license_url, lists_enabled, default_reply_interval_hrs, low_bandwidth, max_like_count, @@ -1051,8 +1055,10 @@ def create_server_eve(path: str, domain: str, port: int, federation_list: [], lists_enabled = '' content_license_url = 'https://creativecommons.org/licenses/by/4.0' dyslexic_font = False + crawlers_allowed = [] print('Server running: Eve') - run_daemon(dyslexic_font, + run_daemon(crawlers_allowed, + dyslexic_font, content_license_url, lists_enabled, default_reply_interval_hrs, low_bandwidth, max_like_count, @@ -1129,8 +1135,10 @@ def create_server_group(path: str, domain: str, port: int, lists_enabled = '' content_license_url = 'https://creativecommons.org/licenses/by/4.0' dyslexic_font = False + crawlers_allowed = [] print('Server running: Group') - run_daemon(dyslexic_font, + run_daemon(crawlers_allowed, + dyslexic_font, content_license_url, lists_enabled, default_reply_interval_hrs, low_bandwidth, max_like_count, From f4fc143b3a609789014c1f1ad7ecea79f9f8799c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 6 Mar 2022 12:56:26 +0000 Subject: [PATCH 4/5] Add crawlers module --- crawlers.py | 120 +++++++++++++++++++++++++++++++++++++++++++ daemon.py | 24 +++++++++ translations/ar.json | 3 +- translations/ca.json | 3 +- translations/cy.json | 3 +- translations/de.json | 3 +- translations/en.json | 3 +- translations/es.json | 3 +- translations/fr.json | 3 +- translations/ga.json | 3 +- translations/hi.json | 3 +- translations/it.json | 3 +- translations/ja.json | 3 +- translations/ko.json | 3 +- translations/ku.json | 3 +- translations/oc.json | 3 +- translations/pl.json | 3 +- translations/pt.json | 3 +- translations/ru.json | 3 +- translations/sw.json | 3 +- translations/uk.json | 3 +- 
 translations/zh.json | 3 +-
 webapp_profile.py | 18 +++++--
 23 files changed, 199 insertions(+), 23 deletions(-)
 create mode 100644 crawlers.py

diff --git a/crawlers.py b/crawlers.py
new file mode 100644
index 000000000..6ec5c43d0
--- /dev/null
+++ b/crawlers.py
@@ -0,0 +1,120 @@
+__filename__ = "crawlers.py"
+__author__ = "Bob Mottram"
+__license__ = "AGPL3+"
+__version__ = "1.3.0"
+__maintainer__ = "Bob Mottram"
+__email__ = "bob@libreserver.org"
+__status__ = "Production"
+__module_group__ = "Core"
+
+import time
+from utils import save_json
+from utils import user_agent_domain
+from blocking import update_blocked_cache
+from blocking import is_blocked_domain
+
+default_user_agent_blocks = [
+    'fedilist'
+]
+
+
+def update_known_crawlers(ua_str: str,
+                          base_dir: str, known_crawlers: {},
+                          last_known_crawler: int):
+    """Updates a dictionary of known crawlers accessing nodeinfo
+    or the masto API
+    """
+    if not ua_str:
+        return None
+
+    curr_time = int(time.time())
+    if known_crawlers.get(ua_str):
+        known_crawlers[ua_str]['hits'] += 1
+        known_crawlers[ua_str]['lastseen'] = curr_time
+    else:
+        known_crawlers[ua_str] = {
+            "lastseen": curr_time,
+            "hits": 1
+        }
+
+    if curr_time - last_known_crawler >= 30:
+        # remove any old observations
+        remove_crawlers = []
+        for uagent, item in known_crawlers.items():
+            if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30:
+                remove_crawlers.append(uagent)
+        for uagent in remove_crawlers:
+            del known_crawlers[uagent]
+        # save the list of crawlers
+        save_json(known_crawlers,
+                  base_dir + '/accounts/knownCrawlers.json')
+    return curr_time
+
+
+def blocked_user_agent(calling_domain: str, agent_str: str,
+                       news_instance: bool, debug: bool,
+                       user_agents_blocked: [],
+                       blocked_cache_last_updated,
+                       base_dir: str,
+                       blocked_cache: [],
+                       blocked_cache_update_secs: int,
+                       crawlers_allowed: []):
+    """Should a GET or POST be blocked based upon its user agent?
+    """
+    if not agent_str:
+        return False, blocked_cache_last_updated
+
+    agent_str_lower = agent_str.lower()
+    for ua_block in default_user_agent_blocks:
+        if ua_block in agent_str_lower:
+            print('Blocked User agent: ' + ua_block)
+            return True, blocked_cache_last_updated
+
+    agent_domain = None
+
+    if agent_str:
+        # is this a web crawler? If so then block it
+        if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
+            # if this is a news instance then we want it
+            # to be indexed by search engines
+            if news_instance:
+                return False, blocked_cache_last_updated
+            # is this crawler allowed?
+            for crawler in crawlers_allowed:
+                if crawler.lower() in agent_str_lower:
+                    return False, blocked_cache_last_updated
+            print('Blocked Crawler: ' + agent_str)
+            return True, blocked_cache_last_updated
+        # get domain name from User-Agent
+        agent_domain = user_agent_domain(agent_str, debug)
+    else:
+        # no User-Agent header is present
+        return True, blocked_cache_last_updated
+
+    # is the User-Agent type blocked? eg. 
"Mastodon" + if user_agents_blocked: + blocked_ua = False + for agent_name in user_agents_blocked: + if agent_name in agent_str: + blocked_ua = True + break + if blocked_ua: + return True, blocked_cache_last_updated + + if not agent_domain: + return False, blocked_cache_last_updated + + # is the User-Agent domain blocked + blocked_ua = False + if not agent_domain.startswith(calling_domain): + blocked_cache_last_updated = \ + update_blocked_cache(base_dir, blocked_cache, + blocked_cache_last_updated, + blocked_cache_update_secs) + + blocked_ua = \ + is_blocked_domain(base_dir, agent_domain, blocked_cache) + # if self.server.debug: + if blocked_ua: + print('Blocked User agent: ' + agent_domain) + return blocked_ua, blocked_cache_last_updated diff --git a/daemon.py b/daemon.py index 87a0730d4..0d0f5c4b2 100644 --- a/daemon.py +++ b/daemon.py @@ -6689,6 +6689,29 @@ class PubServer(BaseHTTPRequestHandler): set_config_param(base_dir, 'userAgentsBlocked', user_agents_blocked_str) + # save allowed web crawlers + crawlers_allowed = [] + if fields.get('crawlersAllowedStr'): + crawlers_allowed_str = \ + fields['crawlersAllowedStr'] + crawlers_allowed_list = \ + crawlers_allowed_str.split('\n') + for uagent in crawlers_allowed_list: + if uagent in crawlers_allowed: + continue + crawlers_allowed.append(uagent.strip()) + if str(self.server.crawlers_allowed) != \ + str(crawlers_allowed): + self.server.crawlers_allowed = \ + crawlers_allowed + crawlers_allowed_str = '' + for uagent in crawlers_allowed: + if crawlers_allowed_str: + crawlers_allowed_str += ',' + crawlers_allowed_str += uagent + set_config_param(base_dir, 'crawlersAllowed', + crawlers_allowed_str) + # save peertube instances list peertube_instances_file = \ base_dir + '/accounts/peertube.txt' @@ -13733,6 +13756,7 @@ class PubServer(BaseHTTPRequestHandler): self.server.text_mode_banner, city, self.server.user_agents_blocked, + self.server.crawlers_allowed, access_keys, default_reply_interval_hrs, self.server.cw_lists, diff --git a/translations/ar.json b/translations/ar.json index 37d7da727..69bb8642f 100644 --- a/translations/ar.json +++ b/translations/ar.json @@ -515,5 +515,6 @@ "Show who liked this post": "أظهر من أحب هذا المنشور", "Show who repeated this post": "أظهر من كرر هذا المنصب", "Repeated by": "يتكرر بواسطة", - "Register": "يسجل" + "Register": "يسجل", + "Web Crawlers Allowed": "برامج زحف الويب المسموح بها" } diff --git a/translations/ca.json b/translations/ca.json index 3331ed2c0..3dc675148 100644 --- a/translations/ca.json +++ b/translations/ca.json @@ -515,5 +515,6 @@ "Show who liked this post": "Mostra a qui li agrada aquesta publicació", "Show who repeated this post": "Mostra qui ha repetit aquesta publicació", "Repeated by": "Repetit per", - "Register": "Registra't" + "Register": "Registra't", + "Web Crawlers Allowed": "Es permeten rastrejadors web" } diff --git a/translations/cy.json b/translations/cy.json index bc53cf557..8ce0051e9 100644 --- a/translations/cy.json +++ b/translations/cy.json @@ -515,5 +515,6 @@ "Show who liked this post": "Dangoswch pwy oedd yn hoffi'r post hwn", "Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn", "Repeated by": "Ailadrodd gan", - "Register": "Cofrestrwch" + "Register": "Cofrestrwch", + "Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe" } diff --git a/translations/de.json b/translations/de.json index cae215688..cb3ee15b2 100644 --- a/translations/de.json +++ b/translations/de.json @@ -515,5 +515,6 @@ "Show who liked this post": "Zeigen, wem dieser Beitrag gefallen hat", 
"Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat", "Repeated by": "Wiederholt von", - "Register": "Registrieren" + "Register": "Registrieren", + "Web Crawlers Allowed": "Webcrawler erlaubt" } diff --git a/translations/en.json b/translations/en.json index b0966cbb8..6391accd0 100644 --- a/translations/en.json +++ b/translations/en.json @@ -515,5 +515,6 @@ "Show who liked this post": "Show who liked this post", "Show who repeated this post": "Show who repeated this post", "Repeated by": "Repeated by", - "Register": "Register" + "Register": "Register", + "Web Crawlers Allowed": "Web Crawlers Allowed" } diff --git a/translations/es.json b/translations/es.json index 6a4695c4d..e91eb9d20 100644 --- a/translations/es.json +++ b/translations/es.json @@ -515,5 +515,6 @@ "Show who liked this post": "Mostrar a quién le gustó esta publicación", "Show who repeated this post": "Mostrar quién repitió esta publicación", "Repeated by": "Repetido por", - "Register": "Registrarse" + "Register": "Registrarse", + "Web Crawlers Allowed": "Rastreadores web permitidos" } diff --git a/translations/fr.json b/translations/fr.json index 68c8258f8..429016e58 100644 --- a/translations/fr.json +++ b/translations/fr.json @@ -515,5 +515,6 @@ "Show who liked this post": "Montrer qui a aimé ce post", "Show who repeated this post": "Montrer qui a répété ce post", "Repeated by": "Répété par", - "Register": "S'inscrire" + "Register": "S'inscrire", + "Web Crawlers Allowed": "Robots d'exploration Web autorisés" } diff --git a/translations/ga.json b/translations/ga.json index 04e2ccf53..0589b6d87 100644 --- a/translations/ga.json +++ b/translations/ga.json @@ -515,5 +515,6 @@ "Show who liked this post": "Taispeáin cé a thaitin an postáil seo", "Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís", "Repeated by": "Arís agus arís eile ag", - "Register": "Clár" + "Register": "Clár", + "Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe" } diff --git a/translations/hi.json b/translations/hi.json index be544e035..d1ac37dee 100644 --- a/translations/hi.json +++ b/translations/hi.json @@ -515,5 +515,6 @@ "Show who liked this post": "दिखाएँ कि इस पोस्ट को किसने पसंद किया", "Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया", "Repeated by": "द्वारा दोहराया गया", - "Register": "रजिस्टर करें" + "Register": "रजिस्टर करें", + "Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है" } diff --git a/translations/it.json b/translations/it.json index 1935c8d61..3c0c311ba 100644 --- a/translations/it.json +++ b/translations/it.json @@ -515,5 +515,6 @@ "Show who liked this post": "Mostra a chi è piaciuto questo post", "Show who repeated this post": "Mostra chi ha ripetuto questo post", "Repeated by": "Ripetuto da", - "Register": "Registrati" + "Register": "Registrati", + "Web Crawlers Allowed": "Web crawler consentiti" } diff --git a/translations/ja.json b/translations/ja.json index 984ba4109..fb3f075a3 100644 --- a/translations/ja.json +++ b/translations/ja.json @@ -515,5 +515,6 @@ "Show who liked this post": "この投稿を高く評価した人を表示する", "Show who repeated this post": "この投稿を繰り返した人を表示する", "Repeated by": "によって繰り返される", - "Register": "登録" + "Register": "登録", + "Web Crawlers Allowed": "許可されるWebクローラー" } diff --git a/translations/ko.json b/translations/ko.json index a01cb53d7..19d6a6b26 100644 --- a/translations/ko.json +++ b/translations/ko.json @@ -515,5 +515,6 @@ "Show who liked this post": "이 포스트를 좋아한 사람 표시", "Show who repeated this post": "이 포스트를 반복한 사람 표시", "Repeated by": "반복한 사람", - "Register": 
"등록" + "Register": "등록", + "Web Crawlers Allowed": "웹 크롤러 허용" } diff --git a/translations/ku.json b/translations/ku.json index 26ed875a2..f55c059cf 100644 --- a/translations/ku.json +++ b/translations/ku.json @@ -515,5 +515,6 @@ "Show who liked this post": "Nîşan bide kê ev post eciband", "Show who repeated this post": "Nîşan bide kê ev post dubare kiriye", "Repeated by": "Ji hêla dubare kirin", - "Register": "Fêhrist" + "Register": "Fêhrist", + "Web Crawlers Allowed": "Crawlers Web Destûrdar in" } diff --git a/translations/oc.json b/translations/oc.json index 28141a22d..c5b280708 100644 --- a/translations/oc.json +++ b/translations/oc.json @@ -511,5 +511,6 @@ "Show who liked this post": "Show who liked this post", "Show who repeated this post": "Show who repeated this post", "Repeated by": "Repeated by", - "Register": "Register" + "Register": "Register", + "Web Crawlers Allowed": "Web Crawlers Allowed" } diff --git a/translations/pl.json b/translations/pl.json index 5b7402616..bc96da46c 100644 --- a/translations/pl.json +++ b/translations/pl.json @@ -515,5 +515,6 @@ "Show who liked this post": "Pokaż, kto polubił ten post", "Show who repeated this post": "Pokaż, kto powtórzył ten post", "Repeated by": "Powtórzone przez", - "Register": "Zarejestrować" + "Register": "Zarejestrować", + "Web Crawlers Allowed": "Dozwolone roboty sieciowe" } diff --git a/translations/pt.json b/translations/pt.json index ff9fe2276..03cd5e5aa 100644 --- a/translations/pt.json +++ b/translations/pt.json @@ -515,5 +515,6 @@ "Show who liked this post": "Mostrar quem gostou deste post", "Show who repeated this post": "Mostrar quem repetiu esta postagem", "Repeated by": "Repetido por", - "Register": "Registro" + "Register": "Registro", + "Web Crawlers Allowed": "Rastreadores da Web permitidos" } diff --git a/translations/ru.json b/translations/ru.json index 253bc985c..762a7d5cc 100644 --- a/translations/ru.json +++ b/translations/ru.json @@ -515,5 +515,6 @@ "Show who liked this post": "Показать, кому понравился этот пост", "Show who repeated this post": "Показать, кто повторил этот пост", "Repeated by": "Повторено", - "Register": "регистр" + "Register": "регистр", + "Web Crawlers Allowed": "Веб-сканеры разрешены" } diff --git a/translations/sw.json b/translations/sw.json index e0e1ea758..4be3a608f 100644 --- a/translations/sw.json +++ b/translations/sw.json @@ -515,5 +515,6 @@ "Show who liked this post": "Onyesha ni nani aliyependa chapisho hili", "Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili", "Repeated by": "Imerudiwa na", - "Register": "Sajili" + "Register": "Sajili", + "Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa" } diff --git a/translations/uk.json b/translations/uk.json index 9e36c4a4c..32c2407e9 100644 --- a/translations/uk.json +++ b/translations/uk.json @@ -515,5 +515,6 @@ "Show who liked this post": "Покажіть, кому сподобався цей пост", "Show who repeated this post": "Покажіть, хто повторив цей пост", "Repeated by": "Повторюється за", - "Register": "Реєстрація" + "Register": "Реєстрація", + "Web Crawlers Allowed": "Веб-сканери дозволені" } diff --git a/translations/zh.json b/translations/zh.json index b469048bc..784009f13 100644 --- a/translations/zh.json +++ b/translations/zh.json @@ -515,5 +515,6 @@ "Show who liked this post": "显示谁喜欢这篇文章", "Show who repeated this post": "显示谁重复了这篇文章", "Repeated by": "重复", - "Register": "登记" + "Register": "登记", + "Web Crawlers Allowed": "允许网络爬虫" } diff --git a/webapp_profile.py b/webapp_profile.py index 8faaf5cba..f57dc07d7 100644 --- 
a/webapp_profile.py
+++ b/webapp_profile.py
@@ -1631,6 +1631,7 @@ def _html_edit_profile_shared_items(base_dir: str, nickname: str, domain: str,
 
 def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                                  user_agents_blocked: str,
+                                 crawlers_allowed: str,
                                  translate: {}, reply_interval_hours: int,
                                  cw_lists: {}, lists_enabled: str) -> str:
     """Filtering and blocking section of edit profile screen
@@ -1807,6 +1808,16 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                        'userAgentsBlockedStr', user_agents_blocked_str,
                        200, '', False)
 
+    crawlers_allowed_str = ''
+    for uagent in crawlers_allowed:
+        if crawlers_allowed_str:
+            crawlers_allowed_str += '\n'
+        crawlers_allowed_str += uagent
+    edit_profile_form += \
+        edit_text_area(translate['Web Crawlers Allowed'],
+                       'crawlersAllowedStr', crawlers_allowed_str,
+                       200, '', False)
+
     cw_lists_str = ''
     for name, _ in cw_lists.items():
         variablename = get_cw_list_variable(name)
@@ -2137,7 +2148,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str,
                       default_timeline: str, theme: str,
                       peertube_instances: [],
                       text_mode_banner: str, city: str,
-                      user_agents_blocked: str,
+                      user_agents_blocked: [],
+                      crawlers_allowed: [],
                       access_keys: {},
                       default_reply_interval_hrs: int,
                       cw_lists: {}, lists_enabled: str) -> str:
@@ -2354,8 +2366,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str,
                                           default_reply_interval_hrs)
     edit_profile_form += \
         _html_edit_profile_filtering(base_dir, nickname, domain,
-                                     user_agents_blocked, translate,
-                                     reply_interval_hours,
+                                     user_agents_blocked, crawlers_allowed,
+                                     translate, reply_interval_hours,
                                      cw_lists, lists_enabled)
 
     # git projects section

From 73be47e80f0d2c93e9755a0c41267f66aa2658cc Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Sun, 6 Mar 2022 13:07:25 +0000
Subject: [PATCH 5/5] Note about web crawlers

---
 README_commandline.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README_commandline.md b/README_commandline.md
index 27e57c1ad..1c8a75e26 100644
--- a/README_commandline.md
+++ b/README_commandline.md
@@ -388,3 +388,15 @@ The CalDav endpoint for an account is:
 ```bash
 yourdomain/calendars/yournick
 ```
+
+## Web Crawlers
+
+Having search engines index social media posts is not usually considered appropriate, since even nominally "public" posts may contain personally identifiable information. If you are running a news instance then web crawlers are permitted, so that news posts can be indexed by search engines; otherwise they are blocked by default.
+
+To allow specific web crawlers, use the **crawlersAllowed** option when running the daemon (typically via systemd). It takes a list of bot names, separated by commas. For example:
+
+```bash
+--crawlersAllowed "googlebot, apple"
+```
+
+Crawler names typically end in "bot", but partial names can also be used and matching is case insensitive.
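
As a rough illustration of how the crawler allow list in these patches behaves, here is a minimal sketch (not part of the patch series). It assumes the Epicyon source directory is on the import path so that `crawlers.py` and its dependencies can be imported; the user-agent strings, calling domain and base directory below are made-up examples.

```python
# Sketch: exercise blocked_user_agent() from crawlers.py with two sample
# user agents. Both contain "bot/", so both are treated as crawlers; only
# the one matching an entry in crawlers_allowed gets through.
from crawlers import blocked_user_agent

crawlers_allowed = ['googlebot', 'apple']   # as set via --crawlersAllowed
user_agents_blocked = []                    # no per-instance UA blocks

sample_agents = [
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (compatible; SomeOtherBot/1.0; +https://crawler.example)'
]

for ua_str in sample_agents:
    blocked, _ = blocked_user_agent('epicyon.example', ua_str,
                                    False,               # news_instance
                                    False,               # debug
                                    user_agents_blocked,
                                    0,                   # blocked_cache_last_updated
                                    '/var/www/epicyon',  # base_dir (example path)
                                    [],                  # blocked_cache
                                    120,                 # blocked_cache_update_secs
                                    crawlers_allowed)
    print(ua_str + ' -> ' + ('blocked' if blocked else 'allowed'))
```

With `news_instance=True` both agents would be allowed through, since a news instance is intended to be indexed by search engines; non-crawler user agents are instead checked against the blocked user agent list and blocked domains.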