From 7f0d8972997bf9bf20c2f2d88202995ace507e4a Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 6 Mar 2022 11:04:19 +0000 Subject: [PATCH 1/5] Tidying --- daemon.py | 58 +++++++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/daemon.py b/daemon.py index 8013a49bb..839b6ca20 100644 --- a/daemon.py +++ b/daemon.py @@ -378,6 +378,7 @@ from fitnessFunctions import sorted_watch_points from fitnessFunctions import html_watch_points_graph from siteactive import referer_is_active from webapp_likers import html_likers_of_post +from crawlers import update_known_crawlers import os @@ -418,36 +419,6 @@ def save_domain_qrcode(base_dir: str, http_prefix: str, class PubServer(BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' - def _update_known_crawlers(self, ua_str: str) -> None: - """Updates a dictionary of known crawlers accessing nodeinfo - or the masto API - """ - if not ua_str: - return - - curr_time = int(time.time()) - if self.server.known_crawlers.get(ua_str): - self.server.known_crawlers[ua_str]['hits'] += 1 - self.server.known_crawlers[ua_str]['lastseen'] = curr_time - else: - self.server.known_crawlers[ua_str] = { - "lastseen": curr_time, - "hits": 1 - } - - if curr_time - self.server.last_known_crawler >= 30: - # remove any old observations - remove_crawlers = [] - for uagent, item in self.server.known_crawlers.items(): - if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30: - remove_crawlers.append(uagent) - for uagent in remove_crawlers: - del self.server.known_crawlers[uagent] - # save the list of crawlers - save_json(self.server.known_crawlers, - self.server.base_dir + '/accounts/knownCrawlers.json') - self.server.last_known_crawler = curr_time - def _get_instance_url(self, calling_domain: str) -> str: """Returns the URL for this instance """ @@ -1115,7 +1086,8 @@ class PubServer(BaseHTTPRequestHandler): show_node_info_accounts: bool, referer_domain: str, debug: bool, - calling_site_timeout: int) -> bool: + calling_site_timeout: int, + known_crawlers: {}) -> bool: """This is a vestigil mastodon API for the purpose of returning an empty result to sites like https://mastopeek.app-dist.eu @@ -1171,7 +1143,12 @@ class PubServer(BaseHTTPRequestHandler): print('mastodon api v1: authorized ' + str(authorized)) print('mastodon api v1: nickname ' + str(nickname)) print('mastodon api v1: referer ' + referer_domain) - self._update_known_crawlers(ua_str) + crawl_time = \ + update_known_crawlers(ua_str, base_dir, + self.server.known_crawlers, + self.server.last_known_crawler) + if crawl_time is not None: + self.server.last_known_crawler = crawl_time broch_mode = broch_mode_is_active(base_dir) send_json, send_json_str = \ @@ -1229,14 +1206,16 @@ class PubServer(BaseHTTPRequestHandler): project_version: str, custom_emoji: [], show_node_info_accounts: bool, - referer_domain: str, debug: bool) -> bool: + referer_domain: str, debug: bool, + known_crawlers: {}) -> bool: return self._masto_api_v1(path, calling_domain, ua_str, authorized, http_prefix, base_dir, nickname, domain, domain_full, onion_domain, i2p_domain, translate, registration, system_language, project_version, custom_emoji, show_node_info_accounts, - referer_domain, debug, 5) + referer_domain, debug, 5, + known_crawlers) def _show_vcard(self, base_dir: str, path: str, calling_domain: str, referer_domain: str, domain: str, debug: bool) -> bool: @@ -1349,7 +1328,13 @@ class PubServer(BaseHTTPRequestHandler): return True if self.server.debug: print('DEBUG: nodeinfo ' + 
self.path) - self._update_known_crawlers(ua_str) + crawl_time = \ + update_known_crawlers(ua_str, + self.server.base_dir, + self.server.known_crawlers, + self.server.last_known_crawler) + if crawl_time is not None: + self.server.last_known_crawler = crawl_time # If we are in broch mode then don't show potentially # sensitive metadata. @@ -14430,7 +14415,8 @@ class PubServer(BaseHTTPRequestHandler): self.server.custom_emoji, self.server.show_node_info_accounts, referer_domain, - self.server.debug): + self.server.debug, + self.server.known_crawlers): return fitness_performance(getreq_start_time, self.server.fitness, From 3aab054c04fc1c29879228f24c6185b5a75cab8c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 6 Mar 2022 11:59:30 +0000 Subject: [PATCH 2/5] Tidying of user agent blocks --- daemon.py | 82 ++++++++++++++----------------------------------------- 1 file changed, 21 insertions(+), 61 deletions(-) diff --git a/daemon.py b/daemon.py index 839b6ca20..5d5a1b72c 100644 --- a/daemon.py +++ b/daemon.py @@ -379,6 +379,7 @@ from fitnessFunctions import html_watch_points_graph from siteactive import referer_is_active from webapp_likers import html_likers_of_post from crawlers import update_known_crawlers +from crawlers import blocked_user_agent import os @@ -560,65 +561,6 @@ class PubServer(BaseHTTPRequestHandler): else: print('ERROR: unable to create vote') - def _blocked_user_agent(self, calling_domain: str, agent_str: str) -> bool: - """Should a GET or POST be blocked based upon its user agent? - """ - if not agent_str: - return False - - agent_str_lower = agent_str.lower() - default_agent_blocks = [ - 'fedilist' - ] - for ua_block in default_agent_blocks: - if ua_block in agent_str_lower: - print('Blocked User agent: ' + ua_block) - return True - - agent_domain = None - - if agent_str: - # is this a web crawler? If so the block it - if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower: - if self.server.news_instance: - return False - print('Blocked Crawler: ' + agent_str) - return True - # get domain name from User-Agent - agent_domain = user_agent_domain(agent_str, self.server.debug) - else: - # no User-Agent header is present - return True - - # is the User-Agent type blocked? eg. "Mastodon" - if self.server.user_agents_blocked: - blocked_ua = False - for agent_name in self.server.user_agents_blocked: - if agent_name in agent_str: - blocked_ua = True - break - if blocked_ua: - return True - - if not agent_domain: - return False - - # is the User-Agent domain blocked - blocked_ua = False - if not agent_domain.startswith(calling_domain): - self.server.blocked_cache_last_updated = \ - update_blocked_cache(self.server.base_dir, - self.server.blocked_cache, - self.server.blocked_cache_last_updated, - self.server.blocked_cache_update_secs) - - blocked_ua = is_blocked_domain(self.server.base_dir, agent_domain, - self.server.blocked_cache) - # if self.server.debug: - if blocked_ua: - print('Blocked User agent: ' + agent_domain) - return blocked_ua - def _request_csv(self) -> bool: """Should a csv response be given? 
""" @@ -14033,7 +13975,16 @@ class PubServer(BaseHTTPRequestHandler): ua_str = self._get_user_agent() if not self._permitted_crawler_path(self.path): - if self._blocked_user_agent(calling_domain, ua_str): + block, self.server.blocked_cache_last_updated = \ + blocked_user_agent(calling_domain, ua_str, + self.server.news_instance, + self.server.debug, + self.server.user_agents_blocked, + self.server.blocked_cache_last_updated, + self.server.base_dir, + self.server.blocked_cache, + self.server.blocked_cache_update_secs) + if block: self._400() return @@ -18565,7 +18516,16 @@ class PubServer(BaseHTTPRequestHandler): ua_str = self._get_user_agent() - if self._blocked_user_agent(calling_domain, ua_str): + block, self.server.blocked_cache_last_updated = \ + blocked_user_agent(calling_domain, ua_str, + self.server.news_instance, + self.server.debug, + self.server.user_agents_blocked, + self.server.blocked_cache_last_updated, + self.server.base_dir, + self.server.blocked_cache, + self.server.blocked_cache_update_secs) + if block: self._400() self.server.postreq_busy = False return From 35883119bee3a51c4d994799f9e5ea56f7f25a7e Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 6 Mar 2022 12:31:58 +0000 Subject: [PATCH 3/5] Option to allow some crawlers --- daemon.py | 12 +++++++++--- epicyon.py | 23 ++++++++++++++++++++--- tests.py | 16 ++++++++++++---- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/daemon.py b/daemon.py index 5d5a1b72c..87a0730d4 100644 --- a/daemon.py +++ b/daemon.py @@ -13983,7 +13983,8 @@ class PubServer(BaseHTTPRequestHandler): self.server.blocked_cache_last_updated, self.server.base_dir, self.server.blocked_cache, - self.server.blocked_cache_update_secs) + self.server.blocked_cache_update_secs, + self.server.crawlers_allowed) if block: self._400() return @@ -18524,7 +18525,8 @@ class PubServer(BaseHTTPRequestHandler): self.server.blocked_cache_last_updated, self.server.base_dir, self.server.blocked_cache, - self.server.blocked_cache_update_secs) + self.server.blocked_cache_update_secs, + self.server.crawlers_allowed) if block: self._400() self.server.postreq_busy = False @@ -19457,7 +19459,8 @@ def load_tokens(base_dir: str, tokens_dict: {}, tokens_lookup: {}) -> None: break -def run_daemon(dyslexic_font: bool, +def run_daemon(crawlers_allowed: [], + dyslexic_font: bool, content_license_url: str, lists_enabled: str, default_reply_interval_hrs: int, @@ -19636,6 +19639,9 @@ def run_daemon(dyslexic_font: bool, # list of blocked user agent types within the User-Agent header httpd.user_agents_blocked = user_agents_blocked + # list of crawler bots permitted within the User-Agent header + httpd.crawlers_allowed = crawlers_allowed + httpd.unit_test = unit_test httpd.allow_local_network_access = allow_local_network_access if unit_test: diff --git a/epicyon.py b/epicyon.py index b38ca64fc..87bcbbaf6 100644 --- a/epicyon.py +++ b/epicyon.py @@ -141,6 +141,10 @@ parser.add_argument('--lists_enabled', type=str, parser.add_argument('--userAgentBlocks', type=str, default=None, help='List of blocked user agents, separated by commas') +parser.add_argument('--crawlersAllowed', type=str, + default=None, + help='List of permitted web crawler user agents, ' + + 'separated by commas') parser.add_argument('--libretranslate', dest='libretranslateUrl', type=str, default=None, help='URL for LibreTranslate service') @@ -3301,8 +3305,20 @@ else: get_config_param(base_dir, 'userAgentsBlocked') if user_agents_blocked_str: agent_blocks_list = user_agents_blocked_str.split(',') - for 
agentBlockStr in agent_blocks_list: - user_agents_blocked.append(agentBlockStr.strip()) + for user_agents_blocked_str2 in agent_blocks_list: + user_agents_blocked.append(user_agents_blocked_str2.strip()) + +crawlers_allowed = [] +if args.crawlersAllowed: + crawlers_allowed_str = args.crawlersAllowed + set_config_param(base_dir, 'crawlersAllowed', crawlers_allowed_str) +else: + crawlers_allowed_str = \ + get_config_param(base_dir, 'crawlersAllowed') +if crawlers_allowed_str: + crawlers_allowed_list = crawlers_allowed_str.split(',') + for crawlers_allowed_str2 in crawlers_allowed_list: + crawlers_allowed.append(crawlers_allowed_str2.strip()) lists_enabled = '' if args.lists_enabled: @@ -3365,7 +3381,8 @@ if args.defaultCurrency: print('Default currency set to ' + args.defaultCurrency) if __name__ == "__main__": - run_daemon(args.dyslexic_font, + run_daemon(crawlers_allowed, + args.dyslexic_font, content_license_url, lists_enabled, args.default_reply_interval_hrs, diff --git a/tests.py b/tests.py index 14734a87b..f4121d76a 100644 --- a/tests.py +++ b/tests.py @@ -822,8 +822,10 @@ def create_server_alice(path: str, domain: str, port: int, lists_enabled = '' content_license_url = 'https://creativecommons.org/licenses/by/4.0' dyslexic_font = False + crawlers_allowed = [] print('Server running: Alice') - run_daemon(dyslexic_font, + run_daemon(crawlers_allowed, + dyslexic_font, content_license_url, lists_enabled, default_reply_interval_hrs, low_bandwidth, max_like_count, @@ -975,8 +977,10 @@ def create_server_bob(path: str, domain: str, port: int, lists_enabled = '' content_license_url = 'https://creativecommons.org/licenses/by/4.0' dyslexic_font = False + crawlers_allowed = [] print('Server running: Bob') - run_daemon(dyslexic_font, + run_daemon(crawlers_allowed, + dyslexic_font, content_license_url, lists_enabled, default_reply_interval_hrs, low_bandwidth, max_like_count, @@ -1051,8 +1055,10 @@ def create_server_eve(path: str, domain: str, port: int, federation_list: [], lists_enabled = '' content_license_url = 'https://creativecommons.org/licenses/by/4.0' dyslexic_font = False + crawlers_allowed = [] print('Server running: Eve') - run_daemon(dyslexic_font, + run_daemon(crawlers_allowed, + dyslexic_font, content_license_url, lists_enabled, default_reply_interval_hrs, low_bandwidth, max_like_count, @@ -1129,8 +1135,10 @@ def create_server_group(path: str, domain: str, port: int, lists_enabled = '' content_license_url = 'https://creativecommons.org/licenses/by/4.0' dyslexic_font = False + crawlers_allowed = [] print('Server running: Group') - run_daemon(dyslexic_font, + run_daemon(crawlers_allowed, + dyslexic_font, content_license_url, lists_enabled, default_reply_interval_hrs, low_bandwidth, max_like_count, From f4fc143b3a609789014c1f1ad7ecea79f9f8799c Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 6 Mar 2022 12:56:26 +0000 Subject: [PATCH 4/5] Add crawlers module --- crawlers.py | 120 +++++++++++++++++++++++++++++++++++++++++++ daemon.py | 24 +++++++++ translations/ar.json | 3 +- translations/ca.json | 3 +- translations/cy.json | 3 +- translations/de.json | 3 +- translations/en.json | 3 +- translations/es.json | 3 +- translations/fr.json | 3 +- translations/ga.json | 3 +- translations/hi.json | 3 +- translations/it.json | 3 +- translations/ja.json | 3 +- translations/ko.json | 3 +- translations/ku.json | 3 +- translations/oc.json | 3 +- translations/pl.json | 3 +- translations/pt.json | 3 +- translations/ru.json | 3 +- translations/sw.json | 3 +- translations/uk.json | 3 +- 
 translations/zh.json | 3 +-
 webapp_profile.py | 18 +++++--
 23 files changed, 199 insertions(+), 23 deletions(-)
 create mode 100644 crawlers.py

diff --git a/crawlers.py b/crawlers.py
new file mode 100644
index 000000000..6ec5c43d0
--- /dev/null
+++ b/crawlers.py
@@ -0,0 +1,120 @@
+__filename__ = "crawlers.py"
+__author__ = "Bob Mottram"
+__license__ = "AGPL3+"
+__version__ = "1.3.0"
+__maintainer__ = "Bob Mottram"
+__email__ = "bob@libreserver.org"
+__status__ = "Production"
+__module_group__ = "Core"
+
+import time
+from utils import save_json
+from utils import user_agent_domain
+from blocking import update_blocked_cache
+from blocking import is_blocked_domain
+
+default_user_agent_blocks = [
+    'fedilist'
+]
+
+
+def update_known_crawlers(ua_str: str,
+                          base_dir: str, known_crawlers: {},
+                          last_known_crawler: int):
+    """Updates a dictionary of known crawlers accessing nodeinfo
+    or the masto API
+    """
+    if not ua_str:
+        return None
+
+    curr_time = int(time.time())
+    if known_crawlers.get(ua_str):
+        known_crawlers[ua_str]['hits'] += 1
+        known_crawlers[ua_str]['lastseen'] = curr_time
+    else:
+        known_crawlers[ua_str] = {
+            "lastseen": curr_time,
+            "hits": 1
+        }
+
+    if curr_time - last_known_crawler >= 30:
+        # remove any old observations
+        remove_crawlers = []
+        for uagent, item in known_crawlers.items():
+            if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30:
+                remove_crawlers.append(uagent)
+        for uagent in remove_crawlers:
+            del known_crawlers[uagent]
+        # save the list of crawlers
+        save_json(known_crawlers,
+                  base_dir + '/accounts/knownCrawlers.json')
+    return curr_time
+
+
+def blocked_user_agent(calling_domain: str, agent_str: str,
+                       news_instance: bool, debug: bool,
+                       user_agents_blocked: [],
+                       blocked_cache_last_updated,
+                       base_dir: str,
+                       blocked_cache: [],
+                       blocked_cache_update_secs: int,
+                       crawlers_allowed: []):
+    """Should a GET or POST be blocked based upon its user agent?
+    """
+    if not agent_str:
+        return False, blocked_cache_last_updated
+
+    agent_str_lower = agent_str.lower()
+    for ua_block in default_user_agent_blocks:
+        if ua_block in agent_str_lower:
+            print('Blocked User agent: ' + ua_block)
+            return True, blocked_cache_last_updated
+
+    agent_domain = None
+
+    if agent_str:
+        # is this a web crawler? If so then block it
+        if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
+            # if this is a news instance then we want it
+            # to be indexed by search engines
+            if news_instance:
+                return False, blocked_cache_last_updated
+            # is this crawler allowed?
+            for crawler in crawlers_allowed:
+                if crawler.lower() in agent_str_lower:
+                    return False, blocked_cache_last_updated
+            print('Blocked Crawler: ' + agent_str)
+            return True, blocked_cache_last_updated
+        # get domain name from User-Agent
+        agent_domain = user_agent_domain(agent_str, debug)
+    else:
+        # no User-Agent header is present
+        return True, blocked_cache_last_updated
+
+    # is the User-Agent type blocked? eg. 
"Mastodon" + if user_agents_blocked: + blocked_ua = False + for agent_name in user_agents_blocked: + if agent_name in agent_str: + blocked_ua = True + break + if blocked_ua: + return True, blocked_cache_last_updated + + if not agent_domain: + return False, blocked_cache_last_updated + + # is the User-Agent domain blocked + blocked_ua = False + if not agent_domain.startswith(calling_domain): + blocked_cache_last_updated = \ + update_blocked_cache(base_dir, blocked_cache, + blocked_cache_last_updated, + blocked_cache_update_secs) + + blocked_ua = \ + is_blocked_domain(base_dir, agent_domain, blocked_cache) + # if self.server.debug: + if blocked_ua: + print('Blocked User agent: ' + agent_domain) + return blocked_ua, blocked_cache_last_updated diff --git a/daemon.py b/daemon.py index 87a0730d4..0d0f5c4b2 100644 --- a/daemon.py +++ b/daemon.py @@ -6689,6 +6689,29 @@ class PubServer(BaseHTTPRequestHandler): set_config_param(base_dir, 'userAgentsBlocked', user_agents_blocked_str) + # save allowed web crawlers + crawlers_allowed = [] + if fields.get('crawlersAllowedStr'): + crawlers_allowed_str = \ + fields['crawlersAllowedStr'] + crawlers_allowed_list = \ + crawlers_allowed_str.split('\n') + for uagent in crawlers_allowed_list: + if uagent in crawlers_allowed: + continue + crawlers_allowed.append(uagent.strip()) + if str(self.server.crawlers_allowed) != \ + str(crawlers_allowed): + self.server.crawlers_allowed = \ + crawlers_allowed + crawlers_allowed_str = '' + for uagent in crawlers_allowed: + if crawlers_allowed_str: + crawlers_allowed_str += ',' + crawlers_allowed_str += uagent + set_config_param(base_dir, 'crawlersAllowed', + crawlers_allowed_str) + # save peertube instances list peertube_instances_file = \ base_dir + '/accounts/peertube.txt' @@ -13733,6 +13756,7 @@ class PubServer(BaseHTTPRequestHandler): self.server.text_mode_banner, city, self.server.user_agents_blocked, + self.server.crawlers_allowed, access_keys, default_reply_interval_hrs, self.server.cw_lists, diff --git a/translations/ar.json b/translations/ar.json index 37d7da727..69bb8642f 100644 --- a/translations/ar.json +++ b/translations/ar.json @@ -515,5 +515,6 @@ "Show who liked this post": "أظهر من أحب هذا المنشور", "Show who repeated this post": "أظهر من كرر هذا المنصب", "Repeated by": "يتكرر بواسطة", - "Register": "يسجل" + "Register": "يسجل", + "Web Crawlers Allowed": "برامج زحف الويب المسموح بها" } diff --git a/translations/ca.json b/translations/ca.json index 3331ed2c0..3dc675148 100644 --- a/translations/ca.json +++ b/translations/ca.json @@ -515,5 +515,6 @@ "Show who liked this post": "Mostra a qui li agrada aquesta publicació", "Show who repeated this post": "Mostra qui ha repetit aquesta publicació", "Repeated by": "Repetit per", - "Register": "Registra't" + "Register": "Registra't", + "Web Crawlers Allowed": "Es permeten rastrejadors web" } diff --git a/translations/cy.json b/translations/cy.json index bc53cf557..8ce0051e9 100644 --- a/translations/cy.json +++ b/translations/cy.json @@ -515,5 +515,6 @@ "Show who liked this post": "Dangoswch pwy oedd yn hoffi'r post hwn", "Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn", "Repeated by": "Ailadrodd gan", - "Register": "Cofrestrwch" + "Register": "Cofrestrwch", + "Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe" } diff --git a/translations/de.json b/translations/de.json index cae215688..cb3ee15b2 100644 --- a/translations/de.json +++ b/translations/de.json @@ -515,5 +515,6 @@ "Show who liked this post": "Zeigen, wem dieser Beitrag gefallen hat", 
"Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat", "Repeated by": "Wiederholt von", - "Register": "Registrieren" + "Register": "Registrieren", + "Web Crawlers Allowed": "Webcrawler erlaubt" } diff --git a/translations/en.json b/translations/en.json index b0966cbb8..6391accd0 100644 --- a/translations/en.json +++ b/translations/en.json @@ -515,5 +515,6 @@ "Show who liked this post": "Show who liked this post", "Show who repeated this post": "Show who repeated this post", "Repeated by": "Repeated by", - "Register": "Register" + "Register": "Register", + "Web Crawlers Allowed": "Web Crawlers Allowed" } diff --git a/translations/es.json b/translations/es.json index 6a4695c4d..e91eb9d20 100644 --- a/translations/es.json +++ b/translations/es.json @@ -515,5 +515,6 @@ "Show who liked this post": "Mostrar a quién le gustó esta publicación", "Show who repeated this post": "Mostrar quién repitió esta publicación", "Repeated by": "Repetido por", - "Register": "Registrarse" + "Register": "Registrarse", + "Web Crawlers Allowed": "Rastreadores web permitidos" } diff --git a/translations/fr.json b/translations/fr.json index 68c8258f8..429016e58 100644 --- a/translations/fr.json +++ b/translations/fr.json @@ -515,5 +515,6 @@ "Show who liked this post": "Montrer qui a aimé ce post", "Show who repeated this post": "Montrer qui a répété ce post", "Repeated by": "Répété par", - "Register": "S'inscrire" + "Register": "S'inscrire", + "Web Crawlers Allowed": "Robots d'exploration Web autorisés" } diff --git a/translations/ga.json b/translations/ga.json index 04e2ccf53..0589b6d87 100644 --- a/translations/ga.json +++ b/translations/ga.json @@ -515,5 +515,6 @@ "Show who liked this post": "Taispeáin cé a thaitin an postáil seo", "Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís", "Repeated by": "Arís agus arís eile ag", - "Register": "Clár" + "Register": "Clár", + "Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe" } diff --git a/translations/hi.json b/translations/hi.json index be544e035..d1ac37dee 100644 --- a/translations/hi.json +++ b/translations/hi.json @@ -515,5 +515,6 @@ "Show who liked this post": "दिखाएँ कि इस पोस्ट को किसने पसंद किया", "Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया", "Repeated by": "द्वारा दोहराया गया", - "Register": "रजिस्टर करें" + "Register": "रजिस्टर करें", + "Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है" } diff --git a/translations/it.json b/translations/it.json index 1935c8d61..3c0c311ba 100644 --- a/translations/it.json +++ b/translations/it.json @@ -515,5 +515,6 @@ "Show who liked this post": "Mostra a chi è piaciuto questo post", "Show who repeated this post": "Mostra chi ha ripetuto questo post", "Repeated by": "Ripetuto da", - "Register": "Registrati" + "Register": "Registrati", + "Web Crawlers Allowed": "Web crawler consentiti" } diff --git a/translations/ja.json b/translations/ja.json index 984ba4109..fb3f075a3 100644 --- a/translations/ja.json +++ b/translations/ja.json @@ -515,5 +515,6 @@ "Show who liked this post": "この投稿を高く評価した人を表示する", "Show who repeated this post": "この投稿を繰り返した人を表示する", "Repeated by": "によって繰り返される", - "Register": "登録" + "Register": "登録", + "Web Crawlers Allowed": "許可されるWebクローラー" } diff --git a/translations/ko.json b/translations/ko.json index a01cb53d7..19d6a6b26 100644 --- a/translations/ko.json +++ b/translations/ko.json @@ -515,5 +515,6 @@ "Show who liked this post": "이 포스트를 좋아한 사람 표시", "Show who repeated this post": "이 포스트를 반복한 사람 표시", "Repeated by": "반복한 사람", - "Register": 
"등록" + "Register": "등록", + "Web Crawlers Allowed": "웹 크롤러 허용" } diff --git a/translations/ku.json b/translations/ku.json index 26ed875a2..f55c059cf 100644 --- a/translations/ku.json +++ b/translations/ku.json @@ -515,5 +515,6 @@ "Show who liked this post": "Nîşan bide kê ev post eciband", "Show who repeated this post": "Nîşan bide kê ev post dubare kiriye", "Repeated by": "Ji hêla dubare kirin", - "Register": "Fêhrist" + "Register": "Fêhrist", + "Web Crawlers Allowed": "Crawlers Web Destûrdar in" } diff --git a/translations/oc.json b/translations/oc.json index 28141a22d..c5b280708 100644 --- a/translations/oc.json +++ b/translations/oc.json @@ -511,5 +511,6 @@ "Show who liked this post": "Show who liked this post", "Show who repeated this post": "Show who repeated this post", "Repeated by": "Repeated by", - "Register": "Register" + "Register": "Register", + "Web Crawlers Allowed": "Web Crawlers Allowed" } diff --git a/translations/pl.json b/translations/pl.json index 5b7402616..bc96da46c 100644 --- a/translations/pl.json +++ b/translations/pl.json @@ -515,5 +515,6 @@ "Show who liked this post": "Pokaż, kto polubił ten post", "Show who repeated this post": "Pokaż, kto powtórzył ten post", "Repeated by": "Powtórzone przez", - "Register": "Zarejestrować" + "Register": "Zarejestrować", + "Web Crawlers Allowed": "Dozwolone roboty sieciowe" } diff --git a/translations/pt.json b/translations/pt.json index ff9fe2276..03cd5e5aa 100644 --- a/translations/pt.json +++ b/translations/pt.json @@ -515,5 +515,6 @@ "Show who liked this post": "Mostrar quem gostou deste post", "Show who repeated this post": "Mostrar quem repetiu esta postagem", "Repeated by": "Repetido por", - "Register": "Registro" + "Register": "Registro", + "Web Crawlers Allowed": "Rastreadores da Web permitidos" } diff --git a/translations/ru.json b/translations/ru.json index 253bc985c..762a7d5cc 100644 --- a/translations/ru.json +++ b/translations/ru.json @@ -515,5 +515,6 @@ "Show who liked this post": "Показать, кому понравился этот пост", "Show who repeated this post": "Показать, кто повторил этот пост", "Repeated by": "Повторено", - "Register": "регистр" + "Register": "регистр", + "Web Crawlers Allowed": "Веб-сканеры разрешены" } diff --git a/translations/sw.json b/translations/sw.json index e0e1ea758..4be3a608f 100644 --- a/translations/sw.json +++ b/translations/sw.json @@ -515,5 +515,6 @@ "Show who liked this post": "Onyesha ni nani aliyependa chapisho hili", "Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili", "Repeated by": "Imerudiwa na", - "Register": "Sajili" + "Register": "Sajili", + "Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa" } diff --git a/translations/uk.json b/translations/uk.json index 9e36c4a4c..32c2407e9 100644 --- a/translations/uk.json +++ b/translations/uk.json @@ -515,5 +515,6 @@ "Show who liked this post": "Покажіть, кому сподобався цей пост", "Show who repeated this post": "Покажіть, хто повторив цей пост", "Repeated by": "Повторюється за", - "Register": "Реєстрація" + "Register": "Реєстрація", + "Web Crawlers Allowed": "Веб-сканери дозволені" } diff --git a/translations/zh.json b/translations/zh.json index b469048bc..784009f13 100644 --- a/translations/zh.json +++ b/translations/zh.json @@ -515,5 +515,6 @@ "Show who liked this post": "显示谁喜欢这篇文章", "Show who repeated this post": "显示谁重复了这篇文章", "Repeated by": "重复", - "Register": "登记" + "Register": "登记", + "Web Crawlers Allowed": "允许网络爬虫" } diff --git a/webapp_profile.py b/webapp_profile.py index 8faaf5cba..f57dc07d7 100644 --- 
a/webapp_profile.py
+++ b/webapp_profile.py
@@ -1631,6 +1631,7 @@ def _html_edit_profile_shared_items(base_dir: str, nickname: str, domain: str,
 
 def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                                  user_agents_blocked: str,
+                                 crawlers_allowed: str,
                                  translate: {}, reply_interval_hours: int,
                                  cw_lists: {}, lists_enabled: str) -> str:
     """Filtering and blocking section of edit profile screen
@@ -1807,6 +1808,16 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                        'userAgentsBlockedStr', user_agents_blocked_str,
                        200, '', False)
 
+    crawlers_allowed_str = ''
+    for uagent in crawlers_allowed:
+        if crawlers_allowed_str:
+            crawlers_allowed_str += '\n'
+        crawlers_allowed_str += uagent
+    edit_profile_form += \
+        edit_text_area(translate['Web Crawlers Allowed'],
+                       'crawlersAllowedStr', crawlers_allowed_str,
+                       200, '', False)
+
     cw_lists_str = ''
     for name, _ in cw_lists.items():
         variablename = get_cw_list_variable(name)
@@ -2137,7 +2148,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str,
                       default_timeline: str, theme: str,
                       peertube_instances: [],
                       text_mode_banner: str, city: str,
-                      user_agents_blocked: str,
+                      user_agents_blocked: [],
+                      crawlers_allowed: [],
                       access_keys: {},
                       default_reply_interval_hrs: int,
                       cw_lists: {}, lists_enabled: str) -> str:
@@ -2354,8 +2366,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str,
                                           default_reply_interval_hrs)
     edit_profile_form += \
         _html_edit_profile_filtering(base_dir, nickname, domain,
-                                     user_agents_blocked, translate,
-                                     reply_interval_hours,
+                                     user_agents_blocked, crawlers_allowed,
+                                     translate, reply_interval_hours,
                                      cw_lists, lists_enabled)
 
     # git projects section

From 73be47e80f0d2c93e9755a0c41267f66aa2658cc Mon Sep 17 00:00:00 2001
From: Bob Mottram
Date: Sun, 6 Mar 2022 13:07:25 +0000
Subject: [PATCH 5/5] Note about web crawlers

---
 README_commandline.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README_commandline.md b/README_commandline.md
index 27e57c1ad..1c8a75e26 100644
--- a/README_commandline.md
+++ b/README_commandline.md
@@ -388,3 +388,15 @@ The CalDav endpoint for an account is:
 ```bash
 yourdomain/calendars/yournick
 ```
+
+## Web Crawlers
+
+Having search engines index social media posts is not usually considered appropriate, since even nominally "public" posts may contain personally identifiable information. If you are running a news instance then web crawlers are permitted, so that news posts can be indexed by search engines; otherwise they are blocked by default.
+
+To allow specific web crawlers, use the **crawlersAllowed** option when running the daemon (typically via systemd). It takes a list of bot names, separated by commas. For example:
+
+```bash
+--crawlersAllowed "googlebot, apple"
+```
+
+Crawler names typically end in "bot", but partial names can also be used and matching is case insensitive.
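
As a rough illustration of how the crawler allow list in these patches behaves, here is a minimal sketch (not part of the patch series). It assumes the Epicyon source directory is on the import path so that `crawlers.py` and its dependencies can be imported; the user-agent strings, calling domain and base directory below are made-up examples.

```python
# Sketch: exercise blocked_user_agent() from crawlers.py with two sample
# user agents. Both contain "bot/", so both are treated as crawlers; only
# the one matching an entry in crawlers_allowed gets through.
from crawlers import blocked_user_agent

crawlers_allowed = ['googlebot', 'apple']   # as set via --crawlersAllowed
user_agents_blocked = []                    # no per-instance UA blocks

sample_agents = [
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (compatible; SomeOtherBot/1.0; +https://crawler.example)'
]

for ua_str in sample_agents:
    blocked, _ = blocked_user_agent('epicyon.example', ua_str,
                                    False,               # news_instance
                                    False,               # debug
                                    user_agents_blocked,
                                    0,                   # blocked_cache_last_updated
                                    '/var/www/epicyon',  # base_dir (example path)
                                    [],                  # blocked_cache
                                    120,                 # blocked_cache_update_secs
                                    crawlers_allowed)
    print(ua_str + ' -> ' + ('blocked' if blocked else 'allowed'))
```

With `news_instance=True` both agents would be allowed through, since a news instance is intended to be indexed by search engines; non-crawler user agents are instead checked against the blocked user agent list and blocked domains.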