diff --git a/crawlers.py b/crawlers.py
index 6ec5c43d0..952f3ffdd 100644
--- a/crawlers.py
+++ b/crawlers.py
@@ -7,6 +7,7 @@ __email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
+import os
import time
from utils import save_json
from utils import user_agent_domain
@@ -51,6 +52,51 @@ def update_known_crawlers(ua_str: str,
return curr_time
+def load_known_web_bots(base_dir: str) -> []:
+    """Returns the list of known web bots, read from
+    accounts/knownBots.txt which has one user agent per line.
+    Blank lines and duplicates are dropped, keeping first-seen order.
+    An empty list is returned if the file is missing or unreadable
+    """
+    known_bots_filename = base_dir + '/accounts/knownBots.txt'
+    if not os.path.isfile(known_bots_filename):
+        return []
+    crawlers_str = None
+    try:
+        # explicit encoding, so that loading doesn't depend upon locale
+        with open(known_bots_filename, 'r',
+                  encoding='utf-8') as fp_crawlers:
+            crawlers_str = fp_crawlers.read()
+    except (OSError, UnicodeDecodeError):
+        print('EX: unable to load web bots from ' +
+              known_bots_filename)
+    if not crawlers_str:
+        return []
+    known_bots = []
+    for crawler in crawlers_str.split('\n'):
+        crawler = crawler.strip()
+        if crawler and crawler not in known_bots:
+            known_bots.append(crawler)
+    return known_bots
+
+
+def _save_known_web_bots(base_dir: str, known_bots: []) -> bool:
+    """Saves a list of known web bots to accounts/knownBots.txt as
+    utf-8 text, one per line. Returns False if the file can't be written
+    """
+    known_bots_filename = base_dir + '/accounts/knownBots.txt'
+    known_bots_str = ''.join(bot.strip() + '\n' for bot in known_bots)
+    try:
+        with open(known_bots_filename, 'w',
+                  encoding='utf-8') as fp_crawlers:
+            fp_crawlers.write(known_bots_str)
+    except OSError:
+        print("EX: unable to save known web bots to " +
+              known_bots_filename)
+        return False
+    return True
+
+
def blocked_user_agent(calling_domain: str, agent_str: str,
news_instance: bool, debug: bool,
user_agents_blocked: [],
@@ -58,7 +104,8 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
base_dir: str,
blocked_cache: [],
blocked_cache_update_secs: int,
- crawlers_allowed: []):
+ crawlers_allowed: [],
+ known_bots: []):
"""Should a GET or POST be blocked based upon its user agent?
"""
if not agent_str:
@@ -73,8 +120,13 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
agent_domain = None
if agent_str:
- # is this a web crawler? If so the block it
+ # is this a web crawler? If so then block it by default
+ # unless this is a news instance or if it is in the allowed list
if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
+ if agent_str_lower not in known_bots:
+ known_bots.append(agent_str_lower)
+ known_bots.sort()
+ _save_known_web_bots(base_dir, known_bots)
# if this is a news instance then we want it
# to be indexed by search engines
if news_instance:
diff --git a/daemon.py b/daemon.py
index 0d0f5c4b2..cd615e3f9 100644
--- a/daemon.py
+++ b/daemon.py
@@ -380,6 +380,7 @@ from siteactive import referer_is_active
from webapp_likers import html_likers_of_post
from crawlers import update_known_crawlers
from crawlers import blocked_user_agent
+from crawlers import load_known_web_bots
import os
@@ -14008,7 +14009,8 @@ class PubServer(BaseHTTPRequestHandler):
self.server.base_dir,
self.server.blocked_cache,
self.server.blocked_cache_update_secs,
- self.server.crawlers_allowed)
+ self.server.crawlers_allowed,
+ self.server.known_bots)
if block:
self._400()
return
@@ -14185,6 +14187,24 @@ class PubServer(BaseHTTPRequestHandler):
'_GET', 'isAuthorized',
self.server.debug)
+ if authorized and self.path.endswith('/bots.txt'):
+ known_bots_str = ''
+ for bot_name in self.server.known_bots:
+ known_bots_str += bot_name + '\n'
+ # TODO
+ msg = known_bots_str.encode('utf-8')
+ msglen = len(msg)
+ self._set_headers('text/plain; charset=utf-8',
+ msglen, None, calling_domain, True)
+ self._write(msg)
+ if self.server.debug:
+ print('Sent known bots: ' +
+ self.server.path + ' ' + calling_domain)
+ fitness_performance(getreq_start_time, self.server.fitness,
+ '_GET', 'get_known_bots',
+ self.server.debug)
+ return
+
# shared items catalog for this instance
# this is only accessible to instance members or to
# other instances which present an authorization token
@@ -18550,7 +18570,8 @@ class PubServer(BaseHTTPRequestHandler):
self.server.base_dir,
self.server.blocked_cache,
self.server.blocked_cache_update_secs,
- self.server.crawlers_allowed)
+ self.server.crawlers_allowed,
+ self.server.known_bots)
if block:
self._400()
self.server.postreq_busy = False
@@ -19666,6 +19687,9 @@ def run_daemon(crawlers_allowed: [],
# list of crawler bots permitted within the User-Agent header
httpd.crawlers_allowed = crawlers_allowed
+ # list of web crawlers known to the system
+ httpd.known_bots = load_known_web_bots(base_dir)
+
httpd.unit_test = unit_test
httpd.allow_local_network_access = allow_local_network_access
if unit_test:
diff --git a/translations/ar.json b/translations/ar.json
index 69bb8642f..25d3a03c4 100644
--- a/translations/ar.json
+++ b/translations/ar.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "أظهر من كرر هذا المنصب",
"Repeated by": "يتكرر بواسطة",
"Register": "يسجل",
- "Web Crawlers Allowed": "برامج زحف الويب المسموح بها"
+ "Web Bots Allowed": "مسموح روبوتات الويب",
+ "Known Search Bots": "روبوتات بحث الويب المعروفة"
}
diff --git a/translations/ca.json b/translations/ca.json
index 3dc675148..839c0035d 100644
--- a/translations/ca.json
+++ b/translations/ca.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Mostra qui ha repetit aquesta publicació",
"Repeated by": "Repetit per",
"Register": "Registra't",
- "Web Crawlers Allowed": "Es permeten rastrejadors web"
+ "Web Bots Allowed": "Bots web permesos",
+ "Known Search Bots": "Bots de cerca web coneguts"
}
diff --git a/translations/cy.json b/translations/cy.json
index 8ce0051e9..7567962b7 100644
--- a/translations/cy.json
+++ b/translations/cy.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn",
"Repeated by": "Ailadrodd gan",
"Register": "Cofrestrwch",
- "Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe"
+ "Web Bots Allowed": "Web Bots a Ganiateir",
+ "Known Search Bots": "Bots Chwilio Gwe Hysbys"
}
diff --git a/translations/de.json b/translations/de.json
index cb3ee15b2..e8a88b053 100644
--- a/translations/de.json
+++ b/translations/de.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat",
"Repeated by": "Wiederholt von",
"Register": "Registrieren",
- "Web Crawlers Allowed": "Webcrawler erlaubt"
+ "Web Bots Allowed": "Webbots erlaubt",
+ "Known Search Bots": "Bekannte Bots für die Websuche"
}
diff --git a/translations/en.json b/translations/en.json
index 6391accd0..76758fb36 100644
--- a/translations/en.json
+++ b/translations/en.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Show who repeated this post",
"Repeated by": "Repeated by",
"Register": "Register",
- "Web Crawlers Allowed": "Web Crawlers Allowed"
+ "Web Bots Allowed": "Web Search Bots Allowed",
+ "Known Search Bots": "Known Web Search Bots"
}
diff --git a/translations/es.json b/translations/es.json
index e91eb9d20..aa88cfafd 100644
--- a/translations/es.json
+++ b/translations/es.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Mostrar quién repitió esta publicación",
"Repeated by": "Repetido por",
"Register": "Registrarse",
- "Web Crawlers Allowed": "Rastreadores web permitidos"
+ "Web Bots Allowed": "Bots web permitidos",
+ "Known Search Bots": "Bots de búsqueda web conocidos"
}
diff --git a/translations/fr.json b/translations/fr.json
index 429016e58..71ce41fb1 100644
--- a/translations/fr.json
+++ b/translations/fr.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Montrer qui a répété ce post",
"Repeated by": "Répété par",
"Register": "S'inscrire",
- "Web Crawlers Allowed": "Robots d'exploration Web autorisés"
+ "Web Bots Allowed": "Robots Web autorisés",
+ "Known Search Bots": "Robots de recherche Web connus"
}
diff --git a/translations/ga.json b/translations/ga.json
index 0589b6d87..597385634 100644
--- a/translations/ga.json
+++ b/translations/ga.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís",
"Repeated by": "Arís agus arís eile ag",
"Register": "Clár",
- "Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe"
+ "Web Bots Allowed": "Róbónna Gréasáin Ceadaithe",
+ "Known Search Bots": "Róbónna Cuardach Gréasáin Aitheanta"
}
diff --git a/translations/hi.json b/translations/hi.json
index d1ac37dee..348f31bdb 100644
--- a/translations/hi.json
+++ b/translations/hi.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया",
"Repeated by": "द्वारा दोहराया गया",
"Register": "रजिस्टर करें",
- "Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है"
+ "Web Bots Allowed": "वेब बॉट्स की अनुमति है",
+ "Known Search Bots": "ज्ञात वेब खोज बॉट्स"
}
diff --git a/translations/it.json b/translations/it.json
index 3c0c311ba..f66199075 100644
--- a/translations/it.json
+++ b/translations/it.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Mostra chi ha ripetuto questo post",
"Repeated by": "Ripetuto da",
"Register": "Registrati",
- "Web Crawlers Allowed": "Web crawler consentiti"
+ "Web Bots Allowed": "Web bot consentiti",
+ "Known Search Bots": "Bot di ricerca Web noti"
}
diff --git a/translations/ja.json b/translations/ja.json
index fb3f075a3..b524984d9 100644
--- a/translations/ja.json
+++ b/translations/ja.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "この投稿を繰り返した人を表示する",
"Repeated by": "によって繰り返される",
"Register": "登録",
- "Web Crawlers Allowed": "許可されるWebクローラー"
+ "Web Bots Allowed": "許可されたWebボット",
+ "Known Search Bots": "既知のWeb検索ボット"
}
diff --git a/translations/ko.json b/translations/ko.json
index 19d6a6b26..5df692faa 100644
--- a/translations/ko.json
+++ b/translations/ko.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "이 포스트를 반복한 사람 표시",
"Repeated by": "반복한 사람",
"Register": "등록",
- "Web Crawlers Allowed": "웹 크롤러 허용"
+ "Web Bots Allowed": "웹 봇 허용",
+ "Known Search Bots": "알려진 웹 검색 봇"
}
diff --git a/translations/ku.json b/translations/ku.json
index f55c059cf..2a94f25de 100644
--- a/translations/ku.json
+++ b/translations/ku.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Nîşan bide kê ev post dubare kiriye",
"Repeated by": "Ji hêla dubare kirin",
"Register": "Fêhrist",
- "Web Crawlers Allowed": "Crawlers Web Destûrdar in"
+ "Web Bots Allowed": "Web Bots Destûrdar in",
+ "Known Search Bots": "Botên Lêgerîna Webê yên naskirî"
}
diff --git a/translations/oc.json b/translations/oc.json
index c5b280708..fc4a357a6 100644
--- a/translations/oc.json
+++ b/translations/oc.json
@@ -512,5 +512,6 @@
"Show who repeated this post": "Show who repeated this post",
"Repeated by": "Repeated by",
"Register": "Register",
- "Web Crawlers Allowed": "Web Crawlers Allowed"
+ "Web Bots Allowed": "Web Search Bots Allowed",
+ "Known Search Bots": "Known Web Search Bots"
}
diff --git a/translations/pl.json b/translations/pl.json
index bc96da46c..301c70145 100644
--- a/translations/pl.json
+++ b/translations/pl.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Pokaż, kto powtórzył ten post",
"Repeated by": "Powtórzone przez",
"Register": "Zarejestrować",
- "Web Crawlers Allowed": "Dozwolone roboty sieciowe"
+ "Web Bots Allowed": "Dozwolone boty internetowe",
+ "Known Search Bots": "Znane boty wyszukiwania w sieci"
}
diff --git a/translations/pt.json b/translations/pt.json
index 03cd5e5aa..4c8e848cc 100644
--- a/translations/pt.json
+++ b/translations/pt.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Mostrar quem repetiu esta postagem",
"Repeated by": "Repetido por",
"Register": "Registro",
- "Web Crawlers Allowed": "Rastreadores da Web permitidos"
+ "Web Bots Allowed": "Webbots permitidos",
+ "Known Search Bots": "Bots de pesquisa na Web conhecidos"
}
diff --git a/translations/ru.json b/translations/ru.json
index 762a7d5cc..a5f545bcd 100644
--- a/translations/ru.json
+++ b/translations/ru.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Показать, кто повторил этот пост",
"Repeated by": "Повторено",
"Register": "регистр",
- "Web Crawlers Allowed": "Веб-сканеры разрешены"
+ "Web Bots Allowed": "Веб-боты разрешены",
+ "Known Search Bots": "Известные боты веб-поиска"
}
diff --git a/translations/sw.json b/translations/sw.json
index 4be3a608f..bda0f5299 100644
--- a/translations/sw.json
+++ b/translations/sw.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili",
"Repeated by": "Imerudiwa na",
"Register": "Sajili",
- "Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa"
+ "Web Bots Allowed": "Mtandao wa Boti Unaruhusiwa",
+ "Known Search Bots": "Vijibu vya Utafutaji wa Wavuti vinavyojulikana"
}
diff --git a/translations/uk.json b/translations/uk.json
index 32c2407e9..2f1d65fc4 100644
--- a/translations/uk.json
+++ b/translations/uk.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "Покажіть, хто повторив цей пост",
"Repeated by": "Повторюється за",
"Register": "Реєстрація",
- "Web Crawlers Allowed": "Веб-сканери дозволені"
+ "Web Bots Allowed": "Веб-боти дозволені",
+ "Known Search Bots": "Відомі пошукові роботи в Інтернеті"
}
diff --git a/translations/zh.json b/translations/zh.json
index 784009f13..280ce1fcf 100644
--- a/translations/zh.json
+++ b/translations/zh.json
@@ -516,5 +516,6 @@
"Show who repeated this post": "显示谁重复了这篇文章",
"Repeated by": "重复",
"Register": "登记",
- "Web Crawlers Allowed": "允许网络爬虫"
+ "Web Bots Allowed": "允许网络机器人",
+ "Known Search Bots": "已知的网络搜索机器人"
}
diff --git a/webapp_profile.py b/webapp_profile.py
index f57dc07d7..713190bc8 100644
--- a/webapp_profile.py
+++ b/webapp_profile.py
@@ -1808,13 +1808,17 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
'userAgentsBlockedStr', user_agents_blocked_str,
200, '', False)
+    edit_profile_form += \
+        '<a href="/users/' + nickname + '/bots.txt">' + \
+        translate['Known Search Bots'] + '</a><br>\n'
+
crawlers_allowed_str = ''
for uagent in crawlers_allowed:
if crawlers_allowed_str:
crawlers_allowed_str += '\n'
crawlers_allowed_str += uagent
edit_profile_form += \
- edit_text_area(translate['Web Crawlers Allowed'],
+ edit_text_area(translate['Web Bots Allowed'],
'crawlersAllowedStr', crawlers_allowed_str,
200, '', False)