diff --git a/daemon.py b/daemon.py index 731bc8512..fcfb9172a 100644 --- a/daemon.py +++ b/daemon.py @@ -392,6 +392,36 @@ def saveDomainQrcode(baseDir: str, httpPrefix: str, class PubServer(BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' + def _updateKnownCrawlers(self, uaStr: str) -> None: + """Updates a dictionary of known crawlers accessing nodeinfo + or the masto API + """ + if not uaStr: + return + + currTime = int(time.time()) + if self.server.knownCrawlers.get(uaStr): + self.server.knownCrawlers[uaStr]['hits'] += 1 + self.server.knownCrawlers[uaStr]['lastseen'] = currTime + else: + self.server.knownCrawlers[uaStr] = { + "lastseen": currTime, + "hits": 1 + } + + if currTime - self.server.lastKnownCrawler >= 30: + # remove any old observations + removeCrawlers = [] + for ua, item in self.server.knownCrawlers.items(): + if currTime - item['lastseen'] >= 60 * 60 * 24 * 30: + removeCrawlers.append(ua) + for ua in removeCrawlers: + del self.server.knownCrawlers[ua] + # save the list of crawlers + saveJson(self.server.knownCrawlers, + self.server.baseDir + '/accounts/knownCrawlers.json') + self.server.lastKnownCrawler = currTime + def _getInstanceUrl(self, callingDomain: str) -> str: """Returns the URL for this instance """ @@ -520,11 +550,22 @@ class PubServer(BaseHTTPRequestHandler): def _blockedUserAgent(self, callingDomain: str, agentStr: str) -> bool: """Should a GET or POST be blocked based upon its user agent? """ + if not agentStr: + return False + + agentStrLower = agentStr.lower() + defaultAgentBlocks = [ + 'fedilist.com' + ] + for uaBlock in defaultAgentBlocks: + if uaBlock in agentStrLower: + print('Blocked User agent: ' + uaBlock) + return True + agentDomain = None if agentStr: # is this a web crawler? If so the block it - agentStrLower = agentStr.lower() if 'bot/' in agentStrLower or 'bot-' in agentStrLower: if self.server.newsInstance: return False @@ -969,6 +1010,7 @@ class PubServer(BaseHTTPRequestHandler): return False def _mastoApiV1(self, path: str, callingDomain: str, + uaStr: str, authorized: bool, httpPrefix: str, baseDir: str, nickname: str, domain: str, @@ -989,10 +1031,12 @@ class PubServer(BaseHTTPRequestHandler): print('mastodon api v1: ' + path) print('mastodon api v1: authorized ' + str(authorized)) print('mastodon api v1: nickname ' + str(nickname)) + self._updateKnownCrawlers(uaStr) brochMode = brochModeIsActive(baseDir) sendJson, sendJsonStr = mastoApiV1Response(path, callingDomain, + uaStr, authorized, httpPrefix, baseDir, @@ -1031,6 +1075,7 @@ class PubServer(BaseHTTPRequestHandler): return True def _mastoApi(self, path: str, callingDomain: str, + uaStr: str, authorized: bool, httpPrefix: str, baseDir: str, nickname: str, domain: str, domainFull: str, @@ -1041,18 +1086,19 @@ class PubServer(BaseHTTPRequestHandler): projectVersion: str, customEmoji: [], showNodeInfoAccounts: bool) -> bool: - return self._mastoApiV1(path, callingDomain, authorized, + return self._mastoApiV1(path, callingDomain, uaStr, authorized, httpPrefix, baseDir, nickname, domain, domainFull, onionDomain, i2pDomain, translate, registration, systemLanguage, projectVersion, customEmoji, showNodeInfoAccounts) - def _nodeinfo(self, callingDomain: str) -> bool: + def _nodeinfo(self, uaStr: str, callingDomain: str) -> bool: if not self.path.startswith('/nodeinfo/2.0'): return False if self.server.debug: print('DEBUG: nodeinfo ' + self.path) + self._updateKnownCrawlers(uaStr) # If we are in broch mode then don't show potentially # sensitive metadata. @@ -1091,7 +1137,7 @@ class PubServer(BaseHTTPRequestHandler): self._set_headers('application/ld+json', msglen, None, callingDomain, True) self._write(msg) - print('nodeinfo sent') + print('nodeinfo sent to ' + callingDomain) return True self._404() return True @@ -11819,6 +11865,36 @@ class PubServer(BaseHTTPRequestHandler): return True return False + def _showKnownCrawlers(self, callingDomain: str, path: str, + baseDir: str, knownCrawlers: {}) -> bool: + """Show a list of known web crawlers + """ + if '/users/' not in path: + return False + if not path.endswith('/crawlers'): + return False + nickname = getNicknameFromActor(path) + if not nickname: + return False + if not isModerator(baseDir, nickname): + return False + crawlersList = [] + currTime = int(time.time()) + recentCrawlers = 60 * 60 * 24 * 30 + for uaStr, item in knownCrawlers.items(): + if item['lastseen'] - currTime < recentCrawlers: + crawlersList.append(str(item['hits']) + ' ' + uaStr) + crawlersList.sort(reverse=True) + msg = '' + for lineStr in crawlersList: + msg += lineStr + '\n' + msg = msg.encode('utf-8') + msglen = len(msg) + self._set_headers('text/plain; charset=utf-8', msglen, + None, callingDomain, True) + self._write(msg) + return True + def _editProfile(self, callingDomain: str, path: str, translate: {}, baseDir: str, httpPrefix: str, domain: str, port: int, @@ -12113,7 +12189,7 @@ class PubServer(BaseHTTPRequestHandler): # Since fediverse crawlers are quite active, # make returning info to them high priority # get nodeinfo endpoint - if self._nodeinfo(callingDomain): + if self._nodeinfo(uaStr, callingDomain): return fitnessPerformance(GETstartTime, self.server.fitness, @@ -12446,7 +12522,8 @@ class PubServer(BaseHTTPRequestHandler): return # minimal mastodon api - if self._mastoApi(self.path, callingDomain, authorized, + if self._mastoApi(self.path, callingDomain, uaStr, + authorized, self.server.httpPrefix, self.server.baseDir, self.authorizedNickname, @@ -14349,6 +14426,12 @@ class PubServer(BaseHTTPRequestHandler): self.server.GETbusy = False return + # list of known crawlers accessing nodeinfo or masto API + if self._showKnownCrawlers(callingDomain, self.path, + self.server.baseDir, + self.server.knownCrawlers): + return + # edit profile in web interface if self._editProfile(callingDomain, self.path, self.server.translate, @@ -17302,6 +17385,15 @@ def runDaemon(listsEnabled: str, createNewsInbox(baseDir, domain, port, httpPrefix) setConfigParam(baseDir, "listsEnabled", "Murdoch press") + # dict of known web crawlers accessing nodeinfo or the masto API + # and how many times they have been seen + httpd.knownCrawlers = {} + knownCrawlersFilename = baseDir + '/accounts/knownCrawlers.json' + if os.path.isfile(knownCrawlersFilename): + httpd.knownCrawlers = loadJson(knownCrawlersFilename) + # when was the last crawler seen? + httpd.lastKnownCrawler = 0 + if listsEnabled: httpd.listsEnabled = listsEnabled else: diff --git a/mastoapiv1.py b/mastoapiv1.py index 18722dfce..9661b31d5 100644 --- a/mastoapiv1.py +++ b/mastoapiv1.py @@ -82,6 +82,7 @@ def _getMastoApiV1Account(baseDir: str, nickname: str, domain: str) -> {}: def mastoApiV1Response(path: str, callingDomain: str, + uaStr: str, authorized: bool, httpPrefix: str, baseDir: str, nickname: str, domain: str, @@ -100,12 +101,18 @@ def mastoApiV1Response(path: str, callingDomain: str, """ sendJson = None sendJsonStr = '' + if not uaStr: + uaStr = '' # parts of the api needing authorization if authorized and nickname: if path == '/api/v1/accounts/verify_credentials': sendJson = _getMastoApiV1Account(baseDir, nickname, domain) - sendJsonStr = 'masto API account sent for ' + nickname + sendJsonStr = \ + 'masto API account sent for ' + nickname + ' ' + uaStr + + # information about where the request is coming from + callingInfo = ' ' + uaStr + ', ' + callingDomain # Parts of the api which don't need authorization mastoId = _getMastApiV1Id(path) @@ -121,57 +128,73 @@ def mastoApiV1Response(path: str, callingDomain: str, path = path.split('?')[0] if path.endswith('/followers'): sendJson = [] - sendJsonStr = 'masto API followers sent for ' + nickname + sendJsonStr = \ + 'masto API followers sent for ' + nickname + \ + callingInfo elif path.endswith('/following'): sendJson = [] - sendJsonStr = 'masto API following sent for ' + nickname + sendJsonStr = \ + 'masto API following sent for ' + nickname + \ + callingInfo elif path.endswith('/statuses'): sendJson = [] - sendJsonStr = 'masto API statuses sent for ' + nickname + sendJsonStr = \ + 'masto API statuses sent for ' + nickname + \ + callingInfo elif path.endswith('/search'): sendJson = [] - sendJsonStr = 'masto API search sent ' + originalPath + sendJsonStr = \ + 'masto API search sent ' + originalPath + \ + callingInfo elif path.endswith('/relationships'): sendJson = [] sendJsonStr = \ - 'masto API relationships sent ' + originalPath + 'masto API relationships sent ' + originalPath + \ + callingInfo else: sendJson = \ _getMastoApiV1Account(baseDir, pathNickname, domain) - sendJsonStr = 'masto API account sent for ' + nickname + sendJsonStr = \ + 'masto API account sent for ' + nickname + \ + callingInfo # NOTE: adding support for '/api/v1/directory seems to create # federation problems, so avoid implementing that if path.startswith('/api/v1/blocks'): sendJson = [] - sendJsonStr = 'masto API instance blocks sent ' + path + sendJsonStr = \ + 'masto API instance blocks sent ' + path + callingInfo elif path.startswith('/api/v1/favorites'): sendJson = [] - sendJsonStr = 'masto API favorites sent ' + path + sendJsonStr = 'masto API favorites sent ' + path + callingInfo elif path.startswith('/api/v1/follow_requests'): sendJson = [] - sendJsonStr = 'masto API follow requests sent ' + path + sendJsonStr = \ + 'masto API follow requests sent ' + path + callingInfo elif path.startswith('/api/v1/mutes'): sendJson = [] - sendJsonStr = 'masto API mutes sent ' + path + sendJsonStr = \ + 'masto API mutes sent ' + path + callingInfo elif path.startswith('/api/v1/notifications'): sendJson = [] - sendJsonStr = 'masto API notifications sent ' + path + sendJsonStr = \ + 'masto API notifications sent ' + path + callingInfo elif path.startswith('/api/v1/reports'): sendJson = [] - sendJsonStr = 'masto API reports sent ' + path + sendJsonStr = 'masto API reports sent ' + path + callingInfo elif path.startswith('/api/v1/statuses'): sendJson = [] - sendJsonStr = 'masto API statuses sent ' + path + sendJsonStr = 'masto API statuses sent ' + path + callingInfo elif path.startswith('/api/v1/timelines'): sendJson = { 'error': 'This method requires an authenticated user' } - sendJsonStr = 'masto API timelines sent ' + path + sendJsonStr = 'masto API timelines sent ' + path + callingInfo elif path.startswith('/api/v1/custom_emojis'): sendJson = customEmoji - sendJsonStr = 'masto API custom emojis sent ' + path + sendJsonStr = \ + 'masto API custom emojis sent ' + path + callingInfo adminNickname = getConfigParam(baseDir, 'admin') if adminNickname and path == '/api/v1/instance': @@ -208,7 +231,7 @@ def mastoApiV1Response(path: str, callingDomain: str, registration, systemLanguage, projectVersion) - sendJsonStr = 'masto API instance metadata sent' + sendJsonStr = 'masto API instance metadata sent ' + uaStr elif path.startswith('/api/v1/instance/peers'): # This is just a dummy result. # Showing the full list of peers would have privacy implications. @@ -216,8 +239,8 @@ def mastoApiV1Response(path: str, callingDomain: str, # small instances a full list of peers would convey a lot of # information about the interests of a small number of accounts sendJson = ['mastodon.social', domainFull] - sendJsonStr = 'masto API peers metadata sent' + sendJsonStr = 'masto API peers metadata sent ' + uaStr elif path.startswith('/api/v1/instance/activity'): sendJson = [] - sendJsonStr = 'masto API activity metadata sent' + sendJsonStr = 'masto API activity metadata sent ' + uaStr return sendJson, sendJsonStr diff --git a/translations/ar.json b/translations/ar.json index 2ae899465..4b46432fb 100644 --- a/translations/ar.json +++ b/translations/ar.json @@ -489,5 +489,6 @@ "Join": "انضم", "Leave": "يترك", "System Monitor": "مراقب النظام", - "Add content warnings for the following sites": "أضف تحذيرات المحتوى للمواقع التالية" + "Add content warnings for the following sites": "أضف تحذيرات المحتوى للمواقع التالية", + "Known Web Crawlers": "برامج زحف الويب المعروفة" } diff --git a/translations/ca.json b/translations/ca.json index f4861d724..59fbdc8d1 100644 --- a/translations/ca.json +++ b/translations/ca.json @@ -489,5 +489,6 @@ "Join": "Uneix-te", "Leave": "Marxa", "System Monitor": "Monitor del sistema", - "Add content warnings for the following sites": "Afegiu advertiments de contingut per als llocs següents" + "Add content warnings for the following sites": "Afegiu advertiments de contingut per als llocs següents", + "Known Web Crawlers": "Exploradors web coneguts" } diff --git a/translations/cy.json b/translations/cy.json index f25c5a27c..79339634a 100644 --- a/translations/cy.json +++ b/translations/cy.json @@ -489,5 +489,6 @@ "Join": "Ymunwch", "Leave": "Gadewch", "System Monitor": "Monitor System", - "Add content warnings for the following sites": "Ychwanegwch rybuddion cynnwys ar gyfer y gwefannau canlynol" + "Add content warnings for the following sites": "Ychwanegwch rybuddion cynnwys ar gyfer y gwefannau canlynol", + "Known Web Crawlers": "Crawlers Gwe Hysbys" } diff --git a/translations/de.json b/translations/de.json index e2760a864..9852d7703 100644 --- a/translations/de.json +++ b/translations/de.json @@ -489,5 +489,6 @@ "Join": "Verbinden", "Leave": "Verlassen", "System Monitor": "Systemmonitor", - "Add content warnings for the following sites": "Inhaltswarnungen für die folgenden Websites hinzufügen" + "Add content warnings for the following sites": "Inhaltswarnungen für die folgenden Websites hinzufügen", + "Known Web Crawlers": "Bekannte Web-Crawler" } diff --git a/translations/en.json b/translations/en.json index 4e8d3ec13..1efb610ef 100644 --- a/translations/en.json +++ b/translations/en.json @@ -489,5 +489,6 @@ "Join": "Join", "Leave": "Leave", "System Monitor": "System Monitor", - "Add content warnings for the following sites": "Add content warnings for the following sites" + "Add content warnings for the following sites": "Add content warnings for the following sites", + "Known Web Crawlers": "Known Web Crawlers" } diff --git a/translations/es.json b/translations/es.json index 67cf33835..0c88b4250 100644 --- a/translations/es.json +++ b/translations/es.json @@ -489,5 +489,6 @@ "Join": "Entrar", "Leave": "Dejar", "System Monitor": "Monitor del sistema", - "Add content warnings for the following sites": "Agregue advertencias de contenido para los siguientes sitios" + "Add content warnings for the following sites": "Agregue advertencias de contenido para los siguientes sitios", + "Known Web Crawlers": "Rastreadores web conocidos" } diff --git a/translations/fr.json b/translations/fr.json index 056a914c0..05f4284bd 100644 --- a/translations/fr.json +++ b/translations/fr.json @@ -489,5 +489,6 @@ "Join": "Rejoindre", "Leave": "Laisser", "System Monitor": "Moniteur système", - "Add content warnings for the following sites": "Ajouter des avertissements de contenu pour les sites suivants" + "Add content warnings for the following sites": "Ajouter des avertissements de contenu pour les sites suivants", + "Known Web Crawlers": "Crawlers Web connus" } diff --git a/translations/ga.json b/translations/ga.json index 73f740bcf..dc0e6f802 100644 --- a/translations/ga.json +++ b/translations/ga.json @@ -489,5 +489,6 @@ "Join": "Bí páirteach", "Leave": "Fág", "System Monitor": "Monatóir Córais", - "Add content warnings for the following sites": "Cuir rabhaidh ábhair leis na suíomhanna seo a leanas" + "Add content warnings for the following sites": "Cuir rabhaidh ábhair leis na suíomhanna seo a leanas", + "Known Web Crawlers": "Crawlers Gréasáin Aitheanta" } diff --git a/translations/hi.json b/translations/hi.json index c4e475f2d..bf52fae8b 100644 --- a/translations/hi.json +++ b/translations/hi.json @@ -489,5 +489,6 @@ "Join": "शामिल हों", "Leave": "छोड़ना", "System Monitor": "सिस्टम मॉनिटर", - "Add content warnings for the following sites": "निम्नलिखित साइटों के लिए सामग्री चेतावनियाँ जोड़ें" + "Add content warnings for the following sites": "निम्नलिखित साइटों के लिए सामग्री चेतावनियाँ जोड़ें", + "Known Web Crawlers": "ज्ञात वेब क्रॉलर" } diff --git a/translations/it.json b/translations/it.json index 4648aedae..41341a01d 100644 --- a/translations/it.json +++ b/translations/it.json @@ -489,5 +489,6 @@ "Join": "Aderire", "Leave": "Lasciare", "System Monitor": "Monitor di sistema", - "Add content warnings for the following sites": "Aggiungi avvisi sui contenuti per i seguenti siti" + "Add content warnings for the following sites": "Aggiungi avvisi sui contenuti per i seguenti siti", + "Known Web Crawlers": "Crawler Web conosciuti" } diff --git a/translations/ja.json b/translations/ja.json index 0fe46e4f4..d96250ced 100644 --- a/translations/ja.json +++ b/translations/ja.json @@ -489,5 +489,6 @@ "Join": "加入", "Leave": "離れる", "System Monitor": "システムモニター", - "Add content warnings for the following sites": "次のサイトのコンテンツ警告を追加します" + "Add content warnings for the following sites": "次のサイトのコンテンツ警告を追加します", + "Known Web Crawlers": "既知のWebクローラー" } diff --git a/translations/ku.json b/translations/ku.json index 1c4f7dd19..946d077be 100644 --- a/translations/ku.json +++ b/translations/ku.json @@ -489,5 +489,6 @@ "Join": "Bihevgirêdan", "Leave": "Terikandin", "System Monitor": "System Monitor", - "Add content warnings for the following sites": "Ji bo malperên jêrîn hişyariyên naverokê zêde bikin" + "Add content warnings for the following sites": "Ji bo malperên jêrîn hişyariyên naverokê zêde bikin", + "Known Web Crawlers": "Crawlerên Webê yên naskirî" } diff --git a/translations/oc.json b/translations/oc.json index d898dfa43..d8f72b430 100644 --- a/translations/oc.json +++ b/translations/oc.json @@ -485,5 +485,6 @@ "Join": "Join", "Leave": "Leave", "System Monitor": "System Monitor", - "Add content warnings for the following sites": "Add content warnings for the following sites" + "Add content warnings for the following sites": "Add content warnings for the following sites", + "Known Web Crawlers": "Known Web Crawlers" } diff --git a/translations/pt.json b/translations/pt.json index 789df52ea..b1056b4f4 100644 --- a/translations/pt.json +++ b/translations/pt.json @@ -489,5 +489,6 @@ "Join": "Juntar", "Leave": "Sair", "System Monitor": "Monitor de Sistema", - "Add content warnings for the following sites": "Adicione avisos de conteúdo para os seguintes sites" + "Add content warnings for the following sites": "Adicione avisos de conteúdo para os seguintes sites", + "Known Web Crawlers": "Rastreadores da Web conhecidos" } diff --git a/translations/ru.json b/translations/ru.json index 2352871d7..eb71190d0 100644 --- a/translations/ru.json +++ b/translations/ru.json @@ -489,5 +489,6 @@ "Join": "Присоединиться", "Leave": "Оставлять", "System Monitor": "Системный монитор", - "Add content warnings for the following sites": "Добавить предупреждения о содержании для следующих сайтов" + "Add content warnings for the following sites": "Добавить предупреждения о содержании для следующих сайтов", + "Known Web Crawlers": "Известные веб-сканеры" } diff --git a/translations/sw.json b/translations/sw.json index 3e7aa7b8d..a7e078946 100644 --- a/translations/sw.json +++ b/translations/sw.json @@ -489,5 +489,6 @@ "Join": "Jiunge", "Leave": "Ondoka", "System Monitor": "Ufuatiliaji wa Mfumo", - "Add content warnings for the following sites": "Ongeza maonyo ya yaliyomo kwa wavuti zifuatazo" + "Add content warnings for the following sites": "Ongeza maonyo ya yaliyomo kwa wavuti zifuatazo", + "Known Web Crawlers": "Watambaji Wavuti Wanaojulikana" } diff --git a/translations/zh.json b/translations/zh.json index e7104146b..3be4afa78 100644 --- a/translations/zh.json +++ b/translations/zh.json @@ -489,5 +489,6 @@ "Join": "加入", "Leave": "离开", "System Monitor": "系统监视器", - "Add content warnings for the following sites": "为以下网站添加内容警告" + "Add content warnings for the following sites": "为以下网站添加内容警告", + "Known Web Crawlers": "已知的网络爬虫" } diff --git a/utils.py b/utils.py index 47572bfb0..45e6e4152 100644 --- a/utils.py +++ b/utils.py @@ -715,7 +715,7 @@ def getStatusNumber(publishedStr: str = None) -> (str, str): def evilIncarnate() -> []: - return ('gab.com', 'gabfed.com', 'spinster.xyz', + return ('fedilist.com', 'gab.com', 'gabfed.com', 'spinster.xyz', 'kiwifarms.cc', 'djitter.com') diff --git a/webapp_profile.py b/webapp_profile.py index 914d212c8..1f29a368d 100644 --- a/webapp_profile.py +++ b/webapp_profile.py @@ -1520,8 +1520,8 @@ def _htmlEditProfileSharedItems(baseDir: str, nickname: str, domain: str, def _htmlEditProfileFiltering(baseDir: str, nickname: str, domain: str, - userAgentsBlocked: str, translate: {}, - replyIntervalHours: int, + userAgentsBlocked: str, + translate: {}, replyIntervalHours: int, CWlists: {}, listsEnabled: str) -> str: """Filtering and blocking section of edit profile screen """ @@ -1669,6 +1669,10 @@ def _htmlEditProfileFiltering(baseDir: str, nickname: str, domain: str, allowedInstancesStr + '\n' if isModerator(baseDir, nickname): + editProfileForm += \ + '' + \ + translate['Known Web Crawlers'] + '
\n' + userAgentsBlockedStr = '' for ua in userAgentsBlocked: if userAgentsBlockedStr: