From a661a368ca83cabb84fc5e8bf81c3e341bddc15f Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 23 Oct 2021 19:42:44 +0100 Subject: [PATCH 01/17] Show calling domains for API --- daemon.py | 2 +- mastoapiv1.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/daemon.py b/daemon.py index 731bc8512..a98ab476c 100644 --- a/daemon.py +++ b/daemon.py @@ -1091,7 +1091,7 @@ class PubServer(BaseHTTPRequestHandler): self._set_headers('application/ld+json', msglen, None, callingDomain, True) self._write(msg) - print('nodeinfo sent') + print('nodeinfo sent to ' + callingDomain) return True self._404() return True diff --git a/mastoapiv1.py b/mastoapiv1.py index 18722dfce..c3e39f80b 100644 --- a/mastoapiv1.py +++ b/mastoapiv1.py @@ -208,7 +208,7 @@ def mastoApiV1Response(path: str, callingDomain: str, registration, systemLanguage, projectVersion) - sendJsonStr = 'masto API instance metadata sent' + sendJsonStr = 'masto API instance metadata sent ' + callingDomain elif path.startswith('/api/v1/instance/peers'): # This is just a dummy result. # Showing the full list of peers would have privacy implications. @@ -216,8 +216,8 @@ def mastoApiV1Response(path: str, callingDomain: str, # small instances a full list of peers would convey a lot of # information about the interests of a small number of accounts sendJson = ['mastodon.social', domainFull] - sendJsonStr = 'masto API peers metadata sent' + sendJsonStr = 'masto API peers metadata sent ' + callingDomain elif path.startswith('/api/v1/instance/activity'): sendJson = [] - sendJsonStr = 'masto API activity metadata sent' + sendJsonStr = 'masto API activity metadata sent ' + callingDomain return sendJson, sendJsonStr From e617de5f146d970baec11647e299ae353e0aa474 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 23 Oct 2021 19:51:30 +0100 Subject: [PATCH 02/17] Show calling domain for api endpoints --- mastoapiv1.py | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/mastoapiv1.py b/mastoapiv1.py index c3e39f80b..eec92e68a 100644 --- a/mastoapiv1.py +++ b/mastoapiv1.py @@ -121,57 +121,73 @@ def mastoApiV1Response(path: str, callingDomain: str, path = path.split('?')[0] if path.endswith('/followers'): sendJson = [] - sendJsonStr = 'masto API followers sent for ' + nickname + sendJsonStr = \ + 'masto API followers sent for ' + nickname + \ + ' ' + callingDomain elif path.endswith('/following'): sendJson = [] - sendJsonStr = 'masto API following sent for ' + nickname + sendJsonStr = \ + 'masto API following sent for ' + nickname + \ + ' ' + callingDomain elif path.endswith('/statuses'): sendJson = [] - sendJsonStr = 'masto API statuses sent for ' + nickname + sendJsonStr = \ + 'masto API statuses sent for ' + nickname + \ + ' ' + callingDomain elif path.endswith('/search'): sendJson = [] - sendJsonStr = 'masto API search sent ' + originalPath + sendJsonStr = \ + 'masto API search sent ' + originalPath + \ + ' ' + callingDomain elif path.endswith('/relationships'): sendJson = [] sendJsonStr = \ - 'masto API relationships sent ' + originalPath + 'masto API relationships sent ' + originalPath + \ + ' ' + callingDomain else: sendJson = \ _getMastoApiV1Account(baseDir, pathNickname, domain) - sendJsonStr = 'masto API account sent for ' + nickname + sendJsonStr = \ + 'masto API account sent for ' + nickname + \ + ' ' + callingDomain # NOTE: adding support for '/api/v1/directory seems to create # federation problems, so avoid implementing that if path.startswith('/api/v1/blocks'): sendJson = [] - sendJsonStr = 'masto API instance blocks sent ' + path + sendJsonStr = \ + 'masto API instance blocks sent ' + path + ' ' + callingDomain elif path.startswith('/api/v1/favorites'): sendJson = [] - sendJsonStr = 'masto API favorites sent ' + path + sendJsonStr = 'masto API favorites sent ' + path + ' ' + callingDomain elif path.startswith('/api/v1/follow_requests'): sendJson = [] - sendJsonStr = 'masto API follow requests sent ' + path + sendJsonStr = \ + 'masto API follow requests sent ' + path + ' ' + callingDomain elif path.startswith('/api/v1/mutes'): sendJson = [] - sendJsonStr = 'masto API mutes sent ' + path + sendJsonStr = \ + 'masto API mutes sent ' + path + ' ' + callingDomain elif path.startswith('/api/v1/notifications'): sendJson = [] - sendJsonStr = 'masto API notifications sent ' + path + sendJsonStr = \ + 'masto API notifications sent ' + path + ' ' + callingDomain elif path.startswith('/api/v1/reports'): sendJson = [] - sendJsonStr = 'masto API reports sent ' + path + sendJsonStr = 'masto API reports sent ' + path + ' ' + callingDomain elif path.startswith('/api/v1/statuses'): sendJson = [] - sendJsonStr = 'masto API statuses sent ' + path + sendJsonStr = 'masto API statuses sent ' + path + ' ' + callingDomain elif path.startswith('/api/v1/timelines'): sendJson = { 'error': 'This method requires an authenticated user' } - sendJsonStr = 'masto API timelines sent ' + path + sendJsonStr = 'masto API timelines sent ' + path + ' ' + callingDomain elif path.startswith('/api/v1/custom_emojis'): sendJson = customEmoji - sendJsonStr = 'masto API custom emojis sent ' + path + sendJsonStr = \ + 'masto API custom emojis sent ' + path + ' ' + callingDomain adminNickname = getConfigParam(baseDir, 'admin') if adminNickname and path == '/api/v1/instance': From d36d1817fd9b032110af534d18dc20987816e928 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 23 Oct 2021 20:24:42 +0100 Subject: [PATCH 03/17] Show user agent rather than calling domain --- daemon.py | 8 ++++++-- mastoapiv1.py | 39 +++++++++++++++++++++------------------ 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/daemon.py b/daemon.py index a98ab476c..5fc68c9fb 100644 --- a/daemon.py +++ b/daemon.py @@ -969,6 +969,7 @@ class PubServer(BaseHTTPRequestHandler): return False def _mastoApiV1(self, path: str, callingDomain: str, + uaStr: str, authorized: bool, httpPrefix: str, baseDir: str, nickname: str, domain: str, @@ -993,6 +994,7 @@ class PubServer(BaseHTTPRequestHandler): brochMode = brochModeIsActive(baseDir) sendJson, sendJsonStr = mastoApiV1Response(path, callingDomain, + uaStr, authorized, httpPrefix, baseDir, @@ -1031,6 +1033,7 @@ class PubServer(BaseHTTPRequestHandler): return True def _mastoApi(self, path: str, callingDomain: str, + uaStr: str, authorized: bool, httpPrefix: str, baseDir: str, nickname: str, domain: str, domainFull: str, @@ -1041,7 +1044,7 @@ class PubServer(BaseHTTPRequestHandler): projectVersion: str, customEmoji: [], showNodeInfoAccounts: bool) -> bool: - return self._mastoApiV1(path, callingDomain, authorized, + return self._mastoApiV1(path, callingDomain, uaStr, authorized, httpPrefix, baseDir, nickname, domain, domainFull, onionDomain, i2pDomain, translate, registration, systemLanguage, @@ -12446,7 +12449,8 @@ class PubServer(BaseHTTPRequestHandler): return # minimal mastodon api - if self._mastoApi(self.path, callingDomain, authorized, + if self._mastoApi(self.path, callingDomain, uaStr, + authorized, self.server.httpPrefix, self.server.baseDir, self.authorizedNickname, diff --git a/mastoapiv1.py b/mastoapiv1.py index eec92e68a..6260e2fb3 100644 --- a/mastoapiv1.py +++ b/mastoapiv1.py @@ -82,6 +82,7 @@ def _getMastoApiV1Account(baseDir: str, nickname: str, domain: str) -> {}: def mastoApiV1Response(path: str, callingDomain: str, + uaStr: str, authorized: bool, httpPrefix: str, baseDir: str, nickname: str, domain: str, @@ -100,6 +101,8 @@ def mastoApiV1Response(path: str, callingDomain: str, """ sendJson = None sendJsonStr = '' + if not uaStr: + uaStr = '' # parts of the api needing authorization if authorized and nickname: @@ -123,33 +126,33 @@ def mastoApiV1Response(path: str, callingDomain: str, sendJson = [] sendJsonStr = \ 'masto API followers sent for ' + nickname + \ - ' ' + callingDomain + ' ' + uaStr elif path.endswith('/following'): sendJson = [] sendJsonStr = \ 'masto API following sent for ' + nickname + \ - ' ' + callingDomain + ' ' + uaStr elif path.endswith('/statuses'): sendJson = [] sendJsonStr = \ 'masto API statuses sent for ' + nickname + \ - ' ' + callingDomain + ' ' + uaStr elif path.endswith('/search'): sendJson = [] sendJsonStr = \ 'masto API search sent ' + originalPath + \ - ' ' + callingDomain + ' ' + uaStr elif path.endswith('/relationships'): sendJson = [] sendJsonStr = \ 'masto API relationships sent ' + originalPath + \ - ' ' + callingDomain + ' ' + uaStr else: sendJson = \ _getMastoApiV1Account(baseDir, pathNickname, domain) sendJsonStr = \ 'masto API account sent for ' + nickname + \ - ' ' + callingDomain + ' ' + uaStr # NOTE: adding support for '/api/v1/directory seems to create # federation problems, so avoid implementing that @@ -157,37 +160,37 @@ def mastoApiV1Response(path: str, callingDomain: str, if path.startswith('/api/v1/blocks'): sendJson = [] sendJsonStr = \ - 'masto API instance blocks sent ' + path + ' ' + callingDomain + 'masto API instance blocks sent ' + path + ' ' + uaStr elif path.startswith('/api/v1/favorites'): sendJson = [] - sendJsonStr = 'masto API favorites sent ' + path + ' ' + callingDomain + sendJsonStr = 'masto API favorites sent ' + path + ' ' + uaStr elif path.startswith('/api/v1/follow_requests'): sendJson = [] sendJsonStr = \ - 'masto API follow requests sent ' + path + ' ' + callingDomain + 'masto API follow requests sent ' + path + ' ' + uaStr elif path.startswith('/api/v1/mutes'): sendJson = [] sendJsonStr = \ - 'masto API mutes sent ' + path + ' ' + callingDomain + 'masto API mutes sent ' + path + ' ' + uaStr elif path.startswith('/api/v1/notifications'): sendJson = [] sendJsonStr = \ - 'masto API notifications sent ' + path + ' ' + callingDomain + 'masto API notifications sent ' + path + ' ' + uaStr elif path.startswith('/api/v1/reports'): sendJson = [] - sendJsonStr = 'masto API reports sent ' + path + ' ' + callingDomain + sendJsonStr = 'masto API reports sent ' + path + ' ' + uaStr elif path.startswith('/api/v1/statuses'): sendJson = [] - sendJsonStr = 'masto API statuses sent ' + path + ' ' + callingDomain + sendJsonStr = 'masto API statuses sent ' + path + ' ' + uaStr elif path.startswith('/api/v1/timelines'): sendJson = { 'error': 'This method requires an authenticated user' } - sendJsonStr = 'masto API timelines sent ' + path + ' ' + callingDomain + sendJsonStr = 'masto API timelines sent ' + path + ' ' + uaStr elif path.startswith('/api/v1/custom_emojis'): sendJson = customEmoji sendJsonStr = \ - 'masto API custom emojis sent ' + path + ' ' + callingDomain + 'masto API custom emojis sent ' + path + ' ' + uaStr adminNickname = getConfigParam(baseDir, 'admin') if adminNickname and path == '/api/v1/instance': @@ -224,7 +227,7 @@ def mastoApiV1Response(path: str, callingDomain: str, registration, systemLanguage, projectVersion) - sendJsonStr = 'masto API instance metadata sent ' + callingDomain + sendJsonStr = 'masto API instance metadata sent ' + uaStr elif path.startswith('/api/v1/instance/peers'): # This is just a dummy result. # Showing the full list of peers would have privacy implications. @@ -232,8 +235,8 @@ def mastoApiV1Response(path: str, callingDomain: str, # small instances a full list of peers would convey a lot of # information about the interests of a small number of accounts sendJson = ['mastodon.social', domainFull] - sendJsonStr = 'masto API peers metadata sent ' + callingDomain + sendJsonStr = 'masto API peers metadata sent ' + uaStr elif path.startswith('/api/v1/instance/activity'): sendJson = [] - sendJsonStr = 'masto API activity metadata sent ' + callingDomain + sendJsonStr = 'masto API activity metadata sent ' + uaStr return sendJson, sendJsonStr From 8670e4e4560806b728d92796ba37771c09953bc9 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 23 Oct 2021 21:17:55 +0100 Subject: [PATCH 04/17] Send user agent --- mastoapiv1.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mastoapiv1.py b/mastoapiv1.py index 6260e2fb3..347f8247f 100644 --- a/mastoapiv1.py +++ b/mastoapiv1.py @@ -108,7 +108,8 @@ def mastoApiV1Response(path: str, callingDomain: str, if authorized and nickname: if path == '/api/v1/accounts/verify_credentials': sendJson = _getMastoApiV1Account(baseDir, nickname, domain) - sendJsonStr = 'masto API account sent for ' + nickname + sendJsonStr = \ + 'masto API account sent for ' + nickname + ' ' + usStr # Parts of the api which don't need authorization mastoId = _getMastApiV1Id(path) From d72a27b5cc287a3d32db87c0761db6529433f6bd Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 23 Oct 2021 21:18:17 +0100 Subject: [PATCH 05/17] Typo --- mastoapiv1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mastoapiv1.py b/mastoapiv1.py index 347f8247f..b00490934 100644 --- a/mastoapiv1.py +++ b/mastoapiv1.py @@ -109,7 +109,7 @@ def mastoApiV1Response(path: str, callingDomain: str, if path == '/api/v1/accounts/verify_credentials': sendJson = _getMastoApiV1Account(baseDir, nickname, domain) sendJsonStr = \ - 'masto API account sent for ' + nickname + ' ' + usStr + 'masto API account sent for ' + nickname + ' ' + uaStr # Parts of the api which don't need authorization mastoId = _getMastApiV1Id(path) From 8fe8701414f89777229d17c49c4d7507b478bca0 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 23 Oct 2021 23:56:16 +0100 Subject: [PATCH 06/17] Default user agent blocks --- daemon.py | 10 +++++++++- utils.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/daemon.py b/daemon.py index 5fc68c9fb..f190d8bb9 100644 --- a/daemon.py +++ b/daemon.py @@ -520,11 +520,19 @@ class PubServer(BaseHTTPRequestHandler): def _blockedUserAgent(self, callingDomain: str, agentStr: str) -> bool: """Should a GET or POST be blocked based upon its user agent? """ + agentStrLower = agentStr.lower() + defaultAgentBlocks = ( + 'fedilist.com' + ) + for uaBlock in defaultAgentBlocks: + if uaBlock in agentStrLower: + print('Blocked User agent: ' + uaBlock) + return True + agentDomain = None if agentStr: # is this a web crawler? If so the block it - agentStrLower = agentStr.lower() if 'bot/' in agentStrLower or 'bot-' in agentStrLower: if self.server.newsInstance: return False diff --git a/utils.py b/utils.py index 47572bfb0..45e6e4152 100644 --- a/utils.py +++ b/utils.py @@ -715,7 +715,7 @@ def getStatusNumber(publishedStr: str = None) -> (str, str): def evilIncarnate() -> []: - return ('gab.com', 'gabfed.com', 'spinster.xyz', + return ('fedilist.com', 'gab.com', 'gabfed.com', 'spinster.xyz', 'kiwifarms.cc', 'djitter.com') From 19194f3d3022636f60368074e7237d2ff291d026 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sat, 23 Oct 2021 23:58:18 +0100 Subject: [PATCH 07/17] list --- daemon.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/daemon.py b/daemon.py index f190d8bb9..e84433801 100644 --- a/daemon.py +++ b/daemon.py @@ -521,9 +521,9 @@ class PubServer(BaseHTTPRequestHandler): """Should a GET or POST be blocked based upon its user agent? """ agentStrLower = agentStr.lower() - defaultAgentBlocks = ( + defaultAgentBlocks = [ 'fedilist.com' - ) + ] for uaBlock in defaultAgentBlocks: if uaBlock in agentStrLower: print('Blocked User agent: ' + uaBlock) From 9e81ad9bad12c7186d1a6b4ee355beaef1aba9ba Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 10:02:28 +0100 Subject: [PATCH 08/17] More information about calling domain --- mastoapiv1.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/mastoapiv1.py b/mastoapiv1.py index b00490934..9661b31d5 100644 --- a/mastoapiv1.py +++ b/mastoapiv1.py @@ -111,6 +111,9 @@ def mastoApiV1Response(path: str, callingDomain: str, sendJsonStr = \ 'masto API account sent for ' + nickname + ' ' + uaStr + # information about where the request is coming from + callingInfo = ' ' + uaStr + ', ' + callingDomain + # Parts of the api which don't need authorization mastoId = _getMastApiV1Id(path) if mastoId is not None: @@ -127,33 +130,33 @@ def mastoApiV1Response(path: str, callingDomain: str, sendJson = [] sendJsonStr = \ 'masto API followers sent for ' + nickname + \ - ' ' + uaStr + callingInfo elif path.endswith('/following'): sendJson = [] sendJsonStr = \ 'masto API following sent for ' + nickname + \ - ' ' + uaStr + callingInfo elif path.endswith('/statuses'): sendJson = [] sendJsonStr = \ 'masto API statuses sent for ' + nickname + \ - ' ' + uaStr + callingInfo elif path.endswith('/search'): sendJson = [] sendJsonStr = \ 'masto API search sent ' + originalPath + \ - ' ' + uaStr + callingInfo elif path.endswith('/relationships'): sendJson = [] sendJsonStr = \ 'masto API relationships sent ' + originalPath + \ - ' ' + uaStr + callingInfo else: sendJson = \ _getMastoApiV1Account(baseDir, pathNickname, domain) sendJsonStr = \ 'masto API account sent for ' + nickname + \ - ' ' + uaStr + callingInfo # NOTE: adding support for '/api/v1/directory seems to create # federation problems, so avoid implementing that @@ -161,37 +164,37 @@ def mastoApiV1Response(path: str, callingDomain: str, if path.startswith('/api/v1/blocks'): sendJson = [] sendJsonStr = \ - 'masto API instance blocks sent ' + path + ' ' + uaStr + 'masto API instance blocks sent ' + path + callingInfo elif path.startswith('/api/v1/favorites'): sendJson = [] - sendJsonStr = 'masto API favorites sent ' + path + ' ' + uaStr + sendJsonStr = 'masto API favorites sent ' + path + callingInfo elif path.startswith('/api/v1/follow_requests'): sendJson = [] sendJsonStr = \ - 'masto API follow requests sent ' + path + ' ' + uaStr + 'masto API follow requests sent ' + path + callingInfo elif path.startswith('/api/v1/mutes'): sendJson = [] sendJsonStr = \ - 'masto API mutes sent ' + path + ' ' + uaStr + 'masto API mutes sent ' + path + callingInfo elif path.startswith('/api/v1/notifications'): sendJson = [] sendJsonStr = \ - 'masto API notifications sent ' + path + ' ' + uaStr + 'masto API notifications sent ' + path + callingInfo elif path.startswith('/api/v1/reports'): sendJson = [] - sendJsonStr = 'masto API reports sent ' + path + ' ' + uaStr + sendJsonStr = 'masto API reports sent ' + path + callingInfo elif path.startswith('/api/v1/statuses'): sendJson = [] - sendJsonStr = 'masto API statuses sent ' + path + ' ' + uaStr + sendJsonStr = 'masto API statuses sent ' + path + callingInfo elif path.startswith('/api/v1/timelines'): sendJson = { 'error': 'This method requires an authenticated user' } - sendJsonStr = 'masto API timelines sent ' + path + ' ' + uaStr + sendJsonStr = 'masto API timelines sent ' + path + callingInfo elif path.startswith('/api/v1/custom_emojis'): sendJson = customEmoji sendJsonStr = \ - 'masto API custom emojis sent ' + path + ' ' + uaStr + 'masto API custom emojis sent ' + path + callingInfo adminNickname = getConfigParam(baseDir, 'admin') if adminNickname and path == '/api/v1/instance': From 49f28f552809c220f0eb1c7e15f1e9f8f9849560 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 10:26:23 +0100 Subject: [PATCH 09/17] Log user agents accessing nodeinfo --- daemon.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/daemon.py b/daemon.py index e84433801..82fccbb13 100644 --- a/daemon.py +++ b/daemon.py @@ -392,6 +392,17 @@ def saveDomainQrcode(baseDir: str, httpPrefix: str, class PubServer(BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' + def _updateKnownCrawlers(self, uaStr: str) -> None: + """Updates a dictionary of known crawlers accessing nodeinfo + or the masto API + """ + if self.server.knownCrawlers.get(uaStr): + self.server.knownCrawlers[uaStr]['hits'] += 1 + else: + self.server.knownCrawlers[uaStr] = { + "hits": 1 + } + def _getInstanceUrl(self, callingDomain: str) -> str: """Returns the URL for this instance """ @@ -998,6 +1009,7 @@ class PubServer(BaseHTTPRequestHandler): print('mastodon api v1: ' + path) print('mastodon api v1: authorized ' + str(authorized)) print('mastodon api v1: nickname ' + str(nickname)) + self._updateKnownCrawlers(uaStr) brochMode = brochModeIsActive(baseDir) sendJson, sendJsonStr = mastoApiV1Response(path, @@ -1059,11 +1071,12 @@ class PubServer(BaseHTTPRequestHandler): projectVersion, customEmoji, showNodeInfoAccounts) - def _nodeinfo(self, callingDomain: str) -> bool: + def _nodeinfo(self, uaStr: str, callingDomain: str) -> bool: if not self.path.startswith('/nodeinfo/2.0'): return False if self.server.debug: print('DEBUG: nodeinfo ' + self.path) + self._updateKnownCrawlers(uaStr) # If we are in broch mode then don't show potentially # sensitive metadata. @@ -12124,7 +12137,7 @@ class PubServer(BaseHTTPRequestHandler): # Since fediverse crawlers are quite active, # make returning info to them high priority # get nodeinfo endpoint - if self._nodeinfo(callingDomain): + if self._nodeinfo(uaStr, callingDomain): return fitnessPerformance(GETstartTime, self.server.fitness, @@ -17114,6 +17127,10 @@ def runDaemon(listsEnabled: str, # list of blocked user agent types within the User-Agent header httpd.userAgentsBlocked = userAgentsBlocked + # dict of known web crawlers accessing nodeinfo or the masto API + # and how many times they have been seen + httpd.knownCrawlers = {} + httpd.unitTest = unitTest httpd.allowLocalNetworkAccess = allowLocalNetworkAccess if unitTest: From 6778be04ae1616fd617e7957d55bb7ba6d52cb9a Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 10:57:10 +0100 Subject: [PATCH 10/17] Show list of known crawlers --- daemon.py | 33 +++++++++++++++++++++++++++++++++ webapp_profile.py | 8 ++++++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/daemon.py b/daemon.py index 82fccbb13..be07bef69 100644 --- a/daemon.py +++ b/daemon.py @@ -11843,6 +11843,33 @@ class PubServer(BaseHTTPRequestHandler): return True return False + def _showKnownCrawlers(self, callingDomain: str, path: str, + baseDir: str, knownCrawlers: {}) -> bool: + """Show a list of known web crawlers + """ + if '/users/' not in path: + return False + if not path.endswith('/crawlers'): + return False + nickname = getNicknameFromActor(path) + if not nickname: + return False + if not isModerator(baseDir, nickname): + return False + crawlersList = [] + for uaStr, item in knownCrawlers.items(): + crawlersList.append(str(item['hits']) + ' ' + uaStr) + crawlersList.sort(reverse=True) + msg = '' + for lineStr in crawlersList: + msg += lineStr + '\n' + msg = msg.encode('utf-8') + msglen = len(msg) + self._set_headers('text/plain; charset=utf-8', msglen, + None, callingDomain, True) + self._write(msg) + return True + def _editProfile(self, callingDomain: str, path: str, translate: {}, baseDir: str, httpPrefix: str, domain: str, port: int, @@ -14374,6 +14401,12 @@ class PubServer(BaseHTTPRequestHandler): self.server.GETbusy = False return + # list of known crawlers accessing nodeinfo or masto API + if self._showKnownCrawlers(callingDomain, self.path, + self.server.baseDir, + self.server.knownCrawlers): + return + # edit profile in web interface if self._editProfile(callingDomain, self.path, self.server.translate, diff --git a/webapp_profile.py b/webapp_profile.py index 914d212c8..1f29a368d 100644 --- a/webapp_profile.py +++ b/webapp_profile.py @@ -1520,8 +1520,8 @@ def _htmlEditProfileSharedItems(baseDir: str, nickname: str, domain: str, def _htmlEditProfileFiltering(baseDir: str, nickname: str, domain: str, - userAgentsBlocked: str, translate: {}, - replyIntervalHours: int, + userAgentsBlocked: str, + translate: {}, replyIntervalHours: int, CWlists: {}, listsEnabled: str) -> str: """Filtering and blocking section of edit profile screen """ @@ -1669,6 +1669,10 @@ def _htmlEditProfileFiltering(baseDir: str, nickname: str, domain: str, allowedInstancesStr + '\n' if isModerator(baseDir, nickname): + editProfileForm += \ + '' + \ + translate['Known Web Crawlers'] + '
\n' + userAgentsBlockedStr = '' for ua in userAgentsBlocked: if userAgentsBlockedStr: From 12028370e5d1795ede5728b95e819a877026c3fa Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 11:04:55 +0100 Subject: [PATCH 11/17] Translations --- translations/ar.json | 3 ++- translations/ca.json | 3 ++- translations/cy.json | 3 ++- translations/de.json | 3 ++- translations/en.json | 3 ++- translations/es.json | 3 ++- translations/fr.json | 3 ++- translations/ga.json | 3 ++- translations/hi.json | 3 ++- translations/it.json | 3 ++- translations/ja.json | 3 ++- translations/ku.json | 3 ++- translations/oc.json | 3 ++- translations/pt.json | 3 ++- translations/ru.json | 3 ++- translations/sw.json | 3 ++- translations/zh.json | 3 ++- 17 files changed, 34 insertions(+), 17 deletions(-) diff --git a/translations/ar.json b/translations/ar.json index 2ae899465..4b46432fb 100644 --- a/translations/ar.json +++ b/translations/ar.json @@ -489,5 +489,6 @@ "Join": "انضم", "Leave": "يترك", "System Monitor": "مراقب النظام", - "Add content warnings for the following sites": "أضف تحذيرات المحتوى للمواقع التالية" + "Add content warnings for the following sites": "أضف تحذيرات المحتوى للمواقع التالية", + "Known Web Crawlers": "برامج زحف الويب المعروفة" } diff --git a/translations/ca.json b/translations/ca.json index f4861d724..59fbdc8d1 100644 --- a/translations/ca.json +++ b/translations/ca.json @@ -489,5 +489,6 @@ "Join": "Uneix-te", "Leave": "Marxa", "System Monitor": "Monitor del sistema", - "Add content warnings for the following sites": "Afegiu advertiments de contingut per als llocs següents" + "Add content warnings for the following sites": "Afegiu advertiments de contingut per als llocs següents", + "Known Web Crawlers": "Exploradors web coneguts" } diff --git a/translations/cy.json b/translations/cy.json index f25c5a27c..79339634a 100644 --- a/translations/cy.json +++ b/translations/cy.json @@ -489,5 +489,6 @@ "Join": "Ymunwch", "Leave": "Gadewch", "System Monitor": "Monitor System", - "Add content warnings for the following sites": "Ychwanegwch rybuddion cynnwys ar gyfer y gwefannau canlynol" + "Add content warnings for the following sites": "Ychwanegwch rybuddion cynnwys ar gyfer y gwefannau canlynol", + "Known Web Crawlers": "Crawlers Gwe Hysbys" } diff --git a/translations/de.json b/translations/de.json index e2760a864..9852d7703 100644 --- a/translations/de.json +++ b/translations/de.json @@ -489,5 +489,6 @@ "Join": "Verbinden", "Leave": "Verlassen", "System Monitor": "Systemmonitor", - "Add content warnings for the following sites": "Inhaltswarnungen für die folgenden Websites hinzufügen" + "Add content warnings for the following sites": "Inhaltswarnungen für die folgenden Websites hinzufügen", + "Known Web Crawlers": "Bekannte Web-Crawler" } diff --git a/translations/en.json b/translations/en.json index 4e8d3ec13..1efb610ef 100644 --- a/translations/en.json +++ b/translations/en.json @@ -489,5 +489,6 @@ "Join": "Join", "Leave": "Leave", "System Monitor": "System Monitor", - "Add content warnings for the following sites": "Add content warnings for the following sites" + "Add content warnings for the following sites": "Add content warnings for the following sites", + "Known Web Crawlers": "Known Web Crawlers" } diff --git a/translations/es.json b/translations/es.json index 67cf33835..0c88b4250 100644 --- a/translations/es.json +++ b/translations/es.json @@ -489,5 +489,6 @@ "Join": "Entrar", "Leave": "Dejar", "System Monitor": "Monitor del sistema", - "Add content warnings for the following sites": "Agregue advertencias de contenido para los siguientes sitios" + "Add content warnings for the following sites": "Agregue advertencias de contenido para los siguientes sitios", + "Known Web Crawlers": "Rastreadores web conocidos" } diff --git a/translations/fr.json b/translations/fr.json index 056a914c0..05f4284bd 100644 --- a/translations/fr.json +++ b/translations/fr.json @@ -489,5 +489,6 @@ "Join": "Rejoindre", "Leave": "Laisser", "System Monitor": "Moniteur système", - "Add content warnings for the following sites": "Ajouter des avertissements de contenu pour les sites suivants" + "Add content warnings for the following sites": "Ajouter des avertissements de contenu pour les sites suivants", + "Known Web Crawlers": "Crawlers Web connus" } diff --git a/translations/ga.json b/translations/ga.json index 73f740bcf..dc0e6f802 100644 --- a/translations/ga.json +++ b/translations/ga.json @@ -489,5 +489,6 @@ "Join": "Bí páirteach", "Leave": "Fág", "System Monitor": "Monatóir Córais", - "Add content warnings for the following sites": "Cuir rabhaidh ábhair leis na suíomhanna seo a leanas" + "Add content warnings for the following sites": "Cuir rabhaidh ábhair leis na suíomhanna seo a leanas", + "Known Web Crawlers": "Crawlers Gréasáin Aitheanta" } diff --git a/translations/hi.json b/translations/hi.json index c4e475f2d..bf52fae8b 100644 --- a/translations/hi.json +++ b/translations/hi.json @@ -489,5 +489,6 @@ "Join": "शामिल हों", "Leave": "छोड़ना", "System Monitor": "सिस्टम मॉनिटर", - "Add content warnings for the following sites": "निम्नलिखित साइटों के लिए सामग्री चेतावनियाँ जोड़ें" + "Add content warnings for the following sites": "निम्नलिखित साइटों के लिए सामग्री चेतावनियाँ जोड़ें", + "Known Web Crawlers": "ज्ञात वेब क्रॉलर" } diff --git a/translations/it.json b/translations/it.json index 4648aedae..41341a01d 100644 --- a/translations/it.json +++ b/translations/it.json @@ -489,5 +489,6 @@ "Join": "Aderire", "Leave": "Lasciare", "System Monitor": "Monitor di sistema", - "Add content warnings for the following sites": "Aggiungi avvisi sui contenuti per i seguenti siti" + "Add content warnings for the following sites": "Aggiungi avvisi sui contenuti per i seguenti siti", + "Known Web Crawlers": "Crawler Web conosciuti" } diff --git a/translations/ja.json b/translations/ja.json index 0fe46e4f4..d96250ced 100644 --- a/translations/ja.json +++ b/translations/ja.json @@ -489,5 +489,6 @@ "Join": "加入", "Leave": "離れる", "System Monitor": "システムモニター", - "Add content warnings for the following sites": "次のサイトのコンテンツ警告を追加します" + "Add content warnings for the following sites": "次のサイトのコンテンツ警告を追加します", + "Known Web Crawlers": "既知のWebクローラー" } diff --git a/translations/ku.json b/translations/ku.json index 1c4f7dd19..946d077be 100644 --- a/translations/ku.json +++ b/translations/ku.json @@ -489,5 +489,6 @@ "Join": "Bihevgirêdan", "Leave": "Terikandin", "System Monitor": "System Monitor", - "Add content warnings for the following sites": "Ji bo malperên jêrîn hişyariyên naverokê zêde bikin" + "Add content warnings for the following sites": "Ji bo malperên jêrîn hişyariyên naverokê zêde bikin", + "Known Web Crawlers": "Crawlerên Webê yên naskirî" } diff --git a/translations/oc.json b/translations/oc.json index d898dfa43..d8f72b430 100644 --- a/translations/oc.json +++ b/translations/oc.json @@ -485,5 +485,6 @@ "Join": "Join", "Leave": "Leave", "System Monitor": "System Monitor", - "Add content warnings for the following sites": "Add content warnings for the following sites" + "Add content warnings for the following sites": "Add content warnings for the following sites", + "Known Web Crawlers": "Known Web Crawlers" } diff --git a/translations/pt.json b/translations/pt.json index 789df52ea..b1056b4f4 100644 --- a/translations/pt.json +++ b/translations/pt.json @@ -489,5 +489,6 @@ "Join": "Juntar", "Leave": "Sair", "System Monitor": "Monitor de Sistema", - "Add content warnings for the following sites": "Adicione avisos de conteúdo para os seguintes sites" + "Add content warnings for the following sites": "Adicione avisos de conteúdo para os seguintes sites", + "Known Web Crawlers": "Rastreadores da Web conhecidos" } diff --git a/translations/ru.json b/translations/ru.json index 2352871d7..eb71190d0 100644 --- a/translations/ru.json +++ b/translations/ru.json @@ -489,5 +489,6 @@ "Join": "Присоединиться", "Leave": "Оставлять", "System Monitor": "Системный монитор", - "Add content warnings for the following sites": "Добавить предупреждения о содержании для следующих сайтов" + "Add content warnings for the following sites": "Добавить предупреждения о содержании для следующих сайтов", + "Known Web Crawlers": "Известные веб-сканеры" } diff --git a/translations/sw.json b/translations/sw.json index 3e7aa7b8d..a7e078946 100644 --- a/translations/sw.json +++ b/translations/sw.json @@ -489,5 +489,6 @@ "Join": "Jiunge", "Leave": "Ondoka", "System Monitor": "Ufuatiliaji wa Mfumo", - "Add content warnings for the following sites": "Ongeza maonyo ya yaliyomo kwa wavuti zifuatazo" + "Add content warnings for the following sites": "Ongeza maonyo ya yaliyomo kwa wavuti zifuatazo", + "Known Web Crawlers": "Watambaji Wavuti Wanaojulikana" } diff --git a/translations/zh.json b/translations/zh.json index e7104146b..3be4afa78 100644 --- a/translations/zh.json +++ b/translations/zh.json @@ -489,5 +489,6 @@ "Join": "加入", "Leave": "离开", "System Monitor": "系统监视器", - "Add content warnings for the following sites": "为以下网站添加内容警告" + "Add content warnings for the following sites": "为以下网站添加内容警告", + "Known Web Crawlers": "已知的网络爬虫" } From 0d83ad84760ba447f67531e9967261878b076a5b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 12:06:08 +0100 Subject: [PATCH 12/17] Saving known crawlers list --- daemon.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/daemon.py b/daemon.py index be07bef69..8b09363bc 100644 --- a/daemon.py +++ b/daemon.py @@ -398,10 +398,15 @@ class PubServer(BaseHTTPRequestHandler): """ if self.server.knownCrawlers.get(uaStr): self.server.knownCrawlers[uaStr]['hits'] += 1 + self.server.knownCrawlers[uaStr]['lastseen'] = \ + int(time.time()) else: self.server.knownCrawlers[uaStr] = { + "lastseen": int(time.time()), "hits": 1 } + saveJson(self.server.knownCrawlers, + self.server.baseDir + '/accounts/knownCrawlers.json') def _getInstanceUrl(self, callingDomain: str) -> str: """Returns the URL for this instance @@ -17160,10 +17165,6 @@ def runDaemon(listsEnabled: str, # list of blocked user agent types within the User-Agent header httpd.userAgentsBlocked = userAgentsBlocked - # dict of known web crawlers accessing nodeinfo or the masto API - # and how many times they have been seen - httpd.knownCrawlers = {} - httpd.unitTest = unitTest httpd.allowLocalNetworkAccess = allowLocalNetworkAccess if unitTest: @@ -17364,6 +17365,13 @@ def runDaemon(listsEnabled: str, createNewsInbox(baseDir, domain, port, httpPrefix) setConfigParam(baseDir, "listsEnabled", "Murdoch press") + # dict of known web crawlers accessing nodeinfo or the masto API + # and how many times they have been seen + httpd.knownCrawlers = {} + knownCrawlersFilename = baseDir + '/accounts/knownCrawlers.json' + if os.path.isfile(knownCrawlersFilename): + httpd.knownCrawlers = loadJson(baseDir + '/accounts/knownCrawlers.json') + if listsEnabled: httpd.listsEnabled = listsEnabled else: From 0c4a3bf968d45236d29c424b2990529fd7123783 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 12:11:17 +0100 Subject: [PATCH 13/17] Only show recent crawlers --- daemon.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/daemon.py b/daemon.py index 8b09363bc..ec7c3f829 100644 --- a/daemon.py +++ b/daemon.py @@ -405,8 +405,8 @@ class PubServer(BaseHTTPRequestHandler): "lastseen": int(time.time()), "hits": 1 } - saveJson(self.server.knownCrawlers, - self.server.baseDir + '/accounts/knownCrawlers.json') + saveJson(self.server.knownCrawlers, + self.server.baseDir + '/accounts/knownCrawlers.json') def _getInstanceUrl(self, callingDomain: str) -> str: """Returns the URL for this instance @@ -11862,8 +11862,11 @@ class PubServer(BaseHTTPRequestHandler): if not isModerator(baseDir, nickname): return False crawlersList = [] + currTime = int(time.time()) + recentCrawlers = 60 * 60 * 24 * 30 for uaStr, item in knownCrawlers.items(): - crawlersList.append(str(item['hits']) + ' ' + uaStr) + if item['lastseen'] - currTime < recentCrawlers: + crawlersList.append(str(item['hits']) + ' ' + uaStr) crawlersList.sort(reverse=True) msg = '' for lineStr in crawlersList: @@ -17370,7 +17373,7 @@ def runDaemon(listsEnabled: str, httpd.knownCrawlers = {} knownCrawlersFilename = baseDir + '/accounts/knownCrawlers.json' if os.path.isfile(knownCrawlersFilename): - httpd.knownCrawlers = loadJson(baseDir + '/accounts/knownCrawlers.json') + httpd.knownCrawlers = loadJson(knownCrawlersFilename) if listsEnabled: httpd.listsEnabled = listsEnabled From b54cc5b00615d4abb08c8ce3365c8af4c067a61d Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 12:20:31 +0100 Subject: [PATCH 14/17] Ensure that crawler list is not saved too often --- daemon.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/daemon.py b/daemon.py index ec7c3f829..38da5e467 100644 --- a/daemon.py +++ b/daemon.py @@ -396,17 +396,19 @@ class PubServer(BaseHTTPRequestHandler): """Updates a dictionary of known crawlers accessing nodeinfo or the masto API """ + currTime = int(time.time()) if self.server.knownCrawlers.get(uaStr): self.server.knownCrawlers[uaStr]['hits'] += 1 - self.server.knownCrawlers[uaStr]['lastseen'] = \ - int(time.time()) + self.server.knownCrawlers[uaStr]['lastseen'] = currTime else: self.server.knownCrawlers[uaStr] = { - "lastseen": int(time.time()), + "lastseen": currTime, "hits": 1 } - saveJson(self.server.knownCrawlers, - self.server.baseDir + '/accounts/knownCrawlers.json') + if currTime - self.server.lastKnownCrawler >= 10: + saveJson(self.server.knownCrawlers, + self.server.baseDir + '/accounts/knownCrawlers.json') + self.server.lastKnownCrawler = currTime def _getInstanceUrl(self, callingDomain: str) -> str: """Returns the URL for this instance @@ -17374,6 +17376,8 @@ def runDaemon(listsEnabled: str, knownCrawlersFilename = baseDir + '/accounts/knownCrawlers.json' if os.path.isfile(knownCrawlersFilename): httpd.knownCrawlers = loadJson(knownCrawlersFilename) + # when was the last crawler seen? + httpd.lastKnownCrawler = 0 if listsEnabled: httpd.listsEnabled = listsEnabled From b6479903cfbbff410347c4c10304e8af712d3b0b Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 12:48:16 +0100 Subject: [PATCH 15/17] Check that user agent exists --- daemon.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/daemon.py b/daemon.py index 38da5e467..279fc6788 100644 --- a/daemon.py +++ b/daemon.py @@ -538,6 +538,9 @@ class PubServer(BaseHTTPRequestHandler): def _blockedUserAgent(self, callingDomain: str, agentStr: str) -> bool: """Should a GET or POST be blocked based upon its user agent? """ + if not agentStr: + return False + agentStrLower = agentStr.lower() defaultAgentBlocks = [ 'fedilist.com' From 611a6da196d4f546edbbb05b5ac4dd1f77d40aed Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 12:49:01 +0100 Subject: [PATCH 16/17] Check that user agent exists --- daemon.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/daemon.py b/daemon.py index 279fc6788..d33e27113 100644 --- a/daemon.py +++ b/daemon.py @@ -396,6 +396,9 @@ class PubServer(BaseHTTPRequestHandler): """Updates a dictionary of known crawlers accessing nodeinfo or the masto API """ + if not uaStr: + return + currTime = int(time.time()) if self.server.knownCrawlers.get(uaStr): self.server.knownCrawlers[uaStr]['hits'] += 1 From 517bcc6197351c2fa731d511664c13050fc00a44 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Sun, 24 Oct 2021 19:05:04 +0100 Subject: [PATCH 17/17] Remove old user agent log entries --- daemon.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/daemon.py b/daemon.py index d33e27113..fcfb9172a 100644 --- a/daemon.py +++ b/daemon.py @@ -408,7 +408,16 @@ class PubServer(BaseHTTPRequestHandler): "lastseen": currTime, "hits": 1 } - if currTime - self.server.lastKnownCrawler >= 10: + + if currTime - self.server.lastKnownCrawler >= 30: + # remove any old observations + removeCrawlers = [] + for ua, item in self.server.knownCrawlers.items(): + if currTime - item['lastseen'] >= 60 * 60 * 24 * 30: + removeCrawlers.append(ua) + for ua in removeCrawlers: + del self.server.knownCrawlers[ua] + # save the list of crawlers saveJson(self.server.knownCrawlers, self.server.baseDir + '/accounts/knownCrawlers.json') self.server.lastKnownCrawler = currTime