diff --git a/daemon.py b/daemon.py
index 731bc8512..fcfb9172a 100644
--- a/daemon.py
+++ b/daemon.py
@@ -392,6 +392,36 @@ def saveDomainQrcode(baseDir: str, httpPrefix: str,
class PubServer(BaseHTTPRequestHandler):
protocol_version = 'HTTP/1.1'
+ def _updateKnownCrawlers(self, uaStr: str) -> None:
+     """Counts a hit from the given user agent in the table of crawlers
+     accessing nodeinfo or the masto API. At most once every 30 seconds
+     entries older than 30 days are dropped and the table is saved
+     """
+     if not uaStr:
+         return
+
+     currTime = int(time.time())
+     crawlers = self.server.knownCrawlers
+     if crawlers.get(uaStr):
+         crawlers[uaStr]['hits'] += 1
+         crawlers[uaStr]['lastseen'] = currTime
+     else:
+         crawlers[uaStr] = {"lastseen": currTime, "hits": 1}
+
+     if currTime - self.server.lastKnownCrawler < 30:
+         return
+     # remove any old observations
+     maxAgeSec = 60 * 60 * 24 * 30
+     for ua in [ua for ua, item in crawlers.items()
+                if currTime - item['lastseen'] >= maxAgeSec]:
+         del crawlers[ua]
+     # save the list of crawlers
+     saveJson(crawlers,
+              self.server.baseDir + '/accounts/knownCrawlers.json')
+     # Only advance the timestamp when a save has happened. Advancing
+     # it on every hit would postpone the save under steady traffic
+     self.server.lastKnownCrawler = currTime
+
def _getInstanceUrl(self, callingDomain: str) -> str:
"""Returns the URL for this instance
"""
@@ -520,11 +550,22 @@ class PubServer(BaseHTTPRequestHandler):
def _blockedUserAgent(self, callingDomain: str, agentStr: str) -> bool:
"""Should a GET or POST be blocked based upon its user agent?
"""
+ if not agentStr:
+ return False
+
+ agentStrLower = agentStr.lower()
+ defaultAgentBlocks = [
+ 'fedilist.com'
+ ]
+ for uaBlock in defaultAgentBlocks:
+ if uaBlock in agentStrLower:
+ print('Blocked User agent: ' + uaBlock)
+ return True
+
agentDomain = None
if agentStr:
# is this a web crawler? If so the block it
- agentStrLower = agentStr.lower()
if 'bot/' in agentStrLower or 'bot-' in agentStrLower:
if self.server.newsInstance:
return False
@@ -969,6 +1010,7 @@ class PubServer(BaseHTTPRequestHandler):
return False
def _mastoApiV1(self, path: str, callingDomain: str,
+ uaStr: str,
authorized: bool,
httpPrefix: str,
baseDir: str, nickname: str, domain: str,
@@ -989,10 +1031,12 @@ class PubServer(BaseHTTPRequestHandler):
print('mastodon api v1: ' + path)
print('mastodon api v1: authorized ' + str(authorized))
print('mastodon api v1: nickname ' + str(nickname))
+ self._updateKnownCrawlers(uaStr)
brochMode = brochModeIsActive(baseDir)
sendJson, sendJsonStr = mastoApiV1Response(path,
callingDomain,
+ uaStr,
authorized,
httpPrefix,
baseDir,
@@ -1031,6 +1075,7 @@ class PubServer(BaseHTTPRequestHandler):
return True
def _mastoApi(self, path: str, callingDomain: str,
+ uaStr: str,
authorized: bool, httpPrefix: str,
baseDir: str, nickname: str, domain: str,
domainFull: str,
@@ -1041,18 +1086,19 @@ class PubServer(BaseHTTPRequestHandler):
projectVersion: str,
customEmoji: [],
showNodeInfoAccounts: bool) -> bool:
- return self._mastoApiV1(path, callingDomain, authorized,
+ return self._mastoApiV1(path, callingDomain, uaStr, authorized,
httpPrefix, baseDir, nickname, domain,
domainFull, onionDomain, i2pDomain,
translate, registration, systemLanguage,
projectVersion, customEmoji,
showNodeInfoAccounts)
- def _nodeinfo(self, callingDomain: str) -> bool:
+ def _nodeinfo(self, uaStr: str, callingDomain: str) -> bool:
if not self.path.startswith('/nodeinfo/2.0'):
return False
if self.server.debug:
print('DEBUG: nodeinfo ' + self.path)
+ self._updateKnownCrawlers(uaStr)
# If we are in broch mode then don't show potentially
# sensitive metadata.
@@ -1091,7 +1137,7 @@ class PubServer(BaseHTTPRequestHandler):
self._set_headers('application/ld+json', msglen,
None, callingDomain, True)
self._write(msg)
- print('nodeinfo sent')
+ print('nodeinfo sent to ' + callingDomain)
return True
self._404()
return True
@@ -11819,6 +11865,36 @@ class PubServer(BaseHTTPRequestHandler):
return True
return False
+ def _showKnownCrawlers(self, callingDomain: str, path: str,
+                        baseDir: str, knownCrawlers: {}) -> bool:
+     """Show moderators a plain text list of crawlers seen in the last
+     30 days, most hits first. Returns True if the path was handled
+     """
+     if '/users/' not in path:
+         return False
+     if not path.endswith('/crawlers'):
+         return False
+     nickname = getNicknameFromActor(path)
+     if not nickname:
+         return False
+     if not isModerator(baseDir, nickname):
+         return False
+     currTime = int(time.time())
+     recentCrawlers = 60 * 60 * 24 * 30
+     # age is now minus lastseen; the reverse is never positive
+     crawlersList = [(item['hits'], uaStr)
+                     for uaStr, item in knownCrawlers.items()
+                     if currTime - item['lastseen'] < recentCrawlers]
+     # sort on the numeric hit count so that 10 ranks above 9
+     crawlersList.sort(reverse=True)
+     msg = ''.join(str(hits) + ' ' + uaStr + '\n'
+                   for hits, uaStr in crawlersList).encode('utf-8')
+     msglen = len(msg)
+     self._set_headers('text/plain; charset=utf-8', msglen,
+                       None, callingDomain, True)
+     self._write(msg)
+     return True
+
def _editProfile(self, callingDomain: str, path: str,
translate: {}, baseDir: str,
httpPrefix: str, domain: str, port: int,
@@ -12113,7 +12189,7 @@ class PubServer(BaseHTTPRequestHandler):
# Since fediverse crawlers are quite active,
# make returning info to them high priority
# get nodeinfo endpoint
- if self._nodeinfo(callingDomain):
+ if self._nodeinfo(uaStr, callingDomain):
return
fitnessPerformance(GETstartTime, self.server.fitness,
@@ -12446,7 +12522,8 @@ class PubServer(BaseHTTPRequestHandler):
return
# minimal mastodon api
- if self._mastoApi(self.path, callingDomain, authorized,
+ if self._mastoApi(self.path, callingDomain, uaStr,
+ authorized,
self.server.httpPrefix,
self.server.baseDir,
self.authorizedNickname,
@@ -14349,6 +14426,12 @@ class PubServer(BaseHTTPRequestHandler):
self.server.GETbusy = False
return
+ # list of known crawlers accessing nodeinfo or masto API
+ if self._showKnownCrawlers(callingDomain, self.path,
+ self.server.baseDir,
+ self.server.knownCrawlers):
+ return
+
# edit profile in web interface
if self._editProfile(callingDomain, self.path,
self.server.translate,
@@ -17302,6 +17385,15 @@ def runDaemon(listsEnabled: str,
createNewsInbox(baseDir, domain, port, httpPrefix)
setConfigParam(baseDir, "listsEnabled", "Murdoch press")
+ # dict of known web crawlers accessing nodeinfo or the masto API
+ # and how many times they have been seen
+ httpd.knownCrawlers = {}
+ knownCrawlersFilename = baseDir + '/accounts/knownCrawlers.json'
+ if os.path.isfile(knownCrawlersFilename):
+ httpd.knownCrawlers = loadJson(knownCrawlersFilename)
+ # when was the last crawler seen?
+ httpd.lastKnownCrawler = 0
+
if listsEnabled:
httpd.listsEnabled = listsEnabled
else:
diff --git a/mastoapiv1.py b/mastoapiv1.py
index 18722dfce..9661b31d5 100644
--- a/mastoapiv1.py
+++ b/mastoapiv1.py
@@ -82,6 +82,7 @@ def _getMastoApiV1Account(baseDir: str, nickname: str, domain: str) -> {}:
def mastoApiV1Response(path: str, callingDomain: str,
+ uaStr: str,
authorized: bool,
httpPrefix: str,
baseDir: str, nickname: str, domain: str,
@@ -100,12 +101,18 @@ def mastoApiV1Response(path: str, callingDomain: str,
"""
sendJson = None
sendJsonStr = ''
+ if not uaStr:
+ uaStr = ''
# parts of the api needing authorization
if authorized and nickname:
if path == '/api/v1/accounts/verify_credentials':
sendJson = _getMastoApiV1Account(baseDir, nickname, domain)
- sendJsonStr = 'masto API account sent for ' + nickname
+ sendJsonStr = \
+ 'masto API account sent for ' + nickname + ' ' + uaStr
+
+ # information about where the request is coming from
+ callingInfo = ' ' + uaStr + ', ' + callingDomain
# Parts of the api which don't need authorization
mastoId = _getMastApiV1Id(path)
@@ -121,57 +128,73 @@ def mastoApiV1Response(path: str, callingDomain: str,
path = path.split('?')[0]
if path.endswith('/followers'):
sendJson = []
- sendJsonStr = 'masto API followers sent for ' + nickname
+ sendJsonStr = \
+ 'masto API followers sent for ' + nickname + \
+ callingInfo
elif path.endswith('/following'):
sendJson = []
- sendJsonStr = 'masto API following sent for ' + nickname
+ sendJsonStr = \
+ 'masto API following sent for ' + nickname + \
+ callingInfo
elif path.endswith('/statuses'):
sendJson = []
- sendJsonStr = 'masto API statuses sent for ' + nickname
+ sendJsonStr = \
+ 'masto API statuses sent for ' + nickname + \
+ callingInfo
elif path.endswith('/search'):
sendJson = []
- sendJsonStr = 'masto API search sent ' + originalPath
+ sendJsonStr = \
+ 'masto API search sent ' + originalPath + \
+ callingInfo
elif path.endswith('/relationships'):
sendJson = []
sendJsonStr = \
- 'masto API relationships sent ' + originalPath
+ 'masto API relationships sent ' + originalPath + \
+ callingInfo
else:
sendJson = \
_getMastoApiV1Account(baseDir, pathNickname, domain)
- sendJsonStr = 'masto API account sent for ' + nickname
+ sendJsonStr = \
+ 'masto API account sent for ' + nickname + \
+ callingInfo
# NOTE: adding support for '/api/v1/directory seems to create
# federation problems, so avoid implementing that
if path.startswith('/api/v1/blocks'):
sendJson = []
- sendJsonStr = 'masto API instance blocks sent ' + path
+ sendJsonStr = \
+ 'masto API instance blocks sent ' + path + callingInfo
elif path.startswith('/api/v1/favorites'):
sendJson = []
- sendJsonStr = 'masto API favorites sent ' + path
+ sendJsonStr = 'masto API favorites sent ' + path + callingInfo
elif path.startswith('/api/v1/follow_requests'):
sendJson = []
- sendJsonStr = 'masto API follow requests sent ' + path
+ sendJsonStr = \
+ 'masto API follow requests sent ' + path + callingInfo
elif path.startswith('/api/v1/mutes'):
sendJson = []
- sendJsonStr = 'masto API mutes sent ' + path
+ sendJsonStr = \
+ 'masto API mutes sent ' + path + callingInfo
elif path.startswith('/api/v1/notifications'):
sendJson = []
- sendJsonStr = 'masto API notifications sent ' + path
+ sendJsonStr = \
+ 'masto API notifications sent ' + path + callingInfo
elif path.startswith('/api/v1/reports'):
sendJson = []
- sendJsonStr = 'masto API reports sent ' + path
+ sendJsonStr = 'masto API reports sent ' + path + callingInfo
elif path.startswith('/api/v1/statuses'):
sendJson = []
- sendJsonStr = 'masto API statuses sent ' + path
+ sendJsonStr = 'masto API statuses sent ' + path + callingInfo
elif path.startswith('/api/v1/timelines'):
sendJson = {
'error': 'This method requires an authenticated user'
}
- sendJsonStr = 'masto API timelines sent ' + path
+ sendJsonStr = 'masto API timelines sent ' + path + callingInfo
elif path.startswith('/api/v1/custom_emojis'):
sendJson = customEmoji
- sendJsonStr = 'masto API custom emojis sent ' + path
+ sendJsonStr = \
+ 'masto API custom emojis sent ' + path + callingInfo
adminNickname = getConfigParam(baseDir, 'admin')
if adminNickname and path == '/api/v1/instance':
@@ -208,7 +231,7 @@ def mastoApiV1Response(path: str, callingDomain: str,
registration,
systemLanguage,
projectVersion)
- sendJsonStr = 'masto API instance metadata sent'
+ sendJsonStr = 'masto API instance metadata sent ' + uaStr
elif path.startswith('/api/v1/instance/peers'):
# This is just a dummy result.
# Showing the full list of peers would have privacy implications.
@@ -216,8 +239,8 @@ def mastoApiV1Response(path: str, callingDomain: str,
# small instances a full list of peers would convey a lot of
# information about the interests of a small number of accounts
sendJson = ['mastodon.social', domainFull]
- sendJsonStr = 'masto API peers metadata sent'
+ sendJsonStr = 'masto API peers metadata sent ' + uaStr
elif path.startswith('/api/v1/instance/activity'):
sendJson = []
- sendJsonStr = 'masto API activity metadata sent'
+ sendJsonStr = 'masto API activity metadata sent ' + uaStr
return sendJson, sendJsonStr
diff --git a/translations/ar.json b/translations/ar.json
index 2ae899465..4b46432fb 100644
--- a/translations/ar.json
+++ b/translations/ar.json
@@ -489,5 +489,6 @@
"Join": "انضم",
"Leave": "يترك",
"System Monitor": "مراقب النظام",
- "Add content warnings for the following sites": "أضف تحذيرات المحتوى للمواقع التالية"
+ "Add content warnings for the following sites": "أضف تحذيرات المحتوى للمواقع التالية",
+ "Known Web Crawlers": "برامج زحف الويب المعروفة"
}
diff --git a/translations/ca.json b/translations/ca.json
index f4861d724..59fbdc8d1 100644
--- a/translations/ca.json
+++ b/translations/ca.json
@@ -489,5 +489,6 @@
"Join": "Uneix-te",
"Leave": "Marxa",
"System Monitor": "Monitor del sistema",
- "Add content warnings for the following sites": "Afegiu advertiments de contingut per als llocs següents"
+ "Add content warnings for the following sites": "Afegiu advertiments de contingut per als llocs següents",
+ "Known Web Crawlers": "Exploradors web coneguts"
}
diff --git a/translations/cy.json b/translations/cy.json
index f25c5a27c..79339634a 100644
--- a/translations/cy.json
+++ b/translations/cy.json
@@ -489,5 +489,6 @@
"Join": "Ymunwch",
"Leave": "Gadewch",
"System Monitor": "Monitor System",
- "Add content warnings for the following sites": "Ychwanegwch rybuddion cynnwys ar gyfer y gwefannau canlynol"
+ "Add content warnings for the following sites": "Ychwanegwch rybuddion cynnwys ar gyfer y gwefannau canlynol",
+ "Known Web Crawlers": "Crawlers Gwe Hysbys"
}
diff --git a/translations/de.json b/translations/de.json
index e2760a864..9852d7703 100644
--- a/translations/de.json
+++ b/translations/de.json
@@ -489,5 +489,6 @@
"Join": "Verbinden",
"Leave": "Verlassen",
"System Monitor": "Systemmonitor",
- "Add content warnings for the following sites": "Inhaltswarnungen für die folgenden Websites hinzufügen"
+ "Add content warnings for the following sites": "Inhaltswarnungen für die folgenden Websites hinzufügen",
+ "Known Web Crawlers": "Bekannte Web-Crawler"
}
diff --git a/translations/en.json b/translations/en.json
index 4e8d3ec13..1efb610ef 100644
--- a/translations/en.json
+++ b/translations/en.json
@@ -489,5 +489,6 @@
"Join": "Join",
"Leave": "Leave",
"System Monitor": "System Monitor",
- "Add content warnings for the following sites": "Add content warnings for the following sites"
+ "Add content warnings for the following sites": "Add content warnings for the following sites",
+ "Known Web Crawlers": "Known Web Crawlers"
}
diff --git a/translations/es.json b/translations/es.json
index 67cf33835..0c88b4250 100644
--- a/translations/es.json
+++ b/translations/es.json
@@ -489,5 +489,6 @@
"Join": "Entrar",
"Leave": "Dejar",
"System Monitor": "Monitor del sistema",
- "Add content warnings for the following sites": "Agregue advertencias de contenido para los siguientes sitios"
+ "Add content warnings for the following sites": "Agregue advertencias de contenido para los siguientes sitios",
+ "Known Web Crawlers": "Rastreadores web conocidos"
}
diff --git a/translations/fr.json b/translations/fr.json
index 056a914c0..05f4284bd 100644
--- a/translations/fr.json
+++ b/translations/fr.json
@@ -489,5 +489,6 @@
"Join": "Rejoindre",
"Leave": "Laisser",
"System Monitor": "Moniteur système",
- "Add content warnings for the following sites": "Ajouter des avertissements de contenu pour les sites suivants"
+ "Add content warnings for the following sites": "Ajouter des avertissements de contenu pour les sites suivants",
+ "Known Web Crawlers": "Crawlers Web connus"
}
diff --git a/translations/ga.json b/translations/ga.json
index 73f740bcf..dc0e6f802 100644
--- a/translations/ga.json
+++ b/translations/ga.json
@@ -489,5 +489,6 @@
"Join": "Bí páirteach",
"Leave": "Fág",
"System Monitor": "Monatóir Córais",
- "Add content warnings for the following sites": "Cuir rabhaidh ábhair leis na suíomhanna seo a leanas"
+ "Add content warnings for the following sites": "Cuir rabhaidh ábhair leis na suíomhanna seo a leanas",
+ "Known Web Crawlers": "Crawlers Gréasáin Aitheanta"
}
diff --git a/translations/hi.json b/translations/hi.json
index c4e475f2d..bf52fae8b 100644
--- a/translations/hi.json
+++ b/translations/hi.json
@@ -489,5 +489,6 @@
"Join": "शामिल हों",
"Leave": "छोड़ना",
"System Monitor": "सिस्टम मॉनिटर",
- "Add content warnings for the following sites": "निम्नलिखित साइटों के लिए सामग्री चेतावनियाँ जोड़ें"
+ "Add content warnings for the following sites": "निम्नलिखित साइटों के लिए सामग्री चेतावनियाँ जोड़ें",
+ "Known Web Crawlers": "ज्ञात वेब क्रॉलर"
}
diff --git a/translations/it.json b/translations/it.json
index 4648aedae..41341a01d 100644
--- a/translations/it.json
+++ b/translations/it.json
@@ -489,5 +489,6 @@
"Join": "Aderire",
"Leave": "Lasciare",
"System Monitor": "Monitor di sistema",
- "Add content warnings for the following sites": "Aggiungi avvisi sui contenuti per i seguenti siti"
+ "Add content warnings for the following sites": "Aggiungi avvisi sui contenuti per i seguenti siti",
+ "Known Web Crawlers": "Crawler Web conosciuti"
}
diff --git a/translations/ja.json b/translations/ja.json
index 0fe46e4f4..d96250ced 100644
--- a/translations/ja.json
+++ b/translations/ja.json
@@ -489,5 +489,6 @@
"Join": "加入",
"Leave": "離れる",
"System Monitor": "システムモニター",
- "Add content warnings for the following sites": "次のサイトのコンテンツ警告を追加します"
+ "Add content warnings for the following sites": "次のサイトのコンテンツ警告を追加します",
+ "Known Web Crawlers": "既知のWebクローラー"
}
diff --git a/translations/ku.json b/translations/ku.json
index 1c4f7dd19..946d077be 100644
--- a/translations/ku.json
+++ b/translations/ku.json
@@ -489,5 +489,6 @@
"Join": "Bihevgirêdan",
"Leave": "Terikandin",
"System Monitor": "System Monitor",
- "Add content warnings for the following sites": "Ji bo malperên jêrîn hişyariyên naverokê zêde bikin"
+ "Add content warnings for the following sites": "Ji bo malperên jêrîn hişyariyên naverokê zêde bikin",
+ "Known Web Crawlers": "Crawlerên Webê yên naskirî"
}
diff --git a/translations/oc.json b/translations/oc.json
index d898dfa43..d8f72b430 100644
--- a/translations/oc.json
+++ b/translations/oc.json
@@ -485,5 +485,6 @@
"Join": "Join",
"Leave": "Leave",
"System Monitor": "System Monitor",
- "Add content warnings for the following sites": "Add content warnings for the following sites"
+ "Add content warnings for the following sites": "Add content warnings for the following sites",
+ "Known Web Crawlers": "Known Web Crawlers"
}
diff --git a/translations/pt.json b/translations/pt.json
index 789df52ea..b1056b4f4 100644
--- a/translations/pt.json
+++ b/translations/pt.json
@@ -489,5 +489,6 @@
"Join": "Juntar",
"Leave": "Sair",
"System Monitor": "Monitor de Sistema",
- "Add content warnings for the following sites": "Adicione avisos de conteúdo para os seguintes sites"
+ "Add content warnings for the following sites": "Adicione avisos de conteúdo para os seguintes sites",
+ "Known Web Crawlers": "Rastreadores da Web conhecidos"
}
diff --git a/translations/ru.json b/translations/ru.json
index 2352871d7..eb71190d0 100644
--- a/translations/ru.json
+++ b/translations/ru.json
@@ -489,5 +489,6 @@
"Join": "Присоединиться",
"Leave": "Оставлять",
"System Monitor": "Системный монитор",
- "Add content warnings for the following sites": "Добавить предупреждения о содержании для следующих сайтов"
+ "Add content warnings for the following sites": "Добавить предупреждения о содержании для следующих сайтов",
+ "Known Web Crawlers": "Известные веб-сканеры"
}
diff --git a/translations/sw.json b/translations/sw.json
index 3e7aa7b8d..a7e078946 100644
--- a/translations/sw.json
+++ b/translations/sw.json
@@ -489,5 +489,6 @@
"Join": "Jiunge",
"Leave": "Ondoka",
"System Monitor": "Ufuatiliaji wa Mfumo",
- "Add content warnings for the following sites": "Ongeza maonyo ya yaliyomo kwa wavuti zifuatazo"
+ "Add content warnings for the following sites": "Ongeza maonyo ya yaliyomo kwa wavuti zifuatazo",
+ "Known Web Crawlers": "Watambaji Wavuti Wanaojulikana"
}
diff --git a/translations/zh.json b/translations/zh.json
index e7104146b..3be4afa78 100644
--- a/translations/zh.json
+++ b/translations/zh.json
@@ -489,5 +489,6 @@
"Join": "加入",
"Leave": "离开",
"System Monitor": "系统监视器",
- "Add content warnings for the following sites": "为以下网站添加内容警告"
+ "Add content warnings for the following sites": "为以下网站添加内容警告",
+ "Known Web Crawlers": "已知的网络爬虫"
}
diff --git a/utils.py b/utils.py
index 47572bfb0..45e6e4152 100644
--- a/utils.py
+++ b/utils.py
@@ -715,7 +715,7 @@ def getStatusNumber(publishedStr: str = None) -> (str, str):
def evilIncarnate() -> []:
- return ('gab.com', 'gabfed.com', 'spinster.xyz',
+ return ('fedilist.com', 'gab.com', 'gabfed.com', 'spinster.xyz',
'kiwifarms.cc', 'djitter.com')
diff --git a/webapp_profile.py b/webapp_profile.py
index 914d212c8..1f29a368d 100644
--- a/webapp_profile.py
+++ b/webapp_profile.py
@@ -1520,8 +1520,8 @@ def _htmlEditProfileSharedItems(baseDir: str, nickname: str, domain: str,
def _htmlEditProfileFiltering(baseDir: str, nickname: str, domain: str,
- userAgentsBlocked: str, translate: {},
- replyIntervalHours: int,
+ userAgentsBlocked: str,
+ translate: {}, replyIntervalHours: int,
CWlists: {}, listsEnabled: str) -> str:
"""Filtering and blocking section of edit profile screen
"""
@@ -1669,6 +1669,10 @@ def _htmlEditProfileFiltering(baseDir: str, nickname: str, domain: str,
allowedInstancesStr + '\n'
if isModerator(baseDir, nickname):
+ editProfileForm += \
+ '<a href="/users/' + nickname + '/crawlers">' + \
+ translate['Known Web Crawlers'] + '</a><br>\n'
+
userAgentsBlockedStr = ''
for ua in userAgentsBlocked:
if userAgentsBlockedStr: