Add crawlers module

merge-requests/30/head
Bob Mottram 2022-03-06 12:56:26 +00:00
parent 35883119be
commit f4fc143b3a
23 changed files with 199 additions and 23 deletions

crawlers.py (new file, mode 100644)

@@ -0,0 +1,120 @@
__filename__ = "crawlers.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.3.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"

import time
from utils import save_json
from utils import user_agent_domain
from blocking import update_blocked_cache
from blocking import is_blocked_domain

default_user_agent_blocks = [
    'fedilist'
]


def update_known_crawlers(ua_str: str,
                          base_dir: str, known_crawlers: {},
                          last_known_crawler: int):
    """Updates a dictionary of known crawlers accessing nodeinfo
    or the masto API
    """
    if not ua_str:
        return None

    curr_time = int(time.time())
    if known_crawlers.get(ua_str):
        known_crawlers[ua_str]['hits'] += 1
        known_crawlers[ua_str]['lastseen'] = curr_time
    else:
        known_crawlers[ua_str] = {
            "lastseen": curr_time,
            "hits": 1
        }

    if curr_time - last_known_crawler >= 30:
        # remove any old observations
        remove_crawlers = []
        for uagent, item in known_crawlers.items():
            if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30:
                remove_crawlers.append(uagent)
        for uagent in remove_crawlers:
            del known_crawlers[uagent]
        # save the list of crawlers
        save_json(known_crawlers,
                  base_dir + '/accounts/knownCrawlers.json')
    return curr_time


def blocked_user_agent(calling_domain: str, agent_str: str,
                       news_instance: bool, debug: bool,
                       user_agents_blocked: [],
                       blocked_cache_last_updated,
                       base_dir: str,
                       blocked_cache: [],
                       blocked_cache_update_secs: int,
                       crawlers_allowed: []):
    """Should a GET or POST be blocked based upon its user agent?
    """
    if not agent_str:
        return False, blocked_cache_last_updated

    agent_str_lower = agent_str.lower()
    for ua_block in default_user_agent_blocks:
        if ua_block in agent_str_lower:
            print('Blocked User agent: ' + ua_block)
            return True, blocked_cache_last_updated

    agent_domain = None

    if agent_str:
        # is this a web crawler? If so then block it
        if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
            # if this is a news instance then we want it
            # to be indexed by search engines
            if news_instance:
                return False, blocked_cache_last_updated
            # is this crawler allowed?
            for crawler in crawlers_allowed:
                if crawler.lower() in agent_str_lower:
                    return False, blocked_cache_last_updated
            print('Blocked Crawler: ' + agent_str)
            return True, blocked_cache_last_updated
        # get domain name from User-Agent
        agent_domain = user_agent_domain(agent_str, debug)
    else:
        # no User-Agent header is present
        return True, blocked_cache_last_updated

    # is the User-Agent type blocked? eg. "Mastodon"
    if user_agents_blocked:
        blocked_ua = False
        for agent_name in user_agents_blocked:
            if agent_name in agent_str:
                blocked_ua = True
                break
        if blocked_ua:
            return True, blocked_cache_last_updated

    if not agent_domain:
        return False, blocked_cache_last_updated

    # is the User-Agent domain blocked
    blocked_ua = False
    if not agent_domain.startswith(calling_domain):
        blocked_cache_last_updated = \
            update_blocked_cache(base_dir, blocked_cache,
                                 blocked_cache_last_updated,
                                 blocked_cache_update_secs)

        blocked_ua = \
            is_blocked_domain(base_dir, agent_domain, blocked_cache)
        # if self.server.debug:
        if blocked_ua:
            print('Blocked User agent: ' + agent_domain)
    return blocked_ua, blocked_cache_last_updated
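
The module exposes only these two functions, which are meant to be called from the daemon's GET and POST handlers. A minimal sketch of that call pattern, assuming a `server` object carrying the relevant state (the attribute and header names below are illustrative assumptions, not taken from the commit):

from crawlers import blocked_user_agent, update_known_crawlers


def check_request_user_agent(headers: {}, server) -> bool:
    """Returns True if the request should be rejected.
    Sketch only: the server attributes used here are assumptions.
    """
    ua_str = headers.get('User-Agent', '')
    blocked, server.blocked_cache_last_updated = \
        blocked_user_agent(server.domain, ua_str,
                           server.news_instance, server.debug,
                           server.user_agents_blocked,
                           server.blocked_cache_last_updated,
                           server.base_dir,
                           server.blocked_cache,
                           server.blocked_cache_update_secs,
                           server.crawlers_allowed)
    if blocked:
        return True
    # record the crawler sighting; returns the current time,
    # or None if no user agent was supplied
    curr_time = update_known_crawlers(ua_str, server.base_dir,
                                      server.known_crawlers,
                                      server.last_known_crawler)
    if curr_time:
        server.last_known_crawler = curr_time
    return False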

@@ -6689,6 +6689,29 @@ class PubServer(BaseHTTPRequestHandler):
             set_config_param(base_dir, 'userAgentsBlocked',
                              user_agents_blocked_str)
+            # save allowed web crawlers
+            crawlers_allowed = []
+            if fields.get('crawlersAllowedStr'):
+                crawlers_allowed_str = \
+                    fields['crawlersAllowedStr']
+                crawlers_allowed_list = \
+                    crawlers_allowed_str.split('\n')
+                for uagent in crawlers_allowed_list:
+                    if uagent in crawlers_allowed:
+                        continue
+                    crawlers_allowed.append(uagent.strip())
+            if str(self.server.crawlers_allowed) != \
+               str(crawlers_allowed):
+                self.server.crawlers_allowed = \
+                    crawlers_allowed
+                crawlers_allowed_str = ''
+                for uagent in crawlers_allowed:
+                    if crawlers_allowed_str:
+                        crawlers_allowed_str += ','
+                    crawlers_allowed_str += uagent
+                set_config_param(base_dir, 'crawlersAllowed',
+                                 crawlers_allowed_str)
             # save peertube instances list
             peertube_instances_file = \
                 base_dir + '/accounts/peertube.txt'
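
The saved value is a comma-separated string under the 'crawlersAllowed' config key. The matching read-back at startup is not part of the hunks shown here; a sketch of how it could be loaded, assuming the existing get_config_param helper from utils:

from utils import get_config_param


def load_crawlers_allowed(base_dir: str) -> []:
    """Returns the allowed crawler user agents stored in config.json.
    Sketch only: the startup code for this commit is not shown above.
    """
    crawlers_allowed = []
    crawlers_allowed_str = get_config_param(base_dir, 'crawlersAllowed')
    if crawlers_allowed_str:
        for uagent in crawlers_allowed_str.split(','):
            uagent = uagent.strip()
            if uagent and uagent not in crawlers_allowed:
                crawlers_allowed.append(uagent)
    return crawlers_allowed
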
@@ -13733,6 +13756,7 @@ class PubServer(BaseHTTPRequestHandler):
                 self.server.text_mode_banner,
                 city,
                 self.server.user_agents_blocked,
+                self.server.crawlers_allowed,
                 access_keys,
                 default_reply_interval_hrs,
                 self.server.cw_lists,

@@ -515,5 +515,6 @@
     "Show who liked this post": "أظهر من أحب هذا المنشور",
     "Show who repeated this post": "أظهر من كرر هذا المنصب",
     "Repeated by": "يتكرر بواسطة",
-    "Register": "يسجل"
+    "Register": "يسجل",
+    "Web Crawlers Allowed": "برامج زحف الويب المسموح بها"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Mostra a qui li agrada aquesta publicació",
     "Show who repeated this post": "Mostra qui ha repetit aquesta publicació",
     "Repeated by": "Repetit per",
-    "Register": "Registra't"
+    "Register": "Registra't",
+    "Web Crawlers Allowed": "Es permeten rastrejadors web"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Dangoswch pwy oedd yn hoffi'r post hwn",
     "Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn",
     "Repeated by": "Ailadrodd gan",
-    "Register": "Cofrestrwch"
+    "Register": "Cofrestrwch",
+    "Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Zeigen, wem dieser Beitrag gefallen hat",
     "Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat",
     "Repeated by": "Wiederholt von",
-    "Register": "Registrieren"
+    "Register": "Registrieren",
+    "Web Crawlers Allowed": "Webcrawler erlaubt"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Show who liked this post",
     "Show who repeated this post": "Show who repeated this post",
     "Repeated by": "Repeated by",
-    "Register": "Register"
+    "Register": "Register",
+    "Web Crawlers Allowed": "Web Crawlers Allowed"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Mostrar a quién le gustó esta publicación",
     "Show who repeated this post": "Mostrar quién repitió esta publicación",
     "Repeated by": "Repetido por",
-    "Register": "Registrarse"
+    "Register": "Registrarse",
+    "Web Crawlers Allowed": "Rastreadores web permitidos"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Montrer qui a aimé ce post",
     "Show who repeated this post": "Montrer qui a répété ce post",
     "Repeated by": "Répété par",
-    "Register": "S'inscrire"
+    "Register": "S'inscrire",
+    "Web Crawlers Allowed": "Robots d'exploration Web autorisés"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Taispeáin cé a thaitin an postáil seo",
     "Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís",
     "Repeated by": "Arís agus arís eile ag",
-    "Register": "Clár"
+    "Register": "Clár",
+    "Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "दिखाएँ कि इस पोस्ट को किसने पसंद किया",
     "Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया",
     "Repeated by": "द्वारा दोहराया गया",
-    "Register": "रजिस्टर करें"
+    "Register": "रजिस्टर करें",
+    "Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Mostra a chi è piaciuto questo post",
     "Show who repeated this post": "Mostra chi ha ripetuto questo post",
     "Repeated by": "Ripetuto da",
-    "Register": "Registrati"
+    "Register": "Registrati",
+    "Web Crawlers Allowed": "Web crawler consentiti"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "この投稿を高く評価した人を表示する",
     "Show who repeated this post": "この投稿を繰り返した人を表示する",
     "Repeated by": "によって繰り返される",
-    "Register": "登録"
+    "Register": "登録",
+    "Web Crawlers Allowed": "許可されるWebクローラー"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "이 포스트를 좋아한 사람 표시",
     "Show who repeated this post": "이 포스트를 반복한 사람 표시",
     "Repeated by": "반복한 사람",
-    "Register": "등록"
+    "Register": "등록",
+    "Web Crawlers Allowed": "웹 크롤러 허용"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Nîşan bide kê ev post eciband",
     "Show who repeated this post": "Nîşan bide kê ev post dubare kiriye",
     "Repeated by": "Ji hêla dubare kirin",
-    "Register": "Fêhrist"
+    "Register": "Fêhrist",
+    "Web Crawlers Allowed": "Crawlers Web Destûrdar in"
 }

@@ -511,5 +511,6 @@
     "Show who liked this post": "Show who liked this post",
     "Show who repeated this post": "Show who repeated this post",
     "Repeated by": "Repeated by",
-    "Register": "Register"
+    "Register": "Register",
+    "Web Crawlers Allowed": "Web Crawlers Allowed"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Pokaż, kto polubił ten post",
     "Show who repeated this post": "Pokaż, kto powtórzył ten post",
     "Repeated by": "Powtórzone przez",
-    "Register": "Zarejestrować"
+    "Register": "Zarejestrować",
+    "Web Crawlers Allowed": "Dozwolone roboty sieciowe"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Mostrar quem gostou deste post",
     "Show who repeated this post": "Mostrar quem repetiu esta postagem",
     "Repeated by": "Repetido por",
-    "Register": "Registro"
+    "Register": "Registro",
+    "Web Crawlers Allowed": "Rastreadores da Web permitidos"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Показать, кому понравился этот пост",
     "Show who repeated this post": "Показать, кто повторил этот пост",
     "Repeated by": "Повторено",
-    "Register": "регистр"
+    "Register": "регистр",
+    "Web Crawlers Allowed": "Веб-сканеры разрешены"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Onyesha ni nani aliyependa chapisho hili",
     "Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili",
     "Repeated by": "Imerudiwa na",
-    "Register": "Sajili"
+    "Register": "Sajili",
+    "Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "Покажіть, кому сподобався цей пост",
     "Show who repeated this post": "Покажіть, хто повторив цей пост",
     "Repeated by": "Повторюється за",
-    "Register": "Реєстрація"
+    "Register": "Реєстрація",
+    "Web Crawlers Allowed": "Веб-сканери дозволені"
 }

@@ -515,5 +515,6 @@
     "Show who liked this post": "显示谁喜欢这篇文章",
     "Show who repeated this post": "显示谁重复了这篇文章",
     "Repeated by": "重复",
-    "Register": "登记"
+    "Register": "登记",
+    "Web Crawlers Allowed": "允许网络爬虫"
 }
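
Every translation file gains the same "Web Crawlers Allowed" key because the edit profile form looks the label up with translate['Web Crawlers Allowed']; a language file missing the key would raise KeyError when that form is rendered. A small standalone check, assuming the translations live as *.json files in a translations/ directory (the path is an assumption, not shown in this diff):

import json
import os


def files_missing_translation(base_dir: str, key: str) -> []:
    """Returns the translation files which do not define the given key"""
    missing = []
    translations_dir = os.path.join(base_dir, 'translations')
    for filename in sorted(os.listdir(translations_dir)):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(translations_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as fp_trans:
            if key not in json.load(fp_trans):
                missing.append(filename)
    return missing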

@@ -1631,6 +1631,7 @@ def _html_edit_profile_shared_items(base_dir: str, nickname: str, domain: str,
 def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                                  user_agents_blocked: str,
+                                 crawlers_allowed: str,
                                  translate: {}, reply_interval_hours: int,
                                  cw_lists: {}, lists_enabled: str) -> str:
     """Filtering and blocking section of edit profile screen

@@ -1807,6 +1808,16 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str,
                        'userAgentsBlockedStr', user_agents_blocked_str,
                        200, '', False)
+
+    crawlers_allowed_str = ''
+    for uagent in crawlers_allowed:
+        if crawlers_allowed_str:
+            crawlers_allowed_str += '\n'
+        crawlers_allowed_str += uagent
+    edit_profile_form += \
+        edit_text_area(translate['Web Crawlers Allowed'],
+                       'crawlersAllowedStr', crawlers_allowed_str,
+                       200, '', False)
 
     cw_lists_str = ''
     for name, _ in cw_lists.items():
         variablename = get_cw_list_variable(name)
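
Together with the daemon change above, the allowed-crawlers list round-trips through two encodings: newline-separated in the edit profile textarea and comma-separated in config.json. A standalone sketch of that round trip (the helper names are illustrative, not part of the commit):

def crawlers_to_textarea(crawlers_allowed: []) -> str:
    """List of user agents -> newline-separated textarea value"""
    return '\n'.join(crawlers_allowed)


def textarea_to_crawlers(crawlers_allowed_str: str) -> []:
    """Submitted textarea value -> de-duplicated list of user agents"""
    crawlers_allowed = []
    for uagent in crawlers_allowed_str.split('\n'):
        uagent = uagent.strip()
        if uagent and uagent not in crawlers_allowed:
            crawlers_allowed.append(uagent)
    return crawlers_allowed
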
@@ -2137,7 +2148,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str,
                       default_timeline: str, theme: str,
                       peertube_instances: [],
                       text_mode_banner: str, city: str,
-                      user_agents_blocked: str,
+                      user_agents_blocked: [],
+                      crawlers_allowed: [],
                       access_keys: {},
                       default_reply_interval_hrs: int,
                       cw_lists: {}, lists_enabled: str) -> str:

@@ -2354,8 +2366,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str,
                                          default_reply_interval_hrs)
     edit_profile_form += \
         _html_edit_profile_filtering(base_dir, nickname, domain,
-                                     user_agents_blocked, translate,
-                                     reply_interval_hours,
+                                     user_agents_blocked, crawlers_allowed,
+                                     translate, reply_interval_hours,
                                      cw_lists, lists_enabled)
     # git projects section