mirror of https://gitlab.com/bashrc2/epicyon
				
				
				
			Add crawlers module
							parent
							
								
									35883119be
								
							
						
					
					
						commit
						f4fc143b3a
					
				|  | @ -0,0 +1,120 @@ | |||
# Module metadata for crawlers.py: helpers for observing and blocking
# web crawlers / robots (based on the functions defined below, which
# track User-Agent hits and decide whether a request should be blocked).
__filename__ = "crawlers.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.3.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
 | ||||
| import time | ||||
| from utils import save_json | ||||
| from utils import user_agent_domain | ||||
| from blocking import update_blocked_cache | ||||
| from blocking import is_blocked_domain | ||||
| 
 | ||||
# Substrings which, when found in a lowercased User-Agent string,
# cause the request to be blocked unconditionally
# (see blocked_user_agent below)
default_user_agent_blocks = [
    'fedilist'
]
| 
 | ||||
| 
 | ||||
def update_known_crawlers(ua_str: str,
                          base_dir: str, known_crawlers: {},
                          last_known_crawler: int):
    """Record a hit from a crawler User-Agent accessing nodeinfo
    or the masto API.

    Increments the hit counter and last-seen time for ua_str within
    known_crawlers (mutated in place). If at least 30 seconds have
    passed since last_known_crawler, stale entries (not seen for
    thirty days) are pruned and the dictionary is saved to
    accounts/knownCrawlers.json under base_dir.

    Returns the current unix time, or None when ua_str is empty.
    """
    if not ua_str:
        return None

    curr_time = int(time.time())
    entry = known_crawlers.get(ua_str)
    if entry:
        # previously seen crawler: bump its counters
        entry['hits'] += 1
        entry['lastseen'] = curr_time
    else:
        # first observation of this user agent
        known_crawlers[ua_str] = {
            "lastseen": curr_time,
            "hits": 1
        }

    if curr_time - last_known_crawler >= 30:
        # drop observations older than thirty days
        stale = [uagent for uagent, item in known_crawlers.items()
                 if curr_time - item['lastseen'] >= 60 * 60 * 24 * 30]
        for uagent in stale:
            del known_crawlers[uagent]
        # persist the crawler list
        save_json(known_crawlers,
                  base_dir + '/accounts/knownCrawlers.json')
    return curr_time
| 
 | ||||
| 
 | ||||
def blocked_user_agent(calling_domain: str, agent_str: str,
                       news_instance: bool, debug: bool,
                       user_agents_blocked: [],
                       blocked_cache_last_updated,
                       base_dir: str,
                       blocked_cache: [],
                       blocked_cache_update_secs: int,
                       crawlers_allowed: []):
    """Should a GET or POST be blocked based upon its user agent?

    Returns a tuple (blocked, blocked_cache_last_updated) where
    blocked is True when the request should be rejected, and
    blocked_cache_last_updated is the (possibly refreshed) timestamp
    of the blocked-domains cache.
    """
    if not agent_str:
        # no User-Agent header is present: allow the request.
        # (The original code also had a second, unreachable branch
        # returning True for this case; the reachable behavior -
        # returning False - is preserved here.)
        return False, blocked_cache_last_updated

    agent_str_lower = agent_str.lower()

    # user agents which are always blocked
    for ua_block in default_user_agent_blocks:
        if ua_block in agent_str_lower:
            print('Blocked User agent: ' + ua_block)
            return True, blocked_cache_last_updated

    # is this a web crawler? If so then block it
    if 'bot/' in agent_str_lower or 'bot-' in agent_str_lower:
        # if this is a news instance then we want it
        # to be indexed by search engines
        if news_instance:
            return False, blocked_cache_last_updated
        # is this crawler allowed?
        for crawler in crawlers_allowed:
            if crawler.lower() in agent_str_lower:
                return False, blocked_cache_last_updated
        print('Blocked Crawler: ' + agent_str)
        return True, blocked_cache_last_updated

    # get domain name from User-Agent
    agent_domain = user_agent_domain(agent_str, debug)

    # is the User-Agent type blocked? eg. "Mastodon"
    if user_agents_blocked:
        # case-sensitive substring match, as in the original
        for agent_name in user_agents_blocked:
            if agent_name in agent_str:
                return True, blocked_cache_last_updated

    if not agent_domain:
        return False, blocked_cache_last_updated

    # is the User-Agent domain blocked
    blocked_ua = False
    if not agent_domain.startswith(calling_domain):
        # refresh the blocked-domains cache if it has expired
        blocked_cache_last_updated = \
            update_blocked_cache(base_dir, blocked_cache,
                                 blocked_cache_last_updated,
                                 blocked_cache_update_secs)

        blocked_ua = \
            is_blocked_domain(base_dir, agent_domain, blocked_cache)
        # if self.server.debug:
        if blocked_ua:
            print('Blocked User agent: ' + agent_domain)
    return blocked_ua, blocked_cache_last_updated
							
								
								
									
										24
									
								
								daemon.py
								
								
								
								
							
							
						
						
									
										24
									
								
								daemon.py
								
								
								
								
							|  | @ -6689,6 +6689,29 @@ class PubServer(BaseHTTPRequestHandler): | |||
|                             set_config_param(base_dir, 'userAgentsBlocked', | ||||
|                                              user_agents_blocked_str) | ||||
| 
 | ||||
|                         # save allowed web crawlers | ||||
|                         crawlers_allowed = [] | ||||
|                         if fields.get('crawlersAllowedStr'): | ||||
|                             crawlers_allowed_str = \ | ||||
|                                 fields['crawlersAllowedStr'] | ||||
|                             crawlers_allowed_list = \ | ||||
|                                 crawlers_allowed_str.split('\n') | ||||
|                             for uagent in crawlers_allowed_list: | ||||
|                                 if uagent in crawlers_allowed: | ||||
|                                     continue | ||||
|                                 crawlers_allowed.append(uagent.strip()) | ||||
|                         if str(self.server.crawlers_allowed) != \ | ||||
|                            str(crawlers_allowed): | ||||
|                             self.server.crawlers_allowed = \ | ||||
|                                 crawlers_allowed | ||||
|                             crawlers_allowed_str = '' | ||||
|                             for uagent in crawlers_allowed: | ||||
|                                 if crawlers_allowed_str: | ||||
|                                     crawlers_allowed_str += ',' | ||||
|                                 crawlers_allowed_str += uagent | ||||
|                             set_config_param(base_dir, 'crawlersAllowed', | ||||
|                                              crawlers_allowed_str) | ||||
| 
 | ||||
|                         # save peertube instances list | ||||
|                         peertube_instances_file = \ | ||||
|                             base_dir + '/accounts/peertube.txt' | ||||
|  | @ -13733,6 +13756,7 @@ class PubServer(BaseHTTPRequestHandler): | |||
|                                     self.server.text_mode_banner, | ||||
|                                     city, | ||||
|                                     self.server.user_agents_blocked, | ||||
|                                     self.server.crawlers_allowed, | ||||
|                                     access_keys, | ||||
|                                     default_reply_interval_hrs, | ||||
|                                     self.server.cw_lists, | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "أظهر من أحب هذا المنشور", | ||||
|     "Show who repeated this post": "أظهر من كرر هذا المنصب", | ||||
|     "Repeated by": "يتكرر بواسطة", | ||||
|     "Register": "يسجل" | ||||
|     "Register": "يسجل", | ||||
|     "Web Crawlers Allowed": "برامج زحف الويب المسموح بها" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Mostra a qui li agrada aquesta publicació", | ||||
|     "Show who repeated this post": "Mostra qui ha repetit aquesta publicació", | ||||
|     "Repeated by": "Repetit per", | ||||
|     "Register": "Registra't" | ||||
|     "Register": "Registra't", | ||||
|     "Web Crawlers Allowed": "Es permeten rastrejadors web" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Dangoswch pwy oedd yn hoffi'r post hwn", | ||||
|     "Show who repeated this post": "Dangoswch pwy ailadroddodd y post hwn", | ||||
|     "Repeated by": "Ailadrodd gan", | ||||
|     "Register": "Cofrestrwch" | ||||
|     "Register": "Cofrestrwch", | ||||
|     "Web Crawlers Allowed": "Caniatáu Ymlusgwyr Gwe" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Zeigen, wem dieser Beitrag gefallen hat", | ||||
|     "Show who repeated this post": "Zeigen Sie, wer diesen Beitrag wiederholt hat", | ||||
|     "Repeated by": "Wiederholt von", | ||||
|     "Register": "Registrieren" | ||||
|     "Register": "Registrieren", | ||||
|     "Web Crawlers Allowed": "Webcrawler erlaubt" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Show who liked this post", | ||||
|     "Show who repeated this post": "Show who repeated this post", | ||||
|     "Repeated by": "Repeated by", | ||||
|     "Register": "Register" | ||||
|     "Register": "Register", | ||||
|     "Web Crawlers Allowed": "Web Crawlers Allowed" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Mostrar a quién le gustó esta publicación", | ||||
|     "Show who repeated this post": "Mostrar quién repitió esta publicación", | ||||
|     "Repeated by": "Repetido por", | ||||
|     "Register": "Registrarse" | ||||
|     "Register": "Registrarse", | ||||
|     "Web Crawlers Allowed": "Rastreadores web permitidos" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Montrer qui a aimé ce post", | ||||
|     "Show who repeated this post": "Montrer qui a répété ce post", | ||||
|     "Repeated by": "Répété par", | ||||
|     "Register": "S'inscrire" | ||||
|     "Register": "S'inscrire", | ||||
|     "Web Crawlers Allowed": "Robots d'exploration Web autorisés" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Taispeáin cé a thaitin an postáil seo", | ||||
|     "Show who repeated this post": "Taispeáin cé a rinne an postáil seo arís", | ||||
|     "Repeated by": "Arís agus arís eile ag", | ||||
|     "Register": "Clár" | ||||
|     "Register": "Clár", | ||||
|     "Web Crawlers Allowed": "Crawlers Gréasáin Ceadaithe" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "दिखाएँ कि इस पोस्ट को किसने पसंद किया", | ||||
|     "Show who repeated this post": "दिखाएं कि इस पोस्ट को किसने दोहराया", | ||||
|     "Repeated by": "द्वारा दोहराया गया", | ||||
|     "Register": "रजिस्टर करें" | ||||
|     "Register": "रजिस्टर करें", | ||||
|     "Web Crawlers Allowed": "वेब क्रॉलर की अनुमति है" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Mostra a chi è piaciuto questo post", | ||||
|     "Show who repeated this post": "Mostra chi ha ripetuto questo post", | ||||
|     "Repeated by": "Ripetuto da", | ||||
|     "Register": "Registrati" | ||||
|     "Register": "Registrati", | ||||
|     "Web Crawlers Allowed": "Web crawler consentiti" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "この投稿を高く評価した人を表示する", | ||||
|     "Show who repeated this post": "この投稿を繰り返した人を表示する", | ||||
|     "Repeated by": "によって繰り返される", | ||||
|     "Register": "登録" | ||||
|     "Register": "登録", | ||||
|     "Web Crawlers Allowed": "許可されるWebクローラー" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "이 포스트를 좋아한 사람 표시", | ||||
|     "Show who repeated this post": "이 포스트를 반복한 사람 표시", | ||||
|     "Repeated by": "반복한 사람", | ||||
|     "Register": "등록" | ||||
|     "Register": "등록", | ||||
|     "Web Crawlers Allowed": "웹 크롤러 허용" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Nîşan bide kê ev post eciband", | ||||
|     "Show who repeated this post": "Nîşan bide kê ev post dubare kiriye", | ||||
|     "Repeated by": "Ji hêla dubare kirin", | ||||
|     "Register": "Fêhrist" | ||||
|     "Register": "Fêhrist", | ||||
|     "Web Crawlers Allowed": "Crawlers Web Destûrdar in" | ||||
| } | ||||
|  |  | |||
|  | @ -511,5 +511,6 @@ | |||
|     "Show who liked this post": "Show who liked this post", | ||||
|     "Show who repeated this post": "Show who repeated this post", | ||||
|     "Repeated by": "Repeated by", | ||||
|     "Register": "Register" | ||||
|     "Register": "Register", | ||||
|     "Web Crawlers Allowed": "Web Crawlers Allowed" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Pokaż, kto polubił ten post", | ||||
|     "Show who repeated this post": "Pokaż, kto powtórzył ten post", | ||||
|     "Repeated by": "Powtórzone przez", | ||||
|     "Register": "Zarejestrować" | ||||
|     "Register": "Zarejestrować", | ||||
|     "Web Crawlers Allowed": "Dozwolone roboty sieciowe" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Mostrar quem gostou deste post", | ||||
|     "Show who repeated this post": "Mostrar quem repetiu esta postagem", | ||||
|     "Repeated by": "Repetido por", | ||||
|     "Register": "Registro" | ||||
|     "Register": "Registro", | ||||
|     "Web Crawlers Allowed": "Rastreadores da Web permitidos" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Показать, кому понравился этот пост", | ||||
|     "Show who repeated this post": "Показать, кто повторил этот пост", | ||||
|     "Repeated by": "Повторено", | ||||
|     "Register": "регистр" | ||||
|     "Register": "регистр", | ||||
|     "Web Crawlers Allowed": "Веб-сканеры разрешены" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Onyesha ni nani aliyependa chapisho hili", | ||||
|     "Show who repeated this post": "Onyesha ni nani aliyerudia chapisho hili", | ||||
|     "Repeated by": "Imerudiwa na", | ||||
|     "Register": "Sajili" | ||||
|     "Register": "Sajili", | ||||
|     "Web Crawlers Allowed": "Watambazaji Wavuti Zinaruhusiwa" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "Покажіть, кому сподобався цей пост", | ||||
|     "Show who repeated this post": "Покажіть, хто повторив цей пост", | ||||
|     "Repeated by": "Повторюється за", | ||||
|     "Register": "Реєстрація" | ||||
|     "Register": "Реєстрація", | ||||
|     "Web Crawlers Allowed": "Веб-сканери дозволені" | ||||
| } | ||||
|  |  | |||
|  | @ -515,5 +515,6 @@ | |||
|     "Show who liked this post": "显示谁喜欢这篇文章", | ||||
|     "Show who repeated this post": "显示谁重复了这篇文章", | ||||
|     "Repeated by": "重复", | ||||
|     "Register": "登记" | ||||
|     "Register": "登记", | ||||
|     "Web Crawlers Allowed": "允许网络爬虫" | ||||
| } | ||||
|  |  | |||
|  | @ -1631,6 +1631,7 @@ def _html_edit_profile_shared_items(base_dir: str, nickname: str, domain: str, | |||
| 
 | ||||
| def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str, | ||||
|                                  user_agents_blocked: str, | ||||
|                                  crawlers_allowed: str, | ||||
|                                  translate: {}, reply_interval_hours: int, | ||||
|                                  cw_lists: {}, lists_enabled: str) -> str: | ||||
|     """Filtering and blocking section of edit profile screen | ||||
|  | @ -1807,6 +1808,16 @@ def _html_edit_profile_filtering(base_dir: str, nickname: str, domain: str, | |||
|                            'userAgentsBlockedStr', user_agents_blocked_str, | ||||
|                            200, '', False) | ||||
| 
 | ||||
|         crawlers_allowed_str = '' | ||||
|         for uagent in crawlers_allowed: | ||||
|             if crawlers_allowed_str: | ||||
|                 crawlers_allowed_str += '\n' | ||||
|             crawlers_allowed_str += uagent | ||||
|         edit_profile_form += \ | ||||
|             edit_text_area(translate['Web Crawlers Allowed'], | ||||
|                            'crawlersAllowedStr', crawlers_allowed_str, | ||||
|                            200, '', False) | ||||
| 
 | ||||
|         cw_lists_str = '' | ||||
|         for name, _ in cw_lists.items(): | ||||
|             variablename = get_cw_list_variable(name) | ||||
|  | @ -2137,7 +2148,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str, | |||
|                       default_timeline: str, theme: str, | ||||
|                       peertube_instances: [], | ||||
|                       text_mode_banner: str, city: str, | ||||
|                       user_agents_blocked: str, | ||||
|                       user_agents_blocked: [], | ||||
|                       crawlers_allowed: [], | ||||
|                       access_keys: {}, | ||||
|                       default_reply_interval_hrs: int, | ||||
|                       cw_lists: {}, lists_enabled: str) -> str: | ||||
|  | @ -2354,8 +2366,8 @@ def html_edit_profile(css_cache: {}, translate: {}, base_dir: str, path: str, | |||
|                                  default_reply_interval_hrs) | ||||
|     edit_profile_form += \ | ||||
|         _html_edit_profile_filtering(base_dir, nickname, domain, | ||||
|                                      user_agents_blocked, translate, | ||||
|                                      reply_interval_hours, | ||||
|                                      user_agents_blocked, crawlers_allowed, | ||||
|                                      translate, reply_interval_hours, | ||||
|                                      cw_lists, lists_enabled) | ||||
| 
 | ||||
|     # git projects section | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue