| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | __filename__ = "siteactive.py" | 
					
						
							|  |  |  | __author__ = "Bob Mottram" | 
					
						
							|  |  |  | __credits__ = ["webchk"] | 
					
						
							|  |  |  | __license__ = "AGPL3+" | 
					
						
							| 
									
										
										
										
											2022-02-03 13:58:20 +00:00
										 |  |  | __version__ = "1.3.0" | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | __maintainer__ = "Bob Mottram" | 
					
						
							| 
									
										
										
										
											2021-09-10 16:14:50 +00:00
										 |  |  | __email__ = "bob@libreserver.org" | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | __status__ = "Production" | 
					
						
							| 
									
										
										
										
											2021-06-26 11:16:41 +00:00
										 |  |  | __module_group__ = "Core" | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | import http.client | 
					
						
							|  |  |  | from urllib.parse import urlparse | 
					
						
							|  |  |  | import ssl | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class Result: | 
					
						
							|  |  |  |     """Holds result of an URL check.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The redirect attribute is a Result object that the URL was redirected to. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The sitemap_urls attribute will contain a list of Result object if url | 
					
						
							|  |  |  |     is a sitemap file and http_response() was run with parse set to True. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     def __init__(self, url): | 
					
						
							|  |  |  |         self.url = url | 
					
						
							|  |  |  |         self.status = 0 | 
					
						
							|  |  |  |         self.desc = '' | 
					
						
							|  |  |  |         self.headers = None | 
					
						
							|  |  |  |         self.latency = 0 | 
					
						
							|  |  |  |         self.content = '' | 
					
						
							|  |  |  |         self.redirect = None | 
					
						
							|  |  |  |         self.sitemap_urls = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __repr__(self): | 
					
						
							|  |  |  |         if self.status == 0: | 
					
						
							|  |  |  |             return '{} ... {}'.format(self.url, self.desc) | 
					
						
							|  |  |  |         return '{} ... {} {} ({})'.format( | 
					
						
							|  |  |  |             self.url, self.status, self.desc, self.latency | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def fill_headers(self, headers): | 
					
						
							|  |  |  |         """Takes a list of tuples and converts it a dictionary.""" | 
					
						
							|  |  |  |         self.headers = {h[0]: h[1] for h in headers} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _site_active_parse_url(url): | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     """Returns an object with properties representing
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     scheme:   URL scheme specifier | 
					
						
							|  |  |  |     netloc:   Network location part | 
					
						
							|  |  |  |     path:     Hierarchical path | 
					
						
							|  |  |  |     params:   Parameters for last path element | 
					
						
							|  |  |  |     query:    Query component | 
					
						
							|  |  |  |     fragment: Fragment identifier | 
					
						
							|  |  |  |     username: User name | 
					
						
							|  |  |  |     password: Password | 
					
						
							|  |  |  |     hostname: Host name (lower case) | 
					
						
							|  |  |  |     port:     Port number as integer, if present | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     loc = urlparse(url) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # if the scheme (http, https ...) is not available urlparse wont work | 
					
						
							|  |  |  |     if loc.scheme == "": | 
					
						
							|  |  |  |         url = "http://" + url | 
					
						
							|  |  |  |         loc = urlparse(url) | 
					
						
							|  |  |  |     return loc | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  | def _site_active_http_connect(loc, timeout: int): | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     """Connects to the host and returns an HTTP or HTTPS connections.""" | 
					
						
							|  |  |  |     if loc.scheme == "https": | 
					
						
							|  |  |  |         ssl_context = ssl.SSLContext() | 
					
						
							|  |  |  |         return http.client.HTTPSConnection( | 
					
						
							|  |  |  |             loc.netloc, context=ssl_context, timeout=timeout) | 
					
						
							|  |  |  |     return http.client.HTTPConnection(loc.netloc, timeout=timeout) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _site_active_http_request(loc, timeout: int): | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     """Performs a HTTP request and return response in a Result object.
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  |     conn = _site_active_http_connect(loc, timeout) | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     method = 'HEAD' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     conn.request(method, loc.path) | 
					
						
							|  |  |  |     resp = conn.getresponse() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     result = Result(loc.geturl()) | 
					
						
							|  |  |  |     result.status = resp.status | 
					
						
							|  |  |  |     result.desc = resp.reason | 
					
						
							|  |  |  |     result.fill_headers(resp.getheaders()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     conn.close() | 
					
						
							|  |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def site_is_active(url: str, timeout: int) -> bool: | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     """Returns true if the current url is resolvable.
 | 
					
						
							|  |  |  |     This can be used to check that an instance is online before | 
					
						
							|  |  |  |     trying to send posts to it. | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  |     if not url.startswith('http') and \ | 
					
						
							|  |  |  |        not url.startswith('ipfs') and \ | 
					
						
							|  |  |  |        not url.startswith('ipns'): | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |         return False | 
					
						
							|  |  |  |     if '.onion/' in url or '.i2p/' in url or \ | 
					
						
							|  |  |  |        url.endswith('.onion') or \ | 
					
						
							|  |  |  |        url.endswith('.i2p'): | 
					
						
							|  |  |  |         # skip this check for onion and i2p | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |     loc = _site_active_parse_url(url) | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     result = Result(url=url) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         result = _site_active_http_request(loc, timeout) | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if 400 <= result.status < 500: | 
					
						
							|  |  |  |             return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     except BaseException: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         print('EX: site_is_active ' + str(loc)) | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     return False | 
					
						
							| 
									
										
										
										
											2022-02-05 10:49:31 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def referer_is_active(http_prefix: str, | 
					
						
							|  |  |  |                       referer_domain: str, ua_str: str, | 
					
						
							|  |  |  |                       calling_site_timeout: int) -> bool: | 
					
						
							|  |  |  |     """Returns true if the given referer is an active website
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     referer_url = http_prefix + '://' + referer_domain | 
					
						
							|  |  |  |     if referer_domain + '/' in ua_str: | 
					
						
							|  |  |  |         referer_url = referer_url + ua_str.split(referer_domain)[1] | 
					
						
							|  |  |  |         ending_chars = (' ', ';', ')') | 
					
						
							|  |  |  |         for end_ch in ending_chars: | 
					
						
							|  |  |  |             if end_ch in referer_url: | 
					
						
							|  |  |  |                 referer_url = referer_url.split(end_ch)[0] | 
					
						
							|  |  |  |     return site_is_active(referer_url, calling_site_timeout) |