| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | __filename__ = "siteactive.py" | 
					
						
							|  |  |  | __author__ = "Bob Mottram" | 
					
						
							|  |  |  | __credits__ = ["webchk"] | 
					
						
							|  |  |  | __license__ = "AGPL3+" | 
					
						
							| 
									
										
										
										
											2024-01-21 19:01:20 +00:00
										 |  |  | __version__ = "1.5.0" | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | __maintainer__ = "Bob Mottram" | 
					
						
							| 
									
										
										
										
											2021-09-10 16:14:50 +00:00
										 |  |  | __email__ = "bob@libreserver.org" | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | __status__ = "Production" | 
					
						
							| 
									
										
										
										
											2021-06-26 11:16:41 +00:00
										 |  |  | __module_group__ = "Core" | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | import http.client | 
					
						
							|  |  |  | import ssl | 
					
						
							| 
									
										
										
										
											2024-05-12 12:35:26 +00:00
										 |  |  | from urllib.parse import urlparse | 
					
						
							|  |  |  | from utils import data_dir | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class Result: | 
					
						
							|  |  |  |     """Holds result of an URL check.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The redirect attribute is a Result object that the URL was redirected to. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The sitemap_urls attribute will contain a list of Result object if url | 
					
						
							|  |  |  |     is a sitemap file and http_response() was run with parse set to True. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     def __init__(self, url): | 
					
						
							|  |  |  |         self.url = url | 
					
						
							|  |  |  |         self.status = 0 | 
					
						
							|  |  |  |         self.desc = '' | 
					
						
							|  |  |  |         self.headers = None | 
					
						
							|  |  |  |         self.latency = 0 | 
					
						
							|  |  |  |         self.content = '' | 
					
						
							|  |  |  |         self.redirect = None | 
					
						
							|  |  |  |         self.sitemap_urls = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __repr__(self): | 
					
						
							|  |  |  |         if self.status == 0: | 
					
						
							|  |  |  |             return '{} ... {}'.format(self.url, self.desc) | 
					
						
							|  |  |  |         return '{} ... {} {} ({})'.format( | 
					
						
							|  |  |  |             self.url, self.status, self.desc, self.latency | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def fill_headers(self, headers): | 
					
						
							|  |  |  |         """Takes a list of tuples and converts it a dictionary.""" | 
					
						
							|  |  |  |         self.headers = {h[0]: h[1] for h in headers} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _site_active_parse_url(url): | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     """Returns an object with properties representing
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     scheme:   URL scheme specifier | 
					
						
							|  |  |  |     netloc:   Network location part | 
					
						
							|  |  |  |     path:     Hierarchical path | 
					
						
							|  |  |  |     params:   Parameters for last path element | 
					
						
							|  |  |  |     query:    Query component | 
					
						
							|  |  |  |     fragment: Fragment identifier | 
					
						
							|  |  |  |     username: User name | 
					
						
							|  |  |  |     password: Password | 
					
						
							|  |  |  |     hostname: Host name (lower case) | 
					
						
							|  |  |  |     port:     Port number as integer, if present | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     loc = urlparse(url) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # if the scheme (http, https ...) is not available urlparse wont work | 
					
						
							|  |  |  |     if loc.scheme == "": | 
					
						
							|  |  |  |         url = "http://" + url | 
					
						
							|  |  |  |         loc = urlparse(url) | 
					
						
							|  |  |  |     return loc | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  | def _site_active_http_connect(loc, timeout: int): | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     """Connects to the host and returns an HTTP or HTTPS connections.""" | 
					
						
							|  |  |  |     if loc.scheme == "https": | 
					
						
							|  |  |  |         ssl_context = ssl.SSLContext() | 
					
						
							|  |  |  |         return http.client.HTTPSConnection( | 
					
						
							|  |  |  |             loc.netloc, context=ssl_context, timeout=timeout) | 
					
						
							|  |  |  |     return http.client.HTTPConnection(loc.netloc, timeout=timeout) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _site_active_http_request(loc, timeout: int): | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     """Performs a HTTP request and return response in a Result object.
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  |     conn = _site_active_http_connect(loc, timeout) | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     method = 'HEAD' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     conn.request(method, loc.path) | 
					
						
							|  |  |  |     resp = conn.getresponse() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     result = Result(loc.geturl()) | 
					
						
							|  |  |  |     result.status = resp.status | 
					
						
							|  |  |  |     result.desc = resp.reason | 
					
						
							|  |  |  |     result.fill_headers(resp.getheaders()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     conn.close() | 
					
						
							|  |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-15 21:04:31 +00:00
										 |  |  | def site_is_active(url: str, timeout: int, | 
					
						
							|  |  |  |                    sites_unavailable: []) -> bool: | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     """Returns true if the current url is resolvable.
 | 
					
						
							|  |  |  |     This can be used to check that an instance is online before | 
					
						
							|  |  |  |     trying to send posts to it. | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2023-10-08 20:11:27 +00:00
										 |  |  |     if '<>' in url: | 
					
						
							|  |  |  |         url = url.replace('<>', '') | 
					
						
							| 
									
										
										
										
											2022-04-29 13:54:13 +00:00
										 |  |  |     if not url.startswith('http') and \ | 
					
						
							|  |  |  |        not url.startswith('ipfs') and \ | 
					
						
							|  |  |  |        not url.startswith('ipns'): | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |         return False | 
					
						
							|  |  |  |     if '.onion/' in url or '.i2p/' in url or \ | 
					
						
							|  |  |  |        url.endswith('.onion') or \ | 
					
						
							|  |  |  |        url.endswith('.i2p'): | 
					
						
							|  |  |  |         # skip this check for onion and i2p | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |     loc = _site_active_parse_url(url) | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     result = Result(url=url) | 
					
						
							| 
									
										
										
										
											2023-09-15 21:04:31 +00:00
										 |  |  |     url2 = url | 
					
						
							|  |  |  |     if '://' in url: | 
					
						
							|  |  |  |         url2 = url.split('://')[1] | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |         result = _site_active_http_request(loc, timeout) | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 10:33:59 +00:00
										 |  |  |         if url2 in sites_unavailable: | 
					
						
							|  |  |  |             sites_unavailable.remove(url2) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |         if 400 <= result.status < 500: | 
					
						
							| 
									
										
										
										
											2023-10-10 10:33:59 +00:00
										 |  |  |             # the site is available but denying access | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |             return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-06 11:21:25 +00:00
										 |  |  |     except BaseException as ex: | 
					
						
							|  |  |  |         print('EX: site_is_active ' + url + ' ' + str(ex)) | 
					
						
							| 
									
										
										
										
											2023-10-10 10:33:59 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if url2 not in sites_unavailable: | 
					
						
							|  |  |  |         sites_unavailable.append(url2) | 
					
						
							| 
									
										
										
										
											2021-02-10 11:24:34 +00:00
										 |  |  |     return False | 
					
						
							| 
									
										
										
										
											2022-02-05 10:49:31 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def referer_is_active(http_prefix: str, | 
					
						
							|  |  |  |                       referer_domain: str, ua_str: str, | 
					
						
							| 
									
										
										
										
											2023-09-15 21:04:31 +00:00
										 |  |  |                       calling_site_timeout: int, | 
					
						
							|  |  |  |                       sites_unavailable: []) -> bool: | 
					
						
							| 
									
										
										
										
											2022-02-05 10:49:31 +00:00
										 |  |  |     """Returns true if the given referer is an active website
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     referer_url = http_prefix + '://' + referer_domain | 
					
						
							|  |  |  |     if referer_domain + '/' in ua_str: | 
					
						
							|  |  |  |         referer_url = referer_url + ua_str.split(referer_domain)[1] | 
					
						
							|  |  |  |         ending_chars = (' ', ';', ')') | 
					
						
							|  |  |  |         for end_ch in ending_chars: | 
					
						
							|  |  |  |             if end_ch in referer_url: | 
					
						
							|  |  |  |                 referer_url = referer_url.split(end_ch)[0] | 
					
						
							| 
									
										
										
										
											2023-09-15 21:04:31 +00:00
										 |  |  |     return site_is_active(referer_url, calling_site_timeout, | 
					
						
							|  |  |  |                           sites_unavailable) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def save_unavailable_sites(base_dir: str, sites_unavailable: []) -> None: | 
					
						
							|  |  |  |     """Save a list of unavailable sites
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2024-05-12 12:35:26 +00:00
										 |  |  |     unavailable_sites_filename = data_dir(base_dir) + '/unavailable_sites.txt' | 
					
						
							| 
									
										
										
										
											2023-09-15 21:04:31 +00:00
										 |  |  |     sites_unavailable.sort() | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(unavailable_sites_filename, 'w+', | 
					
						
							|  |  |  |                   encoding='utf-8') as fp_sites: | 
					
						
							|  |  |  |             for site in sites_unavailable: | 
					
						
							| 
									
										
										
										
											2023-09-16 12:47:23 +00:00
										 |  |  |                 if site: | 
					
						
							|  |  |  |                     fp_sites.write(site + '\n') | 
					
						
							| 
									
										
										
										
											2023-09-15 21:04:31 +00:00
										 |  |  |     except OSError: | 
					
						
							|  |  |  |         print('EX: unable to save unavailable sites') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def load_unavailable_sites(base_dir: str) -> []: | 
					
						
							|  |  |  |     """load a list of unavailable sites
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2024-05-12 12:35:26 +00:00
										 |  |  |     unavailable_sites_filename = data_dir(base_dir) + '/unavailable_sites.txt' | 
					
						
							| 
									
										
										
										
											2023-09-15 21:04:31 +00:00
										 |  |  |     sites_unavailable = [] | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(unavailable_sites_filename, 'r', | 
					
						
							|  |  |  |                   encoding='utf-8') as fp_sites: | 
					
						
							|  |  |  |             sites_unavailable = fp_sites.read().split('\n') | 
					
						
							|  |  |  |     except OSError: | 
					
						
							| 
									
										
										
										
											2024-07-13 14:38:11 +00:00
										 |  |  |         print('EX: unable to read unavailable sites ' + | 
					
						
							|  |  |  |               unavailable_sites_filename) | 
					
						
							| 
									
										
										
										
											2023-09-15 21:04:31 +00:00
										 |  |  |     return sites_unavailable |