__filename__ = "siteactive.py" __author__ = "Bob Mottram" __credits__ = ["webchk"] __license__ = "AGPL3+" __version__ = "1.3.0" __maintainer__ = "Bob Mottram" __email__ = "bob@libreserver.org" __status__ = "Production" __module_group__ = "Core" import http.client from urllib.parse import urlparse import ssl class Result: """Holds result of an URL check. The redirect attribute is a Result object that the URL was redirected to. The sitemap_urls attribute will contain a list of Result object if url is a sitemap file and http_response() was run with parse set to True. """ def __init__(self, url): self.url = url self.status = 0 self.desc = '' self.headers = None self.latency = 0 self.content = '' self.redirect = None self.sitemap_urls = None def __repr__(self): if self.status == 0: return '{} ... {}'.format(self.url, self.desc) return '{} ... {} {} ({})'.format( self.url, self.status, self.desc, self.latency ) def fill_headers(self, headers): """Takes a list of tuples and converts it a dictionary.""" self.headers = {h[0]: h[1] for h in headers} def _site_active_parse_url(url): """Returns an object with properties representing scheme: URL scheme specifier netloc: Network location part path: Hierarchical path params: Parameters for last path element query: Query component fragment: Fragment identifier username: User name password: Password hostname: Host name (lower case) port: Port number as integer, if present """ loc = urlparse(url) # if the scheme (http, https ...) is not available urlparse wont work if loc.scheme == "": url = "http://" + url loc = urlparse(url) return loc def _site_a_ctive_http_connect(loc, timeout: int): """Connects to the host and returns an HTTP or HTTPS connections.""" if loc.scheme == "https": ssl_context = ssl.SSLContext() return http.client.HTTPSConnection( loc.netloc, context=ssl_context, timeout=timeout) return http.client.HTTPConnection(loc.netloc, timeout=timeout) def _site_active_http_request(loc, timeout: int): """Performs a HTTP request and return response in a Result object. """ conn = _site_a_ctive_http_connect(loc, timeout) method = 'HEAD' conn.request(method, loc.path) resp = conn.getresponse() result = Result(loc.geturl()) result.status = resp.status result.desc = resp.reason result.fill_headers(resp.getheaders()) conn.close() return result def site_is_active(url: str, timeout: int) -> bool: """Returns true if the current url is resolvable. This can be used to check that an instance is online before trying to send posts to it. """ if not url.startswith('http'): return False if '.onion/' in url or '.i2p/' in url or \ url.endswith('.onion') or \ url.endswith('.i2p'): # skip this check for onion and i2p return True loc = _site_active_parse_url(url) result = Result(url=url) try: result = _site_active_http_request(loc, timeout) if 400 <= result.status < 500: return result return True except BaseException: print('EX: site_is_active ' + str(loc)) return False def referer_is_active(http_prefix: str, referer_domain: str, ua_str: str, calling_site_timeout: int) -> bool: """Returns true if the given referer is an active website """ referer_url = http_prefix + '://' + referer_domain if referer_domain + '/' in ua_str: referer_url = referer_url + ua_str.split(referer_domain)[1] ending_chars = (' ', ';', ')') for end_ch in ending_chars: if end_ch in referer_url: referer_url = referer_url.split(end_ch)[0] return site_is_active(referer_url, calling_site_timeout)