epicyon/siteactive.py

140 lines
4.1 KiB
Python

__filename__ = "siteactive.py"
__author__ = "Bob Mottram"
__credits__ = ["webchk"]
__license__ = "AGPL3+"
__version__ = "1.3.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
import http.client
from urllib.parse import urlparse
import ssl
class Result:
    """Outcome of checking a single URL.

    Attributes are plain fields filled in by whoever performs the check:
    url is the address that was checked, status and desc are the numeric
    HTTP status and its reason text, and headers is a dictionary of
    response headers (see fill_headers).

    redirect is intended to hold the Result for the URL that this one was
    redirected to, and sitemap_urls a list of Result objects for sitemap
    entries. Both start as None.
    """

    def __init__(self, url):
        self.url = url
        # a status of zero means that no HTTP status has been recorded
        self.status = 0
        self.desc = ''
        self.headers = None
        self.latency = 0
        self.content = ''
        self.redirect = None
        self.sitemap_urls = None

    def __repr__(self):
        # without a recorded status only the url and description are shown
        if self.status != 0:
            return f'{self.url} ... {self.status} {self.desc} ({self.latency})'
        return f'{self.url} ... {self.desc}'

    def fill_headers(self, headers):
        """Stores a sequence of (name, value) pairs as the headers
        dictionary. If a name occurs more than once the last value wins.
        """
        converted = {}
        for entry in headers:
            converted[entry[0]] = entry[1]
        self.headers = converted
def _site_active_parse_url(url):
"""Returns an object with properties representing
scheme: URL scheme specifier
netloc: Network location part
path: Hierarchical path
params: Parameters for last path element
query: Query component
fragment: Fragment identifier
username: User name
password: Password
hostname: Host name (lower case)
port: Port number as integer, if present
"""
loc = urlparse(url)
# if the scheme (http, https ...) is not available urlparse wont work
if loc.scheme == "":
url = "http://" + url
loc = urlparse(url)
return loc
def _site_active_http_connect(loc, timeout: int):
    """Creates an HTTP or HTTPS connection object for the given location.

    loc is a parsed url, as returned by urlparse. Its scheme selects the
    connection type and its netloc supplies the host (and optional port).
    timeout is the socket timeout in seconds.

    Returns an http.client.HTTPSConnection when the scheme is "https",
    otherwise an http.client.HTTPConnection. No network traffic happens
    here; the socket is only opened when a request is made.
    """
    if loc.scheme != "https":
        return http.client.HTTPConnection(loc.netloc, timeout=timeout)

    # Calling ssl.SSLContext() without a protocol is deprecated since
    # Python 3.10. Name the client protocol explicitly and then switch
    # verification off, which reproduces what the bare constructor did.
    # check_hostname has to be cleared before verify_mode can be CERT_NONE.
    # NOTE(review): certificates are not verified here. If that is not
    # intentional then ssl.create_default_context() should be used.
    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    return http.client.HTTPSConnection(
        loc.netloc, context=ssl_context, timeout=timeout)
def _site_active_http_request(loc, timeout: int):
    """Performs a HEAD request and returns the response as a Result.

    loc is a parsed url, as returned by urlparse, and timeout is the
    socket timeout in seconds.

    The returned Result has its status, desc (reason phrase) and headers
    filled in from the response. Any exception raised while connecting
    or reading the response is passed on to the caller, and the
    connection is closed whether or not the request succeeded.
    """
    conn = _site_active_http_connect(loc, timeout)
    try:
        # HEAD is enough to find out whether the server answers,
        # without downloading a response body
        conn.request('HEAD', loc.path)
        resp = conn.getresponse()
        result = Result(loc.geturl())
        result.status = resp.status
        result.desc = resp.reason
        result.fill_headers(resp.getheaders())
    finally:
        # don't leak the socket if the request times out or fails
        conn.close()
    return result
def site_is_active(url: str, timeout: int) -> bool:
    """Returns true if the current url is resolvable.
    This can be used to check that an instance is online before
    trying to send posts to it.

    url must begin with http, ipfs or ipns, otherwise False is returned
    without any network access. Onion and i2p addresses are assumed to
    be active and are never contacted. timeout is the socket timeout
    in seconds for the HEAD request.

    Any HTTP response counts as active, whatever its status code,
    because it shows that the server answered. False is returned when
    the url can't be parsed or when the request fails or times out.
    """
    if not url.startswith(('http', 'ipfs', 'ipns')):
        return False
    if '.onion/' in url or '.i2p/' in url or \
       url.endswith(('.onion', '.i2p')):
        # skip this check for onion and i2p
        return True

    try:
        loc = _site_active_parse_url(url)
        _site_active_http_request(loc, timeout)
    except Exception as ex:
        # Exception rather than BaseException, so that KeyboardInterrupt
        # and SystemExit are not swallowed by this check
        print('EX: site_is_active ' + url + ' ' + str(ex))
        return False
    # a response was received (4xx statuses included), so the site is up
    return True
def referer_is_active(http_prefix: str,
                      referer_domain: str, ua_str: str,
                      calling_site_timeout: int) -> bool:
    """Returns true if the given referer is an active website

    The url to be checked is http_prefix://referer_domain. If ua_str
    contains the domain followed by a slash then the text which follows
    the domain within ua_str is appended as a path, cut off at the first
    space, semicolon or closing bracket. The result is passed to
    site_is_active together with calling_site_timeout.
    """
    target_url = http_prefix + '://' + referer_domain
    if (referer_domain + '/') in ua_str:
        # whatever comes directly after the first mention of the domain
        target_url += ua_str.split(referer_domain)[1]
        # splitting on a character which is absent leaves the url unchanged
        for terminator in ' ;)':
            target_url = target_url.split(terminator)[0]
    return site_is_active(target_url, calling_site_timeout)