2021-02-10 11:24:34 +00:00
|
|
|
# Module metadata for the site availability checker.
# Originally derived from the webchk project (see __credits__).
__filename__ = "siteactive.py"
__author__ = "Bob Mottram"
__credits__ = ["webchk"]
__license__ = "AGPL3+"
__version__ = "1.5.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
|
2021-02-10 11:24:34 +00:00
|
|
|
|
|
|
|
import http.client
|
|
|
|
import ssl
|
2024-05-12 12:35:26 +00:00
|
|
|
from urllib.parse import urlparse
|
|
|
|
from utils import data_dir
|
2021-02-10 11:24:34 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Result:
    """Outcome of checking a single URL.

    The redirect attribute is a Result object that the URL was
    redirected to.

    The sitemap_urls attribute will contain a list of Result object
    if url is a sitemap file and http_response() was run with parse
    set to True.
    """

    def __init__(self, url):
        # the address which was checked
        self.url = url
        # http status code, zero if no response was obtained
        self.status = 0
        # human readable status description
        self.desc = ''
        # response headers as a dict, or None
        self.headers = None
        self.latency = 0
        self.content = ''
        self.redirect = None
        self.sitemap_urls = None

    def __repr__(self):
        if not self.status:
            return '{} ... {}'.format(self.url, self.desc)
        return '{} ... {} {} ({})'.format(self.url, self.status,
                                          self.desc, self.latency)

    def fill_headers(self, headers):
        """Takes a list of tuples and converts it a dictionary."""
        converted = {}
        for hdr in headers:
            converted[hdr[0]] = hdr[1]
        self.headers = converted
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def _site_active_parse_url(url):
|
2021-02-10 11:24:34 +00:00
|
|
|
"""Returns an object with properties representing
|
|
|
|
|
|
|
|
scheme: URL scheme specifier
|
|
|
|
netloc: Network location part
|
|
|
|
path: Hierarchical path
|
|
|
|
params: Parameters for last path element
|
|
|
|
query: Query component
|
|
|
|
fragment: Fragment identifier
|
|
|
|
username: User name
|
|
|
|
password: Password
|
|
|
|
hostname: Host name (lower case)
|
|
|
|
port: Port number as integer, if present
|
|
|
|
"""
|
|
|
|
loc = urlparse(url)
|
|
|
|
|
|
|
|
# if the scheme (http, https ...) is not available urlparse wont work
|
|
|
|
if loc.scheme == "":
|
|
|
|
url = "http://" + url
|
|
|
|
loc = urlparse(url)
|
|
|
|
return loc
|
|
|
|
|
|
|
|
|
2022-04-29 13:54:13 +00:00
|
|
|
def _site_active_http_connect(loc, timeout: int):
|
2021-02-10 11:24:34 +00:00
|
|
|
"""Connects to the host and returns an HTTP or HTTPS connections."""
|
|
|
|
if loc.scheme == "https":
|
|
|
|
ssl_context = ssl.SSLContext()
|
|
|
|
return http.client.HTTPSConnection(
|
|
|
|
loc.netloc, context=ssl_context, timeout=timeout)
|
|
|
|
return http.client.HTTPConnection(loc.netloc, timeout=timeout)
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def _site_active_http_request(loc, timeout: int):
    """Performs a HTTP HEAD request and returns the response in a
    Result object.

    loc is a parsed url as returned by _site_active_parse_url.
    Raises whatever http.client raises on connection failure; the
    connection is always closed, even on error.
    """
    conn = _site_active_http_connect(loc, timeout)
    try:
        # an empty path (e.g. for "http://example.com") would produce
        # an invalid request line, so fall back to the root path
        conn.request('HEAD', loc.path or '/')
        resp = conn.getresponse()

        result = Result(loc.geturl())
        result.status = resp.status
        result.desc = resp.reason
        result.fill_headers(resp.getheaders())
    finally:
        # ensure the socket is released even if the request raised
        conn.close()
    return result
|
|
|
|
|
|
|
|
|
2023-09-15 21:04:31 +00:00
|
|
|
def site_is_active(url: str, timeout: int,
                   sites_unavailable: []) -> bool:
    """Returns true if the current url is resolvable.
    This can be used to check that an instance is online before
    trying to send posts to it.

    As a side effect the sites_unavailable list is updated: a
    reachable url is removed from it and an unreachable one is
    appended (stored without its scheme prefix).
    NOTE(review): on a 4xx response this returns the Result object
    rather than a bool; Result is truthy, so callers testing truth
    still behave as if True was returned.
    """
    # strip any markup delimiters from the url
    if '<>' in url:
        url = url.replace('<>', '')
    # only http(s), ipfs and ipns urls are checked
    if not url.startswith('http') and \
       not url.startswith('ipfs') and \
       not url.startswith('ipns'):
        return False
    if '.onion/' in url or '.i2p/' in url or \
       url.endswith('.onion') or \
       url.endswith('.i2p'):
        # skip this check for onion and i2p
        return True

    loc = _site_active_parse_url(url)
    result = Result(url=url)
    # key used within the unavailable sites list: the url without
    # its scheme prefix
    url2 = url
    if '://' in url:
        url2 = url.split('://')[1]

    try:
        result = _site_active_http_request(loc, timeout)

        # a response was received, so the site is no longer unavailable
        if url2 in sites_unavailable:
            sites_unavailable.remove(url2)

        if 400 <= result.status < 500:
            # the site is available but denying access
            return result

        return True
    except BaseException as ex:
        # NOTE(review): BaseException also swallows KeyboardInterrupt
        # and SystemExit - confirm that is intended
        print('EX: site_is_active ' + url + ' ' + str(ex))

        # record the site as unavailable
        if url2 not in sites_unavailable:
            sites_unavailable.append(url2)
    return False
|
2022-02-05 10:49:31 +00:00
|
|
|
|
|
|
|
|
|
|
|
def referer_is_active(http_prefix: str,
                      referer_domain: str, ua_str: str,
                      calling_site_timeout: int,
                      sites_unavailable: []) -> bool:
    """Returns true if the given referer is an active website
    """
    referer_url = http_prefix + '://' + referer_domain
    if referer_domain + '/' in ua_str:
        # the user agent string contains a path after the domain,
        # so append it to the url
        referer_url += ua_str.split(referer_domain)[1]
        # truncate at the first terminating character, if any
        for terminator in (' ', ';', ')'):
            referer_url = referer_url.partition(terminator)[0]
    return site_is_active(referer_url, calling_site_timeout,
                          sites_unavailable)
|
|
|
|
|
|
|
|
|
|
|
|
def save_unavailable_sites(base_dir: str, sites_unavailable: []) -> None:
    """Save a list of unavailable sites
    Note: sorts the given list in place as a side effect.
    """
    sites_filename = data_dir(base_dir) + '/unavailable_sites.txt'
    sites_unavailable.sort()
    # one site per line, skipping any empty entries
    lines = [site + '\n' for site in sites_unavailable if site]
    try:
        with open(sites_filename, 'w+', encoding='utf-8') as fp_sites:
            fp_sites.writelines(lines)
    except OSError:
        print('EX: unable to save unavailable sites')
|
|
|
|
|
|
|
|
|
|
|
|
def load_unavailable_sites(base_dir: str) -> []:
    """load a list of unavailable sites
    Returns an empty list if the file does not exist or is unreadable.
    """
    unavailable_sites_filename = data_dir(base_dir) + '/unavailable_sites.txt'
    sites_unavailable = []
    try:
        with open(unavailable_sites_filename, 'r',
                  encoding='utf-8') as fp_sites:
            # split('\n') would leave a spurious trailing empty string
            # in the list, so use splitlines and drop blank entries
            sites_unavailable = \
                [site for site in fp_sites.read().splitlines() if site]
    except OSError:
        print('EX: unable to read unavailable sites ' +
              unavailable_sites_filename)
    return sites_unavailable
|