Improve checking of site active status

This uses a defluffed version of webchk
merge-requests/17/head
Bob Mottram 2021-02-10 11:24:34 +00:00
parent 0826326653
commit b16fb0d24c
4 changed files with 126 additions and 28 deletions

View File

@ -30,6 +30,7 @@ from session import postJsonString
from session import postImage
from webfinger import webfingerHandle
from httpsig import createSignedHeader
from siteactive import siteIsActive
from utils import fileLastModified
from utils import isPublicPost
from utils import hasUsersPath
@ -38,7 +39,6 @@ from utils import getFullDomain
from utils import getFollowersList
from utils import isEvil
from utils import removeIdEnding
from utils import siteIsActive
from utils import getCachedPostFilename
from utils import getStatusNumber
from utils import createPersonDir

121
siteactive.py 100644
View File

@ -0,0 +1,121 @@
__filename__ = "siteactive.py"
__author__ = "Bob Mottram"
__credits__ = ["webchk"]
__license__ = "AGPL3+"
__version__ = "1.2.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@freedombone.net"
__status__ = "Production"
import http.client
from urllib.parse import urlparse
import ssl
class Result:
    """Holds the outcome of checking a single URL.

    Every attribute starts at a neutral value and is filled in by
    whoever performs the check:
    url: the address that was checked
    status: HTTP status code, 0 while no response has been recorded
    desc: reason phrase or description accompanying the status
    headers: dictionary of response headers, see fill_headers()
    latency, content, redirect, sitemap_urls: only initialised here;
        redirect is meant to hold the Result that the URL was
        redirected to and sitemap_urls a list of Result objects
        for the entries of a sitemap file
    """

    def __init__(self, url):
        self.url = url
        self.status = 0
        self.desc = ''
        self.headers = None
        self.latency = 0
        self.content = ''
        self.redirect = None
        self.sitemap_urls = None

    def __repr__(self):
        # a status of zero means that no response has been recorded,
        # so only the description is worth showing
        if self.status != 0:
            summary = f'{self.status} {self.desc} ({self.latency})'
            return f'{self.url} ... {summary}'
        return f'{self.url} ... {self.desc}'

    def fill_headers(self, headers):
        """Stores a list of (name, value) tuples as a dictionary.
        If a name occurs more than once then the last value wins.
        """
        collected = {}
        for entry in headers:
            collected[entry[0]] = entry[1]
        self.headers = collected
def _siteActiveParseUrl(url):
"""Returns an object with properties representing
scheme: URL scheme specifier
netloc: Network location part
path: Hierarchical path
params: Parameters for last path element
query: Query component
fragment: Fragment identifier
username: User name
password: Password
hostname: Host name (lower case)
port: Port number as integer, if present
"""
loc = urlparse(url)
# if the scheme (http, https ...) is not available urlparse wont work
if loc.scheme == "":
url = "http://" + url
loc = urlparse(url)
return loc
def _siteACtiveHttpConnect(loc, timeout: int):
"""Connects to the host and returns an HTTP or HTTPS connections."""
if loc.scheme == "https":
ssl_context = ssl.SSLContext()
return http.client.HTTPSConnection(
loc.netloc, context=ssl_context, timeout=timeout)
return http.client.HTTPConnection(loc.netloc, timeout=timeout)
def _siteActiveHttpRequest(loc, timeout: int):
    """Performs a HTTP HEAD request and returns the response
    in a Result object.
    loc: urlparse result for the url to be checked
    timeout: socket timeout in seconds
    Only the status, reason and headers of the response are recorded.
    Any exception raised while connecting or reading the response
    is passed on to the caller.
    """
    conn = _siteACtiveHttpConnect(loc, timeout)
    try:
        # HEAD is enough to find out whether the server answers and
        # avoids downloading a response body
        conn.request('HEAD', loc.path)
        resp = conn.getresponse()
        result = Result(loc.geturl())
        result.status = resp.status
        result.desc = resp.reason
        result.fill_headers(resp.getheaders())
    finally:
        # release the socket even when the request fails part way
        conn.close()
    return result
def siteIsActive(url: str, timeout=10) -> bool:
    """Returns true if the current url is resolvable.
    This can be used to check that an instance is online before
    trying to send posts to it.
    url: address to check, which must begin with http
    timeout: seconds to wait for the site to respond
    Any HTTP response counts as active, whatever its status code.
    False is returned when the url does not begin with http or when
    the request fails.
    """
    if not url.startswith('http'):
        return False
    if '.onion/' in url or '.i2p/' in url or \
       url.endswith(('.onion', '.i2p')):
        # skip this check for onion and i2p
        return True

    loc = _siteActiveParseUrl(url)
    try:
        _siteActiveHttpRequest(loc, timeout)
    except Exception:
        # Connection, TLS and protocol errors all mean that the site
        # could not be reached. KeyboardInterrupt and SystemExit are
        # deliberately not caught here.
        return False
    # A response was received. Always hand back a bool, as promised by
    # the signature, rather than the Result object for 4xx statuses.
    return True

View File

@ -38,7 +38,7 @@ from utils import getFullDomain
from utils import validNickname
from utils import firstParagraphFromString
from utils import removeIdEnding
from utils import siteIsActive
from siteactive import siteIsActive
from utils import updateRecentPostsCache
from utils import followPerson
from utils import getNicknameFromActor
@ -2067,6 +2067,7 @@ def testJsonld():
def testSiteIsActive():
    """Checks that siteIsActive accepts two known sites and rejects
    a domain which does not exist. This test calls siteIsActive on
    real addresses.
    """
    print('testSiteIsActive')
    for liveUrl in ('https://archive.org', 'https://mastodon.social'):
        assert siteIsActive(liveUrl)
    assert not siteIsActive('https://notarealwebsite.a.b.c')
@ -2818,7 +2819,8 @@ def testFunctions():
'createServerBob',
'createServerEve',
'E2EEremoveDevice',
'setOrganizationScheme'
'setOrganizationScheme',
'fill_headers'
]
excludeImports = [
'link',

View File

@ -11,9 +11,6 @@ import time
import shutil
import datetime
import json
from socket import error as SocketError
import errno
import urllib.request
import idna
from pprint import pprint
from calendar import monthrange
@ -1841,28 +1838,6 @@ def updateAnnounceCollection(recentPostsCache: {},
saveJson(postJsonObject, postFilename)
def siteIsActive(url: str) -> bool:
    """Returns true if the current url is resolvable.
    This can be used to check that an instance is online before
    trying to send posts to it.
    url: address to check, which must begin with http
    False is returned when the url does not begin with http, when it
    is malformed or when the request fails with a socket error.
    """
    if not url.startswith('http'):
        return False
    if '.onion/' in url or '.i2p/' in url or \
       url.endswith(('.onion', '.i2p')):
        # skip this check for onion and i2p
        return True
    try:
        req = urllib.request.Request(url)
        # the with statement closes the response once the check is done
        with urllib.request.urlopen(req, timeout=10):  # nosec
            return True
    except SocketError as e:
        if e.errno == errno.ECONNRESET:
            print('WARN: connection was reset during siteIsActive')
    except ValueError:
        # a malformed url, such as one without a valid scheme,
        # cannot belong to an active site
        pass
    return False
def weekDayOfMonthStart(monthNumber: int, year: int) -> int:
"""Gets the day number of the first day of the month
1=sun, 7=sat