247 lines
8.5 KiB
Ragel
247 lines
8.5 KiB
Ragel
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||
│ │
|
||
│ This program is free software; you can redistribute it and/or modify │
|
||
│ it under the terms of the GNU General Public License as published by │
|
||
│ the Free Software Foundation; version 2 of the License. │
|
||
│ │
|
||
│ This program is distributed in the hope that it will be useful, but │
|
||
│ WITHOUT ANY WARRANTY; without even the implied warranty of │
|
||
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │
|
||
│ General Public License for more details. │
|
||
│ │
|
||
│ You should have received a copy of the GNU General Public License │
|
||
│ along with this program; if not, write to the Free Software │
|
||
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │
|
||
│ 02110-1301 USA │
|
||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||
#include "libc/assert.h"
|
||
#include "libc/dce.h"
|
||
#include "libc/dns/dns.h"
|
||
#include "libc/log/log.h"
|
||
#include "libc/str/str.h"
|
||
#include "libc/sysv/errfuns.h"
|
||
#include "net/http/uri.h"
|
||
|
||
/* Blanks out the `static` keyword from here to the end of this file.
   NOTE(review): presumably this is so the tables emitted by the
   `write data` directive below are not declared static; confirm
   against the build before removing it. */
#define static

/* clang-format off */
%% machine uriparse;
%% write data;
/* clang-format on */
|
||
|
||
/**
 * Parses URI.
 *
 * This is a general URL parser. It's typically used for HTTP. Support
 * for the bonus syntax needed by SIP is provided. The whirlwind tour
 * of the URI rabbit hole is as follows:
 *
 *     /foo.html
 *     //justine.local/foo.html
 *     http://justine.local/foo.html
 *     http://bettersearchengine.local/search.cgi?q=my%20query
 *     file:///etc/passwd
 *     gs://bucket/object.txt
 *     zip:///usr/share/zoneinfo/GMT
 *     sip:127.0.0.1:5060;lr
 *     sip:+12125650666@gateway.example
 *     sip:bob%20barker:priceisright@[dead:beef::666]:5060;isup-oli=00
 *     data:video/mpeg;base64,gigabytesofhex
 *
 * This parser operates on slices rather than C strings. It performs
 * slicing and validation only. Operations like turning "%20"→" " or
 * "80"→80 and perfect hashing can be done later, if needed.
 *
 * The Uri object is owned by the caller; it has a lifecycle like the
 * following:
 *
 *     struct Uri uri;
 *     memset(&uri, 0, sizeof(uri));
 *
 *     uriparse(&uri, s1, strlen(s1));
 *     CHECK_EQ(kUriSchemeHttp, urischeme(uri.scheme, s1));
 *
 *     uriparse(&uri, s2, strlen(s2));
 *     printf("host = %`.*s\n", uri.host.n, s2 + uri.host.i);
 *
 * Inner arrays may be granted memory by the caller. The uri->𝐴.i field
 * is reset at the start of this function. No more than uri->𝐴.n items
 * can be inserted. If we need more than that, then ENOMEM is returned
 * rather than dynamically extending uri->𝐴.p. However, if uri->𝐴.n==0,
 * we assume caller doesn't care about uri->𝐴 and its data is discarded.
 *
 * Failures are reported through the errfuns helpers: einval() when the
 * input does not match the grammar, enomem() when a caller-provided
 * array is full, and eoverflow() when the host slice is longer than
 * DNS_NAME_MAX or the port slice is longer than 6 bytes.
 *
 * @param uri is owned by caller
 * @param p is caller-owned uri string; won't copy/alias/mutate
 * @param size is the byte length of p; p may be NULL only if size is 0
 * @return 0 on success, or -1 w/ errno
 * @see RFC2396: Uniform Resource Identifiers (URI): Generic Syntax
 * @see RFC3261: SIP: Session Initiation Protocol
 */
int uriparse(struct Uri *uri, const char *p, size_t size) {
  unsigned zero, cs;
  struct UriKeyval kv;
  const char *pe, *eof, *buf, *mark;

  assert(p || !size);
  assert(size <= 0x7ffff000);

/* ABSENT is the empty slice. SLICE covers the bytes from `mark` up to
   (but not including) `p`, stored as an offset from `buf` plus a
   length. Both read this function's locals, so they are undefined
   again once the function body ends. */
#define ABSENT ((struct UriSlice){zero, zero})
#define SLICE ((struct UriSlice){mark - buf, p - mark})

  /* NOTE(review): VEIL presumably hides the constant from the
     optimizer; confirm against its definition. */
  cs = zero = VEIL("r", 0u);
  eof = pe = (mark = buf = p) + size;

  /* Forget anything left over from a previous parse of this object. */
  uri->scheme = ABSENT;
  uri->opaque = ABSENT;
  uri->userinfo = ABSENT;
  uri->host = ABSENT;
  uri->port = ABSENT;
  uri->fragment = ABSENT;
  uri->segs.i = zero;
  uri->paramsegs.i = zero;
  uri->params.i = zero;
  uri->queries.i = zero;

  /* clang-format off */

  %%{
    # Actions: Mark remembers where a token starts, and the Set actions
    # store the bytes between that mark and the current position.
    action Mark { mark = p; }
    action SetScheme { uri->scheme = SLICE; }
    action SetFragment { uri->fragment = SLICE; }
    action SetUserinfo { uri->userinfo = SLICE; }
    action SetHost { uri->host = SLICE; }
    action SetPort { uri->port = SLICE; }

    action SetKey {
      /* A key starts out with an absent value; SetVal fills it in
         when an "=value" part follows. */
      kv.k = SLICE;
      kv.v = ABSENT;
    }

    action SetVal {
      kv.v = SLICE;
    }

    action RestartSegs {
      uri->segs.i = zero;
      uri->paramsegs.i = zero;
    }

    action AppendParam {
      /* A zero capacity means the caller does not want params. */
      if (uri->params.n) {
        if (uri->params.i < uri->params.n) {
          uri->params.p[uri->params.i++] = kv;
        } else {
          return enomem();
        }
      }
    }

    action AppendQuery {
      /* A zero capacity means the caller does not want queries. */
      if (uri->queries.n) {
        if (uri->queries.i < uri->queries.n) {
          uri->queries.p[uri->queries.i++] = kv;
        } else {
          return enomem();
        }
      }
    }

    action AppendSegment {
      /* Empty segments are skipped, as are all segments when the
         caller gave the array no capacity. */
      if (p > mark && uri->segs.n) {
        if (uri->segs.i < uri->segs.n) {
          uri->segs.p[uri->segs.i++] = SLICE;
        } else {
          return enomem();
        }
      }
    }

    action HandleOpaquePart {
      switch (urischeme(uri->scheme, buf)) {
        case kUriSchemeSip:
        case kUriSchemeSips:
          /* Step back one byte before jumping to the sip machine.
             NOTE(review): presumably so that machine re-reads the
             current character; confirm against the Ragel guide. */
          --p;
          fgoto sip;
        default:
          /* Any other scheme: the rest of the input becomes the opaque
             slice, provided uricspn() accepts every remaining byte. */
          if (uricspn(p, pe - p) == pe - p) {
            uri->opaque = (struct UriSlice){p - buf, pe - p};
            return zero;
          } else {
            return einval();
          }
      }
    }

    # Character classes.
    mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")";
    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ",";
    unreserved = alnum | mark;
    ipv4c = digit | ".";
    ipv6c = xdigit | "." | ":";
    hostc = alnum | "-" | ".";
    telc = digit | "+" | "-";
    schemec = alnum | "+" | "-" | ".";
    userinfoc = unreserved | "&" | "=" | "+" | "$" | "," | "?" | ":";
    paramc = unreserved | "[" | "]" | ":" | "&" | "+" | "$";
    queryc = unreserved | "[" | "]" | "/" | "?" | ":" | "+" | "$";
    pathc = unreserved | ":" | "@" | "&" | "=" | "+" | "$" | ",";
    relc = unreserved | ";" | "@" | "&" | "=" | "+" | "$" | ",";
    uric = reserved | unreserved;

    # Each class extended with percent escapes.
    escaped = "%" xdigit xdigit;
    pathchar = escaped | pathc;
    urichar = escaped | uric;
    relchar = escaped | relc;
    userinfochar = escaped | userinfoc;
    paramchar = escaped | paramc;
    querychar = escaped | queryc;

    # Semicolon-introduced parameters: key with an optional value.
    paramkey = paramchar+ >Mark %SetKey;
    paramval = paramchar+ >Mark %SetVal;
    param = ";" paramkey ( "=" paramval )? %AppendParam;

    # Query string: ampersand-separated keys with optional values.
    querykey = querychar+ >Mark %SetKey;
    queryval = querychar+ >Mark %SetVal;
    query = querykey ( "=" queryval )? %AppendQuery;
    queries = "?" query ( "&" query )*;

    # URI structure.
    scheme = ( alpha @Mark schemec* ) ":" @SetScheme;
    userinfo = userinfochar+ >Mark "@" @SetUserinfo;
    host6 = "[" ( ipv6c+ >Mark %SetHost ) "]";
    host = host6 | ( ( ipv4c | hostc | telc )+ >Mark %SetHost );
    port = digit+ >Mark %SetPort;
    hostport = host ( ":" port )?;
    authority = userinfo? hostport;
    segment = pathchar+ %AppendSegment param*;
    rel_segment = relchar+ >Mark %AppendSegment;
    path_segments = segment ( "/" @Mark segment )*;
    abs_path = "/" @Mark path_segments;
    net_path = "//" authority abs_path? >RestartSegs;
    hier_part = ( net_path | abs_path ) queries?;
    rel_path = rel_segment abs_path?;
    opaque_part = ( urichar -- "/" ) @HandleOpaquePart;
    fragment = "#" urichar* >Mark %SetFragment;
    relativeURI = ( net_path | abs_path | rel_path ) queries?;
    absoluteURI = scheme ( hier_part | opaque_part );

    # Entry points: sip is reached only through fgoto in HandleOpaquePart.
    sip := authority >Mark param*;
    uri := ( relativeURI | absoluteURI )? fragment?;
  }%%

  %% write init;
  cs = uriparse_en_uri;
  %% write exec;

  /* clang-format on */

  /* Ending outside a final state means the input did not match. */
  if (cs >= uriparse_first_final) {
    if (uri->host.n <= DNS_NAME_MAX && uri->port.n <= 6) {
      return zero;
    } else {
      return eoverflow();
    }
  } else {
    return einval();
  }
}

#undef SLICE
#undef ABSENT
|