/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ │vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ ╞══════════════════════════════════════════════════════════════════════════════╡ │ Copyright 2020 Justine Alexandra Roberts Tunney │ │ │ │ This program is free software; you can redistribute it and/or modify │ │ it under the terms of the GNU General Public License as published by │ │ the Free Software Foundation; version 2 of the License. │ │ │ │ This program is distributed in the hope that it will be useful, but │ │ WITHOUT ANY WARRANTY; without even the implied warranty of │ │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │ │ General Public License for more details. │ │ │ │ You should have received a copy of the GNU General Public License │ │ along with this program; if not, write to the Free Software │ │ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │ │ 02110-1301 USA │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/assert.h" #include "libc/dce.h" #include "libc/dns/dns.h" #include "libc/log/log.h" #include "libc/str/str.h" #include "libc/sysv/errfuns.h" #include "net/http/uri.h" #define static /* clang-format off */ %% machine uriparse; %% write data; /* clang-format on */ /** * Parses URI. * * This is a general URL parser. It's typically used for HTTP. Support * for the bonus syntax needed by SIP is provided. The whirlwhind tour * of the URI rabbit hole is as follows: * * /foo.html * //justine.local/foo.html * http://justine.local/foo.html * http://bettersearchengine.local/search.cgi?q=my%20query * file:///etc/passwd * gs://bucket/object.txt * zip:///usr/share/zoneinfo/GMT * sip:127.0.0.1:5060;lr * sip:+12125650666@gateway.example * sip:bob%20barker:priceisright@[dead:beef::666]:5060;isup-oli=00 * data:video/mpeg;base64,gigabytesofhex * * This parser operates on slices rather than C strings. It performs * slicing and validation only. Operations like turning "%20"→" " or * "80"→80 and perfect hashing can be done later, if needed. * * The Uri object is owned by the caller; it has a lifecycle like the * following: * * struct Uri uri; * memset(&uri, 0, sizeof(uri)); * * uriparse(&uri, s1, strlen(s1)); * CHECK_EQ(kUriSchemeHttp, urischeme(uri->scheme, s1)); * * uriparse(&uri, s2, strlen(s2)); * printf("host = %`.*s\n", uri->host.n, s2 + uri->host.i); * * Inner arrays may be granted memory by the caller. The uri->𝐴.i field * is cleared at the mark of this function. No more than uri->𝐴.n items * can be inserted. If we need more than that, then ENOMEM is returned * rather than dynamically extending uri->𝐴.p. However, if uri->𝐴.n==0, * we assume caller doesn't care about uri->𝐴 and its data is discarded. * * @param uri is owned by caller * @param p is caller-owned uri string; won't copy/alias/mutate * @return 0 on success, or -1 w/ errno * @see RFC2396: Uniform Resource Identifiers (URI): Generic Syntax * @see RFC3261: SIP: Session Initiation Protocol */ int uriparse(struct Uri *uri, const char *p, size_t size) { unsigned zero, cs; struct UriKeyval kv; const char *pe, *eof, *buf, *mark; assert(p || !size); assert(size <= 0x7ffff000); #define ABSENT ((struct UriSlice){zero, zero}) #define SLICE ((struct UriSlice){mark - buf, p - mark}) cs = zero = VEIL("r", 0u); eof = pe = (mark = buf = p) + size; uri->scheme = ABSENT; uri->opaque = ABSENT; uri->userinfo = ABSENT; uri->host = ABSENT; uri->port = ABSENT; uri->fragment = ABSENT; uri->segs.i = zero; uri->paramsegs.i = zero; uri->params.i = zero; uri->queries.i = zero; /* clang-format off */ %%{ action Mark { mark = p; } action SetScheme { uri->scheme = SLICE; } action SetFragment { uri->fragment = SLICE; } action SetUserinfo { uri->userinfo = SLICE; } action SetHost { uri->host = SLICE; } action SetPort { uri->port = SLICE; } action SetKey { kv.k = SLICE; kv.v = (struct UriSlice){zero, zero}; } action SetVal { kv.v = SLICE; } action RestartSegs { uri->segs.i = zero; uri->paramsegs.i = zero; } action AppendParam { if (uri->params.n) { if (uri->params.i < uri->params.n) { uri->params.p[uri->params.i++] = kv; } else { return enomem(); } } } action AppendQuery { if (uri->queries.n) { if (uri->queries.i < uri->queries.n) { uri->queries.p[uri->queries.i++] = kv; } else { return enomem(); } } } action AppendSegment { if (p > mark && uri->segs.n) { if (uri->segs.i < uri->segs.n) { uri->segs.p[uri->segs.i++] = SLICE; } else { return enomem(); } } } action HandleOpaquePart { switch (urischeme(uri->scheme, buf)) { case kUriSchemeSip: case kUriSchemeSips: --p; fgoto sip; default: if (uricspn(p, pe - p) == pe - p) { uri->opaque = (struct UriSlice){p - buf, pe - p}; return zero; } else { return einval(); } } } mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"; reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","; unreserved = alnum | mark; ipv4c = digit | "."; ipv6c = xdigit | "." | ":"; hostc = alnum | "-" | "."; telc = digit | "+" | "-"; schemec = alnum | "+" | "-" | "."; userinfoc = unreserved | "&" | "=" | "+" | "$" | "," | "?" | ":"; paramc = unreserved | "[" | "]" | ":" | "&" | "+" | "$"; queryc = unreserved | "[" | "]" | "/" | "?" | ":" | "+" | "$"; pathc = unreserved | ":" | "@" | "&" | "=" | "+" | "$" | ","; relc = unreserved | ";" | "@" | "&" | "=" | "+" | "$" | ","; uric = reserved | unreserved; escaped = "%" xdigit xdigit; pathchar = escaped | pathc; urichar = escaped | uric; relchar = escaped | relc; userinfochar = escaped | userinfoc; paramchar = escaped | paramc; querychar = escaped | queryc; paramkey = paramchar+ >Mark %SetKey; paramval = paramchar+ >Mark %SetVal; param = ";" paramkey ( "=" paramval )? %AppendParam; querykey = querychar+ >Mark %SetKey; queryval = querychar+ >Mark %SetVal; query = querykey ( "=" queryval )? %AppendQuery; queries = "?" query ( "&" query )*; scheme = ( alpha @Mark schemec* ) ":" @SetScheme; userinfo = userinfochar+ >Mark "@" @SetUserinfo; host6 = "[" ( ipv6c+ >Mark %SetHost ) "]"; host = host6 | ( ( ipv4c | hostc | telc )+ >Mark %SetHost ); port = digit+ >Mark %SetPort; hostport = host ( ":" port )?; authority = userinfo? hostport; segment = pathchar+ %AppendSegment param*; rel_segment = relchar+ >Mark %AppendSegment; path_segments = segment ( "/" @Mark segment )*; abs_path = "/" @Mark path_segments; net_path = "//" authority abs_path? >RestartSegs; hier_part = ( net_path | abs_path ) queries?; rel_path = rel_segment abs_path?; opaque_part = ( urichar -- "/" ) @HandleOpaquePart; fragment = "#" urichar* >Mark %SetFragment; relativeURI = ( net_path | abs_path | rel_path ) queries?; absoluteURI = scheme ( hier_part | opaque_part ); sip := authority >Mark param*; uri := ( relativeURI | absoluteURI )? fragment?; }%% %% write init; cs = uriparse_en_uri; %% write exec; /* clang-format on */ if (cs >= uriparse_first_final) { if (uri->host.n <= DNS_NAME_MAX && uri->port.n <= 6) { return zero; } else { return eoverflow(); } } else { return einval(); } }