#include "i.h"
// function forward declarations
// (file-local helpers defined further down, plus the 'U' format routine)
static void split(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2);
static ParsedUrl* urlfromparts(ParsedUrl* p);
static Rune* canonize(Rune* s1, int n1, Rune* s2);
int Ufmt(Fmt *f);
// globals
// Scheme names, indexed by scheme number: makeurl stores the index of the
// matching entry as the scheme, and urlfromparts/validurl look the name
// up again with schemes[scheme].
// NOTE(review): the order presumably mirrors the NOSCHEME, HTTP, ... UNKNOWN
// constants declared elsewhere -- keep the two in step.
Rune* schemes[NSCHEMES] = {
L"",
L"http",
L"https",
L"ftp",
L"file",
L"gopher",
L"mailto",
L"news",
L"nntp",
L"telnet",
L"wais",
L"prospero",
L"unknown"
};
// Module initialization: install Ufmt as the handler for the 'U'
// format verb, so a ParsedUrl* can be passed to the print routines.
void
urlinit(void)
{
fmtinstall('U', Ufmt);
}
// Parse the string surl into its URL components and return them as a
// newly allocated ParsedUrl (built by urlfromparts).
// If makeabs is true and surl has neither a scheme nor a leading "//",
// parse it as though "http://" had been written in front of it.
//
// Layout handled:  scheme:[//[user[:passwd]@]host[:port]][/]path[?query][#frag]
//  - mailto: everything after the scheme is kept as the path, unsplit.
//  - file:   the whole network location is the host (defaulting to
//            "localhost"), and no query/fragment is split off.
ParsedUrl*
makeurl(Rune* surl, int makeabs)
{
	ParsedUrl u;
	Rune* sch = nil;
	Rune* rest = nil;
	Rune* netloc = nil;
	Rune* userinfo = nil;
	Rune* hostport = nil;
	int schlen = 0;
	int restlen = 0;
	int netloclen = 0;
	int userinfolen = 0;
	int hostportlen = 0;
	int surllen;
	int scheme;
	int hasnetloc;
	int i;

	memset(&u, 0, sizeof(ParsedUrl));
	scheme = NOSCHEME;
	surllen = Strlen(surl);

	// A scheme is present only if something follows the first ':' and
	// Strnclass finds no offending character in the text before it.
	split(surl, surllen, L":", &sch, &schlen, &rest, &restlen);
	if(restlen == 0 || Strnclass(sch, L"^-a-zA-Z0-9.+", schlen) != nil) {
		// no usable scheme: the whole input is the remainder
		rest = surl;
		restlen = surllen;
	}
	else {
		scheme = UNKNOWN;
		for(i = 0; i < NSCHEMES; i++) {
			if(Strncmpci(sch, schlen, schemes[i]) == 0) {
				scheme = i;
				break;
			}
		}
	}

	if(scheme == MAILTO) {
		u.path = rest;
		u.npath = restlen;
		u.scheme = scheme;
		return urlfromparts(&u);
	}

	// Decide whether a network location (host part) follows.
	hasnetloc = 0;
	if(restlen >= 2 && rest[0] == '/' && rest[1] == '/') {
		hasnetloc = 1;
		rest += 2;
		restlen -= 2;
	}
	else if(makeabs && scheme == NOSCHEME) {
		hasnetloc = 1;
		scheme = HTTP;
	}

	if(!hasnetloc) {
		// Path only; a leading '/' is recorded separately in pstart.
		u.path = rest;
		u.npath = restlen;
		if(restlen > 0 && rest[0] == '/') {
			u.pstart = L"/";
			u.npstart = 1;
			u.path = rest+1;
			u.npath = restlen-1;
		}
	}
	else {
		// Everything up to the first '/' is the network location.
		splitl(rest, restlen, L"/", &netloc, &netloclen, &u.path, &u.npath);
		if(u.npath != 0) {
			// drop the '/' itself; it is represented by pstart
			u.path++;
			u.npath--;
		}
		u.pstart = L"/";
		u.npstart = 1;
		if(scheme == FILE) {
			u.host = netloc;
			u.nhost = netloclen;
		}
		else {
			split(netloc, netloclen, L"@", &userinfo, &userinfolen, &hostport, &hostportlen);
			if(hostportlen != 0)
				split(userinfo, userinfolen, L":", &u.user, &u.nuser, &u.passwd, &u.npasswd);
			else {
				// nothing after '@' (or no '@'): first piece is host[:port]
				hostport = userinfo;
				hostportlen = userinfolen;
			}
			split(hostport, hostportlen, L":", &u.host, &u.nhost, &u.port, &u.nport);
		}
	}

	if(scheme != FILE) {
		// Fragment first, then query, so '?' inside a fragment is not a query.
		split(u.path, u.npath, L"#", &u.path, &u.npath, &u.frag, &u.nfrag);
		split(u.path, u.npath, L"?", &u.path, &u.npath, &u.query, &u.nquery);
	}
	else if(u.nhost == 0) {
		u.host = L"localhost";
		u.nhost = 9;
	}

	u.scheme = scheme;
	return urlfromparts(&u);
}
// Format routine for the 'U' verb (installed by urlinit).
// Takes a ParsedUrl* from the argument list and prints its url
// string, or "<null>" when the pointer is nil.
int
Ufmt(Fmt *f)
{
	ParsedUrl* arg;

	arg = va_arg(f->args, ParsedUrl*);
	return fmtrunestrcpy(f, (arg == nil) ? L"<null>" : arg->url);
}
// Return a URL that is u made absolute relative to the base b.
// Neither u nor b is modified.
//
// The result is newly allocated by urlfromparts, with one exception:
// when u already has a scheme other than http, u itself is returned.
//
// Resolution rules, as implemented below:
//  - u completely empty: the result is a copy of b, scheme included.
//  - u has no host: user, passwd, host and port come from b.
//  - u additionally has no leading '/': its path is merged with the
//    directory part of b's path (everything before b's last '/') and
//    canonized; if u has no path at all, b's path is used, and b's
//    query too if u has none.
ParsedUrl*
makeabsoluteurl(ParsedUrl* u, ParsedUrl* b)
{
	int n;
	int scheme;
	Rune* p;
	Rune* rel;
	Rune* path;
	ParsedUrl* ans;
	ParsedUrl t;

	scheme = u->scheme;
	// RFC 1808 says a URL that already carries a scheme is absolute and
	// should be returned as is, but relative URLs written with an
	// explicit "http:" occur on extant web pages, so only non-http
	// schemes take the early exit.
	if(scheme != NOSCHEME && scheme != HTTP)
		return u;

	path = nil;
	memset(&t, 0, sizeof(ParsedUrl));
	if(u->nhost == 0 && u->npath == 0 && u->npstart == 0 && u->nquery == 0 && u->nfrag == 0) {
		// Empty reference: the answer is the base itself.  Take the
		// base's scheme too; otherwise the copy would be emitted
		// without any scheme.
		memmove(&t, b, sizeof(ParsedUrl));
		scheme = b->scheme;
	}
	else {
		memmove(&t, u, sizeof(ParsedUrl));
		if(scheme == NOSCHEME)
			scheme = b->scheme;
		if(t.nhost == 0) {
			t.user = b->user;
			t.nuser = b->nuser;
			t.passwd = b->passwd;
			t.npasswd = b->npasswd;
			t.host = b->host;
			t.nhost = b->nhost;
			t.port = b->port;
			t.nport = b->nport;
			if(t.npstart == 0) {
				t.pstart = L"/";
				t.npstart = 1;
				if(t.npath == 0) {
					t.path = b->path;
					t.npath = b->npath;
					if(t.nquery == 0) {
						t.query = b->query;
						t.nquery = b->nquery;
					}
				}
				else {
					// n = length of b's path up to (not including) its last '/'
					p = Strnrclass(b->path, L"/", b->npath);
					n = (p == nil) ? 0 : p - b->path;

					// canonize measures its last argument with Strlen, but
					// u->path points into u->url and is followed there by
					// the query and fragment.  Hand it a terminated copy of
					// just the path so those parts are not folded into the
					// path (and then emitted a second time).
					rel = emalloc((u->npath+1)*sizeof(Rune));
					memmove(rel, u->path, u->npath*sizeof(Rune));
					rel[u->npath] = 0;
					path = canonize(b->path, n, rel);
					free(rel);

					t.path = path;
					t.npath = Strlen(path);
				}
			}
		}
	}
	t.scheme = scheme;
	ans = urlfromparts(&t);
	// urlfromparts copied every part into ans; the canonized path
	// (nil if none was made) is no longer needed.
	free(path);
	return ans;
}
// Return a newly allocated copy of url whose query part is replaced
// by the string q (measured with Strlen).  url itself is not modified.
ParsedUrl*
makequeryurl(ParsedUrl* url, Rune* q)
{
	ParsedUrl withq;

	withq = *url;
	withq.nquery = Strlen(q);
	withq.query = q;
	return urlfromparts(&withq);
}
// p has correct values for all its fields, except they might
// point into different strings.  Make a new ParsedUrl with
// one contiguous, NUL-terminated string (ans->url) containing all the
// parts, and with every part pointer aimed into that string.
//
// Text produced:
//   [scheme ':'] ['//' [user [':' passwd] '@'] host [':' port]]
//   [pstart] path ['?' query] ['#' frag]
// The "//..." group is emitted only when the host is non-empty.
// A password is only meaningful after a user name, so a password
// without a user is dropped (its length is recorded as 0).
// ans->nurl is set to the number of runes in ans->url including the
// terminating NUL.
static ParsedUrl*
urlfromparts(ParsedUrl* p)
{
	int n;
	int schlen;
	int npasswd;
	Rune* x;
	ParsedUrl* ans;

	if(p->scheme == NOSCHEME)
		schlen = 0;
	else
		schlen = Strlen(schemes[p->scheme]);

	// The writer below only emits ":passwd" inside the user branch, so
	// count it on the same condition; otherwise n would exceed what is
	// written and the final assert would fail (e.g. "http://:pw@host/").
	npasswd = (p->nuser != 0) ? p->npasswd : 0;

	// n = number of runes in the assembled url, excluding the NUL
	n = schlen + (schlen != 0);
	if(p->nhost != 0)
		n += 2 + p->nuser + npasswd + p->nhost + p->nport +
			(npasswd != 0) + (p->nuser != 0) + (p->nport != 0);
	n += p->npstart + p->npath + p->nquery + p->nfrag +
		(p->nquery != 0) + (p->nfrag != 0);

	// NOTE(review): this sizing relies on ParsedUrl already containing
	// room for one Rune of url (the NUL) -- confirm against the struct.
	ans = (ParsedUrl*)emalloc(sizeof(ParsedUrl)+n*sizeof(Rune));
	x = ans->url;
	ans->scheme = p->scheme;
	ans->nuser = p->nuser;
	ans->npasswd = npasswd;
	ans->nhost = p->nhost;
	ans->nport = p->nport;
	ans->npstart = p->npstart;
	ans->npath = p->npath;
	ans->nquery = p->nquery;
	ans->nfrag = p->nfrag;

	if(schlen != 0) {
		x = Stradd(x, schemes[p->scheme], schlen);
		*x++ = ':';
	}
	if(p->nhost != 0) {
		x = Stradd(x, L"//", 2);
		ans->user = x;
		ans->passwd = x;
		if(p->nuser != 0) {
			x = Stradd(x, p->user, p->nuser);
			if(npasswd != 0) {
				*x++ = ':';
				ans->passwd = x;
				x = Stradd(x, p->passwd, npasswd);
			}
			*x++ = '@';
		}
		ans->host = x;
		ans->port = x;
		x = Stradd(x, p->host, p->nhost);
		if(p->nport != 0) {
			*x++ = ':';
			ans->port = x;
			x = Stradd(x, p->port, p->nport);
		}
	}
	else {
		// no network location: park the pointers at the current position
		ans->user = x;
		ans->passwd = x;
		ans->host = x;
		ans->port = x;
	}
	ans->pstart = x;
	x = Stradd(x, L"/", p->npstart);
	ans->path = x;
	x = Stradd(x, p->path, p->npath);
	ans->query = x;
	if(p->nquery != 0) {
		*x++ = '?';
		ans->query = x;
		x = Stradd(x, p->query, p->nquery);
	}
	ans->frag = x;
	if(p->nfrag != 0) {
		*x++ = '#';
		// Point past the '#', as is done for '?' above.  Otherwise
		// frag[0:nfrag] would begin with the separator, and copying
		// the result would emit "##" and lose the last rune.
		ans->frag = x;
		x = Stradd(x, p->frag, p->nfrag);
	}
	*x++ = 0;
	assert(x == ans->url+n+1);
	ans->nurl = n+1;
	return ans;
}
// Return 1 if a and b name the same document, 0 otherwise.
// Every part except the fragment is compared: the fragment only
// selects a place within a document, not the document itself.
int
urlequal(ParsedUrl* a, ParsedUrl* b)
{
	if(a->npath != b->npath || !Streqn(a->path, a->npath, b->path))
		return 0;
	if(a->scheme != b->scheme)
		return 0;
	if(a->nhost != b->nhost || !Streqn(a->host, a->nhost, b->host))
		return 0;
	if(a->nport != b->nport || !Streqn(a->port, a->nport, b->port))
		return 0;
	if(a->nuser != b->nuser || !Streqn(a->user, a->nuser, b->user))
		return 0;
	if(a->npasswd != b->npasswd || !Streqn(a->passwd, a->npasswd, b->passwd))
		return 0;
	if(a->npstart != b->npstart || !Streqn(a->pstart, a->npstart, b->pstart))
		return 0;
	if(a->nquery != b->nquery || !Streqn(a->query, a->nquery, b->query))
		return 0;
	return 1;
}
// Like splitl, but the (single) matching character is left out of
// the second part.  If splitl reports an empty second part, nothing
// is adjusted, so an unsplit s ends up entirely in the first part.
static void
split(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
{
	splitl(s, n, cl, p1, n1, p2, n2);
	if(*n2 == 0)
		return;
	// step over the separator that begins the second part
	*p2 += 1;
	*n2 -= 1;
}
// Make a new string that is the canonization of the path s1[0:n1]/s2,
// i.e. the '/'-separated elements of both with "." and ".." resolved.
// s2 must be NUL-terminated (it is measured with Strlen).
// Returns an emalloc'd NUL-terminated string, or nil if no elements
// remain.  At most SMALLBUFSIZE elements are handled; a warning is
// traced when that limit is hit.
static Rune*
canonize(Rune* s1, int n1, Rune* s2)
{
	Rune* elem[SMALLBUFSIZE];
	int elen[SMALLBUFSIZE];
	Rune* seg;
	Rune* out;
	Rune* w;
	int count, nfirst, nsecond;
	int i, j, drop, from, total;

	nfirst = splitall(s1, n1, L"/", elem, elen, SMALLBUFSIZE-1);
	nsecond = splitall(s2, Strlen(s2), L"/", elem+nfirst, elen+nfirst, SMALLBUFSIZE-nfirst);
	count = nfirst + nsecond;
	if(count == SMALLBUFSIZE)
		trace("warning: url too long; truncated\n");

	// Walk the elements, deleting "." and collapsing "x/..".
	i = 0;
	while(i < count) {
		drop = 0;	// how many elements to delete
		from = 0;	// index of the first deleted element
		seg = elem[i];
		if(elen[i] <= 2 && seg[0] == '.') {
			if(elen[i] == 1) {
				// "." : delete just this element
				from = i;
				drop = 1;
			}
			else if(seg[1] == '.') {
				if(i == 0) {
					// ".." with nothing before it: delete only itself
					from = 0;
					drop = 1;
				}
				else {
					// ".." : delete it together with its predecessor
					from = i-1;
					drop = 2;
				}
			}
		}
		if(drop == 0) {
			i++;
			continue;
		}
		// close the gap and re-examine from where the deletion began
		for(j = from+drop; j < count; j++) {
			elem[j-drop] = elem[j];
			elen[j-drop] = elen[j];
		}
		count -= drop;
		i = from;
	}

	if(count == 0)
		return nil;

	// total length = all elements plus one '/' between each pair
	total = count-1;
	for(i = 0; i < count; i++)
		total += elen[i];

	out = emalloc((total+1)*sizeof(Rune));
	w = out;
	for(i = 0; i < count; i++) {
		if(i > 0)
			*w++ = '/';
		w = Stradd(w, elem[i], elen[i]);
	}
	*w = 0;
	return out;
}
// Return a newly allocated copy of url; the copy is rebuilt from
// url's parts by urlfromparts.
ParsedUrl*
copyurl(ParsedUrl* url)
{
	ParsedUrl* dup;

	dup = urlfromparts(url);
	return dup;
}
// For debugging: check one part of u.
// An empty part (npart == 0) is always acceptable; otherwise the part
// must have positive length, be non-nil, and lie entirely within
// u->url[0:n].  Returns 1 if acceptable, 0 if not.
static int
validurlpart(ParsedUrl* u, int n, int npart, Rune* part)
{
	Rune* first;
	Rune* limit;

	if(npart == 0)
		return 1;
	if(npart < 0 || part == nil)
		return 0;
	first = &u->url[0];
	limit = &u->url[n];
	return first <= part && part+npart <= limit;
}
// For debugging: return nonzero if u looks like a well-formed result
// of urlfromparts, 0 otherwise.  Checks that
//  - u is non-nil and its scheme is within [NOSCHEME, NSCHEMES),
//  - the length implied by the part lengths agrees with u->nurl,
//  - u->url is NUL-terminated at that length,
//  - every non-empty part points inside u->url,
//  - pstart is either empty or a single '/'.
int
validurl(ParsedUrl* u)
{
	int n;

	if(u == nil || u->scheme < NOSCHEME || u->scheme >= NSCHEMES)
		return 0;

	// Recompute the url length (excluding the NUL) the same way
	// urlfromparts does.
	n = (u->scheme == NOSCHEME)? 0 : Strlen(schemes[u->scheme])+1;
	if(u->nhost != 0)
		n += 2 + u->nuser + u->npasswd + u->nhost + u->nport +
			(u->npasswd != 0) + (u->nuser != 0) + (u->nport != 0);
	n += u->npstart + u->npath + u->nquery + u->nfrag +
		(u->nquery != 0) + (u->nfrag != 0);

	// urlfromparts stores nurl as the rune count including the
	// terminating NUL (n+1); comparing against bare n would reject
	// every url it builds.
	return u->nurl == n+1 &&
		u->url[n] == 0 &&
		validurlpart(u, n, u->nuser, u->user) &&
		validurlpart(u, n, u->npasswd, u->passwd) &&
		validurlpart(u, n, u->nhost, u->host) &&
		validurlpart(u, n, u->nport, u->port) &&
		(u->npstart == 0 || (u->npstart == 1 && u->pstart[0] == L'/')) &&
		validurlpart(u, n, u->npath, u->path) &&
		validurlpart(u, n, u->nquery, u->query) &&
		validurlpart(u, n, u->nfrag, u->frag);
}
|