// htmlFetch library module. This contains code for fetching a web page
// from the internet or from a local file, and parsing it. Also it can
// submit forms and get back the results.
global class htmlStatus
// HTTP version and status code, as found on the first line of a response.
{
string version="HTTP/1.1"; // HTTP protocol version. Defaults to "HTTP/1.1".
int status=200; // HTTP status code. 200 means A-OK, and is the default.
}
global class htmlCookie
// A cookie - usually stored by the browser. We need to
// echo it back to the server when we post forms.
{
string name; // Cookie name.
string value; // Cookie value.
string domain; // The set of web domains the cookie applies to.
string path; // URL path the cookie is scoped to, as sent by the server.
string expires; // Expiration date, kept as the unparsed string from the server.
bit secure; // Is it a secure cookie?
}
global class htmlTag
// A single HTML tag: its name, its attributes and where it sits in the
// page text.  The text between tags is not stored here.
{
    string name;               // Tag name.
    dir of string attributes;  // All attributes, keyed by name. May be nil
    int start;                 // Start position within htmlPage.fullText
    int end;                   // End position (one past closing '>')

    flow attribute(string attName, string usual=nil) into (string val)
    // Look up an attribute.  The name is upper-cased before the lookup.
    // When the tag has no such attribute the caller's fallback value
    // 'usual' is returned instead.
    {
        string upperName = attName.upper();
        string found = nil;
        if (attributes)
            found = attributes[upperName];
        if (found != nil)
            val = found;
        else
            val = usual;
    }

    flow requiredAttribute(string attName) into (string val)
    // Look up an attribute that must be present.  The name is upper-cased
    // before the lookup.  Punts when the attribute is missing.
    {
        string upperName = attName.upper();
        if (attributes)
            val = attributes[upperName];
        if (val == nil)
            punt("Missing required $attName attribute in $name");
    }
}
global class htmlFormVar
// A variable within an html form. Associated with a button, input tag etc.
// Several tags can contribute to the same variable; they are all kept in tags.
{
string name; // Variable name.
writable string value; // Current value if any. May be nil.
string type; // Variable type.
array of string values; // Available values for some types, nil for others.
string tagName; // Name of associated tag.
array of htmlTag tags; // Associated html tags.
}
global class htmlForm
// A form within an html page.
{
    string name;               // Name (nil if not defined)
    string action;             // URL to call when they press submit
    string method="GET";       // Could also be POST.
    int startTagIx;            // Index of start tag in page.tags
    int endTagIx;              // Index just past end tag in page.tags
    dir of htmlFormVar vars;   // Variables defined in form.

    to varsToStringDir() into (dir of string strings)
    // Convert our CGI vars into a simple dir of string keyed by variable
    // name.  Only variables that carry a value on submit are included:
    // SELECT and TEXTAREA variables always, INPUT variables of type RADIO,
    // TEXTBOX, PASSWORD, HIDDEN, TEXT or FILE always, and CHECKBOX only
    // when it has a value.  A nil value is sent as the empty string.
    {
        strings = ();
        for (var in vars)
            {
            bit gotIt = false;
            string val = var.value;
            string type = var.type;
            if (var.tagName == "SELECT" || var.tagName == "TEXTAREA")
                gotIt = true;
            else if (type)
                {
                if (type.same("RADIO") || type.same("TEXTBOX")
                    || type.same("PASSWORD") || type.same("HIDDEN")
                    || type.same("TEXT") || type.same("FILE"))
                    gotIt = true;
                else if (type.same("CHECKBOX") && val)
                    gotIt = true;
                }
            if (gotIt)
                {
                if (val == nil)
                    val = "";
                strings[var.name] = val;
                }
            }
    }
}
global class htmlPage
// A complete parsed-out html page.
{
    string url;                  // URL or file where we got the page.
    string text;                 // Full unparsed text including headers
    htmlStatus status;           // Version and status info
    dir of string header;        // Header lines including cookies
    dir of htmlCookie cookies;   // Cookies
    int htmlStartPos;            // Start of html code within text
    array of htmlTag tags;       // All the tags in order
    array of htmlForm forms;     // Possibly empty array of forms.

    to submit(htmlForm form=nil, string button="submit", string value="submit")
    into (htmlPage page)
    // Submit form and get response from server.
    //   form   - form to submit; when nil the first form on this page is used.
    //   button - name of the button variable to add to the CGI variables;
    //            pass nil to add none.
    //   value  - value sent for that button.
    // Punts when no form is given and the page has no usable first form.
    {
        if (!form)
            {
            if (!forms)
                punt("Can't submit $url because forms is nil.");
            if (forms.size < 1)
                punt("Can't submit $url because forms is empty.");
            form = forms[0];
            if (!form)
                punt("Can't submit $url because forms[0] is nil.");
            }
        dir of string cgiVars = form.varsToStringDir();
        if (button)
            cgiVars[button] = value;
        // The form's action may be relative to the page we came from.
        string dest = expandRelativeUrl(url, form.action);
        page = htmlPageFetch(dest, cgiVars, form.method, cookies);
    }
}
flow htmlStatusParse(string text, int pos) into (htmlStatus status, int newPos)
// Parse the status line that starts at pos in text.  Returns the status
// object plus the position just past that line.  Punts if the line is
// missing, has fewer than two words, or does not begin with "HTTP/".
{
string statusLine;
(statusLine, newPos) = text.nextLine(pos);
if (!statusLine)
    punt("Empty HTML file");

// First word is the protocol version, second is the numeric status code.
array of string fields = statusLine.words();
if (fields.size < 2)
    punt("bad HTTP status line");
string versionWord = fields[0];
int code = fields[1].asInt();
status = (versionWord, code);

if (!status.version.startsWith("HTTP/"))
    punt("No HTTP in status line");
}
flow htmlCookieParse(string line) into (htmlCookie cookie)
// Parse the value part of a Set-Cookie header into a cookie object.
// The line is a ';' separated list.  The first item must be name=value
// and names the cookie; the remaining items are attributes such as
// domain, path, expires and secure.
//
// Attributes without an '=' (secure is normally sent that way) are
// accepted rather than punted on, empty items left by a trailing ';'
// are skipped, and attribute names are matched with same() instead of
// == so that "Path" and "path" are treated alike (this assumes same()
// ignores case, as its use on TYPE values elsewhere in this file
// suggests).  Unknown attributes are ignored.
// Returns nil if the line contains no items at all.
{
array of string parts = line.split(';');
for (part in parts)
    {
    part = part.trim();
    if (!part)
        continue;
    if (part.size == 0)
        continue;       // Trailing ';' or doubled ';;'

    // Split the item into field and value; a bare word has no value.
    string field, val;
    int equals = part.find('=');
    if (equals >= 0)
        {
        field = part.first(equals);
        val = part.rest(equals+1);
        }
    else
        {
        field = part;
        val = "";
        }

    if (!cookie)    // first time through, alloc self and set name
        {
        // The leading name=value pair is mandatory.
        if (equals < 0) punt("missing equals in cookie $part");
        cookie = ();
        cookie.name = field;
        cookie.value = val;
        }
    else if (field.same('domain'))
        cookie.domain = val;
    else if (field.same('path'))
        cookie.path = val;
    else if (field.same('expires'))
        cookie.expires = val;
    else if (field.same('secure'))
        cookie.secure = 1;
    }
}
flow htmlHeaderParse(string text, int pos)
into (dir of string header, dir of htmlCookie cookies, int newPos)
// Read header lines starting at pos until we get a blank one.  Each line
// has the format key: value.  The key, with its colon stripped, and the
// trimmed value are put into the header dir.  Set-Cookie lines are also
// parsed into the cookies dir, keyed by cookie name.  newPos is the
// position just past the blank line.
// Punts on end of text inside the header or on a key without a colon.
{
string line;
header = ();
cookies = ();
newPos = pos;
for (;;)
    {
    (line,newPos) = text.nextLine(newPos);
    if (!line) punt("End of file in HTTP header");
    (string key, int p) = line.nextWord(0);
    if (!key)
        break;          // Blank line ends the header.
    // Check for trailing colon and then trim it.
    if (!key.endsWith(':')) punt("Missing : in HTTP header line\n\t$line");
    key = key.first(key.size-1);
    string val = line.rest(p).trim();
    header[key] = val;
    // Header field names are not case sensitive, so match with same()
    // rather than == (assumes same() ignores case, as elsewhere in
    // this file).
    if (key.same("Set-Cookie"))
        {
        htmlCookie cookie = htmlCookieParse(val);
        // The parser leaves cookie nil when the value has no items.
        if (cookie)
            cookies[cookie.name] = cookie;
        }
    }
}
flow htmlTagsParse(string text, int pos) into (array of htmlTag tags)
// Scan text starting at pos for HTML tags and return an array of them.
// Each tag gets its upper-cased name, its start and end positions in
// text, and a dir of attributes keyed by upper-cased attribute name.
// Anything starting with <! (comments and declarations) is skipped.
// Truncated input produces a warning rather than a read past the end
// of text: every index into text is checked against text.size first.
{
int p = pos;    // Current position in text
tags = ();      // Allocate results array
while (p < text.size)
    {
    char c = text[p];
    p++;
    if (c == '<')
        {
        if (p >= text.size)
            {
            // Text ends right after '<', nothing more to look at.
            warn("unclosed tag " + text.rest(p-1));
            break;
            }
        if (text[p] == '!')     // HTML comment
            {
            p++;
            // Only peek at the two chars after '!' if they exist.
            bit dashComment = false;
            if (p+1 < text.size)
                {
                if (text[p] == '-' && text[p+1] == '-')
                    dashComment = true;
                }
            if (p >= text.size)
                p = -1;
            else if (dashComment)
                p = text.findNext('-->', p);
            else
                p = text.findNext('>', p);
            if (p < 0)
                {
                warn("End of file in comment");
                break;
                }
            }
        else  // It's a real tag, let's process it.
            {
            // Make up tag and set start position to <
            htmlTag tag = ();
            tag.start = p-1;
            tag.attributes = ();

            // Grab first word into tagName.
            int end = p;
            for (;;)
                {
                c = text[end];
                if (c == '>' || isSpace(c))
                    break;
                end++;
                if (end >= text.size)
                    {
                    warn("unclosed tag " + text.rest(tag.start));
                    break;
                    }
                }
            tag.name = text.middle(p, end-p).upper();
            tags.append(tag);
            p = end;

            // If we're at end of tag already fill in a few
            // fields and bail on rest of loop
            if (p >= text.size)
                {
                tag.end = text.size;
                continue;
                }
            else if (c == '>')
                {
                tag.end = end+1;
                continue;
                }

            // Process name/value pairs until we get end tag.
            for (;;)
                {
                string name,val;
                bit gotEnd=0;

                // Check for end tag.
                p += text.leadingSpaces(p);
                if (p >= text.size)
                    {
                    warn("unclosed tag " + text.rest(tag.start));
                    tag.end = text.size;
                    break;
                    }
                if (text[p] == '>')
                    {
                    p++;
                    tag.end = p;
                    break;
                    }

                // Get name - everything up to equals.
                end = p;
                while (end < text.size)
                    {
                    c = text[end];
                    if (c == '=')
                        break;
                    else if (c == '>')
                        break;
                    else if (isSpace(c))
                        break;
                    end += 1;
                    }

                // Save name and advance our position
                name = text.middle(p, end-p);
                p = end;
                if (p >= text.size)
                    continue;   // Error message will happen at start of loop
                else if (c == '>' || isSpace(c))
                    val = "";
                else  // case c == '='
                    {
                    p++;
                    p += text.leadingSpaces(p);
                    if (p >= text.size)
                        continue;   // Text ends after '='; reported at start of loop
                    if (text[p] == '"')
                        (val,p) = text.betweenQuotes(p);
                    else
                        {
                        // Unquoted value runs up to next space or '>'
                        end = p;
                        while (end < text.size)
                            {
                            c = text[end];
                            if (c == '>' || isSpace(c))
                                break;
                            end++;
                            }
                        val = text.middle(p, end-p);
                        p = end;
                        }
                    }

                // Create attribute dir if it doesn't exist yet.
                if (!tag.attributes)
                    tag.attributes = ();
                tag.attributes[name.upper()] = val;
                }
            }
        }
    }
}
flow findOrMakeVar(writable dir of htmlFormVar vars, string name, htmlTag tag)
into (htmlFormVar var)
// Return the form variable called name, creating it and adding it to
// vars if it is not there yet.  Either way tag is recorded as one of
// the variable's tags.  Punts if an existing variable was made from a
// different kind of tag.
{
var = vars[name];
if (var)
    {
    // Existing variable: all of its tags must be of the same kind.
    if (var.tagName != tag.name)
        punt("Mixing types " + var.tagName +
            " and " + tag.name + " in FORM variable $name");
    var.tags.append(tag);
    }
else
    {
    // New variable, seeded from this tag.
    var = ();
    var.tags = (tag);
    var.tagName = tag.name;
    var.name = name;
    vars[name] = var;
    }
}
flow isMixableInputType(string type) into (bit ok)
// Return TRUE if this is an INPUT type you can mix with other
// types ok: BUTTON, SUBMIT or IMAGE.
// The type is the TYPE attribute value exactly as written in the page,
// so it is compared with same(), like every other TYPE check in this
// file, rather than with == against the upper-case spelling.
{
ok = type.same("BUTTON") || type.same("SUBMIT") || type.same("IMAGE");
}
flow formParseVars(htmlPage page, htmlForm form) into (dir of htmlFormVar vars)
// Figure out the variables used in a form.  Variables may be constructed
// from buttons, INPUT tags, OPTION lists, or TEXTAREAS.  Only the tags
// from form.startTagIx up to form.endTagIx in page.tags are examined.
// Punts on an INPUT that needs a NAME but has none, on an unrecognized
// INPUT type, and on incompatible types sharing one variable name.
{
vars = ();
for (tagIx in form.startTagIx til form.endTagIx)
    {
    htmlTag tag = page.tags[tagIx];
    if (tag.name == "INPUT")
        {
        string type = tag.attribute("TYPE", "TEXT");
        string varName = tag.attribute("NAME");
        string value = tag.attribute("VALUE");

        if (!varName)
            {
            // Only button-like inputs may go without a name.
            if (!type.same("SUBMIT") && !type.same("CLEAR")
                && !type.same("BUTTON") && !type.same("RESET")
                && !type.same("IMAGE"))
                punt("Missing NAME attribute in INPUT $type");
            continue;
            }
        htmlFormVar var = findOrMakeVar(vars, varName, tag);
        // Compare types with same(), as the rest of this function does,
        // so that differently-cased spellings of one type do not count
        // as a mix.
        if (var.type && !var.type.same(type))
            {
            if (!isMixableInputType(var.type) || !isMixableInputType(type))
                punt("Mixing input types $type and " + var.type);
            }
        var.type = type;

        if (type.same("TEXT") || type.same("PASSWORD") || type.same("FILE")
            || type.same("HIDDEN") || type.same("IMAGE"))
            var.value = value;
        else if (type.same("CHECKBOX"))
            {
            if (tag.attribute("CHECKED") != nil)
                var.value = "on";
            }
        else if (type.same("RADIO"))
            {
            if (tag.attribute("CHECKED") != nil)
                var.value = value;
            if (!var.values)
                var.values = (value);
            else
                var.values.append(value);
            }
        else if (type.same("RESET") || type.same("BUTTON") ||
            type.same("SUBMIT"))
            {
            // do nothing
            }
        else
            punt("Unrecognized input type $type");
        }
    else if (tag.name == "SELECT")
        {
        string varName = tag.requiredAttribute("NAME");
        htmlFormVar var = findOrMakeVar(vars, varName, tag);
        for (subIx in tagIx+1 til form.endTagIx)
            {
            htmlTag subTag = page.tags[subIx];
            if (subTag.name == "/SELECT")
                break;
            else if (subTag.name == "OPTION")
                {
                string val = subTag.attribute("VALUE");
                if (!val)
                    {
                    // We use what is between this tag and next for value
                    int start = subTag.end;
                    int end = page.text.findNext('<', start);
                    if (end > 0)
                        val = page.text.middle(start, end-start);
                    }
                if (val)
                    {
                    if (!var.values)
                        var.values = (val);
                    else
                        var.values.append(val);
                    if (subTag.attribute("SELECTED") != nil)
                        var.value = val;
                    }
                }
            }
        // With nothing SELECTED the first option is the value.  This is
        // done after the loop so it also applies when the closing
        // /SELECT tag is missing.
        if (!var.value && var.values)
            var.value = var.values[0];
        }
    else if (tag.name == "TEXTAREA")
        {
        string varName = tag.requiredAttribute("NAME");
        htmlFormVar var = findOrMakeVar(vars, varName, tag);
        // We use what is between this tag and next for value
        int start = tag.end;
        int end = page.text.findNext('<', start);
        if (end > 0)
            var.value = page.text.middle(start, end-start);
        }
    }
}
flow htmlFormsParse(htmlPage page) into (array of htmlForm forms)
// Assuming that page.tags is already present, rummage through
// it and return forms.  Each FORM tag starts a form and the matching
// /FORM tag ends it; the variables of each form are then parsed.
// Punts on a FORM nested inside another FORM, on a /FORM without a
// FORM, and on a FORM lacking an ACTION attribute.
{
forms = ();
htmlForm form = nil;
for (tagIx@tag in page.tags)
    {
    if (tag.name == "FORM")
        {
        if (form) punt("FORM inside of FORM");
        form = ();
        form.startTagIx = tagIx;
        forms.append(form);
        form.name = tag.attribute("NAME", "");
        form.action = tag.requiredAttribute("ACTION");
        form.method = tag.attribute("METHOD", "GET");
        }
    else if (tag.name == "/FORM")
        {
        if (!form) punt("/FORM without FORM");
        form.endTagIx = tagIx;
        form = nil;
        }
    }
if (form)
    {
    // The last FORM was never closed.  Let it run to the end of the
    // page, otherwise its end index stays at its initial value and
    // none of its variables would be scanned.
    form.endTagIx = page.tags.size;
    }
for (form in forms)
    {
    form.vars = formParseVars(page,form);
    }
}
global flow htmlPageParse(string text, string name) into (htmlPage page)
// Build a page object from text, recording name as its url.  When the
// text begins with "HTTP/" the status line and header are parsed first;
// otherwise an empty status and header are made up.  A missing
// Content-Type is taken to be text/html.  Tags and forms are only
// parsed for text/html content.
{
page = ();
page.url = name;
page.text = text;

int cursor = 0;         // Where we are in the text
string mimeType;
if (!text.startsWith("HTTP/"))
    {
    // No HTTP response header present, use blank status and header.
    page.status = ();
    page.header = ();
    // mimeType has not been assigned yet; the default goes in below.
    page.header['Content-Type'] = mimeType;
    }
else
    {
    (page.status, cursor) = htmlStatusParse(page.text, cursor);
    (page.header, page.cookies, cursor) = htmlHeaderParse(page.text, cursor);
    mimeType = page.header["Content-Type"];
    }

if (!mimeType)
    {
    mimeType = 'text/html';
    page.header['Content-Type'] = mimeType;
    }

page.htmlStartPos = cursor;
if (mimeType.startsWith('text/html'))
    {
    page.tags = htmlTagsParse(page.text, cursor);
    page.forms = htmlFormsParse(page);
    }
}
flow httpCookiesToString(dir of htmlCookie cookies) into (string out)
// Build the Cookie: request header line for the given cookies.  Each
// cookie is written as " name=value;" and the line is finished off
// with \r\n.
{
dyString headerLine = "Cookie:";
for (cookie in cookies)
    {
    string pair = ' ' + cookie.name + '=' + cookie.value + ';';
    headerLine += pair;
    }
headerLine += '\r\n';
out = headerLine;
}
flow cgiVarsToString(dir of string vars) into (string out)
// Make up a string with the cgi variables all encoded, in the form
// ?name1=val1&name2=val2.  Both the names and the values are passed
// through cgiEncode, so a name containing a character such as '&' or
// '=' cannot corrupt the query string.
// Leaves out nil when vars is nil.
{
if (vars)
    {
    dyString tmp = "";
    for (key@val in vars)
        {
        // '?' introduces the first variable, '&' separates the rest.
        if (tmp.size == 0)
            tmp.append('?');
        else
            tmp.append('&');
        tmp += key.cgiEncode();
        tmp += '=';
        tmp += val.cgiEncode();
        }
    out = tmp;
    }
}
flow urlIsHttp(string url) into (bit b)
// Tell whether url begins with the http:// scheme prefix.  The leading
// characters of url are compared against the prefix using same().
{
string prefix = 'http://';
string head = url.first(7);
b = head.same(prefix);
}
flow expandRelativeUrl(string baseUrl, string relUrl) into (string fullUrl)
// Unless relUrl starts with "http://" then it needs to be expanded
// relative to the base url:
//   - relUrl that is already absolute is returned as is.
//   - relUrl starting with '/' is relative to the host part of baseUrl.
//   - anything else is relative to the directory part of baseUrl,
//     that is everything up to and including its last '/'.
// Punts if baseUrl itself does not start with http://.
{
if (!urlIsHttp(baseUrl))
    punt("Can't expandRelativeUrl from $baseUrl, no http://");
if (urlIsHttp(relUrl))
    fullUrl = relUrl;
else
    {
    int hostStart = 'http://'.size;     // First char past the scheme
    int slash;
    if (relUrl.startsWith('/'))
        {
        slash = baseUrl.findNext('/', hostStart);
        relUrl = relUrl.rest(1);
        }
    else
        {
        slash = baseUrl.findLast('/');
        // A base with no path, like http://host, has its last slash
        // inside the "//" of the scheme.  That one does not count,
        // otherwise the host name would be cut off.
        if (slash < hostStart)
            slash = -1;
        }
    if (slash >= 0)
        fullUrl = baseUrl.first(slash+1) + relUrl;
    else
        fullUrl = baseUrl + '/' + relUrl;
    }
}
flow httpForwardingAddress(string text) into (string url)
// Look through text for signs that it is just a forwarding
// message. The big clue for this is the 301 that is the
// second word in the text right after the HTTP version.
// If we find it then try and find the forwarding address
// and stick it in URL. If we fail for any reason, or if
// the page doesn't need forwarding to begin with, url will be nil.
{
(string line,int pos) = text.nextLine(0);
if (line)
{
if (line.startsWith("HTTP/"))
{
array of string words = line.words();
if (words.size >= 2)
{
if (words[1] == "301")
{
url = text.between('