// htmlFetch library module.  This contains code for fetching a web page
// from the internet or from a local file, and parsing it.  It can also
// submit forms and get back the results.

global class htmlStatus
// HTTP version and status code taken from the first line of a response.
{
    string version="HTTP/1.1";  // HTTP protocol version
    int status=200;             // HTTP status code.  200 means success.
}

global class htmlCookie
// A cookie - usually stored by the browser.  We need to echo it back
// when we post forms.
{
    string name;        // Cookie name.
    string value;       // Cookie value.
    string domain;      // Value of the cookie's domain attribute.
    string path;        // Value of the cookie's path attribute.
    string expires;     // Value of the cookie's expires attribute.
    bit secure;         // Set when the cookie carries the secure attribute.
}

global class htmlTag
// A single HTML tag: its name and attribute list, but none of the text
// that follows it.
{
    string name;                // Tag name.
    dir of string attributes;   // All attributes.  May be nil.
    int start;                  // Start position within htmlPage.text
    int end;                    // End position (one past closing '>')

    flow attribute(string attName, string usual=nil) into (string val)
    // Look up attName (converted to upper case) in the attribute list.
    // Returns the stored value, or usual when there is no attribute list
    // or the attribute is not in it.
    {
        string upperName = attName.upper();
        val = usual;
        if (attributes)
            {
            string found = attributes[upperName];
            if (found != nil)
                val = found;
            }
    }

    flow requiredAttribute(string attName) into (string val)
    // Like attribute(), but punts when the attribute is not present
    // instead of substituting a default.
    {
        string upperName = attName.upper();
        if (attributes)
            val = attributes[upperName];
        if (val == nil)
            punt("Missing required $attName attribute in $name");
    }
}

global class htmlFormVar
// A variable within an html form.  Associated with a button, input tag etc.
{
    string name;                // Variable name.
    writable string value;      // Current value if any.  May be nil.
    string type;                // Variable type.
    array of string values;     // Available values for some types, nil for others.
    string tagName;             // Name of associated tag.
    array of htmlTag tags;      // Associated html tags.
}

global class htmlForm
// A form within an html page.
{
    string name;                // NAME attribute of the FORM tag; htmlFormsParse stores "" when absent
    string action;              // URL to call when they press submit
    string method="GET";        // Could also be POST.
    int startTagIx;             // Index of start tag in page.tags
    int endTagIx;               // Index of the /FORM tag in page.tags (as set by htmlFormsParse)
    dir of htmlFormVar vars;    // Variables defined in form.

    to varsToStringDir() into (dir of string strings)
    // Convert our CGI vars into a simple dir of string keyed by variable
    // name.  SELECT and TEXTAREA variables are always included, as are
    // INPUT variables of type RADIO, TEXTBOX, PASSWORD, HIDDEN, TEXT and
    // FILE.  A CHECKBOX is included only when it has a value.  Variables
    // of any other type (buttons and the like) are left out.  A nil value
    // is sent as the empty string.
    {
        strings = ();
        for (var in vars)
            {
            bit gotIt = false;
            string val = var.value;
            string type = var.type;
            if (var.tagName == "SELECT" || var.tagName == "TEXTAREA")
                gotIt = true;
            else if (type)
                {
                if (type.same("RADIO") || type.same("TEXTBOX")
                    || type.same("PASSWORD") || type.same("HIDDEN")
                    || type.same("TEXT") || type.same("FILE"))
                    gotIt = true;
                else if (type.same("CHECKBOX") && val)
                    gotIt = true;
                }
            if (gotIt)
                {
                if (val == nil)
                    val = "";
                strings[var.name] = val;
                }
            }
    }
}

global class htmlPage
// A complete parsed-out html page.
{
    string url;                     // URL or file where we got the page.
    string text;                    // Full unparsed text including headers
    htmlStatus status;              // Version and status info
    dir of string header;           // Header lines including cookies
    dir of htmlCookie cookies;      // Cookies
    int htmlStartPos;               // Start of html code within text
    array of htmlTag tags;          // All the tags in order
    array of htmlForm forms;        // Possibly empty array of forms.

    to submit(htmlForm form=nil, string button="submit", string value="submit") into (htmlPage page)
    // Submit form and get response from server.
{
    // Body of htmlPage.submit.  With no form given, fall back to the
    // first form on the page.
    if (!form)
        {
        if (!forms)
            punt("Can't submit $url because forms is nil.");
        if (forms.size < 1)
            punt("Can't submit $url because forms is empty.");
        form = forms[0];
        if (!form)
            punt("Can't submit $url because forms[0] is nil.");
        }
    // Collect the form variables, add the pressed button (if any), and
    // send them to the form's action URL along with our cookies.
    dir of string cgiVars = form.varsToStringDir();
    if (button)
        cgiVars[button] = value;
    string dest = expandRelativeUrl(url, form.action);
    page = htmlPageFetch(dest, cgiVars, form.method, cookies);
}
}   // end of class htmlPage

flow htmlStatusParse(string text, int pos) into (htmlStatus status, int newPos)
// Read in the status line that starts at pos.  Returns the parsed
// version/status pair and the position just past the line.  Punts if
// the line is missing, has fewer than two words, or does not begin
// with "HTTP/".
{
    string line;
    (line, newPos) = text.nextLine(pos);
    if (!line)
        punt("Empty HTML file");
    array of string words = line.words();           // Chop line into words
    if (words.size < 2)
        punt("bad HTTP status line");               // Check for two words
    status = (words[0], words[1].asInt());          // Initialize status object
    if (!status.version.startsWith("HTTP/"))
        punt("No HTTP in status line");
}

flow htmlCookieParse(string line) into (htmlCookie cookie)
// Parse the value of a Set-Cookie header: a ';' separated list whose
// first item is name=value and whose remaining items are attributes.
// Attribute names are matched without regard to case.  An attribute
// may appear without '=' (the usual form of "secure"); such an
// attribute gets an empty value.  Empty items, as left by a trailing
// ';', are skipped.  Unknown attributes are ignored.  Punts if the
// first item has no '=' or if there are no items at all.
{
    array of string parts = line.split(';');
    for (part in parts)
        {
        string item = part.trim();
        if (item.size == 0)
            continue;
        string field, val;
        int equals = item.find('=');
        if (equals >= 0)
            {
            field = item.first(equals);
            val = item.rest(equals+1);
            }
        else
            {
            // Valueless attribute such as "secure".
            field = item;
            val = "";
            }
        if (!cookie)    // first time through, alloc self and set name
            {
            if (equals < 0)
                punt("missing equals in cookie $item");
            cookie = ();
            cookie.name = field;
            cookie.value = val;
            }
        else if (field.same('domain'))
            cookie.domain = val;
        else if (field.same('path'))
            cookie.path = val;
        else if (field.same('expires'))
            cookie.expires = val;
        else if (field.same('secure'))
            cookie.secure = 1;
        }
    if (!cookie)
        punt("missing equals in cookie $line");
}

flow htmlHeaderParse(string text, int pos) into (dir of string header, dir of htmlCookie cookies, int newPos)
// Read lines until get a blank one.  Put the lines into a hash.
// The lines are of the format key: value.  We strip the colon.
{
    // Body of htmlHeaderParse.  Each header line is stored in header
    // under its key.  Set-Cookie lines are additionally parsed and
    // stored in cookies under the cookie's name.  newPos ends up just
    // past the blank line that terminates the header.
    string line;
    header = ();
    cookies = ();
    newPos = pos;
    for (;;)
        {
        (line,newPos) = text.nextLine(newPos);
        if (!line)
            punt("End of file in HTTP header");
        (string key, int p) = line.nextWord(0);
        if (!key)
            break;
        // Check for trailing colon and then trim it.
        if (!key.endsWith(':'))
            punt("Missing : in HTTP header line\n\t$line");
        key = key.first(key.size-1);
        string val = line.rest(p).trim();
        header[key] = val;
        // Header names are not case sensitive, so accept any casing.
        if (key.same("Set-Cookie"))
            {
            htmlCookie cookie = htmlCookieParse(val);
            cookies[cookie.name] = cookie;
            }
        }
}

flow htmlTagsParse(string text, int pos) into (array of htmlTag tags)
// Scan text starting at pos for HTML tags and return an array of them.
// Tag names and attribute names are converted to upper case.  An
// attribute given without a value gets the empty string.  Anything
// starting with "<!" is skipped.  A tag or comment that runs off the
// end of the text produces a warning rather than an error.
{
    int p = pos;        // Current position in text
    tags = ();          // Allocate results array
    while (p < text.size)
        {
        char c = text[p];
        p++;
        if (c == '<')
            {
            if (p >= text.size)
                {
                // '<' is the very last character - nothing to look at.
                warn("unclosed tag " + text.rest(p-1));
                break;
                }
            if (text[p] == '!')     // HTML comment
                {
                p++;
                if (p+1 < text.size && text[p] == '-' && text[p+1] == '-')
                    p = text.findNext('-->', p);
                else if (p < text.size)
                    p = text.findNext('>', p);
                else
                    p = -1;         // "<!" were the last characters of text
                if (p < 0)
                    {
                    warn("End of file in comment");
                    break;
                    }
                }
            else    // It's a real tag, let's process it.
                {
                // Make up tag and set start position to <
                htmlTag tag = ();
                tag.start = p-1;
                tag.attributes = ();

                // Grab first word into tagName.
                int end = p;
                for (;;)
                    {
                    c = text[end];
                    if (c == '>' || isSpace(c))
                        break;
                    end++;
                    if (end >= text.size)
                        {
                        warn("unclosed tag " + text.rest(tag.start));
                        break;
                        }
                    }
                tag.name = text.middle(p, end-p).upper();
                tags.append(tag);
                p = end;

                // If we're at end of tag already fill in a few
                // fields and bail on rest of loop
                if (p >= text.size)
                    {
                    tag.end = text.size;
                    continue;
                    }
                else if (c == '>')
                    {
                    tag.end = end+1;
                    continue;
                    }

                // Process name/value pairs until we get end tag.
                for (;;)
                    {
                    string name,val;

                    // Check for end tag.
                    p += text.leadingSpaces(p);
                    if (p >= text.size)
                        {
                        warn("unclosed tag " + text.rest(tag.start));
                        tag.end = text.size;
                        break;
                        }
                    if (text[p] == '>')
                        {
                        p++;
                        tag.end = p;
                        break;
                        }

                    // Get name - everything up to equals.
                    end = p;
                    while (end < text.size)
                        {
                        c = text[end];
                        if (c == '=')
                            break;
                        else if (c == '>')
                            break;
                        else if (isSpace(c))
                            break;
                        end += 1;
                        }

                    // Save name and advance our position
                    name = text.middle(p, end-p);
                    p = end;
                    if (p >= text.size)
                        continue;   // Error message will happen at start of loop
                    else if (c == '>' || isSpace(c))
                        val = "";
                    else    // case c == '='
                        {
                        p++;
                        p += text.leadingSpaces(p);
                        if (p >= text.size)
                            continue;   // Text ends after '='; start of loop will warn
                        if (text[p] == '"')
                            (val,p) = text.betweenQuotes(p);
                        else
                            {
                            end = p;
                            while (end < text.size)
                                {
                                c = text[end];
                                if (c == '>' || isSpace(c))
                                    break;
                                end++;
                                }
                            val = text.middle(p, end-p);
                            p = end;
                            }
                        }

                    // Create attribute dir if it doesn't exist yet.
                    if (!tag.attributes)
                        tag.attributes = ();
                    tag.attributes[name.upper()] = val;
                    }
                }
            }
        }
}

flow findOrMakeVar(writable dir of htmlFormVar vars, string name, htmlTag tag) into (htmlFormVar var)
// Find a variable of existing name if it exists, otherwise
// make a new one and add it to vars.  Add reference to this tag
// to var, and return it.  Punts if an existing variable of this name
// came from a different kind of tag.
{
    var = vars[name];
    if (!var)
        {
        var = ();
        var.name = name;
        var.tagName = tag.name;
        var.tags = (tag);
        vars[name] = var;
        }
    else
        {
        if (var.tagName != tag.name)
            punt("Mixing types " + var.tagName + " and " + tag.name + " in FORM variable $name");
        var.tags.append(tag);
        }
}

flow isMixableInputType(string type) into (bit ok)
// Return TRUE if this is an INPUT type you can mix with other
// mixable types under one variable name: BUTTON, SUBMIT or IMAGE.
// The comparison ignores case, since type comes straight from the
// TYPE attribute as written in the page.
{
    ok = type.same("BUTTON") || type.same("SUBMIT") || type.same("IMAGE");
}

flow formParseVars(htmlPage page, htmlForm form) into (dir of htmlFormVar vars)
// Figure out the variables used in a form.  Variables may be constructed
// from buttons, INPUT tags, OPTION lists, or TEXTAREAS.
{
    // Body of formParseVars.  Walks the tags from form.startTagIx up
    // to form.endTagIx and builds one htmlFormVar per variable name.
    vars = ();
    for (tagIx in form.startTagIx til form.endTagIx)
        {
        htmlTag tag = page.tags[tagIx];
        if (tag.name == "INPUT")
            {
            string type = tag.attribute("TYPE", "TEXT");
            string varName = tag.attribute("NAME");
            string value = tag.attribute("VALUE");
            if (!varName)
                {
                // Only button-like inputs may go without a name.
                if (!type.same("SUBMIT") && !type.same("CLEAR")
                    && !type.same("BUTTON") && !type.same("RESET")
                    && !type.same("IMAGE"))
                    punt("Missing NAME attribute in INPUT $type");
                continue;
                }
            htmlFormVar var = findOrMakeVar(vars, varName, tag);
            // Type names are compared without regard to case, as
            // everywhere else in this function.
            if (var.type && !var.type.same(type))
                if (!isMixableInputType(var.type) || !isMixableInputType(type))
                    punt("Mixing input types $type and " + var.type);
            var.type = type;
            if (type.same("TEXT") || type.same("PASSWORD")
                || type.same("FILE") || type.same("HIDDEN")
                || type.same("IMAGE"))
                var.value = value;
            else if (type.same("CHECKBOX"))
                {
                // A checked box submits its VALUE attribute; "on" is
                // only the default when no VALUE is given.
                if (tag.attribute("CHECKED") != nil)
                    {
                    if (value != nil)
                        var.value = value;
                    else
                        var.value = "on";
                    }
                }
            else if (type.same("RADIO"))
                {
                if (tag.attribute("CHECKED") != nil)
                    var.value = value;
                if (!var.values)
                    var.values = (value);
                else
                    var.values.append(value);
                }
            else if (type.same("RESET") || type.same("BUTTON")
                || type.same("SUBMIT"))
                {
                // do nothing
                }
            else
                punt("Unrecognized input type $type");
            }
        else if (tag.name == "SELECT")
            {
            string varName = tag.requiredAttribute("NAME");
            htmlFormVar var = findOrMakeVar(vars, varName, tag);
            for (subIx in tagIx+1 til form.endTagIx)
                {
                htmlTag subTag = page.tags[subIx];
                if (subTag.name == "/SELECT")
                    break;
                else if (subTag.name == "OPTION")
                    {
                    string val = subTag.attribute("VALUE");
                    if (!val)
                        {
                        // We use what is between this tag and next for value,
                        // minus the surrounding white space.
                        int start = subTag.end;
                        int end = page.text.findNext('<', start);
                        if (end > 0)
                            val = page.text.middle(start, end-start).trim();
                        }
                    if (val)
                        {
                        if (!var.values)
                            var.values = (val);
                        else
                            var.values.append(val);
                        if (subTag.attribute("SELECTED") != nil)
                            var.value = val;
                        }
                    }
                }
            // Nothing SELECTED: default to the first option.  Done after
            // the loop so it also applies when /SELECT is missing.
            if (!var.value && var.values)
                var.value = var.values[0];
            }
        else if (tag.name == "TEXTAREA")
            {
            string varName = tag.requiredAttribute("NAME");
            htmlFormVar var = findOrMakeVar(vars, varName, tag);
            // We use what is between this tag and next for value
            int start = tag.end;
            int end = page.text.findNext('<', start);
            if (end > 0)
                var.value = page.text.middle(start, end-start);
            }
        }
}

flow htmlFormsParse(htmlPage page) into (array of htmlForm forms)
// Assuming that page.tags is already present, rummage through
// it and return forms.  Punts on a FORM nested inside another FORM,
// on a /FORM with no open FORM, and on a FORM without an ACTION.
// A FORM that is never closed is taken to run to the last tag.
{
    forms = ();
    htmlForm form = nil;
    for (tagIx@tag in page.tags)
        {
        if (tag.name == "FORM")
            {
            if (form)
                punt("FORM inside of FORM");
            form = ();
            form.startTagIx = tagIx;
            forms.append(form);
            form.name = tag.attribute("NAME", "");
            form.action = tag.requiredAttribute("ACTION");
            form.method = tag.attribute("METHOD", "GET");
            }
        else if (tag.name == "/FORM")
            {
            if (!form)
                punt("/FORM without FORM");
            form.endTagIx = tagIx;
            form = nil;
            }
        }
    // Without this an unclosed form keeps endTagIx at 0 and none of
    // its variables would be found.
    if (form)
        form.endTagIx = page.tags.size;
    for (oneForm in forms)
        {
        oneForm.vars = formParseVars(page, oneForm);
        }
}

global flow htmlPageParse(string text, string name) into (htmlPage page)
// Build an htmlPage from text, recording name as its url.  If text
// starts with "HTTP/" the status line and header are parsed first.
// Otherwise the page gets a default status and an empty header.
// A missing Content-Type is taken to be text/html.  Tags and forms
// are parsed only when the content type starts with text/html.
{
    page = ();
    page.url = name;
    page.text = text;
    int pos = 0;
    string contentType;
    if (text.startsWith("HTTP/"))
        {
        (page.status, pos) = htmlStatusParse(page.text, pos);
        (page.header, page.cookies, pos) = htmlHeaderParse(page.text, pos);
        contentType = page.header["Content-Type"];
        }
    else
        {
        page.status = ();
        page.header = ();
        }
    if (!contentType)
        {
        contentType = 'text/html';
        page.header['Content-Type'] = contentType;
        }
    page.htmlStartPos = pos;
    if (contentType.startsWith('text/html'))
        {
        page.tags = htmlTagsParse(page.text, pos);
        page.forms = htmlFormsParse(page);
        }
}

flow httpCookiesToString(dir of htmlCookie cookies) into (string out)
// Convert cookie directory into a "Cookie:" header line that we can
// send, with each cookie written as name=value; and the line ended
// by \r\n.
{
    dyString tmp = "Cookie:";
    for (cookie in cookies)
        tmp += ' ' + cookie.name + '=' + cookie.value + ';';
    tmp += '\r\n';
    out = tmp;
}

flow cgiVarsToString(dir of string vars) into (string out)
// Make up a string with the cgi variables all encoded.
{
    // Body of cgiVarsToString.  Produces ?key=val&key=val... with both
    // keys and values CGI-encoded.  out is left nil when vars is nil.
    if (vars)
        {
        dyString tmp = "";
        for (key@val in vars)
            {
            if (tmp.size == 0)
                tmp.append('?');
            else
                tmp.append('&');
            tmp += key.cgiEncode();
            tmp += '=';
            tmp += val.cgiEncode();
            }
        out = tmp;
        }
}

flow urlIsHttp(string url) into (bit b)
// Return true if url starts with http://  (compared without regard
// to case).  A url shorter than the prefix is simply not http.
{
    b = false;
    if (url.size >= 'http://'.size)
        b = url.first('http://'.size).same('http://');
}

flow expandRelativeUrl(string baseUrl, string relUrl) into (string fullUrl)
// Unless relUrl starts with "http://" it needs to be expanded
// relative to the base url:
//   - relUrl starting with http:// is returned unchanged
//   - relUrl starting with '/' replaces everything after the host
//   - any other relUrl replaces everything after the last '/' in the
//     path of baseUrl
// Punts if baseUrl itself does not start with http://
{
    if (!urlIsHttp(baseUrl))
        punt("Can't expandRelativeUrl from $baseUrl, no http://");
    if (urlIsHttp(relUrl))
        fullUrl = relUrl;
    else
        {
        int slash;
        if (relUrl.startsWith('/'))
            {
            slash = baseUrl.findNext('/', 'http://'.size);
            relUrl = relUrl.rest(1);
            }
        else
            {
            slash = baseUrl.findLast('/');
            // For a base like http://host the last '/' belongs to the
            // "//" after the scheme; treat that as having no path.
            if (slash < 'http://'.size)
                slash = -1;
            }
        if (slash >= 0)
            fullUrl = baseUrl.first(slash+1) + relUrl;
        else
            fullUrl = baseUrl + '/' + relUrl;
        }
}

flow httpForwardingAddress(string text) into (string url)
// Look through text for signs that it is just a forwarding
// message. The big clue for this is the 301 that is the
// second word in the text right after the HTTP version.
// If we find it then try and find the forwarding address
// and stick it in URL. If we fail for any reason, or if
// the page doesn't need forwarding to begin with, url will be nil.
{
    (string line,int pos) = text.nextLine(0);
    if (line)
        {
        if (line.startsWith("HTTP/"))
            {
            array of string words = line.words();
            if (words.size >= 2)
                {
                if (words[1] == "301")
                    {
                    url = text.between('