/* htmlPage - stuff to read, parse, and submit htmlPages and forms.
 *
 * typical usage is:
 *   struct htmlPage *page = htmlPageGet(url);
 *   htmlPageValidateOrAbort(page);
 *   var = htmlPageGetVar(page, page->forms, "org");
 *   if (var != NULL)
 *      printf("Organism = %s\n", var->curVal);
 *   htmlPageSetVar(page, page->forms, "org", "Human");
 *   newPage = htmlPageFromForm(page, page->forms, "submit", "Go");
 */

#include "common.h"
#include "errabort.h"
#include "errCatch.h"
#include "memalloc.h"
#include "linefile.h"
#include "hash.h"
#include "dystring.h"
#include "cheapcgi.h"
#include "obscure.h"
#include "filePath.h"
#include "net.h"
#include "htmlPage.h"

void htmlStatusFree(struct htmlStatus **pStatus)
/* Release a status record together with the version string it owns.
 * Does nothing when *pStatus is NULL. */
{
struct htmlStatus *st = *pStatus;
if (st == NULL)
    return;
freeMem(st->version);
freez(pStatus);
}

void htmlStatusFreeList(struct htmlStatus **pList)
/* Free every element of a list of dynamically allocated htmlStatus's
 * and leave *pList set to NULL. */
{
struct htmlStatus *cur = *pList;
while (cur != NULL)
    {
    struct htmlStatus *following = cur->next;
    htmlStatusFree(&cur);
    cur = following;
    }
*pList = NULL;
}

void htmlCookieFree(struct htmlCookie **pCookie)
/* Release a cookie and all of the strings it owns (name, value, domain,
 * path, expires).  Does nothing when *pCookie is NULL. */
{
struct htmlCookie *ck = *pCookie;
if (ck == NULL)
    return;
freeMem(ck->name);
freeMem(ck->value);
freeMem(ck->domain);
freeMem(ck->path);
freeMem(ck->expires);
freez(pCookie);
}

void htmlCookieFreeList(struct htmlCookie **pList)
/* Free every element of a list of dynamically allocated htmlCookie's
 * and leave *pList set to NULL. */
{
struct htmlCookie *cur = *pList;
while (cur != NULL)
    {
    struct htmlCookie *following = cur->next;
    htmlCookieFree(&cur);
    cur = following;
    }
*pList = NULL;
}

struct htmlCookie *htmlCookieFileRead(char *fileName)
/* Load cookies from a line oriented file.  The first word of each line
 * is the cookie name and the remainder of the line is the cookie value.
 * Aborts when a line carries a name but no value.  The returned list is
 * in file order. */
{
struct htmlCookie *cookieList = NULL;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *rest;
while (lineFileNextReal(lf, &rest))
    {
    struct htmlCookie *ck;
    char *firstWord = nextWord(&rest);
    rest = skipLeadingSpaces(rest);
    if (rest == NULL)
        errAbort("Missing cookie value line %d of %s", lf->lineIx, lf->fileName);
    AllocVar(ck);
    ck->name = cloneString(firstWord);
    ck->value = cloneString(rest);
    /* Built in reverse, put back in file order after the loop. */
    slAddHead(&cookieList, ck);
    }
lineFileClose(&lf);
slReverse(&cookieList);
return cookieList;
}

static void cookieOutput(struct dyString *dy, struct htmlCookie *cookieList)
/* Append a single "Cookie:" header line holding every cookie in
 * cookieList to dy, as name=value pairs separated by ';'.  Nothing is
 * written for an empty list. */
{
struct htmlCookie *ck;
if (cookieList == NULL)
    return;
dyStringAppend(dy, "Cookie:");
for (ck = cookieList; ck != NULL; ck = ck->next)
    {
    boolean isFirst = (ck == cookieList);
    if (!isFirst)
        dyStringAppendC(dy, ';');
    dyStringAppendC(dy, ' ');
    dyStringAppend(dy, ck->name);
    dyStringAppendC(dy, '=');
    dyStringAppend(dy, ck->value);
    }
dyStringAppend(dy, "\r\n");
}

void htmlAttributeFree(struct htmlAttribute **pAttribute)
/* Release an attribute and its name and val strings.  Does nothing
 * when *pAttribute is NULL. */
{
struct htmlAttribute *attr = *pAttribute;
if (attr == NULL)
    return;
freeMem(attr->name);
freeMem(attr->val);
freez(pAttribute);
}

void htmlAttributeFreeList(struct htmlAttribute **pList)
/* Free every element of a list of dynamically allocated htmlAttribute's
 * and leave *pList set to NULL. */
{
struct htmlAttribute *cur = *pList;
while (cur != NULL)
    {
    struct htmlAttribute *following = cur->next;
    htmlAttributeFree(&cur);
    cur = following;
    }
*pList = NULL;
}

void htmlTagFree(struct htmlTag **pTag)
/* Release a tag, its attribute list and its name.  Does nothing when
 * *pTag is NULL. */
{
struct htmlTag *t = *pTag;
if (t == NULL)
    return;
htmlAttributeFreeList(&t->attributes);
freeMem(t->name);
freez(pTag);
}

void htmlTagFreeList(struct htmlTag **pList)
/* Free every element of a list of dynamically allocated htmlTag's
 * and leave *pList set to NULL. */
{
struct htmlTag *cur = *pList;
while (cur != NULL)
    {
    struct htmlTag *following = cur->next;
    htmlTagFree(&cur);
    cur = following;
    }
*pList = NULL;
}

void htmlFormVarFree(struct htmlFormVar **pVar)
/* Free up resources associated with form variable. 
*/ { struct htmlFormVar *var = *pVar; if (var != NULL) { freeMem(var->curVal); slFreeList(&var->values); slFreeList(&var->tags); freez(pVar); } } void htmlFormVarFreeList(struct htmlFormVar **pList) /* Free a list of dynamically allocated htmlFormVar's */ { struct htmlFormVar *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; htmlFormVarFree(&el); } *pList = NULL; } void htmlFormFree(struct htmlForm **pForm) /* Free up resources associated with form variable. */ { struct htmlForm *form = *pForm; if (form != NULL) { htmlFormVarFreeList(&form->vars); freez(pForm); } } void htmlFormFreeList(struct htmlForm **pList) /* Free a list of dynamically allocated htmlForm's */ { struct htmlForm *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; htmlFormFree(&el); } *pList = NULL; } void htmlPageFree(struct htmlPage **pPage) /* Free up resources associated with htmlPage. */ { struct htmlPage *page = *pPage; if (page != NULL) { freez(&page->url); htmlStatusFree(&page->status); freeHashAndVals(&page->header); htmlCookieFreeList(&page->cookies); freez(&page->fullText); htmlTagFreeList(&page->tags); htmlFormFreeList(&page->forms); freez(pPage); } } void htmlPageFreeList(struct htmlPage **pList) /* Free a list of dynamically allocated htmlPage's */ { struct htmlPage *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; htmlPageFree(&el); } *pList = NULL; } static int findLineNumber(char *start, char *pos) /* Figure out line number of given position relative to start. */ { char *s; int line = 1; for (s = start; s <= pos; ++s) { if (s[0] == '\n') ++line; } return line; } static void tagVaWarn(struct htmlPage *page, struct htmlTag *tag, char *format, va_list args) /* Print warning message and some context of tag. 
*/ { char context[80]; strncpy(context, tag->start, sizeof(context)); context[sizeof(context)-1] = 0; warn("Error near line %d of %s:\n %s", findLineNumber(page->htmlText, tag->start), page->url, context); vaWarn(format, args); } static void tagWarn(struct htmlPage *page, struct htmlTag *tag, char *format, ...) /* Print warning message and some context of tag. */ { va_list args; va_start(args, format); tagVaWarn(page, tag, format, args); va_end(args); } static void tagAbort(struct htmlPage *page, struct htmlTag *tag, char *format, ...) /* Print abort message and some context of tag. */ { va_list args; va_start(args, format); tagVaWarn(page, tag, format, args); va_end(args); noWarnAbort(); } struct htmlStatus *htmlStatusParse(char **pText) /* Read in status from first line. Update pText to point to next line. * Note unlike many routines here, this does not insert zeros into text. */ { char *text = *pText; char *end = strchr(text, '\n'); struct htmlStatus *status; if (end != NULL) *pText = end+1; else *pText = text + strlen(text); end = skipToSpaces(text); if (end == NULL) { warn("Short status line."); return NULL; } AllocVar(status); status->version = cloneStringZ(text, end-text); end = skipLeadingSpaces(end); if (!isdigit(end[0])) { warn("Not a number in status field"); return NULL; } status->status = atoi(end); return status; } char *htmlNextCrLfLine(char **pS) /* Return zero-terminated line and advance *pS to start of * next line. Return NULL at end of file. Warn if there is * no . */ { char *s = *pS, *e; if (s == NULL || s[0] == 0) return NULL; e = strchr(s, '\n'); if (e == NULL) verbose(1, "End of file in header\n"); else { *e = 0; if (e == s || e[-1] != '\r') verbose(1, "Missing in header line\n"); else e[-1] = 0; e += 1; } *pS = e; return s; } static void cookieParseNameValuePair(char *s, char **retName, char **retVal) /* Parse out name/value pair. Warn and return FALSE if there's a problem. 
*/ { char *val = strchr(s, '='); if (val == NULL) { val = s + strlen(s); } *val++ = 0; *retName = s; *retVal = val; } static struct htmlCookie *parseCookie(char *s) /* Parse out cookie line to the right of Set-Cookie. */ { char *e, *name, *val; struct htmlCookie *cookie; /* Grab up to semicolon, which is the cookie name/value pair. */ e = strchr(s, ';'); if (e == NULL) { warn("Missing ';' in cookie"); return NULL; } *e++ = 0; /* Allocate cookie and fill out name/value pair. */ AllocVar(cookie); cookieParseNameValuePair(s, &name, &val); cookie->name = cloneString(name); cookie->value = cloneString(val); /* Loop through to grab the other info - domain and so forth. */ s = e; for (;;) { /* Find next semicolon and zero-terminate it. */ s = skipLeadingSpaces(s); e = strchr(s, ';'); if (e == NULL) break; *e++ = 0; /* Parse out name/value pairs and save it away if it's one we know about. */ cookieParseNameValuePair(s, &name, &val); if (sameString(name, "domain")) cookie->domain = cloneString(val); else if (sameString(name, "path")) cookie->path = cloneString(val); else if (sameString(name, "expires")) cookie->expires = cloneString(val); else if (sameString(name, "secure")) cookie->secure = TRUE; s = e; } return cookie; } static struct hash *htmlHeaderRead(char **pHtml, struct htmlCookie **pCookies) /* Read in from second line through first blank line and * save in hash. These lines are in the form name: value. */ { struct hash *hash = hashNew(6); for (;;) { char *line = htmlNextCrLfLine(pHtml); char *word; if (line == NULL) { warn("End of file in header"); break; } word = nextWord(&line); if (word == NULL) break; line = skipLeadingSpaces(line); hashAdd(hash, word, cloneString(line)); if (sameString(word, "Set-Cookie:")) { struct htmlCookie *cookie = parseCookie(line); if (cookie != NULL) slAddTail(pCookies, cookie); } } return hash; } static char *htmlAttributeFindVal(struct htmlAttribute *list, char *name) /* Find named attribute or return NULL. 
*/
{
struct htmlAttribute *att;
/* Linear scan; the first attribute whose name matches wins. */
for (att = list; att != NULL; att = att->next)
    {
    if (sameWord(att->name, name))
        return att->val;
    }
return NULL;
}

char *htmlTagAttributeVal(struct htmlPage *page, struct htmlTag *tag,
        char *name, char *defaultVal)
/* Return value of named attribute, or defaultVal if attribute doesn't exist.
 * The page parameter is not used by this routine. */
{
char *val = htmlAttributeFindVal(tag->attributes, name);
if (val == NULL)
    val = defaultVal;
return val;
}

char *htmlTagAttributeNeeded(struct htmlPage *page, struct htmlTag *tag, char *name)
/* Return named tag attribute.  Complain and return "n/a" if it
 * doesn't exist.  The "n/a" result is a string constant, not an
 * allocated string. */
{
char *val = htmlTagAttributeVal(page, tag, name, NULL);
if (val == NULL)
    {
    tagWarn(page, tag, "Missing %s attribute", name);
    val = "n/a";
    }
return val;
}

static struct htmlTag *htmlTagScan(char *html, char *dupe)
/* Scan HTML for tags and return a list of them.
 * Html is the text to scan, and dupe is a copy of it
 * which this routine will insert 0's in in the course of
 * parsing.  All scanning is done in dupe; tag->start and tag->end are
 * computed as the same offsets into html, so they stay valid after
 * dupe is discarded.  Tag names and attribute names/values are cloned.
 * Tags are returned in document order. */
{
char *s = dupe, c, *e, *tagName;
struct htmlTag *tagList = NULL, *tag;
struct htmlAttribute *att;
int pos;

for (;;)
    {
    c = *s++;
    if (c == 0)
        break;
    if (c == '<')
        {
        if (*s == '!')	/* HTML comment. */
            {
            /* "<!--" style comments are skipped up to "-->", any other
             * "<!" construct up to the next '>'. */
            s += 1;
            if (s[0] == '-' && s[1] == '-')
                s = stringIn("-->", s);
            else
                s = strchr(s, '>');
            if (s == NULL)
                {
                warn("End of file in comment");
                break;
                }
            }
        else
            {
            /* Grab first word into tagName. */
            e = s;
            for (;;)
                {
                c = *e;
                if (c == '>' || c == 0 || isspace(c))
                    break;
                e += 1;
                }
            /* Terminate the name in dupe; c still remembers which
             * character ended it. */
            if (c != 0)
                *e++ = 0;
            tagName = s;
            s = e;

            /* Allocate tag, fill in name, and stick it on list. */
            AllocVar(tag);
            tag->name = cloneString(tagName);
            slAddHead(&tagList, tag);
            /* The -1 steps back over the '<' so start points at it. */
            pos = tagName - dupe - 1;
            tag->start = html+pos;

            /* If already got end tag (or EOF) stop processing tag. */
            if (c == '>' || c == 0)
                {
                tag->end = html + (e - dupe);
                continue;
                }

            /* Process name/value pairs until get end tag. */
            for (;;)
                {
                char *name, *val;
                boolean gotEnd = FALSE;

                /* Check for end tag. */
                s = skipLeadingSpaces(s);
                if (s[0] == '>' || s[0] == 0)
                    {
                    tag->end = html + (s - dupe);
                    if (s[0] == '>')
                        tag->end += 1;
                    break;
                    }

                /* Get name - everything up to equals. */
                e = s;
                for (;;)
                    {
                    c = *e;
                    if (c == '=')
                        break;
                    else if (c == '>')
                        break;
                    else if (c == 0)
                        break;
                    else if (isspace(c))
                        break;
                    e += 1;
                    }
                if (c == 0)
                    {
                    /* NOTE(review): tag->end is not assigned on this
                     * path; confirm callers tolerate that. */
                    warn("End of file in tag");
                    break;
                    }
                name = s;
                *e++ = 0;
                eraseTrailingSpaces(name);
                if (c == '>')
                    {
                    /* Attribute without a value that also ends the tag. */
                    val = "";
                    gotEnd = TRUE;
                    tag->end = html + (e - dupe);
                    }
                else if (isspace(c))
                    {
                    /* Attribute without a value. */
                    val = "";
                    }
                else
                    {
                    /* Got '=', so a value follows, quoted or bare. */
                    val = e = skipLeadingSpaces(e);
                    if (e[0] == '"' || e[0] == '\'')
                        {
                        /* Unquote in place; e is updated by the call. */
                        if (!parseQuotedStringNoEscapes(val, val, &e))
                            break;
                        }
                    else
                        {
                        /* Bare value runs to white space, '>' or end of text. */
                        for (;;)
                            {
                            c = *e;
                            if (c == '>')
                                {
                                gotEnd = TRUE;
                                *e++ = 0;
                                tag->end = html + (e - dupe);
                                break;
                                }
                            else if (isspace(c))
                                {
                                *e++ = 0;
                                break;
                                }
                            else if (c == 0)
                                break;
                            ++e;
                            }
                        }
                    }
                AllocVar(att);
                att->name = cloneString(name);
                att->val = cloneString(val);
                slAddTail(&tag->attributes, att);
                s = e;
                if (gotEnd)
                    break;
                }
            }
        }
    }
/* Tags were pushed on the head of the list; restore document order. */
slReverse(&tagList);
return tagList;
}

static struct htmlFormVar *findOrMakeVar(struct htmlPage *page, char *name,
        struct hash *hash, struct htmlTag *tag, struct htmlFormVar **pVarList)
/* Find variable of existing name if it exists, otherwise
 * make a new one and add to hash and list.  Add reference
 * to this tag to var.  The variable's name and tagName point at the
 * strings passed in rather than copies.  Warns when an existing
 * variable was created from a different kind of tag. */
{
struct htmlFormVar *var = hashFindVal(hash, name);
if (var == NULL)
    {
    AllocVar(var);
    var->name = name;
    var->tagName = tag->name;
    hashAdd(hash, name, var);
    slAddHead(pVarList, var);
    }
else
    {
    if (!sameWord(var->tagName, tag->name))
        {
        tagWarn(page, tag, "Mixing FORM variable tag types %s and %s",
                var->tagName, tag->name);
        var->tagName = tag->name;
        }
    }
refAdd(&var->tags, tag);
return var;
}

static boolean isMixableInputType(char *type)
/* Return TRUE if it's a type you can mix with others ok, like
 * button, submit, and image. 
*/
{
return sameWord(type, "BUTTON") || sameWord(type, "SUBMIT") || sameWord(type, "IMAGE");
}

static void htmlFormVarAddValue(struct htmlFormVar *var, char *value)
/* Add value to list of predefined values for var.  The value is
 * stored in a newly made slName appended to the end of var->values. */
{
struct slName *name = slNameNew(value);
slAddTail(&var->values, name);
}

static struct htmlFormVar *formParseVars(struct htmlPage *page, struct htmlForm *form)
/* Return a list of variables parsed out of form.
 * A form variable is something that may appear in the name
 * side of the name=value pairs that serves as input to a CGI
 * script.  The variables may be constructed from buttons,
 * INPUT tags, OPTION lists, or TEXTAREAs.
 * Only tags after form->startTag and before form->endTag are examined.
 * Variables are returned in order of first appearance, each with its
 * tag references in document order. */
{
struct htmlTag *tag;
struct htmlFormVar *varList = NULL, *var;
struct hash *hash = newHash(0);	/* Variable name -> htmlFormVar, for merging repeats. */
for (tag = form->startTag->next; tag != form->endTag; tag = tag->next)
    {
    if (sameWord(tag->name, "INPUT"))
        {
        char *type = htmlTagAttributeVal(page, tag, "TYPE", NULL);
        char *varName = htmlTagAttributeVal(page, tag, "NAME", NULL);
        char *value = htmlTagAttributeVal(page, tag, "VALUE", NULL);

        /* An INPUT without a TYPE is treated as a text field. */
        if (type == NULL)
            type = "TEXT";
        if (varName == NULL)
            {
            /* Nameless inputs only draw a warning when they are not
             * button-like and have no ONCHANGE attribute.  Either way
             * they are filed under the placeholder name "n/a". */
            if (!htmlTagAttributeVal(page, tag, "ONCHANGE", NULL)
               && !sameWord(type, "SUBMIT") && !sameWord(type, "CLEAR")
               && !sameWord(type, "BUTTON") && !sameWord(type, "RESET")
               && !sameWord(type, "IMAGE"))
                tagWarn(page, tag, "Missing NAME attribute");
            varName = "n/a";
            }
        var = findOrMakeVar(page, varName, hash, tag, &varList);
        if (var->type != NULL && !sameWord(var->type, type))
            {
            /* Same name used with two input types; only complain if
             * one of them is not a button-like type. */
            if (!isMixableInputType(var->type) || !isMixableInputType(type))
                tagWarn(page, tag, "Mixing input types %s and %s", var->type, type);
            }
        var->type = type;
        if (sameWord(type, "TEXT") || sameWord(type, "PASSWORD")
                || sameWord(type, "FILE") || sameWord(type, "HIDDEN")
                || sameWord(type, "IMAGE"))
            {
            var->curVal = cloneString(value);
            }
        else if (sameWord(type, "CHECKBOX"))
            {
            /* A checked box gets the value "on"; unchecked leaves curVal alone. */
            if (htmlTagAttributeVal(page, tag, "CHECKED", NULL) != NULL)
                var->curVal = cloneString("on");
            }
        else if (sameWord(type, "RADIO"))
            {
            /* Every radio button contributes a possible value; the
             * checked one supplies the current value. */
            if (htmlTagAttributeVal(page, tag, "CHECKED", NULL) != NULL)
                var->curVal = cloneString(value);
            htmlFormVarAddValue(var, value);
            }
        else if ( sameWord(type, "RESET") || sameWord(type, "BUTTON")
                || sameWord(type, "SUBMIT") || sameWord(type, "IMAGE")
                || sameWord(type, "n/a"))
            {
            /* Do nothing. */
            }
        else
            {
            tagWarn(page, tag, "Unrecognized INPUT TYPE %s", type);
            }
        }
    else if (sameWord(tag->name, "SELECT"))
        {
        char *varName = htmlTagAttributeNeeded(page, tag, "NAME");
        struct htmlTag *subTag;
        var = findOrMakeVar(page, varName, hash, tag, &varList);
        /* Walk the OPTION tags up to the matching /SELECT (or the end
         * of the form if there is none). */
        for (subTag = tag->next; subTag != form->endTag; subTag = subTag->next)
            {
            if (sameWord(subTag->name, "/SELECT"))
                {
                /* Nothing marked SELECTED: default to the first option. */
                if (var->curVal == NULL && var->values != NULL)
                    {
                    var->curVal = cloneString(var->values->name);
                    }
                break;
                }
            else if (sameWord(subTag->name, "OPTION"))
                {
                char *val = cloneString(htmlTagAttributeVal(page, subTag, "VALUE", NULL));
                if (val == NULL)
                    {
                    /* No VALUE attribute: use the text between the end
                     * of the OPTION tag and the next '<'. */
                    char *e = strchr(subTag->end, '<');
                    if (e != NULL)
                        val = cloneStringZ(subTag->end, e - subTag->end);
                    }
                if (val != NULL)
                    htmlFormVarAddValue(var, val);
                if (htmlTagAttributeVal(page, subTag, "SELECTED", NULL) != NULL)
                    {
                    if (val != NULL)
                        var->curVal = cloneString(val);
                    }
                /* val was only a working copy; the list and curVal hold their own. */
                freez(&val);
                }
            }
        }
    else if (sameWord(tag->name, "TEXTAREA"))
        {
        char *varName = htmlTagAttributeNeeded(page, tag, "NAME");
        /* Current value is the text from the end of the tag to the next '<'. */
        char *e = strchr(tag->end, '<');
        var = findOrMakeVar(page, varName, hash, tag, &varList);
        if (e != NULL)
            var->curVal = cloneStringZ(tag->end, e - tag->end);
        }
    }
freeHash(&hash);
/* Both the variable list and each variable's tag list were reversed
 * after being filled in; put them back in order. */
slReverse(&varList);
for (var = varList; var != NULL; var = var->next)
    {
    slReverse(&var->tags);
    }
return varList;
}

static struct htmlForm *htmlParseForms(struct htmlPage *page,
        struct htmlTag *startTag, struct htmlTag *endTag)
/* Parse out list of forms from tag stream. 
*/ { struct htmlForm *formList = NULL, *form = NULL; struct htmlTag *tag; for (tag = startTag; tag != endTag; tag = tag->next) { if (sameWord(tag->name, "FORM")) { if (form != NULL) tagWarn(page, tag, "FORM inside of FORM"); AllocVar(form); form->startTag = tag; slAddHead(&formList, form); form->name = htmlTagAttributeVal(page, tag, "name", "n/a"); form->action = htmlTagAttributeNeeded(page, tag, "action"); form->method = htmlTagAttributeVal(page, tag, "method", "GET"); } else if (sameWord(tag->name, "/FORM")) { if (form == NULL) tagWarn(page, tag, "/FORM outside of FORM"); else { form->endTag = tag->next; form = NULL; } } } slReverse(&formList); for (form = formList; form != NULL; form = form->next) { form->vars = formParseVars(page, form); } return formList; } struct htmlPage *htmlPageParse(char *url, char *fullText) /* Parse out page and return. */ { struct htmlPage *page; char *dupe = cloneLongString(fullText); char *s = dupe; struct htmlStatus *status = htmlStatusParse(&s); char *contentType; if (status == NULL) return NULL; AllocVar(page); page->url = cloneString(url); page->fullText = fullText; page->status = status; page->header = htmlHeaderRead(&s, &page->cookies); contentType = hashFindVal(page->header, "Content-Type:"); if (contentType == NULL) { warn("No contentType, assuming text/html"); contentType = cloneString("text/html"); hashAdd(page->header, "Content-Type:", contentType); } page->htmlText = fullText + (s - dupe); if (startsWith("text/html", contentType)) { page->tags = htmlTagScan(page->htmlText, s); page->forms = htmlParseForms(page, page->tags, NULL); } freez(&dupe); return page; } struct htmlPage *htmlPageParseNoHead(char *url, char *htmlText) /* Parse out page in memory (past http header if any) and return. 
*/ { char *dupe = cloneString(htmlText); struct htmlPage *page; AllocVar(page); page->url = cloneString(url); page->fullText = page->htmlText = htmlText; page->tags = htmlTagScan(page->htmlText, dupe); page->forms = htmlParseForms(page, page->tags, NULL); freez(&dupe); return page; } struct htmlPage *htmlPageParseOk(char *url, char *fullText) /* Parse out page and return only if status ok. */ { struct htmlPage *page = htmlPageParse(url, fullText); if (page == NULL) noWarnAbort(); if (page->status->status != 200) errAbort("%s returned with status code %d", url, page->status->status); return page; } char *htmlSlurpWithCookies(char *url, struct htmlCookie *cookies) /* Send get message to url with cookies, and return full response as * a dyString. This is not parsed or validated, and includes http * header lines. Typically you'd pass this to htmlPageParse() to * get an actual page. */ { struct dyString *dyHeader = dyStringNew(0); struct dyString *dyText; int sd; cookieOutput(dyHeader, cookies); sd = netOpenHttpExt(url, "GET", dyHeader->string); dyText = netSlurpFile(sd); close(sd); dyStringFree(&dyHeader); return dyStringCannibalize(&dyText); } struct htmlPage *htmlPageGetWithCookies(char *url, struct htmlCookie *cookies) /* Get page from URL giving server the given cookies. Note only the * name and value parts of the cookies need to be filled in. */ { char *buf = htmlSlurpWithCookies(url, cookies); return htmlPageParse(url, buf); } struct htmlPage *htmlPageForwarded(char *url, struct htmlCookie *cookies) /* Get html page. If it's just a forwarding link then get do the * forwarding. Cookies is a possibly empty list of cookies with * name and value parts filled in. 
*/ { struct htmlPage *page = htmlPageGetWithCookies(url, cookies); int level, maxLevels = 7; for (level = 0; level < maxLevels; ++level) { struct htmlPage *newPage; char *newUrl = hashFindVal(page->header, "Location:"); if (newUrl == NULL) break; newPage = htmlPageGetWithCookies(newUrl, cookies); htmlPageFree(&page); page = newPage; } return page; } struct htmlPage *htmlPageForwardedNoAbort(char *url, struct htmlCookie *cookies) /* Try and get an HTML page. Print warning and return NULL if there's a problem. */ { struct errCatch *errCatch = errCatchNew(); struct htmlPage *page = NULL; if (errCatchStart(errCatch)) page = htmlPageForwarded(url, cookies); errCatchEnd(errCatch); if (errCatch->gotError) warn("%s", errCatch->message->string); errCatchFree(&errCatch); return page; } struct htmlPage *htmlPageGet(char *url) /* Get page from URL (may be a file). */ { if (fileExists(url)) { char *buf; readInGulp(url, &buf, NULL); return htmlPageParseNoHead(url, buf); } else return htmlPageGetWithCookies(url, NULL); } void htmlFormVarPrint(struct htmlFormVar *var, FILE *f, char *prefix) /* Print out variable to file, prepending prefix. */ { struct slName *val; fprintf(f, "%s%s\t%s\t%s\t%s\n", prefix, var->name, var->tagName, naForNull(var->type), naForNull(var->curVal)); for (val = var->values; val != NULL; val = val->next) fprintf(f, "%s\t%s\n", prefix, val->name); } void htmlFormPrint(struct htmlForm *form, FILE *f) /* Print out form structure. */ { struct htmlFormVar *var; fprintf(f, "%s\t%s\t%s\n", form->name, form->method, form->action); for (var = form->vars; var != NULL; var = var->next) htmlFormVarPrint(var, f, "\t"); } struct htmlForm *htmlFormGet(struct htmlPage *page, char *name) /* Get named form. */ { struct htmlForm *form; for (form = page->forms; form != NULL; form = form->next) if (sameWord(form->name, name)) break; return form; } struct htmlFormVar *htmlFormVarGet(struct htmlForm *form, char *name) /* Get named variable. 
*/ { struct htmlFormVar *var; if (form == NULL) errAbort("Null form passed to htmlFormVarGet"); for (var = form->vars; var != NULL; var = var->next) if (sameWord(var->name, name)) break; return var; } void htmlFormVarSet(struct htmlForm *form, char *name, char *val) /* Set variable to given value. Create it if it doesn't exist*/ { struct htmlFormVar *var; if (form == NULL) errAbort("Null form passed to htmlFormVarSet"); var = htmlFormVarGet(form, name); if (var == NULL) { AllocVar(var); var->type = "TEXT"; var->tagName = "INPUT"; var->name = name; slAddHead(&form->vars, var); } freez(&var->curVal); var->curVal = cloneString(val); } struct htmlFormVar *htmlPageGetVar(struct htmlPage *page, struct htmlForm *form, char *name) /* Get named variable. If form is NULL, first form in page is used. */ { if (form == NULL) form = page->forms; return htmlFormVarGet(form, name); } void htmlPageSetVar(struct htmlPage *page, struct htmlForm *form, char *name, char *val) /* Set variable to given value. If form is NULL, first form in page is used. */ { if (page == NULL) errAbort("Null page passed to htmlPageSetVar"); if (form == NULL) form = page->forms; if (form == NULL) errAbort("Null form in htmlPageSetVar"); htmlFormVarSet(form, name, val); } static void asciiEntityDecode(char *in, char *out, int inLength) /* Decode from SGML Character Entity &# format to normal. * Out will be a little shorter than in typically, and * can be the same buffer. Only supports ASCII charset. */ { char c; int i; char *e; for (i=0; i 5) { /* probably a badly formatted string, just recover and continue */ *out++ = '&'; *out++ = '#'; } else { int code; if (sscanf(in, "%d", &code) != 1) { code = '?'; } if (code > 255) { code = '?'; } in = e; in++; *out++ = code; } } else *out++ = c; } *out++ = 0; } char *htmlExpandUrl(char *base, char *url) /* Expand URL that is relative to base to stand on its own. * Return NULL if it's not http or https. 
*/ { struct dyString *dy = NULL; char *hostName, *pastHostName; /* some mailto: have SGML char encoding, e.g a to hide from spambots */ url = cloneString(url); /* Clone because asciiEntityDecode may modify it. */ asciiEntityDecode(url, url, strlen(url)); /* In easiest case URL is actually absolute and begins with * protocol. Just return clone of url. */ if (startsWith("http:", url) || startsWith("https:", url)) return url; /* If it's got a colon, but no http or https, then it's some * protocol we don't understand, like a mailto. Just return NULL. */ if (strchr(url, ':') != NULL) { freez(&url); return NULL; } /* Figure out first character past host name. Load up * return string with protocol (if any) and host name. */ dy = dyStringNew(256); if (startsWith("http:", base) || startsWith("https:", base)) hostName = (strchr(base, ':') + 3); else hostName = base; pastHostName = strchr(hostName, '/'); if (pastHostName == NULL) pastHostName = hostName + strlen(hostName); dyStringAppendN(dy, base, pastHostName - base); /* Add url to return string after host name. */ if (startsWith("/", url)) /* New URL is absolute, just append to hostName */ { dyStringAppend(dy, url); } else { char *curDir = pastHostName; char *endDir; if (curDir[0] == '/') curDir += 1; dyStringAppendC(dy, '/'); endDir = strrchr(curDir, '/'); if (endDir == NULL) endDir = curDir; if (startsWith("../", url)) { char *dir = cloneStringZ(curDir, endDir-curDir); char *path = expandRelativePath(dir, url); if (path != NULL) { dyStringAppend(dy, path); } freez(&dir); freez(&path); } else { dyStringAppendN(dy, curDir, endDir-curDir); if (lastChar(dy->string) != '/') dyStringAppendC(dy, '/'); dyStringAppend(dy, url); } } freez(&url); return dyStringCannibalize(&dy); } static void appendCgiVar(struct dyString *dy, char *name, char *value) /* Append cgiVar with cgi-encoded value to dy. 
*/ { char *enc = NULL; if (value == NULL) value = ""; enc = cgiEncode(value); if (dy->stringSize != 0) dyStringAppendC(dy, '&'); dyStringAppend(dy, name); dyStringAppendC(dy, '='); dyStringAppend(dy, enc); freez(&enc); } #define MIMEBUFSIZE 4096 static void appendMimeVar(struct dyString *dy, char *name, char *value, char *varType, char *boundary) /* Append cgiVar with cgi-encoded value to dy. */ { char *fileName = NULL; if (value == NULL) value = ""; dyStringAppend(dy, "\r\n--"); dyStringAppend(dy, boundary); dyStringAppend(dy, "\r\n"); dyStringAppend(dy, "content-disposition: form-data; name=\""); dyStringAppend(dy, name); dyStringAppend(dy, "\""); if (varType && sameWord(varType, "FILE")) { fileName = strrchr(value,'/'); if (fileName) ++fileName; else fileName = value; dyStringAppend(dy, "; filename=\""); dyStringAppend(dy, fileName); dyStringAppend(dy, "\""); } dyStringAppend(dy, "\r\n"); dyStringAppend(dy, "\r\n"); if (varType && sameWord(varType, "FILE") && !sameWord(value,"")) { FILE *f = mustOpen(value, "r"); char buf[MIMEBUFSIZE]; int bytesRead = 0; do { bytesRead = fread(buf,1,MIMEBUFSIZE,f); if (bytesRead < 0) errnoAbort("error reading file to upload %s",value); dyStringAppendN(dy, buf, bytesRead); } while(bytesRead > 0); carefulClose(&f); } else dyStringAppend(dy, value); } static void appendMimeTerminus(struct dyString *dy, char *boundary) /* Append MIME boundary terminator to dy. 
*/ { dyStringAppend(dy, "\r\n--"); dyStringAppend(dy, boundary); dyStringAppend(dy, "--\r\n"); } static int countOccurrences(char *needle, int nLen, char *haystack, int hLen) /* count # of occurrences of needle in haystack */ { int count = 0; char *match=NULL; while((match=memMatch(needle, nLen, haystack, hLen)) != NULL) { ++count; hLen -= (match - haystack) + nLen; if (hLen < 1) break; haystack=match+nLen; } return count; } static boolean isMimeEncoded(struct htmlForm *form) /* determine if the form is using MIME encoding */ { struct htmlAttribute *a; for(a = form->startTag->attributes;a;a = a->next) if (sameWord(a->name,"ENCTYPE") && sameWord(a->val,"multipart/form-data")) return TRUE; return FALSE; } char *htmlFormCgiVars(struct htmlPage *page, struct htmlForm *form, char *buttonName, char *buttonVal, struct dyString *dyHeader) /* Return cgi vars in name=val format from use having pressed * submit button of given name and value. */ { struct dyString *dy = newDyString(0); struct htmlFormVar *var; boolean isMime = isMimeEncoded(form); int mimeParts = 0; char boundary[256]; while(TRUE) { if (isMime) { /* choose a new string for the boundary */ /* Set initial seed */ int i = 0; safef(boundary,sizeof(boundary),"%s", "---------"); srand( (unsigned)time( NULL ) ); for(i=strlen(boundary);i<41;++i) { int r = (int) 26 * (rand() / (RAND_MAX + 1.0)); boundary[i] = r+'A'; } boundary[i] = 0; } if (form == NULL) form = page->forms; if (buttonName != NULL && !isMime) appendCgiVar(dy, buttonName, buttonVal); for (var = form->vars; var != NULL; var = var->next) { if (sameWord(var->tagName, "SELECT") || sameWord(var->tagName, "TEXTAREA") || (var->type != NULL && ((sameWord(var->type, "RADIO") || sameWord(var->type, "TEXTBOX") || sameWord(var->type, "PASSWORD") || sameWord(var->type, "HIDDEN") || sameWord(var->type, "TEXT") || sameWord(var->type, "FILE"))))) { char *val = var->curVal; if (val == NULL) val = ""; if (isMime) { ++mimeParts; appendMimeVar(dy, var->name, val, var->type, 
boundary); } else appendCgiVar(dy, var->name, val); } else if (var->type != NULL && sameWord(var->type, "CHECKBOX")) { if (var->curVal != NULL) { if (isMime) { ++mimeParts; appendMimeVar(dy, var->name, var->curVal, var->type, boundary); } else appendCgiVar(dy, var->name, var->curVal); } } else if (isMime && buttonName && sameWord(buttonName,var->name)) { ++mimeParts; appendMimeVar(dy, buttonName, buttonVal, NULL, boundary); } } if (isMime) { ++mimeParts; appendMimeTerminus(dy,boundary); if (countOccurrences(boundary,strlen(boundary),dy->string,dy->stringSize) != mimeParts) { /* boundary was found in input! # occurrences not as expected */ dyStringClear(dy); continue; /* if at first you don't succeed, try another boundary string */ } dyStringPrintf(dyHeader, "Content-type: multipart/form-data, boundary=%s\r\n",boundary); if (isMime && verboseLevel() == 2) { mustWrite(stderr, dyHeader->string, dyHeader->stringSize); mustWrite(stderr, dy->string, dy->stringSize); } } break; } return dyStringCannibalize(&dy); } struct htmlPage *htmlPageFromForm(struct htmlPage *origPage, struct htmlForm *form, char *buttonName, char *buttonVal) /* Return a new htmlPage based on response to pressing indicated button * on indicated form in origPage. 
*/ { struct htmlPage *newPage = NULL; struct dyString *dyUrl = dyStringNew(0); struct dyString *dyHeader = dyStringNew(0); struct dyString *dyText = NULL; char *url = htmlExpandUrl(origPage->url, form->action); char *cgiVars = NULL; int contentLength = 0; int sd = -1; dyStringAppend(dyUrl, url); cookieOutput(dyHeader, origPage->cookies); if (sameWord(form->method, "GET")) { cgiVars = htmlFormCgiVars(origPage, form, buttonName, buttonVal, dyHeader); dyStringAppend(dyUrl, "?"); dyStringAppend(dyUrl, cgiVars); verbose(3, "GET %s\n", dyUrl->string); sd = netOpenHttpExt(dyUrl->string, form->method, dyHeader->string); } else if (sameWord(form->method, "POST")) { cgiVars = htmlFormCgiVars(origPage, form, buttonName, buttonVal, dyHeader); contentLength = strlen(cgiVars); verbose(3, "POST %s\n", dyUrl->string); dyStringPrintf(dyHeader, "Content-length: %d\r\n", contentLength); sd = netOpenHttpExt(dyUrl->string, form->method, dyHeader->string); mustWriteFd(sd, cgiVars, contentLength); } dyText = netSlurpFile(sd); close(sd); newPage = htmlPageParse(url, dyStringCannibalize(&dyText)); freez(&url); dyStringFree(&dyUrl); dyStringFree(&dyHeader); freez(&cgiVars); return newPage; } struct slName *htmlPageScanAttribute(struct htmlPage *page, char *tagName, char *attribute) /* Scan page for values of particular attribute in particular tag. * if tag is NULL then scans in all tags. */ { struct htmlTag *tag; struct htmlAttribute *att; struct slName *list = NULL, *el; for (tag = page->tags; tag != NULL; tag = tag->next) { if (tagName == NULL || sameWord(tagName, tag->name)) { for (att = tag->attributes; att != NULL; att = att->next) { if (sameWord(attribute, att->name)) { el = slNameNew(att->val); slAddHead(&list, el); } } } } slReverse(&list); return list; } struct slName *htmlPageLinks(struct htmlPage *page) /* Scan through tags list and pull out HREF attributes. 
*/
{
return htmlPageScanAttribute(page, NULL, "HREF");
}

struct htmlTableRow
/* Data on a row.  validateTables keeps one of these for the row that is
 * currently open in a table. */
    {
    struct htmlTableRow *next;	/* Next in list. */
    int tdCount;		/* Bumped by validateTables for each TD or TH seen in the row. */
    int inTd;			/* Set TRUE at TD/TH, back to FALSE at /TD or /TH. */
    };

struct htmlTable
/* Data on a table.  validateTables keeps these on a stack, one per table
 * that is currently open. */
    {
    struct htmlTable *next;	/* Next table on the stack (the enclosing table). */
    struct htmlTableRow *row;	/* Row currently open; allocated at TR, freed at /TR, NULL otherwise. */
    int rowCount;		/* Bumped at each TR; /TABLE aborts when this is still 0. */
    };

static void validateTables(struct htmlPage *page, struct htmlTag *startTag, struct htmlTag *endTag)
/* Validate
are all properly nested, and that there * are no empty rows. */
{
/* Walks the tags from startTag up to, but not including, endTag while
 * keeping a stack of the tables that are currently open (tableStack,
 * innermost first).  Every failed check reports through tagAbort.
 *   TABLE      - when another table is already open it must have an open
 *                row with inTd set; a fresh table is pushed on the stack.
 *   /TABLE     - needs an open table that has seen at least one row
 *                (rowCount) and has no row still open; pops and frees it.
 *   TR         - needs an open table with no row open; allocates the row
 *                and bumps rowCount.
 *   /TR        - needs an open table and an open row whose tdCount is not
 *                zero; frees the row.
 *   TD, TH     - need an open table and row; set inTd and bump tdCount.
 *   /TD, /TH   - need an open table and row with inTd set; clear inTd.
 * After the loop a table that is still on the stack is reported as well.
 * The checks wrapped in LEGAL_ACTUALLY are only compiled when that macro
 * is defined. */
struct htmlTable *tableStack = NULL, *table;
struct htmlTableRow *row;
struct htmlTag *tag;
for (tag = startTag; tag != endTag; tag = tag->next)
    {
    if (sameWord(tag->name, "TABLE"))
        {
	if (tableStack != NULL)
	    {
	    /* A nested table is accepted only while a cell of the enclosing table is open. */
	    if (tableStack->row == NULL || !tableStack->row->inTd)
	    tagAbort(page, tag, "TABLE inside of another table, but not inside of
\n"); } AllocVar(table); slAddHead(&tableStack, table); } else if (sameWord(tag->name, "/TABLE")) { if ((table = tableStack) == NULL) tagAbort(page, tag, "Extra
tag"); if (table->rowCount == 0) tagAbort(page, tag, " with no 's"); if (table->row != NULL) tagAbort(page, tag, "
inside of a row"); tableStack = table->next; freez(&table); } else if (sameWord(tag->name, "TR")) { if ((table = tableStack) == NULL) tagAbort(page, tag, " outside of TABLE"); if (table->row != NULL) tagAbort(page, tag, "... with no in between"); AllocVar(table->row); table->rowCount += 1; } else if (sameWord(tag->name, "/TR")) { if ((table = tableStack) == NULL) tagAbort(page, tag, " outside of TABLE"); if (table->row == NULL) tagAbort(page, tag, " with no "); #ifdef LEGAL_ACTUALLY if (table->row->inTd) { tagAbort(page, tag, " while is open"); } #endif /* LEGAL_ACTUALLY */ if (table->row->tdCount == 0) tagAbort(page, tag, "Empty row in "); freez(&table->row); } else if (sameWord(tag->name, "TD") || sameWord(tag->name, "TH")) { if ((table = tableStack) == NULL) tagAbort(page, tag, "<%s> outside of
", tag->name); if ((row = table->row) == NULL) tagAbort(page, tag, "<%s> outside of ", tag->name); #ifdef LEGAL_ACTUALLY if (row->inTd) { tagAbort(page, tag, "<%s>...<%s> with no in between", tag->name, tag->name, tag->name); } #endif /* LEGAL_ACTUALLY */ row->inTd = TRUE; row->tdCount += 1; } else if (sameWord(tag->name, "/TD") || sameWord(tag->name, "/TH")) { if ((table = tableStack) == NULL) tagAbort(page, tag, "<%s> outside of
", tag->name); if ((row = table->row) == NULL) tagAbort(page, tag, "<%s> outside of ", tag->name); if (!row->inTd) tagAbort(page, tag, "<%s> with no <%s>", tag->name, tag->name+1); row->inTd = FALSE; } } if (tableStack != NULL) tagAbort(page, tag, "Missing
"); } static void checkTagIsInside(struct htmlPage *page, char *outsiders, char *insiders, struct htmlTag *startTag, struct htmlTag *endTag) /* Check that insiders are all bracketed by outsiders. */ { char *outDupe = cloneString(outsiders); char *inDupe = cloneString(insiders); char *line, *word; int depth = 0; struct htmlTag *tag; struct hash *outOpen = newHash(8); struct hash *outClose = newHash(8); struct hash *inHash = newHash(8); char buf[256]; /* Create hashes of all insiders */ line = inDupe; while ((word = nextWord(&line)) != NULL) { touppers(word); hashAdd(inHash, word, NULL); } /* Create hash of open and close outsiders. */ line = outDupe; while ((word = nextWord(&line)) != NULL) { touppers(word); hashAdd(outOpen, word, NULL); safef(buf, sizeof(buf), "/%s", word); hashAdd(outClose, buf, NULL); } /* Stream through tags making sure that insiders are * at least one deep inside of outsiders. */ for (tag = startTag; tag != NULL; tag = tag->next) { char *type = tag->name; if (hashLookup(outOpen, type )) ++depth; else if (hashLookup(outClose, type)) --depth; else if (hashLookup(inHash, type)) { if (depth <= 0) { if (!startsWith(" and tags are properly nested. */ { struct htmlTag *tag; int depth = 0; char endType[256]; safef(endType, sizeof(endType), "/%s", type); for (tag = startTag; tag != endTag; tag = tag->next) { if (sameWord(tag->name, type)) ++depth; else if (sameWord(tag->name, endType)) { --depth; if (depth < 0) tagAbort(page, tag, "<%s> without preceding <%s>", endType, type); } } if (depth != 0) errAbort("Missing <%s> tag", endType); } static void validateNestingTags(struct htmlPage *page, struct htmlTag *startTag, struct htmlTag *endTag, char *nesters[], int nesterCount) /* Validate many tags that do need to nest. */ { int i; for (i=0; i) * up to and including and check some things. */ { struct htmlTag *tag, *endTag = NULL; /* First search for end tag.
*/ for (tag = startTag; tag != NULL; tag = tag->next) { if (sameWord(tag->name, "/BODY")) { endTag = tag; break; } } if (endTag == NULL) errAbort("Missing "); validateTables(page, startTag, endTag); checkTagIsInside(page, "DIR MENU OL UL", "LI", startTag, endTag); checkTagIsInside(page, "DL", "DD DT", startTag, endTag); checkTagIsInside(page, "COLGROUP TABLE", "COL", startTag, endTag); checkTagIsInside(page, "MAP", "AREA", startTag, endTag); checkTagIsInside(page, "FORM SCRIPT", "INPUT BUTTON /BUTTON OPTION SELECT /SELECT TEXTAREA /TEXTAREA" "FIELDSET /FIELDSET" , startTag, endTag); validateNestingTags(page, startTag, endTag, bodyNesters, ArraySize(bodyNesters)); return endTag->next; } static char *urlOkChars() /* Return array character indexed array that has * 1 for characters that are ok in URLs and 0 * elsewhere. */ { char *okChars; int c; AllocArray(okChars, 256); for (c=0; c<256; ++c) if (isalnum(c)) okChars[c] = 1; /* This list is a little more inclusive than W3's. */ okChars['='] = 1; okChars['-'] = 1; okChars['/'] = 1; okChars['%'] = 1; okChars['.'] = 1; okChars[';'] = 1; okChars[':'] = 1; okChars['_'] = 1; okChars['&'] = 1; okChars['+'] = 1; okChars['('] = 1; okChars[')'] = 1; okChars['$'] = 1; okChars['!'] = 1; okChars['*'] = 1; okChars['@'] = 1; okChars['\''] = 1; // apparently the apostrophe itself is ok return okChars; } static void validateCgiUrl(char *url) /* Make sure URL follows basic CGI encoding rules. */ { if (startsWith("http:", url) || startsWith("https:", url)) { static char *okChars = NULL; UBYTE c, *s; if (okChars == NULL) okChars = urlOkChars(); url = strchr(url, '?'); if (url != NULL) { s = (UBYTE*)url+1; while ((c = *s++) != 0) { if (!okChars[c]) { errAbort("Character %c not allowed in URL %s", c, url); } } } } } static void validateCgiUrls(struct htmlPage *page) /* Make sure URLs in page follow basic CGI encoding rules. 
*/ { struct htmlForm *form; struct slName *linkList = htmlPageLinks(page), *link; for (form = page->forms; form != NULL; form = form->next) validateCgiUrl(form->action); for (link = linkList; link != NULL; link = link->next) validateCgiUrl(link->name); slFreeList(&linkList); } static int countTagsOfType(struct htmlTag *tagList, char *type) /* Count number of tags of given type. */ { struct htmlTag *tag; int count = 0; for (tag = tagList; tag != NULL; tag = tag->next) if (sameString(tag->name, type)) ++count; return count; } static void checkExactlyOne(struct htmlTag *tagList, char *type) /* Check there is exactly one of tag in list. */ { int count = countTagsOfType(tagList, type); if (count != 1) errAbort("Expecting exactly 1 <%s>, got %d", type, count); } void htmlPageFormOrAbort(struct htmlPage *page) /* Aborts if no FORM found */ { if (page == NULL) errAbort("Can't validate NULL page"); if (page->forms == NULL) errAbort("No form found"); } void htmlPageValidateOrAbort(struct htmlPage *page) /* Do some basic validations. Aborts if there is a problem. */ { struct htmlTag *tag; boolean gotTitle = FALSE; char *contentType = NULL; if (page == NULL) errAbort("Can't validate NULL page"); if (page->header != NULL) contentType = hashFindVal(page->header, "Content-Type:"); if (contentType == NULL || startsWith("text/html", contentType)) { /* To simplify things upper case all tag names. 
*/ for (tag = page->tags; tag != NULL; tag = tag->next) touppers(tag->name); checkExactlyOne(page->tags, "BODY"); /* Validate header, and make a suggestion or two */ if ((tag = page->tags) == NULL) errAbort("No tags"); if (!sameWord(tag->name, "HTML")) errAbort("Doesn't start with tag"); tag = tag->next; if (tag == NULL || !sameWord(tag->name, "HEAD")) warn(" tag does not follow tag"); else { for (;;) { tag = tag->next; if (tag == NULL) errAbort("Missing "); if (sameWord(tag->name, "TITLE")) gotTitle = TRUE; if (sameWord(tag->name, "/HEAD")) break; } if (!gotTitle) warn("No title in "); validateNestingTags(page, page->tags, tag, headNesters, ArraySize(headNesters)); tag = tag->next; } if (tag == NULL || !sameWord(tag->name, "BODY")) errAbort(" tag does not follow tag"); tag = validateBody(page, tag->next); if (tag == NULL || !sameWord(tag->name, "/HTML")) errAbort("Missing "); validateCgiUrls(page); } }