/* scrapeCruzBiotech - Do some screen scraping of Santa Cruz biotech site looking for antibodies with immunofluorescence.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "htmlPage.h" void usage() /* Explain usage and exit. */ { errAbort( "scrapeCruzBiotech - Do some screen scraping of Santa Cruz biotech site looking for antibodies with immunofluorescence.\n" "usage:\n" " scrapeCruzBiotech startUrl destFile\n" "options:\n" " -xxx=XXX\n" ); } static struct optionSpec options[] = { {NULL, 0}, }; char *catBase = "http://www.scbt.com/catalog/"; void weedString(char *weed, char *s) { s = stringIn(weed, s); if (s != NULL) { int i, len = strlen(weed); for (i=0; ihtmlText); boolean gotFcm = (stringIn("FCM", page->htmlText) != NULL); boolean gotIhc = (stringIn("IHC", page->htmlText) != NULL); char *scIdStart = stringIn("sc-", page->htmlText); char *fcmStart = stringIn("FCM", page->htmlText); char *ihcStart = stringIn("IHC", page->htmlText); char *type = "other"; if (stringIn(">Transcription Regulators<", page->htmlText)) type = "txn"; else if (stringIn(">Homeodomain Proteins<", page->htmlText)) type = "hox"; if (scIdStart != NULL) { char *scId = nextWord(&scIdStart); fprintf(f, "%s\tsc-%d\t%s\t%d\t%d\n", scId, id, type, gotFcm, gotIhc); uglyf("%s\tsc-%d\t%s\t%d\t%d\n", scId, id, type, gotFcm, gotIhc); } htmlPageFree(&page); } void scrapeCruzBiotech(char *outFile) /* scrapeCruzBiotech - Do some screen scraping of Santa Cruz biotech site looking for antibodies with immunofluorescence.. */ { FILE *f = mustOpen(outFile, "w"); char url[1024]; int id, minId = 1, maxId=46000; for (id=minId; id<=maxId; ++id) { safef(url, sizeof(url), "http://www.scbt.com/catalog/detail.lasso?-token.order_id=&-database=catalog&-layout=web_detail&-Op=eq&catalog_number=sc-%d&-search", id); scrapeProduct(url, id, f); uglyf("Done %d of %d\n", id, maxId); } carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 2) usage(); scrapeCruzBiotech(argv[1]); return 0; }