/* dynAlign.c - align using dynamic programming. */ #include "common.h" #include "memalloc.h" #include "portable.h" #include "cheapcgi.h" #include "htmshell.h" #include "dnautil.h" #include "dnaseq.h" #include "fa.h" #include "wormdna.h" #include "crudeali.h" #include "xenalign.h" boolean isOnWeb; /* True if run from Web rather than command line. */ void dustData(char *in,char *out) /* Clean up data by getting rid non-alphabet stuff. */ { char c; while ((c = *in++) != 0) if (isalpha(c)) *out++ = tolower(c); *out++ = 0; } void doMiddle() { char *query = cgiString("query"); char *target = cgiString("target"); int qSize, tSize; int maxQ = 10000 * 2, maxT = 100000; long startTime, endTime; double estTime; long clock1000(); dustData(query, query); dustData(target, target); printf("
"); qSize = strlen(query); tSize = strlen(target); printf("query size %d\ntarget size %d\n", qSize, tSize); if ( qSize <= maxQ || tSize <= maxT) { if (qSize < 2000 && tSize < 4000) { estTime = 0.1 + qSize * tSize * 0.25 / 1000000.0; } else { estTime = (double)qSize*15.0/10000.0 + 0.4; } printf("This will take about %4.1f minutes to align on a vintage 1999 server\n", estTime); fflush(stdout); startTime = clock1000(); xenAlignBig(query, qSize, target, tSize, stdout, TRUE); endTime = clock1000(); printf("%4.2f minutes to align\n", (double)(endTime-startTime)/(60*1000)); } else errAbort("%d bases in query, %d bases in target - too big to handle.\n" "Maximum is %d bases in query, %d bases in target.", qSize, tSize, maxQ, maxT); } void xenAlignFa(char *queryFile, char *targetFile, FILE *out) /* Align query with target fa files, putting result in out. */ { struct dnaSeq *q, *t; q = faReadDna(queryFile); t = faReadDna(targetFile); printf("Aligning %s (%d bases) with %s (%d bases)\n", q->name, q->size, t->name, t->size); xenAlignBig(q->dna, q->size, t->dna, t->size, out, TRUE); freeDnaSeq(&t); freeDnaSeq(&q); } void usage() /* Describe usage of program and terminate. */ { errAbort("dynAlign - used dynamic programming to find alignment between two nucleotide sequence\n" "usage:\n" " dynAlign test string1 string2\n" "aligns two short sequences in command line.\n" " dynAlign two query.fa target.fa outfile\n" "aligns two sequences in fa files\n" " dynAlign worms bwana.out outFile\n" "aligns C. briggsae genomic fragments against full C. elegans genome.\n" "cbali.out is the output of the cbAli program and outFile is the file\n" "to produce.\n" " dynAlign starting cbali.out outFile startIx\n" "same as dynAlign worms, but starts part way through cbali.out\n" " dynAlign range cbali.out outFile startIx endIx\n" "same as dynAlign worms, but specifies start and end areas to work on\n"); } void doublePrint(FILE *f, char *format, ...) /* Print both to file and to standard output. */ { va_list args; va_start(args, format); vfprintf(f, format, args); vfprintf(stdout, format, args); va_end(args); } struct hitCluster /* Merge a couple of hits into same cluster if it's easy via this. */ { struct hitCluster *next; char *probe; int pStart, pEnd; char *target; int tStart, tEnd; boolean isRc; }; int innerTargetSize = 3000; /* How big a cluster can get. */ int outerTargetSize = 5000; /* 1000 bases on either side of cluster. */ void alignAll(struct hitCluster *hcList, int hcListSize, int starting, int ending, char *outName) /* Do all alignments in list. */ { FILE *out = mustOpen(outName, "w"); char *probeName = NULL; struct dnaSeq *probe = NULL; struct hitCluster *hc; DNA *targetDna = NULL; char *chrom; int tStart, tEnd, tMid, tSize; int pSize; int score; int hcIx = 1; for (hc = hcList;hc != NULL; hc = hc->next) { if (hc->target != NULL && hcIx >= starting && hcIx <= ending) { /* Get new probe if necessary. */ if (hc->probe != probeName) { probeName = hc->probe; freeDnaSeq(&probe); probe = faReadDna(probeName); } /* Get 5000 base pairs around target. */ chrom = hc->target; tMid = (hc->tStart + hc->tEnd)/2; tStart = tMid - outerTargetSize/2; tEnd = tStart + outerTargetSize; wormClipRangeToChrom(chrom, &tStart, &tEnd); tSize = tEnd - tStart; targetDna = wormChromPart(chrom, tStart, tSize); pSize = hc->pEnd - hc->pStart; if (hc->isRc) reverseComplement(probe->dna + hc->pStart, pSize); /* Write preliminary info to file and screen. */ doublePrint(out, "Aligning %s:%d-%d %c to %s:%d-%d + \n(%d of %d)\n", probeName, hc->pStart, hc->pEnd, (hc->isRc ? '-' : '+'), chrom, tStart, tEnd, hcIx, hcListSize); score = xenAlignSmall(probe->dna + hc->pStart, pSize, targetDna, tSize, out, FALSE); if (hc->isRc) reverseComplement(probe->dna + hc->pStart, pSize); doublePrint(out, "best score %d\n\n", score); freez(&targetDna); fflush(out); } ++hcIx; } freeDnaSeq(&probe); fclose(out); } boolean addToCluster(struct hitCluster *hc, char *chrom, int start, int end, boolean isRc, int count) /* If can add chromosome range to cluster in first count members of hcList * without making cluster too big do it, otherwise return FALSE. */ { int s, e; int pStart = hc->pStart; int i; for (i=0; ipStart == pStart); if (hc->tStart == 0 && hc->tEnd == 0) { /* We're the first member of this cluster. */ hc->target = wormOfficialChromName(chrom); hc->tStart = start; hc->tEnd = end; hc->isRc = isRc; return TRUE; } if (!sameString(chrom, hc->target) || isRc != hc->isRc) continue; s = min(hc->tStart, start); e = max(hc->tEnd, end); if (e - s > innerTargetSize) continue; else { hc->tStart = s; hc->tEnd = e; return TRUE; } if ((hc = hc->next) == NULL) break; } return FALSE; } void wormSecondPassAlign(char *inName, char *outName, int starting, int ending) /* Do second pass of 3 stage alignment. Take homologous chromosome * position info from BWANA, run seven-state-aligner, and make * output for stitcher program to put together. */ { FILE *in = mustOpen(inName, "r"); char line[512]; char *words[64]; int lineCount = 0, wordCount; char *probeFileName = ""; int aliCount = 0; int totalAliCount; int numToDo = 25; struct hitCluster *hcList = NULL, *hc = NULL; /* First just scan through input file - parse it, make sure * it looks good, and figure out how long it is. */ printf("Scanning input file\n"); while (fgets(line, sizeof(line), in)) { ++lineCount; wordCount = chopLine(line, words); if (wordCount < 1) continue; /* Skip blank lines. */ if (sameWord("Processing", words[0])) { assert(wordCount >= 4); probeFileName = cloneString(words[1]); } else if (sameWord("Aligning", words[0])) { assert(wordCount >= 7); assert(sameString(probeFileName, words[6])); assert(isdigit(words[2][0]) && isdigit(words[4][0])); AllocVar(hc); hc->probe = probeFileName; hc->pStart = atoi(words[2]); hc->pEnd = atoi(words[4]); slAddHead(&hcList, hc); aliCount = 1; } else if (sameWord("Got", words[0])) { } else if (sameWord("No", words[0])) { assert(wordCount >= 2 && sameWord("alignment", words[1])); freeMem(slPopHead(&hcList)); } else if (isdigit(words[0][0])) { char *chrom; int start, end; boolean isRc = TRUE; char strand; int score = atoi(words[0]); if (score >= 45) { assert(hc != NULL); if (wordCount < 3 || !wormParseChromRange(words[1], &chrom, &start, &end)) errAbort("bad line %d of %s\n", lineCount, inName); strand = words[2][0]; if (strand == '+') isRc = FALSE; else if (strand == '-') isRc = TRUE; else errAbort("Expecting + or - line %d of %s", lineCount, inName); if (!addToCluster(hc, chrom, start, end, isRc, aliCount)) { if (aliCount < numToDo) { ++aliCount; AllocVar(hc); hc->probe = probeFileName; hc->pStart = hcList->pStart; hc->pEnd = hcList->pEnd; hc->target = wormOfficialChromName(chrom); hc->tStart = start; hc->tEnd = end; hc->isRc = isRc; slAddHead(&hcList, hc); } } } } } slReverse(&hcList); totalAliCount = slCount(hcList); fclose(in); printf("Proceeding on to %d alignments\n", totalAliCount); alignAll(hcList, totalAliCount, starting, ending, outName); } int main(int argc, char *argv[]) { //pushCarefulMemHandler(); dnaUtilOpen(); if ((isOnWeb = cgiIsOnWeb()) == TRUE) { htmShell("DynaAlign Results", doMiddle, NULL); } else { char *command; if (argc < 2) usage(); command = argv[1]; if (sameWord(command, "test")) { DNA *query, *target; if (argc < 4) usage(); query = (DNA*)argv[2]; target = (DNA*)argv[3]; xenAlignSmall(query, strlen(query), target, strlen(target), stdout, TRUE); } else if (sameWord(command, "two")) { FILE *out; if (argc < 5) usage(); out = mustOpen(argv[4], "w"); xenAlignFa(argv[2], argv[3], out); } else if (sameWord(command, "worms")) { char *cbName = argv[2]; char *outFileName = argv[3]; if (argc < 4) usage(); wormSecondPassAlign(cbName, outFileName, 0, 0x7fffffff); } else if (sameWord(command, "starting")) { char *cbName = argv[2]; char *outFileName = argv[3]; if (argc < 5 || !isdigit(argv[4][0])) usage(); wormSecondPassAlign(cbName, outFileName, atoi(argv[4]), 0x7fffffff); } else if (sameWord(command, "range")) { char *cbName = argv[2]; char *outFileName = argv[3]; if (argc < 6 || !isdigit(argv[4][0]) || !isdigit(argv[5][0])) usage(); wormSecondPassAlign(cbName, outFileName, atoi(argv[4]), atoi(argv[5])); } else usage(); } return 0; }