/* dynAlign.c - align using dynamic programming. */ #include "common.h" #include "memalloc.h" #include "portable.h" #include "cheapcgi.h" #include "htmshell.h" #include "dnautil.h" #include "dnaseq.h" #include "fa.h" #include "wormdna.h" #include "crudeali.h" #include "xenalign.h" boolean isOnWeb; /* True if run from Web rather than command line. */ void dustData(char *in,char *out) /* Clean up data by getting rid non-alphabet stuff. */ { char c; while ((c = *in++) != 0) if (isalpha(c)) *out++ = tolower(c); *out++ = 0; } void doMiddle() { char *query = cgiString("query"); char *target = cgiString("target"); int qSize, tSize; int maxQ = 10000 * 2, maxT = 100000; long startTime, endTime; double estTime; long clock1000(); dustData(query, query); dustData(target, target); printf("
");
qSize = strlen(query);
tSize = strlen(target);
printf("query size %d\ntarget size %d\n", qSize, tSize);
if ( qSize <= maxQ || tSize <= maxT)
{
if (qSize < 2000 && tSize < 4000)
{
estTime = 0.1 + qSize * tSize * 0.25 / 1000000.0;
}
else
{
estTime = (double)qSize*15.0/10000.0 + 0.4;
}
printf("This will take about %4.1f minutes to align on a vintage 1999 server\n",
estTime);
fflush(stdout);
startTime = clock1000();
xenAlignBig(query, qSize, target, tSize, stdout, TRUE);
endTime = clock1000();
printf("%4.2f minutes to align\n", (double)(endTime-startTime)/(60*1000));
}
else
errAbort("%d bases in query, %d bases in target - too big to handle.\n"
"Maximum is %d bases in query, %d bases in target.",
qSize, tSize, maxQ, maxT);
}
void xenAlignFa(char *queryFile, char *targetFile, FILE *out)
/* Align query with target fa files, putting result in out. */
{
struct dnaSeq *q, *t;
q = faReadDna(queryFile);
t = faReadDna(targetFile);
printf("Aligning %s (%d bases) with %s (%d bases)\n",
q->name, q->size, t->name, t->size);
xenAlignBig(q->dna, q->size, t->dna, t->size, out, TRUE);
freeDnaSeq(&t);
freeDnaSeq(&q);
}
void usage()
/* Describe usage of program and terminate. */
{
errAbort("dynAlign - used dynamic programming to find alignment between two nucleotide sequence\n"
"usage:\n"
" dynAlign test string1 string2\n"
"aligns two short sequences in command line.\n"
" dynAlign two query.fa target.fa outfile\n"
"aligns two sequences in fa files\n"
" dynAlign worms bwana.out outFile\n"
"aligns C. briggsae genomic fragments against full C. elegans genome.\n"
"cbali.out is the output of the cbAli program and outFile is the file\n"
"to produce.\n"
" dynAlign starting cbali.out outFile startIx\n"
"same as dynAlign worms, but starts part way through cbali.out\n"
" dynAlign range cbali.out outFile startIx endIx\n"
"same as dynAlign worms, but specifies start and end areas to work on\n");
}
void doublePrint(FILE *f, char *format, ...)
/* Print both to file and to standard output. */
{
va_list args;
va_start(args, format);
vfprintf(f, format, args);
vfprintf(stdout, format, args);
va_end(args);
}
struct hitCluster
/* Merge a couple of hits into same cluster if it's easy via this. */
{
struct hitCluster *next;
char *probe;
int pStart, pEnd;
char *target;
int tStart, tEnd;
boolean isRc;
};
int innerTargetSize = 3000; /* How big a cluster can get. */
int outerTargetSize = 5000; /* 1000 bases on either side of cluster. */
void alignAll(struct hitCluster *hcList, int hcListSize, int starting, int ending, char *outName)
/* Do all alignments in list. */
{
FILE *out = mustOpen(outName, "w");
char *probeName = NULL;
struct dnaSeq *probe = NULL;
struct hitCluster *hc;
DNA *targetDna = NULL;
char *chrom;
int tStart, tEnd, tMid, tSize;
int pSize;
int score;
int hcIx = 1;
for (hc = hcList;hc != NULL; hc = hc->next)
{
if (hc->target != NULL && hcIx >= starting && hcIx <= ending)
{
/* Get new probe if necessary. */
if (hc->probe != probeName)
{
probeName = hc->probe;
freeDnaSeq(&probe);
probe = faReadDna(probeName);
}
/* Get 5000 base pairs around target. */
chrom = hc->target;
tMid = (hc->tStart + hc->tEnd)/2;
tStart = tMid - outerTargetSize/2;
tEnd = tStart + outerTargetSize;
wormClipRangeToChrom(chrom, &tStart, &tEnd);
tSize = tEnd - tStart;
targetDna = wormChromPart(chrom, tStart, tSize);
pSize = hc->pEnd - hc->pStart;
if (hc->isRc)
reverseComplement(probe->dna + hc->pStart, pSize);
/* Write preliminary info to file and screen. */
doublePrint(out, "Aligning %s:%d-%d %c to %s:%d-%d + \n(%d of %d)\n",
probeName, hc->pStart, hc->pEnd, (hc->isRc ? '-' : '+'),
chrom, tStart, tEnd,
hcIx, hcListSize);
score = xenAlignSmall(probe->dna + hc->pStart, pSize,
targetDna, tSize, out, FALSE);
if (hc->isRc)
reverseComplement(probe->dna + hc->pStart, pSize);
doublePrint(out, "best score %d\n\n", score);
freez(&targetDna);
fflush(out);
}
++hcIx;
}
freeDnaSeq(&probe);
fclose(out);
}
boolean addToCluster(struct hitCluster *hc, char *chrom, int start, int end, boolean isRc, int count)
/* If can add chromosome range to cluster in first count members of hcList
* without making cluster too big do it, otherwise return FALSE. */
{
int s, e;
int pStart = hc->pStart;
int i;
for (i=0; ipStart == pStart);
if (hc->tStart == 0 && hc->tEnd == 0)
{
/* We're the first member of this cluster. */
hc->target = wormOfficialChromName(chrom);
hc->tStart = start;
hc->tEnd = end;
hc->isRc = isRc;
return TRUE;
}
if (!sameString(chrom, hc->target) || isRc != hc->isRc)
continue;
s = min(hc->tStart, start);
e = max(hc->tEnd, end);
if (e - s > innerTargetSize)
continue;
else
{
hc->tStart = s;
hc->tEnd = e;
return TRUE;
}
if ((hc = hc->next) == NULL)
break;
}
return FALSE;
}
void wormSecondPassAlign(char *inName, char *outName, int starting, int ending)
/* Do second pass of 3 stage alignment. Take homologous chromosome
* position info from BWANA, run seven-state-aligner, and make
* output for stitcher program to put together. */
{
FILE *in = mustOpen(inName, "r");
char line[512];
char *words[64];
int lineCount = 0, wordCount;
char *probeFileName = "";
int aliCount = 0;
int totalAliCount;
int numToDo = 25;
struct hitCluster *hcList = NULL, *hc = NULL;
/* First just scan through input file - parse it, make sure
* it looks good, and figure out how long it is. */
printf("Scanning input file\n");
while (fgets(line, sizeof(line), in))
{
++lineCount;
wordCount = chopLine(line, words);
if (wordCount < 1)
continue; /* Skip blank lines. */
if (sameWord("Processing", words[0]))
{
assert(wordCount >= 4);
probeFileName = cloneString(words[1]);
}
else if (sameWord("Aligning", words[0]))
{
assert(wordCount >= 7);
assert(sameString(probeFileName, words[6]));
assert(isdigit(words[2][0]) && isdigit(words[4][0]));
AllocVar(hc);
hc->probe = probeFileName;
hc->pStart = atoi(words[2]);
hc->pEnd = atoi(words[4]);
slAddHead(&hcList, hc);
aliCount = 1;
}
else if (sameWord("Got", words[0]))
{
}
else if (sameWord("No", words[0]))
{
assert(wordCount >= 2 && sameWord("alignment", words[1]));
freeMem(slPopHead(&hcList));
}
else if (isdigit(words[0][0]))
{
char *chrom;
int start, end;
boolean isRc = TRUE;
char strand;
int score = atoi(words[0]);
if (score >= 45)
{
assert(hc != NULL);
if (wordCount < 3 || !wormParseChromRange(words[1], &chrom, &start, &end))
errAbort("bad line %d of %s\n", lineCount, inName);
strand = words[2][0];
if (strand == '+')
isRc = FALSE;
else if (strand == '-')
isRc = TRUE;
else
errAbort("Expecting + or - line %d of %s", lineCount, inName);
if (!addToCluster(hc, chrom, start, end, isRc, aliCount))
{
if (aliCount < numToDo)
{
++aliCount;
AllocVar(hc);
hc->probe = probeFileName;
hc->pStart = hcList->pStart;
hc->pEnd = hcList->pEnd;
hc->target = wormOfficialChromName(chrom);
hc->tStart = start;
hc->tEnd = end;
hc->isRc = isRc;
slAddHead(&hcList, hc);
}
}
}
}
}
slReverse(&hcList);
totalAliCount = slCount(hcList);
fclose(in);
printf("Proceeding on to %d alignments\n", totalAliCount);
alignAll(hcList, totalAliCount, starting, ending, outName);
}
int main(int argc, char *argv[])
{
//pushCarefulMemHandler();
dnaUtilOpen();
if ((isOnWeb = cgiIsOnWeb()) == TRUE)
{
htmShell("DynaAlign Results", doMiddle, NULL);
}
else
{
char *command;
if (argc < 2)
usage();
command = argv[1];
if (sameWord(command, "test"))
{
DNA *query, *target;
if (argc < 4)
usage();
query = (DNA*)argv[2];
target = (DNA*)argv[3];
xenAlignSmall(query, strlen(query), target, strlen(target), stdout, TRUE);
}
else if (sameWord(command, "two"))
{
FILE *out;
if (argc < 5)
usage();
out = mustOpen(argv[4], "w");
xenAlignFa(argv[2], argv[3], out);
}
else if (sameWord(command, "worms"))
{
char *cbName = argv[2];
char *outFileName = argv[3];
if (argc < 4)
usage();
wormSecondPassAlign(cbName, outFileName, 0, 0x7fffffff);
}
else if (sameWord(command, "starting"))
{
char *cbName = argv[2];
char *outFileName = argv[3];
if (argc < 5 || !isdigit(argv[4][0]))
usage();
wormSecondPassAlign(cbName, outFileName, atoi(argv[4]), 0x7fffffff);
}
else if (sameWord(command, "range"))
{
char *cbName = argv[2];
char *outFileName = argv[3];
if (argc < 6 || !isdigit(argv[4][0]) || !isdigit(argv[5][0]))
usage();
wormSecondPassAlign(cbName, outFileName, atoi(argv[4]), atoi(argv[5]));
}
else
usage();
}
return 0;
}