/* exonAli.c - Allign cDNAs based on assumption that alignments * will be clustered into exons. */ #include "common.h" #include "portable.h" #include "dnautil.h" #include "shaRes.h" #include "dnaseq.h" #include "nt4.h" #include "fa.h" #include "wormdna.h" #include "htmshell.h" #include "cheapcgi.h" #include "fuzzyFind.h" #include "errabort.h" long clock1000(); void usage() /* Describe how to use program and abort. */ { errAbort("This program aligns cDNA with genomic sequence. " "Usage:\n" " exonAli named output cdnaName(s)\n" " exonAli in output listFile\n" " exonAli all output faFile ntDir\n" " exonAli starting output faFile ntDir startingIx [count]\n" " exonAli resume output faFile ntDir\n"); } FILE *reportFile = NULL; boolean isHtml = FALSE; void reportAndExtra(char *format, va_list args, char *extra) /* Print out to stdout and maybe a report file. Possibly * include extra string before report. */ { if (reportFile != NULL) { if (extra != NULL) fprintf(reportFile, "%s", extra); vfprintf(reportFile, format, args); fflush(reportFile); } if (extra != NULL) fprintf(stdout, "%s", extra); vfprintf(stdout, format, args); if (isHtml) fprintf(stdout, "
\n"); } void reportWarning(char *format, va_list args) /* Print a warning message */ { reportAndExtra(format, args, "###"); } void vaReportf(char *format, va_list args) /* Print out to stdout and maybe a report file. */ { reportAndExtra(format, args, NULL); } void reportf(char *format, ...) /* Print out to stdout and maybe a report file. */ { va_list args; va_start(args, format); vaReportf(format, args); va_end(args); } #define tileSize 16 /* # of bases we fit in 32 bits. */ #define tileSizeShift 4 /* How far to shift something to multiply it by tile size. */ struct crudeHit /* A tileSize exact match between probe and target. */ { int probeOffset; int targetOffset; boolean isLumped; }; struct crudeExon /* A bunch of hits that hang together on a diagonal. */ { int probeStart; int probeEnd; int targetStart; int targetEnd; int hitCount; boolean isLumped; }; struct crudeGene /* A collection of diagonals that seem to hang together * like exons. */ { struct nt4Seq *target; boolean isRc; int probeStart; int probeEnd; int targetStart; int targetEnd; int score; }; int lumpHitsIntoExons(struct crudeHit *hits, int hitCount, struct crudeExon *exons) /* Lump together hits that are within 48 bp of each other and within 2 of * the same diagonal into "exons". The exons array must be hitCount elements * in order to accomodate the worst case where no lumping occurs. */ { int i; struct crudeExon *exon = exons; int exonCount = 0; for (i=0; iprobeStart = pOff; exon->probeEnd = pOff + tileSize; exon->targetStart = tOff; exon->targetEnd = tOff + tileSize; exon->hitCount = 1; diagDiff = thisDiff; } else if (diagDiff - 2 <= thisDiff && thisDiff <= diagDiff + 2 && exon->probeEnd - 2 <= pOff && pOff <= exon->probeEnd + 48 && exon->targetEnd - 2 <= tOff && tOff <= exon->targetEnd + 48) { hits[i].isLumped = TRUE; exon->probeEnd = pOff + tileSize; exon->targetEnd = tOff + tileSize; exon->hitCount += 1; } } } if (!startedExon) /* Must be all lumped together. */ break; ++exonCount; ++exon; } return exonCount; } int lumpExonsIntoGenes(struct crudeExon *exons, int exonCount, struct crudeGene *genes, struct nt4Seq *target, boolean isRc) /* Lump together crude exons into crude genes, and score them. */ { int i; struct crudeGene *gene = genes; int geneCount = 0; for (i=0; itarget = target; gene->isRc = isRc; gene->probeStart = exons[i].probeStart; gene->probeEnd = exons[i].probeEnd; gene->targetStart = exons[i].targetStart; gene->targetEnd = exons[i].targetEnd; gene->score = exons[i].hitCount * exons[i].hitCount; } else if (gene->targetEnd - 10 <= tOff && tOff <= gene->targetEnd + 25000 && gene->probeEnd - 10 <= pOff && pOff <= gene->probeEnd + 64 && lastDiff - 2 <= thisDiff) { exons[i].isLumped = TRUE; gene->targetEnd = exons[i].targetEnd; gene->probeEnd = exons[i].probeEnd; gene->score += exons[i].hitCount * exons[i].hitCount; exonsInThis += 1; lastDiff = thisDiff; } } } if (!startedGene) break; gene->score += exonsInThis; ++geneCount; ++gene; } return geneCount; } boolean makeGoodTile(DNA *dna, bits32 *retTile) /* Turn next 16 base pairs into a tile. Return FALSE * if it wouldn't be a good tile. */ { int i; DNA repeater[2*tileSize-1]; bits32 tile; /* Make sure that tile isn't part of a pattern. */ memcpy(repeater, dna+1, tileSize-1); memcpy(repeater+tileSize-1, dna, tileSize); if (memMatch(dna, tileSize, repeater, 2*tileSize-1) != NULL) return FALSE; /* Make sure no N's in tile */ for (i=0; idna; int maxProbeCount = probeSeq->size - tileSize + 1; int probeCount = 0; struct probeTile *probes; struct probeTile *probe; struct probeTile **hash; int i; struct fastProber *fp; if (maxProbeCount <= 0) return NULL; AllocVar(fp); fp->hash = hash = makeTileHash(); fp->probes = probes = needMem(maxProbeCount * sizeof(probes[0]) ); for (i=0; itile)) { int hashIx = tileHashFunc(probe->tile); probe->offset = i; probe->next = hash[hashIx]; hash[hashIx] = probe; ++probeCount; } } return fp; } void freeFastProber(struct fastProber **pFp) { struct fastProber *fp = *pFp; if (fp != NULL) { freeMem(fp->probes); freeMem(fp->hash); freez(pFp); } } int makeIndividualHits(struct probeTile **hash, bits32 *bases, int baseOffset, int baseWordCount, struct crudeHit *hits, int maxHitCount, int hitCount) /* Scan a short segment for individual hits. */ { struct probeTile *probe; int i; for (i=0; itile) { if (hitCount >= maxHitCount) { warn("Too many hits, only taking first %d", maxHitCount); goto END_HIT_LOOP; } hits[hitCount].probeOffset = probe->offset; hits[hitCount].targetOffset = ((i+baseOffset)<next; } while (probe != NULL); } } END_HIT_LOOP: return hitCount; } int makeHits(struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits, int maxHitCount) /* Scan entire target for hits to probe. */ { bits32 *bases = target->bases; struct probeTile **hash = fp->hash; unsigned long acc; int hitCount = 0; int i; int baseWordCount = (target->baseCount>>tileSizeShift); bits32 *endBases = bases + baseWordCount; int chunkSize = 8; int chunkCount = baseWordCount/chunkSize; int baseOffset = 0; for (i=0; i= maxHitCount) break; } bases += chunkSize; baseOffset += chunkSize; } if (bases != endBases) hitCount = makeIndividualHits(hash, bases, baseOffset, endBases-bases, hits, maxHitCount, hitCount); return hitCount; } int cmpHitsTargetFirst(const void *va, const void *vb) /* Compare function to sort hit array by ascending * target offset followed by ascending probe offset. */ { struct crudeHit *a = (struct crudeHit *)va; struct crudeHit *b = (struct crudeHit *)vb; long diff; if ((diff = a->targetOffset - b->targetOffset) != 0) return diff; return a->probeOffset - b->probeOffset; } int cmpExonsTargetFirst(const void *va, const void *vb) /* Compare function to sort exons by ascending target * offset followed by ascending probe offset. */ { struct crudeExon *a = (struct crudeExon *)va; struct crudeExon *b = (struct crudeExon *)vb; long diff; if ((diff = a->targetStart - b->targetStart) != 0) return diff; return a->probeStart - b->probeStart; } #define maxHitsAtOnce 20000 /* Maximum number of hits to allow on a single scan. * If we get more than this need to toss out a bad tile * or something... */ int findCrudeGenes(struct fastProber *fp, struct nt4Seq *target, struct crudeGene *genes, int maxGeneCount, boolean isRc) /* Scan target with probe, and arrange hits into crude genes. */ { struct crudeHit *hits; struct crudeExon *exons; int hitCount, exonCount, geneCount = 0; if (fp == NULL) return 0; assert(maxGeneCount >= maxHitsAtOnce); hits = needMem(maxHitsAtOnce*sizeof(*hits)); exons = needMem(maxHitsAtOnce*sizeof(*exons)); hitCount = makeHits(fp, target, hits, maxHitsAtOnce); //hitCount = makeIndividualHits(fp->hash, target->bases, target->baseCount>>tileSizeShift, // hits, maxHitsAtOnce, 0); if (hitCount > 0) { qsort(hits, hitCount, sizeof(hits[0]), cmpHitsTargetFirst); exonCount = lumpHitsIntoExons(hits, hitCount, exons); qsort(exons, exonCount, sizeof(exons[0]), cmpExonsTargetFirst); geneCount = lumpExonsIntoGenes(exons, exonCount, genes, target, isRc); } freeMem(hits); freeMem(exons); return geneCount; } int cmpGenesByScore(const void *va, const void *vb) /* cmp function to sort leaving highest scores first. */ { struct crudeGene *a = (struct crudeGene *)va; struct crudeGene *b = (struct crudeGene *)vb; return b->score - a->score; } int countGenesBetter(struct crudeGene *genes, int geneCount, int cutoff) /* Count number of genes (in sorted list) with scores better than cutoff */ { int i; for (i=0; i halfGeneCount) { worstScore = genes[newGeneCount-1].score; newGeneCount = countGenesBetter(genes, newGeneCount, worstScore); } return newGeneCount; } void findAliEnds(struct ffAli *ali, DNA *needle, DNA *hay, long *retNeedleStart, long *retNeedleEnd, long *retHayStart, long *retHayEnd) { DNA *hStart = NULL; DNA *hEnd = NULL; DNA *nStart = NULL; DNA *nEnd = NULL; boolean first = TRUE; while (ali->left) ali = ali->left; for (;ali != NULL; ali = ali->right) { if (first) { hStart = ali->hStart; hEnd = ali->hEnd; nStart = ali->nStart; nEnd = ali->nEnd; first = FALSE; } else { if (ali->hStart < hStart) hStart = ali->hStart; if (ali->hEnd > hEnd) hEnd = ali->hEnd; if (ali->nStart < nStart) nStart = ali->nStart; if (ali->nEnd > nEnd) nEnd = ali->nEnd; } } *retHayStart = hStart - hay; *retNeedleStart = nStart - needle; *retHayEnd = hEnd - hay; *retNeedleEnd = nEnd - needle; } boolean findFineAlignment(struct dnaSeq *probe, struct crudeGene *crudeGene, struct ffAli **retAli, int *retAliScore, DNA **retUnpacked, int *retUnpackedSize, long *retNeedleStart, long *retNeedleEnd, long *retHayStart, long *retHayEnd) /* Call fuzzy finder on the crude gene alignment to make a * complete alignment. */ { struct nt4Seq *target; int tarStart, tarEnd; int unpackedSize; DNA *unpacked; struct ffAli *ali = NULL; static int outside = 250; int aliScore; long hayStart, hayEnd, needleStart, needleEnd; target = crudeGene->target; tarStart = crudeGene->targetStart - outside; if (tarStart < 0) tarStart = 0; tarEnd = crudeGene->targetEnd + outside; if (tarEnd > target->baseCount) tarEnd = target->baseCount; unpackedSize = tarEnd - tarStart; unpacked = nt4Unpack(target, tarStart, unpackedSize); if (crudeGene->isRc) reverseComplement(probe->dna, probe->size); ali = ffFind(probe->dna, probe->dna + probe->size, unpacked, unpacked + unpackedSize, ffTight); aliScore = ffScoreCdna(ali); if (crudeGene->isRc) reverseComplement(probe->dna, probe->size); if (ali != NULL) { findAliEnds(ali, probe->dna, unpacked, &needleStart, &needleEnd, &hayStart, &hayEnd); hayStart += tarStart; hayEnd += tarStart; } //#define SHOW_ALI #ifdef SHOW_ALI if (hayStart == 4801255 && hayEnd == 4801707) { char hayName[50]; sprintf(hayName, "%s:%d-%d %c", target->name, tarStart, tarEnd); ffShowAli(ali, "probe", probe->dna, 0, hayName, unpacked, (tarStart), crudeGene->isRc ? '-' : '+'); } #endif /* SHOW_ALI */ *retAli = ali; *retAliScore = aliScore; *retUnpacked = unpacked; *retUnpackedSize = unpackedSize; *retNeedleStart = needleStart; *retNeedleEnd = needleEnd; *retHayStart = hayStart; *retHayEnd = hayEnd; return (ali != NULL); } void myBlast(struct dnaSeq *probe, char *probeName, struct nt4Seq *chrome[], int chromeCount) { int oneGeneCount; int i; int decentAlignments = 0; int chromeIx; int geneCount = 0; int maxGeneCount = 2*maxHitsAtOnce; long startTime, endTime; int isRc; struct nt4Seq *target; struct crudeGene *crudeGenes = needMem(maxGeneCount*sizeof(*crudeGenes)); struct fastProber *fps[2]; /* Time how long it takes to allign things. */ startTime = clock1000(); fps[0] = makeFastProber(probe); reverseComplement(probe->dna, probe->size); fps[1] = makeFastProber(probe); reverseComplement(probe->dna, probe->size); for (chromeIx = 0; chromeIx < chromeCount; ++chromeIx) { target = chrome[chromeIx]; for (isRc = 0; isRc <= 1; ++isRc) { /* Get crude idea of genes on one strand. */ oneGeneCount = findCrudeGenes(fps[isRc], target, crudeGenes+geneCount, maxGeneCount-geneCount, isRc); /* Increment gene count and if necessary compact list. */ geneCount += oneGeneCount; if (maxGeneCount - geneCount < maxHitsAtOnce) { geneCount = filterPoorGenes(crudeGenes, geneCount); if (maxGeneCount - geneCount < maxHitsAtOnce) { warn("Too many genes. Moving on to the next.\n"); break; } } } if (maxGeneCount - geneCount < maxHitsAtOnce) break; } /* Sort genes by initial promise and get rid of some trash. */ { int bestScore, worstScore; qsort(crudeGenes, geneCount, sizeof(crudeGenes[0]), cmpGenesByScore); bestScore = crudeGenes[0].score; worstScore = crudeGenes[geneCount-1].score; if (bestScore > worstScore) { int cutOff = bestScore/10; geneCount = countGenesBetter(crudeGenes, geneCount, cutOff); } } /* Do fine alignments. */ for (i=0; itarget->name, hayStart, hayEnd, cg->score, aliScore, cg->isRc ? '-' : '+'); ++decentAlignments; ffFreeAli(&ali); freez(&unpackedDna); } } freeFastProber(&fps[0]); freeFastProber(&fps[1]); endTime = clock1000(); reportf("%d alignments of %s in %f seconds\n\n", decentAlignments, probeName, (endTime-startTime)*0.001); freeMem(crudeGenes); } char *lastCdnaReported(char *reportFileName) /* Open report file and scan for last cDNA blasted. */ { FILE *f = mustOpen(reportFileName, "r"); char linebuf[512]; char *words[16]; int wordCount; char *lastName = NULL; static char lastNameBuf[256]; for (;;) { if (fgets(linebuf, sizeof(linebuf), f) == NULL) break; wordCount = chopString(linebuf, whiteSpaceChopper, words, ArraySize(words)); if (wordCount > 4 && (strcmp("alignments", words[1]) == 0)) { strncpy(lastNameBuf, words[3], sizeof(lastNameBuf)); lastName = lastNameBuf; } } return lastName; } void reportBlasting(int ix, struct dnaSeq *cdna) { char *cdnaName = cdna->name; reportf("%d Blasting %s %d bases\n", ix, cdnaName, cdna->size); } void doInFile(char *inFileName, struct nt4Seq *nt4s[], int nt4Count) { FILE *inFile = mustOpen(inFileName, "r"); char lineBuf[256]; char *words[10]; char *cdnaName; int wordCount; struct dnaSeq *cdna; int ix = 0; while (fgets(lineBuf, sizeof(lineBuf), inFile) != NULL) { wordCount = chopString(lineBuf, whiteSpaceChopper, words, ArraySize(words)); if (wordCount == 0) /* Skip blank lines. */ continue; ++ix; cdnaName = words[0]; if (!wormCdnaSeq(cdnaName, &cdna, NULL)) errAbort("Couldn't find cdna %s\n", cdnaName); else { reportBlasting(ix, cdna); myBlast(cdna, cdnaName, nt4s, nt4Count); } freeDnaSeq(&cdna); } } FILE *estFile; int loadNt4s(char *dir, struct nt4Seq ***retNt4s) /* Load up genome stored 2 bits to a nucleotide. */ { struct slName *nameList, *name; struct nt4Seq **nt4s; int count; int i; char fileName[512]; char chromName[256]; char fullChromName[256]; long start, end; nameList = listDir(dir, "*.nt4"); count = slCount(nameList); if (count < 1) errAbort("Couldn't find any .nt4 files in %s", dir); nt4s = needMem(count*sizeof(nt4s[0]) ); start = clock1000(); for (name = nameList, i=0; name != NULL; name = name->next, i+=1) { char *end; sprintf(fileName, "%s/%s", dir, name->name); /* Cut off .nt4 suffix. */ strcpy(chromName, name->name); end = strrchr(chromName, '.'); assert(end != NULL); *end = 0; sprintf(fullChromName, "Chromosome %s", chromName); nt4s[i] = loadNt4(fileName, fullChromName); } end = clock1000(); printf("%4.2f seconds loading %d .nt4 files\n", (end-start)*0.001, count); *retNt4s = nt4s; return count; } void startEsts(char *estFileName) /* Start searching ESTs */ { estFile = mustOpen(estFileName, "rb"); } struct dnaSeq *nextEst() /* Get next EST */ { return faReadOneDnaSeq(estFile, NULL, TRUE); } int main(int argc, char *argv[]) /* Set things up to batch process all cdnas in data base. * Print results to console and to a report file. * Allow user to resume interrupted batch process. */ { char *command; struct dnaSeq *cdna = NULL; struct nt4Seq **nt4s; int nt4Count; int i; char *reportFileName; if (argc < 3) usage(); dnaUtilOpen(); pushWarnHandler(reportWarning); command = argv[1]; reportFileName = argv[2]; /* Simple "named" command doesn't get appended to report file. */ if (!differentWord(command, "named")) { int i; nt4Count = loadNt4s(wormChromDir(), &nt4s); for (i=3; iname; ++dbIx; if (!differentWord(cdnaName, seekUntilName)) break; freeDnaSeq(&cdna); } if (cdna == NULL) errAbort("Couldn't find %s in cDNA database", seekUntilName); freeDnaSeq(&cdna); } if (seekUntilIx > 0) { for (dbIx = 0; dbIx < seekUntilIx; ++dbIx) { if ((cdna = nextEst()) == NULL) errAbort("Can't seek to %d, there's only %d", seekUntilIx+1, dbIx+1); freeDnaSeq(&cdna); } } /* Main loop */ while ((cdna = nextEst()) != NULL) { if (dbIx >= endIx) break; ++dbIx; cdnaName = cdna->name; reportBlasting(dbIx, cdna); myBlast(cdna, cdnaName, nt4s, nt4Count); freeDnaSeq(&cdna); } } for (i=0; i