/* i16Find - Find sequence by searching with 16 base index.. */ /* This file is copyright 2008 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnautil.h" #include "dnaseq.h" #include "dnaLoad.h" #include "verbose.h" #include "i16.h" boolean mmap; boolean noOverflow; int maxMismatch = 0; int maxRepeat = 10; void usage() /* Explain usage and exit. */ { errAbort( "i16Find - Find sequence by searching with 16 base index.\n" " i16Find target.i16 query.fa output\n" "options:\n" " -maxRepeat=N - maximum number of alignments to output on one query sequence. Default %d\n" " -mmap - Use memory mapping. Faster just a few reads, but much slower for many reads\n" " -maxMismatch - maximum number of mismatches allowed. Default %d. NOT IMPLEMENTED\n" " -noOverflow - Don't explore index overflow regions (usually just repeats anyway)\n" , maxRepeat, maxMismatch ); } static struct optionSpec options[] = { {"maxRepeat", OPTION_INT}, {"mmap", OPTION_BOOLEAN}, {"maxMismatch", OPTION_INT}, {"noOverflow", OPTION_BOOLEAN}, {NULL, 0}, }; int i16OverflowIx(struct i16 *i16, bits32 slot) /* Find index of hex that matches val, or -1 if none such. */ { int startIx=0, endIx=i16->header->overflowSlotCount-1, midIx; bits32 *slots = i16->overflowSlots; bits32 posVal; for (;;) { if (startIx == endIx) { posVal = slots[startIx]; if (posVal == slot) return startIx; else return -1; } midIx = ((startIx + endIx)>>1); posVal = slots[midIx]; if (posVal < slot) startIx = midIx+1; else endIx = midIx; } } bits64 slotsExplored = 0; void i16FindExact(struct i16 *i16, DNA *qDna, int qSize, struct slInt **pHitList) /* Put exact hits on hit list. */ { bits32 slot = packDna16(qDna); int slotSize = i16->slotSizes[slot]; bits32 *slotList; if (slotSize == i16OverflowCount) { if (noOverflow) { return; } else { int overflowIx = i16OverflowIx(i16, slot); assert(overflowIx >= 0); slotSize = i16->overflowSizes[overflowIx]; slotList = i16->overflowOffsets + i16->overflowStarts[overflowIx]; } } else { slotList = i16->slotOffsets + i16->slotStarts[slot]; } int i; slotsExplored += slotSize; for (i=0; iallDna + tOffset; if (memcmp(qDna+16, tDna+16, qSize-16) == 0) { struct slInt *hit = slIntNew(tOffset); slAddHead(pHitList, hit); } } } void i16Find(char *i16File, char *queryFile, char *outputFile) /* i16Find - Find sequence by searching indexed traversable suffix array.. */ { struct i16 *i16 = i16Read(i16File, mmap); verboseTime(1, "Loaded %s", i16File); struct dnaLoad *qLoad = dnaLoadOpen(queryFile); FILE *f = mustOpen(outputFile, "w"); struct dnaSeq *qSeq; int queryCount = 0, hitCount = 0, missCount=0; while ((qSeq = dnaLoadNext(qLoad)) != NULL) { struct slInt *hit, *hitList = NULL; verbose(2, "Processing %s\n", qSeq->name); toUpperN(qSeq->dna, qSeq->size); i16FindExact(i16, qSeq->dna, qSeq->size, &hitList); if (hitList != NULL) { int count = slCount(hitList); if (count < maxRepeat) { for (hit = hitList; hit != NULL; hit = hit->next) { bits32 offset = hit->val; fprintf(f, "%s hits offset %u\n", qSeq->name, offset); fprintf(f, "q %s\n", qSeq->dna); fprintf(f, "t "); mustWrite(f, i16->allDna + offset, qSeq->size); fprintf(f, "\n"); } } ++hitCount; } else ++missCount; ++queryCount; dnaSeqFree(&qSeq); } verboseTime(1, "Alignment"); verbose(1, "%d queries. %d hits (%5.2f%%). %d misses (%5.2f%%), %lld slotsExplored.\n", queryCount, hitCount, 100.0*hitCount/queryCount, missCount, 100.0*missCount/queryCount, slotsExplored); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { verboseTimeInit(); optionInit(&argc, argv, options); if (argc != 4) usage(); maxRepeat = optionInt("maxRepeat", maxRepeat); maxMismatch = optionInt("maxMismatch", maxMismatch); mmap = optionExists("mmap"); noOverflow = optionExists("noOverflow"); dnaUtilOpen(); i16Find(argv[1], argv[2], argv[3]); return 0; }