/* itsaFind - Find sequence by searching indexed traversable suffix array.. */ /* Copyright 2008 Jim Kent all rights reserved. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnautil.h" #include "dnaseq.h" #include "dnaLoad.h" #include "verbose.h" #include "itsa.h" boolean mmap; int maxMismatch = 0; int maxRepeat = 10; void usage() /* Explain usage and exit. */ { errAbort( "itsaFind - Find sequence by searching indexed traversable suffix array.\n" "usage:\n" " itsaFind target.itsa query.fa output\n" "options:\n" " -maxRepeat=N - maximum number of alignments to output on one query sequence. Default %d\n" " -mmap - Use memory mapping. Faster just a few reads, but much slower for many reads\n" " -maxMismatch - maximum number of mismatches allowed. Default %d. NOT IMPLEMENTED\n" , maxRepeat, maxMismatch ); } static struct optionSpec options[] = { {"maxRepeat", OPTION_INT}, {"mmap", OPTION_BOOLEAN}, {"maxMismatch", OPTION_INT}, {NULL, 0}, }; void finalSearch(DNA *tDna, bits32 *suffixArray, bits32 searchStart, bits32 searchEnd, DNA *qDna, int qSize, int alreadyMatched, struct slInt **pHitList) /* Our search has been narrowed to be between searchStart and searchEnd. * We know within the interval a prefix of size alreadyMatched is already * the same. Here we check if anything in this interval to see if there is * a full match to anything. If so we add it to hitList. Due to the peculiarities * of the array seach with each successive item in the window we've checked one * more letter already. */ { bits32 searchIx; for (searchIx = searchStart; searchIx < searchEnd; ++searchIx) { int diff = memcmp(qDna+alreadyMatched, tDna+alreadyMatched+suffixArray[searchIx], qSize-alreadyMatched); /* Todo - break without a hit when diff is the wrong sign. */ if (diff == 0) { struct slInt *hit = slIntNew(searchIx); slAddHead(pHitList, hit); break; } ++alreadyMatched; } } void itsaFindWithin(DNA *tDna, bits32 *suffixArray, bits32 *traverseArray, DNA *qDna, int qSize, int cursor, bits32 searchStart, bits32 searchEnd, struct slInt **pHitList) /* Search the part of the suffix array between searchStart and searchEnd for a match. * The searchStart/searchEnd and cursor position must agree. */ { // uglyf("itsaFindWithin(qDna=%s cursor=%d searchStart=%d searchEnd=%d\n", qDna, cursor, searchStart, searchEnd); bits32 arrayPos = searchStart; /* We step through each base of the query */ for (; cursor= searchEnd) { finalSearch(tDna, suffixArray, searchStart, arrayPos, qDna, qSize, cursor-(arrayPos-searchStart), pHitList); return; /* Ran through all variations of letters at this position. */ } nextOffset = traverseArray[nextPos]; tBase = tDna[suffixArray[nextPos]+cursor]; if (qBase == tBase) { searchStart = arrayPos = nextPos; break; } } } searchEnd = arrayPos + nextOffset; /* We are going to advance to next position in query and in array. * This will only possibly yield a match if the next item in the suffix * array has a prefix that matches the current position up to our current * offset. Happily this is encoded in the traverse array in a subtle way. * The step to find another letter at this position has to be greater than * one for the prefix to be shared. */ if (nextOffset <= 1) { finalSearch(tDna, suffixArray, searchStart, searchEnd, qDna, qSize, cursor - (arrayPos-searchStart), pHitList); return; /* No match since prefix of next position doesn't match us. */ } ++arrayPos; } /* If we got here we matched the whole query sequence, and actually there's multiple * matches to it. It's actually rare to get to here on query sequences larger than * 20 bases or so. */ finalSearch(tDna, suffixArray, searchStart, searchEnd, qDna, qSize, qSize - (arrayPos-searchStart-1), pHitList); // TODO - check -1 } void itsaFindGivenSlot(int slot, struct itsa *itsa, DNA *qDna, int qSize, int remainingMismatches, struct slInt **pHitList) /* Given a slot (a 13-mer converted to binary that is the first 13 bases of qDna possibly with * an induced mutation or two) check out appropriate part of suffix array for hits. */ { bits32 searchStart = itsa->index13[slot]; if (searchStart != 0) { searchStart -= 1; /* Pesky thing to keep 0 meaning no data in slot. */ // uglyf("Going to look within. Cursor slot %d\n", itsa->cursors13[slot]); itsaFindWithin(itsa->allDna, itsa->array, itsa->traverse, qDna, qSize, itsa->cursors13[slot], searchStart, searchStart + itsa->traverse[searchStart], pHitList); } } void itsaExactFind(struct itsa *itsa, DNA *qDna, int qSize, struct slInt **pHitList) /* Search indexed traversable suffix tree for exact matches. */ { int slot = itsaDnaToBinary(qDna, 13); itsaFindGivenSlot(slot, itsa, qDna, qSize, 0, pHitList); } int itsaCountIdenticalPrefix(DNA *dna, int prefixSize, bits32 *array, bits32 arraySize) /* Count up number of places in a row in array, where the referenced DNA is the * same up to prefixSize bases. You do this a lot since generally the suffix tree * search just gives you the first place in the array that matches. */ { bits32 i; DNA *first = dna + array[0]; for (i=1; i 0) { subsLeft -= 1; int mutantSlot = slot; static int mutMasks[13] = {0x3FFFFFC, 0x3FFFFF3, 0x3FFFFCF, 0x3FFFF3F, 0x3FFFCFF, 0x3FFF3FF, 0x3FFCFFF, 0x3FF3FFF, 0x3FCFFFF, 0x3F3FFFF, 0x3CFFFFF, 0x33FFFFF, 0x0FFFFFF, }; int toggle1 = 1; int baseIx; for (baseIx = 0; baseIx<13; ++baseIx) { int baseOffset = 12-baseIx; DNA qOrig = qDna[baseOffset]; mutantSlot = slot & mutMasks[baseIx]; /* A */ if (qOrig != 'A') { qMutant[baseOffset] = 'A'; itsaFindGivenSlot(mutantSlot, itsa, qMutant, qSize, subsLeft, pHitList); } mutantSlot += toggle1; if (qOrig != 'C') { qMutant[baseOffset] = 'C'; itsaFindGivenSlot(mutantSlot, itsa, qMutant, qSize, subsLeft, pHitList); } mutantSlot += toggle1; if (qOrig != 'G') { qMutant[baseOffset] = 'G'; itsaFindGivenSlot(mutantSlot, itsa, qMutant, qSize, subsLeft, pHitList); } mutantSlot += toggle1; if (qOrig != 'T') { qMutant[baseOffset] = 'T'; itsaFindGivenSlot(mutantSlot, itsa, qMutant, qSize, subsLeft, pHitList); } qMutant[baseOffset] = qOrig; toggle1 <<= 2; /* Move on to next base. */ } } freeMem(qMutant); } void itsaFind(char *itsaFile, char *queryFile, char *outputFile) /* itsaFind - Find sequence by searching indexed traversable suffix array.. */ { struct itsa *itsa = itsaRead(itsaFile, mmap); verboseTime(1, "Loaded %s", itsaFile); struct dnaLoad *qLoad = dnaLoadOpen(queryFile); bits32 arraySize = itsa->header->arraySize; FILE *f = mustOpen(outputFile, "w"); struct dnaSeq *qSeq; int queryCount = 0, hitCount = 0, missCount=0; while ((qSeq = dnaLoadNext(qLoad)) != NULL) { struct slInt *hit, *hitList = NULL; verbose(2, "Processing %s\n", qSeq->name); toUpperN(qSeq->dna, qSeq->size); itsaFuzzyFind(itsa, 0, arraySize, 0, qSeq->dna, qSeq->size, maxMismatch, &hitList); if (hitList != NULL) ++hitCount; else ++missCount; for (hit = hitList; hit != NULL; hit = hit->next) { bits32 hitIx = hit->val; bits32 count = itsaCountIdenticalPrefix(itsa->allDna, qSeq->size, itsa->array + hitIx, arraySize - hitIx); bits32 i; if (count > maxRepeat) { } else { for (i=0; iname, itsa->array[hitIx+i]); fprintf(f, "q %s\n", qSeq->dna); fprintf(f, "t "); mustWrite(f, itsa->allDna + itsa->array[hitIx+i], qSeq->size); fprintf(f, "\n"); } } } ++queryCount; dnaSeqFree(&qSeq); } verboseTime(1, "Alignment"); verbose(1, "%d queries. %d hits (%5.2f%%). %d misses (%5.2f%%).\n", queryCount, hitCount, 100.0*hitCount/queryCount, missCount, 100.0*missCount/queryCount); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); maxRepeat = optionInt("maxRepeat", maxRepeat); maxMismatch = optionInt("maxMismatch", maxMismatch); mmap = optionExists("mmap"); verboseTimeInit(); dnaUtilOpen(); itsaBaseToValInit(); itsaFind(argv[1], argv[2], argv[3]); return 0; }