/* sufxFind - Find sequence by searching suffix array. */ /* Copyright Jim Kent 2008 all rights reserved. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnautil.h" #include "dnaseq.h" #include "dnaLoad.h" #include "sufx.h" boolean mmap; int maxMismatch = 2; int maxRepeat = 10; void usage() /* Explain usage and exit. */ { errAbort( "sufxFind - Find sequence by searching suffix array.\n" "usage:\n" " sufxFind target.sufx query.fa output\n" "options:\n" " -maxRepeat=N - maximum number of alignments to output on one query sequence. Default %d\n" " -maxMismatch - maximum number of mismatches allowed. Default %d.\n" " -mmap - Use memory mapping. Faster just a few reads, but much slower for many reads\n" , maxRepeat, maxMismatch ); } static struct optionSpec options[] = { {"maxRepeat", OPTION_INT}, {"maxMismatch", OPTION_INT}, {"mmap", OPTION_BOOLEAN}, {NULL, 0}, }; int sufxCountIdenticalPrefix(DNA *dna, int tileSize, bits32 *array, bits32 arraySize) /* Count up number of places in a row in array, where the referenced DNA is the * same up to tileSize bases. You do this a lot since generally the suffix tree * search just gives you the first place in the array that matches. */ { bits32 i; DNA *first = dna + array[0]; for (i=1; i= searchEnd) { finalSearch(tDna, suffixArray, searchStart, arrayPos, qDna, qSize, cursor-(arrayPos-searchStart), pHitList); return; /* Ran through all variations of letters at this position. */ } nextOffset = traverseArray[nextPos]; tBase = tDna[suffixArray[nextPos]+cursor]; // uglyf(" %c at %d\n", tBase, nextPos); if (qBase == tBase) { // uglyf(" match %c at %d\n", tBase, nextPos); searchStart = arrayPos = nextPos; break; } } } searchEnd = arrayPos + nextOffset; /* We are going to advance to next position in query and in array. * This will only possibly yield a match if the next item in the suffix * array has a prefix that matches the current position up to our current * offset. Happily this is encoded in the traverse array in a subtle way. * The step to find another letter at this position has to be greater than * one for the prefix to be shared. */ if (nextOffset <= 1) { finalSearch(tDna, suffixArray, searchStart, searchEnd, qDna, qSize, cursor - (arrayPos-searchStart), pHitList); return; /* No match since prefix of next position doesn't match us. */ } ++arrayPos; } /* If we got here we matched the whole query sequence, and actually there's multiple * matches to it. It's actually rare to get to here on query sequences larger than * 20 bases or so. */ finalSearch(tDna, suffixArray, searchStart, searchEnd, qDna, qSize, qSize - (arrayPos-searchStart-1), pHitList); // TODO - check -1 } void sufxFindExact(DNA *tDna, bits32 *suffixArray, bits32 *traverseArray, bits32 arraySize, DNA *qDna, int qSize, struct slInt **pHitList) /* Search for all exact matches to qDna in suffix array. Return them in pHitList. */ { sufxFindWithin(tDna, suffixArray, traverseArray, qDna, qSize, 0, 0, arraySize, pHitList); } boolean mostlySame(char *a, char *b, int size, int maxMismatch) /* Return TRUE if a and b are the same with possibly a few mismatches. */ { int mismatches = 0; int i; for (i=0; i maxMismatch) return FALSE; } return TRUE; } void sufxFuzzyFind(char *tDna, bits32 *suffixArray, bits32 *traverseArray, bits32 sliceStart, bits32 sliceEnd, int cursor, char *qDna, int qSize, int subCount, int maxSubs, struct slInt **pHitList) /* Search suffix array for matches up to maxSubs different. Recursive routine. */ { #ifdef DEBUG { uglyf("sufxFuzzyFind cursor=%d qSize=%d subCount=%d maxSubs=%d sliceStart=%d sliceEnd=%d\n", cursor, qSize, subCount, maxSubs, sliceStart, sliceEnd); char *q = cloneStringZ(qDna, qSize); tolowers(q); q[cursor] = toupper(q[cursor]); uglyf("%s\n", q); } #endif /* DEBUG */ /* Start recursion with end conditions. First is that we've matched the whole thing. */ if (cursor == qSize) { struct slInt *hit = slIntNew(sliceStart); slAddHead(pHitList, hit); return; } /* If we have used up all of our substitutions, then look for an exact match to * what is left and add it if it exists. This is another end condition. */ if (subCount == maxSubs) { sufxFindWithin(tDna, suffixArray, traverseArray, qDna, qSize, cursor, sliceStart, sliceEnd, pHitList); return; } /* If not at end, then call self for each of the possibilities present so far. */ bits32 arrayPos, nextArrayPos; for (arrayPos = sliceStart; arrayPos < sliceEnd; arrayPos = nextArrayPos) { bits32 subsliceSize = traverseArray[arrayPos]; nextArrayPos = arrayPos + subsliceSize; int extraSub = (qDna[cursor] == tDna[suffixArray[arrayPos]+cursor] ? 0 : 1); if (mostlySame(qDna+cursor+1, tDna+suffixArray[arrayPos]+cursor+1, qSize - cursor - 1, maxSubs - subCount - extraSub)) { struct slInt *hit = slIntNew(arrayPos); slAddHead(pHitList, hit); } if (subsliceSize > 1) { sufxFuzzyFind(tDna, suffixArray, traverseArray, arrayPos+1, nextArrayPos, cursor+1, qDna, qSize, subCount+extraSub, maxSubs, pHitList); } } } void sufxFind(char *sufxFile, char *queryFile, char *outputFile) /* sufxFind - Find sequence by searching suffix array.. */ { struct sufx *sufx = sufxRead(sufxFile, mmap); uglyTime("Loaded %s", sufxFile); struct dnaLoad *qLoad = dnaLoadOpen(queryFile); bits32 arraySize = sufx->header->arraySize; FILE *f = mustOpen(outputFile, "w"); struct dnaSeq *qSeq; int queryCount = 0, hitCount = 0, missCount=0; while ((qSeq = dnaLoadNext(qLoad)) != NULL) { struct slInt *hit, *hitList = NULL; verbose(2, "Processing %s\n", qSeq->name); toUpperN(qSeq->dna, qSeq->size); sufxFuzzyFind(sufx->allDna, sufx->array, sufx->traverse, 0, arraySize, 0, qSeq->dna, qSeq->size, 0, maxMismatch, &hitList); if (hitList != NULL) ++hitCount; else ++missCount; for (hit = hitList; hit != NULL; hit = hit->next) { bits32 hitIx = hit->val; bits32 count = sufxCountIdenticalPrefix(sufx->allDna, qSeq->size, sufx->array + hitIx, arraySize - hitIx); bits32 i; if (count > maxRepeat) { } else { for (i=0; iname, sufx->array[hitIx+i]); fprintf(f, "q %s\n", qSeq->dna); fprintf(f, "t "); mustWrite(f, sufx->allDna + sufx->array[hitIx+i], qSeq->size); fprintf(f, "\n"); } } } ++queryCount; dnaSeqFree(&qSeq); } uglyTime("Alignment"); verbose(1, "%d queries. %d hits (%5.2f%%). %d misses (%5.2f%%).\n", queryCount, hitCount, 100.0*hitCount/queryCount, missCount, 100.0*missCount/queryCount); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { uglyTime(NULL); optionInit(&argc, argv, options); if (argc != 4) usage(); maxRepeat = optionInt("maxRepeat", maxRepeat); maxMismatch = optionInt("maxMismatch", maxMismatch); mmap = optionExists("mmap"); dnaUtilOpen(); sufxFind(argv[1], argv[2], argv[3]); return 0; }