/* splat - Speedy Local Alignment Tool. For short reads - 25 bases and up. */ /* Copyright 2008 Jim Kent all rights reserved. */ /* The algorithm is designed to map 25-mers accommodating up to two base substitutions, * or one substitution and a single base insertion or deletion. The idea of the algorithm * is based on the idea that there will be a very similar 12-mer, and a very similar * 6-mer in the query sequence. First a "splix" indexed which is combined 12-mer and 6-mers * is constructed, and then the index is searched allowing a single base mismatch and a * single base shift between the 12-mers and 6-mers. * * This all probably sounds a little vague. To make it more concrete let's introduce some * notation. Let the letters A-Y represent locations 1-25 of the 25-mer. The 25-mer as a * whole is represented as ABCDEFGHIJKLMNOPQRSTUVWXY. In our search we'll consider two * major cases - those where there is a near-perfect match on the left side (ABCDEFGHIJKL) * and where there is a near perfect match on the right side (NOPQRSTUVWXYZ). * * Since we are only allowing only one indel, it must be either on the right or the left * side. Thus the other side must be free from indels. Since we allow only a single mismatch * if there is an indel, this indel-free side must differ by at most one base between the * query and the genome. Thus if we index all 12-mers in the genome, and look up the side and * all possible all possible single base differences from that side in the index, we will be * guaranteed to be able to find the true mapping location (if any) in the index. Since there * are 2 alternative bases at each of 12 positions in the side, plus the side itself, there are * 37 lookups we need to do for each side, and a total of 74 for both sides of the query. * * Unfortunately in a 4 gigabase genome, approximately the size of the human genome, * one would expect each 12-mer to occur 256 times. Since we're searching 74 times, this * means we expect 74 * 256 = 18944 spurious matches. We'd like to be a little more specific * than this. This is where the 6-mers come in. For now let's just consider the case * where the left side is the indel free near-perfect match. In the splix index * there is a list of all six-mers that come after the 12-mer, and also a list of * all six-mers that come immemediately after that. Thus in the case where there * is a perfect match to the query in the genome, there will be an index entry * corresponding to * ABCDEFGHIJKL MNOPQR STUVWX * where we call MNOPQR the 'after1' and STUVWX the 'after2.' * * In the no-indel case, we have up to two mismatches. If one of them is in the 12-mer, * then either after1 or after2 will be a perfect match. If the 12-mer is perfect, * then we have two mismatches to distribute between after1 and after2. If both mismatches * are in one of the afters, then the other will be perfect, otherwise both afters will have * exactly one mismatch. So, in the worst case, so long as we look for near-perfect matches * to both after1 and after2, we will be guaranteed to find the true mapping. * * This will end up narrowing down our search a lot. Each 6-mer lookup happens by chance just * 1 in 4096 times. We will need to do 2 * (3 * 6 + 1) = 38 searches though to handle the * off-by-one. This gets us down to approximately 176 spurious matches for every true match. * * The segmented nature of the index, and the fact that our query has 25 rather than 24 * bases together make it possible to accommodate single base insertions in this scheme * relatively inexpensively. Considering just to consider the case where the indel is on * the right side, it can either occur in the MNOPQR section of the STUVWX section. If * it occurs in the latter section, there is no problem, because MNOPQR will match. We only * need to consider then the case where the indel is in the 'third quadrant' that is in * MNOPQR. * * Let's represent the matching part of the genome with the same letters. An insertion in * the genome in this quadrant would thus yeild one of the following things in the splix index: * ABCDEFGHIJKL +MNOPQ RSTUVW * ABCDEFGHIJKL M+NOPQ RSTUVW * ABCDEFGHIJKL MN+OPQ RSTUVW * ABCDEFGHIJKL MNO+PQ RSTUVW * ABCDEFGHIJKL MNOP+Q RSTUVW * ABCDEFGHIJKL MNOPQ+ RSTUVW * Note in all cases the after2 is RSTUVW. Fortunately this is a sequence that is found in * the query, just using bases 17-23 instead of the usual 18-24 for the after2. * * A deletion in the genome would be one of the following cases: * ABCDEFGHIJKL NOPQRS TUVWXY * ABCDEFGHIJKL MOPQRS TUVWXY * ABCDEFGHIJKL MNPQRS TUVWXY * ABCDEFGHIJKL MNOQRS TUVWXY * ABCDEFGHIJKL MNOPRS TUVWXY * ABCDEFGHIJKL MNOPQS TUVWXY * Note in all cases the after2 is TUVWXY. Fortunately this is also a sequence that is found * in the query, just usig bases 19-25 instead of the usual 18-24 for the after2. * * Thus by doing a search on the after2 shifted one base left and right we pick up all single * base deletions. This involves 38 searches, and has the effect of bringing us up to an * expectation of 352 spurious matches. * * Doing 352 extensions per read is not that much, but still we want to do 100's of millions * of reads in a timely manner, so the strategy here is to do as little work per extension * as possible. From the index search we already know the mismatches * in 18 of the query bases, know if there is an indel, and if there is an indel have * localized it to within about 6 bases. So we only need to do the extension on the parts * that haven't been searched in the index. The code to do this is a little long and * contains quite a few cases, but it contains no nested loops, and the single loops just run * about 6 times so it is quite fast. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" #include "dnaseq.h" #include "dnaLoad.h" #include "splix.h" #include "intValTree.h" #include "fa.h" #include "fuzzyFind.h" #include "psl.h" #include "axt.h" #include "maf.h" #include "splat.h" char *version = "31"; /* Program version number. */ /* Command line driven variables. */ static char *over = NULL; static char *out = "splat"; static int maxDivergence = 5; static boolean worseToo = FALSE; static int maxRepeat = 10; static char *repeatOutput = NULL; static int maxGap = 1; static int maxMismatch = 2; static boolean memoryMap = FALSE; static int tagSize; static boolean exactOnly; /* Set to true if maxGap and maxMismatch are both 0. */ static void usage() /* Explain usage and exit. */ { errAbort( "splat - SPeedy Local Alignment Tool. v%s. Designed to align small reads to large\n" "DNA databases such as genomes quickly. Reads need to be at least 25 bases.\n" "usage:\n" " splat target query output\n" "where:\n" " target is a large indexed dna database in splix format. Use splixMake to create this\n" " file from fasta, 2bit or nib format\n" " query is a fasta or fastq file full of sequences of at least 25 bases each\n" " output is the output alignment file, by default in .splat format\n" "note: can use 'stdin' or 'stdout' as a file name for better piping\n" "overall options:\n" " -out=format - Output format. Options include splat (default), psl, maf, [soap, eland soon....]\n" " -worseToo - if set return alignments other than the best alignments\n" " -maxRepeat=N - maximum number of alignments to output on one query sequence. Default %d\n" " -over=fileName - Name of file with list of 25-mers that map many times in genome.\n" " Helps speed in some cases. Typically use file human10.over for human genome.\n" " -repeatOutput=fileName.fa - Put reads that map more than maxRepeat times in here\n" "options effecting just first 25 bases\n" " -maxGap=N. Maximum gap size. Default is %d. Set to 0 for no gaps\n" " -maxMismatch=N. Maximum number of mismatches. Default %d. (A gap counts as a mismatch.)\n" " -maxDivergence=N Maximum divergence level between read and genome to map. Default %d\n" " Divergence combines gaps and mismatches. Mismatch=2, gap=3\n" " -mmap - Use memory mapping. Faster just a few reads, but much slower for many reads\n" , version, maxRepeat, maxGap, maxMismatch, maxDivergence ); } static struct optionSpec options[] = { {"over", OPTION_STRING}, {"out", OPTION_STRING}, {"maxDivergence", OPTION_FLOAT}, {"worseToo", OPTION_BOOLEAN}, {"maxRepeat", OPTION_INT}, {"repeatOutput", OPTION_STRING}, {"maxGap", OPTION_INT}, {"maxMismatch", OPTION_INT}, {"mmap", OPTION_BOOLEAN}, {NULL, 0}, }; static FILE *repeatOutputFile = NULL; int overArraySize; bits64 *overArray; long exactIndexQueries, hits12, hits18, hits25, hitsFull; static int splatTagCmp(const void *va, const void *vb) /* Sort tags based on position fields, then divergence. */ { const struct splatTag *a = *((struct splatTag **)va); const struct splatTag *b = *((struct splatTag **)vb); int diff; if ((diff = a->t1 - b->t1) != 0) return diff; int aEnd = a->t2 + a->size2; int bEnd = b->t2 + b->size2; if ((diff = aEnd - bEnd) != 0) return diff; if ((diff = a->strand - b->strand) != 0) return diff; if ((diff = a->divergence - b->divergence) != 0) return diff; if ((diff = a->q1 - b->q1) != 0) return diff; aEnd = a->q2 + a->size2; bEnd = b->q2 + b->size2; return aEnd - bEnd; } void splatAlignFree(struct splatAlign **pAli) /* Free up a splatAlign. */ { struct splatAlign *ali = *pAli; if (ali != NULL) { chainFree(&ali->chain); freez(pAli); } } void splatAlignFreeList(struct splatAlign **pList) /* Free a list of dynamically allocated splatAlign's */ { struct splatAlign *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; splatAlignFree(&el); } *pList = NULL; } void splatAlignFreeList(struct splatAlign **pList); /* Free up a list of splatAligns. */ static int splatAlignCmpScore(const void *va, const void *vb) /* Sort alignments based on score. */ { const struct splatAlign *a = *((struct splatAlign **)va); const struct splatAlign *b = *((struct splatAlign **)vb); double diff = a->chain->score - b->chain->score; if (diff < 0) return -1; else if (diff > 0) return 1; else return 0; } #ifdef DEBUG static void dumpSixmer(int hexamer) /* Write out hexamer */ { fputc(valToNt[(hexamer>>10)&3], stdout); fputc(valToNt[(hexamer>>8)&3], stdout); fputc(valToNt[(hexamer>>6)&3], stdout); fputc(valToNt[(hexamer>>4)&3], stdout); fputc(valToNt[(hexamer>>2)&3], stdout); fputc(valToNt[hexamer&3], stdout); } static void dumpTwelvemer(int twelvemer) { fputc(valToNt[(twelvemer>>22)&3], stdout); fputc(valToNt[(twelvemer>>20)&3], stdout); fputc(valToNt[(twelvemer>>18)&3], stdout); fputc(valToNt[(twelvemer>>16)&3], stdout); fputc(valToNt[(twelvemer>>14)&3], stdout); fputc(valToNt[(twelvemer>>12)&3], stdout); dumpSixmer(twelvemer); } #endif /* DEBUG */ static void dnaToBinary(DNA *dna, int maxGap, int *pFirstHalf, int *pSecondHalf, bits16 *pAfter1, bits16 *pAfter2, bits16 *pBefore1, bits16 *pBefore2, bits16 *pFirstHex, bits16 *pLastHex, bits16 *pTwoAfterFirstHex, bits16 *pTwoBeforeLastHex) /* Convert from DNA string to binary representation chopped up various ways. */ { bits32 firstHalf = (ntValNoN[(int)dna[0]] << 22) + (ntValNoN[(int)dna[1]] << 20) + (ntValNoN[(int)dna[2]] << 18) + (ntValNoN[(int)dna[3]] << 16) + (ntValNoN[(int)dna[4]] << 14) + (ntValNoN[(int)dna[5]] << 12) + (ntValNoN[(int)dna[6]] << 10) + (ntValNoN[(int)dna[7]] << 8) + (ntValNoN[(int)dna[8]] << 6) + (ntValNoN[(int)dna[9]] << 4) + (ntValNoN[(int)dna[10]] << 2) + (ntValNoN[(int)dna[11]]); bits16 after1 = (ntValNoN[(int)dna[12]] << 10) + (ntValNoN[(int)dna[13]] << 8) + (ntValNoN[(int)dna[14]] << 6) + (ntValNoN[(int)dna[15]] << 4) + (ntValNoN[(int)dna[16]] << 2) + (ntValNoN[(int)dna[17]]); bits16 after2 = (ntValNoN[(int)dna[18]] << 10) + (ntValNoN[(int)dna[19]] << 8) + (ntValNoN[(int)dna[20]] << 6) + (ntValNoN[(int)dna[21]] << 4) + (ntValNoN[(int)dna[22]] << 2) + (ntValNoN[(int)dna[23]]); bits32 secondHalf; bits16 before1, before2; bits16 firstHex = firstHalf >> 12; bits16 lastHex; if (maxGap) { int base24 = ntValNoN[(int)dna[24]]; lastHex = ((after2 << 2) + base24) & 0xFFF; secondHalf = (((((bits32)after1)<<14) + (after2<<2) + base24) & 0xFFFFFF); before1 = ((firstHalf>>10)&0xFFF); before2 = (((firstHalf<<2) + (ntValNoN[(int)dna[12]])) & 0xFFF); } else { lastHex = after2; secondHalf = (((bits32)after1)<<12) + after2; before1 = firstHex; before2 = firstHalf & 0xFFF; } *pFirstHalf = firstHalf; *pSecondHalf = secondHalf; *pAfter1 = after1; *pAfter2 = after2; *pBefore1 = before1; *pBefore2 = before2; *pFirstHex = firstHex; *pLastHex = lastHex; *pTwoAfterFirstHex = (firstHalf >> 8) & 0xFFF; *pTwoBeforeLastHex = (secondHalf >> 4) & 0xFFF; } #ifdef SOON static boolean chopOutPieces(bits64 bases25, int maxGap, int *pFirstHalf, int *pSecondHalf, bits16 *pAfter1, bits16 *pAfter2, bits16 *pBefore1, bits16 *pBefore2, bits16 *pFirstHex, bits16 *pLastHex, bits16 *pTwoAfterFirstHex, bits16 *pTwoBeforeLastHex) /* Convert from DNA string to binary representation chopped up various ways. */ { } #endif /* SOON */ int hexCmp(const void *va, const void *vb) /* Compare two bit16. */ { bits16 a = *((bits16*)va), b = *((bits16*)vb); return a-b; } int binaryFindHex(bits16 val, bits16 *pos, int posCount) /* Find index of hex that matches val, or -1 if none such. */ { int startIx=0, endIx=posCount-1, midIx; bits16 posVal; for (;;) { if (startIx == endIx) { posVal = pos[startIx]; if (posVal == val) return startIx; else return -1; } midIx = ((startIx + endIx)>>1); posVal = pos[midIx]; if (posVal < val) startIx = midIx+1; else endIx = midIx; } } int linearFindHex(bits16 query, bits16 *hexes, int hexCount) /* See if query sequence is in hexes, which is sorted. */ { int i; for (i=0; i= 0) { for (;;) { int dnaOffset = offsets[ix] + startOffset; struct splatHit *hit; lmAllocVar(lm, hit); hit->tOffset = dnaOffset; hit->gapSize = gapSize; hit->missingQuad = missingQuad; hit->subCount = subCount; slAddHead(pHitList, hit); ix += 1; ++hits18; if (sortedHexes[ix] != hex) break; } } } static void searchExact(struct splix *splix, int twelvemer, bits16 sixmer, int whichSixmer, int dnaStartOffset, int subCount, int gapSize, int missingQuad, struct lm *lm, struct splatHit **pHitList) /* If there's an exact match put DNA offset of match into hitTree. */ { exactIndexQueries += 1; int slotSize = splix->slotSizes[twelvemer]; if (slotSize != 0) { /* Compute actual positions of the subindexes for the 1/4 reads in the slot. */ char *slot = splix->slots[twelvemer]; bits16 *hexesSixmer = (bits16*)(slot + whichSixmer*sizeof(bits16)*slotSize); bits32 *offsetsSixmer = (bits32*)((8+sizeof(bits32)*whichSixmer)*slotSize + slot); exactIndexHits(sixmer, hexesSixmer, slotSize, offsetsSixmer, dnaStartOffset, subCount, gapSize, missingQuad, lm, pHitList); } } static void searchExact12Vary6(struct splix *splix, int twelvemer, bits16 sixmer, int whichSixmer, int dnaStartOffset, int gapSize, int missingQuad, struct lm *lm, struct splatHit **pHitList) /* Search for exact matches to twelvemer and off-by-ones to sixmer. */ { bits16 oneOff = sixmer; /* The toggles here are xor'd to quickly cycle us through all 4 binary values for a base. */ bits16 toggle1 = 1, toggle2 = 2; int baseIx; for (baseIx = 0; baseIx<6; ++baseIx) { oneOff ^= toggle1; searchExact(splix, twelvemer, oneOff, whichSixmer, dnaStartOffset, 1, gapSize, missingQuad, lm, pHitList); oneOff ^= toggle2; searchExact(splix, twelvemer, oneOff, whichSixmer, dnaStartOffset, 1, gapSize, missingQuad, lm, pHitList); oneOff ^= toggle1; searchExact(splix, twelvemer, oneOff, whichSixmer, dnaStartOffset, 1, gapSize, missingQuad, lm, pHitList); oneOff ^= toggle2; /* Restore base to unmutated form. */ toggle1 <<= 2; /* Move on to next base. */ toggle2 <<= 2; /* Move on to next base. */ } } static void searchVary12Exact6(struct splix *splix, int twelvemer, bits16 sixmer, int whichSixmer, int dnaStartOffset, int gapSize, int missingQuad, struct lm *lm, struct splatHit **pHitList) /* Search for exact matches to twelvemer and off-by-ones to sixmer. */ { bits32 oneOff = twelvemer; /* The toggles here are xor'd to quickly cycle us through all 4 binary values for a base. */ bits32 toggle1 = 1, toggle2 = 2; int baseIx; for (baseIx = 0; baseIx<12; ++baseIx) { oneOff ^= toggle1; searchExact(splix, oneOff, sixmer, whichSixmer, dnaStartOffset, 1, gapSize, missingQuad, lm, pHitList); oneOff ^= toggle2; searchExact(splix, oneOff, sixmer, whichSixmer, dnaStartOffset, 1, gapSize, missingQuad, lm, pHitList); oneOff ^= toggle1; searchExact(splix, oneOff, sixmer, whichSixmer, dnaStartOffset, 1, gapSize, missingQuad, lm, pHitList); oneOff ^= toggle2; /* Restore base to unmutated form. */ toggle1 <<= 2; /* Move on to next base. */ toggle2 <<= 2; /* Move on to next base. */ } } struct alignContext /* Context for an alignment. Necessary data for rbTree-traversing call-back function. * Actually we no longer need this, but it does bundle together parameters shared * by a number of routines together. */ { struct splix *splix; /* Index. */ struct splatTag **pTagList; /* Pointer to tag list that gets filled in. */ struct dnaSeq *qSeq; /* DNA sequence. */ char strand; /* query strand. */ int tagPosition; /* Position of tag. */ struct lm *lm; /* Local memory pool where tags are allocated. */ }; int countDnaDiffs(DNA *a, DNA *b, int size) /* Count number of differing bases. */ { int i; int count = 0; for (i=0; itagPosition; q2 += c->tagPosition; /* Normalize q2,t2 to make 'gap' zero size in single block case */ if (size2 <= 0) { size2 = 0; q2 = q1 + size1; t2 = t1 + size1; } /* Calculate divergence - 2*bases_different + 2*(gap_count) + base_in_gaps. */ int qGap = q1 + size1 - q2; int tGap = t1 + size1 - t2; int gap = max(qGap, tGap); int divergence = 2*diffCount + gap; if (gap) divergence += 2; /* Alloc and fill in tag. */ struct splatTag *tag; lmAllocVar(c->lm, tag); tag->divergence = divergence; tag->q1 = q1; tag->t1 = t1; tag->size1 = size1; tag->q2 = q2; tag->t2 = t2; tag->size2 = size2; tag->strand = c->strand; /* Add to list. */ slAddHead(c->pTagList, tag); } static void slideInsert1(DNA *q, DNA *t, int gapSize, int solidSize, int mysterySize, int *retBestGapPos, int *retLeastDiffs) /* Slide insert though all possible positions and return best position and number * of differences there. */ { int gapPos = -1; int diffs = countDnaDiffs(q, t, solidSize); int bestGapPos = gapPos; int leastDiffs = diffs; for (gapPos = 0; gapPos < mysterySize; ++gapPos) { DNA qq = q[gapPos]; DNA tt = t[gapPos]; if (qq != tt) diffs -= 1; if (q[gapPos-gapSize] != tt) diffs += 1; if (diffs < leastDiffs) { leastDiffs = diffs; bestGapPos = gapPos; } } *retLeastDiffs = leastDiffs; *retBestGapPos = bestGapPos; } static void slideInsert2(DNA *q, DNA *t, int gapSize, int solidSize, int mysterySize, int *retBestGapPos, int *retLeastDiffs) /* Slide insert though all possible positions and return best position and number * of differences there. Does this a _slightly_ different way than slideInsert1. * Actually different in at least three ways, and to parameterize all three would * give it a lot of parameters.... */ { int diffs = countDnaDiffs(q+gapSize, t, solidSize); int bestGapPos = -1; int leastDiffs = diffs; int gapPos; for (gapPos = 0; gapPos < mysterySize; ++gapPos) { DNA qq = q[gapPos]; DNA tt = t[gapPos]; if (qq != tt) diffs += 1; if (q[gapPos+gapSize] != tt) diffs -= 1; if (diffs < leastDiffs) { leastDiffs = diffs; bestGapPos = gapPos; } } *retLeastDiffs = leastDiffs; *retBestGapPos = bestGapPos; } static void slideDeletion2(DNA *q, DNA *t, int gapSize, int solidSize, int mysterySize, int *retBestGapPos, int *retLeastDiffs) /* Slide deletion through all possible positions and return best position and number * of differences there. */ { int diffs = countDnaDiffs(q, t+gapSize, solidSize); int bestGapPos = -1; int leastDiffs = diffs; int gapPos; for (gapPos = 0; gapPos < mysterySize; ++gapPos) { DNA qq = q[gapPos]; DNA tt = t[gapPos]; if (qq != tt) diffs += 1; if (qq != t[gapPos+gapSize]) diffs -= 1; if (diffs < leastDiffs) { leastDiffs = diffs; bestGapPos = gapPos; } } *retLeastDiffs = leastDiffs; *retBestGapPos = bestGapPos; } static void extendMaybeOutputNoIndexIndels(struct splatHit *hit, struct alignContext *c) /* Handle case where the 12-mer and the 6-mer in the index both hit without any * indication of an indel. In the case where the missing quadrant is the first or * last though there could still be an indel in one of these quadrands. */ { int tOffset = hit->tOffset; int origDiffCount = hit->subCount; int diffCount = origDiffCount; int quad = hit->missingQuad; struct dnaSeq *qSeq = c->qSeq; DNA *qDna = qSeq->dna + c->tagPosition; DNA *tDna = c->splix->allDna + tOffset; switch (hit->missingQuad) { /* The ?'s in the comments below indicate areas that have not been explored in the index. * The _'s indicate areas that have been explored in the index. * The singleton ? may actually represent 0 up to maxGap bases. */ case 0: /* Explore ? ?????? ______ ______ ______ */ diffCount += countDnaDiffs(qDna, tDna, 6+maxGap); break; case 1: /* Explore ? ______ ?????? ______ ______ */ diffCount += countDnaDiffs(qDna, tDna, maxGap); diffCount += countDnaDiffs(qDna + 6 + maxGap, tDna + 6 + maxGap, 6); break; case 2: /* Explore ______ ______ ?????? ______ ? */ diffCount += countDnaDiffs(qDna+12, tDna+12, 6); diffCount += countDnaDiffs(qDna+24, tDna+24, maxGap); break; case 3: /* Explore ______ ______ ______ ?????? ? */ diffCount += countDnaDiffs(qDna+18, tDna+18, 6+maxGap); break; } if (qSeq->size > 24 && maxGap == 0) /* Make 25th base significant in all cases. */ { if (c->strand == '-') { if (qDna[-1] != tDna[-1]) ++diffCount; } else { if (qDna[24] != tDna[24]) ++diffCount; } } if (diffCount <= maxMismatch) addMatch(diffCount, c, tOffset, 0, tagSize, 0, 0, 0); else if (maxGap > 0 && origDiffCount < maxMismatch) { /* Need to explore possibilities of indels if we are on the end quadrants */ int gapSize = 1; /* May need more work here for indels bigger than a single base. */ int bestGapPosInsert, leastDiffsInsert; int bestGapPosDelete, leastDiffsDelete; if (quad == 0) { /* Figure out insertion best score and position. */ slideInsert1(qDna+gapSize, tDna+gapSize, gapSize, 6, 7, &bestGapPosInsert, &leastDiffsInsert); /* Figure out deletion best score and position. */ slideDeletion2(qDna, tDna-gapSize, gapSize, 7, 8, &bestGapPosDelete, &leastDiffsDelete); /* Output better scoring of insert or delete - delete on tie since it has one * more matching base. */ if (leastDiffsInsert < leastDiffsDelete) { if (leastDiffsInsert < maxMismatch) { int b1Size = bestGapPosInsert + 1; addMatch(leastDiffsInsert+origDiffCount, c, tOffset+gapSize, 0, b1Size, tOffset+gapSize+b1Size, b1Size+gapSize, tagSize - b1Size - gapSize); } } else { if (leastDiffsDelete < maxMismatch) { int b1Size = bestGapPosDelete + 1; addMatch(leastDiffsDelete+origDiffCount, c, tOffset-gapSize, 0, b1Size, tOffset+b1Size, b1Size, tagSize - b1Size); } } } else if (quad == 3) { /* Figure out insertion best score and position. */ int startOffset = 18; slideInsert2(qDna+startOffset, tDna+startOffset, gapSize, 6, 7, &bestGapPosInsert, &leastDiffsInsert); /* Figure out deletion best score and position. */ slideDeletion2(qDna+startOffset, tDna+startOffset, gapSize, 7, 8, &bestGapPosDelete, &leastDiffsDelete); /* Output better scoring of insert or delete - delete on tie since it has one * more matching base. */ if (leastDiffsInsert < leastDiffsDelete) { if (leastDiffsInsert < maxMismatch) { int b1Size = bestGapPosInsert + 1 + startOffset; addMatch(leastDiffsInsert+origDiffCount, c, tOffset, 0, b1Size, tOffset+b1Size, b1Size+gapSize, tagSize - b1Size - gapSize); } } else { if (leastDiffsDelete < maxMismatch) { int b1Size = bestGapPosDelete + 1 + startOffset; addMatch(leastDiffsDelete+origDiffCount, c, tOffset, 0, b1Size, tOffset+b1Size+gapSize, b1Size, tagSize - b1Size); } } } } } static void extendMaybeOutputInsert(struct splatHit *hit, struct alignContext *c) /* Handle case where there is a base inserted in the query relative to the target. */ { int tOffset = hit->tOffset; int mysteryOffset = 0; /* Offset of unexplored region in query. */ int mysterySize = 6; /* May need to adjust when have >1 size gaps, not sure. */ int gapSize = hit->gapSize; switch (hit->missingQuad) { case 1: /* Explore ______ ?????? ? ______ ______ */ mysteryOffset = 6; break; case 2: /* Explore ______ ______ ? ?????? ------ */ mysteryOffset = 12; break; default: internalErr(); break; } int bestGapPos, leastDiffs; slideInsert2(c->qSeq->dna + c->tagPosition + mysteryOffset, c->splix->allDna + tOffset + mysteryOffset, gapSize, mysterySize, mysterySize, &bestGapPos, &leastDiffs); leastDiffs += hit->subCount + 1; if (leastDiffs <= maxMismatch) { int b1Size = bestGapPos + mysteryOffset + 1; addMatch(leastDiffs, c, tOffset, 0, b1Size, tOffset+b1Size, b1Size+gapSize, tagSize - b1Size - gapSize); } } static void extendMaybeOutputDeletion(struct splatHit *hit, struct alignContext *c) /* Handle case where there is a base deleted in the query relative to the target. */ { int tOffset = hit->tOffset; int gapSize = -hit->gapSize; int bigMysteryOffset = 0, qSmallMysteryOffset = 0, tSmallMysteryOffset = 0; int bigMysterySize = 6, smallMysterySize = gapSize; int diffCount = hit->subCount; switch (hit->missingQuad) { case 1: /* Explore Q ?? ______ ????? ______ ______ */ /* Explore T ?? ______ ?????? ______ ______ */ bigMysteryOffset = 8; break; case 2: /* Explore Q ______ ______ ????? ------ ?? */ /* Explore T ______ ______ ?????? ------ ?? */ bigMysteryOffset = 12; qSmallMysteryOffset = 23; tSmallMysteryOffset = 24; break; default: internalErr(); break; } DNA *q = c->qSeq->dna + c->tagPosition; DNA *t = c->splix->allDna + tOffset; diffCount += countDnaDiffs(q + qSmallMysteryOffset, t + tSmallMysteryOffset, smallMysterySize); if (diffCount < maxMismatch) { int bestGapPos, leastDiffs; slideDeletion2(q + bigMysteryOffset, t+bigMysteryOffset, gapSize, bigMysterySize-gapSize, bigMysterySize, &bestGapPos, &leastDiffs); leastDiffs += diffCount + 1; if (leastDiffs <= maxMismatch) { int b1Size = bestGapPos + bigMysteryOffset + 1; addMatch(leastDiffs, c, tOffset, 0, b1Size, tOffset+b1Size+gapSize, b1Size, tagSize - b1Size); } } } static void extendMaybeOutput(struct splatHit *hit, struct alignContext *c) /* Callback function for tree traverser that extends alignments and if good, outputs them. */ { verbose(3, " tOffset=%d gapSize=%d subCount=%d missingQuad=%d\n", hit->tOffset, hit->gapSize, hit->subCount, hit->missingQuad); if (hit->gapSize == 0) { extendMaybeOutputNoIndexIndels(hit, c); } else if (hit->gapSize > 0) { extendMaybeOutputInsert(hit, c); } else { extendMaybeOutputDeletion(hit, c); } } void splatHitsOneStrand(struct dnaSeq *qSeq, bits64 bases25, char strand, int tagPosition, struct splix *splix, int maxGap, struct lm *lm, struct splatHit **pHitList) /* Look through index for hits to query tag on one strand. * Input: * qSeq - entire query sequence, reverse complimented if on - strand * strand - either + or - * tagPosition - start position within query to pass against index. * splixMinQuerySize + maxGap long. * splix - the index to search * maxGap - the maximum insertion or deletion size * A list of splat hits allocated on lm. */ { /* Convert DNA to binary format */ int firstHalf, secondHalf; bits16 after1, after2, before1, before2; bits16 firstHex, lastHex, twoAfterFirstHex, twoBeforeLastHex; dnaToBinary(qSeq->dna + tagPosition, maxGap, &firstHalf, &secondHalf, &after1, &after2, &before1, &before2, &firstHex, &lastHex, &twoAfterFirstHex, &twoBeforeLastHex); int secondHalfPos = -12 - maxGap; searchExact(splix, firstHalf, after1, 2, 0, 0, 0, 3, lm, pHitList); if (!exactOnly) { searchExact(splix, firstHalf, after2, 3, 0, 0, 0, 2, lm, pHitList); searchExact(splix, secondHalf, before1, 0, secondHalfPos, 0, 0, 1, lm, pHitList); searchExact(splix, secondHalf, before2, 1, secondHalfPos, 0, 0, 0, lm, pHitList); } if (maxMismatch > 1) { searchVary12Exact6(splix, firstHalf, after1, 2, 0, 0, 3, lm, pHitList); searchVary12Exact6(splix, firstHalf, after2, 3, 0, 0, 2, lm, pHitList); searchVary12Exact6(splix, secondHalf, before1, 0, secondHalfPos, 0, 1, lm, pHitList); searchVary12Exact6(splix, secondHalf, before2, 1, secondHalfPos, 0, 0, lm, pHitList); searchExact12Vary6(splix, firstHalf, after1, 2, 0, 0, 3, lm, pHitList); searchExact12Vary6(splix, firstHalf, after2, 3, 0, 0, 2, lm, pHitList); searchExact12Vary6(splix, secondHalf, before1, 0, secondHalfPos, 0, 1, lm, pHitList); searchExact12Vary6(splix, secondHalf, before2, 1, secondHalfPos, 0, 0, lm, pHitList); } /* Look at single base indels with few mismatches */ if (maxGap > 0) { searchExact(splix, secondHalf, twoAfterFirstHex, 0, secondHalfPos-maxGap, 0, -1, 1, lm, pHitList); searchExact(splix, secondHalf, firstHex, 0, secondHalfPos+maxGap, 0, 1, 1, lm, pHitList); searchExact(splix, firstHalf, twoBeforeLastHex, 3, 0, 0, -1, 2, lm, pHitList); searchExact(splix, firstHalf, lastHex, 3, 0, 0, 1, 2, lm, pHitList); if (maxMismatch > 1) { searchExact12Vary6(splix, secondHalf, twoAfterFirstHex, 0, secondHalfPos-maxGap, -1, 1, lm, pHitList); searchExact12Vary6(splix, secondHalf, firstHex, 0, secondHalfPos+maxGap, 1, 1, lm, pHitList); searchExact12Vary6(splix, firstHalf, twoBeforeLastHex, 3, 0, -1, 2, lm, pHitList); searchExact12Vary6(splix, firstHalf, lastHex, 3, 0, 1, 2, lm, pHitList); searchVary12Exact6(splix, secondHalf, twoAfterFirstHex, 0, secondHalfPos-maxGap, -1, 1, lm, pHitList); searchVary12Exact6(splix, secondHalf, firstHex, 0, secondHalfPos+maxGap, 1, 1, lm, pHitList); searchVary12Exact6(splix, firstHalf, twoBeforeLastHex, 3, 0, -1, 2, lm, pHitList); searchVary12Exact6(splix, firstHalf, lastHex, 3, 0, 1, 2, lm, pHitList); } } } void extendHitsToTags(struct splatHit *hitList, struct dnaSeq *qSeq, char strand, int tagPosition, struct splix *splix, struct lm *lm, struct splatTag **pTagList) /* Examine each of the hits in the hitTree, and add ones that are high scoring enough * after extension to the tagList. If bestOnly is set it may free existing tags on list if * it finds a higher scoring tag. */ { struct splatHit *hit; struct alignContext ac; ZeroVar(&ac); ac.splix = splix; ac.pTagList = pTagList; ac.qSeq = qSeq; ac.strand = strand; ac.tagPosition = tagPosition; ac.lm = lm; for (hit = hitList; hit != NULL; hit = hit->next) extendMaybeOutput(hit, &ac); } static struct splatTag *removeDupeTags(struct splatTag *tagList) /* Return list with duplicate tags removed. As a side effect sort list. */ { struct splatTag *tag, *next, *newList = NULL; slSort(&tagList, splatTagCmp); for (tag = tagList; tag != NULL; tag = next) { next = tag->next; if (next == NULL || splatTagCmp(&tag, &next) != 0) slAddHead(&newList, tag); } slReverse(&newList); return newList; } static int findLeastDivergence(struct splatTag *tagList) { struct splatTag *tag; int leastDivergence = tagList->divergence; for (tag = tagList->next; tag != NULL; tag = tag->next) { if (tag->divergence < leastDivergence) leastDivergence = tag->divergence; } return leastDivergence; } static struct splatTag *splatTagFilterOnDivergence(struct splatTag *tagList, int maxDivergence) /* Remove (and free) tags with more than maxDivergence from list. */ { struct splatTag *tag, *next, *newList = NULL; for (tag = tagList; tag != NULL; tag = next) { next = tag->next; if (tag->divergence <= maxDivergence) slAddHead(&newList, tag); } slReverse(&newList); return newList; } static void splatOneStrand(struct dnaSeq *qSeq, bits64 bases25, char strand, int tagPosition, struct splix *splix, int maxGap, struct lm *lm, struct splatTag **pTagList) /* Align one query strand against index, filter, and write out results */ { struct splatHit *hitList = NULL; splatHitsOneStrand(qSeq, bases25, strand, tagPosition, splix, maxGap, lm, &hitList); struct splatTag *tagList = NULL; int hitCount = slCount(hitList); verbose(2, " %d hits on %c strand\n", hitCount, strand); if (hitCount > 0) { extendHitsToTags(hitList, qSeq, strand, tagPosition, splix, lm, &tagList); /* Since this is faster than removing dupes, and will often trim list a lot, do * it now, even though we have to do it again when we have data on both strands. */ if (tagList != NULL && !worseToo) tagList = splatTagFilterOnDivergence(tagList, findLeastDivergence(tagList) ); /* Some duplicate tags may have come through. This also will filter the tags * by chromosome position. */ tagList = removeDupeTags(tagList); *pTagList = slCat(tagList, *pTagList); } } static void splatOne(struct dnaSeq *qSeqF, struct splix *splix, int maxGap, struct axtScoreScheme *scoreScheme, FILE *f, int *retMapCount, boolean *retIsRepeat) /* Align one query sequence against index, filter, and write out results. * Returns the number of mappings. */ { /* Local memory pool for hits and tags for this sequence. */ struct lm *lm = lmInit(1024*4); /* Make reverse complement of query too. */ struct dnaSeq *qSeqR = cloneDnaSeq(qSeqF); reverseComplement(qSeqR->dna, qSeqR->size); struct splatTag *tagList = NULL; int size = qSeqF->size; int desiredSize = tagSize; int outputCount = 0; boolean isRepeat = FALSE; if (size < desiredSize) { warn("%s is %d bases, minimum splat query size %d, skipping", qSeqF->name, tagSize, qSeqF->size); return; } bits64 bases25f = basesToBits64(qSeqF->dna, 25); boolean isRepeatingOver = overCheck(bases25f, overArraySize, overArray); if (!isRepeatingOver) { splatOneStrand(qSeqF, bases25f, '+', 0, splix, maxGap, lm, &tagList); int tagPosition = qSeqR->size - desiredSize; bits64 bases25r = basesToBits64(qSeqR->dna + tagPosition, 25); isRepeatingOver = overCheck(bases25r, overArraySize, overArray); if (!isRepeatingOver) splatOneStrand(qSeqR, bases25r, '-', tagPosition, splix, maxGap, lm, &tagList); } verbose(2, " %d tags\n", slCount(tagList)); if (isRepeatingOver) { if (repeatOutputFile != NULL) faWriteNext(repeatOutputFile, qSeqF->name, qSeqF->dna, qSeqF->size); isRepeat = TRUE; } else if (tagList != NULL) { /* Find least divergence. */ /* Unless doing worseToo remove tags that are not best scoring */ if (!worseToo) tagList = splatTagFilterOnDivergence(tagList, findLeastDivergence(tagList) ); /* Count up mappings, and output either to repeat file or to mapping file. */ outputCount = slCount(tagList); hits25 += outputCount; isRepeat = (outputCount > maxRepeat); if (isRepeat) { if (repeatOutputFile != NULL) faWriteNext(repeatOutputFile, qSeqF->name, qSeqF->dna, qSeqF->size); outputCount = 0; } else { struct splatAlign *aliList = splatExtendTags(tagList, qSeqF, qSeqR, splix, scoreScheme); hitsFull += slCount(aliList); slSort(&aliList, splatAlignCmpScore); splatOutList(aliList, out, qSeqF, qSeqR, splix, f); splatAlignFreeList(&aliList); } #ifdef SOON #endif /* SOON */ } dnaSeqFree(&qSeqR); lmCleanup(&lm); *retMapCount = outputCount; *retIsRepeat = isRepeat; } void splat(char *target, char *query, char *output) /* splat - Speedy Local Alignment Tool. */ { struct axtScoreScheme *scoreScheme = axtScoreSchemeSimpleDna(2, 2, 2, 2); struct dnaLoad *qLoad = dnaLoadOpen(query); if (over != NULL) { overRead(over, maxRepeat+1, &overArraySize, &overArray); } struct splix *splix = splixRead(target, memoryMap); verboseTime(1, "Loaded %s", target); FILE *f = mustOpen(output, "w"); if (repeatOutput != NULL) repeatOutputFile = mustOpen(repeatOutput, "w"); splatOutHeader(target, query, out, f); struct dnaSeq *qSeq; int uniqCount = 0, totalMap = 0, totalRepeat = 0, totalReads = 0; while ((qSeq = dnaLoadNext(qLoad)) != NULL) { verbose(2, "Processing %s\n", qSeq->name); toUpperN(qSeq->dna, qSeq->size); int mapCount = 0; boolean isRepeat = FALSE; splatOne(qSeq, splix, maxGap, scoreScheme, f, &mapCount, &isRepeat); verbose(2, " %d mappings\n", mapCount); if (mapCount > 0) { ++totalMap; if (mapCount == 1) ++uniqCount; } if (isRepeat) ++totalRepeat; dnaSeqFree(&qSeq); ++totalReads; } verboseTime(1, "Alignment"); /* Report statistics (to stderr) */ verbose(1, "Overall results for mapping %d reads in %s to %s\n", totalReads, query, target); int missCount = totalReads - totalMap - totalRepeat; verbose(1, "%d (%5.2f%%) did not map\n", missCount, 100.0 * missCount/totalReads); verbose(1, "%d (%5.2f%%) mapped more than %d places\n", totalRepeat, 100.0 * totalRepeat / totalReads, maxRepeat); if (maxRepeat >= 2) { int multiCount = totalMap - uniqCount; verbose(1, "%d (%5.2f%%) mapped between 2 and %d places\n", multiCount, 100.0 * multiCount / totalReads, maxRepeat); } verbose(1, "%d (%5.2f%%) mapped uniquely\n", uniqCount, 100.0 * uniqCount / totalReads); verbose(1, "%ld index queries, %ld hits12, %ld hits18, %ld hits25, %ld hitsFull\n", exactIndexQueries, hits12, hits18, hits25, hitsFull); /* Clean up. */ splixFree(&splix); dnaLoadClose(&qLoad); carefulClose(&f); carefulClose(&repeatOutputFile); } int main(int argc, char *argv[]) /* Process command line. */ { verboseTimeInit(); optionInit(&argc, argv, options); if (argc != 4) usage(); over = optionVal("over", over); out = optionVal("out", out); maxDivergence = optionInt("maxDivergence", maxDivergence); worseToo = optionExists("worseToo"); maxRepeat = optionInt("maxRepeat", maxRepeat); repeatOutput = optionVal("repeatOutput", NULL); maxGap = optionInt("maxGap", maxGap); maxMismatch = optionInt("maxMismatch", maxMismatch); exactOnly = (maxGap == 0 && maxMismatch == 0); tagSize = maxGap + splixMinQuerySize; memoryMap = optionExists("mmap"); if (maxGap > 1) errAbort("Sorry, for now maxGap must be just 0 or 1"); dnaUtilOpen(); splat(argv[1], argv[2], argv[3]); return 0; }