/* Copyright 1999-2003 Jim Kent. All rights reserved. */ /* fuzzyFind - searches a large DNA sequence (the haystack) for * a smaller DNA sequence (the needle). What makes this tricky * is that the match need not be exact. * * The algorithm looks for exact matches to (typically ten) evenly * spaced subsequences of the needle long enough for the match to * occur only once or less by chance in the haystack. If * in fact they occur more than 3 times, the subsequence is * extended by a base. * * Next these ten-mers (called "tiles" in the code) * are extended as far as possible exactly. Each tile at * this point may match the haystack in multiple places. * * Next the "weave" routines try to find a good * way to link the tiles together. Good is scored * by a routine which awards a point for a matching * base, and subtracts the log-base-two of gaps. It * penalizes an additionallly if a tile is * placed before instead of after a previous tile. * * Now that a preliminary alignment exists, the algorithm * searches for exact matches between tiles. These matches * are constrained to be long enough that they only occur * once in the space between tiles. These matches are * extended as far as they can be exactly, and added as * further tiles to the alignment list. * * Finally the tiles are extended inexactly until they * abutt each other, or until even inexact extension is * fruitless. * * The inexact extension is perhaps the trickiest part of the * algorithm. It's implemented in expandLeft and expandRight. * In these the extension is first taken as far as it can go * exactly. Then if four of the next five bases match it's * taken five bases further. Then a break is generated, and the * next significant match between the needle and haystack is * scanned for. */ #include "common.h" #include "dnautil.h" #include "localmem.h" #include "errabort.h" #include "fuzzyFind.h" #include "obscure.h" /* ffMemory routines - * FuzzyFinder allocates memory internally from the * following routines, which allocate blocks from the * main allocator and then dole them out. * * At the end of fuzzy-finding, the actual alignment * is copied to more permanent memory, and all the * blocks freed. This saves from having to remember * to free every little thing, and also is faster. * * If memory can't be allocated this will throw * back to the root of the fuzzy-finder system. * * (In typical usage this only allocates about * as much memory as the size of the DNA * you're scanning though.) * */ /* debugging stuff. */ void dumpFf(struct ffAli *left, DNA *needle, DNA *hay); /* settable parameter, defaults to constant value */ static jmp_buf ffRecover; static void ffAbort() /* Abort fuzzy finding. */ { longjmp(ffRecover, -1); } static struct lm *ffMemPool = NULL; static void ffMemInit() /* Initialize fuzzyFinder local memory system. */ { ffMemPool = lmInit(2048); } static void ffMemCleanup() /* Free up fuzzyFinder local memory system. */ { lmCleanup(&ffMemPool); } static void *ffNeedMem(size_t size) /* Allocate from fuzzyFinder local memory system. */ { return lmAlloc(ffMemPool, size); } static void makeFreqTable(DNA *dna, int dnaSize, double freq[4]) { int histo[4]; double total; int i; dnaBaseHistogram(dna, dnaSize, histo); total = histo[0] + histo[1] + histo[2] + histo[3]; if (total == 0) total = 1; for (i=0; i<4; ++i) freq[i] = (double)histo[i] / total; } static double oligoProb(DNA *oligo, int size, double freq[4]) { double prob = 1.0; int i; int baseVal; for (i=0; i= 0) prob *= freq[baseVal]; } return prob; } static boolean findImprobableOligo(DNA *needle, int needleLength, double maxProb, double freq[4], DNA **rOligo, int *rOligoLength, double *rOligoProb) /* Find an oligo that's got less than maxProb probability of matching to * random DNA with the given base frequency distribution. */ { int i; double totalProb = 1.0; int base; int startIx = 0; for (i=0;i maxSkip) diagSize = maxSkip; /* Scan diagonally... */ /* 0 1 2 3 1 2 3 4 2 3 4 5 3 4 5 6 */ for (i=1; i<=diagSize; ++i) { int hOff = i; int nOff = 0; int hDiff = hOff - haySize; matchSize = gapPenalty + digitsBaseTwo(i); if (hDiff > 0) { nOff += hDiff; hOff -= hDiff; } for (;hOff >= 0; --hOff, ++nOff) { int needleLeft = needleSize - nOff; int hayLeft = haySize - hOff; if (matchSize > needleLeft) break; if (matchSize > hayLeft) continue; if (ne[-nOff-1] == he[-hOff-1] && memcmp(ne-nOff-matchSize, he-hOff-matchSize, matchSize) == 0) { ali->nStart = ne - nOff - matchSize; ali->nEnd = ne - nOff; ali->hStart = he - hOff - matchSize; ali->hEnd = he- hOff; return TRUE; } } } return FALSE; } static boolean rightNextMatch(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, int gapPenalty, int maxSkip) /* Scan to the right for something that matches the next bit of the needle * in the haystack. */ { int haySize = he - hs; int needleSize = ne - ns; int diagSize = haySize + needleSize; int matchSize; int i; /* We take care of bigger skips on the "tile" level. */ if (diagSize > maxSkip) diagSize = maxSkip; /* Scan diagonally... */ /* 0 1 2 3 1 2 3 4 2 3 4 5 3 4 5 6 */ for (i=1; i<=diagSize; ++i) { int hOff = i; int nOff = 0; int hDiff = hOff - haySize; matchSize = gapPenalty + digitsBaseTwo(i); if (hDiff > 0) { nOff += hDiff; hOff -= hDiff; } for (;hOff >= 0; --hOff, ++nOff) { int needleLeft = needleSize - nOff; int hayLeft = haySize - hOff; if (matchSize > needleLeft) break; if (matchSize > hayLeft) continue; if (ns[nOff] == hs[hOff] && memcmp(ns+nOff, hs+hOff, matchSize) == 0) { ali->nStart = ns + nOff; ali->nEnd = ns + nOff + matchSize; ali->hStart = hs + hOff; ali->hEnd = hs + hOff + matchSize; return TRUE; } } } return FALSE; } static boolean expandRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip); static boolean expandLeft(struct ffAli *ali, DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip) /* Given a matching segment, try to expand the aligned parts to the * right. */ { DNA *ns = ali->nStart; DNA *hs = ali->hStart; int score; DNA *oldNs = ns; for (;;) { while (ns > needleStart && hs > hayStart && ns[-1] == hs[-1]) { --hs; --ns; } if (ns <= needleStart || hs <= hayStart) { ali->nStart = ns; ali->hStart = hs; return ns != oldNs; } /* Find fuzzy score to the left. */ { int windowSize = 5; int nSize = ns-needleStart; int hSize = hs-hayStart; if (windowSize > nSize) windowSize = nSize; if (windowSize > hSize) windowSize = hSize; if (windowSize > 0) score = dnaScoreMatch(ns-windowSize, hs-windowSize, windowSize); else score = -1; if (score >= windowSize-2) { hs -= windowSize; ns -= windowSize; } else if (--numSkips >= 0) { struct ffAli *newAli = ffNeedMem(sizeof(*newAli)); ali->nStart = ns; ali->hStart = hs; if (ns - needleStart < 3 || !leftNextMatch(newAli, needleStart, ns, hayStart, hs, gapPenalty, maxSkip)) { return ns != oldNs; /* We did our best... */ } newAli->right = ali; newAli->left = ali->left; if (ali->left) ali->left->right = newAli; ali->left = newAli; ali = newAli; expandRight(ali, needleStart, ns, hayStart, hs, 0, gapPenalty, maxSkip); ns = ali->nStart; hs = ali->hStart; } else { ali->nStart = ns; ali->hStart = hs; return ns != oldNs; } } } } static boolean expandRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip) /* Given a matched segment, try to expand it to the right. */ { int score; DNA *ne = ali->nEnd; DNA *he = ali->hEnd; DNA *oldNe = ne; for (;;) { /* Expand as far to the right as you can keeping perfect match */ while (ne < needleEnd && he < hayEnd && *ne == *he) { ++he; ++ne; } /* Check to see if have come to the end. */ if (ne >= needleEnd || he >= hayEnd) { ali->nEnd = ne; ali->hEnd = he; return oldNe != ne; } /* Find fuzzy score to the right. */ { int windowSize = 5; int nSize = needleEnd-ne; int hSize = hayEnd - he; if (windowSize > nSize) windowSize = nSize; if (windowSize > hSize) windowSize = hSize; if (windowSize > 0) score = dnaScoreMatch(ne, he, windowSize); else score = -1; if (score >= windowSize-2) { he += windowSize; ne += windowSize; } else if (--numSkips >= 0) { struct ffAli *newAli = ffNeedMem(sizeof(*newAli)); ali->nEnd = ne; ali->hEnd = he; if (needleEnd - ne < 3 || !rightNextMatch(newAli, ne, needleEnd, he, hayEnd, gapPenalty, maxSkip)) { return oldNe != ne; /* We did our best... */ } newAli->left = ali; newAli->right = ali->right; if (ali->right) ali->right->left = newAli; ali->right = newAli; ali = newAli; expandLeft(ali, ne, needleEnd, he, hayEnd, 0, gapPenalty, maxSkip); ne = ali->nEnd; he = ali->hEnd; } else { ali->nEnd = ne; ali->hEnd = he; return oldNe != ne; } } } } static boolean exactFind(DNA *needle, int nSize, DNA *hay, int hSize, int *hOffset) /* Look for exact match of needle in haystack. */ { DNA firstC = needle[0]; DNA *restOfNeedle = needle+1; int i; int endIx = hSize - nSize; int restSize = nSize-1; for (i = 0; i<=endIx; ++i) { if (firstC == hay[i]) { if (memcmp(restOfNeedle, hay+i+1, restSize) == 0) { *hOffset = i; return TRUE; } } } *hOffset = -1; return FALSE; } #ifdef UNUSED static int countAlis(struct ffAli *ali) /* Count number of blocks in alignment. */ { int count = 0; if (ali == NULL) return 0; while (ali->left) ali = ali->left; while (ali) { count += 1; ali = ali->right; } return count; } #endif /* UNUSED */ static struct ffAli *reconsiderAlignedGaps(struct ffAli *ali) /* If the gap between two blocks is the same in both needle and * haystack, see if we're actually better off with the two * blocks merged together. */ { struct ffAli *a = NULL; struct ffAli *left = NULL; if (ali == NULL) return NULL; a = ali; for (;;) { int gapSize; /* Advance to next ali */ left = a; a = a->right; if (a == NULL) break; gapSize = a->nStart - left->nEnd; if (gapSize == a->hStart - left->hEnd) { int gapScore; int matchScore; gapScore = -ffCdnaGapPenalty(left, a); matchScore = dnaScoreMatch(left->nEnd, left->hEnd, gapSize); if (matchScore > gapScore) { /* Make current cover left. RemoveEmpty will take * care of empty shell of left later. */ a->hStart = left->hEnd = left->hStart; a->nStart = left->nEnd = left->nStart; } } } return ali; } static struct ffAli *trimAlis(struct ffAli *aliList) /* If ends of an ali don't match trim them off. */ { struct ffAli *ali; int trimCount = 0; for (ali = aliList; ali != NULL; ali = ali->right) { while (ali->nStart[0] != ali->hStart[0] && ali->nStart < ali->nEnd) { ali->nStart += 1; ali->hStart += 1; ++trimCount; } while (ali->nEnd[-1] != ali->hEnd[-1] && ali->nEnd > ali->nStart) { ali->nEnd -= 1; ali->hEnd -= 1; ++trimCount; } } return aliList; } static int extendThroughN; /* Can extend through blocks of N's? */ void setFfExtendThroughN(boolean val) /* Set whether or not can extend through N's. */ { extendThroughN = val; } boolean expandThroughNRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd) /* Expand through up to three N's to the left. */ { DNA *nEnd = ali->nEnd; DNA *hEnd = ali->hEnd; DNA n,h; boolean expanded = FALSE; while (nEnd < needleEnd && hEnd < hayEnd) { n = *nEnd; h = *hEnd; if ((n == h) || (n == 'n' && (extendThroughN || nEnd + 3 >= needleEnd || nEnd[1] != 'n' || nEnd[2] != 'n' || nEnd[3] != 'n')) || (h == 'n' && (extendThroughN || hEnd + 3 >= hayEnd || hEnd[1] != 'n' || hEnd[2] != 'n' || hEnd[3] != 'n'))) { nEnd += 1; hEnd += 1; expanded = TRUE; } else break; } ali->nEnd = nEnd; ali->hEnd = hEnd; return expanded; } boolean expandThroughNLeft(struct ffAli *ali, DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd) /* Expand through up to three N's to the left. */ { DNA *nStart = ali->nStart-1; DNA *hStart = ali->hStart-1; DNA n,h; boolean expanded = FALSE; while (nStart >= needleStart && hStart >= hayStart) { n = *nStart; h = *hStart; if ((n == h) || (n == 'n' && (extendThroughN || nStart - 3 < needleStart || nStart[-1] != 'n' || nStart[-2] != 'n' || nStart[-3] != 'n')) || (h == 'n' && (extendThroughN || hStart - 3 < hayStart || hStart[-1] != 'n' || hStart[-2] != 'n' || hStart[-3] != 'n'))) { nStart -= 1; hStart -= 1; expanded = TRUE; } else break; } ali->nStart = nStart + 1; ali->hStart = hStart + 1; return expanded; } static struct ffAli *expandAlis(struct ffAli *ali, DNA *nStart, DNA *nEnd, DNA *hStart, DNA *hEnd, int gapPenalty, int maxSkip) /* Expand alignment to cover in-between tiles as well. */ { struct ffAli *a, *left, *right; DNA *ns, *ne, *hs, *he; boolean expanded = TRUE; while (expanded) { expanded = FALSE; /* Loop through expanding three times. */ /* First just expand through N's that don't require an insertion/deletion. */ for (a = ali; a != NULL; a = a->right) { if ((left = a->left) != NULL) { ns = left->nEnd; hs = left->hEnd; } else { ns = nStart; hs = hStart; } if ((right = a->right) != NULL) { ne = right->nStart; he = right->hStart; } else { ne = nEnd; he = hEnd; } expanded |= expandThroughNLeft(a, ns, ne, hs, he); expanded |= expandThroughNRight(a, ns, ne, hs, he); } /* Second do other expansion that doesn't require an insertion/deletion. */ for (a = ali; a != NULL; a = a->right) { if ((left = a->left) != NULL) { ns = left->nEnd; hs = left->hEnd; } else { ns = nStart; hs = hStart; } if ((right = a->right) != NULL) { ne = right->nStart; he = right->hStart; } else { ne = nEnd; he = hEnd; } expanded |= expandLeft(a, ns, ne, hs, he, 0, gapPenalty, maxSkip); expanded |= expandRight(a, ns, ne, hs, he, 0, gapPenalty, maxSkip); } /* Finally do insertion/deletion. */ for (a = ali; a != NULL; a = a->right) { if ((left = a->left) != NULL) { ns = left->nEnd; hs = left->hEnd; } else { ns = nStart; hs = hStart; } if ((right = a->right) != NULL) { ne = right->nStart; he = right->hStart; } else { ne = nEnd; he = hEnd; } expanded |= expandLeft(a, ns, ne, hs, he, 1, gapPenalty, maxSkip); expanded |= expandRight(a, ns, ne, hs, he, 1, gapPenalty, maxSkip); } while (ali->left) ali = ali->left; } return ali; } DNA *matchInMem(DNA *ns, DNA *ne, DNA *hs, DNA *he) { int nLen = ne - ns; DNA c = *ns++; he -= nLen; nLen -= 1; while (hs < he) { if (*hs++ == c && memcmp(ns, hs, nLen) == 0) { return hs-1; } } return NULL; } struct ffAli *findAliBetween(DNA *tile, int tileSize, DNA *ns, DNA *ne, DNA *hs, DNA *he) { DNA *match; DNA *tileEnd = tile + tileSize; int tries = 0; for (;;) { if ((match = matchInMem(tile, tileEnd, hs, he)) == NULL) return NULL; if (matchInMem(tile, tileEnd, match+1, he) == NULL) { /* Got exactly one match, whoopie! */ struct ffAli *ali = ffNeedMem(sizeof(*ali)); ali->nStart = tile; ali->nEnd = tileEnd; ali->hStart = match; ali->hEnd = match + (tileEnd-tile); ffExpandExactLeft(ali, ns, hs); ffExpandExactRight(ali, ne, he); return ali; } /* Got more than one match. Expand tile to make it more specific. * In general first add base to start, then base to beginning. If * at beginning or end already are constrained. If already spanning * full needle, can't expand, so you're done. */ if (tile <= ns) { if (tileEnd >= ne) return NULL; ++tileEnd; } else if (tile >= tileEnd) { --tile; } else if (tries & 1) { ++tileEnd; } else { --tile; } ++tries; } } double evalExactAli(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, int numTiles, double freq[4]) { int haySize = he-hs; double prob = 1.0; double allPossibles = haySize*numTiles; for (;ali != NULL;ali=ali->right) { double p = oligoProb(ali->nStart, ali->nEnd - ali->nStart, freq) * allPossibles; if (p < 1.0) prob *= p; } return prob; } void ffUnlink(struct ffAli **pList, struct ffAli *el) /* Unlink element from doubly linked list. If leftmost * element update list pointer. */ { struct ffAli *list = *pList; struct ffAli *right = el->right; struct ffAli *left = el->left; if (el == list) /* If first element need to update list. */ *pList = right; if (right != NULL) right->left = left; if (left != NULL) left->right = right; el->left = el->right = NULL; } void unlinkAli(struct ffAli **pList, struct ffAli *ali) /* Unlink ali from list. */ { ffUnlink(pList, ali); } struct protoGene { struct protoGene *left; struct protoGene *right; struct ffAli *hits; DNA *hStart, *hEnd, *nStart, *nEnd; int score; }; int cmpProtoSize(const void *va, const void *vb) /* Sort biggest size first. */ { const struct protoGene *a = *((struct protoGene **)va); const struct protoGene *b = *((struct protoGene **)vb); int aSize, bSize; aSize = a->nEnd - a->nStart; bSize = b->nEnd - b->nStart; return (bSize - aSize); } int cmpProtoNeedle(const void *va, const void *vb) /* Sort smallest offset into needle first. */ { const struct protoGene *a = *((struct protoGene **)va); const struct protoGene *b = *((struct protoGene **)vb); return (a->nStart - b->nStart); } int cmpProtoScore(const void *va, const void *vb) /* Sort biggest score first. */ { const struct protoGene *a = *((struct protoGene **)va); const struct protoGene *b = *((struct protoGene **)vb); return (b->score - a->score); } boolean canAdd(struct protoGene *a, struct protoGene *b) /* Can b fit into a? It can if it doesn't overlap * by more than 1/4 with what's already there. */ { struct ffAli *pa; DNA *bnEnd = b->nEnd; DNA *bnStart = b->nStart; DNA *bhEnd = b->hEnd; DNA *bhStart = b->hStart; int bSize = bnEnd - bnStart; int maxOverlap; for (pa = a->hits; pa != NULL; pa = pa->right) { int aSize = pa->nEnd - pa->nStart; DNA *start = (bnStart > pa->nStart ? bnStart : pa->nStart); DNA *end = (bnEnd < pa->nEnd ? bnEnd : pa->nEnd); maxOverlap = (aSize < bSize ? aSize : bSize) / 4; if (maxOverlap < 2) maxOverlap = 2; if (end - start >= maxOverlap) return FALSE; start = (bhStart > pa->hStart ? bhStart : pa->hStart); end = (bhEnd < pa->hEnd ? bhEnd : pa->hEnd); if (end - start >= maxOverlap) return FALSE; } return TRUE; } boolean bestMerger(struct protoGene *list, enum ffStringency stringency, DNA *ns, DNA *hs, struct protoGene **retA, struct protoGene **retB) /* Figure out best scoring merger in list. */ { int bestScore = -0x7fffffff; int score; struct protoGene *bestA = NULL, *bestB = NULL; struct protoGene *a, *b; boolean isCdna = (stringency == ffCdna); if (list == NULL) return FALSE; for (a = list; a != NULL; a = a->right) { for (b = a->right; b != NULL; b = b->right) { if (canAdd(a,b)) { int hGap = b->hStart - a->hEnd; int nGap = b->nStart - a->nEnd; if (hGap < 0) { hGap = -8*hGap; if (!isCdna || hGap < 32) hGap = (hGap*hGap); } if (nGap < 0) nGap = -8*nGap; score = -hGap - nGap*nGap; if (score > bestScore) { bestA = a; bestB = b; bestScore = score; } } } } *retA = bestA; *retB = bestB; return bestA != NULL; } void mergeProtoGenes(struct protoGene **pList, struct protoGene *a, struct protoGene *b) /* Absorb protoGene b into protoGene a. */ { ffUnlink((struct ffAli**)pList, (struct ffAli *)b); ffCat(&a->hits, &b->hits); if (a->hStart > b->hStart) a->hStart = b->hStart; if (a->nStart > b->nStart) a->nStart = b->nStart; if (a->hEnd < b->hEnd) a->hEnd = b->hEnd; if (a->nEnd < b->nEnd) a->nEnd = b->nEnd; } void lumpProtoGenes(struct protoGene **pList, DNA *ns, DNA *hs, enum ffStringency stringency) /* Lump together as many protogenes as we can. */ { struct protoGene *a, *b; while (bestMerger(*pList, stringency, ns, hs, &a, &b)) { mergeProtoGenes(pList, a, b); } } struct protoGene *lumpHits(struct ffAli **pHitList, DNA *ns, DNA *hs) /* Lump together as many hits as can. Criteria - they must be close * to same "diagonal." That is the distance between them must be * nearly the same in the needle and the haystack. */ { struct protoGene proto; struct protoGene *retProto; struct ffAli *ali, *nextAli; int dif, lastDif; int lumpCount = 1; if ((ali = *pHitList) == NULL) return NULL; /* Move first hit onto our crude exon. */ nextAli = ali->right; proto.left = proto.right = NULL; unlinkAli(pHitList, ali); proto.hits = ali; proto.hStart = ali->hStart; proto.hEnd = ali->hEnd; proto.nStart = ali->nStart; proto.nEnd = ali->nEnd; proto.score = 0; lastDif = ali->hStart - ali->nStart; /* Go through rest of list and put others that are close to * on the same diagonal onto this exon. */ while ((ali = nextAli) != NULL) { nextAli = ali->right; dif = ali->hStart - ali->nStart; if (lastDif - 2 <= dif && dif <= lastDif + 2) { lastDif = dif; unlinkAli(pHitList, ali); ali->left = proto.hits; proto.hits = ali; proto.hEnd = ali->hEnd; proto.nEnd = ali->nEnd; ++lumpCount; } } proto.hits = ffMakeRightLinks(proto.hits); retProto = ffNeedMem(sizeof(*retProto)); memcpy(retProto, &proto, sizeof(*retProto)); return retProto; } #ifdef OLD void removeThrowbackHits(struct protoGene *proto) /* Remove hits that cause backtracking in hay coordinates. */ { struct ffAli *left, *right; boolean gotThrowback = FALSE; right = proto->hits; for (;;) { left = right; right = right->right; if (right == NULL) break; if (left->hStart > right->hStart) { right->hStart = right->hEnd = left->hEnd; right->nStart = right->nEnd = left->nEnd; gotThrowback = TRUE; } } if (gotThrowback) proto->hits = ffRemoveEmptyAlis(proto->hits, FALSE); } #endif /* OLD */ void thinProtoList(struct protoGene **pList, int maxToTake) /* Reduce list to only top scoring members. At this * point protoList is singly linked on left. */ { struct protoGene *newList = NULL, *pg, *next; int i; slSort(pList, cmpProtoScore); for (pg=*pList, i=0; pg != NULL && i < maxToTake; pg = next, ++i) { next = pg->left; pg->left = newList; newList = pg; } *pList = newList; } static struct ffAli *weaveAli(struct ffAli *hitList, DNA *ns, DNA *ne, DNA *hs, DNA *he, double freq[4], int *rBestVal, enum ffStringency stringency) /* Weave together best looking alignment out of the hitList list. */ { struct ffAli *ali, *lastAli; struct protoGene *proto, *protoList = NULL; int protoCount = 0; /* Sort initial hits and remove duplicates. */ ffAliSort(&hitList, ffCmpHitsHayFirst); ali = hitList; for (;;) { lastAli = ali; if ((ali = ali->right) == NULL) break; if (ali->hStart == lastAli->hStart && ali->hEnd == lastAli->hEnd && ali->nStart == lastAli->nStart && ali->nEnd == lastAli->nEnd) { unlinkAli(&hitList, lastAli); } } /* Lump together things which look to be separated only by small * amounts of noise. */ while ((proto = lumpHits(&hitList, ns, hs)) != NULL) { proto->left = protoList; protoList = proto; ++protoCount; } if (protoCount > 200) { thinProtoList(&protoList, 200); } protoList = (struct protoGene *)ffMakeRightLinks((struct ffAli*)protoList); /* Lump together any other ones that can, starting with ones * that go together best. */ ffAliSort((struct ffAli**)(void*)&protoList, cmpProtoNeedle); lumpProtoGenes(&protoList, ns, hs, stringency ); for (proto = protoList; proto != NULL; proto = proto->right) { ffAliSort((struct ffAli**)&proto->hits, ffCmpHitsNeedleFirst); /* removeThrowbackHits(proto); */ proto->score = ffScore(proto->hits,stringency); } /* Sort by score and return the best. */ ffAliSort((struct ffAli **)(void*)&protoList, cmpProtoScore); *rBestVal = protoList->score; return protoList->hits; } /* This set of variables is set before calling the recursive tile finders - the * below two routines. */ static double rwFreq[4]; static boolean rwIsCdna; static boolean rwCheckGoodEnough; static struct ffAli *rwFindTilesBetween(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency, double probMax) /* Search for more or less regularly spaced exact matches that are * in the right order. */ { int searchOffset; int endTileOffset; int haySize = he - hs; int needleSize = ne - ns; int numTiles = 0; int bestWeaveVal; int tileIx; struct ffAli *hitList = NULL; struct ffAli *bestAli; struct ffAli *ali; int possibleTiles; double tileProbOne; possibleTiles = (haySize - nextPowerOfFour(needleSize)); if (possibleTiles < 1) possibleTiles = 1; tileProbOne = probMax/possibleTiles; searchOffset = 0; endTileOffset = 0; tileIx = 0; for (;;) { DNA *tile; int tileSize; double tileProb; DNA *h = hs; searchOffset = endTileOffset; if (!ffFindGoodOligo(ns+searchOffset, needleSize-searchOffset, tileProbOne, rwFreq, &tile, &tileSize, &tileProb)) { break; } /* Find all parts that match the tile. */ for (;;) { if ((h = matchInMem(tile, tile+tileSize, h, he)) == NULL) break; ali = ffNeedMem(sizeof(*ali)); ali->hStart = h; ali->hEnd = ali->hStart + tileSize; ali->nStart = tile; ali->nEnd = tile + tileSize; ali->left = hitList; hitList = ali; h += tileSize; } endTileOffset = tile + tileSize - ns; ++numTiles; } if (hitList == NULL) return NULL; hitList = ffMakeRightLinks(hitList); for (ali = hitList; ali != NULL; ali = ali->right) { ffExpandExactLeft(ali, ns, hs); ffExpandExactRight(ali, ne, he); } bestAli = weaveAli(hitList, ns, ne, hs, he, rwFreq, &bestWeaveVal, stringency); if (rwCheckGoodEnough) { double prob; prob = evalExactAli(bestAli, ns, ne, hs, he, numTiles, rwFreq); if (prob > 0.1) return NULL; rwCheckGoodEnough = FALSE; } return bestAli; } static struct ffAli *exactAli(DNA *ns, DNA *ne, DNA *hs, DNA *he) /* Return alignment based on exact match. */ { int exactOffset; if (exactFind(ns, ne-ns, hs, he-hs, &exactOffset)) { struct ffAli *ali = ffNeedMem(sizeof(*ali)); ali->nStart = ns; ali->nEnd = ne; ali->hStart = hs + exactOffset; ali->hEnd = ali->hStart + (ne - ns); if (ali->hEnd > he) ali->hEnd = he; return ali; } else return NULL; } static struct ffAli *recursiveWeave(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency, double probMax, int level, int orientation) /* Find a set of tiles, then recurse to find set of tiles between the tiles * at somewhat lower stringency. */ { struct ffAli *left = NULL, *right = NULL, *aliList; if ((left = exactAli(ns, ne, hs, he)) != NULL) return left; if (stringency == ffExact) return NULL; aliList = rwFindTilesBetween(ns, ne, hs, he, stringency, probMax); if (aliList != NULL) { DNA *lne, *rns, *lhe, *rhs; int ndif, hdif; right = aliList; if (orientation == 0) orientation = ffIntronOrientation(aliList); for (;;) { /* Figure out the end points to recurse to. */ if (left != NULL) { lne = left->nEnd; lhe = left->hEnd; } else { lne = ns; lhe = hs; } if (right != NULL) { rns = right->nStart; rhs = right->hStart; } else { rns = ne; rhs = he; } ndif = rns-lne; hdif = rhs-lhe; /* If a big enough gap left recurse. */ if (ndif >= 5 && hdif >= 5) { struct ffAli *newLeft = NULL, *newRight; newLeft = recursiveWeave(lne, rns, lhe, rhs, stringency, probMax*2, level+1, orientation); if (newLeft != NULL) { /* Insert new tiles between left and right. */ /* Find right end of tile set. */ newRight = newLeft; while (newRight->right != NULL) newRight = newRight->right; if (left != NULL) { left->right = newLeft; newLeft->left = left; } else { aliList = newLeft; } if (right != NULL) { right->left = newRight; newRight->right = right; } } } if ((left = right) == NULL) break; right = right->right; } } return aliList; } static struct ffAli *findWovenTiles(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency) { struct ffAli *bestAli; int haySize = he - hs; int needleSize = ne - ns; static double tileStrinProbMult[] = { 0.0001, 0.0005, 0.0005, 0.5, }; /* exact cDNA tight loose */ if (needleSize < 2 || haySize < 2) /* Be serious man! */ return NULL; /* Set up rwVariables - essentially locals except that they don't change over the course * of the recursion. */ makeFreqTable(hs, haySize, rwFreq); rwIsCdna = (stringency == ffCdna); rwCheckGoodEnough = (stringency == ffTight || stringency == ffCdna); bestAli = recursiveWeave(ns, ne, hs, he, stringency, tileStrinProbMult[stringency], 1, 0); return bestAli; } static struct ffAli *findBestAli(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency) { static int iniExpGapPen[] = {0 /* (exact) */, 4 /* cDna */, 4 /* tight */, 4 /* loose */}; static int addExpGapPen[] = {0 /* (exact) */, 3 /* cDna */, 3 /* tight */, 3 /* loose */}; static int midTileMinSize[] ={0 /*(exact) */,12 /* cDNA */, 12 /* tight */, 4 /* loose */}; struct ffAli *bestAli; int matchSize; matchSize = nextPowerOfFour(he-hs)+1; if (matchSize < midTileMinSize[stringency]) matchSize = midTileMinSize[stringency]; bestAli = findWovenTiles(ns, ne, hs, he, stringency); if (bestAli == NULL) return NULL; bestAli = ffMergeNeedleAlis(bestAli, FALSE); bestAli = expandAlis(bestAli,ns,ne,hs,he,iniExpGapPen[stringency], 1); bestAli = ffMergeNeedleAlis(bestAli, FALSE); bestAli = expandAlis(bestAli,ns,ne,hs,he,addExpGapPen[stringency], 2*matchSize); bestAli = trimAlis(bestAli); bestAli = ffMergeNeedleAlis(bestAli, FALSE); bestAli = ffMergeHayOverlaps(bestAli); bestAli = reconsiderAlignedGaps(bestAli); bestAli = ffRemoveEmptyAlis(bestAli, FALSE); bestAli = ffMergeNeedleAlis(bestAli, FALSE); if (stringency == ffCdna) ffSlideIntrons(bestAli); bestAli = ffRemoveEmptyAlis(bestAli, FALSE); return bestAli; } static struct ffAli *saveAliToPermanentMem(struct ffAli *volatileAli) /* Save alignment to memory that doesn't get thrown away. */ { struct ffAli *leftList = NULL; struct ffAli *newAli, *ali; struct ffAli *rightList = NULL; /* Allocate memory, copy contents and set left pointer. */ for (ali = volatileAli; ali != NULL; ali = ali->right) { newAli = needMem(sizeof(*newAli)); if (newAli == NULL) { slFreeList(leftList); ffAbort(); } memcpy(newAli, ali, sizeof(*newAli)); newAli->left = leftList; leftList = newAli; } /* Set right pointer. */ for (ali = leftList; ali != NULL; ali = ali->left) { ali->right = rightList; rightList = ali; } return rightList; } struct ffAli *ffFind(DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd, enum ffStringency stringency) /* Return an alignment of needle in haystack. */ { struct ffAli *bestAli; int status; assert(needleStart <= needleEnd); assert(hayStart <= hayEnd); ffMemInit(); dnaUtilOpen(); /* Set up error recovery. */ status = setjmp(ffRecover); if (status == 0) /* Always true except after long jump. */ { pushAbortHandler(ffAbort); bestAli = findBestAli(needleStart, needleEnd, hayStart, hayEnd, stringency); ffCountGoodEnds(bestAli); bestAli = saveAliToPermanentMem(bestAli); } else /* They long jumped here because of an error. */ { bestAli = NULL; } popAbortHandler(); ffMemCleanup(); return bestAli; } boolean ffFindAndScore(DNA *needle, int needleSize, DNA *haystack, int haySize, enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle, int *pScore) /* Return TRUE if find an allignment using needle, or reverse complement of * needle to search haystack. DNA must be lower case. If pScore is non-NULL returns * score of alignment. */ { int fScore, rScore; struct ffAli *fAli, *rAli; /* Get forward alignment and score it. */ fAli = ffFind(needle, needle+needleSize, haystack, haystack+haySize, stringency); fScore = ffScore(fAli,stringency); /* Get reverse alignment and score it. */ reverseComplement(needle, needleSize); rAli = ffFind(needle, needle+needleSize, haystack, haystack+haySize, stringency); rScore = ffScore(rAli,stringency); reverseComplement(needle, needleSize); /* If no good alignment on either strand return FALSE. */ if (fAli == NULL && rAli == NULL) { *pAli = NULL; return FALSE; } /* Return TRUE with best alignment. Free the other one. */ if (fScore > rScore) { *pAli = fAli; *pRcNeedle = FALSE; if (pScore != NULL) *pScore = fScore; ffFreeAli(&rAli); } else { *pAli = rAli; *pRcNeedle = TRUE; if (pScore != NULL) *pScore = rScore; ffFreeAli(&fAli); } return TRUE; } boolean ffFindEitherStrandN(DNA *needle, int needleSize, DNA *haystack, int haySize, enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle) /* Return TRUE if find an allignment using needle, or reverse complement of * needle to search haystack. DNA must be lower case. */ { return ffFindAndScore(needle, needleSize, haystack, haySize, stringency, pAli, pRcNeedle, NULL); } boolean ffFindEitherStrand(DNA *needle, DNA *haystack, enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle) /* Return TRUE if find an alignment using needle, or reverse complement of * needle to search haystack. DNA must be lower case. Needle and haystack * are zero terminated. */ { return ffFindEitherStrandN(needle, strlen(needle), haystack, strlen(haystack), stringency, pAli, pRcNeedle); }