/* * Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) * * Released under the MIT license, see LICENSE.txt */ #include "sonLib.h" #include #include #include #include #include "pairwiseAligner.h" #include "sonLib.h" #include "pairwiseAlignment.h" #ifdef _OPENMP static const int32_t ThreadChunkSize = 150; static inline int32_t getNumThreads(int32_t arraySize) { int32_t numThreads = omp_get_max_threads(); int32_t maxThreads = arraySize / ThreadChunkSize; if (maxThreads < numThreads && maxThreads > 0) return maxThreads; return numThreads; } #endif /* * Basic Pecan HMM code used by Cactus base aligner. */ /// //Sequence stuff // static char convertChar(char i) { switch (i) { case 'A': case 'a': return 0; case 'C': case 'c': return 1; case 'G': case 'g': return 2; case 'T': case 't': return 3; default: return 4; } } char *convertSequence(const char *s, int32_t sL) { assert(sL >= 0); assert(strlen(s) == sL); char *cS = st_malloc((sL + 1) * sizeof(char)); for (int32_t i = 0; i < sL; i++) { cS[i] = convertChar(s[i]); } cS[sL] = '\0'; return cS; } /// //Basic math /// #define logZero -INFINITY #define logUnderflowThreshold 7.5 #define posteriorMatchThreshold 0.01 static inline double lookup(double x) { //return log (exp (x) + 1); #ifdef BEN_DEBUG assert (x >= 0.00f); assert (x <= logUnderflowThreshold); #endif if (x <= 1.00f) return ((-0.009350833524763f * x + 0.130659527668286f) * x + 0.498799810682272f) * x + 0.693203116424741f; if (x <= 2.50f) return ((-0.014532321752540f * x + 0.139942324101744f) * x + 0.495635523139337f) * x + 0.692140569840976f; if (x <= 4.50f) return ((-0.004605031767994f * x + 0.063427417320019f) * x + 0.695956496475118f) * x + 0.514272634594009f; return ((-0.000458661602210f * x + 0.009695946122598f) * x + 0.930734667215156f) * x + 0.168037164329057f; } double logAdd(double x, double y) { if (x < y) return (x == logZero || y - x >= logUnderflowThreshold) ? y : lookup(y - x) + x; return (y == logZero || x - y >= logUnderflowThreshold) ? x : lookup(x - y) + y; } /* * Calculates the addition of two log numbers then assigns them to the pointer x. */ static inline void logAddAndAssign(double *x, double y) { *x = logAdd(*x, y); } //// //State stuff. //// static const int32_t cellNo = 5; static void checkState(int32_t state) { #ifdef BEN_DEBUG assert(state >= 0); assert(state < cellNo); #endif } static void checkPosition(int32_t z, int32_t zL) { #ifdef BEN_DEBUG assert(z >= 1); assert(z < zL); #endif } #define matchContinueTransition -0.030064059121770816 //0.9703833696510062f #define gapOpenShortTransition -4.34381910900448 //0.0129868352330243 #define gapOpenLongTransition -6.30810595366929 //(1.0 - match - 2*gapOpenShort)/2 = 0.001821479941473 #define gapShortExtendTransition -0.3388262689231553 //0.7126062401851738f; #define gapShortSwitchTransition -4.910694825551255 //0.0073673675173412815f; #define matchFromShortGapTransition -1.272871422049609 //1.0 - gapExtend - gapSwitch = 0.280026392297485 #define gapLongExtendTransition -0.003442492794189331 //0.99656342579062f; #define matchFromLongGapTransition -5.673280173170473 //1.0 - gapExtend = 0.00343657420938 static inline double transitionProb(int32_t from, int32_t to) { checkState(from); checkState(to); static const double transitions[25] = { /*Match */matchContinueTransition, matchFromShortGapTransition, matchFromShortGapTransition, matchFromLongGapTransition, matchFromLongGapTransition, /*To shortGapX */gapOpenShortTransition, gapShortExtendTransition, gapShortSwitchTransition, logZero, logZero, /*To shortGapY */gapOpenShortTransition, gapShortSwitchTransition, gapShortExtendTransition, logZero, logZero, /*To longGapX */gapOpenLongTransition, logZero, logZero, gapLongExtendTransition, logZero, /*To longGapY */gapOpenLongTransition, logZero, logZero, logZero, gapLongExtendTransition }; return transitions[to * cellNo + from]; } static inline int32_t getTransitionOffSetX(int32_t state) { checkState(state); static int32_t offsets[] = { 1, 1, 0, 1, 0 }; return offsets[state]; } static inline int32_t getTransitionOffSetY(int32_t state) { checkState(state); static int32_t offsets[] = { 1, 0, 1, 0, 1 }; return offsets[state]; } #define gapEmission -1.6094379124341003 //log(0.2) = -1.6094379124341003 #define matchEmission -2.1149196655034745 //log(0.12064298095701059); #define transversionEmission -4.5691014376830479 //log(0.010367271172731285); #define transitionEmission -3.9833860032220842 //log(0.01862247669752685); #define matchNEmission -3.2188758248682006 //log(0.04); static inline double emissionProb(int32_t x, int32_t y, int32_t lX, int32_t lY, const char *sX, const char *sY, int32_t state) { checkState(state); static const double gapM[5] = { gapEmission, gapEmission, gapEmission, gapEmission, gapEmission }; static const double matchM[25] = { matchEmission, transversionEmission, transitionEmission, transversionEmission, matchNEmission, transversionEmission, matchEmission, transversionEmission, transitionEmission, matchNEmission, transitionEmission, transversionEmission, matchEmission, transversionEmission, matchNEmission, transversionEmission, transitionEmission, transversionEmission, matchEmission, matchNEmission, matchNEmission, matchNEmission, matchNEmission, matchNEmission, matchNEmission }; switch (state) { case 0: checkPosition(x, lX); checkPosition(y, lY); return matchM[sX[x - 1] * 5 + sY[y - 1]]; case 1: case 3: checkPosition(x, lX); return gapM[(int32_t) sX[x - 1]]; case 2: case 4: checkPosition(y, lY); return gapM[(int32_t) sY[y - 1]]; default: assert(0); return 0.0; } } static inline double startStateProbs(int32_t state) { checkState(state); //static const double startProb = -1.0986122886681098; //math.log(1.0/3.0) = -1.0986122886681098 static const double startStates[5] = { matchContinueTransition, gapOpenShortTransition, gapOpenShortTransition, gapOpenLongTransition, gapOpenLongTransition }; return startStates[state]; } static inline double endStateProbs(int32_t state) { checkState(state); static const double endProb = -1.6094379124341; //math.log(1.0/5.0) = -1.6094379124341 double endStates[5] = { endProb, endProb, endProb, endProb, endProb }; return endStates[state]; } ///// //Forward matrix ///// static inline double *getCell(double *m, int32_t x, int32_t y, int32_t lX) { if (x >= 0 && y >= 0) { return &(m[(y * lX + x) * cellNo]); } return NULL; } static inline double *getEmptyMatrix(int32_t lX, int32_t lY) { int32_t j = lX * lY * cellNo; double *m = st_malloc(j * sizeof(double)); for (int32_t i = 0; i < j; i++) { m[i] = logZero; } return m; } static double *initialiseForwardMatrix(int32_t lX, int32_t lY) { double *fM = getEmptyMatrix(lX, lY); double *cell = getCell(fM, 0, 0, lX); assert(cell != NULL); for (int32_t i = 0; i < cellNo; i++) { cell[i] = startStateProbs(i); } return fM; } static inline void forwardCell(double *fM, int32_t x, int32_t y, int32_t lX, int32_t lY, const char *sX, const char *sY) { double *cell = getCell(fM, x, y, lX); for (int32_t to = 0; to < cellNo; to++) { double *pCell = getCell(fM, x - getTransitionOffSetX(to), y - getTransitionOffSetY(to), lX); if (pCell != NULL) { double eP = emissionProb(x, y, lX, lY, sX, sY, to); for (int32_t from = 0; from < cellNo; from++) { logAddAndAssign(&cell[to], pCell[from] + transitionProb(from, to) + eP); } } } } double *forwardMatrix(int32_t lX, int32_t lY, const char *sX, const char *sY) { double *fM = initialiseForwardMatrix(lX, lY); // all '/' diagonals intersectinog x = 0 axis for (int32_t x = 0; x < lX; ++x) { int32_t diagLen = (lY < x + 1) ? lY : x + 1; #pragma omp parallel default(shared) if(diagLen >= ThreadChunkSize) num_threads(getNumThreads(diagLen)) { #pragma omp for schedule(static) for (int32_t y = 0; y < diagLen; ++y) { forwardCell(fM, x - y, y, lX, lY, sX, sY); } } } // all '/' diagonals intersecting y = lY - 1 axis for (int32_t y = 1; y < lY; ++y) { int32_t diagLen = (lX < lY - y) ? lX : lY - y; #pragma omp parallel default(shared) if(diagLen >= ThreadChunkSize) num_threads(getNumThreads(diagLen)) { #pragma omp for schedule(static) for (int32_t i = 0; i < diagLen; ++i) { forwardCell(fM, lX - 1 - i, y + i, lX, lY, sX, sY); } } } return fM; } double totalForwardProb(double *fM, int32_t lX, int32_t lY) { double *cell = getCell(fM, lX - 1, lY - 1, lX); assert(cell != NULL); double totalProb = endStateProbs(0) + cell[0]; for (int32_t i = 1; i < cellNo; i++) { logAddAndAssign(&totalProb, endStateProbs(i) + cell[i]); } return totalProb; } ///// //Backward matrix ///// static double *initialiseBackwardMatrix(int32_t lX, int32_t lY) { double *bM = getEmptyMatrix(lX, lY); double *cell = getCell(bM, lX - 1, lY - 1, lX); assert(cell != NULL); for (int32_t i = 0; i < cellNo; i++) { cell[i] = endStateProbs(i); } return bM; } static inline void backwardCell(double *bM, int32_t x, int32_t y, int32_t lX, int32_t lY, const char *sX, const char *sY) { double *cell = getCell(bM, x, y, lX); for (int32_t to = 0; to < cellNo; to++) { int32_t x2 = x + getTransitionOffSetX(to); int32_t y2 = y + getTransitionOffSetY(to); if (x2 < lX && y2 < lY) { double *pCell = getCell(bM, x2, y2, lX); double eP = emissionProb(x2, y2, lX, lY, sX, sY, to); for (int32_t from = 0; from < cellNo; from++) { logAddAndAssign(&cell[from], pCell[to] + transitionProb(from, to) + eP); } } } } double *backwardMatrix(int32_t lX, int32_t lY, const char *sX, const char *sY) { double *bM = initialiseBackwardMatrix(lX, lY); // all '/' diagonals intersecting y = lY - 1 axis for (int32_t y = lY - 1; y >= 1; --y) { int32_t diagLen = (lX < lY - y) ? lX : lY - y; #pragma omp parallel default(shared) if(diagLen >= ThreadChunkSize) num_threads(getNumThreads(diagLen)) { #pragma omp for schedule(static) for (int32_t i = 0; i < diagLen; ++i) { backwardCell(bM, lX - 1 - i, y + i, lX, lY, sX, sY); } } } // all '/' diagonals intersecting x = 0 axis for (int32_t x = lX - 1; x >= 0; --x) { int32_t diagLen = (lY < x + 1) ? lY : x + 1; #pragma omp parallel default(shared) if(diagLen >= ThreadChunkSize) num_threads(getNumThreads(diagLen)) { #pragma omp for schedule(static) for (int32_t y = 0; y < diagLen; ++y) { backwardCell(bM, x - y, y, lX, lY, sX, sY); } } } return bM; } double totalBackwardProb(double *fM, int32_t lX) { double *cell = getCell(fM, 0, 0, lX); double totalProb = startStateProbs(0) + cell[0]; for (int32_t i = 1; i < cellNo; i++) { logAddAndAssign(&totalProb, startStateProbs(i) + cell[i]); } return totalProb; } ///// //Posterior probabilities ///// static inline double posteriorMatchProb(double *fM, double *bM, int32_t x, int32_t y, int32_t lX, int32_t lY, const char *sX, const char *sY, double totalProb) { int32_t to = 0; double *pCell = getCell(fM, x - 1, y - 1, lX); assert(pCell != NULL); double *cell = getCell(bM, x, y, lX); double eP = emissionProb(x, y, lX, lY, sX, sY, to); int32_t from = 0; double f = pCell[from] + transitionProb(from, to) + eP + cell[to]; for (from = 1; from < cellNo; from++) { logAddAndAssign(&f, pCell[from] + transitionProb(from, to) + eP + cell[to]); } double p = exp(f - totalProb); #ifdef BEN_DEBUG /*if(p < -0.01 || p > 1.01) { st_uglyf("I got a bad position, %i %i %f\n", x, y, p); }*/ //assert(p >= -0.01 && p < 1.01); #endif return p; } static void getPosteriorProbs(double *fM, double *bM, int32_t lX, int32_t lY, const char *sX, const char *sY, stList *alignedPairs, double totalProb, PairwiseAlignmentParameters *p) { stList** alignedPairMatrix = (stList**)st_malloc(lX * lY * sizeof(stList*)); #pragma omp parallel default(shared) if(lX >= ThreadChunkSize) num_threads(getNumThreads(lX)) { #pragma omp for schedule(static) for (int32_t x = 1; x < lX; x++) { alignedPairMatrix[x] = stList_construct(); for (int32_t y = 1; y < lY; y++) { double f = posteriorMatchProb(fM, bM, x, y, lX, lY, sX, sY, totalProb); if (f >= posteriorMatchThreshold) { if (f > 1.0) { f = 1.0; } #ifdef BEN_DEBUG assert(sX[x-1] >= 0 && sX[x-1] <= 4); assert(sY[y-1] >= 0 && sY[y-1] <= 4); #endif if(p->alignAmbiguityCharacters || (sX[x-1] < 4 && sY[y-1] < 4)) { stIntTuple *alignedPair = stIntTuple_construct(3, (int32_t) floor(f * PAIR_ALIGNMENT_PROB_1), x - 1, y - 1); stList_append(alignedPairMatrix[x], alignedPair); } } } } } for (int32_t x = 1; x < lX; x++) { stList_appendAll(alignedPairs, alignedPairMatrix[x]); stList_destruct(alignedPairMatrix[x]); } free(alignedPairMatrix); } /////// //Maximal expected accuracy alignment /////// stList *getAlignedPairs(const char *sX, const char *sY, PairwiseAlignmentParameters *p) { //Allocate the matrices. int32_t lX = strlen(sX) + 1; int32_t lY = strlen(sY) + 1; char *cSX = convertSequence(sX, lX - 1); char *cSY = convertSequence(sY, lY - 1); double *fM; double *bM; //#pragma omp parallel default(shared) { //#pragma omp sections { //#pragma omp section { bM = backwardMatrix(lX, lY, cSX, cSY); } //#pragma omp section { fM = forwardMatrix(lX, lY, cSX, cSY); } } } double totalFProb = totalForwardProb(fM, lX, lY); double totalBProb = totalBackwardProb(bM, lX); double totalProb = (totalFProb + totalBProb) / 2; double diff = (totalFProb - totalBProb) / totalProb; if (diff < 0.0) { diff = -diff; } //Check they are about the same. #ifdef BEN_DEBUG assert(diff < 0.0001); #endif //Get posterior probabilities above 0.01 threshold. stList *alignedPairs = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct); getPosteriorProbs(fM, bM, lX, lY, cSX, cSY, alignedPairs, totalProb, p); //Cleanup free(fM); free(bM); free(cSX); free(cSY); return alignedPairs; } char *getSubString(const char *cA, int32_t start, int32_t length) { char *cA2 = memcpy(st_malloc(sizeof(char) * (length + 1)), cA + start, length); cA2[length] = '\0'; #ifdef BEN_DEBUG for(int32_t i=0; i l ? 1 : (k < l ? -1 : 0); } stList *getBlastPairs(const char *sX, const char *sY, int32_t lX, int32_t lY, int32_t trim) { /* * Uses lastz to compute a bunch of monotonically increasing pairs such that for any pair of consecutive pairs in the list * (x1, y1) (x2, y2) in the set of aligned pairs x1 appears before x2 in X and y1 appears before y2 in Y. */ stList *alignedPairs = stList_construct3(0, (void (*)(void *))stIntTuple_destruct); //the list to put the output in if(lX == 0 || lY == 0) { return alignedPairs; } //Write to file.. char *tempFile1 = getTempFile(); char *tempFile2 = getTempFile(); char *tempFile3 = getTempFile(); FILE *fileHandle = fopen(tempFile1, "w"); fprintf(fileHandle, ">a\n%s\n", sX); fclose(fileHandle); fileHandle = fopen(tempFile2, "w"); fprintf(fileHandle, ">b\n%s\n", sY); fclose(fileHandle); //Run lastz int32_t exitValue = st_system("lastz --hspthresh=800 --chain --strand=plus --gapped --format=cigar --ambiguous=iupac %s %s > %s", tempFile1, tempFile2, tempFile3); //int32_t exitValue = st_system("lastz --hspthresh=800 --maxwordcount=90%% --chain --strand=plus --gapped --format=cigar --ambiguous=iupac %s[unmask] %s[unmask] > %s", tempFile1, tempFile2, tempFile3); assert(exitValue == 0); //Read from file.. fileHandle = fopen(tempFile3, "r"); struct PairwiseAlignment *pA; while((pA = cigarRead(fileHandle)) != NULL) { int32_t j = pA->start1; int32_t k = pA->start2; assert(strcmp(pA->contig1, "a") == 0); assert(strcmp(pA->contig2, "b") == 0); assert(pA->strand1); assert(pA->strand2); for (int32_t i = 0; i < pA->operationList->length; i++) { struct AlignmentOperation *op = pA->operationList->list[i]; if (op->opType == PAIRWISE_MATCH) { for(int32_t l=trim; llength-trim; l++) { stList_append(alignedPairs, stIntTuple_construct(2, j+l, k+l)); } } if (op->opType != PAIRWISE_INDEL_Y) { j += op->length; } if (op->opType != PAIRWISE_INDEL_X) { k += op->length; } } assert(j == pA->end1); assert(k == pA->end2); destructPairwiseAlignment(pA); } fclose(fileHandle); stList_sort(alignedPairs, sortByXPlusYCoordinate); //Ensure the coordinates are increasing //Remove old files st_system("rm %s %s %s", tempFile1, tempFile2, tempFile3); free(tempFile1); free(tempFile2); free(tempFile3); return alignedPairs; } stList *filterPairsToGetAnchorPoints(stList *alignedPairs, int32_t minRectangleSize, int32_t lX, int32_t lY) { /* * Filters the blast pairs so that the are spaced with dp matrices of at least minRectangle size between them. */ int32_t x = -1; int32_t y = -1; stList *filteredPairs = stList_construct(); for(int32_t i=0; i x && y2 > y) { //This should never occur but deals with an error in lastz, I think.... if(((int64_t)x2 - x - 1) * (y2 - y - 1) >= minRectangleSize && ((int64_t)lX - x2 - 1) * (lY - y2 - 1) >= minRectangleSize) { stList_append(filteredPairs, alignedPair); x = x2; y = y2; } } } assert(x < lX); assert(y < lY); return filteredPairs; } static void convertPairs(stList *alignedPairs2, int32_t offsetX, int32_t offsetY) { /* * Convert the coordinates of the computed pairs. */ for (int32_t k = 0; k < stList_length(alignedPairs2); k++) { stIntTuple *i = stList_get(alignedPairs2, k); assert(stIntTuple_length(i) == 3); stList_set(alignedPairs2, k, stIntTuple_construct(3, stIntTuple_getPosition(i, 0), stIntTuple_getPosition(i, 1) + offsetX, stIntTuple_getPosition(i, 2) + offsetY)); stIntTuple_destruct(i); } } static stList *getAlignedPairs_Split(char *sX, char *sY, int32_t lX, int32_t lY, int32_t bandSize, PairwiseAlignmentParameters *p) { /* * Aligns the sequences, but if the product of there sequence lengths is greater than bandSize squared * then a dp matrix of bandsize squared if computed in the top left part of the entire dp matrix, and a * corresponding square in the bottom right part of the matrix. */ if((int64_t)lX * lY <= (int64_t)bandSize * bandSize) { //products can be > 2^31 return getAlignedPairs(sX, sY, p); } st_logDebug("We found an overlarge matrix to compute: %i %i \n", lX, lY); if(lX > bandSize) { char *sX2 = getSubString(sX, 0, bandSize); stList *alignedPairs = getAlignedPairs_Split(sX2, sY, bandSize, lY, bandSize, p); free(sX2); sX2 = getSubString(sX, lX-bandSize, bandSize); stList *alignedPairs2 = getAlignedPairs_Split(sX2, sY, bandSize, lY, bandSize, p); free(sX2); convertPairs(alignedPairs2, lX-bandSize, 0); stList_appendAll(alignedPairs, alignedPairs2); while(stList_length(alignedPairs2) > 0) { //empty and destroy the second list. stList_pop(alignedPairs2); } stList_destruct(alignedPairs2); return alignedPairs; } assert(lY > bandSize); char *sY2 = getSubString(sY, 0, bandSize); stList *alignedPairs = getAlignedPairs(sX, sY2, p); free(sY2); sY2 = getSubString(sY, lY-bandSize, bandSize); stList *alignedPairs2 = getAlignedPairs(sX, sY2, p); free(sY2); convertPairs(alignedPairs2, 0, lY-bandSize); stList_appendAll(alignedPairs, alignedPairs2); while(stList_length(alignedPairs2) > 0) { //empty and destroy the second list. stList_pop(alignedPairs2); } stList_destruct(alignedPairs2); return alignedPairs; } static int getAlignedPairs_FastP(const void *i, const void *j) { int64_t k = stIntTuple_getPosition((stIntTuple *)i, 1) + stIntTuple_getPosition((stIntTuple *)i, 2); int64_t l = stIntTuple_getPosition((stIntTuple *)j, 1) + stIntTuple_getPosition((stIntTuple *)j, 2); return k > l ? 1 : (k < l ? -1 : 0); } PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters_construct() { PairwiseAlignmentParameters *p = st_malloc(sizeof(PairwiseAlignmentParameters)); p->maxBandingSize = 3000; p->minBandingSize = 1000; p->minBandingConstraintDistance = 300; p->minTraceBackDiag = 42; p->minTraceGapDiags = 20; p->constraintDiagonalTrim = 4; p->alignAmbiguityCharacters = 0; return p; } void pairwiseAlignmentBandingParameters_destruct(PairwiseAlignmentParameters *p) { free(p); } stList *getAlignedPairs_Fast(const char *sX, const char *sY, PairwiseAlignmentParameters *p) { /* * Aligns the pair of sequences using banding constraints. */ int32_t lX = strlen(sX); int32_t lY = strlen(sY); int32_t offsetX = 0; int32_t offsetY = 0; stList *blastPairs; if((int64_t)lX * lY > (int64_t)p->minBandingSize * p->minBandingSize) { blastPairs = getBlastPairs(sX, sY, lX, lY, p->constraintDiagonalTrim); } else { blastPairs = stList_construct(); //We don't bother getting anchors if the sequences are sufficiently small. } stList *bandPairs = filterPairsToGetAnchorPoints(blastPairs, p->minBandingConstraintDistance * p->minBandingConstraintDistance, lX, lY); stListIterator *bandIt = stList_getIterator(bandPairs); stIntTuple *bandPair; st_logDebug("We got %i aligned pairs and %i filtered pairs\n", stList_length(blastPairs), stList_length(bandPairs)); stSortedSet *alignedPairs = stSortedSet_construct3((int(*)(const void *, const void *)) getAlignedPairsFast_cmpFn, NULL); bool done = 0; while (!done) { int32_t endsetX, endsetY; if((bandPair = stList_getNext(bandIt)) != NULL) { endsetX = stIntTuple_getPosition(bandPair, 0); endsetY = stIntTuple_getPosition(bandPair, 1); } else { done = 1; endsetX = lX-1; endsetY = lY-1; } st_logDebug("The next blast square, min x: %i, min y: %i, max x: %i, max y: %i, size x: %i, size y: %i\n", offsetX, offsetY, endsetX, endsetY, endsetX - offsetX, endsetY - offsetY); //Get the appropriate x substring int32_t lX2 = endsetX - offsetX + 1; char *sX2 = getSubString(sX, offsetX, lX2); //Get the appropriate y substring int32_t lY2 = endsetY - offsetY + 1; char *sY2 = getSubString(sY, offsetY, lY2); //Do the actual alignment.. stList *alignedPairs2 = getAlignedPairs_Split(sX2, sY2, lX2, lY2, p->maxBandingSize, p); //Cleanup the temporary sequences free(sX2); free(sY2); //Convert the coordinates convertPairs(alignedPairs2, offsetX, offsetY); //The diagonal bounds of the banding block int32_t startDiag = offsetX + offsetY; int32_t endDiag = startDiag + lX2 + lY2; //Now either setup the next job if there is some sequence remaining. if (offsetX + lX2 < lX || offsetY + lY2 < lY) { //We still have work to do on another round //Sort so that the coordinates are in increasing x+y coordinate order stList_sort(alignedPairs2, getAlignedPairs_FastP); int32_t traceBackDiag = endDiag - p->minTraceBackDiag; int32_t traceForwardDiag = startDiag + p->minTraceBackDiag; //(lX2 + lY2) / 2; //We require the new alignment to overlap by at most halfway from the old one. int32_t newOffsetX = offsetX, newOffsetY = offsetY, maxScore = -1; stListIterator *it = stList_getIterator(alignedPairs2); stIntTuple *i; while ((i = stList_getNext(it)) != NULL) { int32_t j = stIntTuple_getPosition(i, 1); int32_t k = stIntTuple_getPosition(i, 2); int32_t score = stIntTuple_getPosition(i, 0); int32_t diagC = j + k; if (diagC >= traceForwardDiag && diagC <= traceBackDiag && score >= 0.6 * PAIR_ALIGNMENT_PROB_1) { //traceBackCandidateThreshold //has the required score to be considered a start point. if(j + k > newOffsetX + newOffsetY) { //Should not be needed, but numerical accuracy is not perfect. if(score >= maxScore * 0.99) { maxScore = score; newOffsetX = j; newOffsetY = k; } } } } stList_destructIterator(it); if (maxScore != -1) { st_logDebug("We found an interim point x: %i y: %i prob: %f\n", newOffsetX, newOffsetY, (double)maxScore / PAIR_ALIGNMENT_PROB_1); //We can start a new alignment assert(newOffsetX > offsetX || newOffsetY > offsetY); //Update the offsets offsetX = newOffsetX; offsetY = newOffsetY; } else { //No candidate start point was found so we just stop the extension assert(newOffsetX == offsetX && newOffsetY == offsetY); st_logDebug("We failed to find an interim point, x: %i y: %i\n", newOffsetX, newOffsetY); } } //Add the pairs to the alignment (merging together any duplicate pairs) //And skip any pairs within minTraceGapDiags. while (stList_length(alignedPairs2) > 0) { stIntTuple *i = stList_pop(alignedPairs2); assert(stIntTuple_length(i) == 3); int32_t l = stIntTuple_getPosition(i, 1); int32_t m = stIntTuple_getPosition(i, 2); //is not too close the start point (or is allowed because we are at the start of the band) //and is not too close to the end point (or is allowed because we're at the end of the band) if ((startDiag == 0 || l + m >= startDiag + p->minTraceGapDiags) && (done || l + m <= endDiag - p->minTraceGapDiags)) { stIntTuple *j; if ((j = stSortedSet_search(alignedPairs, i)) != NULL) { stSortedSet_remove(alignedPairs, i); #ifdef BEN_DEBUG assert(l == stIntTuple_getPosition(j, 1)); assert(m == stIntTuple_getPosition(j, 2)); assert(stSortedSet_search(alignedPairs, i) == NULL); #endif stIntTuple *k = stIntTuple_construct(3, (stIntTuple_getPosition(i, 0) + stIntTuple_getPosition(j, 0)) / 2, l, m); stSortedSet_insert(alignedPairs, k); #ifdef BEN_DEBUG assert(stSortedSet_search(alignedPairs, i) == k); assert(stSortedSet_search(alignedPairs, j) == k); #endif stIntTuple_destruct(i); stIntTuple_destruct(j); } else { stSortedSet_insert(alignedPairs, i); #ifdef BEN_DEBUG assert(stSortedSet_search(alignedPairs, i) == i); #endif } } else { stIntTuple_destruct(i); } } stList_destruct(alignedPairs2); } stList_destructIterator(bandIt); //Convert the set to a list. stList *alignedPairs2 = stSortedSet_getList(alignedPairs); #ifdef BEN_DEBUG assert(stList_length(alignedPairs2) == stSortedSet_size(alignedPairs)); #endif stList_setDestructor(alignedPairs2, (void(*)(void *)) stIntTuple_destruct); stSortedSet_destruct(alignedPairs); stList_destruct(bandPairs); stList_destruct(blastPairs); return alignedPairs2; }