/* chainScore - score chains. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "dnaseq.h" #include "nib.h" #include "fa.h" #include "axt.h" #include "psl.h" #include "boxClump.h" #include "chainBlock.h" #include "portable.h" int minScore = 1000; char *detailsName = NULL; char *gapFileName = NULL; struct scoreData /* Data needed to score block. */ { struct dnaSeq *qSeq; /* Query sequence. */ struct dnaSeq *tSeq; /* Target sequence. */ struct axtScoreScheme *ss; /* Scoring scheme. */ double gapPower; /* Power to raise gap size to. */ }; struct scoreData scoreData; struct axtScoreScheme *scoreScheme = NULL; int interpolate(int x, int *s, double *v, int sCount) /* Find closest value to x in s, and then lookup corresponding * value in v. Interpolate where necessary. */ { int i, ds, ss; double dv; for (i=0; itName, b->tName); if (dif == 0) dif = strcmp(a->qName, b->qName); if (dif == 0) dif = (int)a->qStrand - (int)b->qStrand; return dif; } void loadIfNewSeq(char *nibDir, char *newName, char strand, char **pName, struct dnaSeq **pSeq, char *pStrand) /* Load sequence unless it is already loaded. Reverse complement * if necessary. */ { struct dnaSeq *seq; if (sameString(newName, *pName)) { if (strand != *pStrand) { seq = *pSeq; reverseComplement(seq->dna, seq->size); *pStrand = strand; } } else { char fileName[512]; freeDnaSeq(pSeq); snprintf(fileName, sizeof(fileName), "%s/%s.nib", nibDir, newName); *pName = newName; *pSeq = seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName); *pStrand = strand; if (strand == '-') reverseComplement(seq->dna, seq->size); uglyf("Loaded %d bases in %s\n", seq->size, fileName); } } void loadFaSeq(struct hash *faHash, char *newName, char strand, char **pName, struct dnaSeq **pSeq, char *pStrand) /* retrieve sequence from hash. Reverse complement * if necessary. */ { struct dnaSeq *seq; if (sameString(newName, *pName)) { if (strand != *pStrand) { seq = *pSeq; reverseComplement(seq->dna, seq->size); *pStrand = strand; } } else { char fileName[512]; *pName = newName; *pSeq = seq = hashFindVal(faHash, newName); *pStrand = strand; if (strand == '-') reverseComplement(seq->dna, seq->size); uglyf("Loaded %d bases from %s fa\n", seq->size, newName); } } void usage() /* Explain usage and exit. */ { errAbort( "chainScore - score chains\n" "usage:\n" " chainScore in.chain tNibDir qNibDir out.chain\n" "options:\n" " -minScore=N Minimum score for chain, default %d\n" " -scoreScheme=fileName Read the scoring matrix from a blastz-format file\n" " -linearGap=filename Read piecewise linear gap from tab delimited file\n" " sample linearGap file \n" "tablesize 11\n" "smallSize 111\n" "position 1 2 3 11 111 2111 12111 32111 72111 152111 252111\n" "qGap 350 425 450 600 900 2900 22900 57900 117900 217900 317900\n" "tGap 350 425 450 600 900 2900 22900 57900 117900 217900 317900\n" "bothGap 750 825 850 1000 1300 3300 23300 58300 118300 218300 318300\n" , minScore ); } static struct optionSpec options[] = { {NULL, 0}, }; int gapCost(int dq, int dt) /* Figure out gap costs. */ { if (dt < 0) dt = 0; if (dq < 0) dq = 0; if (dt == 0) { if (dq < aid.smallSize) return aid.qSmall[dq]; else if (dq >= aid.qLastPos) return aid.qLastPosVal + aid.qLastSlope * (dq-aid.qLastPos); else return interpolate(dq, aid.longPos, aid.qLong, aid.qPosCount); } else if (dq == 0) { if (dt < aid.smallSize) return aid.tSmall[dt]; else if (dt >= aid.tLastPos) return aid.tLastPosVal + aid.tLastSlope * (dt-aid.tLastPos); else return interpolate(dt, aid.longPos, aid.tLong, aid.tPosCount); } else { int both = dq + dt; if (both < aid.smallSize) return aid.bSmall[both]; else if (both >= aid.bLastPos) return aid.bLastPosVal + aid.bLastSlope * (both-aid.bLastPos); else return interpolate(both, aid.longPos, aid.bLong, aid.bPosCount); } } double scoreBlock(char *q, char *t, int size, int matrix[256][256]) /* Score block through matrix. */ { double score = 0; int i; for (i=0; iblockList; b != NULL; b = b->next) { int size = b->qEnd - b->qStart; score += scoreBlock(qSeq->dna + b->qStart, tSeq->dna + b->tStart, size, matrix); if (a != NULL) score -= gapCost(b->tStart - a->tEnd, b->qStart - a->qEnd); a = b; } return score; } void scorePair(struct seqPair *sp, struct dnaSeq *qSeq, struct dnaSeq *tSeq, struct chain **pChainList, struct chain *chainList) /* Chain up blocks and output. */ { struct chain *chain, *next; struct cBlock *b; /* Set up info for connect function. */ scoreData.qSeq = qSeq; scoreData.tSeq = tSeq; scoreData.ss = scoreScheme; scoreData.gapPower = 1.0/2.5; for (chain = chainList; chain != NULL; chain = chain->next) { chain->score = chainScore(chain, qSeq, tSeq, scoreData.ss->matrix, gapCost); } /* Move chains scoring over threshold to master list. */ for (chain = chainList; chain != NULL; chain = next) { next = chain->next; if (chain->score >= minScore) { slAddHead(pChainList, chain); } else { chainFree(&chain); } } } void doChainScore(char *chainIn, char *tNibDir, char *qNibDir, char *chainOut) { char qStrand = 0, tStrand = 0; struct dnaSeq *qSeq = NULL, *tSeq = NULL; char *qName = "", *tName = ""; FILE *f = mustOpen(chainOut, "w"); struct chain *chainList = NULL, *chain; struct chain *inputChains, *next; FILE *details = NULL; struct lineFile *lf = NULL; struct dnaSeq *seq, *seqList = NULL; struct hash *faHash = newHash(0); struct hash *chainHash = newHash(0); char comment[1024]; FILE *faF; struct seqPair *spList = NULL, *sp; struct dyString *dy = newDyString(512); struct lineFile *chainsLf = lineFileOpen(chainIn, TRUE); while ((chain = chainRead(chainsLf)) != NULL) { dyStringClear(dy); dyStringPrintf(dy, "%s%c%s", chain->qName, chain->qStrand, chain->tName); sp = hashFindVal(chainHash, dy->string); if (sp == NULL) { AllocVar(sp); slAddHead(&spList, sp); hashAddSaveName(chainHash, dy->string, sp, &sp->name); sp->qName = cloneString(chain->qName); sp->tName = cloneString(chain->tName); sp->qStrand = chain->qStrand; } slAddHead(&sp->chain, chain); } slSort(&spList, seqPairCmp); lineFileClose(&chainsLf); if (optionExists("faQ")) { faF = mustOpen(qNibDir, "r"); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) { hashAdd(faHash, seq->name, seq); slAddHead(&seqList, seq); } fclose(faF); } for (sp = spList; sp != NULL; sp = sp->next) { if (optionExists("faQ")) { assert (faHash != NULL); loadFaSeq(faHash, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); } else loadIfNewSeq(qNibDir, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); loadIfNewSeq(tNibDir, sp->tName, '+', &tName, &tSeq, &tStrand); scorePair(sp, qSeq, tSeq, &chainList, sp->chain); } slSort(&chainList, chainCmpScore); for (chain = chainList; chain != NULL; chain = chain->next) { assert(chain->qStart == chain->blockList->qStart && chain->tStart == chain->blockList->tStart); chainWrite(chain, f); } carefulClose(&f); } double calcSlope(double y2, double y1, double x2, double x1) /* Calculate slope of line from x1/y1 to x2/y2 */ { return (y2-y1)/(x2-x1); } void initGapAid(char *gapFileName) /* Initialize gap aid structure for faster gap * computations. */ { int i, tableSize, startLong = -1; char *sizeDesc[2]; char *words[128]; if (gapFileName != NULL) { struct lineFile *lf = lineFileOpen(gapFileName, TRUE); int count; lineFileNextRowTab(lf, sizeDesc, 2); tableSize = atoi(sizeDesc[1]); AllocArray(gapInitPos,tableSize); AllocArray(gapInitQGap,tableSize); AllocArray(gapInitTGap,tableSize); AllocArray(gapInitBothGap,tableSize); while (count = lineFileChopNext(lf, words, tableSize+1)) { if (sameString(words[0],"smallSize")) { aid.smallSize = atoi(words[1]); } if (sameString(words[0],"position")) { for (i=0 ; i