/* pslReps - analyse repeats and generate list of best alignments * from a genome wide, sorted by mRNA .psl alignment file. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "memalloc.h" #include "psl.h" #include "options.h" #include "obscure.h" #include "sqlNum.h" /* command line */ static struct optionSpec optionSpecs[] = { {"minAli", OPTION_FLOAT}, {"nearTop", OPTION_FLOAT}, {"minCover", OPTION_FLOAT}, {"minNearTopSize", OPTION_INT}, {"ignoreSize", OPTION_BOOLEAN}, {"sizeMatters", OPTION_BOOLEAN}, /* opposite of ignore size, for compat */ {"noIntrons", OPTION_BOOLEAN}, {"singleHit", OPTION_BOOLEAN}, {"nohead", OPTION_BOOLEAN}, {"ignoreNs", OPTION_BOOLEAN}, {"coverQSizes", OPTION_STRING}, {NULL, 0} }; double minAli = 0.93; double nearTop = 0.01; double minCover = 0.0; boolean ignoreNs = FALSE; /* Ignore the n's when calculating coverage score, other wise n's count as mismatches. */ boolean ignoreSize = FALSE; boolean noIntrons = FALSE; boolean singleHit = FALSE; boolean noHead = FALSE; boolean quiet = FALSE; int minNearTopSize = 30; char *coverQSizeFile = NULL; void usage() /* Print usage instructions and exit. */ { errAbort( "pslReps - analyse repeats and generate genome wide best\n" "alignments from a sorted set of local alignments\n" "usage:\n" " pslReps in.psl out.psl out.psr\n" "where in.psl is an alignment file generated by psLayout and\n" "sorted by pslSort, out.psl is the best alignment output\n" "and out.psr contains repeat info\n" "options:\n" " -nohead don't add PSL header\n" " -ignoreSize Will not weigh in favor of larger alignments so much\n" " -noIntrons Will not penalize for not having introns when calculating\n" " size factor\n" " -singleHit Takes single best hit, not splitting into parts\n" " -minCover=0.N minimum coverage to output. Default is 0.\n" " -ignoreNs Ignore 'N's when calculating minCover.\n" " -minAli=0.N minimum alignment ratio\n" " default is 0.93\n" " -nearTop=0.N how much can deviate from top and be taken\n" " default is 0.01\n" " -minNearTopSize=N Minimum size of alignment that is near top\n" " for alignment to be kept. Default 30.\n" " -coverQSizes=file Tab-separate file with effective query sizes.\n" " When used with -minCover, this allows polyAs\n" " to be excluded from the coverage calculation\n"); } /* hash table of query name to sizes */ struct hash* coverQSizes = NULL; void loadCoverQSizes(char* coverQSizeFile) /* load coverage query sizes */ { struct lineFile *lf = lineFileOpen(coverQSizeFile, TRUE); char *row[2]; coverQSizes = hashNew(0); while (lineFileNextRowTab(lf, row, ArraySize(row))) { int qSize = sqlSigned(row[1]); hashAdd(coverQSizes, row[0], intToPt(qSize)); } lineFileClose(&lf); } int calcMilliScore(struct psl *psl) /* Figure out percentage score. */ { return 1000-pslCalcMilliBad(psl, TRUE); } int intronFactor(struct psl *psl) /* Figure bonus for having introns. An intron is worth 3 bases... * An intron in this case is just a gap of 0 bases in query and * 30 or more in target. */ { int i, blockCount = psl->blockCount; int ts, qs, te, qe, sz; int bonus = 0; if (blockCount <= 1) return 0; sz = psl->blockSizes[0]; qe = psl->qStarts[0] + sz; te = psl->tStarts[0] + sz; for (i=1; iqStarts[i]; ts = psl->tStarts[i]; if (qs == qe && ts - te >= 30) bonus += 3; sz = psl->blockSizes[i]; qe = qs + sz; te = ts + sz; } if (bonus > 10) bonus = 10; return bonus; } int sizeFactor(struct psl *psl) /* Return a factor that will favor longer alignments. */ { int score; if (ignoreSize) return 0; score = 4*round(sqrt(psl->match + psl->repMatch/4)); if (!noIntrons) score += intronFactor(psl); return score; } int calcSizedScore(struct psl *psl) /* Return score that includes base matches and size. */ { int score = calcMilliScore(psl) + sizeFactor(psl); return score; } boolean uglyTarget(struct psl *psl) /* Return TRUE if it's the one we're snooping on. */ { return sameString(psl->qName, "AF103731"); } boolean closeToTop(struct psl *psl, int *scoreTrack) /* Returns TRUE if psl is near the top scorer for at least 20 bases. */ { int milliScore = calcSizedScore(psl); int threshold = round(milliScore * (1.0+nearTop)); int i, blockIx; int start, size, end; int topCount = 0; char strand = psl->strand[0]; for (blockIx = 0; blockIx < psl->blockCount; ++blockIx) { start = psl->qStarts[blockIx]; size = psl->blockSizes[blockIx]; end = start+size; if (strand == '-') reverseIntRange(&start, &end, psl->qSize); for (i=start; i= minNearTopSize) return TRUE; } } } return FALSE; } boolean passMinCoverage(struct psl *psl) /* Does this psl have enough bases aligned to pass the minCover filter. */ { int qSize = psl->qSize; if (coverQSizes != NULL) { struct hashEl *hel = hashLookup(coverQSizes, psl->qName); if (hel != NULL) qSize = ptToInt(hel->val); } if(ignoreNs) return (psl->match + psl->repMatch >= minCover * (qSize - psl->nCount)); else return (psl->match + psl->repMatch >= minCover * qSize); } boolean passFilters(struct psl *psl, int *scoreTrack) /* Return TRUE if this psl passes the millScore, minCoverage and closeToTop thresholds. If scoreTrack is NULL then closeToTop threshold is skipped. */ { int milliMin = 1000*minAli; if (calcMilliScore(psl) >= milliMin && (scoreTrack == NULL || closeToTop(psl, scoreTrack)) && passMinCoverage(psl)) return TRUE; return FALSE; } void processBestMulti(char *acc, struct psl *pslList, FILE *bestFile, FILE *repFile) /* Find psl's that are best anywhere along their length. */ { struct psl *psl; int qSize; int *repTrack = NULL; int *scoreTrack = NULL; int milliScore; int goodAliCount = 0; int bestAliCount = 0; int milliMin = 1000*minAli; if (pslList == NULL) return; qSize = pslList->qSize; AllocArray(repTrack, qSize+1); AllocArray(scoreTrack, qSize+1); for (psl = pslList; psl != NULL; psl = psl->next) { int blockIx; char strand = psl->strand[0]; milliScore = calcMilliScore(psl); if (milliScore >= milliMin) { ++goodAliCount; milliScore += sizeFactor(psl); for (blockIx = 0; blockIx < psl->blockCount; ++blockIx) { int start = psl->qStarts[blockIx]; int size = psl->blockSizes[blockIx]; int end = start+size; int i; if (strand == '-') reverseIntRange(&start, &end, psl->qSize); if (start < 0 || end > qSize) { warn("Odd: qName %s tName %s qSize %d psl->qSize %d start %d end %d", psl->qName, psl->tName, qSize, psl->qSize, start, end); if (start < 0) start = 0; if (end > qSize) end = qSize; } for (i=start; i scoreTrack[i]) scoreTrack[i] = milliScore; } } } } /* Print out any alignments that are within 2% of top score. */ for (psl = pslList; psl != NULL; psl = psl->next) { if(passFilters(psl, scoreTrack)) { pslTabOut(psl, bestFile); ++bestAliCount; } } /* Print out run-length-encoded repeat info. */ { int runVal = repTrack[0]; int curVal; int runSize = 1; int packetCount = 0; int *packetSize = NULL; int *packetVal = NULL; int i; AllocArray(packetSize, qSize); AllocArray(packetVal, qSize); repTrack[qSize] = -1; /* Sentinal value to simplify RLC loop. */ fprintf(repFile, "%s\t%d\t%d\t", acc, bestAliCount, goodAliCount); for (i=1; i<=qSize; ++i) { if ((curVal = repTrack[i]) != runVal) { packetSize[packetCount] = runSize; packetVal[packetCount] = runVal; ++packetCount; runSize = 1; runVal = curVal; } else { ++runSize; } } fprintf(repFile, "%d\t", packetCount); for (i=0; inext) { score = pslScore(psl); if (score > bestScore) { bestScore = score; bestPsl = psl; } } threshold = round((1.0 - nearTop)*bestScore); for (psl = pslList; psl != NULL; psl = psl->next) { if (pslScore(psl) >= threshold && passFilters(psl, NULL)) pslTabOut(psl, bestFile); } } void doOneAcc(char *acc, struct psl *pslList, FILE *bestFile, FILE *repFile) /* Process alignments of one piece of mRNA. */ { if (singleHit) processBestSingle(acc, pslList, bestFile, repFile); else processBestMulti(acc, pslList, bestFile, repFile); } void pslReps(char *inName, char *bestAliName, char *repName) /* Analyse inName and put best alignments for eacmRNA in estAliName. * Put repeat info in repName. */ { struct lineFile *in = pslFileOpen(inName); FILE *bestFile = mustOpen(bestAliName, "w"); FILE *repFile = mustOpen(repName, "w"); int lineSize; char *line; char *words[32]; int wordCount; struct psl *pslList = NULL, *psl = NULL; char lastName[512]; int aliCount = 0; quiet = sameString(bestAliName, "stdout") || sameString(repName, "stdout"); if (coverQSizeFile != NULL) loadCoverQSizes(coverQSizeFile); if (!quiet) printf("Processing %s to %s and %s\n", inName, bestAliName, repName); if (!noHead) pslWriteHead(bestFile); strcpy(lastName, ""); while (lineFileNext(in, &line, &lineSize)) { if (((++aliCount & 0x1ffff) == 0) && !quiet) { printf("."); fflush(stdout); } wordCount = chopTabs(line, words); if (wordCount == 21) psl = pslLoad(words); else if (wordCount == 23) psl = pslxLoad(words); else errAbort("Bad line %d of %s\n", in->lineIx, in->fileName); if (!sameString(lastName, psl->qName)) { doOneAcc(lastName, pslList, bestFile, repFile); pslFreeList(&pslList); safef(lastName, sizeof(lastName), "%s", psl->qName); } slAddHead(&pslList, psl); } doOneAcc(lastName, pslList, bestFile, repFile); pslFreeList(&pslList); lineFileClose(&in); fclose(bestFile); fclose(repFile); if (!quiet) printf("Processed %d alignments\n", aliCount); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc != 4) usage(); minAli = optionFloat("minAli", minAli); nearTop = optionFloat("nearTop", nearTop); minCover = optionFloat("minCover", minCover); minNearTopSize = optionInt("minNearTopSize", minNearTopSize); ignoreSize = optionExists("ignoreSize"); if (optionExists("sizeMatters")) warn("warning: -sizeMatters is deprecated and is now the default"); noIntrons = optionExists("noIntrons"); singleHit = optionExists("singleHit"); noHead = optionExists("nohead"); ignoreNs = optionExists("ignoreNs"); coverQSizeFile = optionVal("coverQSizes", NULL); pslReps(argv[1], argv[2], argv[3]); return 0; }