/* blastOut.c - stuff to output an alignment in blast format. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "axt.h" #include "obscure.h" #include "genoFind.h" struct axtRef /* A reference to an axt. */ { struct axtRef *next; struct axt *axt; }; static int axtRefCmpScore(const void *va, const void *vb) /* Compare to sort based on score. */ { const struct axtRef *a = *((struct axtRef **)va); const struct axtRef *b = *((struct axtRef **)vb); return b->axt->score - a->axt->score; } struct targetHits /* Collection of hits to a single target. */ { struct targetHits *next; char *name; /* Target name */ int size; /* Target size */ struct axtRef *axtList; /* List of axts, sorted by score. */ int score; /* Score of best element */ }; static void targetHitsFree(struct targetHits **pObj) /* Free one target hits structure. */ { struct targetHits *obj = *pObj; if (obj != NULL) { freeMem(obj->name); slFreeList(&obj->axtList); freez(pObj); } } static void targetHitsFreeList(struct targetHits **pList) /* Free a list of dynamically allocated targetHits's */ { struct targetHits *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; targetHitsFree(&el); } *pList = NULL; } static int targetHitsCmpScore(const void *va, const void *vb) /* Compare to sort based on score. */ { const struct targetHits *a = *((struct targetHits **)va); const struct targetHits *b = *((struct targetHits **)vb); return b->score - a->score; } static int blastzToWublastScore(int bzScore) /* Convert from 100 points/match blastz score to 5 points/match * wu-blast score. */ { return bzScore/19; } static double blastzScoreToWuBits(int bzScore, boolean isProt) /* Convert from blastz score to bit score. The magic number's * here are taken from comparing wu-blast scores to axtScores on * a couple of sequences. This doesn't really seem to be a bit * score. It's not very close to 2 bits per base. */ { int wuScore = blastzToWublastScore(bzScore); if (isProt) return wuScore * 0.3545; else return wuScore * 0.1553; } static double blastzScoreToWuExpectation(int bzScore, double databaseSize) /* I'm puzzled by the wu-blast expectation score. I would * think it would be databaseSize / (2^bitScore) but * it's not. I think the best I can do is approximate * it with something scaled to be close to this expectation. */ { double logProbOne = -log(2) * bzScore / 32.1948; return databaseSize * exp(logProbOne); } static int blastzScoreToNcbiBits(int bzScore) /* Convert blastz score to bit score in NCBI sense. */ { return round(bzScore * 0.0205); } static int blastzScoreToNcbiScore(int bzScore) /* Conver blastz score to NCBI matrix score. */ { return round(bzScore * 0.0529); } static double blastzScoreToNcbiExpectation(int bzScore) /* Convert blastz score to expectation in NCBI sense. */ { double bits = bzScore * 0.0205; double logProb = -bits*log(2); return 3.0e9 * exp(logProb); } static double expectationToProbability(double e) /* Convert expected number of hits to probability of at least * one hit. This is a crude approximation, but actually pretty precise * for e < 0.1, which is all that really matters.... */ { if (e < 0.999) return e; else return 0.999; } static int countMatches(char *a, char *b, int size) /* Count number of characters that match between a and b. */ { int count = 0; int i; for (i=0; imatrix[(int)a[i]][(int)b[i]] > 0) ++count; } return count; } static int plusStrandPos(int pos, int size, char strand, boolean isEnd) /* Return position on plus strand, one based. */ { if (strand == '-') { pos = size - pos; if (isEnd) ++pos; } else { if (!isEnd) ++pos; } return pos; } static int calcDigitCount(struct axt *axt, int tSize, int qSize) /* Figure out how many digits needed for blast position display. */ { int tDig, qDig; if (axt->qStrand == '-') qDig = digitsBaseTen(qSize - axt->qStart + 1); else qDig = digitsBaseTen(axt->qEnd); if (axt->tStrand == '-') tDig = digitsBaseTen(tSize - axt->tStart + 1); else tDig = digitsBaseTen(axt->tEnd); return max(tDig, qDig); } static void blastiodAxtOutput(FILE *f, struct axt *axt, int tSize, int qSize, int lineSize, boolean isProt, boolean isTranslated) /* Output base-by-base part of alignment in blast-like fashion. */ { int tOff = axt->tStart; int dt = (isTranslated ? 3 : 1); int qOff = axt->qStart; int lineStart, lineEnd, i; int digits = calcDigitCount(axt, tSize, qSize); struct axtScoreScheme *ss = NULL; if (isProt) ss = axtScoreSchemeProteinDefault(); for (lineStart = 0; lineStart < axt->symCount; lineStart = lineEnd) { lineEnd = lineStart + lineSize; if (lineEnd > axt->symCount) lineEnd = axt->symCount; fprintf(f, "Query: %-*d ", digits, plusStrandPos(qOff, qSize, axt->qStrand, FALSE)); for (i=lineStart; iqSym[i]; fputc(c, f); if (c != '-') ++qOff; } fprintf(f, " %-d\n", plusStrandPos(qOff, qSize, axt->qStrand, TRUE)); fprintf(f, " %*s ", digits, " "); for (i=lineStart; iqSym[i]; t = axt->tSym[i]; if (isProt) { if (q == t) c = q; else if (ss->matrix[(int)q][(int)t] > 0) c = '+'; else c = ' '; } else c = ((toupper(q) == toupper(t)) ? '|' : ' '); fputc(c, f); } fprintf(f, "\n"); fprintf(f, "Sbjct: %-*d ", digits, plusStrandPos(tOff, tSize, axt->tStrand, FALSE)); for (i=lineStart; itSym[i]; fputc(c, f); if (c != '-') tOff += dt; } fprintf(f, " %-d\n", plusStrandPos(tOff, tSize, axt->tStrand, TRUE)); fprintf(f, "\n"); } } static struct targetHits *bundleIntoTargets(struct axtBundle *abList) /* BLAST typically outputs everything on the same query and target * in one clump. This routine rearranges axts in abList to do this. */ { struct targetHits *targetList = NULL, *target; struct hash *targetHash = newHash(10); struct axtBundle *ab; struct axtRef *ref; /* Build up a list of targets in database hit by query sorted by * score of hits. */ for (ab = abList; ab != NULL; ab = ab->next) { struct axt *axt; for (axt = ab->axtList; axt != NULL; axt = axt->next) { target = hashFindVal(targetHash, axt->tName); if (target == NULL) { AllocVar(target); slAddHead(&targetList, target); hashAdd(targetHash, axt->tName, target); target->name = cloneString(axt->tName); target->size = ab->tSize; } if (axt->score > target->score) target->score = axt->score; AllocVar(ref); ref->axt = axt; slAddHead(&target->axtList, ref); } } slSort(&targetList, targetHitsCmpScore); for (target = targetList; target != NULL; target = target->next) slSort(&target->axtList, axtRefCmpScore); hashFree(&targetHash); return targetList; } static char *nameForStrand(char strand) /* Return Plus/Minus for +/- */ { if (strand == '-') return "Minus"; else return "Plus"; } static char *progType(boolean isProt, struct axtBundle *ab, boolean isUpper) /* Return blast 'program' */ { if (!isProt) return isUpper ? "BLASTN" : "blastn"; else { if (ab->axtList->frame != 0) return isUpper ? "TBLASTN" : "tblastn"; else return isUpper ? "BLASTP" : "blastp"; } } static void wuBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, FILE *f, char *databaseName, int databaseSeqCount, double databaseLetterCount, char *ourId) /* Do wublast-like output at end of processing query. */ { char asciiNum[32]; struct targetHits *targetList = NULL, *target; char *queryName; int isRc; int querySize = abList->qSize; boolean isTranslated = (abList->axtList->frame != 0); /* Print out stuff that doesn't depend on query or database. */ if (ourId == NULL) ourId = "axtBlastOut"; fprintf(f, "%s 2.0MP-WashU [%s]\n", progType(isProt, abList, TRUE), ourId); fprintf(f, "\n"); fprintf(f, "Copyright (C) 2000-2002 Jim Kent\n"); fprintf(f, "All Rights Reserved\n"); fprintf(f, "\n"); fprintf(f, "Reference: Kent, WJ. (2002) BLAT - The BLAST-like alignment tool\n"); fprintf(f, "\n"); if (!isProt) { fprintf(f, "Notice: this program and its default parameter settings are optimized to find\n"); fprintf(f, "nearly identical sequences very rapidly. For slower but more sensitive\n"); fprintf(f, "alignments please use other methods.\n"); fprintf(f, "\n"); } /* Print query and database info. */ queryName = abList->axtList->qName; fprintf(f, "Query= %s\n", queryName); fprintf(f, " (%d letters; record %d)\n", abList->qSize, queryIx); fprintf(f, "\n"); fprintf(f, "Database: %s\n", databaseName); sprintLongWithCommas(asciiNum, databaseLetterCount); fprintf(f, " %d sequences; %s total letters\n", databaseSeqCount, asciiNum); fprintf(f, "Searching....10....20....30....40....50....60....70....80....90....100%% done\n"); fprintf(f, "\n"); targetList = bundleIntoTargets(abList); /* Print out summary of hits. */ fprintf(f, " Smallest\n"); fprintf(f, " Sum\n"); fprintf(f, " High Probability\n"); fprintf(f, "Sequences producing High-scoring Segment Pairs: Score P(N) N\n"); fprintf(f, "\n"); for (target = targetList; target != NULL; target = target->next) { double expectation = blastzScoreToWuExpectation(target->score, databaseLetterCount); double p = expectationToProbability(expectation); fprintf(f, "%-61s %4d %8.1e %2d\n", target->name, blastzToWublastScore(target->score), p, slCount(target->axtList)); } /* Print out details on each target. */ for (target = targetList; target != NULL; target = target->next) { fprintf(f, "\n\n>%s\n", target->name); fprintf(f, " Length = %d\n", target->size); fprintf(f, "\n"); for (isRc=0; isRc <= 1; ++isRc) { boolean saidStrand = FALSE; char strand = (isRc ? '-' : '+'); char *strandName = nameForStrand(strand); struct axtRef *ref; struct axt *axt; for (ref = target->axtList; ref != NULL; ref = ref->next) { axt = ref->axt; if (axt->qStrand == strand) { int matches = countMatches(axt->qSym, axt->tSym, axt->symCount); int positives = countPositives(axt->qSym, axt->tSym, axt->symCount); if (!saidStrand) { saidStrand = TRUE; if (!isProt) fprintf(f, " %s Strand HSPs:\n\n", strandName); } fprintf(f, " Score = %d (%2.1f bits), Expect = %5.1e, P = %5.1e\n", blastzToWublastScore(axt->score), blastzScoreToWuBits(axt->score, isProt), blastzScoreToWuExpectation(axt->score, databaseLetterCount), blastzScoreToWuExpectation(axt->score, databaseLetterCount)); fprintf(f, " Identities = %d/%d (%d%%), Positives = %d/%d (%d%%)", matches, axt->symCount, round(100.0 * matches / axt->symCount), positives, axt->symCount, round(100.0 * positives / axt->symCount)); if (isProt) { if (axt->frame != 0) fprintf(f, ", Frame = %c%d", axt->tStrand, axt->frame); fprintf(f, "\n"); } else fprintf(f, ", Strand = %s / Plus\n", strandName); fprintf(f, "\n"); blastiodAxtOutput(f, axt, target->size, querySize, 60, isProt, isTranslated); } } } } /* Cleanup time. */ targetHitsFreeList(&targetList); } static void ncbiPrintE(FILE *f, double e) /* Print a small number NCBI style. */ { if (e <= 0.0) fprintf(f, "0.0"); else if (e <= 1.0e-100) { char buf[256], *s; safef(buf, sizeof(buf), "%e", e); s = strchr(buf, 'e'); if (s == NULL) fprintf(f, "0.0"); else fprintf(f, "%s", s); } else fprintf(f, "%1.0e", e); } /* special global variable needed for Known Genes track building. Fan 1/21/03 */ int answer_for_kg; static void ncbiBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, FILE *f, char *databaseName, int databaseSeqCount, double databaseLetterCount, char *ourId, double minIdentity) /* Do ncbiblast-like output at end of processing query. */ { char asciiNum[32]; struct targetHits *targetList = NULL, *target; char *queryName; int querySize = abList->qSize; boolean isTranslated = (abList->axtList->frame != 0); /* Print out stuff that doesn't depend on query or database. */ if (ourId == NULL) ourId = "axtBlastOut"; fprintf(f, "%s 2.2.11 [%s]\n", progType(isProt, abList, TRUE), ourId); fprintf(f, "\n"); fprintf(f, "Reference: Kent, WJ. (2002) BLAT - The BLAST-like alignment tool\n"); fprintf(f, "\n"); /* Print query and database info. */ queryName = abList->axtList->qName; fprintf(f, "Query= %s\n", queryName); fprintf(f, " (%d letters)\n", abList->qSize); fprintf(f, "\n"); fprintf(f, "Database: %s \n", databaseName); sprintLongWithCommas(asciiNum, databaseLetterCount); fprintf(f, " %d sequences; %s total letters\n", databaseSeqCount, asciiNum); fprintf(f, "\n"); fprintf(f, "Searching.done\n"); targetList = bundleIntoTargets(abList); /* Print out summary of hits. */ fprintf(f, " Score E\n"); fprintf(f, "Sequences producing significant alignments: (bits) Value\n"); fprintf(f, "\n"); for (target = targetList; target != NULL; target = target->next) { struct axtRef *ref; struct axt *axt; int matches; double identity, expectation; int bit; for (ref = target->axtList; ref != NULL; ref = ref->next) { axt = ref->axt; matches = countMatches(axt->qSym, axt->tSym, axt->symCount); identity = round(100.0 * matches / axt->symCount); /* skip output if minIdentity not reached */ if (identity < minIdentity) continue; bit = blastzScoreToNcbiBits(axt->score); expectation = blastzScoreToNcbiExpectation(axt->score); fprintf(f, "%-67s %4d ", target->name, bit); ncbiPrintE(f, expectation); fprintf(f, "\n"); } } fprintf(f, "\n"); /* Print out details on each target. */ for (target = targetList; target != NULL; target = target->next) { struct axtRef *ref; struct axt *axt; int matches, gaps; char *oldName; int ii = 0; double identity; oldName = strdup(""); for (ref = target->axtList; ref != NULL; ref = ref->next) { ii++; axt = ref->axt; matches = countMatches(axt->qSym, axt->tSym, axt->symCount); identity = round(100.0 * matches / axt->symCount); /* skip output if minIdentity not reached */ if (identity < minIdentity) continue; /* print target sequence name and length only once */ if (!sameWord(oldName, target->name)) { fprintf(f, "\n\n>%s \n", target->name); fprintf(f, " Length = %d\n", target->size); oldName = strdup(target->name); } fprintf(f, "\n"); fprintf(f, " Score = %d bits (%d), Expect = ", blastzScoreToNcbiBits(axt->score), blastzScoreToNcbiScore(axt->score)); ncbiPrintE(f, blastzScoreToNcbiExpectation(axt->score)); fprintf(f, "\n"); if (isProt) { int positives = countPositives(axt->qSym, axt->tSym, axt->symCount); gaps = countGaps(axt->qSym, axt->tSym, axt->symCount); fprintf(f, " Identities = %d/%d (%d%%),", matches, axt->symCount, round(100.0 * matches / axt->symCount)); fprintf(f, " Positives = %d/%d (%d%%),", positives, axt->symCount, round(100.0 * positives / axt->symCount)); fprintf(f, " Gaps = %d/%d (%d%%)\n", gaps, axt->symCount, round(100.0 * gaps / axt->symCount)); if (axt->frame != 0) fprintf(f, " Frame = %c%d\n", axt->tStrand, axt->frame); /* set the special global variable, answer_for_kg. This is needed for Known Genes track building. Fan 1/21/03 */ answer_for_kg=axt->symCount - matches; } else { fprintf(f, " Identities = %d/%d (%d%%)\n", matches, axt->symCount, round(100.0 * matches / axt->symCount)); /* blast displays dna searches as +- instead of blat's default -+ */ if (!isTranslated) if ((axt->qStrand == '-') && (axt->tStrand == '+')) { reverseIntRange(&axt->qStart, &axt->qEnd, querySize); reverseIntRange(&axt->tStart, &axt->tEnd, target->size); reverseComplement(axt->qSym, axt->symCount); reverseComplement(axt->tSym, axt->symCount); axt->qStrand = '+'; axt->tStrand = '-'; } fprintf(f, " Strand = %s / %s\n", nameForStrand(axt->qStrand), nameForStrand(axt->tStrand)); } fprintf(f, "\n"); blastiodAxtOutput(f, axt, target->size, querySize, 60, isProt, isTranslated); } } fprintf(f, " Database: %s\n", databaseName); /* Cleanup time. */ targetHitsFreeList(&targetList); } static void xmlBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, FILE *f, char *databaseName, int databaseSeqCount, double databaseLetterCount, char *ourId) /* Do ncbi blast xml-like output at end of processing query. * WARNING - still not completely baked. Format at NCBI seems * to be missing some end tags actually when I checked. -jk */ { char *queryName = abList->axtList->qName; int querySize = abList->qSize; int hitNum = 0; struct targetHits *targetList = NULL, *target; if (ourId == NULL) ourId = "axtBlastOut"; fprintf(f, "\n"); fprintf(f, "\n"); fprintf(f, "\n"); fprintf(f, " %s\n", progType(isProt, abList, FALSE)); fprintf(f, " %s 2.2.4 [%s]\n", progType(isProt, abList, FALSE), ourId); fprintf(f, " ~Reference: Kent, WJ. (2002) BLAT - The BLAST-like alignment tool\n"); fprintf(f, " %s\n", databaseName); fprintf(f, " %d\n", queryIx); fprintf(f, " %s\n", queryName); fprintf(f, " %d\n", querySize); if (isProt) { fprintf(f, " \n"); fprintf(f, " BLOSUM62\n"); fprintf(f, " 0.001\n"); fprintf(f, " 10\n"); fprintf(f, " 1\n"); fprintf(f, " \n"); } fprintf(f, " \n"); fprintf(f, " \n"); fprintf(f, " 1\n"); fprintf(f, " \n"); /* Print out details on each target. */ targetList = bundleIntoTargets(abList); for (target = targetList; target != NULL; target = target->next) { struct axtRef *ref; fprintf(f, " \n"); fprintf(f, " %d\n", hitNum); fprintf(f, " %s\n", target->name); fprintf(f, " %s\n", target->name); fprintf(f, " %d\n", target->size); fprintf(f, " \n"); for (ref = target->axtList; ref != NULL; ref = ref->next) { struct axt *axt = ref->axt; int hspIx = 0; fprintf(f, " \n"); fprintf(f, " %d\n", ++hspIx); fprintf(f, " %d\n", blastzScoreToNcbiBits(axt->score)); fprintf(f, " %d\n", blastzScoreToNcbiScore(axt->score)); fprintf(f, " %f\n", blastzScoreToNcbiExpectation(axt->score)); fprintf(f, " %d\n", axt->qStart+1); fprintf(f, " %d\n", axt->qEnd); fprintf(f, " %d\n", axt->tStart+1); fprintf(f, " %d\n", axt->tEnd); fprintf(f, " \n"); } fprintf(f, " \n"); } fprintf(f, " \n"); fprintf(f, " \n"); fprintf(f, " \n"); fprintf(f, "\n"); } static void printAxtTargetBlastTab(FILE *f, struct axt *axt, int targetSize) /* Print out target in tabular blast-oriented format. */ { int s = axt->tStart, e = axt->tEnd; if (axt->tStrand == '-') reverseIntRange(&s, &e, targetSize); if (axt->tStrand == axt->qStrand) { fprintf(f, "%d\t", s+1); fprintf(f, "%d\t", e); } else { fprintf(f, "%d\t", e); fprintf(f, "%d\t", s+1); } } static void tabBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, FILE *f, char *databaseName, int databaseSeqCount, double databaseLetterCount, char *ourId, boolean withComment) /* Do NCBI tabular blast output. */ { char *queryName = abList->axtList->qName; int querySize = abList->qSize; struct targetHits *targetList = NULL, *target; if (withComment) { // use date from CVS, unless checked out with -kk, then ignore. char * rcsDate = "$Date: 2009/02/26 00:05:49 $"; char dateStamp[11]; if (strlen(rcsDate) > 17) safencpy(dateStamp, sizeof(dateStamp), rcsDate+7, 10); else safecpy(dateStamp, sizeof(dateStamp), ""); dateStamp[10] = 0; fprintf(f, "# BLAT %s [%s]\n", gfVersion, dateStamp); fprintf(f, "# Query: %s\n", queryName); fprintf(f, "# Database: %s\n", databaseName); fprintf(f, "%s\n", "# Fields: Query id, Subject id, % identity, alignment length, " "mismatches, gap openings, q. start, q. end, s. start, s. end, " "e-value, bit score"); } /* Print out details on each target. */ targetList = bundleIntoTargets(abList); for (target = targetList; target != NULL; target = target->next) { struct axtRef *ref; for (ref = target->axtList; ref != NULL; ref = ref->next) { struct axt *axt = ref->axt; int matches = countMatches(axt->qSym, axt->tSym, axt->symCount); int gaps = countGaps(axt->qSym, axt->tSym, axt->symCount); int gapOpens = countGapOpens(axt->qSym, axt->tSym, axt->symCount); fprintf(f, "%s\t", axt->qName); fprintf(f, "%s\t", axt->tName); fprintf(f, "%.2f\t", 100.0 * matches/axt->symCount); fprintf(f, "%d\t", axt->symCount); fprintf(f, "%d\t", axt->symCount - matches - gaps); fprintf(f, "%d\t", gapOpens); if (axt->qStrand == '-') { int s = axt->qStart, e = axt->qEnd; reverseIntRange(&s, &e, querySize); fprintf(f, "%d\t", s+1); fprintf(f, "%d\t", e); printAxtTargetBlastTab(f, axt, target->size); } else { fprintf(f, "%d\t", axt->qStart + 1); fprintf(f, "%d\t", axt->qEnd); printAxtTargetBlastTab(f, axt, target->size); } fprintf(f, "%3.1e\t", blastzScoreToNcbiExpectation(axt->score)); fprintf(f, "%d.0\n", blastzScoreToNcbiBits(axt->score)); } } /* Cleanup time. */ targetHitsFreeList(&targetList); } void axtBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, FILE *f, char *databaseName, int databaseSeqCount, double databaseLetterCount, char *blastType, char *ourId, double minIdentity) /* Output a bundle of axt's on the same query sequence in blast format. * The parameters in detail are: * ab - the list of bundles of axt's. * f - output file handle * databaseSeqCount - number of sequences in database * databaseLetterCount - number of bases or aa's in database * blastType - blast/wublast/blast8/blast9/xml * ourId - optional (may be NULL) thing to put in header */ { if (abList == NULL) return; if (sameWord(blastType, "wublast")) wuBlastOut(abList, queryIx, isProt, f, databaseName, databaseSeqCount, databaseLetterCount, ourId); else if (sameWord(blastType, "xml")) xmlBlastOut(abList, queryIx, isProt, f, databaseName, databaseSeqCount, databaseLetterCount, ourId); else if (sameWord(blastType, "blast")) ncbiBlastOut(abList, queryIx, isProt, f, databaseName, databaseSeqCount, databaseLetterCount, ourId, minIdentity); else if (sameWord(blastType, "blast8")) tabBlastOut(abList, queryIx, isProt, f, databaseName, databaseSeqCount, databaseLetterCount, ourId, FALSE); else if (sameWord(blastType, "blast9")) tabBlastOut(abList, queryIx, isProt, f, databaseName, databaseSeqCount, databaseLetterCount, ourId, TRUE); else errAbort("Unrecognized blastType %s in axtBlastOut", blastType); }