/* gfOut - stuff to manage output for genoFind system - * currently supports psl, axt, blast, and wu-blast. * * Copyright 2001-2003 Jim Kent. All rights reserved. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "dystring.h" #include "dnautil.h" #include "axt.h" #include "maf.h" #include "trans3.h" #include "psl.h" #include "genoFind.h" struct pslxData /* This is the data structure put in gfOutput.data for psl/pslx output. */ { FILE *f; /* Output file. */ boolean saveSeq; /* Save sequence too? */ }; struct axtData /* This is the data structure put in gfOutput.data for axt/blast output. */ { struct axtBundle *bundleList; /* List of bundles. */ char *databaseName; /* Just used for blast. */ int databaseSeqCount; /* Just used for blast. */ double databaseLetters; /* Just used for blast. */ char *blastType; /* 'blast' or 'wublast' or 'xml' or 'blast8' or 'blast9' */ double minIdentity; /* Just used for blast. */ }; static void savePslx(char *chromName, int chromSize, int chromOffset, struct ffAli *ali, struct dnaSeq *tSeq, struct dnaSeq *qSeq, boolean isRc, enum ffStringency stringency, int minMatch, FILE *f, struct hash *t3Hash, boolean reportTargetStrand, boolean targetIsRc, struct hash *maskHash, int minIdentity, boolean qIsProt, boolean tIsProt, boolean saveSeq) /* Analyse one alignment and if it looks good enough write it out to file in * psl format (or pslX format - if saveSeq is TRUE). */ { /* This function was stolen from psLayout and slightly extensively to cope * with protein as well as DNA aligments. */ struct ffAli *ff, *nextFf; struct ffAli *right = ffRightmost(ali); DNA *needle = qSeq->dna; DNA *hay = tSeq->dna; int nStart = ali->nStart - needle; int nEnd = right->nEnd - needle; int hStart, hEnd; int nInsertBaseCount = 0; int nInsertCount = 0; int hInsertBaseCount = 0; int hInsertCount = 0; int matchCount = 0; int mismatchCount = 0; int repMatch = 0; int countNs = 0; DNA *np, *hp, n, h; int blockSize; int i; struct trans3 *t3List = NULL; Bits *maskBits = NULL; if (maskHash != NULL) maskBits = hashMustFindVal(maskHash, tSeq->name); if (t3Hash != NULL) t3List = hashMustFindVal(t3Hash, tSeq->name); hStart = trans3GenoPos(ali->hStart, tSeq, t3List, FALSE) + chromOffset; hEnd = trans3GenoPos(right->hEnd, tSeq, t3List, TRUE) + chromOffset; /* Count up matches, mismatches, inserts, etc. */ for (ff = ali; ff != NULL; ff = nextFf) { nextFf = ff->right; blockSize = ff->nEnd - ff->nStart; np = ff->nStart; hp = ff->hStart; for (i=0; ihStart, tSeq, t3List, FALSE) + chromOffset; int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE) + chromOffset; int hGap = nhStart - ohEnd; int nGap = nextFf->nStart - ff->nEnd; if (nGap != 0) { ++nInsertCount; nInsertBaseCount += nGap; } if (hGap != 0) { ++hInsertCount; hInsertBaseCount += hGap; } } } /* See if it looks good enough to output, and output. */ /* if (score >= minMatch) Moved to higher level */ { int gaps = nInsertCount + (stringency == ffCdna ? 0: hInsertCount); if ((matchCount + repMatch + mismatchCount) > 0) { int id = roundingScale(1000, matchCount + repMatch - 2*gaps, matchCount + repMatch + mismatchCount); if (id >= minIdentity) { if (isRc) { int temp; int oSize = qSeq->size; temp = nStart; nStart = oSize - nEnd; nEnd = oSize - temp; } if (targetIsRc) { int temp; temp = hStart; hStart = chromSize - hEnd; hEnd = chromSize - temp; } fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c", matchCount, mismatchCount, repMatch, countNs, nInsertCount, nInsertBaseCount, hInsertCount, hInsertBaseCount, (isRc ? '-' : '+')); if (reportTargetStrand) fprintf(f, "%c", (targetIsRc ? '-' : '+') ); fprintf(f, "\t%s\t%d\t%d\t%d\t" "%s\t%d\t%d\t%d\t%d\t", qSeq->name, qSeq->size, nStart, nEnd, chromName, chromSize, hStart, hEnd, ffAliCount(ali)); for (ff = ali; ff != NULL; ff = ff->right) fprintf(f, "%ld,", (long)(ff->nEnd - ff->nStart)); fprintf(f, "\t"); for (ff = ali; ff != NULL; ff = ff->right) fprintf(f, "%ld,", (long)(ff->nStart - needle)); fprintf(f, "\t"); for (ff = ali; ff != NULL; ff = ff->right) fprintf(f, "%d,", trans3GenoPos(ff->hStart, tSeq, t3List, FALSE) + chromOffset); if (saveSeq) { fputc('\t', f); for (ff = ali; ff != NULL; ff = ff->right) { mustWrite(f, ff->nStart, ff->nEnd - ff->nStart); fputc(',', f); } fputc('\t', f); for (ff = ali; ff != NULL; ff = ff->right) { mustWrite(f, ff->hStart, ff->hEnd - ff->hStart); fputc(',', f); } } fprintf(f, "\n"); if (ferror(f)) { perror(""); errAbort("Write error to .psl"); } } } } } static void pslOut(char *chromName, int chromSize, int chromOffset, struct ffAli *ali, struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq, boolean qIsRc, boolean tIsRc, enum ffStringency stringency, int minMatch, struct gfOutput *out) /* Save psl for more complex alignments. */ { struct pslxData *outForm = out->data; savePslx(chromName, chromSize, chromOffset, ali, tSeq, qSeq, qIsRc, stringency, minMatch, outForm->f, t3Hash, out->reportTargetStrand, tIsRc, out->maskHash, out->minGood, out->qIsProt, out->tIsProt, outForm->saveSeq); } static struct ffAli *ffNextBreak(struct ffAli *ff, int maxInsert, bioSeq *tSeq, struct trans3 *t3List) /* Return ffAli after first gap in either sequence longer than maxInsert, * or after first gap in both sequences. Return may legitimately * be NULL. */ { struct ffAli *rt = ff->right; int hGap, nGap; int nhStart, ohEnd; for (;;) { if (rt == NULL) break; nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE); ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE); hGap = nhStart - ohEnd; nGap = rt->nStart - ff->nEnd; if (hGap != 0 && nGap != 0) break; if (hGap < 0 || nGap < 0) break; if (hGap > maxInsert || nGap > maxInsert) break; ff = rt; rt = ff->right; } return rt; } static void saveAxtBundle(char *chromName, int chromSize, int chromOffset, struct ffAli *ali, struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq, boolean qIsRc, boolean tIsRc, enum ffStringency stringency, int minMatch, struct gfOutput *out) /* Save alignment to axtBundle. */ { struct axtData *ad = out->data; struct ffAli *sAli, *eAli, *ff, *rt, *eFf = NULL; struct axt *axt; struct dyString *q = newDyString(1024), *t = newDyString(1024); struct axtBundle *gab; struct trans3 *t3List = NULL; if (t3Hash != NULL) t3List = hashMustFindVal(t3Hash, tSeq->name); AllocVar(gab); gab->tSize = chromSize; gab->qSize = qSeq->size; for (sAli = ali; sAli != NULL; sAli = eAli) { eAli = ffNextBreak(sAli, 8, tSeq, t3List); dyStringClear(q); dyStringClear(t); for (ff = sAli; ff != eAli; ff = ff->right) { dyStringAppendN(q, ff->nStart, ff->nEnd - ff->nStart); dyStringAppendN(t, ff->hStart, ff->hEnd - ff->hStart); rt = ff->right; if (rt != eAli) { int nGap = rt->nStart - ff->nEnd; int nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE) + chromOffset; int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE) + chromOffset; int hGap = nhStart - ohEnd; int gap = max(nGap, hGap); if (nGap < 0 || hGap < 0) { errAbort("Negative gap size in %s vs %s", tSeq->name, qSeq->name); } if (nGap == gap) { dyStringAppendN(q, ff->nEnd, gap); dyStringAppendMultiC(t, '-', gap); } else { dyStringAppendN(t, ff->hEnd, gap); dyStringAppendMultiC(q, '-', gap); } } eFf = ff; /* Keep track of last block in bunch */ } assert(t->stringSize == q->stringSize); AllocVar(axt); axt->qName = cloneString(qSeq->name); axt->qStart = sAli->nStart - qSeq->dna; axt->qEnd = eFf->nEnd - qSeq->dna; axt->qStrand = (qIsRc ? '-' : '+'); axt->tName = cloneString(chromName); axt->tStart = trans3GenoPos(sAli->hStart, tSeq, t3List, FALSE) + chromOffset; axt->tEnd = trans3GenoPos(eFf->hEnd, tSeq, t3List, TRUE) + chromOffset; axt->tStrand = (tIsRc ? '-' : '+'); axt->symCount = t->stringSize; axt->qSym = cloneString(q->string); axt->tSym = cloneString(t->string); axt->frame = trans3Frame(sAli->hStart, t3List); if (out->qIsProt) axt->score = axtScoreProteinDefault(axt); else axt->score = axtScoreDnaDefault(axt); slAddHead(&gab->axtList, axt); } slReverse(&gab->axtList); dyStringFree(&q); dyStringFree(&t); slAddHead(&ad->bundleList, gab); } static void axtQueryOut(struct gfOutput *out, FILE *f) /* Do axt oriented output - at end of processing query. */ { struct axtData *aod = out->data; struct axtBundle *gab; for (gab = aod->bundleList; gab != NULL; gab = gab->next) { struct axt *axt; for (axt = gab->axtList; axt != NULL; axt = axt->next) axtWrite(axt, f); } axtBundleFreeList(&aod->bundleList); } static void mafHead(struct gfOutput *out, FILE *f) /* Write maf header. */ { mafWriteStart(f, "blastz"); } static void mafQueryOut(struct gfOutput *out, FILE *f) /* Do axt oriented output - at end of processing query. */ { struct axtData *aod = out->data; struct axtBundle *gab; for (gab = aod->bundleList; gab != NULL; gab = gab->next) { struct axt *axt; for (axt = gab->axtList; axt != NULL; axt = axt->next) { struct mafAli temp; mafFromAxtTemp(axt, gab->tSize, gab->qSize, &temp); mafWrite(f, &temp); } } axtBundleFreeList(&aod->bundleList); } static int axtMatchCount(struct axt *axt) /* Return matches. */ { int matchCount = 0; int i; for (i=0; isymCount; ++i) { if (axt->qSym[i] == axt->tSym[i]) ++matchCount; } return matchCount; } static double axtIdRatio(struct axt *axt) /* Return matches/total. */ { int matchCount = axtMatchCount(axt); return (double)matchCount/(double)axt->symCount; } static double axtListRatio(struct axt *axt) /* Return matches/total. */ { int matchCount = 0; int symCount = 0; for (; axt != NULL; axt = axt->next) { matchCount += axtMatchCount(axt); symCount += axt->symCount; } return (double)matchCount/(double)symCount; } static void sim4QueryOut(struct gfOutput *out, FILE *f) /* Do sim4-like output - at end of processing query. */ { struct axtData *aod = out->data; struct axtBundle *gab; slReverse(&aod->bundleList); for (gab = aod->bundleList; gab != NULL; gab = gab->next) { struct axt *axt = gab->axtList; // check minIdentity of the entire alignment int goodPpt = 1000 * axtListRatio(axt); if (!(goodPpt >= out->minGood)) continue; fprintf(f, "\n"); fprintf(f, "seq1 = %s, %d bp\n", axt->qName, gab->qSize); fprintf(f, "seq2 = %s, %d bp\n", axt->tName, gab->tSize); fprintf(f, "\n"); if (axt->qStrand == '-') fprintf(f, "(complement)\n"); for (; axt != NULL; axt = axt->next) { fprintf(f, "%d-%d ", axt->qStart+1, axt->qEnd); fprintf(f, "(%d-%d) ", axt->tStart+1, axt->tEnd); fprintf(f, "%1.0f%% ", 100.0 * axtIdRatio(axt)); if (axt->qStrand == '-') fprintf(f, "<-\n"); else fprintf(f, "->\n"); } } axtBundleFreeList(&aod->bundleList); } static void blastQueryOut(struct gfOutput *out, FILE *f) /* Output blast on query. */ { struct axtData *aod = out->data; axtBlastOut(aod->bundleList, out->queryIx, out->qIsProt, f, aod->databaseName, aod->databaseSeqCount, aod->databaseLetters, aod->blastType, "blat", aod->minIdentity); axtBundleFreeList(&aod->bundleList); } static struct gfOutput *gfOutputInit(int goodPpt, boolean qIsProt, boolean tIsProt) /* Allocate and initialize gfOutput. You'll need to fill in * gfOutput.out at a minimum, and likely gfOutput.data before * using though. */ { struct gfOutput *out; AllocVar(out); out->minGood = goodPpt; out->qIsProt = qIsProt; out->tIsProt = tIsProt; return out; } static void pslHead(struct gfOutput *out, FILE *f) /* Write out psl head */ { pslWriteHead(f); } struct gfOutput *gfOutputPsl(int goodPpt, boolean qIsProt, boolean tIsProt, FILE *f, boolean saveSeq, boolean noHead) /* Set up psl/pslx output */ { struct gfOutput *out = gfOutputInit(goodPpt, qIsProt, tIsProt); struct pslxData *pslData; AllocVar(pslData); pslData->saveSeq = saveSeq; pslData->f = f; out->out = pslOut; out->data = pslData; if (!noHead) out->fileHead = pslHead; return out; } struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt, boolean tIsProt) /* Setup output for in memory axt output. */ { struct gfOutput *out = gfOutputInit(goodPpt, qIsProt, tIsProt); struct axtData *ad; AllocVar(ad); out->out = saveAxtBundle; out->data = ad; return out; } struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt, boolean tIsProt, FILE *f) /* Setup output for axt format. */ { struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt); out->queryOut = axtQueryOut; return out; } struct gfOutput *gfOutputMaf(int goodPpt, boolean qIsProt, boolean tIsProt, FILE *f) /* Setup output for maf format. */ { struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt); out->queryOut = mafQueryOut; out->fileHead = mafHead; return out; } struct gfOutput *gfOutputSim4(int goodPpt, boolean qIsProt, boolean tIsProt, char *databaseName) /* Set up to output in sim4 format. */ { struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt); struct axtData *ad = out->data; if (qIsProt || tIsProt) errAbort("sim4 output is not available for protein query sequences."); ad->databaseName = databaseName; out->queryOut = sim4QueryOut; return out; } struct gfOutput *gfOutputBlast(int goodPpt, boolean qIsProt, boolean tIsProt, char *databaseName, int databaseSeqCount, double databaseLetters, char *blastType, double minIdentity, FILE *f) /* Setup output for blast format. */ { struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt); struct axtData *ad = out->data; ad->databaseName = databaseName; ad->databaseSeqCount = databaseSeqCount; ad->databaseLetters = databaseLetters; ad->blastType = blastType; ad->minIdentity = minIdentity; out->queryOut = blastQueryOut; return out; } struct gfOutput *gfOutputAny(char *format, int goodPpt, boolean qIsProt, boolean tIsProt, boolean noHead, char *databaseName, int databaseSeqCount, double databaseLetters, double minIdentity, FILE *f) /* Initialize output in a variety of formats in file or memory. * Parameters: * format - either 'psl', 'pslx', 'sim4', 'blast', 'wublast', 'axt', 'xml' * goodPpt - minimum identity of alignments to output in parts per thousand * qIsProt - true if query side is a protein. * tIsProt - true if target (database) side is a protein. * noHead - if true suppress header in psl/pslx output. * databaseName - name of database. Only used for blast output * databaseSeq - number of sequences in database - only for blast * databaseLetters - number of bases/aas in database - only blast * minIdentity - minimum identity - only blast * FILE *f - file. */ { struct gfOutput *out = NULL; static char *blastTypes[] = {"blast", "wublast", "blast8", "blast9", "xml"}; if (format == NULL) format = "psl"; if (sameWord(format, "psl")) out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, FALSE, noHead); else if (sameWord(format, "pslx")) out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, TRUE, noHead); else if (sameWord(format, "sim4")) out = gfOutputSim4(goodPpt, qIsProt, tIsProt, databaseName); else if (stringArrayIx(format, blastTypes, ArraySize(blastTypes)) >= 0) out = gfOutputBlast(goodPpt, qIsProt, tIsProt, databaseName, databaseSeqCount, databaseLetters, format, minIdentity, f); else if (sameWord(format, "axt")) out = gfOutputAxt(goodPpt, qIsProt, tIsProt, f); else if (sameWord(format, "maf")) out = gfOutputMaf(goodPpt, qIsProt, tIsProt, f); else errAbort("Unrecognized output format '%s'", format); return out; } void gfOutputQuery(struct gfOutput *out, FILE *f) /* Finish writing out results for a query to file. */ { ++out->queryIx; if (out->queryOut != NULL) out->queryOut(out, f); } void gfOutputHead(struct gfOutput *out, FILE *f) /* Write out header if any. */ { if (out->fileHead != NULL) out->fileHead(out, f); } void gfOutputFree(struct gfOutput **pOut) /* Free up output */ { struct gfOutput *out = *pOut; if (out != NULL) { freeMem(out->data); freez(pOut); } }