/* RefineAli.c - Stuff to check and refine alignments. * * This program checks alignments and refines alignments. * * The alignments are checked at the edges, and if at least decent in * the middle, but not as good on the edges, the alignment is done * again against a larger chunk of genomic DNA on the side(s) that don't * match up. * * After any reallignment, full data about decent or better alignments * is written to a file. This info is: * cdnaName * score, targetChromosome, targetStrand * length, cdnaOffset, targetOffset * length, cdnaOffset, targetOffset * score, targetChromosome, targetStrand * length, cdnaOffset, targetOffset * length, cdnaOffset, targetOffset * Poor and none alignments are written to another file as simply * cdnaName * with the intention of eventually blasting these, or running them * with an exonAli with smaller tile size or something. */ #include "common.h" #include "portable.h" #include "hash.h" #include "obscure.h" #include "errabort.h" #include "dnautil.h" #include "dnaseq.h" #include "snof.h" #include "fa.h" #include "nt4.h" #include "wormdna.h" #include "fuzzyFind.h" #include "htmshell.h" #include "cheapcgi.h" #include "cdnaAli.h" void loadGenome(char *nt4Dir, struct nt4Seq ***retNt4Seq, char ***retChromNames, int *retNt4Count) /* Load up entire packed genome into memory. */ { struct slName *nameList, *name; int count; int i; char **chromNames; struct nt4Seq **chroms; char fileName[512]; char chromName[256]; nameList = listDir(nt4Dir, "*.nt4"); count = slCount(nameList); chroms = needMem(count*sizeof(chroms[0])); chromNames = needMem(count*sizeof(chromNames[0])); for (name = nameList, i=0; name != NULL; name = name->next, i+=1) { char *end; sprintf(fileName, "%s/%s", nt4Dir, name->name); /* Cut off .nt4 suffix. */ strcpy(chromName, name->name); end = strrchr(chromName, '.'); assert(end != NULL); *end = 0; chroms[i] = loadNt4(fileName, chromName); chromNames[i] = chroms[i]->name; } slFreeList(&nameList); *retNt4Seq = chroms; *retChromNames = chromNames; *retNt4Count = count; } void freeGenome(struct nt4Seq ***pNt4Seq, char ***pChromNames, int chromCount) /* Free up genome. */ { int i; struct nt4Seq **nt4s = *pNt4Seq; for (i=0; iname)) == NULL) hashAdd(redoHash, ci->name, reason); } #ifdef OLD void endRedoHash() { char *redoName = "redo.txt"; FILE *redoFile = mustOpen(redoName, "w"); struct hashEl *hel; int i; for (i=0; isize; ++i) { for (hel = redoHash->table[i]; hel != NULL; hel = hel->next) { fprintf(redoFile, "%s %s\n", hel->name, hel->val); } } freeHash(&redoHash); } #endif boolean anyCdnaSeq(char *name, struct dnaSeq **retDna, struct wormCdnaInfo *retInfo) /* Get a single cDNA sequence. Optionally (if retInfo is non-null) get additional * info about the sequence. */ { static FILE *cdnaFa; static struct snof *cdnaSnof = NULL; long offset; char *faComment; char **pFaComment = (retInfo == NULL ? NULL : &faComment); if (cdnaSnof == NULL) { char buf[512]; cdnaSnof = snofMustOpen(cdnaName); sprintf(buf, "%s%s", cdnaName, ".fa"); cdnaFa = mustOpen(buf, "rb"); } if (!snofFindOffset(cdnaSnof, name, &offset)) return FALSE; fseek(cdnaFa, offset, SEEK_SET); if (!faReadNext(cdnaFa, name, TRUE, pFaComment, retDna)) return FALSE; if (retInfo != NULL) { /* Kludge - only look up info if format is more or less right. */ int fieldCount = countChars(faComment, '|'); if (fieldCount >= 8) wormFaCommentIntoInfo(faComment, retInfo); else zeroBytes(retInfo, sizeof(*retInfo)); } return TRUE; } boolean weAreWeb() /* Return true if we're executing as a CGI app from a browser. */ { static boolean checked = FALSE; static boolean result = FALSE; if (!checked) { checked = TRUE; if (getenv("QUERY_STRING") != NULL) result = TRUE; } return result; } char *lineBreak() /* Return line break string. */ { if (weAreWeb()) return "
\n"; else return "\n"; } void vaReportFile(char *format, va_list args, FILE *file, char *pre, char *post) /* Write printf formatted report to file, prepending extra if non-NULL */ { if (pre != NULL) fprintf(file, "%s", pre); vfprintf(file, format, args); if (post != NULL) fprintf(file, "%s", post); } void reportGood(char *format, ...) /* Printf to good log file and console. */ { va_list args; va_start(args, format); //vaReportFile(format, args, stdout, NULL, NULL); vaReportFile(format, args, goodLogFile, NULL, NULL); va_end(args); } void reportBad(char *format, ...) /* Printf to bad log file and console. */ { va_list args; va_start(args, format); //vaReportFile(format, args, stdout, NULL, NULL); vaReportFile(format, args, badLogFile, NULL, NULL); va_end(args); } void reportUnusual(char *format, ...) /* Printf to unusual log file. */ { va_list args; va_start(args, format); vaReportFile(format, args, stdout, NULL, NULL); vaReportFile(format, args, unusualFile, NULL, NULL); va_end(args); } void reportWarning(char *format, va_list args) /* Print warning to console and also to log files. */ { vaReportFile(format, args, stdout, "###", lineBreak()); vaReportFile(format, args, errFile, "###", "\n"); fflush(errFile); } void usageErr() /* Print usage info and bail */ { errAbort( "This program turns rough alignments into fine ones.\n" "usage:\n" " refineAli roughInputFile cdnaBase chromDir goodAlignFile badAlignFile coolFile errorFile startIx endIx [c2gFile]\n" "example:\n" " refineAli ea\\all.out cDNA\\allcdna chrom ra\\good.txt ra\\bad.txt ra\\cool.txt ra\\err.txt 0 100000 features\\c2g\n"); } char **chromNames; bits32 dnaCrc(DNA *dna, int size) /* Return a number that is fairly unique for a particular * piece of DNA. */ { bits32 shiftAcc = 0; bits32 b; while (--size >= 0) { b = *dna++; if (shiftAcc >= 0x80000000u) { shiftAcc <<= 1; shiftAcc |= 1; shiftAcc += b; } else { shiftAcc <<= 1; shiftAcc += b; } } return shiftAcc; } void filterDupeCdna(struct cdnaInfo *ci, struct dnaSeq *cdnaSeq) /* Flag duplicated cDNA that is named differently. */ { #define crcHashSize (1<<14) #define crcHashMask (crcHashSize-1) static struct cdnaInfo *crcHash[crcHashSize]; int hashVal = (ci->baseCrc&crcHashMask); struct cdnaInfo *hel = crcHash[hashVal]; struct dnaSeq *otherSeq; while (hel != NULL) { if (hel->baseCount == ci->baseCount && hel->baseCrc == ci->baseCrc) { if (!anyCdnaSeq(hel->name, &otherSeq, NULL)) errAbort("Can't find other cDNA %s", hel->name); if (memcmp(cdnaSeq->dna, otherSeq->dna, cdnaSeq->size) == 0) { warn("%s Duplicate cDNA (%d %s and %d %s)", ci->name, ci->ix, ci->name, hel->ix, hel->name); ci->isDupe = TRUE; break; } else { warn("%s Misleading CRC match (%d %s and %d %s)", ci->name, ci->ix, ci->name, hel->ix, hel->name); } } hel = hel->crcHashNext; } ci->crcHashNext = crcHash[hashVal]; crcHash[hashVal] = ci; } struct cdnaInfo *lookupInfo(struct hash *hash, char *name) { struct hashEl *el = hashLookup(hash, name); if (el == NULL) return NULL; return (struct cdnaInfo *)(el->val); } void findAliEnds(struct ffAli *ali, DNA *needle, DNA *hay, long *retNeedleStart, long *retNeedleEnd, long *retHayStart, long *retHayEnd) { DNA *hStart = NULL; DNA *hEnd = NULL; DNA *nStart = NULL; DNA *nEnd = NULL; boolean first = TRUE; while (ali->left) ali = ali->left; for (;ali != NULL; ali = ali->right) { if (first) { hStart = ali->hStart; hEnd = ali->hEnd; nStart = ali->nStart; nEnd = ali->nEnd; first = FALSE; } else { if (ali->hStart < hStart) hStart = ali->hStart; if (ali->hEnd > hEnd) hEnd = ali->hEnd; if (ali->nStart < nStart) nStart = ali->nStart; if (ali->nEnd > nEnd) nEnd = ali->nEnd; } } *retHayStart = hStart - hay; *retNeedleStart = nStart - needle; *retHayEnd = hEnd - hay; *retNeedleEnd = nEnd - needle; } void fetchUnpacked(int chromIx, int gStart, int gEnd, int extraOnEnds, DNA **retDna, int *retStart, int *retEnd) /* This routine fetches DNA for a segment of chromosome. * Because of packing issues it may fetch a bit more than * requested. retStart and retEnd are the beginning and * end offsets of what's actually fetched. */ { struct nt4Seq *nt4 = chroms[chromIx]; DNA *unpacked; /* Adjust endpoints to include extra clipped to fit inside chromosome. */ gStart -= extraOnEnds; if (gStart < 0) gStart = 0; gEnd += extraOnEnds; if (gEnd > nt4->baseCount) gEnd = nt4->baseCount; unpacked = nt4Unpack(nt4, gStart, gEnd-gStart); /* Set return variables. */ *retDna = unpacked; *retStart = gStart; *retEnd = gEnd; } int goodNts(DNA *dna, int size) /* Count up number of non-N nucleotides. */ { int count = 0; while (--size >= 0) { if (ntVal[(int)(*dna++)] >= 0) ++count; } return count; } #define minGoodSize 12 int leftFlakySize(struct ffAli *ali, DNA *needle, int needleSize) /* Returns # of unalligned or flakily aligned bases on left side * of needle. */ { int aliSize; DNA *firstGood = NULL; while (ali->left) ali = ali->left; while (ali != NULL) { aliSize = ali->nEnd - ali->nStart; if (ffScoreMatch(ali->nStart, ali->hStart, aliSize) >= minGoodSize) { firstGood = ali->nStart; break; } ali = ali->right; } if (firstGood == NULL) return needleSize; else return firstGood - needle; } int rightFlakySize(struct ffAli *ali, DNA *needle, int needleSize) /* Returns # of unalligned or flakily aligned bases on right side * of needle. */ { int aliSize; DNA *firstGood = NULL; while (ali->right) ali = ali->right; while (ali != NULL) { aliSize = ali->nEnd - ali->nStart; if (ffScoreMatch(ali->nStart, ali->hStart, aliSize) >= minGoodSize) { firstGood = ali->nEnd; break; } ali = ali->left; } if (firstGood == NULL) return needleSize; else return (needle + needleSize) - firstGood; } int polyaSize(DNA *cdna, int cdnaSize) /* Return size of polyA tail. */ { int count=0; while (--cdnaSize >= 0) { if (cdna[cdnaSize] != 'a') break; ++count; } if (count < 4) /* Not really poly ... */ return 0; return count; } int gapPenalty(struct ffAli *left, struct ffAli *ali) /* Calculate gap penalty using exon scoring. */ { int nGap, hGap; int minGap; nGap = ali->nStart - left->nEnd; assert(nGap >= 0); hGap = ali->hStart - left->hStart; if (hGap < 0) { hGap = 0; nGap -= hGap; } minGap = (nGap < hGap ? nGap : hGap); return (2 + digitsBaseTwo(hGap+minGap) + (nGap-minGap)); } int scoreExonAli(struct ffAli *ali) /* Score alignment using assumptions that it may have introns. */ { struct ffAli *left; int oneScore; int onePenalty; int score; score = ffScoreMatch(ali->nStart, ali->hStart, ali->nEnd - ali->nStart); for (;;) { left = ali; if ((ali = ali->right) == NULL) break; onePenalty = gapPenalty(left, ali); score -= onePenalty; oneScore = ffScoreMatch(ali->nStart, ali->hStart, ali->nEnd - ali->nStart); score += oneScore; } return score; } void clipEnds(int leftMin, int *left, int *right, int rightMax) /* Make sure left and right are within bounds. */ { if (*left < leftMin) *left = leftMin; if (*right > rightMax) *right = rightMax; } int cmpRoughAli(const void *va, const void *vb) /* Compare two rough alis by score. */ { struct roughAli **pA = (struct roughAli **)va; struct roughAli **pB = (struct roughAli **)vb; struct roughAli *a = *pA, *b = *pB; return b->score - a->score; } void sortRoughAlis(struct roughAli **pAli) { slSort(pAli, cmpRoughAli); } int cmpFineAli(const void *va, const void *vb) /* Compare two rough alis by score. */ { struct fineAli **pA = (struct fineAli **)va; struct fineAli **pB = (struct fineAli **)vb; struct fineAli *a = *pA, *b = *pB; return b->score - a->score; } void sortFineAlis(struct fineAli **pAli) { slSort(pAli, cmpFineAli); } void flagOneRough(struct roughAli *list, struct roughAli *suspect) /* Check list from beginning to suspect for things that * completely overlap with suspect. */ { struct roughAli *better; for (better = list; better != suspect; better = better->next) { if (better->chromIx == suspect->chromIx) { if ((better->gStart <= suspect->gStart && better->gEnd >= suspect->gEnd) || (better->gStart >= suspect->gStart && better->gEnd <= suspect->gEnd)) { suspect->isDupe = TRUE; break; } } } } void flagDupeRoughAlis(struct roughAli *raList) /* Get rid of lower or equal scoring roughAlis that * cover same region. raList should be sorted. */ { int count = slCount(raList); struct roughAli *ra; if (count <= 1) return; for (ra = raList->next; ra != NULL; ra = ra->next) { flagOneRough(raList, ra); } } void flagOneFine(struct fineAli *list, struct fineAli *suspect) /* Check list from beginning to suspect for things that * completely overlap with suspect. */ { struct fineAli *better; for (better = list; better != suspect; better = better->next) { if (better->chromIx == suspect->chromIx && better->isRc == suspect->isRc) { long snStart, snEnd, shStart, shEnd, bnStart, bnEnd, bhStart, bhEnd; findAliEnds(suspect->blocks, suspect->virtNeedle, suspect->virtHaystack, &snStart, &snEnd, &shStart, &shEnd); findAliEnds(better->blocks, better->virtNeedle, better->virtHaystack, &bnStart, &bnEnd, &bhStart, &bhEnd); if ((bhStart <= shStart && bhEnd >= shEnd) || (bhStart >= shStart && bhEnd <= shEnd)) { suspect->isDupe = TRUE; break; } } } } void flagDupeFineAlis(struct fineAli *faList) /* Get rid of duplicate fineAlis */ { int count = slCount(faList); struct fineAli *fa; if (count <= 1) return; for (fa = faList->next; fa != NULL; fa = fa->next) { flagOneFine(faList, fa); } } struct cr2g { struct cr2g *next; char strand; int start, end; char geneName[32]; }; int findChromIx(char *name) { int i; for (i=0; i\n", fileName); while (fgets(buf, sizeof(buf), f) != NULL) { wordCount = chopString(buf, whiteSpaceChopper, words, ArraySize(words)); if (wordCount != 3) errAbort("Bad cr2g file\n"); partCount = chopString(words[0], ":-", parts, ArraySize(parts)); if (partCount != 3) errAbort("Bad cr2g file\n"); AllocVar(el); chromIx = findChromIx(parts[0]); el->strand = words[1][0]; el->start = atoi(parts[1]); el->end = atoi(parts[2]); strncpy(el->geneName, words[2], sizeof(el->geneName)); el->next = chromLists[chromIx]; chromLists[chromIx] = el; } uglyf("Done loading %s
\n", fileName); fclose(f); } static boolean isLoadedCtg = FALSE; static struct cr2g *cr2gs[8]; void findClosestGene(char *chromName, int chromStart, int chromEnd, int strand, char **retGeneName, long *retGeneStart, long *retGeneEnd) { struct cr2g *cr2g; int chromIx; int closestDistance = 0x7fffffff; struct cr2g *closestGene = NULL; int distance; int okOutsideDistance = GENE_SPACE; if (c2gName == NULL) { char buf[64]; sprintf(buf, "%s:%d-%d", chromName, chromStart, chromEnd); *retGeneName = cloneString(buf); *retGeneStart = chromStart; *retGeneEnd = chromEnd; return; } if (!isLoadedCtg) { loadCr2g(c2gName, cr2gs); isLoadedCtg = TRUE; } chromIx = findChromIx(chromName); for (cr2g = cr2gs[chromIx]; cr2g != NULL; cr2g = cr2g->next) { if (chromStart > cr2g->end) distance = chromStart - cr2g->end; else if (chromEnd < cr2g->start) distance = cr2g->start - chromEnd; else { /* Overlaps */ distance = 0; } if (distance < closestDistance) { closestGene = cr2g; closestDistance = distance; } } if (closestDistance <= okOutsideDistance) { *retGeneName = closestGene->geneName; *retGeneStart = closestGene->start; *retGeneEnd = closestGene->end; } else { char buf[64]; sprintf(buf, "%s:%d-%d", chromName, chromStart, chromEnd); *retGeneName = cloneString(buf); *retGeneStart = chromStart; *retGeneEnd = chromEnd; } } void hyperReportAlis(struct cdnaInfo *ci) /* Report alignments in hypertext format. */ { struct fineAli *fa; printf("%d %s ", ci->ix, ci->name, ci->name); printf("%d bases %d rough alignments. Fine alignments:\n", ci->baseCount, slCount(ci->roughAli)); for (fa = ci->fineAli; fa != NULL; fa = fa->next) { if (fa->isDupe) continue; printf("%s %ld-%ld hits ", ci->name, fa->nStart, fa->nEnd); printf("", ci->name, chromNames[fa->chromIx], fa->hStart-100, fa->hEnd+100); printf("chromosome %s %ld-%ld", chromNames[fa->chromIx], fa->hStart, fa->hEnd); printf(" gene %s", fa->geneName, fa->geneName); printf(" score %f\n", (double)fa->score / (double)ci->baseCount); } htmlHorizontalLine(); } boolean correctIsBackwards(boolean putativeIsReversed, boolean isRc, struct ffAli *ffAli, char *cdnaName) /* Look for introns to double check orientation */ { int fCount = 0; int rCount = 0; struct ffAli *left, *right; int nGap, hGap; int leftSize, rightSize; boolean isReversed; right = ffAli; for (;;) { left = right; if ((right = left->right) == NULL) break; leftSize = left->nEnd - left->nStart; rightSize = right->nEnd - right->nStart; nGap = right->nStart - left->nEnd; hGap = right->hStart - left->hEnd; if (nGap == 0 && hGap >= 16 && leftSize >= 5 && rightSize >= 5) { DNA *iStart = left->hEnd; DNA *iEnd = right->hStart; if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g') ++fCount; else if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c') ++rCount; } } if (fCount == rCount) return putativeIsReversed; if (fCount != 0 && rCount != 0) return putativeIsReversed; isReversed = (rCount > fCount); if (isRc) isReversed = !isReversed; if (isReversed != putativeIsReversed) { warn("Correcting backwards EST %s", cdnaName); } return isReversed; } void refineAlis(struct cdnaInfo *ci, struct dnaSeq *cdnaSeq) /* Turn ci->roughAli into ci->fineAli. Refine alignment. */ { struct roughAli *ra; struct fineAli *fa; struct ffAli *ffAli = NULL; DNA *unpacked; int outerStart, outerEnd; int score; boolean isRc; boolean leftCruddyCount; boolean rightCruddyCount; DNA *hayStart; int hayLen; DNA *hayEnd; int gStart, gEnd; boolean badFind; sortRoughAlis(&ci->roughAli); flagDupeRoughAlis(ci->roughAli); for (ra = ci->roughAli; ra != NULL; ra = ra->next) { int bestScore = -0x7fffffff; int oldBestScore = -0x7fffffff; int oldScore; struct ffAli *bestAli = NULL; boolean bestIsRc = FALSE; if (ra->isDupe) continue; /* If score is less than 1/8 of cdna size, don't bother * with further processing. */ if (ra->score < ci->baseCount/8) { continue; } gStart = ra->gStart - 16; gEnd = ra->gEnd + 16; /* Unpack dna, including extra at either end. */ fetchUnpacked(ra->chromIx, ra->gStart, ra->gEnd, 15250, &unpacked, &outerStart, &outerEnd); badFind = FALSE; for (;;) { clipEnds(outerStart, &gStart, &gEnd, outerEnd); hayStart = unpacked + gStart-outerStart; hayLen = gEnd - gStart; hayEnd = hayStart + hayLen; if (!ffFindEitherStrandN(cdnaSeq->dna, cdnaSeq->size, hayStart, hayLen, ffCdna, &ffAli, &isRc)) { if (ra->score > 20 && ra->score > oldBestScore + 5 && ra == ci->roughAli) { if (badFind) { warn("%s - still couldn't ffFind after expansion", ci->name); break; } warn("%s Couldn't ffFind %s (%d bases %d score %d bestScore) in chromosome %s %d-%d", ci->name, ci->name, ci->baseCount, ra->score, oldBestScore, chromNames[ra->chromIx], gStart, gEnd); addRedoHash(ci, "ffFind"); } else break; badFind = TRUE; gStart -= 500; gEnd += 500; continue; } if (isRc) reverseComplement(cdnaSeq->dna, cdnaSeq->size); score = scoreExonAli(ffAli); oldScore = ffScoreCdna(ffAli); leftCruddyCount = leftFlakySize(ffAli, cdnaSeq->dna, cdnaSeq->size); rightCruddyCount = rightFlakySize(ffAli, cdnaSeq->dna, cdnaSeq->size) - polyaSize(cdnaSeq->dna, cdnaSeq->size); if (isRc) reverseComplement(cdnaSeq->dna, cdnaSeq->size); if (score <= bestScore) { ffFreeAli(&ffAli); break; } bestScore = score; oldBestScore = oldScore; ffFreeAli(&bestAli); bestAli = ffAli; bestIsRc = isRc; if (leftCruddyCount <= 0 && rightCruddyCount <= 0) break; if (gStart == outerStart || gEnd == outerEnd) break; if (leftCruddyCount < 16) gStart -= 2*leftCruddyCount; else gStart -= 5000; if (rightCruddyCount > 0) { if (rightCruddyCount < 16) gEnd += 2*rightCruddyCount; else gEnd += 5000; } } if (bestAli != NULL) { AllocVar(fa); fa->chromIx = ra->chromIx; fa->isRc = bestIsRc; fa->score = bestScore; fa->blocks = bestAli; fa->virtNeedle = cdnaSeq->dna; fa->virtHaystack = unpacked - outerStart; findAliEnds(fa->blocks, fa->virtNeedle, fa->virtHaystack, &fa->nStart, &fa->nEnd, &fa->hStart, &fa->hEnd); findClosestGene(chromNames[fa->chromIx], fa->hStart, fa->hEnd, (fa->isRc ? '-' : '+'), &fa->geneName, &fa->geneStart, &fa->geneEnd); fa->isBackwards = correctIsBackwards(ci->isBackwards, fa->isRc, fa->blocks, cdnaSeq->name); fa->next = ci->fineAli; ci->fineAli = fa; } freez(&unpacked); } slReverse(&ci->fineAli); sortFineAlis(&ci->fineAli); flagDupeFineAlis(ci->fineAli); if (weAreWeb()) hyperReportAlis(ci); else printf("%d %s\n", ci->ix, ci->name); slFreeList(&ci->roughAli); } int bestFineScore(struct fineAli *fines) /* Return best score in a list of fineAlis. */ { int bestScore = -0x7fffffff; struct fineAli *fa; for (fa = fines; fa != NULL; fa = fa->next) { if (fa->score > bestScore) bestScore = fa->score; } return bestScore; } int bestRoughScore(struct roughAli *roughs) { int bestScore = -0x7fffffff; struct roughAli *ra; for (ra = roughs; ra != NULL; ra = ra->next) { if (ra->score > bestScore) bestScore = ra->score; } return bestScore; } struct textLine { struct textLine *next; char line[1]; /* This is allocated to be bigger. */ }; struct textLine *loadLines(char *fileName) /* Load in each line of file into a textLine structure. */ { char buf[512]; struct textLine *list = NULL; struct textLine *tl; int textSize; FILE *f = mustOpen(fileName, "r"); while (fgets(buf, sizeof(buf), f) != NULL) { textSize = strlen(buf); tl = needMem(sizeof(*tl) + textSize); strcpy(tl->line, buf); tl->next = list; list = tl; } slReverse(&list); fclose(f); return list; } void doGoodBad(struct cdnaInfo *ciList) { struct cdnaInfo *ci; int goodCount = 0; int badCount = 0; int poorCount = 0; struct fineAli *fa; int bestScore; for (ci=ciList; ci!=NULL; ci = ci->next) { if (ci->isDupe) continue; if (ci->fineAli == NULL) { reportBad("%s score %d\n", ci->name, 0); if (ci->roughScore > 32 && ci->roughScore > ci->baseCount/8) addRedoHash(ci, "dropped"); ++badCount; continue; } bestScore = ci->fineScore; if (bestScore < ci->baseCount/2) { fa = ci->fineAli; reportBad("%s score %f at chromosome %s:%ld-%ld gene %s\n", ci->name, (double)bestScore / (double)ci->baseCount, chromNames[fa->chromIx], fa->hStart, fa->hEnd, fa->geneName); addRedoHash(ci, "poor"); ++poorCount; continue; } reportGood("cDNA %s size %d %s\n", ci->name, ci->baseCount, (ci->isEmbryonic ? "embryo" : "unknown")); for(fa = ci->fineAli;fa!=NULL; fa = fa->next) { if (fa->isDupe) continue; if (fa->score >= bestScore) { char *chromName = chromNames[fa->chromIx]; struct ffAli *ali, *lastAli = NULL; fa->isGood = TRUE; reportGood("\tscore %f chromosome %s strand %c %c gene %s\n", (double)fa->score / (double)ci->baseCount, chromName, (fa->isRc ? '-' : '+'), (fa->isBackwards ? '-' : '+'), fa->geneName); for (ali=fa->blocks; ali != NULL; ali = ali->right) { reportGood("\t\t%3d %3d | %d %d size %3d goodEnds %2d %2d", ali->nStart - fa->virtNeedle, ali->nEnd - fa->virtNeedle, ali->hStart - fa->virtHaystack, ali->hEnd - fa->virtHaystack, ali->nEnd - ali->nStart, ali->startGood, ali->endGood); if (lastAli != NULL) reportGood(" gap %d | %d", ali->nStart - lastAli->nEnd, ali->hStart - lastAli->hEnd); reportGood("\n"); lastAli = ali; } } } ++goodCount; } printf("%d Good %d Poor %d Bad\n", goodCount, poorCount, badCount); } void reportBackSkippers(struct cdnaInfo *ci, struct fineAli *fa) /* Report cases where the alignment has one exon behind another. */ { struct ffAli *ali = fa->blocks; struct ffAli *right = ali->right; int skip; while (right != NULL) { skip = right->hStart - ali->hEnd; if (skip < 0) { reportUnusual("Skips %s %s:%d-%d alignment skips %d\n", ci->name, chromNames[fa->chromIx], fa->hStart, fa->hEnd, skip); addRedoHash(ci, "skipsBack"); } ali = right; right = right->right; } } void doUnusual(struct cdnaInfo *cdnaList) /* Report unusual things. */ { struct cdnaInfo *ci; struct fineAli *fa; int bestScore; int greatCount; int closeCount; int decentCount; int greatCutoff; int closeCutoff; int decentCutoff; char *comment; for (ci = cdnaList; ci != NULL; ci = ci->next) { if (ci->isDupe || ci->fineAli == NULL) continue; if (ci->baseCount < 100) reportUnusual("%s is short - %d bases\n", ci->name, ci->baseCount); bestScore = ci->fineScore; greatCutoff = ci->baseCount * 9 / 10; closeCutoff = 9*bestScore/10; decentCutoff = ci->baseCount/2; greatCount = closeCount = decentCount = 0; comment = NULL; for (fa = ci->fineAli; fa != NULL; fa = fa->next) { if (fa->isDupe) continue; if (fa->isGood) { ++decentCount; reportBackSkippers(ci, fa); } if (fa->score >= closeCutoff) ++closeCount; if (fa->score >= greatCutoff) ++greatCount; } if (greatCount > 1) comment = "great"; else if (closeCount > 1) comment = "similar"; else if (decentCount > 1) comment = "decent"; if (comment != NULL) { reportUnusual("Multiple %s hits for %s\n", comment, ci->name); for (fa = ci->fineAli; fa != NULL; fa = fa->next) { if (fa->isDupe) continue; if (fa->score >= decentCutoff || fa->score >= closeCutoff) { long hStart, hEnd, nStart, nEnd; findAliEnds(fa->blocks, fa->virtNeedle, fa->virtHaystack, &nStart, &nEnd, &hStart, &hEnd); reportUnusual("\t%s:%d-%d score %f\n", chromNames[fa->chromIx], hStart, hEnd, (double)fa->score/ci->baseCount); } } } } } struct geneHit { struct geneHit *next; char *cdnaName; int chromOffset; }; struct geneHitList { struct geneHitList *next; char *geneName; struct geneHit *hits; }; int cmpGhlName(const void *va, const void *vb) /* Compare two geneHitLists by name. */ { struct geneHitList **pA = (struct geneHitList **)va; struct geneHitList **pB = (struct geneHitList **)vb; struct geneHitList *a = *pA, *b = *pB; return strcmp(a->geneName, b->geneName); } int cmpGhOffset(const void *va, const void *vb) /* Compare two geneHits by chromosome offset. */ { struct geneHit **pA = (struct geneHit **)va; struct geneHit **pB = (struct geneHit **)vb; struct geneHit *a = *pA, *b = *pB; return a->chromOffset - b->chromOffset; } #ifdef OLD void makeCdnaToGene(struct cdnaInfo *cdnaList) /* Make cdna to gene translation file. */ { struct hash *hash = newHash(12); struct cdnaInfo *ci; struct fineAli *fa; struct geneHit *gh; struct geneHitList *geneHitList = NULL; struct geneHitList *ghl; struct hashEl *he; uglyf("Making cdnaToGene file
\n"); for (ci = cdnaList; ci != NULL; ci = ci->next) { if (ci->isDupe) continue; for (fa = ci->fineAli; fa != NULL; fa = fa->next) { if (fa->isDupe || !fa->isGood) continue; if ((he = hashLookup(hash, fa->geneName)) == NULL) { AllocVar(ghl); ghl->geneName = fa->geneName; ghl->next = geneHitList; geneHitList = ghl; he = hashAdd(hash, fa->geneName, ghl); } ghl = (struct geneHitList *)(he->val); AllocVar(gh); gh->cdnaName = ci->name; gh->chromOffset = fa->hStart; gh->next = ghl->hits; ghl->hits = gh; } } slSort(&geneHitList, cmpGhlName); for (ghl=geneHitList; ghl!=NULL; ghl = ghl->next) { slReverse(&ghl->hits); slSort(&ghl->hits, cmpGhOffset); fprintf(cdnaToGeneFile, "%s ", ghl->geneName); for (gh = ghl->hits; gh != NULL; gh = gh->next) fprintf(cdnaToGeneFile, "%s ", gh->cdnaName); fprintf(cdnaToGeneFile, "\n"); } freeHash(&hash); slFreeList(&geneHitList); uglyf("Done making cdnaToGene file
\n"); } #endif /* OLD */ void hitLine(struct cdnaInfo *ci, int lineCount, char *cdnaName, char *cdnaRange, char *mitoChrom, char *chromName, char *genomeRange, char *scoreString) /* Process line with hit info. */ { char *s; struct roughAli *ra; if (ci == NULL || differentWord(cdnaName, ci->name)) { errAbort("Got hit line %d without corresponding blasting for cDNA %s", lineCount, cdnaName); } /* Allocate structure to hold data for line. */ ra = needMem(sizeof(*ra)); /* Grab range of bases covered in cDNA */ if (cdnaRange != NULL) { s = cdnaRange; ra->cStart = atoi(s); s = strchr(s, '-')+1; ra->cEnd = atoi(s)+1; if (ra->cStart >= ra->cEnd) { int temp; warn("%s line %d cdna %s: cStart %d > cEnd %d. Swapping.", cdnaName, lineCount, cdnaName, ra->cStart, ra->cEnd); addRedoHash(ci, "swappedCdna"); temp = ra->cStart; ra->cStart = ra->cEnd; ra->cEnd = ra->cStart; } } /* Grab range of bases covered in genome. */ s = genomeRange; ra->gStart = atoi(s); s = strchr(s, '-')+1; ra->gEnd = atoi(s)+1; if (ra->gStart >= ra->gEnd) { int temp; warn("%s line %d cdna %s: gStart %d > gEnd %d. Swapping.", cdnaName, lineCount, cdnaName, ra->gStart, ra->gEnd); addRedoHash(ci, "swappedGenome"); temp = ra->gStart; ra->gStart = ra->gEnd; ra->gEnd = ra->gStart; } /* Figure out chromosome it's on. */ if (!differentWord(mitoChrom, "Mitochondrial")) ra->chromIx = 6; else { int i; for (i=0; ichromIx = i; break; } } } /* Get alignment score. */ ra->score = atoi(scoreString); /* Hang it on list. */ ra->next = ci->roughAli; ci->roughAli = ra; } void analyse(int start, int stop) { struct hash *hash; char line[512]; int lineCount = 0; char *words[32]; int wordCount; struct cdnaInfo *cdnaList = NULL; struct cdnaInfo *ci = NULL; int cdnaCount; int maxCdnaCount = stop - start; cdnaCount = 1; if (start > 1) { for (;;) { if (!fgets(line, sizeof(line), inFile)) errAbort("Not %d cDNAs in file, only %d\n", start, cdnaCount); ++lineCount; if (line[0] == '#') /* Skip comments. */ continue; wordCount = chopString(line, whiteSpaceChopper, words, ArraySize(words)); if (wordCount <= 0) /* Skip empty lines. */ continue; if (!differentWord(words[1], "alignments")) { ++cdnaCount; if (cdnaCount >= start) break; } } } cdnaCount = 0; hash = newHash(14); /* Hash table with 16k entries. */ for (;;) { if (!fgets(line, sizeof(line), inFile)) break; ++lineCount; if (line[0] == '#') /* Skip comments. */ continue; wordCount = chopString(line, whiteSpaceChopper, words, ArraySize(words)); if (wordCount <= 0) /* Skip empty lines. */ continue; if (wordCount < 4) /* Everyone else has at least four words. */ { errAbort("Short line %d:\n", lineCount); } if (sameWord(words[1], "Blasting")) { char *cdnaName = words[2]; if ((ci = lookupInfo(hash, cdnaName)) == NULL) { struct hashEl *hel; ci = needMem(sizeof(*ci)); hel = hashAdd(hash, cdnaName, ci); ci->next = cdnaList; cdnaList = ci; ci->ix = atoi(words[0]); ci->name = hel->name; } } else if (sameWord(words[2], "hits")) { /* Newer style - includes cDNA matching range. */ if (ci == NULL) continue; hitLine(ci, lineCount, words[0], words[1], words[3], words[4], words[5], words[9]); } else if (sameWord(words[1], "hits")) /* Older style - no cDNA matching range. */ { if (ci == NULL) continue; hitLine(ci, lineCount, words[0], NULL, words[2], words[3], words[4], words[8]); } else if (sameWord(words[1], "alignments")) { struct dnaSeq *cdnaSeq; struct wormCdnaInfo info; if (ci == NULL) continue; if (differentWord(ci->name, words[3])) errAbort("Line %d - %s is not %s", lineCount, words[3], ci->name); if (!ci->finished) { if (!anyCdnaSeq(ci->name, &cdnaSeq, &info)) { warn("Can't find cDNA %s", ci->name); ci->isDupe = TRUE; } else { ci->baseCount = cdnaSeq->size; ci->baseCrc = dnaCrc(cdnaSeq->dna, cdnaSeq->size); slReverse(&ci->roughAli); ci->roughScore = bestRoughScore(ci->roughAli); filterDupeCdna(ci, cdnaSeq); ci->isBackwards = (info.orientation == '-'); refineAlis(ci, cdnaSeq); ci->fineScore = bestFineScore(ci->fineAli); ci->isEmbryonic = info.isEmbryonic; ci->finished = TRUE; freeDnaSeq(&cdnaSeq); ++cdnaCount; if (cdnaCount >= maxCdnaCount) break; } } } else { errAbort("Can't deal with line %d\n", lineCount); } } slReverse(&cdnaList); doGoodBad(cdnaList); doUnusual(cdnaList); //makeCdnaToGene(cdnaList); /* Clean up. */ /* These two are slow and not really necessary. */ #ifdef FASTIDIOUS slFreeList(&cdnaList); freeHash(&hash); #endif uglyf("Done analyse\n"); } int test(int argc, char *argv[]) { int start, stop; if (argc != 9 && argc != 10) usageErr(); inName = argv[0]; cdnaName = argv[1]; chromDir = argv[2]; goodLogName = argv[3]; badLogName = argv[4]; unusualName = argv[5]; errName = argv[6]; start = atoi(argv[7]); stop = atoi(argv[8]); if (start >= stop) usageErr(); if (argc == 10) c2gName = argv[9]; inFile = mustOpen(inName, "r"); goodLogFile = mustOpen(goodLogName, "w"); badLogFile = mustOpen(badLogName, "w"); unusualFile = mustOpen(unusualName, "w"); errFile = mustOpen(errName, "w"); pushWarnHandler(reportWarning); dnaUtilOpen(); printf("Loading chromosomes\n"); loadGenome(chromDir, &chroms, &chromNames, &chromCount); startRedoHash(); printf("Analysing %s\n", inName); if (weAreWeb()) htmlHorizontalLine(); analyse(start, stop); //endRedoHash(); freeGenome(&chroms, &chromNames, chromCount); popWarnHandler(); fclose(inFile); fclose(goodLogFile); fclose(badLogFile); fclose(unusualFile); fclose(errFile); return 0; } void doMiddle() { char *commandLine = cgiString("commandLine"); char *words[50]; int wordCount; wordCount = chopString(commandLine, whiteSpaceChopper, words, ArraySize(words)); printf("
");
test(wordCount, words);
printf("
"); } int main(int argc, char *argv[]) { if (weAreWeb()) htmShell("refineAli Output", doMiddle, "QUERY"); else test(argc-1, argv+1); return 0; }