/* axtCountBeds.c - Program to count matching bases in bed and output positional information as well. */ #include "common.h" #include "axt.h" #include "bed.h" #include "linefile.h" #include "hdb.h" #include "dnaseq.h" #include "dnautil.h" #include "chromKeeper.h" #include "options.h" #include "obscure.h" void usage() { errAbort("axtCountBeds.c - Program to count matching bases in bed and output\n" "positional information as well.\n" "usage:\n " "axtCountBeds bedFile axtFile db outFile chrom\n"); } struct axt *axtList = NULL; boolean ckInit = FALSE; struct axt* axtReadAll(char* fileName) /* Read all of the axt records from a file. */ { struct axt *axtList = NULL; struct axt *rec; struct lineFile *lf = lineFileOpen(fileName, TRUE); while ((rec = axtRead(lf)) != NULL) { slAddHead(&axtList, rec); } lineFileClose(&lf); if (axtList == NULL) errAbort("no alignments found in %s", fileName); slReverse(&axtList); return axtList; } void chromKeeperForAxt(char *fileName, char *db) { struct axt *axt = NULL; axtList = axtReadAll(fileName); if(!ckInit) { chromKeeperInit(db); ckInit = TRUE; } for(axt = axtList; axt != NULL; axt = axt->next) { chromKeeperAdd(axt->tName, axt->tStart, axt->tEnd, axt); } } void cleanUpChromKeeper() /* Clean stuff out of the chromkeeper. */ { struct axt *axt = NULL, *axtNext = NULL; for(axt = axtList; axt != NULL; axt = axtNext) { axtNext = axt->next; chromKeeperRemove(axt->tName, axt->tStart, axt->tEnd, axt); } axtFreeList(&axtList); } void printIntArray(int *array, int count) { int i; for(i = 0; i < count; i++) { fprintf(stderr, "%d,", array[i]); } fprintf(stderr, "\n"); fflush(stderr); } boolean isDna(char c) { return (strchr("atgcATGC", c)) != NULL; } void scoreMatches(struct axt *axt, int *matches, char *dna, int matchCount, int chromStart, int chromEnd) /* Look through the axt record and score mathes in matches. */ { int start = chromStart - axt->tStart; int end = min(chromEnd - axt->tStart, axt->tEnd - axt->tStart); int i =0; int position = 0; tolowers(axt->qSym); tolowers(axt->tSym); while(position < end && axt->qSym[i] != '\0') { if(axt->tSym[i] == '-') { i++; continue; } else if(position >= start && position < end) { dna[position + axt->tStart - chromStart] = axt->tSym[i]; if(axt->qSym[i] == axt->tSym[i]) matches[position + axt->tStart - chromStart] = 1; else if((axt->qSym[i] != axt->tSym[i]) && isDna(axt->tSym[i]) && isDna(axt->qSym[i])) matches[position + axt->tStart - chromStart] = 0; else if(axt->qSym[i] == '-' || axt->tSym[i] == '-' || axt->qSym[i] == 'n' || axt->tSym[i] == 'n') matches[position + axt->tStart - chromStart] = -1; else errAbort("axtCountBeds::scoreMatches() - Got %c and %c for qSym and tSym. " "ChromStart %d, chromEnd %d, offSet %d", axt->qSym[i], axt->tSym[i], chromStart, chromEnd, i); } i++; position++; } } void initMatches(int *matches, int count, int val) { int i; for(i=0; ichromEnd - bed->chromStart+100); struct dyString *outSeqOrtho = newDyString(bed->chromEnd - bed->chromStart+100); int currentStart = bed->chromStart; int currentEnd = bed->chromEnd; struct binElement *be = NULL; fprintf(out, ">%s.%d.%s.%s:%d-%d\n", bed->name, bed->score, bed->strand, bed->chrom, bed->chromStart,bed->chromEnd); for(be = beList; be != NULL; be = be->next) { struct axt *axt = be->val; int start = currentStart - axt->tStart; int end = 0;//max(currentEnd - axt->tStart, axt->tEnd - currentStart); int pos = 0; int i = 0; struct dnaSeq *seq = NULL; end = min(axt->tEnd - axt->tStart, currentEnd - axt->tStart); if(bed->chromEnd >= axt->tStart && bed->chromStart <= axt->tEnd) { /* If axt doesn't cover, pad things with genomic dna. */ if(start < 0) { seq = hChromSeq(bed->chrom, currentStart, axt->tStart); for(i = 0; i < abs(start); i++) { dyStringPrintf(outSeq, "%c", seq->dna[i]); dyStringPrintf(outSeqOrtho, "-"); } dnaSeqFree(&seq); currentStart = axt->tStart; start = 0; } i = 0; /* Loop through incrementing position when non-indel characters are found in target sequence. */ while(pos < end && axt->qSym[i] != '\0') { if(pos >= start && pos < end) { dyStringPrintf(outSeq, "%c", axt->tSym[i]); dyStringPrintf(outSeqOrtho, "%c", axt->qSym[i]); if(axt->tSym[i] != '-') currentStart++; } if(axt->tSym[i] != '-') { pos++; } i++; } } } /* Pad out the end with sequence. */ if(currentStart < bed->chromEnd) { struct dnaSeq *seq = NULL; int i = 0; seq = hChromSeq(bed->chrom, currentStart, bed->chromEnd); for(i = 0; i < bed->chromEnd - currentStart; i++) { dyStringPrintf(outSeq, "%c", seq->dna[i]); dyStringPrintf(outSeqOrtho, "-"); } dnaSeqFree(&seq); } if(bed->strand[0] == '-') { reverseComplement(outSeq->string, outSeq->stringSize); reverseComplement(outSeqOrtho->string, outSeqOrtho->stringSize); } if(qrnaStyle) { writeSeqWithBreaks(out, outSeq->string, outSeq->stringSize, 50); fprintf(out, ">%s.%d.%s.%s:%d-%d-Ortho\n", bed->name, bed->score, bed->strand, bed->chrom, bed->chromStart,bed->chromEnd); writeSeqWithBreaks(out, outSeqOrtho->string, outSeqOrtho->stringSize, 50); } else { fprintf(out, "%s\n", outSeq->string); fprintf(out, "%s\n", outSeqOrtho->string); } dyStringFree(&outSeq); dyStringFree(&outSeqOrtho); } void countMatches(struct bed *bed, FILE *out, FILE *axtOut, FILE *qrnaOut) /* Count up the matches for a particular bed. */ { struct axt *axtList = NULL, *axt; struct binElement *beList = NULL, *be = NULL; int *matches = NULL; int matchSize = 0, i = 0; int match = 0, misMatch=0, notAlign=0; struct slRef *refList = NULL, *ref = NULL; char *dna = NULL; struct dnaSeq *seq = NULL; matchSize = bed->chromEnd - bed->chromStart; AllocArray(matches, matchSize); AllocArray(dna, matchSize+1); initMatches(matches, matchSize, -1); beList = chromKeeperFind(bed->chrom, bed->chromStart, bed->chromEnd); for(be = beList; be != NULL; be = be->next) { axt = be->val; scoreMatches(axt, matches, dna, matchSize, bed->chromStart, bed->chromEnd); } if(axtOut != NULL && slCount(beList) != 0 ) writeAxt(axtOut, bed, beList, FALSE); if(qrnaOut != NULL && slCount(beList) != 0) writeAxt(qrnaOut, bed, beList, TRUE); else { warn("Nothing for %s %s:%d-%d", bed->name, bed->chrom, bed->chromStart, bed->chromEnd); } /* Output bed. */ bedOutputN(bed, 6, out, '\t', '\t'); /* Gather info for percent id */ for(i = 0; i < matchSize; i++) { if(matches[i] == 1) match++; else if(matches[i] == 0) misMatch++; else if(matches[i] == -1) notAlign++; else errAbort("agxCountBeds::countMatches() - matches array at position %d is: %d, not 1, 0, or -1", i, matches[i]); } /* Get sequence. */ seq = hChromSeq(bed->chrom, bed->chromStart, bed->chromEnd); if(strlen(dna) == seq->size && differentString(seq->dna, dna) && refList != NULL) warn("axtCountBeds::countMatches() - Different sequences from coordinates and axt file.:\ncoord: %s\naxt: %s\n", seq->dna, dna); if(sameString(bed->strand, "-")) reverseComplement(seq->dna, seq->size); fprintf(out, "%d\t%d\t%d\t%f\t", match, misMatch, notAlign, (float)match/(match+misMatch)); fprintf(out, "%s\t", seq->dna); fprintf(out, "%d\t", matchSize); if(sameString(bed->strand, "-")) { for(i=matchSize-1; i >= 0; i--) fprintf(out, "%d,", matches[i]); } else { for(i=0; istring)) { dyStringClear(axtChromFile); dyStringPrintf(axtChromFile, "%s/%s.axt.gz", axtFile, row[0]); } dotForUser(); chromKeeperForAxt(axtChromFile->string, db); for(bed = bedList; bed != NULL; bed = bed->next) { if(sameString(bed->strand,"")) bed->strand[0] = '+'; if(differentString(bed->chrom, row[0])) continue; countMatches(bed, out, axtOut, qrnaOut); } cleanUpChromKeeper(); } carefulClose(&out); carefulClose(&axtOut); warn("\nDone."); } int main(int argc, char *argv[]) { optionInit(&argc, argv, NULL); if(argc <= 2) usage(); dnaUtilOpen(); axtCountBeds(argv[1], argv[2], argv[3], argv[4], argv[5]); return 0; }