/* regBeadPos - Position regulatory beads along a chromosome string. The beads are * nucleosomes, open regions and closed regions. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "bigWig.h" #include "hmmstats.h" char *simpleBed = NULL; FILE *simpleBedFile = NULL; char *stateProb = NULL; FILE *stateProbFile = NULL; double peakThreshold=40; double minPeakSum = 500; char *narrowPeak = NULL; FILE *narrowPeakFile = NULL; void usage() /* Explain usage and exit. */ { errAbort( "regBeadPos - Position regulatory beads along a chromosome string. The beads\n" "are nucleosomes, open regions and closed regions.\n" "usage:\n" " regBeadPos in.tab outFile.bed\n" "options:\n" " -simpleBed=simpleOut.bed - output each little window separately\n" " -narrowPeak=peaks.narrowPeak - output just the peak base here\n" " -peakThreshold=N.N - min average dnase score for peak output\n" " -stateProb=stateProbs.c\n" ); } static struct optionSpec options[] = { {"simpleBed", OPTION_STRING}, {"stateProb", OPTION_STRING}, {"narrowPeak", OPTION_STRING}, {"peakThreshold", OPTION_DOUBLE}, {NULL, 0}, }; enum aStates /* Internal states for HMM. */ { aLow, /* Not much DNAse or histone activity. */ aHisStart, aDnase, aHisEnd, aStateCount, }; #define DNASE_LEVELS 5 #define HISTONE_LEVELS 5 #define HMM_LETTERS (DNASE_LEVELS * HISTONE_LEVELS) char visStates[] = /* User visible states of HMM. */ { '.', 'O', 'x', 'O', }; struct stateSummary /* Keep track of letters seen in each state */ { int counts[aStateCount][HMM_LETTERS]; }; struct stateSummary stateSummary; struct inFile /* Info about input files. */ { struct inFile *next; char *name; /* Symbolic name used inside program - DNASE or HISTONE */ char *fileName; /* Full path name. */ struct bbiFile *bigWig; /* Open big wig */ struct bigWigValsOnChrom *chromVals; /* Fast bigWig access */ double cuts[5]; /* Where we make the cuts on different levels */ }; struct hash *makeInFilesHash(char *fileName) /* Read input and make hash out of it. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[7]; struct hash *hash = hashNew(0); while (lineFileRow(lf, row)) { struct inFile *in; AllocVar(in); in->name = cloneString(row[0]); in->fileName = cloneString(row[1]); in->bigWig = bigWigFileOpen(in->fileName); in->chromVals = bigWigValsOnChromNew(); int i; for (i=0; icuts); ++i) in->cuts[i] = lineFileNeedDouble(lf, row, i+2); hashAdd(hash, in->name, in); } return hash; } void *findInHashFromFile(char *key, struct hash *hash, char *fileName) /* Like hashMustFindVal, but with a better error message. */ { void *val = hashFindVal(hash, key); if (val == NULL) errAbort("Can't find %s in %s\n", key, fileName); return val; } inline UBYTE quant(struct inFile *inFile, double inVal) /* Quantize inVal into something small using inFile->cuts */ { double *cuts = inFile->cuts; /* COuld do this if stack with a loop and a separate case for the * last value, but actually would take as about as many lines of code and be * slower. The code is not too slow because the first test handles 90% of * the cases in any case.... */ if (inVal < cuts[0]) return 0; else if (inVal < cuts[1]) return 1; else if (inVal < cuts[2]) return 2; else if (inVal < cuts[3]) return 3; else return 4; } void makeInLetters(int chromSize, struct inFile *a, struct inFile *b, int basesInWindow, int *retLetterCount, UBYTE **retLetters) /* This does two things - averages over a small window, and then quantizes the * result into one of a small number of values. The quantization is done in * a and b separately, and the letter produced in the end is aBin * binCount + bBin */ { int letterCount = chromSize/basesInWindow; UBYTE *letters; AllocArray(letters, letterCount); int i, startWin = 0; double *aVals = a->chromVals->valBuf; double *bVals = b->chromVals->valBuf; double invWinSize = 1.0/basesInWindow; for (i=0; i newScore) \ { \ newScore = oneScore; \ parent = sourceState; \ } #define prob1(table, letter) ((table)[letter]) void traceback(double *scores, State **allStates, int letterCount, State *tStates) /* Fill in tStates with the states of along the optimal path */ { int i; double maxScore = -0x3fffffff; int maxState = 0; State *states; /* Find end state. */ for (i=0; i maxScore) { maxScore = scores[i]; maxState = i; } } /* Fill in tStates with record of states of optimal path */ for (i = letterCount-1; i >= 0; i -= 1) { tStates[i] = maxState; states = allStates[maxState]; maxState = states[i]; } } void dynamo(UBYTE *letters, int letterCount, State *out) /* Run dynamic program of HMM and put states of most likely path in out. */ { State **allStates; UBYTE *pos, c; int stateCount = aStateCount; int stateByteSize = letterCount * sizeof(State); int i; int lettersIx; int scanSize = letterCount; double unlikely = log(unlikelyProb); /* Allocate state tables. */ allStates = needMem(stateCount * sizeof(allStates[0])); for (i=0; icounts[states[i]][letters[i]] += 1; } void writeOneStateSummaryInC(FILE *f, struct stateSummary *summary, char *name, State state) { fprintf(f, "double %s[] = {", name); int i; for (i=0; icounts[state][i]); } fprintf(f, "};\n"); } void writeStateSummaryInC(FILE *f, struct stateSummary *summary) /* Output summary as a C array */ { writeOneStateSummaryInC(f, summary, "aLowCounts", aLow); writeOneStateSummaryInC(f, summary, "aHisStart", aHisStart); writeOneStateSummaryInC(f, summary, "aDnase", aDnase); writeOneStateSummaryInC(f, summary, "aHisEnd", aHisEnd); }; void outputSimpleStates(char *chrom, int shrinkFactor, State *states, int shrunkSize, FILE *f) /* output bed - item every 5. */ { /* Run it through a look up table. */ int i; for (i=0; i shrinkFactor) { double sum = 0; int i; double *valBuf = dnaseIn->chromVals->valBuf; double maxVal = valBuf[chromStart]; int maxIx = chromStart; for (i=chromStart; i maxVal) { maxVal = val; maxIx = i; } sum += dnaseIn->chromVals->valBuf[i]; } double ave = sum/width; if (ave >= threshold && sum >= minPeakSum) { int maxEndIx; for (maxEndIx=maxIx; maxEndIx 1000) score = 1000; fprintf(f, "%s\t%d\t%d\tpk\t%d\t.\t", chrom, maxIx, maxEndIx, score); fprintf(f, "%g\t-1\t-1\t%d\n", sum, (maxIx+maxEndIx)/2 - maxIx); } } } void newOutputStates(char *chrom, int shrinkFactor, State *states, int shrunkSize, FILE *f, struct inFile *dnaseIn, FILE *peakFile) /* Convert runs of states to various types of BED lines. */ { /* Run it through a look up table. */ int stateIx = 0; char *labels = needMem(shrunkSize+1); /* Extra so the countLeadingChars always end. */ int i; for (i=0; i 0) { n += dCount; n += countLeadingChars(labels+stateIx+n, 'O'); } fprintf(f, "%s\t%d\t%d\tframed\n", chrom, start, start + n*shrinkFactor); stateIx += n; break; default: internalErr(); break; } } freez(&labels); } void runHmmOnChrom(struct bbiChromInfo *chrom, struct inFile *dnaseIn, struct inFile *histoneIn, FILE *f) /* Do the HMM run on the one chromosome. */ { int inLetterCount; UBYTE *inLetters; makeInLetters(chrom->size, dnaseIn, histoneIn, 5, &inLetterCount, &inLetters); uglyTime("Quantizing %s of size %d", chrom->name, chrom->size); State *states; AllocArray(states, inLetterCount); dynamo(inLetters, inLetterCount, states); uglyTime("Ran HMM dynamo"); newOutputStates(chrom->name, 5, states, inLetterCount, f, dnaseIn, narrowPeakFile); if (simpleBedFile) outputSimpleStates(chrom->name, 5, states, inLetterCount, simpleBedFile); if (stateProbFile) addStateCounts(chrom->name, inLetters, states, inLetterCount, &stateSummary); uglyTime("Wrote output"); } void regBeadPos(char *inTab, char *outFile) /* regBeadPos - Position regulatory beads along a chromosome string. The beads are nucleosomes, * open regions and closed regions.. */ { struct hash *inHash = makeInFilesHash(inTab); uglyf("Read %d from %s\n", inHash->elCount, inTab); struct inFile *dnaseIn = findInHashFromFile("DNASE", inHash, inTab); struct inFile *histoneIn = findInHashFromFile("HISTONE", inHash, inTab); uglyf("%s and %s found\n", dnaseIn->name, histoneIn->name); FILE *f = mustOpen(outFile, "w"); if (simpleBed != NULL) simpleBedFile = mustOpen(simpleBed, "w"); if (stateProb != NULL) stateProbFile = mustOpen(stateProb, "w"); if (narrowPeak != NULL) narrowPeakFile = mustOpen(narrowPeak, "w"); struct bbiChromInfo *chrom, *chromList = bbiChromList(dnaseIn->bigWig); makeTransitionProbs(); makeEmissionProbs(); verbose(2, "Processing %d chromosomes\n", slCount(chromList)); uglyTime(NULL); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { if (bigWigValsOnChromFetchData(dnaseIn->chromVals, chrom->name, dnaseIn->bigWig) && bigWigValsOnChromFetchData(histoneIn->chromVals, chrom->name, histoneIn->bigWig)) { uglyTime("Got data"); runHmmOnChrom(chrom, dnaseIn, histoneIn, f); } } if (stateProbFile) writeStateSummaryInC(stateProbFile, &stateSummary); carefulClose(&stateProbFile); carefulClose(&simpleBedFile); carefulClose(&narrowPeakFile); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); simpleBed = optionVal("simpleBed", simpleBed); stateProb = optionVal("stateProb", stateProb); peakThreshold = optionDouble("peakThreshold", peakThreshold); narrowPeak = optionVal("narrowPeak", narrowPeak); regBeadPos(argv[1], argv[2]); return 0; }