/* cgapSageFind - Find all the locations in the genome with SAGE data including SNP variants. */ #include "common.h" #include "obscure.h" #include "dystring.h" #include "portable.h" #include "linefile.h" #include "hash.h" #include "dnaLoad.h" #include "bed.h" #include "snp.h" #include "cgapSage/cgapSage.h" #include "cgapSage/cgapSageLib.h" #include "cgapSageFind.h" void usage() /* Explain usage and exit. */ { errAbort( "cgapSageFind - Find all the locations in the sequence with SAGE data including SNP variants\n" "usage:\n" " cgapSageFind sequence frequencies.txt libraries.txt snps.txt output.bed\n" ); } struct snp *narrowDownSnps(struct snp **pSnpList, char *chrom) /* Get all the snps on a chrom and return it sorted. */ { struct snp *newList = NULL; struct snp *oldList = NULL; struct snp *cur; if (!pSnpList || !(*pSnpList)) return NULL; while ((cur = slPopHead(pSnpList)) != NULL) { if (sameString(cur->chrom, chrom)) slAddHead(&newList, cur); else slAddHead(&oldList, cur); } *pSnpList = oldList; slSort(&newList, bedCmp); return newList; } struct slRef *snpsInTag(struct snp **pChromSnps, int pos, int start) /* Find the SNPs in the next 21 (TAG_SIZE) bases. */ /* chromSnps is sorted so quit search after we go beyond range. */ { struct slRef *snpList = NULL; struct snp *chromSnps, *cur; if (!pChromSnps || !(*pChromSnps)) return NULL; /* Pop off SNPs until we're at the current position. */ cur = *pChromSnps; while (((cur = *pChromSnps) != NULL) && (cur->chromStart < (pos + start))) { cur = slPopHead(pChromSnps); snpFree(&cur); } cur = *pChromSnps; while (cur && (cur->chromStart < (pos + TAG_SIZE + start))) { if ((cur->chromStart >= pos + start) && (cur->chromStart < (pos + TAG_SIZE + start))) { struct slRef *newRef; AllocVar(newRef); newRef->val = cur; slAddHead(&snpList, newRef); } cur = cur->next; } slReverse(&snpList); return snpList; } char otherStrand(char base) /* Just return the base on the other strand. Super dumb. */ { if (base == 'A') return 'T'; if (base == 'T') return 'A'; if (base == 'C') return 'G'; return 'C'; } boolean checkSeqCATG(struct dnaSeq *chrom, int pos, int start, struct slRef *snpList) /* This function is meant to speed things up by quickly checking if the */ /* current window begins or ends with a CATG or if there's SNPs in the first */ /* or last four bases. */ { struct slRef *snpRef; if (startsWith("CATG", chrom->dna + pos)) return TRUE; if (((pos + TAG_SIZE) < chrom->size) && (startsWith("CATG", chrom->dna + pos + TAG_SIZE - 4))) return TRUE; for (snpRef = snpList; snpRef != NULL; snpRef = snpRef->next) { struct snp *snp = snpRef->val; if ((snp->chromStart < pos + start + 4) || (snp->chromStart >= pos + start + TAG_SIZE - 4)) return TRUE; } return FALSE; } void possiblyOutputATag(struct dnaSeq *seq, int pos, int start, char strand, struct snp **snpsUsedArray, int numSnps, struct hash *freqHash, struct hash *libTotHash, FILE *output) /* If this sequence permutation appears in the frequency hash, output it. */ { struct slPair *list; list = hashFindVal(freqHash, seq->dna + 4); if (list) /* Output in cgapSage bed format. */ { struct slPair *cur; int chromStart = pos + start; int chromEnd = chromStart + TAG_SIZE; int thickStart = (strand == '+') ? chromStart : chromEnd - 4; int thickEnd = thickStart + 4; int numLibs = slCount(list); int i; int snpsUsed = 0; /* chrom, chromStart, chromEnd */ fprintf(output, "%s\t%d\t%d\t", seq->name, chromStart, chromEnd); /* name */ fprintf(output, "%s\t", seq->dna + 4); /* score, strand, thickStart, thickEnd */ fprintf(output, "1000\t%c\t%d\t%d\t", strand, thickStart, thickEnd); /* numLibs */ fprintf(output, "%d\t", numLibs); /* libIds */ for (cur = list; cur != NULL; cur = cur->next) fprintf(output, "%s,", cur->name); fprintf(output, "\t"); /* freqs */ for (cur = list; cur != NULL; cur = cur->next) fprintf(output, "%d,", ptToInt(cur->val)); fprintf(output, "\t"); /* TPMs */ for (cur = list; cur != NULL; cur = cur->next) { double totalTags = (double)hashIntVal(libTotHash, cur->name); int freq = ptToInt(cur->val); double tpm = (double)freq * (1000000 / totalTags); fprintf(output, "%.4f,", tpm); } fprintf(output, "\t"); for (i = 0; i < numSnps; i++) if (snpsUsedArray[i] != NULL) snpsUsed++; /* numSNps */ fprintf(output, "%d\t", snpsUsed); /* snps */ for (i = 0; i < numSnps; i++) if (snpsUsedArray[i] != NULL) fprintf(output, "%s,", snpsUsedArray[i]->name); fprintf(output, "\n"); } } void findTagsRecursingPosition(struct dnaSeq *seq, struct dnaSeq *refCopy, int pos, int start, char strand, struct snp **snpArray, struct snp **snpsUsedArray, int numSnps, struct hash *freqHash, struct hash *libTotHash, FILE *output, int ix, int snpIx) /* Recursively permute all possible 21-mers containing SNPs. Takes into account */ /* that there can be more than one SNP per tag and even more than one SNP per */ /* base. */ { /* Check to see if the first four bases are CATG */ /* If we're at a SNP, go through the versions of it. */ if ((ix < TAG_SIZE) && !((ix == 1) && (seq->dna[0] != 'C')) && !((ix == 2) && (seq->dna[1] != 'A')) && !((ix == 3) && (seq->dna[2] != 'T')) && !((ix == 4) && (seq->dna[3] != 'G'))) { int chromIx = start + pos + (strand == '+' ? ix : (TAG_SIZE - ix)); /* Deal with the possibility the current base is a SNP. */ /* If the current SNP chromStart is < the base we're on, move */ /* the snpIx. */ while ((snpIx < numSnps) && (snpArray[snpIx]->chromStart < chromIx)) snpIx++; /* If this base has one or more SNPs, do stuff. */ if ((snpIx < numSnps) && (chromIx == snpArray[snpIx]->chromStart)) { while ((snpIx < numSnps) && (chromIx == snpArray[snpIx]->chromStart)) { int sizeClass = strlen(snpArray[snpIx]->class); int i; /* Loop through the SNP itself i.e. the strings "A/T", "C/G", etc. */ for (i = 0; i < sizeClass; i += 2) { char snpBase = snpArray[snpIx]->class[i]; /* Stupid minus-strand SNPs. How does that make sense?!?!? */ if (((snpArray[snpIx]->strand[0] == '-') && (strand == '+')) || ((snpArray[snpIx]->strand[0] == '+') && (strand == '-'))) snpBase = otherStrand(snpBase); seq->dna[ix] = snpBase; /* Test the SNP base against the original base. */ if ((snpBase != refCopy->dna[ix]) && (snpBase != '-')) /* We're using this SNP, so reflect that in the snpsUsedArray. */ snpsUsedArray[snpIx] = snpArray[snpIx]; else snpsUsedArray[snpIx] = NULL; findTagsRecursingPosition(seq, refCopy, pos, start, strand, snpArray, snpsUsedArray, numSnps, freqHash, libTotHash, output, ix+1, snpIx); } /* Check to see if the next SNP is on this base. If so, clear */ /* the usedArray at this snpIx, so when we deal with that SNP */ /* on the next iteration of this while loop, there's no blatant */ /* disagreement. */ if ((snpIx+1 < numSnps) && (chromIx == snpArray[snpIx+1]->chromStart)) snpsUsedArray[snpIx] = NULL; snpIx++; } } else /* (if this base didn't have a SNP, just move on). */ findTagsRecursingPosition(seq, refCopy, pos, start, strand, snpArray, snpsUsedArray, numSnps, freqHash, libTotHash, output, ix+1, snpIx); } else if (ix == TAG_SIZE) /* Finally, at the 21st level of recursion, output stuff. */ { possiblyOutputATag(seq, pos, start, strand, snpsUsedArray, numSnps, freqHash, libTotHash, output); } } void findTagsAtPos(struct dnaSeq *chrom, int pos, int start, struct snp **pChromSnps, struct hash *freqHash, struct hash *libTotHash, FILE *output) /* Find the tags at the specific position. */ { char *dna; struct slRef *snpList = snpsInTag(pChromSnps, pos, start); struct slRef *snpRef; struct dnaSeq *seq; struct dnaSeq *refCopy; struct snp **snpArray = NULL; struct snp **snpsUsedArray = NULL; int i = 0; int numSnps = slCount(snpList); if (numSnps > 0) { AllocArray(snpArray, numSnps); AllocArray(snpsUsedArray, numSnps); } //if ((pos > 4316050) && ((pos < 4316075))) for (snpRef = snpList; snpRef != NULL; snpRef = snpRef->next) snpArray[i++] = (struct snp *)snpRef->val; /* Start by making the most basic one. */ dna = cloneStringZ(chrom->dna + pos, TAG_SIZE); seq = newDnaSeq(dna, TAG_SIZE, chrom->name); /* Keep a copy so it's easy to tell which bases are introduced by SNPs. */ refCopy = cloneDnaSeq(seq); findTagsRecursingPosition(seq, refCopy, pos, start, '+', snpArray, snpsUsedArray, numSnps, freqHash, libTotHash, output, 0, 0); /* Now do the minus strand. */ freeDnaSeq(&seq); reverseComplement(refCopy->dna, refCopy->size); seq = cloneDnaSeq(refCopy); slReverse(&snpList); i = 0; for (snpRef = snpList; snpRef != NULL; snpRef = snpRef->next) { snpsUsedArray[i] = NULL; snpArray[i++] = (struct snp *)snpRef->val; } findTagsRecursingPosition(seq, refCopy, pos, start, '-', snpArray, snpsUsedArray, numSnps, freqHash, libTotHash, output, 0, 0); freez(&snpArray); freez(&snpsUsedArray); freeDnaSeq(&seq); freeDnaSeq(&refCopy); } int skipPosPastNs(struct dnaSeq *chrom, int pos) /* If the next 21 bases have an 'N', skip to base after the 'N'. Then */ /* check again... and so on. Return the nearest base where the next */ /* 21 bases are clear of 'N's. */ { int i = pos + TAG_SIZE; int furthestN = pos; if (pos >= chrom->size - TAG_SIZE) return pos; while (i > pos) { if ((chrom->dna[i] == 'N') || (chrom->dna[i] == 'X')) { furthestN = i; break; } i--; } if (furthestN > pos) { pos = skipPosPastNs(chrom, furthestN + 1); } return pos; } void findTagsOnChrom(struct dnaSeq *chrom, int start, struct snp **pChromSnps, struct hash *freqHash, struct hash *libTotHash, FILE *output) /* Find the tags on just the given chromosome. */ { int pos = 0; while ((pos = skipPosPastNs(chrom, pos)) < chrom->size - TAG_SIZE) { findTagsAtPos(chrom, pos, start, pChromSnps, freqHash, libTotHash, output); pos++; } } void findTagLocations(struct dnaSeq *seqList, struct snp **pSnpList, struct hash *freqHash, struct hash *libTotHash, FILE *output) /* Find the tags on all given sequences. */ { struct dnaSeq *seq; for (seq = seqList; seq != NULL; seq = seq->next) { int start = pickApartSeqName(&(seq->name)); struct snp *chromSnps = narrowDownSnps(pSnpList, seq->name); findTagsOnChrom(seq, start, &chromSnps, freqHash, libTotHash, output); snpFreeList(&chromSnps); } } void cgapSageFind(char *genome, char *freqFile, char *libsFile, char *snpFile, char *outputBedFile) /* cgapSageFind - Find all the locations in the genome with SAGE data */ /* including SNP variants. */ { FILE *output = mustOpen(outputBedFile, "w"); struct hash *freqHash = getFreqHash(freqFile); struct hash *libTotHash = getTotTagsHash(libsFile); struct snp *snpList = snpLoadAllByTab(snpFile); struct dnaSeq *seqList = dnaLoadAll(genome); findTagLocations(seqList, &snpList, freqHash, libTotHash, output); freeDnaSeqList(&seqList); snpFreeList(&snpList); hashFreeWithVals(&freqHash, hashElSlPairListFree); freeHash(&libTotHash); carefulClose(&output); } int main(int argc, char *argv[]) /* Process command line. */ { if (argc != 6) usage(); cgapSageFind(argv[1], argv[2], argv[3], argv[4], argv[5]); return 0; }