/* shortRepeatFind - Figure out short pieces of DNA that happen a lot in genome. * The current implementation scans the splix index. Scanning a suffix array * index is better though, so I recomment you use sufaRepeatFind instead. */ /* This file is copyright 2008 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnaseq.h" #include "dnaLoad.h" #include "splix.h" int readSize = 25; int minCount = 5; void usage() /* Explain usage and exit. */ { errAbort( "shortRepeatFind - Figure out short pieces of DNA that happen a lot in genome.\n" "usage:\n" " shortRepeatFind input.splix output.srf\n" "options:\n" " -readSize=N Size of basic read size(default %d)\n" " -minCount=N Minimum count of repeats to report (default %d)\n" , readSize, minCount); } static struct optionSpec options[] = { {"readSize", OPTION_INT}, {"minCount", OPTION_INT}, {NULL, 0}, }; long totalSeqSize(struct dnaSeq *seqList) /* Return total size of DNA in list. */ { long total = 0; struct dnaSeq *seq; for (seq = seqList; seq != NULL; seq = seq->next) total += seq->size; return total; } struct dnaSeq *dnaSeqOtherStrand(struct dnaSeq *seq) /* Return other strand of seq. Do dnaSeqFree on this when done. */ { struct dnaSeq *rc = cloneDnaSeq(seq); reverseComplement(rc->dna, rc->size); return rc; } void addSuffixes(char **suffixPt, char *s, int size) /* Add all suffixes to array starting at suffixPt. */ { while (--size >= 0) *suffixPt++ = s++; } int compareStrings(const void *va, const void *vb) /* Compare two strings. */ { char **a = (char **)va, **b = (char **)vb; return cmpDnaStrings(*a,*b); } int hexesSame(bits16 *a, int size) /* Return number of hexes same as the first one. */ { bits16 first = *a; int i; for (i=1; i>10)&3]), f); fputc(toupper(valToNt[(hexamer>>8)&3]), f); fputc(toupper(valToNt[(hexamer>>6)&3]), f); fputc(toupper(valToNt[(hexamer>>4)&3]), f); fputc(toupper(valToNt[(hexamer>>2)&3]), f); fputc(toupper(valToNt[hexamer&3]), f); } static void dumpTwelvemer(int twelvemer, FILE *f) /* Write out twelvemer as letters to file. */ { fputc(toupper(valToNt[(twelvemer>>22)&3]), f); fputc(toupper(valToNt[(twelvemer>>20)&3]), f); fputc(toupper(valToNt[(twelvemer>>18)&3]), f); fputc(toupper(valToNt[(twelvemer>>16)&3]), f); fputc(toupper(valToNt[(twelvemer>>14)&3]), f); fputc(toupper(valToNt[(twelvemer>>12)&3]), f); dumpSixmer(twelvemer, f); } boolean allDna(char *s, int size) /* Return TRUE is next size characters in s are all DNA characters. */ { int i; for (i=0; islotSizes[i]; char *slot = splix->slots[i]; bits16 *hexes = (bits16*)slot; bits16 *hexesAfter = hexes+2*slotSize; bits32 *offsets = (bits32*)(hexes + 4*slotSize); bits32 *offsetsAfter = offsets+2*slotSize; int hexIx, hexCount = 0; for (hexIx = 0; hexIx < slotSize; hexIx += hexCount) { hexCount = hexesSame(hexesAfter + hexIx, slotSize - hexIx); if (hexCount >= minCount) { if (hexCount > sortBufSize) { ExpandArray(sortBuf, sortBufSize, hexCount); sortBufSize = hexCount; } int j; for (j=0; jallDna + offsetsAfter[hexIx+j] + 12; qsort(sortBuf, hexCount, sizeof(sortBuf[0]), compareStrings); int afterIx, afterCount = 0; for (afterIx = 0; afterIx < hexCount; afterIx += afterCount) { afterCount = aftersSame(sortBuf+afterIx, hexCount - afterIx, suffixSize); if (afterCount >= minCount) { char *suffix = sortBuf[afterIx]; if (allDna(suffix, suffixSize)) { dumpTwelvemer(i, f); dumpSixmer(hexesAfter[hexIx], f); mustWrite(f, sortBuf[afterIx], suffixSize); fprintf(f, " %d\n", afterCount); } } } } } }; } int main(int argc, char *argv[]) /* Process command line. */ { dnaUtilOpen(); optionInit(&argc, argv, options); readSize = optionInt("readSize", readSize); minCount = optionInt("minCount", minCount); if (argc != 3) usage(); shortRepeatFind(argv[1], argv[2]); return 0; }