/* PatCount - Counts up the number of occurences of each * oligo of a fixed size (up to 13) in input sequence. * Writes all patterns that are overrepresented according * to a threshold to output. */ #include "common.h" #include "dnaseq.h" #include "fa.h" #include "sig.h" void usage() { errAbort("patCount - counts up the number of occurences of each\n" "oligo of a fixed size (up to 13) in input. Writes out\n" "all patterns that are overrepresented by at least factor\n" "to output, which is a binary .ooc file used by patSpace\n" "nucleotide homology algorithms.\n" "usage:\n" " patCount out.ooc oligoSize overFactor faFiles(s)\n" "example:\n" " patCount hgt.ooc 13 6 hgt1.fa hgt2.fa hgt3.fa"); } unsigned long powerOfFour(int oligoSize) /* Return integer power of four. */ { unsigned long p = 1; int i; for (i=0; i' */ for (;;) { c = fgetc(f); if (isspace(c) || isdigit(c)) continue; if (c == EOF || c == '>') c = 0; if (bufIx >= bufSize) { if (bufSize == 0) { bufSize = 1024 * 1024; buf = needMem(bufSize); } else { int newBufSize = bufSize + bufSize; buf = needMoreMem(buf, bufSize, newBufSize); uglyf("Increasing buffer from %d to %d bytes\n", bufSize, newBufSize); bufSize = newBufSize; } } buf[bufIx++] = c; if (c == 0) { *retDna = buf; *retSize = bufIx-1; *retName = name; return TRUE; } } } unsigned addPatCounts(DNA *dna, int dnaSize, int oligoSize, bits16 *patCounts, int mask) /* Count oligomers in sequence. */ { int i; int bits = 0; int bVal; unsigned count = 0; bits16 useCount; bits16 bigEnough = 0x3fff; for (i=0; i 13) usage(); s = argv[3]; if (!isdigit(s[0])) usage(); overFactor = atof(s); if (overFactor < 1.1) usage(); /* Allocate an integer array to store frequency counts of * each oligomer. */ patSpaceSize = powerOfFour(oligoSize); patSpaceByteSize = patSpaceSize * sizeof(patCounts[0]); uglyf("Allocating pattern space of %zu bytes\n", patSpaceByteSize); patCounts = needLargeMem(patSpaceByteSize); memset(patCounts, 0, patSpaceByteSize); /* Loop through input files. */ out = mustOpen(outName, "wb"); for (i=4; i 0) { ++distinctCount; if (count > maxCount) maxCount = count; if (count >= thresh) { ++overCount; writeOne(out, patIx); } } } fclose(out); } /* Write statistics to console. */ printf("Statistic for oligos of size %d (patSpaceSize %zu)\n", oligoSize, patSpaceSize); printf("Total oligos %d\n", totalCount); printf("Distinct oligos %d\n", distinctCount); printf("Maximum occurences of single oligo %d\n", maxCount); printf("OverFactor %4.2f yeilds threshold %d\n", overFactor, thresh); printf("Oligos filtered out as too common %d (%2.1f%%)\n", overCount, 100.0*overCount/distinctCount); return 0; }