/* fragFind - fast way of finding short patterns in data. */ #include "common.h" #include "portable.h" #include "dnautil.h" #include "dnaseq.h" #include "fa.h" #include "ameme.h" #include "fragFind.h" static int oligoVal(DNA *oligo, int oligoSize) /* Return oligo converted to a number - 2 bits per base. * returns -1 if oligo has 'N's or other non-nt characters. */ { int i; int acc = 0; int baseVal; for (i=0; i=0; --i) { out[i] = valToNt[val&3]; val>>=2; } } int *makeRcTable(int oligoSize) /* Make a table for doing reverse complement of packed oligos. */ { int tableSize = (1<<(oligoSize+oligoSize)); int tableByteSize = tableSize * sizeof(int); int *table = needLargeMem(tableByteSize); char oligo[17]; int i; for (i=0; iseq; softMask = seqEl->softMask; seqEl = seqEl->next; } else { seq = faReadOneDnaSeq(f, "", TRUE); if (seq == NULL) break; } dna = seq->dna; size = seq->size; endIx = size-oligoSize; for (i=0; i<=endIx; ++i) { if (softMask == NULL || !masked(softMask+i, oligoSize) ) { if ((oliVal = oligoVal(dna+i, oligoSize)) >= 0) { table[oliVal] += 1; ++total; } } } if (seqList == NULL) freeDnaSeq(&seq); } carefulClose(&f); *retTable = table; *retTotal = total; } static int bestTableIx(int *table, int oligoSize, int *rcTable) /* Return index of highest count in table. */ { int tableSize = (1<<(oligoSize+oligoSize)); int bestVal = -1; int bestIx = 0; int i; int val; if (rcTable != NULL) { for (i=0; i bestVal) { bestVal = val; bestIx = i; } } } else { for (i=0; i bestVal) { bestVal = val; bestIx = i; } } } return bestIx; } static int fuzzValOne(int *table, int oligoSize, int *rcTable) /* Return value of table entry with most number of * matches that are off by no more than one. */ { int tableSize = (1<<(oligoSize+oligoSize)); int tableIx; int bestVal = -0x3ffffff; int bestIx; for (tableIx = 0; tableIx < tableSize; ++tableIx) { int acc = 0; int mask = 0x3; int maskedIx; int inc = 1; int baseIx; for (baseIx = 0; baseIx bestVal) { bestVal = acc; bestIx = tableIx; } } return tableIx; } static int fuzzValTwo(int *table, int oligoSize, int *rcTable) /* Return value of table entry with most number of * matches that are off by no more than one. */ { int tableSize = (1<<(oligoSize+oligoSize)); int tableIx; int bestVal = -0x3ffffff; int bestIx = 0; for (tableIx = 0; tableIx < tableSize; ++tableIx) { int acc = 0; int ixTwo; int maskTwo = 0x3; int incTwo = 1; int baseTwo; int j; for (baseTwo = 0; baseTwo bestVal) { bestVal = acc; bestIx = tableIx; } } return bestIx; } static int fuzzValThree(int *table, int oligoSize, int *rcTable) /* Return value of table entry with most number of * matches that are off by no more than one. */ { int tableSize = (1<<(oligoSize+oligoSize)); int tableIx; int bestVal = -0x3ffffff; int bestIx = 0; for (tableIx = 0; tableIx < tableSize; ++tableIx) { int acc = 0; int ixThree; int maskThree = 0x3; int incThree = 1; int baseThree; int k; for (baseThree=0; baseThree bestVal) { bestVal = acc; bestIx = tableIx; } } return bestIx; } static int fuzzVal(int *table, int oligoSize, int mismatchesAllowed, boolean considerRc) /* Return value of table entry with most number of * matches that are off by no more than mismatchesAllowed. */ { int *rcTable = NULL; int ret = 0; if (considerRc) rcTable = makeRcTable(oligoSize); if (mismatchesAllowed == 0) ret = bestTableIx(table, oligoSize, rcTable); else if (mismatchesAllowed == 1) ret = fuzzValOne(table, oligoSize, rcTable); else if (mismatchesAllowed == 2) ret = fuzzValTwo(table, oligoSize, rcTable); else if (mismatchesAllowed == 3) ret = fuzzValThree(table, oligoSize, rcTable); else assert(FALSE); freeMem(rcTable); return ret; } static void normalizeTable(int *table, int oligoSize) /* Normalize table so that all entries are between 0 and 1000000 */ { int tableSize = (1<<(oligoSize+oligoSize)); int i = bestTableIx(table, oligoSize, NULL); int maxVal = table[i]; double scaleBy = 1000000.0/maxVal; for (i=0; inext) { struct dnaSeq *seq = seqEl->seq; DNA *dna = seq->dna; int size = seq->size; int endIx = size-oligoSize; ++seqCount; for (i=0; i<=endIx; ++i) { DNA *target = dna+i; if (allGoodBases(target, oligoSize)) { if (mismatchCount(oligo, target, oligoSize) <= mismatchesAllowed) { ++total; for (j=0; j 3) errAbort("Sorry, fragFind can only handle 0-3 mismatches."); if (fragSize > 10) errAbort("Sorry, fragFind can only handle fragments up to 10 bases."); startTime = clock1000(); makeOligoHistogram(NULL, goodSeq, fragSize, &goodTable, &goodCount); if (badName) makeOligoHistogram(badName, NULL, fragSize, &badTable, &badCount); if (badName) { normalizeTable(goodTable, fragSize); normalizeTable(badTable, fragSize); diffTables(goodTable, badTable, fragSize); goodIx = fuzzVal(badTable, fragSize, mismatchesAllowed, considerRc); } else { goodIx = fuzzVal(goodTable, fragSize, mismatchesAllowed, considerRc); } freez(&goodTable); freez(&badTable); unpackVal(goodIx, fragSize, unpacked); makeProfile(unpacked, fragSize, mismatchesAllowed, goodSeq, considerRc, profile); } #ifdef TESTING_STANDALONE void usage() /* Explain usage and exit. */ { errAbort("fragFind - find patterns in DNA.\n" "usage:\n" " fragFind good.fa"); } int main(int argc, char *argv[]) { char *goodName, *badName = NULL; double profile[16][4]; int alphabatize[4] = {A_BASE_VAL, C_BASE_VAL, G_BASE_VAL, T_BASE_VAL}; int i; dnaUtilOpen(); if (argc < 2) usage(); goodName = argv[1]; if (argc >= 3) badName = argv[2]; fragFind(goodName, badName, 7, 2, profile); printf("\n%s\n", unpacked); for (i=0; i<4; ++i) { int baseVal = alphabatize[i]; printf("%c ", valToNt[baseVal]); for (j=0; j