/* Some stuff that you'll likely need in any program that works with * DNA. Includes stuff for amino acids as well. * * Assumes that DNA is stored as a character. * The DNA it generates will include the bases * as lowercase tcag. It will generally accept * uppercase as well, and also 'n' or 'N' or '-' * for unknown bases. * * Amino acids are stored as single character upper case. * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "dnautil.h" struct codonTable /* The dread codon table. */ { DNA *codon; /* Lower case. */ AA protCode; /* Upper case. The "Standard" code */ AA mitoCode; /* Upper case. Vertebrate Mitochondrial translations */ }; struct codonTable codonTable[] = /* The master codon/protein table. */ { {"ttt", 'F', 'F',}, {"ttc", 'F', 'F',}, {"tta", 'L', 'L',}, {"ttg", 'L', 'L',}, {"tct", 'S', 'S',}, {"tcc", 'S', 'S',}, {"tca", 'S', 'S',}, {"tcg", 'S', 'S',}, {"tat", 'Y', 'Y',}, {"tac", 'Y', 'Y',}, {"taa", 0, 0,}, {"tag", 0, 0,}, {"tgt", 'C', 'C',}, {"tgc", 'C', 'C',}, {"tga", 0, 'W',}, {"tgg", 'W', 'W',}, {"ctt", 'L', 'L',}, {"ctc", 'L', 'L',}, {"cta", 'L', 'L',}, {"ctg", 'L', 'L',}, {"cct", 'P', 'P',}, {"ccc", 'P', 'P',}, {"cca", 'P', 'P',}, {"ccg", 'P', 'P',}, {"cat", 'H', 'H',}, {"cac", 'H', 'H',}, {"caa", 'Q', 'Q',}, {"cag", 'Q', 'Q',}, {"cgt", 'R', 'R',}, {"cgc", 'R', 'R',}, {"cga", 'R', 'R',}, {"cgg", 'R', 'R',}, {"att", 'I', 'I',}, {"atc", 'I', 'I',}, {"ata", 'I', 'M',}, {"atg", 'M', 'M',}, {"act", 'T', 'T',}, {"acc", 'T', 'T',}, {"aca", 'T', 'T',}, {"acg", 'T', 'T',}, {"aat", 'N', 'N',}, {"aac", 'N', 'N',}, {"aaa", 'K', 'K',}, {"aag", 'K', 'K',}, {"agt", 'S', 'S',}, {"agc", 'S', 'S',}, {"aga", 'R', 0,}, {"agg", 'R', 0,}, {"gtt", 'V', 'V',}, {"gtc", 'V', 'V',}, {"gta", 'V', 'V',}, {"gtg", 'V', 'V',}, {"gct", 'A', 'A',}, {"gcc", 'A', 'A',}, {"gca", 'A', 'A',}, {"gcg", 'A', 'A',}, {"gat", 'D', 'D',}, {"gac", 'D', 'D',}, {"gaa", 'E', 'E',}, {"gag", 'E', 'E',}, {"ggt", 'G', 'G',}, {"ggc", 'G', 'G',}, {"gga", 'G', 'G',}, {"ggg", 'G', 'G',}, }; /* A table that gives values 0 for t 1 for c 2 for a 3 for g * (which is order aa's are in biochemistry codon tables) * and gives -1 for all others. */ int ntVal[256]; int ntValLower[256]; /* NT values only for lower case. */ int ntValUpper[256]; /* NT values only for upper case. */ int ntVal5[256]; int ntValNoN[256]; /* Like ntVal, but with T_BASE_VAL in place of -1 for nonexistent ones. */ DNA valToNt[(N_BASE_VAL|MASKED_BASE_BIT)+1]; /* convert tables for bit-4 indicating masked */ int ntValMasked[256]; DNA valToNtMasked[256]; static boolean inittedNtVal = FALSE; static void initNtVal() { if (!inittedNtVal) { int i; for (i=0; i= 3) { int c = ntVal[(int)dna[pos-3]]; if (c == A_BASE_VAL || c == G_BASE_VAL) return TRUE; } return FALSE; } boolean isReallyStopCodon(char *dna, boolean selenocysteine) /* Return TRUE if it's really a stop codon, even considering * possibilility of selenocysteine. */ { if (selenocysteine) { /* Luckily the mitochondria *also* replaces TGA with * something else, even though it isn't selenocysteine */ return lookupMitoCodon(dna) == 0; } else { return lookupCodon(dna) == 0; } } /* Returns one letter code for protein, * 0 for stop codon or X for bad input, * Vertebrate Mitochondrial Code */ AA lookupMitoCodon(DNA *dna) { int ix; int i; char c; if (!inittedNtVal) initNtVal(); ix = 0; for (i=0; i<3; ++i) { int bv = ntVal[(int)dna[i]]; if (bv<0) return 'X'; ix = (ix<<2) + bv; } c = codonTable[ix].mitoCode; c = toupper(c); return c; } Codon codonVal(DNA *start) /* Return value from 0-63 of codon starting at start. * Returns -1 if not a codon. */ { int v1,v2,v3; if ((v1 = ntVal[(int)start[0]]) < 0) return -1; if ((v2 = ntVal[(int)start[1]]) < 0) return -1; if ((v3 = ntVal[(int)start[2]]) < 0) return -1; return ((v1<<4) + (v2<<2) + v3); } DNA *valToCodon(int val) /* Return codon corresponding to val (0-63) */ { assert(val >= 0 && val < 64); return codonTable[val].codon; } void dnaTranslateSome(DNA *dna, char *out, int outSize) /* Translate DNA upto a stop codon or until outSize-1 amino acids, * whichever comes first. Output will be zero terminated. */ { int i; int dnaSize; int protSize = 0; outSize -= 1; /* Room for terminal zero */ dnaSize = strlen(dna); for (i=0; i= outSize) break; if ((out[protSize++] = lookupCodon(dna+i)) == 0) break; } out[protSize] = 0; } /* A little array to help us decide if a character is a * nucleotide, and if so convert it to lower case. */ char ntChars[256]; static void initNtChars() { static boolean initted = FALSE; if (!initted) { zeroBytes(ntChars, sizeof(ntChars)); ntChars['a'] = ntChars['A'] = 'a'; ntChars['c'] = ntChars['C'] = 'c'; ntChars['g'] = ntChars['G'] = 'g'; ntChars['t'] = ntChars['T'] = 't'; ntChars['n'] = ntChars['N'] = 'n'; ntChars['u'] = ntChars['U'] = 'u'; ntChars['-'] = 'n'; initted = TRUE; } } char ntMixedCaseChars[256]; static void initNtMixedCaseChars() { static boolean initted = FALSE; if (!initted) { zeroBytes(ntMixedCaseChars, sizeof(ntMixedCaseChars)); ntMixedCaseChars['a'] = 'a'; ntMixedCaseChars['A'] = 'A'; ntMixedCaseChars['c'] = 'c'; ntMixedCaseChars['C'] = 'C'; ntMixedCaseChars['g'] = 'g'; ntMixedCaseChars['G'] = 'G'; ntMixedCaseChars['t'] = 't'; ntMixedCaseChars['T'] = 'T'; ntMixedCaseChars['n'] = 'n'; ntMixedCaseChars['N'] = 'N'; ntMixedCaseChars['u'] = 'u'; ntMixedCaseChars['U'] = 'U'; ntMixedCaseChars['-'] = 'n'; initted = TRUE; } } /* Another array to help us do complement of DNA */ DNA ntCompTable[256]; static boolean inittedCompTable = FALSE; static void initNtCompTable() { zeroBytes(ntCompTable, sizeof(ntCompTable)); ntCompTable[' '] = ' '; ntCompTable['-'] = '-'; ntCompTable['='] = '='; ntCompTable['a'] = 't'; ntCompTable['c'] = 'g'; ntCompTable['g'] = 'c'; ntCompTable['t'] = 'a'; ntCompTable['u'] = 'a'; ntCompTable['n'] = 'n'; ntCompTable['-'] = '-'; ntCompTable['.'] = '.'; ntCompTable['A'] = 'T'; ntCompTable['C'] = 'G'; ntCompTable['G'] = 'C'; ntCompTable['T'] = 'A'; ntCompTable['U'] = 'A'; ntCompTable['N'] = 'N'; ntCompTable['R'] = 'Y'; ntCompTable['Y'] = 'R'; ntCompTable['M'] = 'K'; ntCompTable['K'] = 'M'; ntCompTable['S'] = 'S'; ntCompTable['W'] = 'W'; ntCompTable['V'] = 'B'; ntCompTable['H'] = 'D'; ntCompTable['D'] = 'H'; ntCompTable['B'] = 'V'; ntCompTable['X'] = 'N'; ntCompTable['r'] = 'y'; ntCompTable['y'] = 'r'; ntCompTable['s'] = 's'; ntCompTable['w'] = 'w'; ntCompTable['m'] = 'k'; ntCompTable['k'] = 'm'; ntCompTable['v'] = 'b'; ntCompTable['h'] = 'd'; ntCompTable['d'] = 'h'; ntCompTable['b'] = 'v'; ntCompTable['x'] = 'n'; ntCompTable['('] = ')'; ntCompTable[')'] = '('; inittedCompTable = TRUE; } /* Complement DNA (not reverse). */ void complement(DNA *dna, long length) { int i; if (!inittedCompTable) initNtCompTable(); for (i=0; i 0) { if (*a++ != '-') --size; } if (skipTrailingDash) while (*a == '-') ++a; return a; } int countNonDash(char *a, int size) /* Count number of non-dash characters. */ { int count = 0; int i; for (i=0; i 4) { count += 1; x >>= 2; } return count; } long dnaOrAaFilteredSize(char *raw, char filter[256]) /* Return how long DNA will be after non-DNA is filtered out. */ { char c; long count = 0; dnaUtilOpen(); while ((c = *raw++) != 0) { if (filter[(int)c]) ++count; } return count; } void dnaOrAaFilter(char *in, char *out, char filter[256]) /* Run chars through filter. */ { char c; dnaUtilOpen(); while ((c = *in++) != 0) { if ((c = filter[(int)c]) != 0) *out++ = c; } *out++ = 0; } long dnaFilteredSize(char *rawDna) /* Return how long DNA will be after non-DNA is filtered out. */ { return dnaOrAaFilteredSize(rawDna, ntChars); } void dnaFilter(char *in, DNA *out) /* Filter out non-DNA characters and change to lower case. */ { dnaOrAaFilter(in, out, ntChars); } void dnaFilterToN(char *in, DNA *out) /* Change all non-DNA characters to N. */ { DNA c; initNtChars(); while ((c = *in++) != 0) { if ((c = ntChars[(int)c]) != 0) *out++ = c; else *out++ = 'n'; } *out++ = 0; } void dnaMixedCaseFilter(char *in, DNA *out) /* Filter out non-DNA characters but leave case intact. */ { dnaOrAaFilter(in, out, ntMixedCaseChars); } long aaFilteredSize(char *raw) /* Return how long aa will be after non-aa chars is filtered out. */ { return dnaOrAaFilteredSize(raw, aaChars); } void aaFilter(char *in, DNA *out) /* Filter out non-aa characters and change to upper case. */ { dnaOrAaFilter(in, out, aaChars); } void upperToN(char *s, int size) /* Turn upper case letters to N's. */ { char c; int i; for (i=0; i= 0) { if ((val = ntVal[(int)*dna++]) >= 0) ++histogram[val]; } } bits64 basesToBits64(char *dna, int size) /* Convert dna of given size (up to 32) to binary representation */ { if (size > 32) errAbort("basesToBits64 called on %d bases, can only go up to 32", size); bits64 result = 0; int i; for (i=0; i= 0) { bVal = ntValNoN[(int)*in++]; out <<= 2; out += bVal; } return out; } bits16 packDna8(DNA *in) /* Pack 8 bases into a short word */ { bits16 out = 0; int count = 8; int bVal; while (--count >= 0) { bVal = ntValNoN[(int)*in++]; out <<= 2; out += bVal; } return out; } UBYTE packDna4(DNA *in) /* Pack 4 bases into a UBYTE */ { UBYTE out = 0; int count = 4; int bVal; while (--count >= 0) { bVal = ntValNoN[(int)*in++]; out <<= 2; out += bVal; } return out; } void unpackDna(bits32 *tiles, int tileCount, DNA *out) /* Unpack DNA. Expands to 16x tileCount in output. */ { int i, j; bits32 tile; for (i=0; i=0; --j) { out[j] = valToNt[tile & 0x3]; tile >>= 2; } out += 16; } } void unpackDna4(UBYTE *tiles, int byteCount, DNA *out) /* Unpack DNA. Expands to 4x byteCount in output. */ { int i, j; UBYTE tile; for (i=0; i=0; --j) { out[j] = valToNt[tile & 0x3]; tile >>= 2; } out += 4; } } static void checkSizeTypes() /* Make sure that some of our predefined types are the right size. */ { assert(sizeof(UBYTE) == 1); assert(sizeof(WORD) == 2); assert(sizeof(bits32) == 4); assert(sizeof(bits16) == 2); } int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize) /* Given a gap in genome from iStart to iEnd, return * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no * intron. Assumes DNA is lower cased. */ { if (iEnd - iStart < minIntronSize) return 0; if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g') { return 1; } else if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c') { return -1; } else return 0; } int intronOrientation(DNA *iStart, DNA *iEnd) /* Given a gap in genome from iStart to iEnd, return * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no * intron. Assumes DNA is lower cased. */ { return intronOrientationMinSize(iStart, iEnd, 32); } int dnaScore2(DNA a, DNA b) /* Score match between two bases (relatively crudely). */ { if (a == 'n' || b == 'n') return 0; if (a == b) return 1; else return -1; } int dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore, char ignore) /* Compare two sequences (without inserts or deletions) and score. */ { int i; int score = 0; for (i=0; i 0) { lineSize = lettersLeft; if (lineSize > maxPerLine) lineSize = maxPerLine; mustWrite(f, letters, lineSize); fputc('\n', f); letters += lineSize; lettersLeft -= lineSize; } } static int findTailPolyAMaybeMask(DNA *dna, int size, boolean doMask, boolean loose) /* Identify PolyA at end; mask to 'n' if specified. This allows a few * non-A's as noise to be trimmed too. Returns number of bases trimmed. * Leaves first two bases of PolyA in case there's a taa stop codon. */ { int i; int score = 10; int bestScore = 10; int bestPos = -1; int trimSize = 0; for (i=size-1; i>=0; --i) { DNA b = dna[i]; if (b == 'n' || b == 'N') continue; if (score > 20) score = 20; if (b == 'a' || b == 'A') { score += 1; if (score >= bestScore) { bestScore = score; bestPos = i; } else if (loose && score >= (bestScore - 8)) { /* If loose, keep extending even if score isn't back up to best. */ bestPos = i; } } else { score -= 10; } if (score < 0) { break; } } if (bestPos >= 0) { trimSize = size - bestPos - 2; // Leave two for aa in taa stop codon if (trimSize > 0) { if (doMask) for (i=size - trimSize; i 20) score = 20; if (b == 't' || b == 'T') { score += 1; if (score >= bestScore) { bestScore = score; bestPos = i; } else if (loose && score >= (bestScore - 8)) { /* If loose, keep extending even if score isn't back up to best. */ bestPos = i; } } else { score -= 10; } if (score < 0) { pastPoly = i; break; } } if (bestPos >= 0) { trimSize = bestPos+1 - 2; // Leave two for aa in taa stop codon if (trimSize > 0) { if (doMask) memset(dna, 'n', trimSize); } else trimSize = 0; } return trimSize; } int headPolyTSizeLoose(DNA *dna, int size) /* Return size of PolyT at start (if present). This allows a few non-T's as * noise to be trimmed too, but skips last two tt for revcomp'd taa stop * codon. * It is less conservative in extending the polyA region than maskHeadPolyT. */ { return findHeadPolyTMaybeMask(dna, size, FALSE, TRUE); } int maskHeadPolyT(DNA *dna, int size) /* Convert PolyT at start. This allows a few non-T's as noise to be * trimmed too. Returns number of bases trimmed. */ { return findHeadPolyTMaybeMask(dna, size, TRUE, FALSE); } boolean isDna(char *poly, int size) /* Return TRUE if letters in poly are at least 90% ACGTU */ { int i; int dnaCount = 0; dnaUtilOpen(); for (i=0; i= round(0.9 * size)); } boolean isAllDna(char *poly, int size) /* Return TRUE if letters in poly are 100% ACGTU */ { int i; if (size <= 1) return FALSE; dnaUtilOpen(); for (i=0; i