/* * $Id: seq2theta.c 2220 2007-11-30 19:09:15Z cegrant $ * * $Log$ * Revision 1.3 2006/03/08 20:50:11 nadya * merge chamges from v3_5_2 branch * * Revision 1.2.4.1 2006/01/24 20:44:08 nadya * update copyright * * Revision 1.2 2005/10/25 19:06:39 nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1 2005/07/29 17:26:42 nadya * Importing from meme-3.0.14, and adding configure/make * */ /*#define DEBUG*/ /*********************************************************************** * * * MEME++ * * Copyright 1994-2006, The Regents of the University of California * * Author: Timothy L. Bailey * * * ***********************************************************************/ /* Routines to map a sequence to a theta matrix. */ #include "meme.h" /**********************************************************************/ /* init_map Set up an |A| x |A| sequence_letter_to_frequency_vector or sequence_letter_to_logodds_vector matrix where each column is the mapping from a given letter to a frequency/logodds vector. An extra row and column is added for the wildcard character 'X', and set to the background frequencies (or 0 if, logodds). Two types of mapping are possible: Uni - add n prior Pam - Dayhoff PAM for proteins, transversion/transition PAM for DNA Returns the map. */ /**********************************************************************/ extern THETA init_map( MAP_TYPE type, /* type of mapping: Uni - add n prior Pam - pam matrix */ double scale, /* degree of crispness, depends on type, Uni - pseudo count n to add Pam - pam distance */ int alength, /* length of alphabet */ double *back, /* background frequencies for 'X' */ BOOLEAN lo /* setup a logodds mapping if true */ ) { int i, j, p; THETA map; /* the map */ /* dayhoff PAM 1 matrix; order of alphabet: ACDEFGHIKLMNPQRSTVWY */ /* dayhoff_ij = Pr(amino acid j --> amino acid i | time=1) */ double dayhoff[20][20] = { { 9867, 3, 10, 17, 2, 21, 2, 6, 2, 4, 6, 9, 22, 8, 2, 35, 32, 18, 0, 2}, { 1, 9973, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 5, 1, 2, 0, 3}, { 6, 0, 9859, 53, 0, 6, 4, 1, 3, 0, 0, 42, 1, 6, 0, 5, 3, 1, 0, 0}, { 10, 0, 56, 9865, 0, 4, 2, 3, 4, 1, 1, 7, 3, 35, 0, 4, 2, 2, 0, 1}, { 1, 0, 0, 0, 9946, 1, 2, 8, 0, 6, 4, 1, 0, 0, 1, 2, 1, 0, 3, 28}, { 21, 1, 11, 7, 1, 9935, 1, 0, 2, 1, 1, 12, 3, 3, 1, 21, 3, 5, 0, 0}, { 1, 1, 3, 1, 2, 0, 9912, 0, 1, 1, 0, 18, 3, 20, 8, 1, 1, 1, 1, 4}, { 2, 2, 1, 2, 7, 0, 0, 9872, 2, 9, 12, 3, 0, 1, 2, 1, 7, 33, 0, 1}, { 2, 0, 6, 7, 0, 2, 2, 4, 9926, 1, 20, 25, 3, 12, 37, 8, 11, 1, 0, 1}, { 3, 0, 0, 1, 13, 1, 4, 22, 2, 9947, 45, 3, 3, 6, 1, 1, 3, 15, 4, 2}, { 1, 0, 0, 0, 1, 0, 0, 5, 4, 8, 9874, 0, 0, 2, 1, 1, 2, 4, 0, 0}, { 4, 0, 36, 6, 1, 6, 21, 3, 13, 1, 0, 9822, 2, 4, 1, 20, 9, 1, 1, 4}, { 13, 1, 1, 3, 1, 2, 5, 1, 2, 2, 1, 2, 9926, 8, 5, 12, 4, 2, 0, 0}, { 3, 0, 5, 27, 0, 1, 23, 1, 6, 3, 4, 4, 6, 9876, 9, 2, 2, 1, 0, 0}, { 1, 1, 0, 0, 1, 0, 10, 3, 19, 1, 4, 1, 4, 10, 9913, 6, 1, 1, 8, 0}, { 28, 11, 7, 6, 3, 16, 2, 2, 7, 1, 4, 34, 17, 4, 11, 9840, 38, 2, 5, 2}, { 22, 1, 4, 2, 1, 2, 1, 11, 8, 2, 6, 13, 5, 3, 2, 32, 9871, 9, 0, 2}, { 13, 3, 1, 2, 1, 3, 3, 57, 1, 11, 17, 1, 3, 2, 2, 2, 10, 9901, 0, 2}, { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 9976, 1}, { 1, 3, 0, 1, 21, 0, 4, 1, 0, 1, 0, 3, 0, 0, 0, 1, 1, 1, 2, 9945} }; /* transition/transversion PAM 1 matrix; alphabet order "ACGT" */ double trans[4][4] = { { 9900, 20, 60, 20}, { 20, 9900, 20, 60}, { 60, 20, 9900, 20}, { 20, 60, 20, 9900} }; /* special PAM mutation frequencies for proteins */ double pam_dna_freq[] = { 0.25 /* A */, 0.25 /* C */, 0.25 /* G */, 0.25 /* T */ }; /* special PAM mutation frequencies for proteins */ double pam_prot_freq[] = { 0.096 /* A */, 0.025 /* C */, 0.053 /* D */, 0.053 /* E */, 0.045 /* F */, 0.090 /* G */, 0.034 /* H */, 0.035 /* I */, 0.085 /* K */, 0.085 /* L */, 0.012 /* M */, 0.042 /* N */, 0.041 /* P */, 0.032 /* Q */, 0.034 /* R */, 0.057 /* S */, 0.062 /* T */, 0.078 /* V */, 0.012 /* W */, 0.030 /* Y */, 0.0 /* X */ }; /* create the array for the sequence to theta map */ create_2array(map, double, alength+1, alength+1); switch (type) { case Uni: { double main_letter; /* probability of main letter */ double other; /* probability of each other letter */ main_letter = (1.0 + scale)/(1.0 + alength * scale); other = scale/(1.0 + alength * scale); if (VERBOSE) {printf("main= %g\n\n", main_letter);} /* create a matrix of columns; each column gives mapping for a letter */ for (i=0; i