/*********************************************************************** * * * MetaMEME * * Date: 23/4/2002 * * Author: Timothy L. Bailey * * Copyright: University of Queensland * * * ***********************************************************************/ #include #include "matrix.h" #include "karlin.h" #include "rdb-matrix.h" #include "string.h" #include "subst-matrix.h" #include "alphabet.h" #ifndef SUBST_MATRIX_DEBUG #define SUBST_MATRIX_DEBUG 0 #endif #define EPSILON 1e-200 /* dayhoff PAM 1 (matrix; order of alphabet: ACDEFGHIKLMNPQRSTVWY */ /* dayhoff_ij = Pr(amino acid j --> amino acid i | time=1) = Pr(i | j, t=1) */ double dayhoff[20][20] = { { 9867, 3, 10, 17, 2, 21, 2, 6, 2, 4, 6, 9, 22, 8, 2, 35, 32, 18, 0, 2}, { 1, 9973, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 5, 1, 2, 0, 3}, { 6, 0, 9859, 53, 0, 6, 4, 1, 3, 0, 0, 42, 1, 6, 0, 5, 3, 1, 0, 0}, { 10, 0, 56, 9865, 0, 4, 2, 3, 4, 1, 1, 7, 3, 35, 0, 4, 2, 2, 0, 1}, { 1, 0, 0, 0, 9946, 1, 2, 8, 0, 6, 4, 1, 0, 0, 1, 2, 1, 0, 3, 28}, { 21, 1, 11, 7, 1, 9935, 1, 0, 2, 1, 1, 12, 3, 3, 1, 21, 3, 5, 0, 0}, { 1, 1, 3, 1, 2, 0, 9912, 0, 1, 1, 0, 18, 3, 20, 8, 1, 1, 1, 1, 4}, { 2, 2, 1, 2, 7, 0, 0, 9872, 2, 9, 12, 3, 0, 1, 2, 1, 7, 33, 0, 1}, { 2, 0, 6, 7, 0, 2, 2, 4, 9926, 1, 20, 25, 3, 12, 37, 8, 11, 1, 0, 1}, { 3, 0, 0, 1, 13, 1, 4, 22, 2, 9947, 45, 3, 3, 6, 1, 1, 3, 15, 4, 2}, { 1, 0, 0, 0, 1, 0, 0, 5, 4, 8, 9874, 0, 0, 2, 1, 1, 2, 4, 0, 0}, { 4, 0, 36, 6, 1, 6, 21, 3, 13, 1, 0, 9822, 2, 4, 1, 20, 9, 1, 1, 4}, { 13, 1, 1, 3, 1, 2, 5, 1, 2, 2, 1, 2, 9926, 8, 5, 12, 4, 2, 0, 0}, { 3, 0, 5, 27, 0, 1, 23, 1, 6, 3, 4, 4, 6, 9876, 9, 2, 2, 1, 0, 0}, { 1, 1, 0, 0, 1, 0, 10, 3, 19, 1, 4, 1, 4, 10, 9913, 6, 1, 1, 8, 0}, { 28, 11, 7, 6, 3, 16, 2, 2, 7, 1, 4, 34, 17, 4, 11, 9840, 38, 2, 5, 2}, { 22, 1, 4, 2, 1, 2, 1, 11, 8, 2, 6, 13, 5, 3, 2, 32, 9871, 9, 0, 2}, { 13, 3, 1, 2, 1, 3, 3, 57, 1, 11, 17, 1, 3, 2, 2, 2, 10, 9901, 0, 2}, { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 9976, 1}, { 1, 3, 0, 1, 21, 0, 4, 1, 0, 1, 0, 3, 0, 0, 0, 1, 1, 1, 2, 9945} }; /* transition/transversion PAM 1 matrix; alphabet order "ACGT" */ double trans[4][4] = { { 9900, 20, 60, 20}, { 20, 9900, 20, 60}, { 60, 20, 9900, 20}, { 20, 60, 20, 9900} }; /* PAM frequencies for DNA */ double pam_dna_freq[] = { 0.25 /* A */, 0.25 /* C */, 0.25 /* G */, 0.25 /* T */ }; /* PAM frequencies for proteins; supplied by S. Altschul */ double pam_prot_freq[] = { 0.087 /* A */, 0.033 /* C */, 0.047 /* D */, 0.050 /* E */, 0.040 /* F */, 0.088 /* G */, 0.034 /* H */, 0.037 /* I */, 0.081 /* K */, 0.085 /* L */, 0.015 /* M */, 0.040 /* N */, 0.051 /* P */, 0.038 /* Q */, 0.041 /* R */, 0.070 /* S */, 0.058 /* T */, 0.065 /* V */, 0.010 /* W */, 0.030 /* Y */, }; /**********************************************************************/ /* reorder_matrix Reorder a matrix from the alphabet order in alpha1 to the alphabet order of alpha2. The letters in alph1 must be a superset of those in alpha2. Rows and columns corresponding to letters missing from alpha2 are discarded. Returns a reordered square matrix of size length(alpha2). */ /**********************************************************************/ MATRIX_T *reorder_matrix( const char *alpha1, /* current alphabet */ const char *alpha2, /* new alphabet; must be subset */ MATRIX_T *in_matrix /* matrix to reorder */ ) { int i, j; int alen1 = strlen(alpha1); int alen2 = strlen(alpha2); MATRIX_T *out_matrix; if (alen2 > alen1) die("The new alphabet %s must be a subset of the old alphabet %s.\n", alpha2, alpha1); out_matrix = allocate_matrix(alen2, alen2); for (i=0; i1; i--) { MATRIX_T *product = matrix_multiply(matrix, mul); SWAP(MATRIX_T*, product, matrix) free_matrix(product); } free_matrix(mul); /* convert to joint or logodds matrix: target: J_ij = Pr(i,j) = Mij pr(j) logodds: L_ij = log (Pr(i,j)/(Pr(i)Pr(j)) = log (Mij Pr(j)/Pr(i)Pr(j)) = log(Mij/pr(i)) */ for (i=0; icol_names); alpha = (char *)mm_malloc(sizeof(char) * (alen+1)); for (i=0; icol_names)[0]; alpha[i] = '\0'; *alpha1 = alpha; /* return alphabet */ return(rdb_matrix->matrix); } /* read_score_matrix */ /**********************************************************************/ /* get_score_matrix Get a letter substitution scoring matrix. If a filename is given, the scoring matrix is read from the file. Otherwise, a PAM matrix of the given type and evolutionary distance is generated. The matrix is reordered to conform to the standard alphabet. This alphabet must be a subset of the alphabet in the file (if the matrix was read from a file). Any rows and columns for letters not in the standard alphabet are discarded. Returns the matrix. */ /**********************************************************************/ MATRIX_T *get_score_matrix( char *score_filename, /* name of score file */ ALPH_T alph, /* alphabet */ int dist /* PAM distance (ignored if filename != NULL) */ ) { char *alpha1; /* initial alphabet */ MATRIX_T *tmp; /* temporary score matrix */ MATRIX_T *matrix; /* score matrix */ assert(alph == DNA_ALPH || alph == PROTEIN_ALPH); if (score_filename) { /* read matrix from file */ tmp = read_score_matrix(score_filename, &alpha1); /* reorder the matrix to standard alphabet */ matrix = reorder_matrix(alpha1, alph_string(alph), tmp); free_matrix(tmp); free(alpha1); } else { /* generate PAM matrix */ matrix = gen_pam_matrix(alph, dist, TRUE); } return(matrix); } /* get_score_matrix */ /**********************************************************************/ /* make_karlin_input Prepare the input required for karlin() from a scoring matrix and a letter frequency distribution. */ /**********************************************************************/ KARLIN_INPUT_T *make_karlin_input( MATRIX_T *matrix, /* scoring matrix */ ARRAY_T *probs /* letter freq distribution */ ) { int i, j; double escore; long lowest, highest; ARRAY_T *score_probs; int nscores; int alen = get_num_rows(matrix); /* size of alphabet */ KARLIN_INPUT_T *karlin_input; /* data to return */ /* find the highest and lowest scores in the scoring matrix */ lowest = 1; highest = -1; for (i=0; i highest) highest = s; } } if (lowest >= 0) die("Lowest score in scoring matrix must be negative, is %f.", (double)lowest); if (highest<= 0) die("Highest score in scoring matrix must be positve, is %f.", (double)highest); /* allocate the array of score probabilities and set to 0 */ nscores = highest - lowest + 1; score_probs = allocate_array(nscores); init_array(0, score_probs); /* compute the probabilities of different scores */ escore = 0; for (i=0; ilow = lowest; karlin_input->high = highest; karlin_input->escore = escore; karlin_input->prob = score_probs; return(karlin_input); } /* make_karlin_input */ /**********************************************************************/ /* convert_score_to_target Convert a scoring matrix to a target matrix. */ /**********************************************************************/ MATRIX_T *convert_score_to_target( MATRIX_T *score, /* score matrix */ ARRAY_T *prob /* letter frequencies */ ) { int i, j; KARLIN_INPUT_T *karlin_input; double lambda, K, H; MATRIX_T *target; /* target freq. matrix */ int alen = get_num_rows(score); /* alphabet length */ /* make input for karlin() */ karlin_input = make_karlin_input(score, prob); /* get lambda */ karlin(karlin_input->low, karlin_input->high, karlin_input->prob->items, &lambda, &K, &H); /*printf("lambda %f K %f H %f\n", lambda, K, H);*/ /* calculate target frequencies */ target = allocate_matrix(alen, alen); for (i=0; iprob); myfree(karlin_input); return(target); } /* convert_score_to_target */ /**********************************************************************/ /* get_subst_target_matrix Get a substitution target frequency matrix. */ /**********************************************************************/ MATRIX_T *get_subst_target_matrix( char *score_filename, /* name of score file */ ALPH_T alph, /* alphabet */ int dist, /* PAM distance (ignored if score_filename != NULL) */ ARRAY_T *back /* background frequencies of standard alphabet */ ) { MATRIX_T *score; /* score matrix */ MATRIX_T *target; /* target frequency matrix */ score = get_score_matrix(score_filename, alph, dist); target = convert_score_to_target(score, back); if (SUBST_MATRIX_DEBUG) { int i, j, alength=alph_size(alph, ALPH_SIZE); double sum; if (score_filename) { printf("From file %s\n", score_filename); } else { printf("Generated PAM %d\n", dist); } printf("%6c ", ' '); for (i=0; i Specify 0 for protein, 1 for DNA. Specify the integer PAM distance. */ /***********************************************************************/ VERBOSE_T verbosity = NORMAL_VERBOSE; main(int argc, char **argv) { int i, j, alength; int dist = 0; ALPH_T alph = PROTEIN_ALPH; char *score_filename = NULL; char *alpha; MATRIX_T *matrix; ARRAY_T *probs; double *freqs; KARLIN_INPUT_T *karlin_input; int nscores; double sum; char usage[1000] = ""; // Define the usage message. strcat(usage, "USAGE: subst_matrix [options] \n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --dna\n"); strcat(usage, " --dist \n"); strcat(usage, "\n"); // Parse the command line. while (1) { int c; int option_index = 0; const char* option_name; // Define command line options. static struct option long_options[] = { {"dna", 0, 0, 0}, {"dist", 1, 0, 0}, }; // Read the next option, and break if we're done. c = getopt_long_only(argc, argv, "+", long_options, &option_index); if (c == -1) { break; } else if (c != 0) { die("Invalid return from getopt (%d)\n", c); } // Get the option name (we only use long options). option_name = long_options[option_index].name; if (strcmp(option_name, "dna") == 0) { alph = DNA_ALPH; } else if (strcmp(option_name, "dist") == 0) { dist = atoi(optarg); } else { die("Invalid option (%s).\n", option_name); } } // Read the single required argument. if (optind + 1 != argc) { fprintf(stderr, usage); exit(1); } score_filename = argv[optind]; alength = alph_size(alph, ALPH_SIZE); /* background frequencies */ probs = allocate_array(alength); freqs = alph == DNA_ALPH ? pam_dna_freq : pam_prot_freq; fill_array(freqs, probs); /* copy freqs into ARRAY_T */ if (dist > 1) { printf("From gen_pam_matrix:\n"); matrix = gen_pam_matrix(alph, dist, FALSE); printf("%6c ", ' '); for (i=0; i