/************************************************************************** * FILE: log-hmm.c * AUTHOR: William Stafford Noble * CREATE DATE: 1-28-98 * PROJECT: MHMM * DESCRIPTION: Convert between a normal HMM and a log HMM. **************************************************************************/ #include #include #include "log-hmm.h" #include "utils.h" #include "mhmm-state.h" #include "array.h" #include "matrix.h" #include "write-mhmm.h" #include "subst-matrix.h" /* substitution matrix routines */ #include "alphabet.h" #ifndef LOG_DEBUG #define LOG_DEBUG 0 #endif /************************************************************************** * See .h file for description. **************************************************************************/ static void convert_to_from_log_state( ALPH_T alph, BOOLEAN_T to_log, BOOLEAN_T zero_spacer_emit_lo, // Set spacer emission log-odds = 0? ARRAY_T* background, // Background emission distribution. MATRIX_T* target_freq, // Target frequency matrix. double beta, // Weight on pseudocounts. MHMM_STATE_T* source_state, // The state to be converted. MHMM_STATE_T* target_state // The converted form of the given state. ) { double alpha = source_state->alpha; int i_emit; // If there is no background, just copy from emit to odds or vice versa. if (background == NULL) { if (to_log) { copy_array(source_state->emit, source_state->emit_odds); } else { copy_array(source_state->emit_odds, source_state->emit); } } // Compute odds for the emission distribution in the source state. else if (to_log) { ARRAY_T *g; // Pseudocount frequencies. // Get the pseudocount frequencies for modifying observed frequencies. g = get_pseudocount_freqs(alph, source_state->emit, background, target_freq); for (i_emit = 0; i_emit < alph_size(alph, ALPH_SIZE); i_emit++) { // Start and end states have unity odds; spacers, too, if requested. if ((source_state->type == START_STATE) || (source_state->type == END_STATE) || (zero_spacer_emit_lo && source_state->type == SPACER_STATE)) { set_array_item(i_emit, 1.0, source_state->emit_odds); } // Compute the adjusted odds. else { double fi = get_array_item(i_emit, source_state->emit); // observed freq double gi = get_array_item(i_emit, g); // pseudocount frequency double q = (alpha * fi + beta * gi)/(alpha + beta); // adjusted frequency double b = get_array_item(i_emit, background); // background frequency set_array_item(i_emit, q / b, source_state->emit_odds); } } free_array(g); // Free pseudocounts. } // to_log // Allocate dynamic space in the new state. if (to_log) { target_state->emit = allocate_array(alph_size(alph, ALL_SIZE)); target_state->emit_odds = allocate_array(alph_size(alph, ALL_SIZE)); target_state->itrans_out = (int *)mm_malloc(sizeof(int) * source_state->ntrans_out); target_state->trans_out = allocate_array(source_state->ntrans_out); target_state->itrans_in = (int *)mm_malloc(sizeof(int) * source_state->ntrans_in); target_state->trans_in = allocate_array(source_state->ntrans_in); } // Copy the state type. target_state->type = source_state->type; // Copy outgoing transitions. target_state->ntrans_out = source_state->ntrans_out; copy_int_array( source_state->ntrans_out, source_state->itrans_out, target_state->itrans_out ); convert_to_from_log_array( to_log, source_state->trans_out, target_state->trans_out ); // Copy incoming transitions. target_state->ntrans_in = source_state->ntrans_in; copy_int_array( source_state->ntrans_in, source_state->itrans_in, target_state->itrans_in ); convert_to_from_log_array( to_log, source_state->trans_in, target_state->trans_in ); // Copy the emission distribution. convert_to_from_log_array( to_log, source_state->emit, target_state->emit ); convert_to_from_log_array( to_log, source_state->emit_odds, target_state->emit_odds ); // Set the odds for the ambiguous characters as the (unweighted) average // of the odds for the characters they match. //FIXME: should be weighted average for statistics to really be kosher. calc_ambigs(alph, FALSE, target_state->emit_odds); // Copy the remaining stuff. target_state->i_motif = source_state->i_motif; target_state->w_motif = source_state->w_motif; strcpy(target_state->motif_id, source_state->motif_id); target_state->i_position = source_state->i_position; target_state->id_char = source_state->id_char; if (LOG_DEBUG) { for (i_emit = 0; i_emit < alph_size(alph, ALL_SIZE); i_emit++) { printf("%5.3f ", get_array_item(i_emit, target_state->emit_odds)); } printf("\n"); } } /************************************************************************** Set the gap opening and extension costs in the adjacency lists and transition matrix. extend ---> open/2 | | open/2 ----->spacer-----> **************************************************************************/ static void set_gap_costs( double gap_open, // Cost to open a gap. double gap_extend, // Cost to extend a gap. MHMM_T* hmm) // The hmm. { int i_state, i; double cost; for (i_state = 0; i_state < hmm->num_states; i_state++) { MHMM_STATE_T* state = hmm->states + i_state; // Set spacer incoming transitions to -gap_open/2 if it is >= 0 // and -gap_extend if it is a self-loop. for (i=0; intrans_in && (gap_open >= 0 || gap_extend >= 0); i++) { int j_state = state->itrans_in[i]; // from state if (i_state != j_state && gap_open < 0) continue; // only do self-loop? if (state->type == SPACER_STATE) { // to state is spacer cost = (i_state==j_state) ? -gap_extend : -gap_open/2; // self-loop? } else if (hmm->states[j_state].type == SPACER_STATE) { // from state is spacer cost = -gap_open/2; } else { // neither is spacer continue; } set_array_item(i, cost, state->trans_in); // set adj list set_matrix_cell(j_state, i_state, cost, hmm->trans); // set xition matrix } // incoming state // Set self-loop transitions to -gap_extend if gap_extend >= 0 // and set rest of outgoing spacer transitions to -gap_open/2. for (i=0; intrans_out && (gap_open >= 0 || gap_extend >= 0); i++) { int j_state = state->itrans_out[i]; // to state if (state->type == SPACER_STATE) { // from state is spacer cost = (i_state==j_state) ? -gap_extend : -gap_open/2; // self-loop? } else if (hmm->states[j_state].type == SPACER_STATE) { // to state is spacer cost = -gap_open/2; // rest of open cost } else { // neither is spacer continue; } set_array_item(i, cost, state->trans_out); // set adj list set_matrix_cell(i_state, j_state, cost, hmm->trans); // set xition matrix } // incoming state } // i_state } // set_gap_costs /************************************************************************** * Convert an HMM to or from log form. **************************************************************************/ void convert_to_from_log_hmm (BOOLEAN_T to_log, // Convert to log form? BOOLEAN_T zero_spacer_emit_lo, // Set spacer emission log-odds = 0? double gap_open, // Cost to open a gap; ignored if < 0 double gap_extend, // Cost to extend a gap; ignored if < 0 ARRAY_T* background, // The background distribution. char * sc_filename, // Name of score file (ignored if background NULL). int pam_dist, // PAM distance (ignored if sc_filename not NULL). double beta, // Weight on pseudocounts. MHMM_T* source_hmm, // The HMM to convert. MHMM_T** target_hmm) // The same HMM in log/log-odds form { int i_state; /* Index of the current state. */ MATRIX_T *target_freq = NULL; /* The target frequency matrix */ ALPH_T alph; alph = source_hmm->alph; /* Allocate space for the new HMM. */ if (to_log) { allocate_mhmm(source_hmm->alph, source_hmm->num_states, target_hmm); } // Copy the header information. (*target_hmm)->type = source_hmm->type; (*target_hmm)->log_odds = to_log; (*target_hmm)->num_motifs = source_hmm->num_motifs; (*target_hmm)->num_states = source_hmm->num_states; (*target_hmm)->num_spacers = source_hmm->num_spacers; (*target_hmm)->spacer_states = source_hmm->spacer_states; (*target_hmm)->alph_size = source_hmm->alph_size; (*target_hmm)->ambigs = source_hmm->ambigs; strcpy((*target_hmm)->alphabet, source_hmm->alphabet); // Create the substitution target frequency matrix if background given. if (background != NULL) { int asize = source_hmm->alph_size; char *alphabet = source_hmm->alphabet; char save_char = alphabet[asize]; alphabet[asize] = '\0'; // remove ambigs target_freq = get_subst_target_matrix(sc_filename, alph, pam_dist, background); alphabet[asize] = save_char; // replace ambigs } // target frequency matrix // Copy the states, converting to logs along the way. for (i_state = 0; i_state < source_hmm->num_states; i_state++) { (source_hmm)->states[i_state].alpha = 20; if (LOG_DEBUG) { printf("i_motif: %d i_position: %d\n", (source_hmm)->states[i_state].i_motif, (source_hmm)->states[i_state].i_position); } convert_to_from_log_state(alph, to_log, zero_spacer_emit_lo, background, target_freq, beta, &(source_hmm->states[i_state]), &((*target_hmm)->states[i_state])); } // Copy the background distribution, and convert to logs. convert_to_from_log_array(to_log, source_hmm->background, (*target_hmm)->background); // Copy the transition matrix, converting to logs along the way. convert_to_from_log_matrix(to_log, source_hmm->trans, (*target_hmm)->trans); // Set the transition costs to -gap_open and -gap_extend if // these are >= 0 and converting to logs. if (to_log && (gap_open >= 0 || gap_extend >= 0)) set_gap_costs(gap_open, gap_extend, *target_hmm); // Copy the hot_states. (*target_hmm)->hot_states = NULL; if (source_hmm->num_hot_states > 0) { mm_resize((*target_hmm)->hot_states, source_hmm->num_hot_states, int); } (*target_hmm)->num_hot_states = source_hmm->num_hot_states; if (LOG_DEBUG) { write_mhmm(QUIET_VERBOSE, *target_hmm, stderr); } // Free local dynamic memory. free_matrix(target_freq); } #ifdef MAIN #include "simple-getopt.h" #include "read-mhmm.h" VERBOSE_T verbosity = INVALID_VERBOSE; /************************************************************************** * int main **************************************************************************/ int main (int argc, char *argv[]) { char * hmm_filename= NULL; // File containing the HMM. FILE * hmm_file; MHMM_T * the_hmm; /* The HMM itself. */ MHMM_T * the_log_hmm; /* The HMM, with probs converted to logs. */ // Define command line options. cmdoption const options[] = { {"verbosity", REQUIRED_VALUE} }; int option_count = 1; int option_index = 0; // Define the usage message. char usage[1000] = ""; strcat(usage, "USAGE: log-hmm [options] \n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --verbosity 1|2|3|4|5 (default=2)\n"); strcat(usage, "\n"); simple_setopt(argc, argv, option_count, options); // Parse the command line. while (1) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { simple_getopterror(&message); die("Error process command line options (%s)\n", message); } if (strcmp(option_name, "verbosity") == 0) { verbosity = (VERBOSE_T)atoi(option_value); } } // Read the single required argument. if (option_index + 1 != argc) { fprintf(stderr, "%s", usage); exit(1); } hmm_filename = argv[option_index]; /* Read the model. */ read_mhmm(hmm_filename, &the_hmm); /* Convert back and forth. */ convert_to_from_log_hmm(TRUE, // To log form. FALSE, // Set spacer emissions to zero? -1, // Ignore gap_open. -1, // Ignore gap_extend. the_hmm->background, NULL, // No score file. 250.0, // PAM distance. 10, // beta ? the_hmm, &the_log_hmm); convert_to_from_log_hmm(FALSE, // From log form. FALSE, // Set spacer emissions to zero? -1, // Ignore gap_open. -1, // Ignore gap_extend. the_hmm->background, NULL, // No score file. 250.0, // PAM distance. 10, // beta the_log_hmm, &the_hmm); /* Print the HMM. */ write_mhmm(verbosity, the_hmm, stdout); /* Free dynamic memory. */ free_mhmm(the_hmm); free_mhmm(the_log_hmm); return(0); } #endif /* MAIN */