/************************************************************************** * FILE: dp.c * AUTHOR: William Stafford Noble, Timothy L. Bailey * CREATE DATE: 5/20/02 * PROJECT: MHMM * VERSION: $Revision: 1.1.1.1 $ * COPYRIGHT: 1998-2002, WNG, 2001, TLB * DESCRIPTION: Core dynamic programming algorithms for Meta-MEME. **************************************************************************/ #include #include #include #include #include #include #include #include #include "utils.h" // Generic utilities. #include "matrix.h" // Routines for floating point matrices. #include "array.h" // Routines for floating point arrays. #include "metameme.h" // Global metameme functions. #include "alphabet.h" // The amino acid / nucleotide alphabet. #include "seq.h" // Biosequence data structure. #include "mhmm-state.h" // HMM data structure. #include "read-mhmm.h" // HMM input/output. #include "log-hmm.h" // HMM log/log-odds conversion. #include "fitevd.h" // Extreme value distribution routines. #include "match.h" // Find a match in the traceback matrix. #include "pssm.h" // Needed for computing motif scores. #include "dp.h" #define FIRST_COL 1 // First "real" (non-padding) sequence column. /************************************************************************** * Print the dp and traceback matrices for debug purposes. **************************************************************************/ void print_matrices( int num_hot_states, int* hot_states, int seq_length, MATRIX_T* dp_matrix, MATRIX_T* trace_matrix ) { int i,j,k; for (k=-1; knum_states; BOOLEAN_T motif_scoring; int num_hot_states = the_log_hmm->num_hot_states; int* hot_states = the_log_hmm->hot_states; double best_score; int best_row; int in_index; motif_scoring = (motif_score_matrix != NULL); // The matrices should start life with safe values init_matrix(LOG_ZERO, dp_matrix); // Minimal score init_matrix(-1, trace_matrix); // Invalid state // Initialize the first column. set_matrix_cell(0, 0, 0.0, dp_matrix); set_matrix_cell(0, 0, -1, trace_matrix); for (i_row = 1; i_row < num_states; i_row++) { set_matrix_cell(i_row, 0, LOG_ZERO, dp_matrix); set_matrix_cell(i_row, 0, -1, trace_matrix); } // Consider each position in the sequence. for (i_col = 1; i_col < seq_length; i_col++) { // Start state has no predecessors. set_matrix_cell(0, i_col, local ? 0 : LOG_ZERO, dp_matrix); set_matrix_cell(0, i_col, -1, trace_matrix); // Consider each (hot) state in the model. for (i_hot = 0; i_hot < num_hot_states; i_hot++) { MHMM_STATE_T* this_state; // The current state in the model. PROB_T emit_logprob; // Log prob of emitting the current // character at the current position. int start_col; // Index of starting column of // "motif" in dp matrix. // Set current row. i_row = hot_states[i_hot]; // Get the current model state. this_state = &(the_log_hmm->states[i_row]); // Ignore start and mid-motif states if doing motif-scoring. if (motif_scoring && ((this_state->type == START_MOTIF_STATE) || (this_state->type == MID_MOTIF_STATE))) { continue; } // // Compute the emission cost at the current position. // Adjust this_state to point to start of motif if doing motif-scoring. // Skip this state if the motif would hang off left edge of sequence. // if (motif_scoring && this_state->type == END_MOTIF_STATE) { int w = this_state->w_motif; start_col = i_col-w+1; // Start pos in sequence. if (start_col < FIRST_COL) continue; // Motif doesn't fit. this_state -= (w - 1); // Start of motif. emit_logprob = get_matrix_cell(this_state->i_motif, start_col, motif_score_matrix); } else { start_col = i_col; // Start pos in sequence. emit_logprob = get_array_item(int_sequence[i_col], this_state->emit_odds); } // Find the preceding state with the highest score + transition cost. best_score = LOG_ZERO; best_row = -1; for (in_index = 0; in_index < this_state->ntrans_in; in_index++) { double trans; // Transition cost to get here. double prv_score; // Best score in previous state. double score; // Current score. int prv_row; // Index of the previous state. int prv_col; // Index of the previous seq. pos. // // Get previous state from list of in-transitions to current state. // // Previous state index. prv_row = this_state->itrans_in[in_index]; // Previous sequence position. prv_col = start_col - 1; // Transition cost to get here. trans = get_array_item(in_index, this_state->trans_in); // Previous state score. prv_score = get_matrix_cell(prv_row, prv_col, dp_matrix); // Total score to get here. score = prv_score + trans; // // Save best predecessor. // if (score > best_score) { best_score = score; best_row = prv_row; } } // preceding state // Compute the score for this cell. if (local && best_score < 0.0) { // Local and no positive predecessor. best_score = emit_logprob > 0 ? emit_logprob : 0; best_row = -1; } else { // Positive predecessor or non-local. best_score += emit_logprob; } set_matrix_cell(i_row, i_col, best_score, dp_matrix); set_matrix_cell(i_row, i_col, best_row, trace_matrix); } // i_row } // i_col // Traceback and scoring. return traceback_and_scoring( local ? LOCAL_VITERBI_SEARCH : GLOBAL_VITERBI_SEARCH, compute_trace, num_states, seq_length, the_log_hmm, motif_scoring, dp_matrix, trace_matrix, this_match ); } /************************************************************************** * Compute the total likelihood of a sequence, given a model. **************************************************************************/ #ifndef FORWARD_DEBUG #define FORWARD_DEBUG 0 #endif void forward_algorithm (BOOLEAN_T local, // Perform local scoring? (else global) int* int_sequence, // Sequence to be scored, int format. int seq_length, // The length of the sequence. MHMM_T * the_log_hmm, // An HMM in log format. MATRIX_T* dp_matrix, // Dynamic programming matrix. MATCH_T* this_match) // Only stores total likelihood of sequence. { int i_row; // Index of current row in dp matrix. int i_col; // Index of current column in dp matrix. const int num_states = the_log_hmm->num_states; // Number of states in HMM. //FIXME: handle motif scoring here! BOOLEAN_T motif_scoring = FALSE; // Initialize the first column to zeroes. for (i_row = 1; i_row < num_states; i_row++) { set_matrix_cell(i_row, 0, LOG_ZERO, dp_matrix); } // Initialize the upper left corner to one. set_matrix_cell(0, 0, 0.0, dp_matrix); // = LOG2(1.0) // Consider each position in the sequence. for (i_col = 0; i_col < seq_length - 1; i_col++) { // Index of current character in alphabet. int this_char_index = int_sequence[i_col]; // Initialize the next column to small numbers. for (i_row = 0; i_row < num_states; i_row++) { set_matrix_cell(i_row, i_col + 1, LOG_ZERO, dp_matrix); } // Consider each state in the model. for (i_row = 0; i_row < num_states; i_row++) { MHMM_STATE_T* this_state = &(the_log_hmm->states[i_row]); // Don't bother with unreachable states. if (get_matrix_cell(i_row, i_col, dp_matrix) <= LOG_SMALL) { continue; } // If we're in a motif, there is no transition cost. if ((this_state->type == START_MOTIF_STATE) || (this_state->type == MID_MOTIF_STATE)) { set_matrix_cell( i_row + 1, i_col + 1, get_matrix_cell(i_row, i_col, dp_matrix), dp_matrix ); } else { int i_trans; // Index of current transition out of this state. // Added during optimization. const int ntrans_out = this_state->ntrans_out; const int* itrans_out = this_state->itrans_out; ARRAY_T* trans_out = this_state->trans_out; // Add up the contributions from this state to the next column. for (i_trans = 0; i_trans < ntrans_out; i_trans++) { // Figure out where we're going to transition to. const int next_state = itrans_out[i_trans]; // Figure out the log probability of reaching that state. const PROB_T total_logprob = get_matrix_cell(i_row, i_col, dp_matrix) + get_array_item(i_trans, trans_out); // Store the cost in the next column. set_matrix_cell(next_state, i_col + 1, LOG_SUM(get_matrix_cell(next_state, i_col + 1, dp_matrix), total_logprob), dp_matrix); if (FORWARD_DEBUG) { fprintf(stderr, "Adding %g (= %g + %g) for transition from %d to %d.\n", total_logprob, get_matrix_cell(i_row, i_col, dp_matrix), get_array_item(i_trans, trans_out), i_row, next_state); } } } } // Add the emission probabilities to the next column. this_char_index = int_sequence[i_col + 1]; if (i_col + 1 != seq_length - 1) { // Skip flanking Xs for (i_row = 0; i_row < num_states; i_row++) { MHMM_STATE_T* this_state = &(the_log_hmm->states[i_row]); PROB_T this_emit = get_array_item(this_char_index, this_state->emit_odds); incr_matrix_cell(i_row, i_col + 1, this_emit, dp_matrix); if (FORWARD_DEBUG) { fprintf(stderr, "Adding %g for char %d in [%d][%d] -> %g.\n", this_emit, this_char_index, i_row, i_col + 1, get_matrix_cell(i_row, i_col + 1, dp_matrix)); } // Set negative values to zero if we're doing local scoring. if ((local) && (get_matrix_cell(i_row, i_col + 1, dp_matrix) < 0.0)) { set_matrix_cell(i_row, i_col + 1, 0.0, dp_matrix); if (FORWARD_DEBUG) { fprintf(stderr, "Setting negative value to zero at [%d][%d].\n", i_row, i_col + 1); } } } } } // Compute the sum of the last column. traceback_and_scoring( local ? LOCAL_TOTAL_SEARCH : GLOBAL_TOTAL_SEARCH, FALSE, // Don't compute traceback. num_states, seq_length, the_log_hmm, motif_scoring, dp_matrix, NULL, this_match ); } /************************************************************************** * Compute the backward probability matrix. **************************************************************************/ void backward_algorithm (const int * int_sequence, // Sequence to be scored, int format. const int seq_length, // The length of the sequence. const MHMM_T * the_log_hmm, /* An HMM in which the distributions have already been converted to log format. */ MATRIX_T* dp_matrix) // Dynamic programming matrix. { int i_row; // Index of current row in dp matrix. int i_col; // Index of current column in dp matrix. int i_trans; /* Index of current transition out of this state. */ int this_char_index; // Index of current character in alphabet. MHMM_STATE_T * this_state; // The current state in the model. int prev_state; // Index of the previous state. PROB_T emit_logprob; /* Log prob of emitting the current character at the current position. */ PROB_T before_logprob; // emit_logprob + current_logprob. PROB_T total_logprob; /* before_logprob + log prob of transition from current state to the next state. */ // Added during optimization. const int num_states = the_log_hmm->num_states; int ntrans_in; int * itrans_in; ARRAY_T* trans_in; // Initialize the last column to 1.0 (0.0 in log space). for (i_row = 0; i_row < num_states; i_row++) { set_matrix_cell(i_row, seq_length-1, 0.0, dp_matrix); } // Consider each position in the sequence. for (i_col = seq_length-1; i_col > 0; i_col--) { this_char_index = int_sequence[i_col]; // Initialize the preceding column to small numbers. for (i_row = 0; i_row < num_states; i_row++) { set_matrix_cell(i_row, i_col-1, LOG_ZERO, dp_matrix); } // Consider each state in the model. for (i_row = 0; i_row < num_states; i_row++) { // Don't bother with unreachable states. if (get_matrix_cell(i_row, i_col, dp_matrix) <= LOG_SMALL) { continue; } // Compute the emission cost at the current position. this_state = &(the_log_hmm->states[i_row]); if (i_col == seq_length-1) { emit_logprob = 0.0; // Skip flanking Xs } else { emit_logprob = get_array_item(this_char_index, this_state->emit_odds); } /* Compute the total cost at this position after emitting but before transitioning. */ before_logprob = get_matrix_cell(i_row, i_col, dp_matrix) + emit_logprob; // If we're in a motif, there is no transition cost. if ((this_state->type == END_MOTIF_STATE) || (this_state->type == MID_MOTIF_STATE)) { set_matrix_cell(i_row - 1, i_col - 1, before_logprob, dp_matrix); } else { // Add up the contributions from this state to the next column. ntrans_in = this_state->ntrans_in; itrans_in = this_state->itrans_in; trans_in = this_state->trans_in; for (i_trans = 0; i_trans < ntrans_in; i_trans++) { // Figure out where we're transitioning from. prev_state = itrans_in[i_trans]; // Figure out the log probability of reaching that state. total_logprob = before_logprob + get_array_item(i_trans, trans_in); // Store the cost in the previous column. if (get_matrix_cell(prev_state, i_col-1, dp_matrix) <= LOG_SMALL) { set_matrix_cell(prev_state, i_col-1, total_logprob, dp_matrix); } else { set_matrix_cell(prev_state, i_col-1, LOG_SUM(get_matrix_cell(prev_state, i_col-1, dp_matrix), total_logprob), dp_matrix); } } } } } } /************************************************************************** * If the forward and backward matrices are correct, then the dot * product of corresponding columns in the two matrices should be the * same for each state. This function verifies that equality. **************************************************************************/ #define SLOP 1E-6 BOOLEAN_T verify_matrices (int num_states, int max_length, PROB_T forward_score, MATRIX_T* forward_matrix, MATRIX_T* backward_matrix) { PROB_T this_value; // (Forward * backward) for a particular cell. PROB_T this_total; // Sum of (forward * backward) for one column. int i_seq; // Loop index. int i_state; // Loop index. for (i_seq = 0; i_seq < max_length; i_seq++) { // Add up (forward * backward) for each state. this_total = LOG_ZERO; for (i_state = 0; i_state < num_states; i_state++) { this_value = get_matrix_cell(i_state, i_seq, forward_matrix) + get_matrix_cell(i_state, i_seq, backward_matrix); this_total = LOG_SUM(this_total, this_value); } // Check to see if the result is close to what we expect. if (!almost_equal(forward_score, this_total, SLOP)) { fprintf(stderr, "Matrix error for position %d: difference=%.4g.\n", i_seq, forward_score - this_total); abort(); return(FALSE); } } return(TRUE); } /************************************************************************** * Find repeated matches of a model to a sequence. * * Let model have m states. * Let w(i) be the width of state i (= 1 for gap, else motif width). * let ps(i) be the predecessors of state i. * Let t(i,j) be the transition COST from i to j. * Let e(i,j) be the emission SCORE for aligning state i ENDING at position j. * * F(i,0) = 0, i= 1 to m * * The following are applied column-by-column, left to right, sequentially: * * F(0,j) = max F(0,j-1) * F(i,j-1) - R, i= 1 to m * * F(i,j) = max F(0,j-w(i)+1) + e(i,j) * F(k,j-w(i)) - t(k,i) + e(i,j), where k in ps(i) * **************************************************************************/ void repeated_match_algorithm (PROB_T dp_threshold, // Threshold for start of new match. int* int_sequence, // The sequence to be scored, int format. int seq_length, // The length of the sequence. MHMM_T* the_log_hmm, // An HMM in scaled log format. MATRIX_T* motif_score_matrix, // Position-specific scores for each motif. MATRIX_T* dp_matrix, // Dynamic programming matrix. MATRIX_T* trace_matrix, // Traceback matrix. MATCH_T* this_match) // Most probable path through model. { int i_hot; // Index of current hot state. int i_row; // Index of current row in dp matrix. int i_col; // Index of current column in dp matrix. int in_index; BOOLEAN_T motif_scoring = (motif_score_matrix != NULL); int num_hot_states = the_log_hmm->num_hot_states; int* hot_states = the_log_hmm->hot_states; const int num_states = the_log_hmm->num_states; // Initialize the upper left corner to one. set_matrix_cell(0, 0, 0.0, dp_matrix); // = LOG2(1.0) // Initialize the first column to 0. // Note: this column corresponds to the padding "X" at a start of // sequence. for (i_row = 1; i_row < num_states; i_row++) { set_matrix_cell(i_row, 0, LOG_ZERO, dp_matrix); set_matrix_cell(i_row, 0, 0, trace_matrix); } // // Consider each position in the sequence, starting with the second // position. // for (i_col = FIRST_COL; i_col <= seq_length; i_col++) { PROB_T top_row_value; // Value in the top row. int top_row_index; // Index of row from which top-row value came. const int prv_col = i_col - 1; // Previous column. // // Extend an unmatched region or end the current match if // it exceeds the DP threshold. // // F(0,j) = max F(0,j-1) // F(i,j-1) - R, i= 1 to m // Best score if previous column not in a match, too. top_row_value = get_matrix_cell(0, prv_col, dp_matrix); top_row_index = 0; // Get best score if a match ended in the previous column. for (i_hot=0; i_hotstates[i_row]); // // Compute the emission score at the current position. // If motif-scoring: // 1) Adjust this_state to point to start of motif state. // 2) Adjust start_col to point to start of motif in sequence. // 3) Skip this state if the motif would hang off left edge of sequence. // if (motif_scoring && this_state->type == END_MOTIF_STATE) { int w = this_state->w_motif; start_col = i_col-w+1; // Start pos in sequence. // Check to be sure motif fits. if (start_col < FIRST_COL) continue; this_state -= (w - 1); // Start of motif. emit_logprob = get_matrix_cell(this_state->i_motif, start_col, motif_score_matrix); } else { start_col = i_col; emit_logprob = get_array_item(int_sequence[i_col], this_state->emit_odds); } // // F(i,j) = max F(0,j-w(i)+1) + e(i,j) // F(k,j-w(i)) - t(k,i) + e(i,j), where k in ps(i) // no_match_score = get_matrix_cell(0, start_col, dp_matrix); top_row_value = no_match_score; top_row_index = 0; for (in_index = 0; in_index < this_state->ntrans_in; in_index++) { double trans; // Transition cost to get here. double prv_score; // Best score in previous state. double score; // Current score. int prv_row; // Index of the previous state. int prv_col; // Index of the previous seq. pos. // // Get previous state from list of in-transitions to current state. // // Previous state index. prv_row = this_state->itrans_in[in_index]; // Previous sequence position. prv_col = start_col - 1; // Previous state score. prv_score = get_matrix_cell(prv_row, prv_col, dp_matrix); // Transition cost to get here. trans = get_array_item(in_index, this_state->trans_in); // Total score to get here. score = prv_score + trans; // // Save best predecessor. // if (score > top_row_value) { top_row_value = score; top_row_index = prv_row; } } // preceding state top_row_value += emit_logprob; // Set this cell's values. set_matrix_cell(i_row, i_col, top_row_value, dp_matrix); set_matrix_cell(i_row, i_col, top_row_index, trace_matrix); } // i_hot } // i_col // Traceback and scoring. traceback_and_scoring( REPEATED_MATCH_SEARCH, TRUE, // Compute the trace. num_states, seq_length, the_log_hmm, motif_scoring, dp_matrix, trace_matrix, this_match ); } // repeated_match