/************************************************************************* * FILE: read-mhmm.c * AUTHOR: William Stafford Noble * CREATE DATE: 8-12-97 * PROJECT: MHMM * COPYRIGHT: 1997-2008, WSN * DESCRIPTION: Functions for reading HMMs in MHMM format. *************************************************************************/ #include #include #include #include #include #include #include #include #include #include "build-hmm.h" #include "mhmm-state.h" #include "alphabet.h" #include "array.h" #include "read-mhmm.h" #include "utils.h" #include "xml-util.h" /************************************************************************* * Read a single HMM state in MHMM format. *************************************************************************/ #define BUF_LENGTH 100 static void read_mhmm_state( ALPH_T alph, FILE * const model_file, const int i_state, MHMM_STATE_T * this_state ) { int state_num; /* State index read from file. */ int asize; /* Number of characters in the alphabet. */ int i_alph; /* Index of the current letter. */ int num_scan; /* Number of items scanned by fscanf(). */ char buffer[BUF_LENGTH]; /* Buffer for reading strings from the file. */ ATYPE this_emit; /* Read the state index and make sure it's the right one. */ num_scan = fscanf(model_file, " State: %d", &state_num); assert(num_scan == 1); assert(state_num == i_state); #ifdef NDEBUG num_scan = i_state; /* Avoid compiler warning. */ #endif /* Read other state characteristics. */ num_scan = fscanf(model_file, " type: %s", &(buffer[0])); assert(num_scan == 1); this_state->type = convert_enum_type_str(buffer, INVALID_STATE, STATE_STRS, NUM_STATE_T); num_scan = fscanf(model_file, " i_motif: %d", &(this_state->i_motif)); assert(num_scan == 1); num_scan = fscanf(model_file, " motif_id: %s", this_state->motif_id); assert(num_scan == 1); num_scan = fscanf(model_file, " i_position: %d", &(this_state->i_position)); assert(num_scan == 1); num_scan = fscanf(model_file, " id_char: %c", &(this_state->id_char)); assert(num_scan == 1); assert(this_state->id_char != '\0'); num_scan = fscanf(model_file, " num_sites: %lf", &(this_state->num_sites)); assert(num_scan == 1); /* Allocate space for the emission distributions. */ this_state->emit = allocate_array(alph_size(alph, ALL_SIZE)); this_state->emit_odds = allocate_array(alph_size(alph, ALL_SIZE)); /* Read emission distribution from non-start-or-end states. */ asize = alph_size(alph, ALPH_SIZE); if ((this_state->type != START_STATE) && (this_state->type != END_STATE)) { fscanf(model_file, " emit: "); for (i_alph = 0; i_alph < asize; i_alph++) { num_scan = fscanf(model_file, PROB_SCAN, &this_emit); set_array_item(i_alph, this_emit, this_state->emit); assert(num_scan == 1); } /* Normalize the probabilities. */ normalize(0.00001, this_state->emit); /* Calculate probabilities for ambiguous characters. */ calc_ambigs(alph, FALSE, this_state->emit); } /* Assign 1.0 probabilities to start and end emissions. */ else { for (i_alph = 0; i_alph < alph_size(alph, ALL_SIZE); i_alph++) { set_array_item(i_alph, 1.0, this_state->emit); } } } /************************************************************************* * Read an HMM transition matrix in MHMM format. *************************************************************************/ static void read_mhmm_trans( FILE * model_file, const int num_states, MATRIX_T* trans ) { int num_rows; /* Number of rows and columns (equal). */ int num_cols; int i_row; /* Row and column indices. */ int i_col; int num_scan; /* Number of items scanned by fscanf(). */ #ifndef NO_RENORMALIZE int one_col; /* Index of the column in which 1.0 appears. */ MTYPE row_sum; /* Sum of given row (except for the 1.0 entry). */ BOOLEAN_T found_one; /* Does this column contain an entry of 1.0? */ BOOLEAN_T found_nonzero; /* ... or an entry other than 0.0? */ #endif MTYPE matrix_item; num_scan = fscanf(model_file, " Transition probability matrix (%d x %d):", &num_rows, &num_cols); assert(num_scan == 2); assert((num_states == num_rows) && (num_states == num_cols)); /* Read the matrix. */ for (i_row = 0; i_row < num_states; i_row++) { for (i_col = 0; i_col < num_states; i_col++) { num_scan = fscanf(model_file, PROB_SCAN, &matrix_item); set_matrix_cell(i_row, i_col, matrix_item, trans); assert(num_scan == 1); } } #ifndef NO_RENORMALIZE /* Values close to 1.0 get easily pushed to 1.0, so fix 'em. */ for (i_row = 0; i_row < num_states; i_row++) { /* Find out if there's a 1.0 and something nonzero in this row. */ found_one = FALSE; found_nonzero = FALSE; one_col = 0; /* To prevent compiler warning. */ for (i_col = 0; i_col < num_states; i_col++) { if (get_matrix_cell(i_row, i_col, trans) == 1.0) { found_one = TRUE; one_col = i_col; } else if (get_matrix_cell(i_row, i_col, trans) != 0.0) { found_nonzero = TRUE; } if (found_one && found_nonzero) { break; } } /* If so, add up everything else and subtract it from the 1.0 entry. */ if (found_one && found_nonzero) { fprintf( stderr, "Renormalizing row %d of the transition matrix: ", i_row ); row_sum = 0.0; for (i_col = 0; i_col < num_states; i_col++) { if (i_col != one_col) { row_sum += get_matrix_cell(i_row, i_col, trans); } } incr_matrix_cell(i_row, one_col, -row_sum, trans); fprintf( stderr, "trans[%d,%d]=%g\n", i_row, one_col, get_matrix_cell(i_row, one_col, trans) ); } } #endif num_scan = fscanf(model_file, " End of MHMM"); assert(num_scan == 0); } /*********************************************************************** * Read the emission probabilities for a state from XML. * Return value: pointer to ARRAY_T structure containing * emission probabilities. ***********************************************************************/ static void read_state_emission_prob_from_xml( ALPH_T alph, xmlXPathContextPtr xpath_ctxt, ARRAY_T* emission_probs, int state_index // chosen state ) { const int MAX_XPATH_EXPRESSION = 200; char xpath_expression[MAX_XPATH_EXPRESSION]; int asize = alph_size(alph, ALPH_SIZE); int i_alph = 0; // Build the XPATH expression to get emission probabilities for chosen state. snprintf( xpath_expression, MAX_XPATH_EXPRESSION, "/MEME_HMM/states/state[@index='%d']/" "emission_probabilities/alphabet_array/value", state_index ); // Count the number of emission probabilities for the state. xmlXPathObjectPtr xpath_obj = xpath_query(xpath_ctxt, xpath_expression); int num_values = (xpath_obj->nodesetval ? xpath_obj->nodesetval->nodeNr : 0); xmlXPathFreeObject(xpath_obj); // Start and end states won't have any emissions in XML. if (num_values > 0) { // If not zero, the number of emission probabilities // should match the alphabet size. assert(num_values == asize); // XML doesn't enforce any order on the emission probability values, // so force reading emission probability values in alphabet order. xmlNodePtr currValueNode = NULL; for (i_alph = 0; i_alph < asize; i_alph++) { // Build the XPATH expression to get emission probability for // a character in the chosen state. snprintf( xpath_expression, MAX_XPATH_EXPRESSION, "/MEME_HMM/states/state[@index='%d']/" "emission_probabilities/alphabet_array/value[@letter_id='letter_%c']", state_index, alph_char(alph, i_alph) ); // Decode from node set to numeric value for emission prob. xpath_obj = xpath_query(xpath_ctxt, xpath_expression); currValueNode = xpath_obj->nodesetval->nodeTab[0]; ATYPE value = xmlXPathCastNodeToNumber(currValueNode); set_array_item(i_alph, value, emission_probs); xmlXPathFreeObject(xpath_obj); } /* Normalize the probabilities. */ normalize(0.00001, emission_probs); /* Calculate probabilities for ambiguous characters. */ calc_ambigs(alph, FALSE, emission_probs); } else { /* Assign 1.0 probabilities to start and end emissions. */ for (i_alph = 0; i_alph < alph_size(alph, ALL_SIZE); i_alph++) { set_array_item(i_alph, 1.0, emission_probs); } } } /*********************************************************************** * Read the transition probabilities from XML. * transition probabilities. ***********************************************************************/ static void read_mhmm_trans_from_xml( xmlXPathContextPtr xpath_ctxt, MATRIX_T* trans, int from_state_index // chosen state ) { const int MAX_XPATH_EXPRESSION = 200; char xpath_expression[MAX_XPATH_EXPRESSION]; // Build the XPATH expression to get emission probabilities for chosen state. snprintf( xpath_expression, MAX_XPATH_EXPRESSION, "/MEME_HMM/states/state[@index='%d']/transition", from_state_index ); // Count the number of transition probabilities for the state. xmlXPathObjectPtr xpath_obj = xpath_query(xpath_ctxt, xpath_expression); int num_transitions = (xpath_obj->nodesetval ? xpath_obj->nodesetval->nodeNr : 0); // Some states won't have any transtion probabilities. if (num_transitions > 0) { xmlNodePtr transition_node = NULL; int i_transition = 0; for (i_transition = 0; i_transition < num_transitions; i_transition++) { transition_node = xpath_obj->nodesetval->nodeTab[i_transition]; xmlChar* property = read_xml_node_property(transition_node, "to_state_index"); int to_state_index = atoi((char *) property); xmlFree(property); ATYPE value = xmlXPathCastNodeToNumber(transition_node); set_matrix_cell(from_state_index, to_state_index, value, trans); } } xmlXPathFreeObject(xpath_obj); } /*********************************************************************** * Read the states from XML. ***********************************************************************/ static void read_mhmm_states_from_xml( ALPH_T alph, xmlXPathContextPtr xpath_ctxt, MHMM_T* the_mhmm ) { xmlXPathObjectPtr xpath_obj = NULL; xmlChar* property = NULL; // Select all states xpath_obj = xpath_query(xpath_ctxt, "/MEME_HMM/states/state"); xmlNodePtr curr_state = NULL; int i_state = 0; for(i_state = 0; i_state < the_mhmm->num_states; i_state++) { curr_state = xpath_obj->nodesetval->nodeTab[i_state]; if (curr_state == NULL) { die("Error: missing state %d\n", i_state); } // Get the state index attribute. // This will almost certainly be the same as i_state, // but the order of the states in the XML is not guaranteed. property = read_xml_node_property(curr_state, "index"); int state_index = atoi((char *) property); xmlFree(property); // Get the state type attribute property = read_xml_node_property(curr_state, "type"); (the_mhmm->states[state_index]).type = convert_enum_type_str( (char *) property, INVALID_STATE, STATE_STRS, NUM_STATE_T ); xmlFree(property); // Get the num_sites attribute property = read_xml_node_property(curr_state, "num_sites"); (the_mhmm->states[state_index]).num_sites = atof((char *) property); xmlFree(property); // Get the i_motif attribute property = read_xml_node_property(curr_state, "motif_index"); (the_mhmm->states[state_index]).i_motif = atoi((char *) property); xmlFree(property); // Get the position attribute property = read_xml_node_property(curr_state, "position"); (the_mhmm->states[state_index]).i_position = atoi((char *) property); xmlFree(property); // Get the motif_id attribute property = read_xml_node_property(curr_state, "motif_id"); strncpy( (the_mhmm->states[state_index]).motif_id, (char *) property, MAX_MOTIF_ID_LENGTH + 1 ); xmlFree(property); // Get the id_char attribute property = read_xml_node_property(curr_state, "id_char"); (the_mhmm->states[state_index]).id_char = property[0]; xmlFree(property); /* Allocate space for the emission distributions. */ the_mhmm->states[state_index].emit = allocate_array(alph_size(alph, ALL_SIZE)); the_mhmm->states[state_index].emit_odds = allocate_array(alph_size(alph, ALL_SIZE)); // Read the emission probabilities for the state. read_state_emission_prob_from_xml( alph, xpath_ctxt, the_mhmm->states[state_index].emit, state_index ); // Read the transtion probabilities for the state. read_mhmm_trans_from_xml( xpath_ctxt, the_mhmm->trans, state_index ); if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, " %d", i_state); } } xmlXPathFreeObject(xpath_obj); } /************************************************************************* * Create an HMM struct from a parsed XML struct. * * IN: mhmm_doc * OUT: the_hmm *************************************************************************/ static void read_mhmm_from_xml( ALPH_T alph, xmlXPathContextPtr xpath_ctxt, MHMM_T** the_hmm ) { xmlXPathObjectPtr xpath_obj = NULL; xmlChar* property = NULL; xpath_obj = xpath_query(xpath_ctxt, "/MEME_HMM/states"); property = read_xml_node_property( xpath_obj->nodesetval->nodeTab[0], "num_states" ); int num_states = atoi((char *) property); xmlFree(property); // Allocate the HMM. allocate_mhmm(alph, num_states, the_hmm); (*the_hmm)->num_states = num_states; strncpy( (*the_hmm)->alphabet, alph_string(alph), (*the_hmm)->alph_size ); // Fill in the number and length of spacers since we already // have the states node in hand. property = read_xml_node_property( xpath_obj->nodesetval->nodeTab[0], "num_spacers" ); (*the_hmm)->num_spacers = atoi((char *) property); xmlFree(property); property = read_xml_node_property( (xmlNodePtr) xpath_obj->nodesetval->nodeTab[0], "spacer_states" ); (*the_hmm)->spacer_states = atoi((char *) property); xmlFree(property); xmlXPathFreeObject(xpath_obj); // Set the alphabet information (*the_hmm)->alph_size = alph_size(alph, ALPH_SIZE); strcpy((*the_hmm)->alphabet, alph_string(alph)); (*the_hmm)->ambigs = alph_size(alph, AMBIG_SIZE); // Read and set the HMM type xpath_obj = xpath_query(xpath_ctxt, "/MEME_HMM"); property = read_xml_node_property(xpath_obj->nodesetval->nodeTab[0], "type"); (*the_hmm)->type = convert_enum_type_str((char *) property, INVALID_HMM, HMM_STRS, NUM_HMM_T); xmlFree(property); // Read the optional fields property = xmlGetProp( xpath_obj->nodesetval->nodeTab[0], BAD_CAST "description" ); if (property != NULL) { (*the_hmm)->description = mm_malloc(sizeof(char) * strlen((char *) property) + 1); strcpy((*the_hmm)->description, (char *) property); xmlFree(property); } property = xmlGetProp( xpath_obj->nodesetval->nodeTab[0], BAD_CAST "motif_file" ); if (property != NULL) { (*the_hmm)->motif_file = mm_malloc(sizeof(char) * strlen((char *) property) + 1); strcpy((*the_hmm)->motif_file, (char *) property); xmlFree(property); } property = xmlGetProp( xpath_obj->nodesetval->nodeTab[0], BAD_CAST "sequence_file" ); if (property != NULL) { (*the_hmm)->motif_file = mm_malloc(sizeof(char) * strlen((char *) property) + 1); strcpy((*the_hmm)->sequence_file, (char *) property); xmlFree(property); } xmlXPathFreeObject(xpath_obj); // Initialize the transition matrix init_matrix(0.0, (*the_hmm)->trans); // Read all the states into the HMM read_mhmm_states_from_xml(alph, xpath_ctxt, *the_hmm); // Record the number of motifs. (*the_hmm)->num_motifs = get_num_motifs(*the_hmm); /* N.B. We can't read directly into the model's background because it contains the alphabet plus ambiguous characters. Hence the following shenanigans. */ { ARRAY_T* background = read_bg_freqs_from_xml(xpath_ctxt, alph); int i = 0; for (i = 0; i < alph_size(alph, ALPH_SIZE); i++) { set_array_item(i, get_array_item(i, background), (*the_hmm)->background); } free_array(background); } // Fill in ambiguous characters. calc_ambigs(alph, FALSE, (*the_hmm)->background); } /************************************************************************* * Read HMM from an XML file. * * IN: model_file - * OUT: the_hmm - *************************************************************************/ BOOLEAN_T read_mhmm_xml_file( char* model_filename, /* The name of the file to be read from. */ MHMM_T** the_hmm /* The HMM (not pre-allocated). */ ) { ALPH_T alph; xmlParserCtxtPtr ctxt; // The parser xmlDocPtr mhmm_doc; // The document tree xmlXPathContextPtr xpath_ctxt; // XPath context ctxt = xmlNewParserCtxt(); if (ctxt == NULL) { die("Failed to create XML parser.\n"); } // Parse and validate the file. mhmm_doc = xmlCtxtReadFile( ctxt, model_filename, NULL, // Encoding XML_PARSE_DTDVALID | XML_PARSE_NOERROR | XML_PARSE_NOWARNING ); // Did it parse? if (mhmm_doc == NULL) { // It may be an old plain-text file. xmlFreeParserCtxt(ctxt); xmlCleanupParser(); if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Failed to parse %s as an XML document.\n", model_filename); } return FALSE; } else { // Did it validate as a MEME_HMM document? if (ctxt->valid == 0 || strcmp((char *) ctxt->intSubName, "MEME_HMM") != 0) { xmlFreeDoc(mhmm_doc); xmlFreeParserCtxt(ctxt); xmlCleanupParser(); die("%s is not a valid MEME HMM XML document.\n", model_filename); } } if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "File %s is a valid MEME HMM XML file.\n", model_filename); } // Set up XPath context from parsed XML xpath_ctxt = xmlXPathNewContext(mhmm_doc); // Read and set alphabet alph = read_alphabet_from_xml(xpath_ctxt); // Create an HMM struct from the XML struct. read_mhmm_from_xml(alph, xpath_ctxt, the_hmm); /* Compute the state-by-state incoming and outgoing transitions. */ compute_ins_and_outs(*the_hmm, FALSE); /* free up the resulting document */ xmlXPathFreeContext(xpath_ctxt); xmlFreeDoc(mhmm_doc); xmlFreeParserCtxt(ctxt); xmlCleanupParser(); return TRUE; } /************************************************************************* * void read_mhmm * * Read an HMM in MHMM format. * * IN: model_file - * OUT: the_hmm - *************************************************************************/ void read_mhmm( char* hmm_filename, /* The file to be read from. */ MHMM_T** the_hmm /* The HMM (not pre-allocated). */ ) { ALPH_T alph; char buffer[BUF_LENGTH]; /* Buffer for reading from file. */ char version[BUF_LENGTH]; /* Buffer for reading from file. */ int type; /* Type of model (MHMM or HMMER). */ int num_states; /* Number of states in the model. */ int num_spacers; /* Number of spacers in the model. */ int spacer_states; /* Number of states per spacer. */ int asize; /* Number of characters in the alphabet. */ int i_state; /* Index of the current state. */ ARRAY_T* motif_widths; // Array to hold motif widths // Begin by assuming that it is an MHMM XML file. BOOLEAN_T result = read_mhmm_xml_file(hmm_filename, the_hmm); if (result) { // If that worked, we're done! return; } // Open the file for reading. FILE* hmm_file; if (open_file(hmm_filename, "r", TRUE, "HMM", "HMM", &hmm_file) == 0) { exit(1); } /* Skip the header. */ int num_scan = 0; // Number of items scanned by fscanf(). while (num_scan != 1) { if (fgets(buffer, BUF_LENGTH, hmm_file) == NULL) { die("Can't find Meta-MEME version number.\n"); } num_scan = sscanf(buffer, "MHMM v%s", version); } assert(strlen(version) < BUF_LENGTH); // Read the header information. num_scan = fscanf(hmm_file, " type: %s", &(buffer[0])); assert(num_scan == 1); type = convert_enum_type_str(buffer, INVALID_HMM, HMM_STRS, NUM_HMM_T); num_scan = fscanf(hmm_file, " num_states: %d", &num_states); assert(num_scan == 1); num_scan = fscanf(hmm_file, " num_spacers: %d", &num_spacers); assert(num_scan == 1); num_scan = fscanf(hmm_file, " spacer_length: %d", &spacer_states); assert(num_scan == 1); num_scan = fscanf(hmm_file, " alph_size: %d", &asize); assert(num_scan == 1); // Set up the alphabet. num_scan = fscanf(hmm_file, " alphabet: %s", buffer); assert(num_scan == 1); alph = alph_type(buffer, BUF_LENGTH); if (alph == INVALID_ALPH) die("The mhmm alphabet was unrecognised.\n"); if (asize != alph_size(alph, ALPH_SIZE)) die("The mhmm alphabet %s does not " "have %d letters\n", alph_name(alph), asize); // Allocate the HMM. allocate_mhmm(alph, num_states, the_hmm); // Store information that we have read so far. (*the_hmm)->type = type; (*the_hmm)->num_states = num_states; (*the_hmm)->num_spacers = num_spacers; (*the_hmm)->spacer_states = spacer_states; (*the_hmm)->alph_size = asize; strcpy((*the_hmm)->alphabet, alph_string(alph)); (*the_hmm)->ambigs = alph_size(alph, AMBIG_SIZE); // Read the background distribution. num_scan = fscanf(hmm_file, " background: "); assert(num_scan == 0); /* N.B. We can't read directly into the model's background because it contains the alphabet plus ambiguous characters. Hence the following shenanigans. */ { int i; ARRAY_T* background; background = allocate_array(alph_size(alph, ALPH_SIZE)); read_array(hmm_file, background); for (i = 0; i < alph_size(alph, ALPH_SIZE); i++) { set_array_item(i, get_array_item(i, background), (*the_hmm)->background); } free_array(background); } // Extend the distribution to account for ambiguous characters. calc_ambigs(alph, FALSE, (*the_hmm)->background); // Read the optional fields. if (fscanf(hmm_file, " description: %[^\n]", buffer) == 1) { copy_string(&((*the_hmm)->description), buffer); } if (fscanf(hmm_file, " motif_file: %s", buffer) == 1) { copy_string(&((*the_hmm)->motif_file), buffer); } if (fscanf(hmm_file, " sequence_file: %s", buffer) == 1) { copy_string(&((*the_hmm)->sequence_file), buffer); } /* Read in each state. */ motif_widths = allocate_array(num_states); // tlb; need to set widths for (i_state = 0; i_state < num_states; i_state++) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "\rReading HMM: %d", i_state); read_mhmm_state(alph, hmm_file, i_state, &((*the_hmm)->states[i_state])); // Save motif width when you know it. if ((*the_hmm)->states[i_state].type == END_MOTIF_STATE) { set_array_item( (*the_hmm)->states[i_state].i_motif, // Motif index. (*the_hmm)->states[i_state].i_position + 1, // Width of motif. motif_widths ); } } if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "\n"); } // Record the motif widths in the states. for (i_state = 0; i_state < num_states; i_state++) { if ((*the_hmm)->states[i_state].type == START_MOTIF_STATE || (*the_hmm)->states[i_state].type == MID_MOTIF_STATE || (*the_hmm)->states[i_state].type == END_MOTIF_STATE) { (*the_hmm)->states[i_state].w_motif = get_array_item((*the_hmm)->states[i_state].i_motif, motif_widths); } else { // All other state types have width 1. (*the_hmm)->states[i_state].w_motif = 1; // start, end, spacer } } free_array(motif_widths); // Record the number of motifs. (*the_hmm)->num_motifs = get_num_motifs(*the_hmm); /* Read the transition matrix. */ read_mhmm_trans(hmm_file, num_states, (*the_hmm)->trans); assert(verify_trans_matrix(FALSE, num_states, (*the_hmm)->trans)); fscanf(hmm_file, " End of MHMM"); /* Compute the state-by-state incoming and outgoing transitions. */ compute_ins_and_outs(*the_hmm, FALSE); fclose(hmm_file); } // read_mhmm