/******************************************************************** * FILE: mhmm.c * AUTHOR: William Stafford Noble * CREATE DATE: 06-23-96 * PROJECT: MHMM * COPYRIGHT: 1996, WNG * DESCRIPTION: Given MEME output, write a motif-based HMM. ********************************************************************/ #include #include #include #include #include "binary-search.h" #include "utils.h" // Generic utilities. #include "matrix.h" // Matrix manipulation routines. #include "metameme.h" // Global Meta-MEME functions. #include "motif.h" // Data structure for holding motifs. #include "motif-in.h" #include "order.h" // Linear motif order and spacing. #include "mhmm.h" #include "mhmm-state.h" // Data structure for HMMs. #include "build-hmm.h" // Construct an HMM in memory. #include "write-mhmm.h" // Functions for producing MHMM output. #include "scanned-sequence.h" #define SLOP 1E-5 #define SPACER_LENGTH 9.0 // Expected length of a spacer. typedef struct reqmot REQMOT_T; struct reqmot { RBTREE_T *nums; RBTREE_T *ids; }; char* program = "mhmm"; extern int END_TRANSITION; /*********************************************************************** * Should the given motif be inserted into the model? * By default all are included ***********************************************************************/ static int select_all(void *context, int motif_i, MOTIF_T *motif) { if (get_motif_strands(motif)) return MOTIF_PLUS | MOTIF_MINUS; else return MOTIF_PLUS; } /*********************************************************************** * Should the given motif be kept for the model? * Select the requested motifs ***********************************************************************/ static int select_requested(void *context, int motif_i, MOTIF_T *motif) { REQMOT_T *requested; RBNODE_T *node; int which; requested = (REQMOT_T *)context; which = 0; if ((node = rbtree_find(requested->nums, &motif_i)) != NULL) { which |= *((int*)rbtree_value(node)); } if ((node = rbtree_find(requested->ids, get_motif_id(motif))) != NULL) { which |= *((int*)rbtree_value(node)); } return which; } /*********************************************************************** * Should the given motif be kept for the model? * Select the top n motifs ***********************************************************************/ static int select_top_n(void *context, int motif_i, MOTIF_T *motif) { if (*((int*)context) >= motif_i) { return select_all(NULL, motif_i, motif); } return 0; } /*********************************************************************** * Should the given motif be kept for the model? * Select any motif with an evalue at least as high as passed ***********************************************************************/ static int select_better_e_value(void *context, int motif_i, MOTIF_T *motif) { if (get_motif_evalue(motif) <= *((double*)context)) { return select_all(NULL, motif_i, motif); } return 0; } /*********************************************************************** * Should the given motif be kept for the model? * Select any motif with a complexity value at least as high as passed. ***********************************************************************/ static int select_more_complex(void *context, int motif_i, MOTIF_T *motif) { if (get_motif_complexity(motif) <= *((double*)context)) { return select_all(NULL, motif_i, motif); } return 0; } /*********************************************************************** * Allows sorting motif nums. ***********************************************************************/ static int motif_num_cmp(const void *p1, const void *p2) { int i1, i2, abs_i1, abs_i2; i1 = *((int*)p1); i2 = *((int*)p2); abs_i1 = (i1 < 0 ? -i1 : i1); abs_i2 = (i2 < 0 ? -i2 : i2); if (abs_i1 < abs_i2) { // sort motifs ascending return -1; } else if (abs_i1 > abs_i2) { return 1; } else { if (i1 > i2) { // put reverse complements second return -1; } else if (i1 == i2) { return 0; } else { return 1; } } } /*********************************************************************** * Calculate the motif transitions ***********************************************************************/ static void compute_transitions_and_spacers( RBTREE_T *motifs, // Tree of motifs keyed by file order (1 and up) IN ARRAYLST_T* motif_occurrences, // List of motif occurences IN BOOLEAN_T has_reverse_strand, // File included both strands? IN double p_threshold, // P-value to include motif occurences. IN ORDER_T** order_spacing, // Motif order and spacing (linear HMM) IN OUT MATRIX_T** transp_freq, // Motif-to-motif transitions. OUT MATRIX_T** spacer_ave // Average inter-motif distances. OUT ) { RBNODE_T *node; ORDER_T *new_order; // New order and spacing. BOOLEAN_T find_order; // Should we look for the motif order? int *m_nums; int m_count, i; // create a list of the nums of the remaining motifs // this allows log-n lookup of the motif's position in the matrix m_count = rbtree_size(motifs); m_nums = mm_malloc(sizeof(int) * m_count); for (i = 0, node = rbtree_first(motifs); node != NULL; i++, node = rbtree_next(node)) { m_nums[i] = *((int*)rbtree_key(node)); } // If we already have a motif order and spacing, don't find any more. if (*order_spacing == NULL) { find_order = TRUE; } else { find_order = FALSE; } new_order = NULL; // Allocate the matrices. *transp_freq = allocate_matrix(m_count + 2, m_count + 2); *spacer_ave = allocate_matrix(m_count + 2, m_count + 2); init_matrix(0.0, *transp_freq); init_matrix(0.0, *spacer_ave); for (i = 0; i < arraylst_size(motif_occurrences); i++) { SCANNED_SEQ_T *sseq; int i_occur; int prev_loc, loc, gap; long prev_position; // Location of the right edge of previous motif. sseq = (SCANNED_SEQ_T*)arraylst_get(i, motif_occurrences); if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Reading motif occurrences for sequence %s.\n", sseq->seq_id); } // If requested, try to create an order string. if (find_order) { new_order = create_empty_order(sseq->site_count, sseq->pvalue); } prev_loc = 0; prev_position = 0; for (i_occur = 0; i_occur < sseq->site_count; i_occur++) { MOTIF_T *motif; SCANNED_SITE_T *ssite; int motif_num; ssite = sseq->sites+i_occur; motif_num = (ssite->strand == '-' ? -(ssite->m_num) : ssite->m_num); motif = rbtree_get(motifs, &motif_num); // Only include motifs that have been retained if (motif == NULL) { if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Skipping motif number %d which appears in sequence %s.\n", motif_num, sseq->seq_id); } continue; } if ((p_threshold > 0.0) && (ssite->pvalue > p_threshold)) { if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Skipping motif %s in sequence %s (%g > %g).\n", get_motif_st_id(motif), sseq->seq_id, ssite->pvalue, p_threshold); } continue; } if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Adding motif %s in sequence %s.\n", get_motif_st_id(motif), sseq->seq_id); } gap = ssite->position - prev_position; loc = binary_search(&motif_num, m_nums, m_count, sizeof(int), motif_num_cmp) + 1; assert(loc > 0); // Increment the transition count and spacer length matrices. incr_matrix_cell(prev_loc, loc, 1, *transp_freq); incr_matrix_cell(prev_loc, loc, gap, *spacer_ave); // Add the occurrence to the order list. add_occurrence(motif_num, gap, new_order); // store as the previous accepted site prev_position = ssite->position + get_motif_length(motif); prev_loc = loc; } // Record the transition to the end state. if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Adding transition to end in sequence %s.\n", sseq->seq_id); } // we're at the end, store in the last column of the matrix. gap = sseq->length - prev_position; loc = m_count + 1; // Increment the transition count and spacer length matrices. incr_matrix_cell(prev_loc, loc, 1, *transp_freq); incr_matrix_cell(prev_loc, loc, gap, *spacer_ave); // Add the occurrence to the order list. add_occurrence(END_TRANSITION, gap, new_order); if (find_order) { int nd_new, nd_old; double pv_new, pv_old; nd_new = get_num_distinct(new_order); nd_old = get_num_distinct(*order_spacing); pv_new = get_pvalue(new_order); pv_old = get_pvalue(*order_spacing); // Decide whether to keep the new order object. if ((nd_new > nd_old) || (((nd_new == nd_old) && (pv_new < pv_old)))) { if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Storing order from sequence %s (%g < %g).\n", sseq->seq_id, pv_new, pv_old); print_order_and_spacing(stderr, new_order); } free_order(*order_spacing); *order_spacing = new_order; } else { free_order(new_order); } } } free(m_nums); } /************************************************************************* * Setup motif-to-motif occurrence and spacer length frequency * transition matrices in a naive fashion, without taking into account * any motif occurrence information. *************************************************************************/ static void compute_naive_transitions_and_spacers (const int nmotifs, // The number of motifs. MATRIX_T** transp_freq, // Motif-to-motif occurrence matrix. MATRIX_T** spacer_ave) // Average spacer length matrix. { int i; int j; PROB_T prob; // Allocate the matrices. *transp_freq = allocate_matrix(nmotifs + 2, nmotifs + 2); *spacer_ave = allocate_matrix(nmotifs + 2, nmotifs + 2); // Set up transition frequencies and spacer lengths from start state. prob = 1.0 / (PROB_T)nmotifs; for (j = 1; j <= nmotifs; j++) { set_matrix_cell(0, j, prob, *transp_freq); set_matrix_cell(0, j, SPACER_LENGTH, *spacer_ave); } /* Set up transition frequencies and spacer lengths between motifs and to the end state. */ prob = 1.0 / ((PROB_T)(nmotifs + 1)); for (i = 1; i <= nmotifs; i++) { for (j = 1; j <= nmotifs+1; j++) { set_matrix_cell(i, j, prob, *transp_freq); set_matrix_cell(i, j, SPACER_LENGTH, *spacer_ave); } } } /*********************************************************************** * Convert transition counts to transition probabilities, and compute * average spacer lengths. * * Each matrix is indexed 0 ... n+1, where n is the number of motifs. * The entry at [i,j] corresponds to the transition from motif i to * motif j. Hence, after normalization, each row in the transition * matrix should sum to 1. ***********************************************************************/ static void normalize_spacer_counts( double trans_pseudo, double spacer_pseudo, // Pseudocount for self-loop. BOOLEAN_T keep_unused, MATRIX_T* transp_freq, MATRIX_T* spacer_ave ) { int i_row; int i_col; int num_rows; double total_spacer; double num_transitions; double ave_spacer; /* Divide the spacer lengths by the number of occurrences. */ num_rows = get_num_rows(transp_freq); for (i_row = 0; i_row < num_rows; i_row++) { for (i_col = 0; i_col < num_rows; i_col++) { total_spacer = get_matrix_cell(i_row, i_col, spacer_ave) + spacer_pseudo; num_transitions = get_matrix_cell(i_row, i_col, transp_freq); if (spacer_pseudo > 0) num_transitions++; if (num_transitions != 0.0) { ave_spacer = total_spacer / num_transitions; set_matrix_cell(i_row, i_col, ave_spacer, spacer_ave); } } } // Add pseudocounts. for (i_row = 0; i_row < num_rows; i_row++) { for (i_col = 0; i_col < num_rows; i_col++) { // Force some transitions to zero. if (// No transitions to the start state. (i_col == 0) || // No transitions from the end state. (i_row == num_rows - 1) || // No transition from start to end. ((i_row == 0) && (i_col == num_rows - 1))) { set_matrix_cell(i_row, i_col, 0.0, transp_freq); } else { // Only increment the used transitions. if ((keep_unused) || (get_matrix_cell(i_row, i_col, transp_freq) > 0.0)) { incr_matrix_cell(i_row, i_col, trans_pseudo, transp_freq); } } } } // Normalize rows. for (i_row = 0; i_row < num_rows - 1; i_row++) { if (array_total(get_matrix_row(i_row, transp_freq)) > 0.0) { normalize(SLOP, get_matrix_row(i_row, transp_freq)); } } } /*********************************************************************** * Discard motifs that are not connected. ***********************************************************************/ static void throw_out_unused_motifs (RBTREE_T *motifs, MATRIX_T *transp_freq, MATRIX_T *spacer_ave) { int i_motif_loc, j_motif; ARRAY_T* row_sums; ARRAY_T* col_sums; RBNODE_T *node, *next_node; // Extract the margins of the transition matrix. row_sums = get_matrix_row_sums(transp_freq); col_sums = get_matrix_col_sums(transp_freq); for (i_motif_loc = 1, node = rbtree_first(motifs); node != NULL; node = next_node, i_motif_loc++) { // calculate the next node before we delete anything next_node = rbtree_next(node); // Is this row or column empty? if ((get_array_item(i_motif_loc, row_sums) == 0.0) || (get_array_item(i_motif_loc, col_sums) == 0.0)) { if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "Removing unused motif %s. " "No occurrences of this motif were found.\n", get_motif_st_id((MOTIF_T*)rbtree_value(node))); } // Remove the row and column from the transition matrix. remove_matrix_row(i_motif_loc, transp_freq); remove_matrix_col(i_motif_loc, transp_freq); assert(get_num_rows(transp_freq) == get_num_cols(transp_freq)); remove_matrix_row(i_motif_loc, spacer_ave); remove_matrix_col(i_motif_loc, spacer_ave); assert(get_num_rows(spacer_ave) == get_num_cols(spacer_ave)); // Remove the motif from the tree rbtree_delete(motifs, node, NULL, NULL); i_motif_loc--; // Recalculate the row and column sums. free_array(row_sums); free_array(col_sums); row_sums = get_matrix_row_sums(transp_freq); col_sums = get_matrix_col_sums(transp_freq); } } free_array(row_sums); free_array(col_sums); } /*********************************************************************** * Read the motifs, filter them, create an order and spacing if * possible and create the transition frequency matrix and the * spacer average matrix. ***********************************************************************/ void load_filter_process_motifs_for_hmm ( int request_n, // IN select the top n positive strand motifs RBTREE_T *request_nums, // IN select the requested motif numbers RBTREE_T *request_ids, // IN select the requested motif ids double request_ev, // IN select motifs with a <= evalue double request_complexity, // IN select motifs with a >= complexity const char *request_order, // IN specifiy motifs and their ordering and spacing double hit_pv, // IN specify the p value for keeping motif occurences const char *motif_filename, // IN file containing motifs to load const char *bg_source, // IN source of background double motif_pseudo, // IN pseudocount applied to motifs double trans_pseudo, // IN pseudocount applied to transitions double spacer_pseudo, // IN pseudocount applied to spacers BOOLEAN_T keep_unused, // IN keep the unused motifs ALPH_T *alphabet, // OUT alphabet of the motifs RBTREE_T **motifs, // OUT motifs with file position as key ARRAY_T **background, // OUT background from motif file ORDER_T **order_spacing, // OUT order and spacing (uses motif key as identifiers) MATRIX_T **transp_freq, // OUT transition frequencies MATRIX_T **spacer_ave // OUT spacer averages ) { int (*filter)(void *context, int motif_num, MOTIF_T *motif); void *filter_context; char *filter_name; REQMOT_T requested; RBTREE_T *ids2nums; // mapping of motif IDs to numbers RBNODE_T *node; MREAD_T *mread; MOTIF_T *motif; int motif_i, rc_motif_i, strands, i; BOOLEAN_T created, has_reverse_strand; ARRAYLST_T *motif_occurrences; /********************************************** * SELECT MOTIF FILTER **********************************************/ filter = NULL; filter_context = NULL; filter_name = ""; // filter top n if (request_n != 0) { filter = select_top_n; filter_context = &request_n; filter_name = "-nmotifs"; } // filter requested if (rbtree_size(request_nums) != 0 || rbtree_size(request_ids) != 0) { if (filter != NULL) { die("Can't use -motif or -motif-id with %s.\n", filter_name); } requested.nums = request_nums; requested.ids = request_ids; filter = select_requested; filter_context = &requested; filter_name = "-motif or -motif-id"; } // filter e threshold if (request_ev != 0.0) { if (filter != NULL) { die("Can't use -ethresh with %s.\n", filter_name); } filter = select_better_e_value; filter_context = &request_ev; } // filter complexity threshold if (request_complexity != 0.0) { if (filter != NULL) { die("Can't use -lowcomp with %s.\n", filter_name); } filter = select_more_complex; filter_context = &request_complexity; } // filter order (but not when reading motifs) if (request_order != NULL && filter != NULL) { die("Can't use -order with %s.\n", filter_name); } if (filter == NULL) filter = select_all; /********************************************** * READING THE MOTIFS **********************************************/ mread = mread_create(motif_filename, OPEN_MFILE | CALC_AMBIGS | SCANNED_SITES); mread_set_bg_source(mread, bg_source); mread_set_pseudocount(mread, motif_pseudo); has_reverse_strand = (mread_get_strands(mread) == 2); *motifs = rbtree_create(motif_num_cmp, rbtree_intcpy, free, NULL, destroy_motif); ids2nums = rbtree_create(rbtree_strcmp, rbtree_strcpy, free, rbtree_intcpy, free); for (motif_i = 1, rc_motif_i = -1; mread_has_motif(mread); ++motif_i, --rc_motif_i) { motif = mread_next_motif(mread); strands = filter(filter_context, motif_i, motif); if (strands & MOTIF_PLUS) rbtree_make(*motifs, &motif_i, motif); if (strands & MOTIF_MINUS) rbtree_make(*motifs, &rc_motif_i, dup_rc_motif(motif)); if (strands) rbtree_make(ids2nums, get_motif_id(motif), &motif_i); else destroy_motif(motif); } *background = mread_get_background(mread); motif_occurrences = mread_get_motif_occurrences(mread); mread_destroy(mread); // Parse the order string. *order_spacing = create_order_from_ids(request_order, ids2nums); rbtree_destroy(ids2nums); if (*order_spacing) { // Remove unused motifs for (node = rbtree_first(*motifs); node != NULL; node = rbtree_next(node)) { motif_i = *((int*)rbtree_key(node)); if (!order_contains(motif_i, *order_spacing)) { rbtree_delete(*motifs, node, NULL, NULL); } } } // Turn the raw motifs and motif occurrences into the elements of the model if (motif_occurrences != NULL && arraylst_size(motif_occurrences) > 0) { compute_transitions_and_spacers( *motifs, motif_occurrences, has_reverse_strand, hit_pv, order_spacing, transp_freq, spacer_ave); } else { // If no occurrences are found, initialize matrices uniformly. compute_naive_transitions_and_spacers( rbtree_size(*motifs), transp_freq, spacer_ave); } // Convert spacer info to probabilities. normalize_spacer_counts( trans_pseudo, spacer_pseudo, keep_unused, *transp_freq, *spacer_ave); // Throw out unused motifs. throw_out_unused_motifs(*motifs, *transp_freq, *spacer_ave); destroy_motif_occurrences(motif_occurrences); } /************************************************************************* * store a motif number in a passed set with the strands of it that * have been requested. * Returns true on success. *************************************************************************/ BOOLEAN_T store_requested_num(RBTREE_T *request_nums, const char *value) { int strands, motif_i; char *end; RBNODE_T *node; BOOLEAN_T created; strands = MOTIF_PLUS; motif_i = strtol(value, &end, 10); if (*end != '\0' || motif_i == 0) { return FALSE; } if (motif_i < 0) { strands = MOTIF_MINUS; motif_i = -motif_i; } node = rbtree_lookup(request_nums, &motif_i, TRUE, &created); if (!created) strands |= *((int*)rbtree_value(node)); rbtree_set(request_nums, node, &strands); return TRUE; } /************************************************************************* * store a motif identifier in a passed set with the strands of it that * have been requested. * Returns true on success. *************************************************************************/ BOOLEAN_T store_requested_id(RBTREE_T *request_ids, char *value) { int strands; RBNODE_T *node; BOOLEAN_T created; strands = MOTIF_PLUS; // assume positive strand if (value[0] == '+') { value++; } else if (value[0] == '-') { value++; strands = MOTIF_MINUS; } if (value[0] == '\0') return FALSE; node = rbtree_lookup(request_ids, value, TRUE, &created); if (!created) strands |= *((int*)rbtree_value(node)); rbtree_set(request_ids, node, &strands); return TRUE; } #ifdef MAIN #include "simple-getopt.h" VERBOSE_T verbosity = NORMAL_VERBOSE; /************************************************************************* * int main *************************************************************************/ int main(int argc, char *argv[]) { /* Data structures. */ RBTREE_T *motifs; // The motifs. int *motif_nums; // sorted list of motif nums to provide mapping to matricies ALPH_T alph = INVALID_ALPH; ARRAY_T* background = NULL; /* Background probs for alphabet. */ ORDER_T* order_spacing; /* Linear HMM order and spacing. */ MATRIX_T* transp_freq = NULL; /* Matrix of inter-motif transitions freqs. */ MATRIX_T* spacer_ave = NULL; /* Matrix of average spacer lengths. */ MHMM_T * the_hmm = NULL; /* The HMM being constructed. */ /* Command line parameters. */ char * motif_filename; /* Input file containg motifs. */ char * hmm_type_str; /* HMM type. */ HMM_T hmm_type; RBTREE_T *request_nums; /* Indexes of requested motifs */ RBTREE_T *request_ids; /* Identifiers of requested motifs */ int request_n; /* The user asked for the first n motifs. */ double e_threshold; /* E-value threshold for motif inclusion. */ double complexity_threshold; // For eliminating low-complexity motifs. double p_threshold; /* p-value threshold for motif occurences. */ char* order_string; /* Motif order and spacing. */ int spacer_states; /* Number of states in each spacer. */ BOOLEAN_T fim; /* Represent spacers as free insertion modules? */ BOOLEAN_T keep_unused; // Drop unused inter-motif transitions? double trans_pseudo; /* Transition pseudocount. */ double spacer_pseudo; // Spacer (self-loop) pseudocount. */ char* description; // Descriptive text to be stored in model. BOOLEAN_T print_header; /* Print file header? */ BOOLEAN_T print_params; /* Print parameter summary? */ BOOLEAN_T print_time; /* Print timing data (dummy: always false). */ /* Local variables. */ int motif_i, strands, i; RBNODE_T *node; BOOLEAN_T created; /********************************************** * COMMAND LINE PROCESSING **********************************************/ // Define command line options. cmdoption const options[] = { {"type", OPTIONAL_VALUE}, {"description", REQUIRED_VALUE}, {"motif", REQUIRED_VALUE}, {"nmotifs", REQUIRED_VALUE}, {"ethresh", REQUIRED_VALUE}, {"lowcomp", REQUIRED_VALUE}, {"pthresh", REQUIRED_VALUE}, {"order", REQUIRED_VALUE}, {"nspacer", REQUIRED_VALUE}, {"fim", NO_VALUE}, {"keep-unused", NO_VALUE}, {"transpseudo", REQUIRED_VALUE}, {"spacerpseudo", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE}, {"noheader", NO_VALUE}, {"noparams", NO_VALUE}, {"notime", NO_VALUE}, {"quiet", NO_VALUE}, }; int option_count = 18; int option_index = 0; // Define the usage message. char *usage = "USAGE: mhmm [options] \n" "\n" " Options:\n" " --type [linear|complete|star] (default=linear)\n" " --description (may be repeated)\n" " --motif (may be repeated)\n" " --motif-id" " --nmotifs <#>\n" " --ethresh \n" " --lowcomp \n" " --pthresh \n" " --order \n" " --nspacer (default=1)\n" " --fim\n" " --keep-unused\n" " --transpseudo \n" " --spacerpseudo \n" " --verbosity 1|2|3|4|5 (default=2)\n" " --noheader\n" " --noparams\n" " --notime\n" " --quiet\n" "\n"; /* Make sure various options are set to NULL or defaults. */ motif_filename = NULL; hmm_type_str = NULL; hmm_type = INVALID_HMM; request_nums = rbtree_create(motif_num_cmp, rbtree_intcpy, free, rbtree_intcpy, free); request_ids = rbtree_create(rbtree_strcmp, rbtree_strcpy, free, rbtree_intcpy, free); request_n = 0; e_threshold = 0.0; complexity_threshold = 0.0; p_threshold = 0.0; order_string = NULL; spacer_states = DEFAULT_SPACER_STATES, fim = FALSE; keep_unused = FALSE; trans_pseudo = DEFAULT_TRANS_PSEUDO; spacer_pseudo = DEFAULT_SPACER_PSEUDO; description = NULL; print_header = TRUE; print_params = TRUE; print_time = FALSE; simple_setopt(argc, argv, option_count, options); // Parse the command line. while (1) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; char *end; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "type") == 0) { if (option_value != NULL) { hmm_type_str = option_value; } } else if (strcmp(option_name, "description") == 0) { description = option_value; } else if (strcmp(option_name, "motif") == 0) { if (!store_requested_num(request_nums, option_value)) { die("Error processing \"-motif %s\". " "Option value should be a (non-zero) motif position. " "For example the first motif in the file has the " "position 1 and it's reverse complement has the position -1.\n", option_value); } } else if (strcmp(option_name, "motif-id") == 0) { if (!store_requested_id(request_ids, option_value)) { die("Error processing \"-motif-id %s\"." "Option value should be a motif identifer and hence " "not just be a + or a -.\n", option_value); } } else if (strcmp(option_name, "nmotifs") == 0) { request_n = atoi(option_value); } else if (strcmp(option_name, "ethresh") == 0) { e_threshold = atof(option_value); } else if (strcmp(option_name, "lowcomp") == 0) { complexity_threshold = atof(option_value); } else if (strcmp(option_name, "pthresh") == 0) { p_threshold = atof(option_value); } else if (strcmp(option_name, "order") == 0) { order_string = option_value; } else if (strcmp(option_name, "nspacer") == 0) { spacer_states = atoi(option_value); } else if (strcmp(option_name, "fim") == 0) { fim = TRUE; } else if (strcmp(option_name, "keep-unused") == 0) { keep_unused = TRUE; } else if (strcmp(option_name, "transpseudo") == 0) { trans_pseudo = atof(option_value); } else if (strcmp(option_name, "spacerpseudo") == 0) { spacer_pseudo = atof(option_value); } else if (strcmp(option_name, "verbosity") == 0) { verbosity = (VERBOSE_T)atoi(option_value); } else if (strcmp(option_name, "noheader") == 0) { print_header = FALSE; } else if (strcmp(option_name, "noparams") == 0) { print_params = FALSE; } else if (strcmp(option_name, "notime") == 0) { print_time = FALSE; } else if (strcmp(option_name, "quiet") == 0) { print_header = print_params = print_time = FALSE; verbosity = QUIET_VERBOSE; } } // Read the single required argument. if (option_index + 1 != argc) { fputs(usage, stderr); exit(1); } motif_filename = argv[option_index]; /* Set the model type. */ hmm_type = convert_enum_type_str(hmm_type_str, LINEAR_HMM, HMM_STRS, NUM_HMM_T); /* Gotta have positive spacer length. */ if (spacer_states <= 0) { die("Negative spacer length (%d).\n", spacer_states); } /* Prevent trying to build a complete or star model with ordering. */ if (order_string != NULL) { if (hmm_type == COMPLETE_HMM) die("Can't specify motif order with a completely connected model."); else if (hmm_type == STAR_HMM) die("Can't specify motif order with a star model."); } /********************************************** * LOADING THE MOTIFS **********************************************/ load_filter_process_motifs_for_hmm( request_n, request_nums, request_ids, e_threshold, complexity_threshold, order_string, p_threshold, motif_filename, "motif-file", // bg source 0.0, // motif pseudocount trans_pseudo, spacer_pseudo, keep_unused, &alph, &motifs, &background, &order_spacing, &transp_freq, &spacer_ave ); /********************************************** * BUILDING THE HMM **********************************************/ /* Build the motif-based HMM. */ if (hmm_type == LINEAR_HMM) { if (order_spacing == NULL) { die("No order specified for the motifs.\n" "For the linear model the motif file must contain motif occurence\n" "data or the motif order must be specified using " "the --order option."); } build_linear_hmm( background, order_spacing, spacer_states, motifs, fim, &the_hmm ); } else { // flatten motif list because we no longer need to handle the order int num_motifs; MOTIF_T *flat_motif_list; motif_tree_to_array(motifs, &flat_motif_list, &num_motifs); if (hmm_type == COMPLETE_HMM) { build_complete_hmm( background, spacer_states, flat_motif_list, num_motifs, transp_freq, spacer_ave, fim, &the_hmm ); } else if (hmm_type == STAR_HMM) { build_star_hmm( background, spacer_states, flat_motif_list, num_motifs, fim, &the_hmm ); } free_motif_array(flat_motif_list, num_motifs); } // Add some global information. copy_string(&(the_hmm->motif_file), motif_filename); /********************************************** * WRITING THE HMM **********************************************/ /* Print the header. */ if (print_header) write_header( program, "", description, motif_filename, NULL, NULL, stdout ); /* Write the HMM. */ write_mhmm(verbosity, the_hmm, stdout); /* Print the program parameters. */ if (print_params) { printf("Program parameters for mhmm\n"); printf(" MEME file: %s\n", motif_filename); printf(" Motifs:"); for (node = rbtree_first(motifs); node != NULL; node = rbtree_next(node)) { MOTIF_T* motif = (MOTIF_T*)rbtree_value(node); printf(" %s", (get_motif_strands(motif) ? get_motif_st_id(motif) : get_motif_id(motif))); } printf("\n"); printf(" Model topology: %s\n", convert_enum_type(hmm_type, HMM_STRS, NUM_HMM_T)); printf(" States per spacer: %d\n", spacer_states); printf(" Spacers are free-insertion modules: %s\n", boolean_to_string(fim)); printf("\n"); } free_array(background); rbtree_destroy(request_ids); rbtree_destroy(request_nums); free_order(order_spacing); free_matrix(transp_freq); free_matrix(spacer_ave); rbtree_destroy(motifs); free_mhmm(the_hmm); return(0); } #endif /* MAIN */