/* * $Id: starts.c 3885 2009-07-16 10:02:35Z tbailey $ * */ /*********************************************************************** * * * MEME++ * * Copyright 1994, The Regents of the University of California * * Author: Timothy L. Bailey * * * ***********************************************************************/ /* 7-29-99 tlb; change MAX_NSITES to be sites/w */ /* 6-29-99 tlb; add ic to subseq7 */ /* Routines to set up an array of starting points for EM. */ #include "meme.h" #include "seed.h" #include "sp_matrix.h" #include /* static int get_nsites0 ( */ /* int w, ///< width of motif */ /* double min_nsites, ///< minimum nsites0 */ /* double max_nsites, ///< maximum nsites0 */ /* int heap_size, ///< size of seed heap */ /* S_POINT *s_points ///< array of starting points */ /* ); */ static void global_search ( SP_MATRIX *sp_matrix,///< Matrix of spoints each with different n and w ///< values DATASET *dataset, ///< Dataset containing (eg) substrings MODEL *model ///< The motif model (nascent) ); static void substring_search ( SP_MATRIX *sp_matrix, ///< Matrix of spoints each with different n and w ///< values DATASET *dataset, ///< Dataset containing sequences and parameters HASH_TABLE evaluated_seed_ht, ///< A hash table for recording novel seeds MODEL *model ///< The motif model (nascent) ); static int *make_width_prog ( int min_width, ///< Minimum width value in the progression int max_width, ///< Maximum integer value in the progression double factor, ///< Factor specifying rate of increase in progression int *n_vals, ///< The final number of values in the progression - OUT BOOLEAN pal, ///< Indicates whether palindromes are being enforced BOOLEAN all_widths /// all widths between min and max ); static void use_input_cons( SP_MATRIX *sp_matrix, ///< The s_point matrix that will be filled up char *e_cons ///< This seed will be placed at every central s_point ); /** * get_starts * * Create a list of starting points for EM. * * If dataset->prob > 0, samples subsequences in the input dataset for good * starting points. Otherwise, e_cons is specified as the starting point. * * In order to generate starting points for EM, get_starts either performs * a "global search", or else uses a specified consensus string. Global search * always includes substring search, but several other algorithms ("branching" * searches) can optionally be run after substring search in order to improve * the starting points from which EM shall be run. * * Returns number of starting points in list. */ extern S_POINT *get_starts( DATASET *dataset, ///< the dataset MODEL *model, ///< the model char *e_cons, ///< encoded consensus sequence int *n_starts ///< number of starting points ) { S_POINT *s_points = NULL; // array of starting points to perform EM from BOOLEAN pal = dataset->pal; /* force DNA palindromes */ int min_w = model->min_w; // minimum width int max_w = model->max_w; // maximum width double min_nsites = model->min_nsites; // min. allowed sites double max_nsites = model->max_nsites; // max. allowed sites double sample_prob = dataset->prob; // sampling probability for subsequences // Generate geometric progressions over width and nsites. Make these // progressions identical to those used in original meme: int n_ns; int *central_ns = make_geometric_prog(min_nsites, max_nsites, 2, &n_ns); int n_ws; int *central_ws = make_width_prog(min_w, max_w, sqrt(2), &n_ws, pal, model->all_widths); // Create a matrix of S_POINTS over which global search will be performed. // Each S_POINT corresponds to a separate combination of n and w: SP_MATRIX *sp_matrix = create_spoint_matrix(central_ws, n_ws, central_ns, n_ns, dataset); // Fill in the starting points with the subsequence to use if (!e_cons && sample_prob != 0) { // sample subsequences (and do branching) // Perform "global search" in order to find starting points for EM. // global_search() will establish a string, from which EM will be // performed, for a number of S_POINT objects in the matrix (one string per // S_POINT). This process results in S_POINTS in the matrix being filled // in: global_search(sp_matrix, dataset, model); } else { // don't sample subsequences // Use the specified consensus sequence, "e_cons", in order to fill in // the sp_matrix: use_input_cons(sp_matrix, e_cons); } // Choose (from the matrix) which S_POINTS em will be executed from: s_points = choose_em_starts(sp_matrix, dataset, model, n_starts); // Free memory associated with the sp_matrix, as we no longer need it: free_sp_matrix(sp_matrix); return s_points; } // get_starts /** * make_width_prog * * Generate a list of central motif widths for global search, rounding values * to integer (using a cast to int) when they are not integers. The sequence * is identical that originally used by MEME: a geometric progression starting * from min_width, except that it is terminated by a specified max_width, and * the minimum increment between widths is 2. * * If palindromes are being enforced, then width "w+1" will be included in * addition to every width "w". * * \return the array of integers in the sequence */ static int *make_width_prog ( int min_width, ///< Minimum width value in the progression int max_width, ///< Maximum integer value in the progression double factor, ///< Factor specifying rate of increase in progression int *n_vals, ///< The final number of values in the progression - OUT BOOLEAN pal, ///< Indicates whether palindromes are being enforced BOOLEAN all_widths /// all widths between min and max ) { assert (max_width >= min_width); // Precondition int curr_width; int *progression = NULL; *n_vals = 0; for (curr_width = min_width; curr_width < max_width; curr_width = all_widths? curr_width+1 : (MAX(curr_width+2, (int)(curr_width*factor)))) { (*n_vals)++; // Adding an extra value to the progression. Resize(progression, *n_vals, int); progression[*n_vals - 1] = curr_width; // If palindromes are being enforced, then make sure adjacent widths are // "central", so that they will be considered for starting points (we // don't need to do this is all widths are being considered): if (pal && !all_widths) { (*n_vals)++; Resize(progression, *n_vals, int); progression[*n_vals - 1] = curr_width+1; } } // Add the terminal value: (*n_vals)++; Resize(progression, *n_vals, int); progression[*n_vals - 1] = max_width; // If palindromes are being enforced, then add an extra starting point at // the end of the array (this time we do need to do this even if all // widths are considered): if (pal) { (*n_vals)++; Resize(progression, *n_vals, int); progression[*n_vals - 1] = max_width+1; } assert (progression != NULL); // Postcondition return progression; } /** * global_search * * Perform substring search, followed by optional branching search. Modifies * sp_matrix as a result. * */ static void global_search ( SP_MATRIX *sp_matrix,///< Matrix of spoints each with different n and w ///< values DATASET *dataset, ///< Dataset containing (eg) substrings MODEL *model ///< The motif model (nascent) ) { // Allocate memory for a hash table recording which seeds have already // been evaluated... HASH_TABLE evaluated_seed_ht = NULL; /* Maximum number of entries in hash table (after branching search has also been performed) is: (Number of seeds evaluated in substring search) + (Number of seeds evaluated during branching search). Calculate rough upper bound on the number of novel seeds that could be found by substring search and branching search... */ /* NOTE: The same branching factor is currently used for width-branching and x-branching alike. This was done because w-branching is not a default parameter, and so its performance has not been optimised. */ int bfactor = dataset->branch_params->bfactor; int n_central_w = get_num_central_widths(sp_matrix); int n_nsites = sp_get_num_cols(sp_matrix); // Get the number of letters that can be branched to moves tried in point // branching: int point_br_alen; if (dataset->branch_params->point_branch == NORMAL) { point_br_alen = dataset->alength; } else if (dataset->branch_params->point_branch == ALL) { point_br_alen = dataset->alength + 1; } else if (dataset->branch_params->point_branch == NO_POINT_B) { point_br_alen = 0; } else { point_br_alen = 1; // X only branching } // Calculate max novel seeds generated for each seed branched from: double avg_w = (((double)get_max_width(sp_matrix) + get_min_width(sp_matrix))/2); int seeds_per_branch = (avg_w*point_br_alen); if (dataset->branch_params->w_branch) { seeds_per_branch = seeds_per_branch + 2*dataset->alength+2; } // Calculate approximate upper bound on number of novel seeds produced: int max_novel = (dataset->total_res*n_central_w) + 3*dataset->main_hs*bfactor*n_central_w*n_nsites*seeds_per_branch; /* Placing an upper limit on the amount of memory used by the hashtable bins. * Note that a dynamically-resized hashtable could also be used to deal with * memory usage concerns. This did not appear necessary in our studies. */ int n_bins = MIN(max_novel, MAX_BINS); evaluated_seed_ht = hash_create(n_bins); // Perform substring search: substring_search(sp_matrix, dataset, evaluated_seed_ht, model); // Use a new BRANCH_PARAMS object to control branching: BRANCH_PARAMS branch_params; if (dataset->branch_params->w_branch == TRUE) { // User requested width-branching => Set up branching parameters accordingly // and commence branching: branch_params.point_branch = NO_POINT_B; branch_params.w_branch = TRUE; branch_params.bfactor = dataset->branch_params->bfactor; branching_search(&branch_params, model, dataset, sp_matrix, evaluated_seed_ht); transfer_final_scores(sp_matrix); } if (dataset->branch_params->point_branch != NO_POINT_B) { // User requested point-wise branching => Set up branching parameters // accordingly and commence branching: branch_params.point_branch = dataset->branch_params->point_branch; branch_params.w_branch = FALSE; branch_params.bfactor = dataset->branch_params->bfactor; branching_search(&branch_params, model, dataset, sp_matrix, evaluated_seed_ht); transfer_final_scores(sp_matrix); } // Branching has finished => free the memory associated with the hash table. hash_destroy(evaluated_seed_ht); /* At this stage, the top of each of the heaps of the s_points contains the (hopefully) best starting point for the respective combination of w and nsites0. Set the attributes in each of the s_point objects accordingly: */ transfer_final_scores(sp_matrix); } // global_search /** * use_intput_cons * * Use the specified consensus string as the EM seed, for all nsites values. * */ static void use_input_cons( SP_MATRIX *sp_matrix, ///< The s_point matrix that will be filled up char *e_cons ///< This seed will be placed at every central s_point ) { int w_idx, n_idx, lett_idx; // get the number of central widths and nsites int n_widths = get_num_central_widths(sp_matrix); int n_nsites = get_num_central_nsites(sp_matrix); // get the central widths int *central_ws = get_central_ws(sp_matrix); // get the central nsites int *central_ns = get_central_ns(sp_matrix); // only fill the central spoints for (w_idx = 0; w_idx < n_widths; w_idx++){ // get the current central width int curr_w = central_ws[w_idx]; for (n_idx = 0; n_idx < n_nsites; n_idx++){ // get the current central nsites int curr_ns = central_ns[n_idx]; // get the spoint with width=curr_w nsites=curr_ns S_POINT *curr_sp = get_s_point(sp_matrix, curr_w, curr_ns); // set the spoint encoded consensus curr_sp->e_cons0 = e_cons; // set the score to an abitrary value curr_sp->score = 1.0; // set up human readable consensus sequence cons0 for (lett_idx = 0; lett_idx < curr_w; lett_idx++){ curr_sp->cons0[lett_idx] = (e_cons ? unhash(e_cons[lett_idx]) : 'x'); } // lett_idx // add the final character to the string curr_sp->cons0[lett_idx] = '\0'; } // n_idx } // w_idx } // use_input_cons /** * substring_search * * Perform substring search. For each central width in the sp_matrix, consider * every substring of that width from the dataset, as a possible EM seed, and * place it in the relevant s_point in the starting point matrix. * */ static void substring_search ( SP_MATRIX *sp_matrix, ///< Matrix of spoints each with different n and w values DATASET *dataset, ///< Dataset containing sequences and parameters HASH_TABLE evaluated_seed_ht, ///< A hash table for recording novel seeds MODEL *model ///< The motif model (nascent) ) { /* For each central width in the sp_matrix, perform substring search on the appropriate S_POINT array. Finish on curr_w == max_w: */ int w_idx; // The index of the current "central" width. int *central_ws = get_central_ws(sp_matrix); int n_nsites = sp_get_num_cols(sp_matrix); for (w_idx = 0; w_idx < get_num_central_widths(sp_matrix); w_idx++) { int curr_w = central_ws[w_idx]; // Get the row from the matrix that has the current width: S_POINT *curr_sp_row = get_sp_arr(sp_matrix, curr_w); // Perform substring search under the current width: subseq7(model, dataset, curr_w, n_nsites, curr_sp_row, evaluated_seed_ht); // set up human readable consensus sequence for starting point int sp_idx; for (sp_idx=0; sp_idx < n_nsites; sp_idx++) { // consensus char *e_cons0 = curr_sp_row[sp_idx].e_cons0; int lett_idx; for (lett_idx=0; lett_idx