/**************************************************************************** * FILE: motiph_scoring.c * AUTHOR: William Stafford Noble, Charles E. Grant, Timothy L. Bailey * CREATE DATE: 12/03/2004 * PROJECT: EVOMCAST * DESCRIPTION: Define scores using the ratio of log-likelihoods * between foreground an background models. * COPYRIGHT: 2004, UW ****************************************************************************/ #include #include #include #include #include "motiph-scoring.h" #include "alphabet.h" #include "motif.h" #include "mhmm-state.h" #include "pssm.h" #include "utils.h" extern char* program_name; /************************************************************************* * Get the (index of) the base in the sequence in the alignment column * corresponding to the given tree node. *************************************************************************/ static int get_base_from_node( TREE_T* node, STRING_LIST_T* seq_names, char* alignment_col, ALPH_T alph ) { char* name = get_label(node); int i = get_index_in_string_list(name, seq_names); if (i == -1) { // No sequence matches the leaf label die("No sequence in the alignment matches the tree " "node labeled %s.\n", name); } char new_char = alignment_col[i]; int base = alph_index(alph, new_char); assert(base != -1 && base < alph_size(alph, ALPH_SIZE)); return(base); } // get_sequence_index_from_node /************************************************************************* * Build a Position Specific Scoring Matrix (PSSM) for alignment columns * given a phylogenetic tree and an array of evolutionary models. * The first model in the array is assumed to be the background model. * The PSSM will have one row for each model, and one column for each * possible alignment column. * * The elements of the PSSM are the log-odds scores for the corresponding * model and alignment column. * * The caller is responsible for freeing the returned PSSM. *************************************************************************/ MATRIX_T* build_alignment_pssm_matrix( ALPH_T alph, STRING_LIST_T* seq_names, int num_models, EVOMODEL_T** models, TREE_T* tree, GAP_SUPPORT_T gap_support ) { assert(seq_names != NULL); assert(num_models >= 2); assert(models != NULL); assert(tree != NULL); SUBSTMATRIX_TABLE_T* substmatrix_table = NULL; SUBSTMATRIX_ARRAY_T* bg_substmatrices = NULL; SUBSTMATRIX_ARRAY_T* fg_substmatrices = NULL; // The number of possible alignment columns is (asize^column_height) // The column height is given by the number of labels in the tree. int asize = alph_size(alph, ALPH_SIZE); // Use the first column, not the background column (0) because // AVERAGE_MODEL has JC_MODEL in background column. MODEL_TYPE_T model_type = get_model_type(models[1]); //int num_leaves; int num_labels; // TLB; Count labels, not leaves to get size // of multiple alignment. if (model_type == SINGLE_MODEL) { num_labels = 1; } else { num_labels = label_count(tree); // Create a table of the transition probability matrices // for each model and each edge of the tree. // One model is needed for the background to compute column frequencies // for the AVERAGE_MODEL. substmatrix_table = make_substmatrix_table( alph, tree, (model_type == AVERAGE_MODEL) ? 1 : num_models, models ); // Get the array of substitution matrices for the // background model bg_substmatrices = get_substmatrix_array_from_table(substmatrix_table, 0); } int num_alignment_cols = (int) pow((double) asize, (double) num_labels); char* alignment_col = mm_malloc((num_labels + 1) * sizeof(char)); MATRIX_T* pssm_matrix = allocate_matrix(num_models, num_alignment_cols); EVOMODEL_T* bg_model = models[0]; // Process in column major order to avoid re-calculating bg_likelihood int col_index; double total_bg_likelihood = 0.0; double total_fg_likelihood = 0.0; for (col_index = 0; col_index < num_alignment_cols; col_index++) { // Translate the column number into a string giving the // bases for that column. unhash_alignment_col( alph, col_index, alignment_col, num_labels ); // Calculate the likelihood for this column using the background model. double bg_likelihood = site_likelihood( alph, alignment_col, seq_names, tree, (model_type == AVERAGE_MODEL) ? JC_MODEL : model_type, get_model_equil_freqs(bg_model), bg_substmatrices, gap_support ); // save background likelihood of this column for use as column frequency int row_index = 0; set_matrix_cell(row_index, col_index, bg_likelihood, pssm_matrix); // recompute likelihood if using AVERAGE_MODEL for use in the // denominator of the log-odds if (model_type == AVERAGE_MODEL) { bg_likelihood = site_likelihood( alph, alignment_col, seq_names, tree, model_type, get_model_equil_freqs(bg_model), bg_substmatrices, gap_support ); } total_bg_likelihood += bg_likelihood; for (row_index = 1; row_index < num_models; row_index++) { if (model_type != SINGLE_MODEL && model_type != AVERAGE_MODEL) { // Get the array of substitution matrices for the // current foreground model. fg_substmatrices = get_substmatrix_array_from_table(substmatrix_table, row_index); } // Calculate the log-odds for this column using the // foreground and background models. double fg_likelihood = site_likelihood( alph, alignment_col, seq_names, tree, model_type, get_model_equil_freqs(models[row_index]), fg_substmatrices, gap_support ); total_fg_likelihood += fg_likelihood; double log_odds = my_log2(fg_likelihood / bg_likelihood); set_matrix_cell(row_index, col_index, log_odds, pssm_matrix); free_substmatrix_array(fg_substmatrices); } } free_substmatrix_array(bg_substmatrices); free_substmatrix_table(substmatrix_table); myfree(alignment_col); return pssm_matrix; } /**************************************************************************** * This function implements the Felsentein pruning algorithm for evaluating * the log likelihood of a phylogentic tree for a given site in an alignment. * It sums over all possible base assignments of the internal nodes * using the law of total probability and recursively calls itself until it * reaches the tree leaves. At the leaves paths through the tree whose final * base doesn't match the appropriate sequence site are "pruned" * by returning a value of 0.0. * TLB: Compute * Pr(t | r(t)=root_base), where "t" is the tree and "r(t)" is the base * assigned to the root of t. ****************************************************************************/ static double pruning_algorithm( ALPH_T alph, char* alignment_col, STRING_LIST_T* seq_names, int root_base, // base at "root" of current subtree TREE_T* tree, SUBSTMATRIX_ARRAY_T* substmatrix_array, GAP_SUPPORT_T gap_support ) { assert(alignment_col != NULL); assert(seq_names != NULL); assert(tree != NULL); assert(substmatrix_array != NULL); int asize = alph_size(alph, ALPH_SIZE); assert(root_base < asize); int new_root_base = 0; double sum = 0.0; // Sum for normal case. double min = 0.0; // Minimum for min_gap strategy. double result = 0.0; if (is_leaf(tree) == TRUE) { // End of recursion, get character at this leaf new_root_base = get_base_from_node( tree, seq_names, alignment_col, alph ); if (root_base == new_root_base) { // TLB; Pr(T | r(t)=observed_base) result = 1.0; } else if ( gap_support == WILDCARD_GAP && (new_root_base == '-' || new_root_base == '.') ) { result = 1.0; } else { result = 0.0; } } else { result = 1.0; int num_children = get_num_children(tree); int c = 0; // TLB; Pr(t | r(t) = root_base) = \prod_{children(t)} Pr(child | r(t) = root_base) for (c = 0; c < num_children; c++) { min = HUGE_VAL; sum = 0.0; TREE_T* child = get_nth_child(c, tree); MATRIX_T* prob_matrix = get_substmatrix_for_time(substmatrix_array, get_length(child)); assert(prob_matrix != NULL); // TLB // Pr(child | r(t) = root_base) = // \sum_{x \in \alphabet} Pr(r(t)->x | time=d) Pr(child | r(child)=x) for (new_root_base = 0; new_root_base < asize; new_root_base++) { double prior = get_matrix_cell(root_base, new_root_base, prob_matrix); double prob = pruning_algorithm( alph, alignment_col, seq_names, new_root_base, child, substmatrix_array, gap_support ); // Implement minimum gap-handling strategy: choose // the base that minimizes the score. if (gap_support == MIN_GAPS) { if (is_leaf(child)) { char child_char; // Get the character at this leaf. int i = get_index_in_string_list(get_label(child), seq_names); child_char = alignment_col[i]; // Is it a gap character? if ((child_char == '-') || (child_char == '.')) { if (prior < min) { // FIXME: Add a user-specified multiplicative factor. min = prior; } } } } // Normal case: just sum all the values. sum = sum + (prior * prob); } if ((gap_support == MIN_GAPS) && (min != HUGE_VAL)) { result = result * min; } else { result = result * sum; } if (prob_matrix != NULL) { free_matrix(prob_matrix); } } } return result; } /**************************************************************************** * This function is our first implementation of the Felsentein pruning * algorithm for evaluating the log likelihood of a phylogentic tree for a * given site in an alignment. * It sums over all possible base assignments of the internal nodes * using the law of total probability and recursively calls itself until it * reaches the tree leaves. At the leaves paths through the tree whose final * base doesn't match the appropriate sequence site are "pruned" * by returning a value of 0.0. ****************************************************************************/ static double old_pruning_algorithm(ALPH_T alph, char* alignment_col, STRING_LIST_T* seq_names, char old_base, TREE_T* tree, EVOMODEL_T* model, GAP_SUPPORT_T gap_support) { char new_base = 0; char* name = NULL; int a, b; // Indices for old base and new base int c; // Index for child nodes int i; int asize = 0; int num_children = 0; double prior = 0.0; double prob = 0.0; double result = 0.0; double sum = 0.0; // Sum for normal case. double min = 0.0; // Minimum for min_gap strategy. double t = 0.0; TREE_T* child = NULL; MATRIX_T* prob_matrix = NULL; if (is_leaf(tree) == TRUE) { // End of recursion, get character at this leaf name = get_label(tree); i = get_index_in_string_list(name, seq_names); if (i == -1) { // No sequence matches the leaf label die("No sequence in the alignment matches the tree " "node labeled %s.\n", name); } new_base = alignment_col[i]; if (old_base == new_base || new_base == 'N' || old_base == 'N') { result = 1.0; } else { result = 0.0; } } else { result = 1.0; num_children = get_num_children(tree); asize = alph_size(alph, ALPH_SIZE); for (c = 0; c < num_children; c++) { min = HUGE_VAL; sum = 0.0; child = get_nth_child(c, tree); t = get_length(child); prob_matrix = old_get_model_prob_matrix(alph, model, t); a = alph_index(alph, old_base); assert(a != -1 && a < asize); // Call recursively for each child for (b = 0; b < asize; b++) { new_base = alph_char(alph, b); prior = get_matrix_cell(a, b, prob_matrix); prob = old_pruning_algorithm( alph, alignment_col, seq_names, new_base, child, model, gap_support ); // Normal case: just sum all the values. sum = sum + (prior * prob); } result = result * sum; if (prob_matrix != NULL) { free_matrix(prob_matrix); } } } return result; } /**************************************************************************** * Calculate the log likelihood of a phylogenetic tree * at one site in the alignment. ****************************************************************************/ double site_likelihood( ALPH_T alph, char* alignment_col, STRING_LIST_T* seq_names, TREE_T* tree, MODEL_TYPE_T model_type, ARRAY_T* priors, SUBSTMATRIX_ARRAY_T* substmatrix_array, GAP_SUPPORT_T gap_support) { assert(alignment_col != NULL); assert(seq_names != NULL); assert(tree != NULL); int alpha_size = alph_size(alph, ALPH_SIZE); int i = 0; double likelihood = 0.0; if (model_type == SINGLE_MODEL) { // Calculate likelihood directly from PSFM i = alph_index(alph, *alignment_col); likelihood = get_array_item(i, priors); } else if (model_type == AVERAGE_MODEL) { // Calculate likelihood as the *product* of the likelihoods // of the bases in the column. This corresponds to *summing* // the log-likelihoods. likelihood = 1.0; while (*alignment_col != '\0') { i = alph_index(alph, *alignment_col); likelihood *= get_array_item(i, priors); alignment_col++; } } else { // Calcluate likelihood using Felsenstein pruning algorithm int root_base = 0; int start_base = 0; int end_base = alpha_size - 1; // TLB; Check if the root is labeled with a sequence. // If it is, this is the target genome. // Use the observed base only as the ancestor base. if (has_label(tree)) { // target genome at root // TLB; Only compute conditional probability given target at root start_base = end_base = get_base_from_node( tree, seq_names, alignment_col, alph); } for (root_base = start_base; root_base <= end_base; root_base++) { double prob = pruning_algorithm( alph, alignment_col, seq_names, root_base, tree, substmatrix_array, gap_support ); double prior = get_array_item(root_base, priors); likelihood += (prob * prior); } } free_array(priors); return likelihood; } /**************************************************************************** * Calculate the log likelihood of a phylogenetic tree * at one site in the alignment. ****************************************************************************/ static double old_site_likelihood( ALPH_T alph, char* alignment_col, STRING_LIST_T* seq_names, TREE_T* tree, ARRAY_T* priors, EVOMODEL_T* model, GAP_SUPPORT_T gap_support) { int alpha_size = 0; int i, num_seq; double prob = 0.0; double prior = 0.0; double likelihood = 0.0; MODEL_TYPE_T model_type; assert(model != NULL); model_type = get_model_type(model); if (model_type == SINGLE_MODEL) { // Calculate likelihood directly from PSFM i = alph_index(alph, *alignment_col); likelihood = get_array_item(i, priors); } else if (model_type == AVERAGE_MODEL) { // Calculate likelihood directly from PSFM averaged over // bases in alignment column. num_seq = 0; while (*alignment_col != '\0') { i = alph_index(alph, *alignment_col); likelihood += get_array_item(i, priors); alignment_col++; num_seq++; } likelihood = likelihood / num_seq; } else { // Calculate likelihood using Felsenstein pruning algorithm alpha_size = alph_size(alph, ALPH_SIZE); for (i = 0; i < alpha_size; i++) { prob = old_pruning_algorithm(alph, alignment_col, seq_names, alph_char(alph, i), tree, model, gap_support); prior = get_array_item(i, priors); likelihood += (prob * prior); } } return likelihood; } /************************************************************************* * Calculate the log odds score score at a single position. *************************************************************************/ static double slowly_compute_logodds( ALPH_T alph, int i, char* alignment_col, ALIGNMENT_T* alignment, STRING_LIST_T* seq_names, TREE_T* tree, ARRAY_T* bg_freqs, EVOMODEL_T* fg_model, EVOMODEL_T* bg_model, GAP_SUPPORT_T gap_support ) { // Compute the foreground and background scores. get_alignment_col(i, alignment_col, alignment); double fg_result = old_site_likelihood(alph, alignment_col, seq_names, tree, bg_freqs, fg_model, gap_support); double bg_result = old_site_likelihood(alph, alignment_col, seq_names, tree, bg_freqs, bg_model, gap_support); // Compute the log-odds score. double log_odds = my_log2(fg_result / bg_result); return(log_odds); } /************************************************************************* * Calculate the log odds score and p-value for each motif-sized window * at each site in the specified sequence of the alignment using * the given evolutionary models. * Returns number of sites scored (not skipped). *************************************************************************/ int score_sequence_in_alignment( ALPH_T alph, int ref_seq_index, ALIGNMENT_T* alignment, char* motif_id, TREE_T* tree, int window_size, EVOMODEL_T** models, ARRAY_T* bg_freqs, PSSM_T* pssm, int* coord_conv_table, GAP_SUPPORT_T gap_support, double gap_cost, double pthresh, SCANNED_SEQUENCE_T* scanned_seq ) { assert(alignment != NULL); assert(models != NULL); // This function has two modes: fast and memory-intensive, or slow and not memory-intensive. // The function will operate in fast mode, unless the pssm argument is null. if (pssm == NULL) { assert(tree != NULL); assert(bg_freqs != NULL); } else { assert(tree == NULL); assert(bg_freqs == NULL); } // keep track of how many positions were scored int n_scored = 0; // maximum legal score in pv table int max_score = (pssm != NULL) ? get_pssm_pv_length(pssm) : 0; // JCH // This has been added so that the single mode scan // will scan the entire reference sequence and ignore // all gaps in the aligment ALIGNMENT_T* new_alignment; if (get_model_type(models[0]) == SINGLE_MODEL) { // We need a new alignment file that has just the reference sequence char* seqname = get_seq_name(get_alignment_sequence(ref_seq_index, alignment)); new_alignment = remove_alignment_gaps(seqname, alignment); // now remove all other seqs STRING_LIST_T* list = new_string_list(); add_string(seqname, list); new_alignment = remove_alignment_seqs(list, new_alignment); ref_seq_index = 0; // and we need a dummy table for converting coordinates coord_conv_table = make_alignment_to_seq_table(ref_seq_index, new_alignment); //ALIGNMENT_T* temp = alignment; alignment = new_alignment; } // Print the bloody thing out and have a look //print_phylip_alignment(alignment, fopen("Test_alignment_output_file", "w")); // Prepare storage for the string representing // the current alignment column. int alignment_length = get_alignment_length(alignment); int num_sequences = get_num_aligned_sequences(alignment); char* alignment_col = mm_malloc(sizeof(char) * (num_sequences + 1)); // Prepare storage for the string representing the portion // of the reference sequence within the window. char* window_seq = (char *) mm_malloc(sizeof(char) * (window_size + 1)); window_seq[window_size] = '\0'; SEQ_T* ref_seq = get_alignment_sequence(ref_seq_index, alignment); // Build a descriptive name for the sequence being scored. // If the user provided a sequence ID, use that. char* align_name = NULL; char* full_name = NULL; char* seq_name = NULL; seq_name = get_seq_name(ref_seq); align_name = get_alignment_name(alignment); full_name = (char*) mm_malloc( sizeof(char) * (strlen(align_name) + strlen(seq_name) + 2) ); strcpy(full_name, align_name); strcat(full_name, "."); strcat(full_name, seq_name); // For each site in the alignment int i; for (i = 0; i < alignment_length - window_size + 1; i++) { int j; // Fill in the sequence for this window. for (j = 0; j < window_size; j++) { window_seq[j] = get_seq_char(i + j, ref_seq); } // Decide whether to skip this window. BOOLEAN_T skip_window = FALSE; for (j = 0; j < window_size; j++) { if ((alignment_site_has_gaps(i + j, alignment) == TRUE) && (gap_support == SKIP_GAPS)) { skip_window = TRUE; if (verbosity >= DUMP_VERBOSE) { fprintf(stderr, "Skipping sequence %s position %d (%s) due to gap at %d.\n", full_name, i, window_seq, i+j); } break; } if (alignment_site_ambiguous(alph, i + j, alignment) == TRUE) { skip_window = TRUE; if (verbosity >= DUMP_VERBOSE) { fprintf( stderr, "Skipping sequence %s position %d (%s) due to ambiguity at %d.\n", full_name, i, window_seq, i+j ); } break; } } if (skip_window == TRUE) { if (verbosity >= DUMP_VERBOSE) { fprintf( stderr, "Skipping sequence %s position %d (%s) due to ambiguity or gap.\n", full_name, i, window_seq ); } continue; } //fprintf(stderr, "HERE 2\n"); // Compute the total log-odds score across the window. double log_odds = 0.0; for (j = 0; j < window_size; j++) { //fprintf(stderr, "HERE 2.1\n"); // Check for gaps at this site if ((alignment_site_has_gaps(i + j, alignment) == TRUE) && (gap_support == FIXED_GAP_COST)) { log_odds -= gap_cost; } // Use the lookup table to compute log-odds. else if (pssm != NULL) { //fprintf(stderr, "HERE 2.2\n"); //fprintf(stderr, "Model is : %s \n", get_model_type(models[0])); int alignment_col_index; if (get_model_type(models[0]) == SINGLE_MODEL) { //fprintf(stderr, "HERE 2.2.1\n"); alignment_col_index = alph_index(alph, window_seq[j]); } else { //fprintf(stderr, "HERE 2.2.2\n"); get_alignment_col(i + j, alignment_col, alignment); alignment_col_index = hash_alignment_col(alph, alignment_col, num_sequences); } //log_odds += get_matrix_cell(j, alignment_col_index, pssm); log_odds += get_pssm_score(j, alignment_col_index, pssm); } // If no lookup table is provided, compute log-odds the slow way. else { //fprintf(stderr, "HERE 2.3\n"); log_odds += slowly_compute_logodds( alph, i, alignment_col, alignment, get_species_names( alignment), tree, bg_freqs, models[1], models[0], gap_support ); } if (verbosity >= DUMP_VERBOSE) { fprintf(stderr, "%s %d+%d %c log_odds=%g\n", full_name, i, j, window_seq[j], log_odds); } } // Retrieve the p-value corresponding to this log-odds. double pvalue = 0.0; if (pssm != NULL) { if ((int)log_odds >= max_score) log_odds = (float)(max_score - 1); pvalue = get_pssm_pv((int) log_odds, pssm); } // Store information about this site. if ((pssm != NULL)) { char strand = *motif_id; if ((strand != '-') && (strand != '+')) { strand = '.'; } int start = 0; int stop = 0; if (strand == '-') { // Reverse the sense of start/stop for the reverse strand start = coord_conv_table[i + window_size - 1]; stop = coord_conv_table[i]; } else { start = coord_conv_table[i]; stop = coord_conv_table[i + window_size - 1]; } MATCHED_ELEMENT_T *element = allocate_matched_element_with_score(start, stop, log_odds, pvalue, scanned_seq); add_scanned_sequence_scanned_position(scanned_seq); PATTERN_T* pattern = get_scanned_sequence_parent(scanned_seq); BOOLEAN_T added = add_pattern_matched_element(pattern, element); if (added == TRUE) { // Sub-set of raw sequence copied to matched element set_matched_element_sequence(element, window_seq); } else { // Element was rejected by pattern free_matched_element(element); } } n_scored++; } // Clean up memory. if (full_name != seq_name) { myfree(full_name); } myfree(alignment_col); myfree(window_seq); return(n_scored); } // score_sequence_in_alignment //~/Projects/MEME/trunk/src/motiph --bgfile intergenic_markov0 --output-pthresh 0.001 --model single --text --column-freqs empirical --pseudocount 0.01 --list alignments.txt yeast.tree ABF1.meme.txt > testout /************************************************************************* * Calculate the Branch Length Score for each motif-sized window * at each site in the specified sequence of the alignment using * the given evolutionary models. * Returns number of sites scored (not skipped). *************************************************************************/ int bls_score_sequence_in_alignment( ALPH_T alph, int ref_seq_index, ALIGNMENT_T* alignment, char* motif_id, TREE_T* tree, int window_size, ARRAY_T* bg_freqs, EVOMODEL_T** models, PSSM_T* pssm, char* inverse_motif_id, EVOMODEL_T** inverse_models, PSSM_T* inverse_pssm, int* coord_conv_table, GAP_SUPPORT_T gap_support, double gap_cost, double pthresh, int distThreshold, SCANNED_SEQUENCE_T* scanned_seq ) { // The first thing we do is get a single sequence scan score for all sequences // within the alignment. We use the existing functions and data structures to do this. int numSeqs = get_num_aligned_sequences(alignment); int i, j, t, e, temp_scored; int n_scored = 0; //int distThreshold = 20; //fprintf(stderr, "We have %d sequences in the alignment.\n", numSeqs); // We need to create some CISML crap to store each of the single scans results SCANNED_SEQUENCE_T **single_seq_scans = (SCANNED_SEQUENCE_T **) mm_malloc(numSeqs * sizeof(SCANNED_SEQUENCE_T *)); // Using this appears to be causing problems so I need to create a // a set of temporary CSMl crap instead // PATTERN_T* pat = get_scanned_sequence_parent(scanned_seq); PATTERN_T *pat = allocate_pattern(motif_id, motif_id); CISML_T *cisml = allocate_cisml("BLS", motif_id, "clustal-w alignment"); set_cisml_site_pvalue_cutoff(cisml, pthresh); // Duplicate the above if we need to do inverse scans as well SCANNED_SEQUENCE_T **inverse_single_seq_scans = NULL; PATTERN_T *inverse_pat = NULL; CISML_T *inverse_cisml = NULL; if(inverse_models != NULL) { inverse_single_seq_scans = (SCANNED_SEQUENCE_T **) mm_malloc(numSeqs * sizeof(SCANNED_SEQUENCE_T *)); inverse_pat = allocate_pattern(inverse_motif_id, inverse_motif_id); inverse_cisml = allocate_cisml("BLS", inverse_motif_id, "clustal-w alignment"); set_cisml_site_pvalue_cutoff(inverse_cisml, pthresh); //fprintf(stderr, "Inverse Data Structures Created: \n"); } MATCHED_ELEMENT_T *** posElements = (MATCHED_ELEMENT_T ***) mm_malloc(numSeqs * sizeof(MATCHED_ELEMENT_T **)); MATCHED_ELEMENT_T *** negElements = (MATCHED_ELEMENT_T ***) mm_malloc(numSeqs * sizeof(MATCHED_ELEMENT_T **)); MATCHED_ELEMENT_T *** allElements = (MATCHED_ELEMENT_T ***) mm_malloc(numSeqs * sizeof(MATCHED_ELEMENT_T **)); int *numHits = malloc( sizeof(int) * numSeqs); //int[numSeqs]; for(i=0;i 0) sort_matched_elements(TRUE, numHits[i], allElements[i]); // fprintf(stderr, "Scored %d positions in sequence %d with %d matches \n", temp_scored, i, numHits[i]); } // Ave a geezer /* for(i=0;i 0) { // Now we need to find the shortest distance int daStopSeq = get_matched_element_stop(elements[j]); // Translate these into positions in the alignment int daStart = seq_to_aln_tables[i][daStartSeq]; int daStop = seq_to_aln_tables[i][daStopSeq]; int numGaps1 = abs( cumulativeGapCounts[targStart] - cumulativeGapCounts[daStart] ); int diff1 = abs(targStart - daStart) - numGaps1; int smallest = diff1; int numGaps2 = abs( cumulativeGapCounts[targStop] - cumulativeGapCounts[daStop] ); int diff2 = abs(targStop - daStop) - numGaps2; if(diff2 < smallest) smallest = diff2; int numGaps3 = abs( cumulativeGapCounts[targStart] - cumulativeGapCounts[daStop] ); int diff3 = abs(targStart - daStop) - numGaps3; if(diff3 < smallest) smallest = diff3; int numGaps4 = abs( cumulativeGapCounts[targStop] - cumulativeGapCounts[daStart] ); int diff4 = abs(targStop - daStart) - numGaps4; if(diff4 < smallest) smallest = diff4; if(smallest < distThreshold) { char* label = get_nth_string(i, speciesLabels); //fprintf(stderr, " Match in seq [%d] %s : %d - %d \n", i, label, start, targStart); add_string(label, list_of_matching_seqs); numOfMatches++; // Remove that match from further consideration // we set the start to -1.0 as a flag set_matched_element_start(elements[j], -1); break; } } } } } // If the number of matches in comparative species is greater than 0 // Then we add in the target genome to the list of labels if(numOfMatches > 0) { char* label = get_nth_string(ref_seq_index, speciesLabels); add_string(label, list_of_matching_seqs); } // Now we have a list of leaf labels for seqs that have the motif // We retrieve the branch length of the sub-tree they define float subtreeLength = get_subtree_length(tree, list_of_matching_seqs); //fprintf(stderr, " Subtree length %f and total tree length %f \n", subtreeLength, totalLength); float score = (float) subtreeLength / totalLength; //fprintf(stderr, " Final Score %f \n", score); // Store information about this site. MATCHED_ELEMENT_T *element = allocate_matched_element_without_inversion( targStartSeq, targStopSeq, get_matched_element_sequence(targetElements[t]), scanned_seq ); set_matched_element_score(element, score); set_matched_element_pvalue(element, 1-score); add_scanned_sequence_matched_element(scanned_seq, element); /* float p_val_sub = get_matched_element_pvalue(targetElements[t]); if(score > 0) p_val_sub = (1/score) * get_matched_element_pvalue(targetElements[t]); else p_val_sub = (2*numSeqs) * get_matched_element_pvalue(targetElements[t]); if(p_val_sub > 1) p_val_sub = 1; set_matched_element_pvalue(element, p_val_sub ); */ n_scored++; // Free up some memory free_string_list(list_of_matching_seqs); } // Clean up memory. for(i=0;i