/************************************************************************** * FILE: mhmms.c * AUTHOR: William Stafford Noble, Timothy L. Bailey * CREATE DATE: 8-13-97 * PROJECT: MHMM * VERSION: $Revision: 1.3 $ * COPYRIGHT: 1998-2002, WNG, 2001-2002, TLB * DESCRIPTION: Search a database of sequences using a motif-based HMM. **************************************************************************/ #include #include #include #include #include #include #include #include "utils.h" // Generic utilities. #include "matrix.h" // Routines for floating point matrices. #include "array.h" // Routines for floating point arrays. #include "metameme.h" // Global metameme functions. #include "alphabet.h" // The amino acid / nucleotide alphabet. #include "fasta-io.h" // Read FASTA format sequences. #include "mhmm-state.h" // HMM data structure. #include "read-mhmm.h" // HMM input/output. #include "log-hmm.h" // HMM log/log-odds conversion. #include "fitevd.h" // Extreme value distribution routines. #include "rdb-matrix.h" // For reading background files #include "match.h" // Find sequence-to-model match from traceback matrix. #include "dp.h" // Dynamic programming routines. #include "pssm.h" // Motif-scoring routines. #include "seq.h" // Sequence manipulation routines #include "mhmms.h" #ifndef DEBUG #define DEBUG 0 #endif /************************************************************************* * The following block of global variables and #define's concerns * storing sequence scores and descriptions for later sorting and * printing. *************************************************************************/ #define ALLOCATE_BLOCK 1000 // Number of sequences to allocate at once. #define PV_WIDTH 11 // Number of digits in one p-value. #define SCORE_WIDTH 7 // Number of digits in one score. #define SCORE_PRECISION 2 // Number of digits after zero. #define LENGTH_WIDTH 10 // Number of digits to print sequence length. #define MAX_SEQ_ID_LENGTH 40 // Maximum allowed ID length. #define GC_WIN_SIZE 500 // Size of GC window left and right of match. /* scaling and unscaling of scores for non-local EVDs */ #define ALPHA 2.6 // Kludge factor for non-local EVDs. #define BETA 1 // Kludge factor for non-local EVDs. #define SCALE(s, m) (pow((s) - (m) + BETA, ALPHA)) #define UNSCALE(s, m) (pow((s), 1/ALPHA) + (m) - BETA) static int num_stored = 0; // Number of sequences stored so far. static char** stored_IDs; // Stored sequence IDs. static PROB_T* stored_keys; // Stored sequence keys. static PROB_T* stored_scores; // Stored sequence scores. static int* stored_lengths; // Stored sequence lengths. static double* stored_gcs; // Stored match local GC-contents. static char** stored_seqs; // Stored sequence scores and descriptions. static char** stored_traces; // Stored Viterbi traces. // Store the first part of the match line and the subsequent motif lines // separately, so we can insert the correct E-value later. static char** stored_match_gffs; // GFF entry for 'match' line. static char** stored_motif_gffs; // GFF entry for 'motif' lines. static int longest_ID = 0; // Length of the longest ID. static int num_allocated = 0; // Number of sequences allocated. BOOLEAN_T skipped_last = FALSE; // Did we skip the previous sequence? /************************************************************************* * See .h file. *************************************************************************/ int get_num_stored() { return(num_stored); } /************************************************************************* * Print parameters associated with the extreme value distribution. *************************************************************************/ static void print_distr_parameters( EVD_SET evd_set, // Distributions. SCORE_SET score_set, // Scores. FILE* out_stream // Output stream. ) { int i; fprintf(out_stream, "\nDerived parameters for E-value calculations\n"); fprintf(out_stream, " type of distribution: %s\n", evd_set.exponential ? "Exponential" : "Extreme Value"); fprintf(out_stream, " source of distribution: %s\n", evd_set.negatives_only ? "Synthetic sequence scores" : "Actual sequence scores"); fprintf(out_stream, " status: %s\n", evd_set.msg); if (evd_set.n > 0) { for (i=0; i 1) { // Length ranges. fprintf(out_stream, " min_t: %.3f\n", evd_set.evds[i].min_t); fprintf(out_stream, " max_t: %.3f\n", evd_set.evds[i].max_t); } if (evd_set.exponential) { fprintf(out_stream, " mu1: %.3g\n", evd_set.evds[i].mu1); if (!evd_set.negatives_only) { // No second component? fprintf(out_stream, " mu2: %.3g\n", evd_set.evds[i].mu2); fprintf(out_stream, " sigma2: %.3g\n", evd_set.evds[i].sigma2); fprintf(out_stream, " c: %.3g\n", evd_set.evds[i].c); } fprintf(out_stream, " number of scores used for estimation: %d\n", evd_set.evds[i].n); } else { // Extreme value parameters. fprintf(out_stream, " lambda: %.2g\n", evd_set.evds[i].lambda); fprintf(out_stream, " K: %.2g\n", evd_set.evds[i].K); fprintf(out_stream, " number of non-outliers: %d\n", evd_set.non_outliers); } } fprintf(out_stream, " N (E-value = N * p-value): %d\n", evd_set.N); fprintf(out_stream, " mininum E-value: %.3g\n", evd_set.min_e); fprintf(out_stream, " number of E-values < 1.0: %d\n", evd_set.outliers); fprintf(out_stream, " sum(-log E-values < 1.0): %.3g\n", -evd_set.sum_log_e); } fprintf(out_stream, " number of matches: %d\n", score_set.n); fprintf(out_stream, " db length: %.0f\n", score_set.total_length); if (evd_set.exponential) { fprintf(out_stream, " average minimum p-value: %.3g\n", score_set.avg_min_pvalue); fprintf(out_stream, " average motif width: %.3g\n", score_set.avgw); /* fprintf(out_stream, " probability of a false hit: %.3g\n", score_set.phit); fprintf(out_stream, " probability of a false hit within max-gap: %.3g\n", score_set.wimp); fprintf(out_stream, " E[distance between hits]: %.3g\n", score_set.egap); fprintf(out_stream, " E[hits/match]: %.3g\n", score_set.ehits); fprintf(out_stream, " E[span]: %.3g\n", score_set.espan); */ } fprintf(out_stream, "\n"); } // print_distr_parameters /************************************************************************* * Print a program's parameters and CPU time. *************************************************************************/ void print_parameters( char* const argv[], // command line int argc, // number of fields in command line char* name, // program name ALPH_T alph, // alphabet char* hmm_filename, // hmm input file name char* seq_filename, // sequence file name BOOLEAN_T viterbi, // Viterbi search? double dp_threshold, // Repeated match threshold BOOLEAN_T motif_scoring, // No partial motifs? BOOLEAN_T use_pvalues, // Convert matches to p-values? double p_threshold, // P-value threshold for motif hits. double e_threshold, // E-value threshold for printing. BOOLEAN_T both_strands, // Score both DNA strands? char* bg_filename, // Background file name. char* sc_filename, // Score file name. int pam_distance, // PAM distance, double beta, // Pseudo-weight. BOOLEAN_T zero_spacer_emit_lo, // Zero emission for spacers? int max_gap, // Maximum gap. double egcost, // Cost factor for gaps. double gap_open, // Gap open penalty. double gap_extend, // Gap extend penalty. BOOLEAN_T print_fancy, // Print alignments? BOOLEAN_T print_time, // Print CPU time? double start_time, // Starting time double end_time, // Ending time int num_seqs, // Number of sequences processed. EVD_SET evd_set, // EVD data structure. SCORE_SET score_set, // Scores. FILE* out_stream // Output stream. ) { // Print parameters used in calculating p-values. print_distr_parameters(evd_set, score_set, out_stream); fprintf(out_stream, "Program parameters for %s\n", name); fprintf(out_stream, " HMM file: "); if (hmm_filename == NULL) { fprintf(out_stream, "stdin\n"); } else { fprintf(out_stream, "%s\n", hmm_filename); } fprintf(out_stream, " Sequence file: "); if (seq_filename == NULL) { fprintf(out_stream, "stdin\n"); } else { fprintf(out_stream, "%s\n", seq_filename); } if (viterbi) { fprintf(out_stream, " Paths: single\n"); } else { fprintf(out_stream, " Paths: all\n"); } if (dp_threshold != NO_REPEAT) { fprintf(out_stream, " Minimum match score: %g\n", dp_threshold); } fprintf(out_stream, " Partial motif matches are: %s\n", motif_scoring ? "not allowed" : "allowed"); if (motif_scoring) fprintf(out_stream, " Motif scores are: %s\n", use_pvalues ? "-log(p_value(log_odds)/p-thresh))" : "log_odds"); if (use_pvalues) fprintf(out_stream, " P-value threshold for motif hits: %g\n", p_threshold); if (alph == DNA_ALPH) { // FIXME: Add this if we ever implement scoring both strands. //fprintf(out_stream, " Matches can be on: %s\n", // both_strands ? "either strand" : "given strand only"); if (both_strands) both_strands = TRUE; // prevent compiler warning } fprintf(out_stream, " E-value threshold for scores: %g\n", e_threshold); fprintf(out_stream, " Background: %s\n", bg_filename ? bg_filename : "nrdb"); fprintf(out_stream, " Score matrix for pseudocount frequencies: %s", sc_filename ? sc_filename : "PAM"); if (sc_filename) { fprintf(out_stream, "\n"); } else { fprintf(out_stream, " %d\n", pam_distance); } fprintf(out_stream, " Beta (weight on pseudocount frequencies): %6.2f\n", beta); fprintf(out_stream, " Zero spacer emission log-odds: %s\n", zero_spacer_emit_lo ? "true" : "false"); if (max_gap > 0) { fprintf(out_stream, " Maximum allowed gap: %d\n", max_gap); } if (egcost > 0) { fprintf(out_stream, " Cost factor for gaps: %8.2f\n", egcost); } if (gap_open >= 0) { fprintf(out_stream, " Gap open cost: %8.2g\n", gap_open); } if (gap_extend >= 0) { fprintf(out_stream, " Gap extension cost: %8.2g\n", gap_extend); } fprintf(out_stream, " Fancy output format: %s\n", print_fancy ? "true" : "false"); fprintf(out_stream, "\n"); // Print the actual command line. { char *command = NULL; // To hold the actual command line. int i = 1, pos = 0, len = 0; len += strlen(name)+1; // +1 for space following mm_resize(command, len+2, char); // +1 for null strcpy(command+pos, name); command[len-1] = ' '; command[len] = '\0'; pos = len; for (i=1; istates[trace_entry]); } else { the_state = NULL; } // Mark unmatched section with spaces. if ((trace_entry == -1) || (the_state->type == START_STATE) || (the_state->type == END_STATE)) { score_sequence[i_trace] = ' '; id_sequence[i_trace] = ' '; model_sequence[i_trace] = ' '; match_sequence[i_trace] = ' '; } else { // Mark spacer states with dots in consensus, spaces elsewhere. if (the_state->type == SPACER_STATE) { score_sequence[i_trace] = ' '; id_sequence[i_trace] = ' '; model_sequence[i_trace] = '.'; match_sequence[i_trace] = ' '; } // Mark motif sections ... else { // Find the motif score at this position. if (the_state->type == START_MOTIF_STATE && motif_score_matrix) { char tmp[400]; int i; double s = get_matrix_cell(the_state->i_motif, i_trace, motif_score_matrix); double score = p_threshold > 0 ? pow(2.0, -s) * p_threshold : s; sprintf(tmp, "%-*.1e/", the_state->w_motif, score); for (i=0; iw_motif; i++) { score_sequence[i_trace+i] = tmp[i]; } } // Find the ID character at this position. id_sequence[i_trace] = the_state->id_char; assert(the_state->id_char != '\0'); // Find the consensus letter at this position. model_sequence[i_trace] = choose_consensus(the_log_hmm->alph, TRUE, the_state); // Label exact matches with letters. if (model_sequence[i_trace] == get_seq_char(i_trace, sequence)) { match_sequence[i_trace] = get_seq_char(i_trace, sequence); } // Label close matches with plus signs. else if (get_array_item(get_seq_int(i_trace, sequence), the_state->emit_odds) > 0.0) { match_sequence[i_trace] = '+'; } else { match_sequence[i_trace] = ' '; } } } } // Add null terminators. score_sequence[trace_end+1] = '\0'; id_sequence[trace_end+1] = '\0'; model_sequence[trace_end+1] = '\0'; match_sequence[trace_end+1] = '\0'; // Replace flanking Xs with spaces. set_seq_char(0, ' ', sequence); set_seq_char(get_seq_length(sequence) - 1, ' ', sequence); } /************************************************************************* * A trivial function to check whether a given string consists of all * spaces. *************************************************************************/ static BOOLEAN_T all_spaces (char* a_string) { int i; int length; length = strlen(a_string); for (i = 0; i < length; i++) { if (a_string[i] != ' ') { return(FALSE); } } return(TRUE); } /************************************************************************* * Figure out which positions to start and end the alignment. * * We are given the length of this sequence segment, its offset * relative to the entire sequence, the width of the desired * alignment, and the start and end positions of the match within this * segment. The function converts the start and end positions from * segment-relative to sequence-relative coordinates. Using these new * positions, the function computes the start position of the row of * the alignment that would contain the start of this match, and the * end position of the row of the alignment that would contain the end * of this match. These coordinates are then converted back into * segment-relative coordinates. *************************************************************************/ void compute_alignment_start_end( int align_width, int seq_length, int seq_offset, int start_match, int end_match, int* align_start, int* align_end ) { // Convert the start and end to absolute sequence coordinates. int absolute_start_match = start_match + seq_offset; int absolute_end_match = end_match + seq_offset; // The trace starts at first multiple of align_width prior // to the start of the match. int absolute_align_start = ((absolute_start_match / align_width) * align_width); // Trace ends in the first whole multiple of align_width // containing the end of the match. int absolute_align_end = ((absolute_end_match / align_width) * align_width) + align_width; // The end cannot occur beyond the end of the sequence. absolute_align_end = MIN(seq_length + seq_offset - 2, absolute_align_end); // Revert to the relative coordinate for this segment. *align_start = MAX(1, absolute_align_start - seq_offset); *align_end = absolute_align_end - seq_offset; // Make sure everything is hunky dory. assert(start_match <= end_match); assert(*align_start < *align_end); } /************************************************************************* * Given a sequence, a model, and a match, print the sequence-to-model * alignment in BLAST-style format. * * The formatted output is returned as a single string, which must * be freed by the caller. *************************************************************************/ void print_alignment (int align_width, // Number of characters in one alignment line. MHMM_T* the_log_hmm, // Log-odds version of the HMM. MATCH_T* this_match, // The match. SEQ_T* sequence, // The sequence as characters. MATRIX_T* motif_score_matrix, // Motif score matrix. PROB_T p_threshold, // P-value threshold for motif hits. char** outstring) // Output string. { char* score_sequence; // Motif scores. char* id_sequence; // Motif ID numbers. char* model_sequence; // Consensus chars along path. char* match_sequence; // Indications of matches between // sequence & model. int i_seq; // Index into the sequence. int seq_length; // Length of the sequence. char* raw_sequence; // The sequence itself. int end_position; // End position of the current line. int outstring_length; // Number of chars in output string. int align_start; // Position in sequence to start alignment at. int align_end; // Position in sequence to end alignment at. // Buffers for storing stuff before printing it. static char* buffer0 = NULL; static char* buffer1; static char* buffer2; static char* buffer3; static char* buffer4; // Create local dynamic storage. if (buffer0 == NULL) { buffer0 = (char*)mm_malloc(sizeof(char) * (align_width + 1)); buffer1 = (char*)mm_malloc(sizeof(char) * (align_width + 1)); buffer2 = (char*)mm_malloc(sizeof(char) * (align_width + 1)); buffer3 = (char*)mm_malloc(sizeof(char) * (align_width + 1)); buffer4 = (char*)mm_malloc(sizeof(char) * (align_width + 1)); } // Return the empty string if there's no trace. if (this_match == NULL) { *outstring = (char *)mm_malloc(sizeof(char)); (*outstring)[0] = '\0'; return; } // Set up the start and end points for the trace. compute_alignment_start_end(align_width, get_seq_length(sequence), get_seq_offset(sequence), get_start_match(this_match), get_end_match(this_match), &align_start, &align_end); // Allocate space for sequences. seq_length = get_seq_length(sequence); score_sequence = (char *)mm_malloc(sizeof(char) * (seq_length + 1)); id_sequence = (char *)mm_malloc(sizeof(char) * (seq_length + 1)); model_sequence = (char *)mm_malloc(sizeof(char) * (seq_length + 1)); match_sequence = (char *)mm_malloc(sizeof(char) * (seq_length + 1)); // Find the correspondence between sequence and model. generate_match_sequence(the_log_hmm, align_start, align_end, this_match, motif_score_matrix, p_threshold, sequence, score_sequence, id_sequence, model_sequence, match_sequence); // Allocate plenty of space for the output string. outstring_length = ((align_end - align_start) + align_width) * 10; *outstring = (char *)mm_malloc(sizeof(char) * outstring_length); // Display the alignment. sprintf(*outstring, "\n"); buffer0[align_width] = '\0'; buffer1[align_width] = '\0'; buffer2[align_width] = '\0'; buffer3[align_width] = '\0'; buffer4[align_width] = '\0'; raw_sequence = get_raw_sequence(sequence); for (i_seq = align_start; i_seq <= align_end; i_seq += align_width) { // Figure out whether we're nearing the end of the sequence. if (i_seq + align_width <= align_end) { strncpy(buffer0, &(score_sequence[i_seq]), align_width); strncpy(buffer1, &(id_sequence[i_seq]), align_width); strncpy(buffer2, &(model_sequence[i_seq]), align_width); strncpy(buffer3, &(match_sequence[i_seq]), align_width); strncpy(buffer4, &(raw_sequence[i_seq]), align_width); end_position = i_seq + align_width - 1; } else { strcpy(buffer0, &(score_sequence[i_seq])); strcpy(buffer1, &(id_sequence[i_seq])); strcpy(buffer2, &(model_sequence[i_seq])); strcpy(buffer3, &(match_sequence[i_seq])); strncpy(buffer4, &(raw_sequence[i_seq]), align_end - i_seq + 1); buffer4[align_end - i_seq + 1] = '\0'; // Null terminate the sequence. end_position = align_end; } // If there is no alignment in this part, then skip it. if ((all_spaces(buffer1)) && (all_spaces(buffer2)) && (all_spaces(buffer3))) { continue; } if (motif_score_matrix) { // scores above hits? sprintf(&((*outstring)[strlen(*outstring)]), "%*s %s %*s\n", LENGTH_WIDTH, "", buffer0, LENGTH_WIDTH, ""); } sprintf(&((*outstring)[strlen(*outstring)]), "%*s %s %*s\n", LENGTH_WIDTH, "", buffer1, LENGTH_WIDTH, ""); sprintf(&((*outstring)[strlen(*outstring)]), "%*s %s %*s\n", LENGTH_WIDTH, "", buffer2, LENGTH_WIDTH, ""); sprintf(&((*outstring)[strlen(*outstring)]), "%*s %s %*s\n", LENGTH_WIDTH, "", buffer3, LENGTH_WIDTH, ""); sprintf(&((*outstring)[strlen(*outstring)]), "%*d %-*s %*d\n", LENGTH_WIDTH, MAX(1, i_seq) + get_seq_offset(sequence), align_width, buffer4, LENGTH_WIDTH, MIN(end_position, get_seq_length(sequence) - 2) + get_seq_offset(sequence)); sprintf(&((*outstring)[strlen(*outstring)]), "\n"); } sprintf(&((*outstring)[strlen(*outstring)]), "\n"); // Check for array bounds write. assert((int)strlen(*outstring) < outstring_length); // Free locally allocated memory. myfree(score_sequence); myfree(id_sequence); myfree(model_sequence); myfree(match_sequence); } /************************************************************************* * Combine a bunch of information into a single GFF line. The result * is returned in a local static string. *************************************************************************/ #define MAX_GFF 10000 static char* make_motif_gff_line (char* seq_id, char* motif_id, int start_match, int width, double pvalue, char strand) { static char return_value[MAX_GFF]; sprintf(return_value, "%s\tMeta-MEME\tmotif%s\t%d\t%d\t%g\t%c\t.\n", seq_id, motif_id, start_match, start_match + width - 1, pvalue, strand); return(return_value); } /************************************************************************* * Store a GFF entry for the given match object in the given string. * The gff_entry string is allocated by this function and must be * freed by the caller. *************************************************************************/ static void print_gff_entry (SEQ_T* sequence, MATCH_T* this_match, MHMM_T* the_log_hmm, MATRIX_T* motif_score_matrix, // Motif score matrix. char** match_string, char** motif_string) { static char temp_match[MAX_GFF]; // Temporary storage. static char temp_motif[MAX_GFF]; // Temporary storage. int i_match; // Create a single line for the entire match, minus the evalue. sprintf(temp_match, "%s\t%s\t%s\t%d\t%d\t", get_seq_name(sequence), "Meta-MEME", "match", // GFF indexes from 1... get_start_match(this_match) + get_seq_offset(sequence) + 1, get_end_match(this_match) + get_seq_offset(sequence) + 1); // Create additional lines for the motif matches. temp_motif[0] = '\0'; for (i_match = get_start_match(this_match); i_match < get_end_match(this_match); i_match++) { int i_state = get_trace_item(i_match, this_match); MHMM_STATE_T* this_state = &((the_log_hmm->states)[i_state]); // Are we at the start of a motif? if (this_state->type == START_MOTIF_STATE) { strcat(temp_motif, make_motif_gff_line(get_seq_name(sequence), get_state_motif_id(FALSE, this_state), get_seq_offset(sequence) + i_match + 1, this_state->w_motif, motif_score_matrix ? get_matrix_cell(this_state->i_motif, i_match, motif_score_matrix) : 0.0, get_strand(this_state) ) ); } } // Allocate space and copy the gff entries. copy_string(match_string, temp_match); copy_string(motif_string, temp_motif); } /************************************************************************* * Store a sequence's scores, ID and description for later printing. * Also stores the score and length in score_set. *************************************************************************/ void store_sequence (ALPH_T alph, // Alphabet BOOLEAN_T motif_scoring, // Motif-scoring being used. BOOLEAN_T mhmmscan, // Print mhmmscan output (else mhmms output)? int output_width, // Width in chars of the output. SEQ_T* sequence, // Info about the sequence. MATCH_T* this_match, // Info about this match. PROB_T e_threshold, // E-value reporting threshold. double dp_threshold, // Subtract from viterbi score. PROB_T p_threshold, // P-value threshold for motif hits. SCORE_SET *score_set, // Scores and lengths for computing distribution. BOOLEAN_T got_evd, // EVD is available EVD_SET evd_set, // EVD BOOLEAN_T store_trace, // Store the Viterbi trace? MHMM_T* the_log_hmm, // The HMM. MATRIX_T* motif_score_matrix, // Motif score matrix. BOOLEAN_T store_gff, // Store a GFF entry? SCANNED_SEQUENCE_T *scanned_seq // CISML scanned sequence structure ) { int length; // Length of this sequence. int span; // Span of match (end - start + 1). int nhits; // Number of motif hits in match. double viterbi_score; // Score of this sequence. int out_len = 0; // Length printed so far. double gc = 0; // GC-content around match. double t_or_gc; // Length or GC-content of sequence. viterbi_score = get_score(this_match) - dp_threshold; length = get_match_seq_length(this_match); span = get_end_match(this_match) - get_start_match(this_match) + 1; nhits = get_nhits(this_match); char *seq_name = get_seq_name(sequence); // // Save the score and sequence length. Keep track of maximum length // and smallest score. // if (viterbi_score > LOG_SMALL) { /* don't save tiny scores */ // Save unscaled score and sequence length for calculating EVD. if (!(score_set->n % ALLOCATE_BLOCK)){ mm_resize(score_set->scores, score_set->n + ALLOCATE_BLOCK, SCORE); //fprintf(stderr, "Resized score_set to %d \n", score_set->n + ALLOCATE_BLOCK); } score_set->scores[score_set->n].s = viterbi_score; /* unscaled score */ score_set->scores[score_set->n].t = length; score_set->scores[score_set->n].nhits = nhits; score_set->scores[score_set->n].span = span; if (length > score_set->max_t) score_set->max_t = length; // FIXME: Bill: do I need to worry about the sequence offset if // the match doesn't contain the whole sequence? if (alph == DNA_ALPH) { // save GC content // the GC count is (gc[match_end] - gc[match_start-1]) int start = MAX(0, get_start_match(this_match)-GC_WIN_SIZE); int end = MIN(get_seq_length(sequence)-1, get_end_match(this_match)+GC_WIN_SIZE); double n = end - start; gc = score_set->scores[score_set->n].gc = ( get_seq_gc(end, sequence) - get_seq_gc(start, sequence) ) / n; } score_set->n++; //fprintf(stderr, "score_set->n %d %f\n", score_set->n, viterbi_score); } else { /* skip tiny scores */ return; } /* Use E-value to determine if score should be skipped. * The threshold is multiplied by 10 because E-values may be fairly * inaccurate at this point. */ t_or_gc = (got_evd && evd_set.exponential) ? gc : length; if ((e_threshold < 0) || ((got_evd) && (Evd_set_evalue(score_set->n, viterbi_score, t_or_gc, 1, evd_set) > 10.0 * e_threshold))) { return; } /* Allocate memory, if necessary. */ if (num_stored >= num_allocated) { if (num_allocated > 0) { num_allocated = 2 * num_allocated; } else { num_allocated = ALLOCATE_BLOCK; } stored_IDs = (char**)mm_realloc(stored_IDs, sizeof(char*) * num_allocated); stored_keys = (PROB_T *)mm_realloc(stored_keys, sizeof(PROB_T) * num_allocated); stored_scores = (PROB_T *)mm_realloc(stored_scores, sizeof(PROB_T) * num_allocated); stored_lengths = (int *)mm_realloc(stored_lengths, sizeof(int) * num_allocated); stored_gcs = (double *)mm_realloc(stored_gcs, sizeof(double) * num_allocated); stored_seqs = (char**)mm_realloc(stored_seqs, sizeof(char*) * num_allocated); stored_traces = (char**)mm_realloc(stored_traces, sizeof(char*) * num_allocated); stored_match_gffs = (char**)mm_realloc(stored_match_gffs, sizeof(char*) * num_allocated); stored_motif_gffs = (char**)mm_realloc(stored_motif_gffs, sizeof(char*) * num_allocated); } /* Check how long this ID is. */ if ((signed)strlen(get_seq_name(sequence)) > longest_ID) { longest_ID = strlen(get_seq_name(sequence)); } /* Allocate memory for the new sequence. */ stored_IDs[num_stored] = (char*)mm_calloc(longest_ID + 1, sizeof(char)); stored_seqs[num_stored] = (char*)mm_calloc(output_width + 1, sizeof(char)); /* Store the ID, the key and the length. */ strcpy(stored_IDs[num_stored], get_seq_name(sequence)); stored_keys[num_stored] = viterbi_score; stored_lengths[num_stored] = length; stored_gcs[num_stored] = gc; // If scaned sequence is provided, add the match to it if (scanned_seq) { int start = MAX(0, get_start_match(this_match)-GC_WIN_SIZE); int end = MIN(get_seq_length(sequence)-1, get_end_match(this_match)+GC_WIN_SIZE); char *matched_sequence = get_raw_subsequence(start, end, sequence); MATCHED_ELEMENT_T *element = allocate_matched_element(start, end, scanned_seq); set_matched_element_score(element, viterbi_score); set_matched_element_sequence(element, matched_sequence); } // // Store the scores and the description. // // Print score or "NaN" if too small. if (viterbi_score <= LOG_SMALL) { sprintf(stored_seqs[num_stored], "%*s ", SCORE_WIDTH, "NaN"); } else { sprintf(stored_seqs[num_stored], "%*.*f ", SCORE_WIDTH, SCORE_PRECISION, viterbi_score + dp_threshold); } out_len += SCORE_WIDTH + 1; // Print GC content. if (0) { // Disabled. sprintf(stored_seqs[num_stored] + out_len, "%*.*f ", SCORE_WIDTH, SCORE_PRECISION, score_set->scores[(score_set->n)-1].gc); out_len += SCORE_WIDTH + 1; } // Print number of hits and span if doing motif scoring. if (motif_scoring) { sprintf(stored_seqs[num_stored] + out_len, "%*d %*d ", LENGTH_WIDTH, nhits, LENGTH_WIDTH, span ); out_len += 2 * (LENGTH_WIDTH + 1); } // Print start and end of match. sprintf(stored_seqs[num_stored] + out_len, "%*d %*d ", LENGTH_WIDTH, get_start_match(this_match) + get_seq_offset(sequence), LENGTH_WIDTH, get_end_match(this_match) + get_seq_offset(sequence) ); out_len += 2 * (LENGTH_WIDTH+1); // Print sequence length. if (!mhmmscan) { sprintf(stored_seqs[num_stored] + out_len, "%*d ", LENGTH_WIDTH, get_seq_length(sequence) - 2 // Chop off X's. ); out_len += LENGTH_WIDTH+1; } // Print description of sequence. if (output_width > out_len) { sprintf(stored_seqs[num_stored] + out_len, "%-*.*s", output_width - out_len, output_width - out_len, get_seq_description(sequence)); } // Store the Viterbi alignment. if (store_trace) { print_alignment(compute_align_width(output_width), the_log_hmm, this_match, sequence, motif_score_matrix, p_threshold, &(stored_traces[num_stored])); } // Store the GFF entry. if (store_gff) { print_gff_entry(sequence, this_match, the_log_hmm, motif_score_matrix, &(stored_match_gffs[num_stored]), &(stored_motif_gffs[num_stored])); } /* Increment the number of stored sequences. */ num_stored++; } // store_sequence #define SEPARATOR "-----------------------------------------------------------------------------------------------------------------------------------" /* I define this as a struct only so that the two items can be passed as a single pointer to the 'qsort' routine. */ typedef struct tosort { PROB_T key; char* out_string; char* gff_string; } TOSORT_T; /************************************************************************* * A comparison function used in sorting the output before printing. * Sorts by keys in ascending order. *************************************************************************/ static int tosort_compare (const void * elem1, const void * elem2) { const PROB_T key1 = ((TOSORT_T *)elem1)->key; const PROB_T key2 = ((TOSORT_T *)elem2)->key; const char* string1 = ((TOSORT_T *)elem1)->out_string; const char* string2 = ((TOSORT_T *)elem2)->out_string; if (key1 < key2) { return(-1); } else if (key1 > key2) { return(1); } else { return(strcmp(string2, string1)); } } /************************************************************************* * Tiny function to find an EOL in a string. Dies if none is found. *************************************************************************/ static int find_eol (char* this_entry) { int i; int length; length = strlen(this_entry); for (i = 0; i < length; i++) { if (this_entry[i] == '\n') { return(i); } } die("No EOL found in %s.", this_entry); return(0); } /************************************************************************* * Sort the stored list of sequences, scores and IDs, and print them * out. *************************************************************************/ void sort_and_print_scores (BOOLEAN_T print_fancy, // Include Viterbi alignment? BOOLEAN_T print_header, // Print header? BOOLEAN_T got_evd, // EVD found? BOOLEAN_T motif_scoring,// Motif-scoring being used. BOOLEAN_T mhmmscan, // Print in mhmmscan format (else mhmms)? int maxseqs, // Maximum number of sequences to print int output_width, // Width of output, in chars. PROB_T threshold, // Print sequences scoring below this. BOOLEAN_T sort_output, // Sort scores? FILE* gff_file, // Auxiliary GFF file. FILE* outfile) // Print to this file. { int i_stored; // Index into the arrays of stored sequences. TOSORT_T* tosort; // Array of output strings to be sorted. int num_tosort; // Number of output strings to be sorted. int i_tosort; // Index into array of to-be-sorted sequences. int i_toprint; // Index for printing sequences. int num_toprint; // Number of sequences to print. int num_under = 0;/* Number of sequences stored with E-values below given threshold. */ /* If we get ridiculously long sequence IDs, truncate them. */ if (longest_ID > MAX_SEQ_ID_LENGTH) { fprintf(stderr, "Warning: truncating absurdly long sequence IDs.\n"); longest_ID = MAX_SEQ_ID_LENGTH; } // Get the accurate number of sequences below the E-value threshold. // If there were too few scores for the E-value computation, print // the first "threshold" matches, sorted by decreasing score. if (got_evd) { num_under = 0; for (i_stored = 0; i_stored < num_stored; i_stored++) { double evalue = stored_keys[i_stored]; if (evalue <= threshold) num_under++; } } else { // The sorted keys contain the raw scores. Replace them // with -score so that the sort by increasing order will // put the best scores first. fprintf(stderr, "Warning: Unable to compute E-values. "); fprintf(stderr, "Sorting by score instead.\n"); for (i_stored = 0; i_stored < num_stored; i_stored++) { stored_keys[i_stored] *= -1; // -score so sort works. } num_under = num_stored; threshold = HUGE_VAL; // All keys less than this. } tosort = (TOSORT_T *)mm_malloc(sizeof(TOSORT_T) * num_stored); // Consider each stored sequence in turn. i_tosort = 0; for (i_stored = 0; i_stored < num_stored; i_stored++) { double score = stored_keys[i_stored]; // E-value or raw score. double evalue; // E-value or NaN. // Skip low-scoring sequences if (score > threshold) continue; // Allocate memory for this item. if (print_fancy) { tosort[i_tosort].out_string = (char *)mm_malloc(sizeof(char) * (output_width + 2 + strlen(stored_traces[i_stored]))); } else { tosort[i_tosort].out_string = (char *)mm_malloc(sizeof(char) * (output_width + 2)); } if (gff_file != NULL) { tosort[i_tosort].gff_string = (char *)mm_malloc(sizeof(char) * (strlen(stored_match_gffs[i_stored]) + strlen(stored_motif_gffs[i_stored]) + 50)); // Space for the evalue, etc. } // Format the ID, scores and description. evalue = got_evd ? score : NaN(); sprintf( tosort[i_tosort].out_string, "%-*.*s %*.2e %*.*s\n", longest_ID, longest_ID, stored_IDs[i_stored], PV_WIDTH, evalue, output_width - (longest_ID + PV_WIDTH + 2), output_width - (longest_ID + PV_WIDTH + 2), stored_seqs[i_stored] ); // Add the trace, if requested. if (print_fancy) { strcat(tosort[i_tosort].out_string, stored_traces[i_stored]); } // Add the GFF entry, if requested. if (gff_file != NULL) { sprintf( tosort[i_tosort].gff_string, "%s%g\t.\t.\n%s", stored_match_gffs[i_stored], fabs(score), stored_motif_gffs[i_stored] ); } // Store the associated key. tosort[i_tosort].key = score; i_tosort++; } num_tosort = i_tosort; // Sort the sequences. if (sort_output) { qsort((void *)tosort, num_tosort, sizeof(TOSORT_T), tosort_compare); } // Decide how many sequences to print. num_toprint = got_evd ? num_under : num_stored; // If the maxseqs limit has been set, apply it if (maxseqs > NO_MAX_SEQS) { num_toprint = MIN(num_toprint, maxseqs); } // Print the header. if (print_header) { fprintf(outfile, "%-*s ", longest_ID, "ID"); fprintf(outfile, "%*.*s ", PV_WIDTH, PV_WIDTH, "E-value"); fprintf(outfile, "%*s ", SCORE_WIDTH, "Score"); if (motif_scoring) { fprintf(outfile, "%*s ", LENGTH_WIDTH, "Hits"); fprintf(outfile, "%*s ", LENGTH_WIDTH, "Span"); } fprintf(outfile, "%*s ", LENGTH_WIDTH, "Start"); fprintf(outfile, "%*s ", LENGTH_WIDTH, "End"); if (!mhmmscan) { fprintf(outfile, "%*s ", LENGTH_WIDTH, "Length"); } fprintf(outfile, "Description\n"); fprintf(outfile, "%*.*s\n", output_width, output_width, SEPARATOR); } // Print the sequences (only the first line). for (i_toprint = 0; i_toprint < num_toprint; i_toprint++) { char* this_entry = tosort[i_toprint].out_string; int end_of_line = find_eol(this_entry); this_entry[end_of_line] = '\0'; fprintf(outfile, "%s\n", this_entry); this_entry[end_of_line] = '\n'; } // Print the traces (if requested). if (print_fancy) { fprintf(outfile, "%*.*s\n\n", output_width, output_width, SEPARATOR); for (i_toprint = 0; i_toprint < num_toprint; i_toprint++) { fprintf(outfile, "%s", tosort[i_toprint].out_string); } } if (print_header) { fprintf(outfile, "%*.*s\n\n", output_width, output_width, SEPARATOR); } // Print the GFFs. if (gff_file != NULL) { for (i_toprint = 0; i_toprint < num_toprint; i_toprint++) { fprintf(gff_file, "%s", tosort[i_toprint].gff_string); } } // Free up memory for the sorted sequences. for (i_tosort = 0; i_tosort < num_tosort; i_tosort++) { myfree(tosort[i_tosort].out_string); if (gff_file != NULL) { myfree(tosort[i_tosort].gff_string); } } myfree(tosort); // Free up memory for the stored sequences. for (i_stored = 0; i_stored < num_stored; i_stored++) { myfree(stored_seqs[i_stored]); myfree(stored_IDs[i_stored]); if (print_fancy) { myfree(stored_traces[i_stored]); } if (gff_file != NULL) { myfree(stored_match_gffs[i_stored]); myfree(stored_motif_gffs[i_stored]); } } myfree(stored_seqs); myfree(stored_IDs); myfree(stored_traces); myfree(stored_match_gffs); myfree(stored_motif_gffs); myfree(stored_keys); myfree(stored_scores); myfree(stored_lengths); myfree(stored_gcs); } /*********************************************** * Calculate the score distribution. ***********************************************/ EVD_SET calc_distr( SCORE_SET score_set, // Set of scores and lengths. BOOLEAN_T exponential, // Use exponential distribution? BOOLEAN_T match_evalues // Use match E-values? ) { double H = 0; // don't use H int maxiter1 = 10; // max. iterations in L-H loop int maxiter2 = 20; // max. iterations in N-R loop int size = 1000; // size of length groups double lspan = 1.5; // min. length ratio/group // // If using match E-values instead of score E-values, set all // the sequence lengths to total_length/# matches. // if (!exponential && match_evalues) { int i; int length = score_set.n ? score_set.total_length/score_set.n : score_set.total_length; for (i=0; in <= 0) return; // no EVD available // Get min(E-value) and sum of log(E). evd_set->outliers = 0; // Number with E < 1. evd_set->min_e = BIG; // Smallest E-value. evd_set->sum_log_e = 0; // Sum of log(E < 1). for (i=0; iexponential ? stored_gcs[i] : (t ? t : stored_lengths[i]); evalue = stored_keys[i] = Evd_set_evalue(n, stored_keys[i], t_or_gc, q, *evd_set); if (evalue < evd_set->min_e) evd_set->min_e = evalue; if (evalue < 1) { evd_set->outliers++; evd_set->sum_log_e += log(evalue); } } } // calc_evalues #ifdef MAIN #include "simple-getopt.h" VERBOSE_T verbosity = NORMAL_VERBOSE; /************************************************************************** * int main **************************************************************************/ int main (int argc, char *argv[]) { // Command line parameters. char* hmm_filename; // File containing the HMM. FILE* hmm_file; char* seq_filename; // File containing the sequences. FILE* seq_file; char* viterbi_string; // Command line buffer. BOOLEAN_T viterbi; // Use Viterbi scoring. BOOLEAN_T local_scoring; // Use local scoring. BOOLEAN_T motif_scoring; // Perform motif-scoring. BOOLEAN_T use_pvalues; // Convert motif scores to p-values. PROB_T p_threshold; // P-value threshold for motif hits. int maxseqs; // Maximum number of sequences to print BOOLEAN_T both_strands; // Score both DNA strands. PROB_T e_threshold; // E-value threshold for scores. char* bg_filename; // File containing background frequencies. int pam_distance; // PAM distance char* sc_filename; // File containing substitution scores. double beta; // Weight on pseudocounts. BOOLEAN_T allow_weak_motifs; // Allow motifs with min p-value > p-thresh? BOOLEAN_T zero_spacer_emit_lo; // Set spacer emission log-odds = 0? double gap_open; // Cost to open a gap; ignore if < 0. double gap_extend; // Cost to extend a gap; ignore if < 0. int output_width; // Width of output, in chars. BOOLEAN_T print_fancy; // Print Viterbi alignments? BOOLEAN_T sort_output; // Sort output scores? int progress_every; // Show progress after every n iterations. BOOLEAN_T print_header; // Print header information? BOOLEAN_T print_params; // Print program parameters? BOOLEAN_T print_time; // Print timing info? // Data structures. MHMM_T* the_hmm; // The HMM itself. MHMM_T* the_log_hmm; // The HMM, with probs converted to logs. SEQ_T* sequence; // Sequence to search against. MATRIX_T* motif_score_matrix; // Number of motifs x sequence length. MATRIX_T* dp_matrix = NULL; // Dynamic programming matrix. MATRIX_T* trace_matrix = NULL; // Traceback for Viterbi. MATCH_T* this_match = allocate_match(); // This sequence-to-model match. int dp_rows = 0; // Size of the DP matrix. int dp_cols = 0; EVD_SET evd_set; // EVD data structure BOOLEAN_T got_evd = FALSE; // no EVD found yet SCORE_SET *score_set = NULL;// Set of scores/lengths for computing EVD. // Local variables. ALPH_T alph; double start_time; // Time at start of sequence processing. double end_time; // Time at end of sequence processing. int num_seqs; // Number of sequences processed. // Record CPU time. myclock(); /*********************************************** * Parse the command line. ***********************************************/ // Set defaults. hmm_filename = NULL; seq_filename = NULL; viterbi_string = NULL; viterbi = TRUE; local_scoring = TRUE; motif_scoring = FALSE; use_pvalues = FALSE; p_threshold = -1; // Don't do p-value scoring. maxseqs = NO_MAX_SEQS; both_strands = FALSE; // Score given DNA strand only. e_threshold = DEFAULT_E_THRESHOLD; bg_filename = NULL; pam_distance = -1; // Use default PAM distance. sc_filename = NULL; beta = -1; // Illegal beta, use defaults. allow_weak_motifs = FALSE; // Don't allow weak motifs. zero_spacer_emit_lo = FALSE; gap_open = -1; // No gap open penalty. gap_extend = -1; // No gap extension penalty. output_width = DEFAULT_OUTPUT_WIDTH; print_fancy = FALSE; sort_output = TRUE; progress_every = DEFAULT_PROGRESS_EVERY; print_header = TRUE; print_params = TRUE; print_time = TRUE; { // Define the usage message. char usage[1000] = ""; // Define command line options. cmdoption const options[] = { {"paths", REQUIRED_VALUE}, {"global", NO_VALUE}, {"maxseqs", REQUIRED_VALUE}, {"pthresh", REQUIRED_VALUE}, {"both-strands", NO_VALUE}, {"ethresh", REQUIRED_VALUE}, {"fancy", NO_VALUE}, {"width", REQUIRED_VALUE}, {"nosort", NO_VALUE}, {"bg-file", REQUIRED_VALUE}, {"allow-weak-motifs", NO_VALUE}, {"progress", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE}, {"noheader", NO_VALUE}, {"noparams", NO_VALUE}, {"notime", NO_VALUE}, {"quiet", NO_VALUE}, {"zselo", NO_VALUE}, {"gap-open", REQUIRED_VALUE}, {"gap-extend", REQUIRED_VALUE}, {"motif-scoring", NO_VALUE}, {"pseudo-weight", REQUIRED_VALUE}, {"pam", REQUIRED_VALUE}, {"score-file", REQUIRED_VALUE} }; int option_count = 24; int option_index = 0; strcat(usage, "USAGE: mhmms [options] \n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --paths [single|all] (default=single)\n"); strcat(usage, " --global (default=local)\n"); strcat(usage, " --maxseqs \n"); strcat(usage, " --pthresh \n"); strcat(usage, " --both-strands\n"); strcat(usage, " --ethresh \n"); strcat(usage, " --fancy\n"); strcat(usage, " --width (default=79)\n"); strcat(usage, " --nosort\n"); strcat(usage, " --bg-file \n"); strcat(usage, " --allow-weak-motifs\n"); strcat(usage, " --progress \n"); strcat(usage, " --verbosity 1|2|3|4|5 (default=2)\n"); strcat(usage, " --noheader\n"); strcat(usage, " --noparams\n"); strcat(usage, " --notime\n"); strcat(usage, " --quiet\n"); strcat(usage, "\n"); strcat(usage, " Advanced options:\n"); strcat(usage, " --zselo\n"); strcat(usage, " --gap-open \n"); strcat(usage, " --gap-extend \n"); strcat(usage, " --motif-scoring\n"); strcat(usage, " --pseudo-weight (default=10)\n"); strcat(usage, " --pam (default=250 [protein] 1 [DNA])\n"); strcat(usage, " --score-file \n"); strcat(usage, "\n"); simple_setopt(argc, argv, option_count, options); // Parse the command line. while (1) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char* message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { simple_getopterror(&message); die("Error process command line options (%s)\n", message); } if (strcmp(option_name, "paths") == 0) { viterbi_string = option_value; } else if (strcmp(option_name, "global") == 0) { local_scoring = FALSE; } else if (strcmp(option_name, "maxseqs") == 0) { maxseqs = atoi(option_value); } else if (strcmp(option_name, "pthresh") == 0) { p_threshold = atof(option_value); } else if (strcmp(option_name, "both-strands") == 0) { both_strands = TRUE; } else if (strcmp(option_name, "ethresh") == 0) { e_threshold = atof(option_value); } else if (strcmp(option_name, "fancy") == 0) { print_fancy = TRUE; } else if (strcmp(option_name, "width") == 0) { output_width = atoi(option_value); } else if (strcmp(option_name, "nosort") == 0) { sort_output = FALSE; } else if (strcmp(option_name, "bg-file") == 0) { bg_filename = option_value; } else if (strcmp(option_name, "allow-weak-motifs") == 0) { allow_weak_motifs = TRUE; } else if (strcmp(option_name, "progress") == 0) { progress_every = atoi(option_value); } else if (strcmp(option_name, "verbosity") == 0) { verbosity = (VERBOSE_T)atoi(option_value); } else if (strcmp(option_name, "noheader") == 0) { print_header = FALSE; } else if (strcmp(option_name, "noparams") == 0) { print_params = FALSE; } else if (strcmp(option_name, "notime") == 0) { print_time = FALSE; } else if (strcmp(option_name, "quiet") == 0) { print_header = print_params = print_time = FALSE; verbosity = QUIET_VERBOSE; } else if (strcmp(option_name, "zselo") == 0) { zero_spacer_emit_lo = TRUE; } else if (strcmp(option_name, "gap-open") == 0) { gap_open = atof(option_value); } else if (strcmp(option_name, "gap-extend") == 0) { gap_extend = atof(option_value); } else if (strcmp(option_name, "motif-scoring") == 0) { motif_scoring = TRUE; } else if (strcmp(option_name, "pseudo-weight") == 0) { beta = atof(option_value); } else if (strcmp(option_name, "pam") == 0) { pam_distance = atof(option_value); } else if (strcmp(option_name, "score-file") == 0) { sc_filename = option_value; } } // Read the two required arguments. if (option_index + 2 != argc) { fprintf(stderr, "%s", usage); exit(1); } hmm_filename = argv[option_index]; seq_filename = argv[option_index+1]; } // Make sure we got the required files. if (hmm_filename == NULL) die("No HMM file given.\n"); if (seq_filename == NULL) die("No sequence file given.\n"); // Figure out what kind of scoring to do. if ((viterbi_string == NULL) || (strcmp(viterbi_string, "single") == 0)) { viterbi = TRUE; } else if (strcmp(viterbi_string, "all") == 0) { viterbi = FALSE; } else { die("Illegal option (\"-paths %s\").", viterbi_string); } // Force p-value scoring if p-value threshold given. if (p_threshold != -1) use_pvalues = TRUE; // Check p-threshold is in range [0 1)) die("You must specify p-thresh in the range [0alph; if (!(alph == DNA_ALPH || alph == PROTEIN_ALPH)) die("%s alphabet not allowed.\n", alph_name(alph)); // Set the PAM distance. if (pam_distance == -1) { pam_distance = (alph == DNA_ALPH ? DEFAULT_DNA_PAM : DEFAULT_PROTEIN_PAM); } if (beta < 0) beta = (alph == PROTEIN_ALPH) ? 10 : 1; // Read the background distribution. free_array(the_hmm->background); the_hmm->background = get_background(alph, bg_filename); // Convert the model to log space. convert_to_from_log_hmm(TRUE, // Convert to logs. zero_spacer_emit_lo, gap_open, gap_extend, the_hmm->background, sc_filename, pam_distance, beta, the_hmm, &the_log_hmm); // Set up PSSM matrices if doing motif_scoring // and pre-compute motif p-values if using p-values. // Set up the hot_states list. set_up_pssms_and_pvalues(motif_scoring, p_threshold, both_strands, allow_weak_motifs, the_log_hmm); // // Set up for computing score distribution. // score_set = set_up_score_set( p_threshold, -1, // dp_threshold, -1, // max_gap FALSE, // negatives_only the_log_hmm); /*********************************************** * Search the database one sequence at a time. ***********************************************/ // Open the file for reading. if (open_file(seq_filename, "r", TRUE, "sequence", "sequences", &seq_file) == 0) exit(1); start_time = myclock(); num_seqs = 0; while (read_one_fasta(alph, seq_file, MAX_SEQ, &sequence)) { num_seqs++; // Keep track of total database size for E-value calculation. score_set->total_length += get_seq_length(sequence); // Let the user know what's going on. if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Scoring %s (length=%d).\n", get_seq_name(sequence), get_seq_length(sequence)); } // Convert the sequence to alphabet-specific indices. prepare_sequence(sequence, alph); assert(get_seq_char(get_seq_length(sequence), sequence) == '\0'); /* Allocate the dynamic programming matrix. Rows correspond to states in the model, columns to positions in the sequence. */ if ((dp_rows < the_log_hmm->num_states) || (dp_cols < get_seq_length(sequence))) { free_matrix(dp_matrix); free_matrix(trace_matrix); if (dp_rows < the_log_hmm->num_states) dp_rows = the_log_hmm->num_states; if (dp_cols < get_seq_length(sequence)) dp_cols = get_seq_length(sequence); // (Add one column for repeated match algorithm.) dp_matrix = allocate_matrix(dp_rows, dp_cols + 1); trace_matrix = allocate_matrix(dp_rows, dp_cols + 1); } // Compute the motif scoring matrix. if (motif_scoring) { motif_score_matrix = allocate_matrix(the_log_hmm->num_motifs, get_seq_length(sequence)); compute_motif_score_matrix(use_pvalues, p_threshold, get_int_sequence(sequence), get_seq_length(sequence), the_log_hmm, &motif_score_matrix); } else { //FIXME: remove motif_score_matrix = NULL; } // Perform the appropriate type of dynamic programming. if (viterbi) { viterbi_algorithm( local_scoring, get_int_sequence(sequence), get_seq_length(sequence), the_log_hmm, TRUE, // Save Viterbi path? motif_score_matrix, dp_matrix, trace_matrix, this_match ); } else { forward_algorithm( local_scoring, get_int_sequence(sequence), get_seq_length(sequence), the_log_hmm, //motif_score_matrix, // FIXME dp_matrix, this_match ); } // Store the score, ID, length and comment for later printing. //assert(get_match_seq_length(this_match) == get_seq_length(sequence)); //assert(get_match_seq_length(this_match) == get_seq_length(sequence)); store_sequence(alph, motif_scoring, FALSE, // Don't print in mhmmscan format. output_width, sequence, this_match, e_threshold, 0, // no dp_threshold p_threshold, score_set, got_evd, evd_set, print_fancy, the_log_hmm, motif_score_matrix, FALSE, // Store GFF? NULL ); /* Calculate the initial E-value distribution if the required * number of sequences has been saved. This will allow the * descriptions of low-scoring sequences to not be stored. The * distribution will be recomputed using all saved scores when all * sequences have been read. */ if (score_set->n == EVD_NUM_SEQS && got_evd == FALSE) { evd_set = calc_distr( *score_set, // Set of scores. use_pvalues, // Use exponential distribution? FALSE // Use sequence E-values. ); if (evd_set.n > 0) got_evd = TRUE; } // Free the memory used by this sequence. free_matrix(motif_score_matrix); free_seq(sequence); if ((verbosity >= NORMAL_VERBOSE) && (num_seqs % progress_every == 0)) { fprintf(stderr, "\rSequences: %d", num_seqs); } } end_time = myclock(); /*********************************************** * Calculate the E-values and store them as the keys. ***********************************************/ // Recalculate the EVD using all scores. // If successful, calculate the E-values and // store them as keys. evd_set = calc_distr( *score_set, // Set of scores. use_pvalues, // Use exponential distribution? FALSE // Use sequence E-values. ); if (evd_set.n > 0) { int q, t, N; q = 1; // Ignore query "length". t = 0; // Use stored target lengths. N = evd_set.non_outliers; // Use number of non-outliers. evd_set.N = N; calc_evalues(&evd_set, N, q, t); got_evd = TRUE; } evd_set.negatives_only = FALSE; // Used real sequences. /*********************************************** * Print header information. ***********************************************/ if (print_header) { write_header( "MIAO", "Database search results", the_hmm->description, the_hmm->motif_file, hmm_filename, seq_filename, stdout ); } /*********************************************** * Sort and print the results. ***********************************************/ if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "\nSorting the E-values.\n"); } sort_and_print_scores( print_fancy, print_header, got_evd, motif_scoring, FALSE, // Don't print in mhmmscan format. maxseqs, // Maximum number of sequences to print output_width, e_threshold, sort_output, NULL, // FIXME: GFF output? stdout ); /*********************************************** * Print the program parameters. ***********************************************/ if (print_params) { print_parameters( argv, argc, "mhmms", alph, hmm_filename, seq_filename, viterbi, NO_REPEAT, // Repeated match threshold. motif_scoring, use_pvalues, p_threshold, e_threshold, both_strands, bg_filename, sc_filename, pam_distance, beta, zero_spacer_emit_lo, -1, // No max_gap. 0, // No egcost. gap_open, gap_extend, print_fancy, print_time, start_time, end_time, num_seqs, evd_set, *score_set, stdout ); } /* Tie up loose ends. */ // myfree(evd_set.evds); free_mhmm(the_hmm); free_mhmm(the_log_hmm); free_matrix(dp_matrix); free_matrix(trace_matrix); free_match(this_match); fclose(seq_file); return(0); } #endif /* MAIN */ /* * Local Variables: * mode: c++ * c-basic-offset: 2 * End: */