/******************************************************************** * FILE: centrimo.c * AUTHOR: Timothy Bailey, Philip Machanick * CREATE DATE: 06/06/2011 * PROJECT: MEME suite * COPYRIGHT: 2011, UQ ********************************************************************/ #define DEFINE_GLOBALS #include #include #include #include #include #include // for basename #include "matrix.h" #include "alphabet.h" #include "dir.h" #include "fasta-io.h" #include "hash_alph.h" #include "io.h" #include "motif-in.h" #include "projrel.h" #include "pssm.h" #include "seq-reader-from-fasta.h" #include "simple-getopt.h" #include "string-list.h" #include "utils.h" #include "background.h" #include "seq.h" #include #include #define BLOCK_SIZE 1000000 char* program_name = "centrimo"; VERBOSE_T verbosity = NORMAL_VERBOSE; // Structure for tracking centrimo command line parameters. typedef struct options { BOOLEAN_T allow_clobber; // Allow overwritting of files in output directory. BOOLEAN_T scan_both_strands; // Scan forward and reverse strands BOOLEAN_T invoke_centrimo_plots; char* bg_filename; // Name of file file containg background freq. char* command_line; // Full command line char* meme_filename; // Name of file containg motifs. char* output_dirname; // Name of the output directory char* seq_filename; // Name of file containg sequences. double score_thresh; // Minimum value of site score (bits). int max_plots; // Maximum number of plots on combined plot int seed; // Seed for random numbers. double alpha; // Non-motif specific scale factor. double pseudocount; // Pseudocount added to Motif PSFM. ALPH_T alphabet; // Alphabet specified by MEME file. STRING_LIST_T* selected_motifs; // Indices of requested motifs. char *html_path; // Path to CENTRIMO HTML output file char *text_path; // Path to CENTRIMO plain-text output file const char* HTML_FILENAME; // Name of HTML output file. const char* TEXT_FILENAME; // Name of plain-text output file. char* commandline; // command line args FILE * html_file; FILE * text_file; const char* usage; // Usage statment } CENTRIMO_OPTIONS_T; // Structure for generating stats in --scorestats mode. // store scores and sites together for more efficient memory access. typedef struct scores_sites { double count; // count of sites at current position double sites; // number of scores at current position } CENTRIMO_SCORES_SITES_T; typedef struct stats { long score_chunk; long positions_allocated; // the number of positions in the sequences we have allocated space for CENTRIMO_SCORES_SITES_T *stats_data; long site_max; long motif_max_site; } CENTRIMO_STATS_T; // useful for initialising const CENTRIMO_SCORES_SITES_T ZEROS = {0.0, 0.0}; /*********************************************************************** Print plain text record for motif site to standard output. ***********************************************************************/ static void print_site_as_text( const char *motif_id, const char *seq_name, const char *raw_seq, int start, int stop, char strand, double score, double pvalue ); /*********************************************************************** Free memory allocated in options processing ***********************************************************************/ static void cleanup_options(CENTRIMO_OPTIONS_T *options) { if (fclose (options->text_file) != 0) fprintf(stderr,"failed to close text output file\n"); if (options->html_file) { fprintf(options->html_file, "
\n\n"); if (fclose (options->html_file) != 0) fprintf(stderr,"failed to close html output file\n"); } } /*********************************************************************** Process command line options ***********************************************************************/ static void process_command_line( int argc, char* argv[], CENTRIMO_OPTIONS_T *options ) { // Define command line options. const int num_options = 10; cmdoption const centrimo_options[] = { {"bgfile", REQUIRED_VALUE}, {"o", REQUIRED_VALUE}, {"oc", REQUIRED_VALUE}, {"score", REQUIRED_VALUE}, {"maxplots", REQUIRED_VALUE}, {"motif-pseudo", REQUIRED_VALUE}, {"text", NO_VALUE}, {"seed", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE}, {"noinvoke", NO_VALUE}, // won't be able to run centrimo-plots before install }; BOOLEAN_T set_output_dir = FALSE; // Define the usage message. options->usage = "USAGE: centrimo [options] \n" "\n" " Options:\n" " --o output directory; default: 'centrimo_out'\n" " --oc allow overwriting; default: 'centrimo_out'\n" " --bgfile background frequency model for PWMs;\n" " default: base frequencies in input sequences\n" " Note: Use \'--bgfile motif-file\' to use the\n" " background frequencies given in the motif file.\n" " --motif-pseudo pseudo-count to use creating PWMs;\n" " default: %.3g\n" " --score score threshold for PWMs, in bits; default=%.1g\n" " sequences without a site with score >= \n" " are ignored\n" " --maxplots output site-probability curves for the motifs\n" " with the lowest central enrichment p-values\n" " on a single plot; default: %d\n" " Note: After running 'centrimo', you can use\n" " 'centrimo-plots' to remake plots and/or to make\n" " make separate plots for every motif.\n" " Type 'centrimo-plots -h' for more information.\n" " --seed seed for random number generator; default: %d\n" " --verbosity [1|2|3|4] verbosity of output: 1 (quiet) - 4 (dump);\n" " default: %d\n" "\n"; int option_index = 0; /* Make sure various options are set to NULL or defaults. */ options->allow_clobber = TRUE; // options->text_only = FALSE; options->scan_both_strands = TRUE; options->invoke_centrimo_plots = TRUE; options->bg_filename = NULL; options->command_line = NULL; options->meme_filename = NULL; options->output_dirname = "centrimo_out"; options->seq_filename = NULL; options->score_thresh = 5; options->max_plots = 5; options->pseudocount = 0.1; options->alpha = 1.0; options->selected_motifs = new_string_list(); options->seed = 0; verbosity = 2; simple_setopt(argc, argv, num_options, centrimo_options); // Parse the command line. while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "bgfile") == 0){ options->bg_filename = option_value; } else if (strcmp(option_name, "motif") == 0){ if (options->selected_motifs == NULL) { options->selected_motifs = new_string_list(); } add_string(option_value, options->selected_motifs); } else if (strcmp(option_name, "motif-pseudo") == 0){ options->pseudocount = atof(option_value); } else if (strcmp(option_name, "norc") == 0){ options->scan_both_strands = FALSE; } else if (strcmp(option_name, "o") == 0){ // Set output directory with no clobber options->output_dirname = option_value; options->allow_clobber = FALSE; set_output_dir = TRUE; } else if (strcmp(option_name, "oc") == 0){ // Set output directory with clobber options->output_dirname = option_value; options->allow_clobber = TRUE; set_output_dir = TRUE; } //else if (strcmp(option_name, "text") == 0){ // options->text_only = TRUE; //} else if (strcmp(option_name, "score") == 0){ options->score_thresh = atof(option_value); } else if (strcmp(option_name, "maxplots") == 0){ options->max_plots = atoi(option_value); } else if (strcmp(option_name, "seed") == 0){ options->seed = atoi(option_value); } else if (strcmp(option_name, "verbosity") == 0){ verbosity = atoi(option_value); } else if (strcmp(option_name, "noinvoke") == 0) { options->invoke_centrimo_plots = FALSE; } } // Must have sequence and motif file names if (argc != option_index + 2) { fprintf(stderr, options->usage, options->pseudocount, options->score_thresh, options->max_plots, options->seed, verbosity); exit(EXIT_FAILURE); } // Record the command line options->command_line = get_command_line(argc, argv); // Record the input file names options->meme_filename = argv[option_index]; option_index++; options->seq_filename = argv[option_index]; option_index++; // open the output files only after the output directory's created options->text_file = NULL; options->html_file = NULL; options->html_path = options->text_path = NULL; //if (options->text_only) { // if (set_output_dir) { // fprintf(stderr, "Warning: set output directory `%s' in text only mode; ignored.\n", // options->output_dirname); // } // options->html_path = options->text_path = NULL; //} else { //options->HTML_FILENAME = "centrimo.html"; options->TEXT_FILENAME = "site_counts.txt"; // Set up path values for needed stylesheets and output files. //options->html_path = make_path_to_file(options->output_dirname, options->HTML_FILENAME); options->text_path = make_path_to_file(options->output_dirname, options->TEXT_FILENAME); //} // from ame.c: { // make enough space for all the command line args, with one space between each int line_length = 0; int i; for (i = 0; i < argc; i++) line_length += strlen(i == 0 ? basename(argv[0]) : argv[i]); // add on argc to allow one char per word for separating space + terminal '\0' options->commandline = (char*) malloc(sizeof(char)*(line_length+argc)); int nextpos = 0; for (i = 0; i < argc; i++) { // been here before? put in a space before adding the next word if (nextpos) { options->commandline[nextpos] = ' '; nextpos ++; } char * nextword = i == 0 ? basename(argv[0]) : argv[i]; strcpy(&options->commandline[nextpos], nextword); nextpos += strlen (nextword); } } } /************************************************************************* * Calculate the log odds score for a single motif-sized window. *************************************************************************/ static inline BOOLEAN_T score_motif_site( CENTRIMO_OPTIONS_T *options, char *seq, PSSM_T *pssm, double *pvalue, // OUT double *score // OUT ) { ALPH_T alph = options->alphabet; int asize = alph_size(alph, ALPH_SIZE); ARRAY_T* pv_lookup = pssm->pv; MATRIX_T* pssm_matrix = pssm->matrix; BOOLEAN_T scorable_site = TRUE; double scaled_log_odds = 0.0; // For each position in the site int motif_position; for (motif_position = 0; motif_position < pssm->w; motif_position++) { char c = seq[motif_position]; int aindex = alph_index(alph, c); // Check for gaps and ambiguity codes at this site if(aindex == -1 || aindex >= asize) { scorable_site = FALSE; break; } scaled_log_odds += get_matrix_cell(motif_position, aindex, pssm_matrix); } if (scorable_site == TRUE) { int w = pssm->w; *score = get_unscaled_pssm_score(scaled_log_odds, pssm); // Handle scores that are out of range if ((int) scaled_log_odds >= get_array_length(pv_lookup)) { scaled_log_odds = (float)(get_array_length(pv_lookup) - 1); *score = scaled_to_raw(scaled_log_odds, w, pssm->scale, pssm->offset); } *pvalue = get_array_item((int) scaled_log_odds, pv_lookup); } return scorable_site; } /************************************************************************* * Calculate and record the log-odds score and p-value for each * possible motif site in the sequence. * * Returns 1 if found a site >= options->score_thresh, 0 otherwise. *************************************************************************/ static long centrimo_score_sequence( CENTRIMO_OPTIONS_T *options, SEQ_T* sequence, char strand, MOTIF_T* motif, MOTIF_T* rev_motif, ARRAY_T* bg_freqs, PSSM_T* pssm, PSSM_T* rev_pssm, CENTRIMO_STATS_T* output_stats ) { assert(motif != NULL); assert(bg_freqs != NULL); assert(pssm != NULL); long num_positions = 0L; // Score and record each possible motif site in the sequence char *raw_seq = get_raw_sequence(sequence); long L = strlen(raw_seq); int r = pssm->w; int c = pssm->alphsize; // This stuff is for compute the max-scoring site double best_llr = -BIG;// best LLR so far double *llr_scores = NULL;// scores int *llr_starts = NULL;// starting positions int n_llr_scores = 0; // Read and score each position in the sequence. int j; for (j = 0; j < L - r + 1; j++) { double pvalue = 0.0; double score = 0.0; double *prior = NULL; // Track number of positions in sequence ++num_positions; char *raw_seq_seg = &raw_seq[j]; // Score and record forward strand BOOLEAN_T scoreable_site = score_motif_site(options, raw_seq_seg, pssm, &pvalue, &score); double score_max = -BIG; BOOLEAN_T valid_score = scoreable_site; valid_score = scoreable_site; if (valid_score) score_max = score; // Score and record reverse strand if appropriate. if (rev_pssm != NULL) { scoreable_site = score_motif_site(options, raw_seq_seg, rev_pssm, &pvalue, &score); if (scoreable_site == TRUE) { if (! valid_score) { valid_score = TRUE; score_max = score; } else if (score > score_max) score_max = score; } } if (valid_score) { // allocate memory for score sums if (output_stats->positions_allocated <= j) { // make sure we have enough memory for new data long newstart = output_stats->positions_allocated; long i; output_stats->positions_allocated += output_stats->score_chunk; mm_resize(output_stats->stats_data, output_stats->positions_allocated, CENTRIMO_SCORES_SITES_T); // zero out sums for (i = newstart; i < output_stats->positions_allocated; i++) { output_stats->stats_data[i] = ZEROS; } } // Save the best score so far and all scores if (score_max > best_llr) best_llr = score_max; if (n_llr_scores % BLOCK_SIZE == 0) { mm_resize(llr_scores, n_llr_scores+BLOCK_SIZE, double); mm_resize(llr_starts, n_llr_scores+BLOCK_SIZE, int); } llr_scores[n_llr_scores] = score_max; llr_starts[n_llr_scores++] = j; output_stats->stats_data[j].sites++;// sites numbered from 1 if (j > output_stats->site_max) output_stats->site_max = j; // need following check because reverse comp could reset the maximum if (output_stats->motif_max_sitemotif_max_site = j; } } // Record the position of best site, randomly choosing to break ties if (best_llr >= options->score_thresh && llr_scores != NULL) { int *ties = NULL; mm_resize(ties, n_llr_scores, int); int n_ties = 0; // get all the ties int i; for (i=0; istats_data[pos].count += 1;// increment count at best position myfree(ties); } myfree(llr_scores); myfree(llr_starts); // Count reminaing positions in the sequence. num_positions += r - 1; if (num_positions != L) fprintf (stderr, "counted positions %ld != %ld seq length\n", num_positions, L); return(best_llr >= options->score_thresh ? 1 : 0); } /************************************************************************* * Read all the sequences into an array of SEQ_T *************************************************************************/ static void read_sequences (ALPH_T alph, char * seqfilename, SEQ_T*** sequences, int * seqN) { const int max_sequence = 32768; // unlikely to be this big FILE * SEQFILE = fopen(seqfilename, "r"); if (!SEQFILE) die ("failed to open sequence file `%s'", seqfilename); read_many_fastas (alph, SEQFILE, max_sequence, seqN, sequences); if (fclose(SEQFILE) != 0) die ("failed to close sequence file"); } /************************************************************************* * return NULL if a string is empty, otherwise the string: * to simplify testing for empty strings down to one case *************************************************************************/ char * null_string (char * str) { if (str != NULL) if (str[0] == '\0') return NULL; return str; } static const char HTMLHEADER[] = "\n" "\n" "\n" "CentriMo results\n" "\n" "\n" " CentriMo (CENTRalIty analysis of MOtifs): Compiled on " __DATE__ " at " __TIME__ "
" "Copyright © Philip Machanick p.machanick@ru.ac.za & Timothy Bailey t.bailey@imb.uq.edu.au, 2011.\n" "\n" "

If CentriMo is of use to you in your research, please cite:" "

Philip Machanick and Timothy L. Bailey, "CentriMo: Central enrichment of" "Transcription Factor Binding Site Motifs", in preparation.


Command line
\n"; /************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; CENTRIMO_STATS_T output_stats = {512, 0L, NULL, 0L, 0L}; SEQ_T** sequences = NULL; int seqN; // number of sequences /********************************************** * COMMAND LINE PROCESSING **********************************************/ process_command_line(argc, argv, &options); /********************************************** * Read the motifs. **********************************************/ ARRAY_T* bg_freqs; ARRAYLST_T *motifs; MREAD_T *mread; ALPH_T alph; // use DNA alphabet alph = DNA_ALPH; read_sequences (alph, options.seq_filename, &sequences, &seqN); int i; if (options.bg_filename) { bg_freqs = get_file_frequencies(&alph, options.bg_filename, NULL); } else { bg_freqs = calc_bg_from_file(alph, options.seq_filename, FALSE); } mread = mread_create(options.meme_filename, OPEN_MFILE); mread_set_background(mread, bg_freqs); mread_set_pseudocount(mread, options.pseudocount); motifs = mread_load(mread, NULL); if (mread_get_alphabet(mread) != alph) { die("Expected %s alphabet motifs\n", alph_name(alph)); } mread_destroy(mread); options.alphabet = alph; // Doesn't include rev comp motifs int num_motif_names = arraylst_size(motifs); PRIOR_DIST_T *prior_dist = NULL; if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. average_freq_with_complement(alph, bg_freqs); normalize_subarray(0, alph_size(alph, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(alph, FALSE, bg_freqs); // Make reverse complement motifs. add_reverse_complements(motifs); } // Create output directory unless in text only mode //if (options.text_only) { // options.text_file = stdout; //} else { if (create_output_directory( options.output_dirname, options.allow_clobber, FALSE /* Don't print warning messages */ ) ) { // Failed to create output directory. die("Couldn't create output directory %s.\n", options.output_dirname); } // FIXME: What should really be in the HTML file? Not the plot data. //options.html_file = fopen(options.html_path, "w"); //fprintf(options.html_file, "%s\n%s

", HTMLHEADER, options.commandline); options.text_file = fopen(options.text_path, "w"); //} /************************************************************** * Score each of the sites for each of the selected motifs. **************************************************************/ int motif_index = 0; long max_sequence_length = 0; // Count of all motifs including rev. comp. motifs int num_motifs = arraylst_size(motifs); for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T* motif = (MOTIF_T *) arraylst_get(motif_index, motifs); char* motif_id = get_motif_st_id(motif); char* bare_motif_id = get_motif_id(motif); if ((get_num_strings(options.selected_motifs) == 0) || (have_string(bare_motif_id, options.selected_motifs) == TRUE)) { if (verbosity >= NORMAL_VERBOSE) { fprintf( stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif) ); } // Build PSSM for motif and tables for p-value calculation. // FIXME: the non-averaged freqs should be used for p-values PSSM_T* pos_pssm = build_motif_pssm( motif, bg_freqs, bg_freqs, prior_dist, options.alpha, PSSM_RANGE, 0, // no GC bins FALSE // make log-likelihood pssm ); // If required, do the same for the reverse complement motif. MOTIF_T* rev_motif = NULL; PSSM_T* rev_pssm = NULL; if (options.scan_both_strands) { ++motif_index; rev_motif = (MOTIF_T *) arraylst_get(motif_index, motifs); motif_id = get_motif_st_id(rev_motif); // FIXME: the non-averaged freqs should be used for p-values rev_pssm = build_motif_pssm( rev_motif, bg_freqs, bg_freqs, prior_dist, options.alpha, PSSM_RANGE, 0, // GC bins FALSE ); if (verbosity >= NORMAL_VERBOSE) { fprintf( stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif) ); } } char strand = (alph == PROTEIN_ALPH ? '.' : '+'); // Set the random number seed so that this motif will always // get the same score (as long as the sequences are in the same // order!) my_srand(options.seed); // Read the FASTA file one sequence at a time. int num_sequences_with_site = 0; for (i = 0; i < seqN; i++) { num_sequences_with_site += centrimo_score_sequence( &options, sequences[i], strand, motif, rev_motif, bg_freqs, pos_pssm, rev_pssm, &output_stats ); } // All sequences parsed fprintf(stderr, "number of sequences with site: %d\n", num_sequences_with_site); { // output for each motif's stats starts here long j; char * primary = get_motif_id(motif); char * secondary = null_string(get_motif_id2(motif)); // single test for empty string fprintf(options.text_file, "MOTIF\t%s", primary); if (secondary) fprintf(options.text_file, "\t%s", secondary); fprintf(options.text_file, "\n"); if (options.html_file) { char * url = null_string(get_motif_url(motif)); fprintf(options.html_file, "MOTIF "); if (url) fprintf(options.html_file,"%s", url, primary); else fprintf(options.html_file,"%s", primary); if (secondary) fprintf(options.html_file, " %s", secondary); fprintf(options.html_file, "
\n"); } for (j=0; j <= output_stats.motif_max_site; j++) { double n = output_stats.stats_data[j].sites; int count = output_stats.stats_data[j].count; fprintf(options.text_file, "%ld\t%d\t%2G\n", j+1, count, n); if (options.html_file) { fprintf(options.html_file, "%ld\t%d\t%2G
\n", j+1, count, n); } } // now zero out the data structures for (j = 0; j < output_stats.positions_allocated; j++) { output_stats.stats_data[j] = ZEROS; } output_stats.motif_max_site = 0; } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); } else { if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "Skipping motif %s.\n", motif_id); } } } // All motifs parsed // Clean up. // Close input readers if (prior_dist != NULL) { free_prior_dist(prior_dist); } free_motifs(motifs); free_array(bg_freqs); free_string_list(options.selected_motifs); free(output_stats.stats_data); cleanup_options(&options); if (options.invoke_centrimo_plots) { // Create the plots using a system call char *script_name = "centrimo-plots"; int command_length = strlen(BIN_DIR) + strlen(options.output_dirname) + strlen(options.text_path) + strlen(script_name); char *command = mymalloc(command_length + 100); sprintf(command, "%s/%s -oc %s -max %d -title 'score>=%.2f (bits)' < %s/%s\n", BIN_DIR, script_name, options.output_dirname, options.max_plots, options.score_thresh, options.output_dirname, options.TEXT_FILENAME); fprintf(stderr, "command: %s\n", command); system(command); free(command); } return 0; }