/* * $Id: init.c 5565 2011-05-10 23:07:42Z cegrant $ * */ /*********************************************************************** * * * MEME * * Copyright 1994-2007, The Regents of the University of California * * Author: Timothy L. Bailey * * * ***********************************************************************/ /* init.c */ /* Initialize meme. */ #define ABS_MIN_W 2 #include "meme.h" #include "general.h" #include "banner.h" #include "dir.h" #include #include #include "io.h" #include "psp.h" #ifndef EXP #define EXP 0 #else #define EXP 1 #endif /* priors */ #define PROTEIN_PLIB "prior30.plib" #define DNA_PLIB "dna.plib" #define ROUNDERROR (1E-12) /* User input parameters */ static BOOLEAN check_syntax = FALSE; /* exit after checking syntax if true */ static char *datafile = NULL; /* positive examples */ static char *sf = NULL; /* name to print for datafile */ static char *negfile = NULL; /* negative examples */ static char *ntype = "pair"; /* output two matrices per motif if negatives */ static char *obj = "ev"; /* objective function */ static char *bfile = NULL; /* use default background Markov model file */ static char *pspfile = NULL; /* use positional priors */ static BOOLEAN psp2 = FALSE; /* true if 2-stranded positional priors */ static char *default_output_dirname = "meme_out"; /* default name of output directory */ static BOOLEAN clobber = FALSE; /* default is not to overwrite existing files */ static char *plib_name = NULL; /* use default library */ static char *mod = "zoops"; /* model type input string; default ZOOPS */ static char *alph = "PROTEIN"; /* default alphabet IUPAC protein 1-letter */ static BOOLEAN revcomp = FALSE; /* don't use reverse complement strand of DNA */ static int pal = 0; /* = 0, no palindromes = 1, force DNA palindromes, */ static BOOLEAN ma_trim = TRUE; /* trim width using multiple alignment method */ static double wg = 11; /* default gap weight */ static double ws = 1; /* default space weight */ static BOOLEAN endgaps = TRUE; /* count end gaps in multiple alignment */ static double distance = 1e-5; /* squared euclidean distance for convergence */ static char *prior = NULL; /* prior type input string */ static double beta = -1; /* scale factor for prior; defaults differ */ static double prob = 1.0; /* try enough subsequences so P=prob */ static int nmotifs = 1; /* number of motifs to find */ static char *mfile = NULL; /* name of known .motifs file*/ static int maxiter = 50; /* max number iterations of EM on best start */ static double nsites = 0; /* try one value of nsites0 only if > 0 */ static int min_nsites = 0; /* minimum nsites0 to try */ static int max_nsites = 0; /* maximum nsites0 to try */ static double wnsites = 0.8; /* weight on prior on nsites */ static int w = 0; /* width of motifs */ static int min_w = MIN_W; /* minimum W0 to try */ static BOOLEAN all_widths = FALSE; /* all widths between min and max */ static BOOLEAN min_w_set; /* if set on command line don't override */ static int max_w = MAX_W; /* maximum W0 to try */ static int min_ic = 0; /* minimum per column IC */ static MAP_TYPE map_type; /* type of sequence to theta mapping */ static char *mapname = NULL; /* map type input string */ static double map_scale=-1; /* scale of sequence to theta mapping: Uni - size of add-n prior (n) Pam - PAM distance (120) Default set in init_em. */ static int n_spcons = 0; /* number of specified start points */ static char *spcons[MAXG]; /* starting point consensus strings */ static int main_hs = HSIZE; /* size of heap at "main" w values */ static double hs_decrease = HS_DECREASE; /* Rate of decrease for heap size */ static BOOLEAN x_branch = FALSE; /* Use x_branch regardless of seq model */ static BOOLEAN no_x_branch = FALSE; /* Don't use x_branch, regardless of seq model */ static BOOLEAN w_branch = FALSE;/* Controls whether width branching occurs */ static BOOLEAN print_heaps = FALSE; /* Print heaps after branching rounds*/ static BOOLEAN print_pred = FALSE; /* Print out the predicted sites after each round of MEME search (eg subsequence, EM) */ static BOOLEAN print_pllr = FALSE; /* print the LLR of the aligned planted sites*/ static int bfactor = BFACTOR; /* branching factor for branching search */ //static double param_V = 1; /* Parameter used by the "deme" objective fn */ //static double pseu = 1; /* User-specified pseudo-count used in GLAM */ static int maxsize= 100000; /* dataset size limit */ static int seed = 0; /* random number seed */ static double seqfrac = 1; /* fraction of input dataset sequences to use */ static char *meme_directory = NULL; /* meme source directory */ static double max_time = 0; /* maximum allowed CPU time; ignore if 0 */ /* subroutines */ static void init_meme_background ( char *bfile, /* background model file */ BOOLEAN rc, /* average reverse comps */ DATASET *dataset /* the dataset */ ); static PRIORS *create_priors( PTYPE ptype, /* type of prior to use */ double beta, /* beta for dirichlet priors; < 0 only returns alphabet */ DATASET *dataset, /* the dataset */ char *plib_name /* name of prior library */ ); /**********************************************************************/ /* init_meme Set up all the stuff for meme. */ /**********************************************************************/ extern void init_meme( int argc, /* number of input arguments */ char **argv, /* input arguments */ MODEL **model_p, /* the model */ MODEL **best_model_p, /* the best model */ MODEL **scratch_model_p, /* the best model */ MODEL **neg_model_p, /* model of negative examples */ DATASET **dataset_p, /* the dataset */ DATASET **neg_dataset_p, /* dataset of negative examples */ char *text_filename, /* name of the text output file */ char **output_dirname, /* name of the output directory */ FILE **text_output /* destination for text output */ ) { int i, j, cc=0, len, pos; OBJTYPE objfun = Ev; /* type of objective function */ MOTYPE mtype; /* type of model */ PTYPE ptype; /* type of prior */ PRIORS *priors; /* the prior probabilities model */ P_POINT *p_point; /* previously learned starting points */ char *alphabet; /* alphabet for dataset */ int alength; /* length of alphabet */ MODEL *model=NULL, *best_model=NULL, *scratch_model=NULL, *neg_model=NULL; DATASET *dataset=NULL, *neg_dataset=NULL; double evt = BIG; /* no E-value cutoff */ BOOLEAN no_print = FALSE; /* turn off printing if parallel and not main */ #ifdef PARALLEL /* turn off printing if parallel and not the main processor */ no_print = (mpMyID() != 0); #endif *output_dirname = default_output_dirname; /* get the command line arguments */ i = 1; #ifndef lint /* print the command line */ argv[0] = ""; DO_STANDARD_COMMAND_LINE(1, USAGE(USAGE:\n\tmeme\t [optional arguments]\n); NON_SWITCH(1, \t\tfile containing sequences in FASTA format\n, switch (i++) { case 1: datafile = _OPTION_; break; default: COMMAND_LINE_ERROR; }); FLAG_OPTN(1, h, \t\t\tprint this message, USAGE_MESSAGE); DATA_OPTN(0, neg, , \tfile containing negative example sequences, negfile = _OPTION_); DATA_OPTN(0, ntype, pair|blend, \thow to use negative examples, ntype = _OPTION_); DATA_OPTN(1, o, , \tname of directory for output files\n\t\t\t\twill not replace existing directory, *output_dirname = _OPTION_); DATA_OPTN(1, oc, , \tname of directory for output files\n\t\t\t\twill replace existing directory, clobber=TRUE; *output_dirname = _OPTION_); FLAG_OPTN(1, text, \t\t\toutput in text format (default is HTML), TEXT_ONLY = TRUE); DATA_OPTN(EXP, objfun, ev|pv, \tobjective function (default: ev), obj = _OPTION_); FLAG_OPTN(1, dna, \t\t\tsequences use DNA alphabet, alph = "DNA"); FLAG_OPTN(1, protein, \t\tsequences use protein alphabet, alph = "PROTEIN"); DATA_OPTN(1, mod, oops|zoops|anr, \tdistribution of motifs,mod = _OPTION_); DATA_OPTN(1, nmotifs, , \tmaximum number of motifs to find, nmotifs = atoi(_OPTION_)); DATA_OPTN(1, evt, , \t\tstop if motif E-value greater than , evt = atof(_OPTION_)); DATA_OPTN(1, nsites, , \tnumber of sites for each motif, nsites=atof(_OPTION_)); DATA_OPTN(1, minsites, , \tminimum number of sites for each motif, min_nsites=atoi(_OPTION_)); DATA_OPTN(1, maxsites, , \tmaximum number of sites for each motif, max_nsites=atoi(_OPTION_)); DATA_OPTN(1, wnsites, , \tweight on expected number of sites, wnsites=atof(_OPTION_)); DATA_OPTN(1, w, , \t\tmotif width, w = atoi(_OPTION_)); DATA_OPTN(1, minw, , \t\tminimum motif width, min_w = atoi(_OPTION_); min_w_set=TRUE); DATA_OPTN(1, maxw, , \t\tmaximum motif width, max_w = atoi(_OPTION_)); DATA_OPTN(EXP, minic, , \tminimum column information content (default: 0), min_ic = atof(_OPTION_)); FLAG_OPTN(1, nomatrim, \t\tdo not adjust motif width using multiple\n\t\t\t\talignment, ma_trim = FALSE); DATA_OPTN(1, wg, , \t\tgap opening cost for multiple alignments, wg=atof(_OPTION_)); DATA_OPTN(1, ws, , \t\tgap extension cost for multiple alignments, ws=atof(_OPTION_)); FLAG_OPTN(1, noendgaps, \t\tdo not count end gaps in multiple alignments, endgaps = FALSE); DATA_OPTN(1, bfile, , \tname of background Markov model file, bfile = _OPTION_); FLAG_OPTN(1, revcomp, \t\tallow sites on + or - DNA strands, revcomp = TRUE); FLAG_OPTN(1, pal, \t\t\tforce palindromes (requires -dna), pal = 1); DATA_OPTN(1, maxiter, , \tmaximum EM iterations to run, maxiter = atoi(_OPTION_)); DATA_OPTN(1, distance, , \tEM convergence criterion, distance = atof(_OPTION_)); DATA_OPTN(1, psp, , \tname of positional priors file, pspfile = _OPTION_); DATA_OPTN(EXP, psp2, , \tname of 2-stranded positional priors file\n\t\t\t\t(requires -revcomp), pspfile = _OPTION_;psp2 = TRUE); DATA_OPTN(1, prior, dirichlet|dmix|mega|megap|addone, \n\t\t\t\ttype of prior to use, prior = _OPTION_); DATA_OPTN(1, b, , \t\tstrength of the prior, beta = atof(_OPTION_)); DATA_OPTN(1, plib, , \t\tname of Dirichlet prior file, plib_name = _OPTION_); DATA_OPTN(1, spfuzz, , \tfuzziness of sequence to theta mapping, map_scale = atof(_OPTION_)); DATA_OPTN(1, spmap, uni|pam, \tstarting point seq to theta mapping type, mapname = _OPTION_); DATA_OPTN(1, cons, , \t\tconsensus sequence to start EM from, spcons[n_spcons++] = _OPTION_); DATA_OPTN(1, heapsize, , \tsize of heaps for widths where substring \n\t\t\t\tsearch occurs, main_hs = atoi(_OPTION_)); FLAG_OPTN(1, x_branch, \t\tperform x-branching, x_branch=TRUE); FLAG_OPTN(EXP, no_x_branch, \t\tdo not perform x-branching, no_x_branch=TRUE); FLAG_OPTN(1, w_branch, \t\tperform width branching, w_branch=TRUE); FLAG_OPTN(1, allw, \t\t\tinclude all motif widths from min to max, all_widths=TRUE); DATA_OPTN(1, bfactor, , \t\tbranching factor for branching search, bfactor = atoi(_OPTION_)); FLAG_OPTN(EXP, print_pred, \t\tprint out the sites predicted by meme, print_pred = TRUE); FLAG_OPTN(EXP, print_heaps, \t\tprint heaps after each branching round, print_heaps = TRUE); FLAG_OPTN(EXP, planted_LLR, \t\tprint the LLR of the aligned planted sites, print_pllr = TRUE); DATA_OPTN(1, maxsize, , \tmaximum dataset size in characters, maxsize = atoi(_OPTION_)); FLAG_OPTN(1, nostatus, \t\tdo not print progress reports to terminal, NO_STATUS = TRUE); DATA_OPTN(1, p, , \t\tuse parallel version with processors, i=i); DATA_OPTN(1, time, , \t\tquit before CPU seconds consumed, max_time = atof(_OPTION_)); DATA_OPTN(1, sf, , \t\tprint as name of sequence file, sf = _OPTION_); FLAG_OPTN(2, check_syntax, \t\tcheck input syntax and exit, check_syntax = TRUE); FLAG_OPTN(1, V, \t\t\tverbose mode, VERBOSE = TRUE); DATA_OPTN(EXP, mfile, , \tfile of known motifs, mfile = _OPTION_); DATA_OPTN(EXP, seed, , \t\tseed for random numbers in sampling, seed = atoi(_OPTION_)); DATA_OPTN(EXP, seqfrac, , \tfraction of sequences to use, seqfrac= atof(_OPTION_)); FLAG_OPTN(EXP, trace, \t\ttrace starting points, TRACE = TRUE); FLAG_OPTN(EXP, print_all, \t\tprint all debug information, PRINTALL = TRUE); FLAG_OPTN(EXP, print_w, \t\tprint erasure matrix, PRINT_W = TRUE); FLAG_OPTN(EXP, print_z, \t\tprint missing information matrix, PRINT_Z = TRUE); FLAG_OPTN(EXP, print_ll, \t\tprint log-likelihood during EM, PRINT_LL = TRUE); FLAG_OPTN(EXP, print_starts, \t\tprint starting points, PRINT_STARTS = TRUE); ) #endif /* exit if check_syntax is on */ if (check_syntax) exit(0); /* set random number generator */ srand48(seed); if (TEXT_ONLY == TRUE) { // Legacy: plain text output to standard out. *text_output = stdout; } else { if (!no_print) { // allow clobbering of the default output directory if (*output_dirname == default_output_dirname) { clobber = TRUE; } if (create_output_directory(*output_dirname, clobber, !NO_STATUS)) { // Failed to create output directory. exit(1); } // Create the name of the text output file // "/text_filename/" and open it for writing char *path = make_path_to_file(*output_dirname, text_filename); *text_output = fopen(path, "w"); //FIXME CEG check for errors myfree(path); } } /* set all the print flags appropriately */ if (PRINTALL) { PRINT_W = TRUE; PRINT_Z = TRUE; PRINT_LL = TRUE; PRINT_STARTS = TRUE; } /* check input arguments */ if (prob < 0 || prob > 1) { fprintf(stderr, "-prob

,

must be between 0 and 1\n"); exit(1); } if (nmotifs >= MAXG) { fprintf(stderr, "-nmotifs larger than MAXG-1. Use smaller -nmotifs or recompile with larger MAXG.\n"); exit(1); } /* get the name of the directory where MEME is installed */ if (!meme_directory) { /* not given on command line */ meme_directory = MEME_DIR; } /* get the objective function type */ if (!strcmp(obj, "pv")) { objfun = Pv; } else if (!strcmp(obj, "ev")) { objfun = Ev; } else { printf("Unknown objective function type %s. \n", obj); exit(1); } /* get the model type */ if (!strcmp(mod, "anr") || !strcmp(mod, "tcm")) { mtype = Tcm; if (pspfile) { /* PM FIXME */ fprintf (stderr, "-mod anr (-mod tcm) not yet supported for with -psp.\n"); exit(1); } } else if (!strcmp(mod, "oops")) { mtype = Oops; } else if (!strcmp(mod, "zoops")) { mtype = Zoops; } else { mtype = Zoops; /* prevent warning */ fprintf(stderr, "Unknown model type %s. \n", mod); exit(1); } /* check seqfrac */ if (seqfrac > 1 || seqfrac <=0) { fprintf(stderr, "seqfrac must be in (0, 1]\n"); exit(1); } /* check the alphabet and set up default mappings and priors */ if (!strcmp(alph, "DNA")) { /* DNA */ alphabet = DNA0; if (!mapname) mapname = "uni"; /* uniform prior mapping */ if (!prior) prior = "dirichlet"; /* simple dirichlet prior */ } else { /* PROTEIN */ alphabet = PROTEIN0; if (!mapname) mapname = "pam"; /* PAM mapping */ if (!prior) { switch (mtype) { case Oops: prior = "dmix"; break; case Zoops: case Tcm: prior = "megap"; break; default: prior = "dirichlet"; break; } } if (revcomp) { fprintf(stderr, "You cannot use -revcomp with -protein.\n"); exit(1); } } /* set alphabet and priors defaults */ /* find out type of prior */ if (!strcmp(prior, "dirichlet")) { ptype = Dirichlet; if (beta < 0) beta = 0.01; /* default b = 0.01 */ } else if (!strcmp(prior, "dmix")) { ptype = Dmix; if (beta < 0) beta = 0; /* default b = 0 for dmix */ } else if (!strcmp(prior, "megadmix") || !strcmp(prior, "mega")) { ptype = Mega; /* use mega prior heuristic */ } else if (!strcmp(prior, "megap")) { ptype = MegaP; /* discretization uses b=0 */ } else if (!strcmp(prior, "addone")) { ptype = Addone; } else { ptype = Dirichlet; /* prevent warning */ fprintf(stderr, "Unknown type of prior: %s!\n", prior); exit(1); } /* convert the alphabet to upper case */ for (i=0; alphabet && (cc = alphabet[i]); i++) if (islower(cc)) alphabet[i] = toupper(cc); /* setup hashing function for encoding strings as integers */ alength = setup_hash_alph(alphabet); /* read the samples and set up globals */ dataset = read_seq_file(datafile, alphabet, revcomp, seqfrac); if (!dataset) exit(1); // read in psp file if one given if (pspfile) { read_psp_file(pspfile, dataset, psp2, revcomp, mtype); // warn that we are using the W in the PSP file as minw if (!min_w_set) { fprintf(stderr,"Setting minimum motif width to width of the prior in the PSP file: %d\n", dataset->psp_w); min_w = dataset->psp_w; } } else { /* no PSP file */ dataset->psp_w = min_w; } /* set dataset alphabet flag */ dataset->dna = !strcmp(alph, "DNA"); /* set the dataset objfun */ dataset->objfun = objfun; /* initialize the background model */ init_meme_background(bfile, revcomp, dataset); /* reset the current alphabet since it may be clobbered by the above */ (void) setup_hash_alph(alphabet); /* prevent too long jobs */ if (dataset->total_res > maxsize) { fprintf(stderr, "Dataset too large (> %d). Rerun with larger -maxsize.\n", maxsize); exit(1); } /* Calculate optimal split points for this dataset; no-op if non-parallel */ balance_loop(dataset->samples, dataset->n_samples); /* read in known motifs file */ if (mfile) { FILE *fdata = fopen(datafile, "r"); //FIXME CEG check for errors dataset->nkmotifs = read_motifs(fdata, mfile, dataset->motifs, FALSE, NULL); nmotifs = dataset->nkmotifs; prob = 0; /* don't sample starts */ dataset->pal = 0; /* no palindrome testing */ } else { dataset->nkmotifs = 0; } /* create the priors */ if (ptype == Dmix || ptype == Mega || ptype == MegaP) { /* make the name of the prior library */ if (!plib_name) { char *tmp1, *tmp2; if (!strcmp(alphabet, PROTEIN0)) { plib_name = PROTEIN_PLIB; /* default mixture prior for proteins */ } else { plib_name = DNA_PLIB; /* default mixture prior for DNA */ } /* prepend meme_directory to file name */ Strcat(tmp1, meme_directory, "/etc/"); Strcat(tmp2, tmp1, plib_name); plib_name = tmp2; } } if ((ptype == Mega || ptype == MegaP) && beta == -1) { /* tlb 5-9-97; wgt_total_res */ /*beta = 10.0 * dataset->wgt_total_res;*/ /* size of mega prior */ beta = 5.0 * dataset->wgt_total_res; /* size of mega prior */ } priors = create_priors(ptype, beta, dataset, plib_name); /* check that alphabet in mixture prior matches alphabet */ if (ptype == Dmix || ptype == Mega || ptype == MegaP) { if (strcmp(alphabet, priors->plib->alphabet)) { fprintf(stderr, "The alphabet in the mixture prior file (%s) doesn't match the sequence alphabet.\n", plib_name); fprintf(stderr, "prior alphabet : %s\n", priors->plib->alphabet); fprintf(stderr, "sequence alphabet: %s\n", alphabet); exit(1); } } /* set number of occurrences of sites */ if (nsites != 0) { if (mtype == Oops) { fprintf(stderr, "You may not specify -sites with -mod oops\n"); exit(1); } min_nsites = max_nsites = nsites; } /* set search range for nsites */ if (mtype == Oops) { min_nsites = max_nsites = dataset->n_samples; } else if (mtype == Zoops) { if (min_nsites > dataset->n_samples) { fprintf(stderr, "Minimum number of sites too large. Setting to 2.\n"); min_nsites = 2; } if (!min_nsites) min_nsites = 2; /* default */ if (max_nsites > dataset->n_samples) { fprintf(stderr, "Maximum number of sites exceeded. Setting to %d.\n", dataset->n_samples); max_nsites = dataset->n_samples; } if (!max_nsites) max_nsites = dataset->n_samples; /* default */ } else { /* TCM model */ if (min_nsites<2) min_nsites = 2; /* default */ if (!max_nsites) max_nsites = MIN(5*dataset->n_samples, 50); } /* check that max number of sites >= min number of sites */ if (min_nsites > max_nsites) { fprintf( stderr, "The minimum number of sites is set to %d. " "It should be less than the max number of sites (%d).\n", min_nsites, max_nsites ); exit(1); } /* check that there are enough possible sites */ if (min_nsites < 2) { fprintf(stderr, "You must specify a minimum of 2 or more sites.\n"); exit(1); } if (max_nsites < 2) { fprintf(stderr, "It must be possible for at least 2 sites to fit.\n"); exit(1); } /* check weight on prior on nsites */ if (wnsites >= 1 || wnsites < 0) { fprintf(stderr, " must be in range [0..1).\n"); exit(1); } /* check that no sequence too short */ if (dataset->min_slength < MIN_W) { fprintf(stderr, "All sequences must be at least %d characters long. Remove ", MIN_W); fprintf(stderr, "shorter sequences\nand rerun.\n"); exit(1); } /* set up globals */ if (w != 0) { /* w specified; set min_w and max_w */ max_w = min_w = w; fprintf(stderr,"w set, setting max and min to %d#######\n",w); } /* oops model: limit max_w to shortest seq */ if (mtype == Oops && max_w > dataset->min_slength) { max_w = dataset->min_slength; fprintf(stderr, "maxw > length of shortest sequence (%ld).", dataset->min_slength); fprintf(stderr, " Setting maxw to %d.\n", max_w); } /* all models: limit max_w to longest seq */ if (max_w > dataset->max_slength) { max_w = dataset->max_slength; fprintf(stderr, "maxw > length of longest sequence (%ld).", dataset->max_slength); fprintf(stderr, " Setting maxw to %d.\n", max_w); } if (max_w > MAXSITE) { fprintf(stderr, "maxw too large (> %d). Recompile with larger MAXSITE.\n", MAXSITE); exit(1); } if (max_w < 0) { /* use default */ max_w = MIN(MAXSITE, dataset->min_slength); /* maximum W0 */ } /* check that min_w <= max_w */ if (min_w > max_w) { if (pspfile) { fprintf(stderr, "PSP file w = %d > maxw = %d. Respecify larger -maxw.\n", dataset->psp_w, max_w); exit(1); } fprintf(stderr, "minw > maxw. Setting minw to %d.\n", max_w); min_w = max_w; } /* check that min_w and max_w are at least ABS_MIN_W */ if (min_w < ABS_MIN_W) { fprintf(stderr, "Minimum width must be >= %d. Respecify larger -w or -minw.\n", ABS_MIN_W); exit(1); } else if (max_w < ABS_MIN_W) { fprintf(stderr, "Maximum width must be >= %d. Respecify larger -w or -maxw.\n", ABS_MIN_W); exit(1); } /* must use TCM if only one sequence */ if (mtype != Tcm && dataset->n_samples==1) { fprintf(stderr, "You must specify '-mod anr' since your dataset contains only one sequence.\n" ); fprintf(stderr, "Alternatively, you might wish to break your sequence into several sequences.\n" ); exit(1); } /* check that using right alphabet if model type requires it */ if (strcmp(alph, "DNA")) { if (revcomp) { fprintf(stderr, "You must use default DNA alphabet if using complementary strand!\n"); exit(1); } if (pal) { fprintf(stderr, "You must use default DNA alphabet if using -pal !\n"); exit(1); } } /* flag search for palindromes */ dataset->pal = pal; /* get the type of mapping between sequences and thetas */ if (!strcmp(mapname, "uni")) { map_type = Uni; if (map_scale == -1) map_scale = .5; /* default add .5 */ } else if (!strcmp(mapname, "pam")) { map_type = Pam; if (map_scale == -1) map_scale = 120; /* default PAM 120 */ } else { fprintf(stderr, "Unknown mapping type %s. \n", mapname); exit(1); } /* check that IUPAC alphabet if using PAM mapping */ // mapname == "pam" && alphabet != PROTEIN0 if (!strcmp(mapname,"pam") && (strcmp(alphabet,PROTEIN0))) { fprintf(stderr, "Setting sequence to theta mapping type to `Uni' since alphabet not IUPAC.\n"); mapname = "uni"; } /* set up the sequence to theta mapping matrix for starts */ dataset->map = init_map(map_type, map_scale, alength, dataset->back, FALSE); dataset->lomap = init_map(map_type, map_scale, alength, dataset->back, TRUE); /* set up p_point: previously learned components start points */ p_point = (P_POINT *) mymalloc(sizeof(P_POINT)); p_point->c = n_spcons; /* number of specified starts */ /* the default starting points */ for (i=0; ie_cons0[i] = NULL; p_point->w[i] = max_w; p_point->nsites[i] = nsites; } /* setup user-specified start points */ for (i=0; i= MAXG) { fprintf(stderr, "Too many starting points. Increase MAXG and recompile."); exit(1); } p_point->e_cons0[i] = e_cons; /* convert the consensus sequence to upper case and encode as integer */ for (j=0; (cc = spcons[i][j]) != 0; j++) { char uc = (islower(cc) ? toupper(cc) : cc); e_cons[j] = (uc == 'X') ? alength : hash(uc); if (e_cons[j] > alength) { fprintf(stderr, "Illegal letter %c in consensus string!\n", cc); exit(1); } } /* set width to length of consensus unless -w specified */ if (w == 0) p_point->w[i] = j; /* pad out the consensus sequence with A's */ for ( ; j < MAXSITE; j++) e_cons[j] = 0; } // Setup heap size for storage of starting points: if (main_hs < 1) { fprintf(stderr, "Heap size must be >= 1.\n"); exit(1); } else { dataset->main_hs = main_hs; dataset->hs_decrease = hs_decrease; } // Setup a struct recording the desired parameters for branching search: BRANCH_PARAMS *branch_params = NULL; branch_params = mymalloc(sizeof(BRANCH_PARAMS)); // Setup branching factor for branching search: if (bfactor < 1) { fprintf(stderr, "bfactor must be >= 1.\n"); exit(1); } else { branch_params->bfactor = bfactor; } // Record whether the user wants w-branching carried out: branch_params->w_branch = w_branch; // Record whether the user wants x-branching carried out: if (x_branch){ branch_params->point_branch = X_ONLY; } else { branch_params->point_branch = NO_POINT_B; } if (FALSE) { // FIXME: This code can be used to set branching as the default // for different alphabets/sequence models using the -x_branch // and -no_x_branch switches. Currently the default is set to // no branching (in previous if statement). /* Record whether the user wants x_branching carried out. The result is recorded in the "point_branch" attribute of branch_params. Note that the user is only able to specify x_branching or no x_branching. This is because we found ACGT (ie regular) branching to be of little benefit. */ if (x_branch) { // User has specified they want x-branching... if (no_x_branch) { // Invalid to specify x_branch and no_branch simultaneously: fprintf(stderr, "x_branch and no_branch cannot be specified"\ "at the same time"); exit(1); } else { branch_params->point_branch = X_ONLY; } } else if (no_x_branch) { branch_params->point_branch = NO_POINT_B; } else { // User did not specify x_branching => Decide based on sequence model: // NOTE: branching is the default for oops for DNA only if (mtype == Oops && !strcmp(alph, "DNA")) { branch_params->point_branch = X_ONLY; // Only x_branch under oops by default. } else { branch_params->point_branch = NO_POINT_B; // Only x_branch under oops by default. } } // Deciding branch_params->x_branch } // end if // Store the branching parameters for future reference: dataset->branch_params = branch_params; // Print sites predicted by MEME: dataset->print_pred = print_pred; // Record whether heaps are to be printed: dataset->print_heaps = print_heaps; // Record whether llr is to be printed for the alignment of planted sites: if (print_pllr) { if (w <= 0) { fprintf(stderr, "Not valid to request llr of planted sites unless length of sites" " has been specified.\n"); exit(1); } } dataset->print_pllr = print_pllr; /* make sure nmotifs is as large as the number of starting points */ if (nmotifs < n_spcons) { nmotifs = n_spcons; fprintf(stderr, "Setting nmotifs to %d\n", nmotifs); } /* create the model */ model = create_model(mtype, revcomp, max_w, alength); best_model = create_model(mtype, revcomp, max_w, alength); scratch_model = create_model(mtype, revcomp, max_w, alength); /* create the negative dataset and negative model if negative file given */ if (negfile) { neg_dataset = read_seq_file(negfile, dataset->alphabet, revcomp, seqfrac); if (!strcmp(ntype, "pair")) { neg_dataset->negtype = Pair; } else if (!strcmp(ntype, "blend")) { neg_dataset->negtype = Blend; } else { fprintf(stderr, "Unknown ntype %s. \n", ntype); exit(1); } neg_dataset->pal = dataset->pal; neg_dataset->back = dataset->back; neg_model = create_model(mtype, revcomp, max_w, alength); } /* initialize log and exp lookup tables */ init_log(); init_exp(); /* Initialize the probability tables for the objective function. */ /* Get for 2 and up because weighting may cause < min_nsites sites */ init_llr_pv_tables(2, max_nsites, alength, dataset->back, dataset->pal); /* set up scratch models */ model->min_w = min_w; model->max_w = max_w; model->all_widths = all_widths; model->min_nsites = min_nsites; model->max_nsites = max_nsites; copy_model(model, best_model, alength); copy_model(model, scratch_model, alength); /* put meme parameters in dataset */ dataset->priors = priors; dataset->p_point = p_point; dataset->wg = wg; dataset->ws = ws; dataset->endgaps = endgaps; dataset->wnsites = wnsites; dataset->ma_adj = ma_trim; dataset->distance = distance; dataset->prob = prob; dataset->nmotifs = nmotifs; dataset->maxiter = maxiter; dataset->evt = evt; dataset->mod = mod; dataset->mapname = mapname; dataset->map_scale = map_scale; dataset->priorname = prior; dataset->beta = beta; dataset->seed = seed; dataset->seqfrac = seqfrac; /* save name of prior library */ if (plib_name) { dataset->plib_name = plib_name + strlen(plib_name); while (*dataset->plib_name != '/') dataset->plib_name--; dataset->plib_name++; /* strip off directory */ } else { dataset->plib_name = NULL; } dataset->datafile = sf ? sf : datafile; /* name to print */ dataset->negfile = negfile; dataset->bfile = bfile; dataset->min_ic = min_ic; dataset->max_time = max_time; /* save command line */ dataset->command = NULL; argv[0] = "meme"; /*for (i=pos=len=0; icommand, len+2, char); /* +1 for null */ strcpy((dataset->command)+pos, argv[i]); dataset->command[len-1] = ' '; dataset->command[len] = '\0'; pos = len; } if (TEXT_ONLY) { dataset->output_directory = NULL; } else { dataset->output_directory = *output_dirname; } /* set up return values */ *model_p = model; *scratch_model_p = scratch_model; *best_model_p = best_model; *neg_model_p = neg_model; *dataset_p = dataset; *neg_dataset_p = neg_dataset; /* announce meme */ banner("MEME", *text_output); fprintf(*text_output, "\n\n"); } /* init_meme */ /***************************************************************************/ /* init_meme_background Read in the background Markov model or use the residue frequencies adjusted by add-one prior. Precalculate the log cumulative background probabilities. The log probability of any substring of any sequence will then be accessible via: Log_back(sample[i]->logcumback, j, w) where j is the start of the length-w substring of sequence i. */ /***************************************************************************/ static void init_meme_background ( char *bfile, /* background model file */ BOOLEAN rc, /* average reverse comps */ DATASET *dataset /* the dataset */ ) { int i, j; char *alphabet = dataset->alphabet; /* alphabet */ SAMPLE **samples = dataset->samples; /* the sequences */ int n_samples = dataset->n_samples; /* # of sequences in dataset */ double *back; /* freq. of tuples */ int order; /* order of background model */ BOOLEAN add_x = TRUE; /* add x-tuples to model */ /* read in background Markov model or use dataset frequencies (0-order) */ if (bfile) { /* read in probs; set X probs */ back = read_markov_model(bfile, NULL, alphabet, add_x, rc, &order); } else { /* use dataset frequencies */ int alength = dataset->alength; /* length of alphabet */ double *res_freq = dataset->res_freq; /* weighted residue freq */ double wtr = dataset->wgt_total_res; /* weighted residue count */ double *adj_res_freq = NULL; Resize(adj_res_freq, alength, double); /* adjust residue frequencies with add-one prior to avoid 0s */ for (i=0; ilog_total_prob = 0; for (i=0; ilogcumback; /* log cum. back. prob */ /* unhash the sequence to convert ambiguous characters to X */ Resize(seq, s->length+1, char); for (j=0; jlength; j++) seq[j] = unhash(s->res[j]); seq[s->length] = '\0'; /* compute probabilites */ dataset->log_total_prob += log_cum_back(seq, back, order, lcb); myfree(seq); } /* sequence */ /* return values */ dataset->back = back; dataset->back_order = order; } /* init_meme_background */ /**********************************************************************/ /* create_priors */ /**********************************************************************/ static PRIORS *create_priors( PTYPE ptype, /* type of prior to use */ double beta, /* beta for dirichlet priors; < 0 only returns alphabet */ DATASET *dataset, /* the dataset */ char *plib_name /* name of prior library */ ) { int i; int alength = (dataset ? dataset->alength : 0); double *back = (dataset ? dataset->back : (double *)NULL); PRIORS *priors = (PRIORS *) mymalloc(sizeof(PRIORS)); priors->ptype = ptype; /* set up the prior counts */ switch (ptype) { case Addone: /* add one prior */ for (i=0; iprior_count[i] = 1.0; break; case Dirichlet: /* simple dirichlet prior */ for (i=0; iprior_count[i] = beta * back[i]; break; case Dmix: /* mixture of dirichlet's */ case Mega: /* megaprior heuristic */ case MegaP: /* mod. megaprior heuristic */ { priors->plib = read_PriorLib(plib_name, beta); /* get b=0 prior for modified mega prior heuristic */ if (ptype == MegaP || ptype == Mega) { /* used adj freq with Mega */ double b = 0; priors->plib0 = read_PriorLib(plib_name, b); } break; } } return priors; } /* create_priors */