/************************************************************************** * FILE: gendb.c * AUTHOR: Timothy L. Bailey * CREATE DATE: 10/29/02 * PROJECT: MHMM * COPYRIGHT: 2002, TLB * DESCRIPTION: Generate sequences using a Markov model. **************************************************************************/ #ifdef MAIN #define DEFINE_GLOBALS #endif #include "macros.h" #include "background.h" #include "fcodon.h" #include "gendb.h" #include "hash_alph.h" #include "seq.h" #define LOWEST 50 /* shortest sequence */ #define HIGHEST 2000 /* longest sequence */ #define SEQS 10 /* number of sequences */ #define MAX_ORDER 10 // largest Markov model order /**************************************************************************/ /* get_letters Convert an alphabet into a set of letter strings. Get the size of alphabet, size of strings and default 0-order Markov frequencies. */ /**************************************************************************/ char **get_letters( int type, // Type of alphabet. char **alph_p, // Alphabet. int *r_p, // Alphabet length. int *c_p, // Length of letter string. double **f_p // 0-order Markov frequencies. ) { int i, j, k; char *alph, *old_alph, *ptr; int r, c, alen, old_alen; double *f = NULL; char **letters; /* letters/codons to print */ /* set up alphabet, frequency table, letters/codon array */ switch (type) { case 0: alph = PROTEINB; f = nrfreq; alen = strlen(alph); r = alen; c = 2; create_2array(letters, char, r, c); for (i=0; i MAX_ORDER) { fprintf(stderr, "Order of Markov model %d exceeds MAX_ORDER %d.\n", *order, MAX_ORDER); exit(1); } if (*order < use_order) { fprintf(stderr, "The is only order %d so you can't use order %d.\n", *order, use_order); exit(1); } *order = use_order < 0 ? *order : use_order; } // Cumulative distribution. // Get the number of probabilities. n = r; for (i=1; i<=*order; i++) n += pow(r, i+1); // Convert the conditional probabilites to cumulative probs. for (i=0; iSEQ_%-d %d\n", i+1, n); } else { mm_resize(id, 50, char); sprintf(id, ">SEQ_%-d %d\n", i+1, n); } /* Generate letters by 1) random x ~ [0,1) 2) binary search of cum for cum[i-1]= 1) { // Markov model. int start_ptr; // Find the offset into the cumulative prob array by looking // for the offset of the preceeding "order" characters. buffer[j] = first_letter; // Now contains index into array. buffer[j+1] = '\0'; start_ptr = j > order ? j-order : 0; // Start of index string. offset = s2i(buffer + start_ptr); //fprintf(stderr, "b: %s\n offset: %d x: %f\n", buffer+start_ptr, offset, x); } while (hi-lo > 1) { /* binary search */ int mid = (lo+hi)/2; /* midpoint */ if (x > cum[mid+offset]) { lo = mid; } else { hi = mid; } } //fprintf(stderr, "%11.8f %s\n", x, letters[x\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --bfile \n"); strcat(usage, " --seed (default=0)\n"); strcat(usage, " --minseq (default=50)\n"); strcat(usage, " --maxseq (default=2000)\n"); strcat(usage, " --order use Markov model of given order\n"); strcat(usage, " --type 0|1|2|3|4\n"); strcat(usage, " 0=protein (default)\n"); strcat(usage, " 1=dna with ambiguous characters\n"); strcat(usage, " 2=codons\n"); strcat(usage, " 3=dna w/o ambiguous characters\n"); strcat(usage, " 4=protein w/o ambiguous characters\n"); strcat(usage, " --dummy\n"); // Parse the command line. while (1) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char* message = ""; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "bfile") == 0) { bfile = option_value; } else if (strcmp(option_name, "seed") == 0) { seed = atoi(option_value); } else if (strcmp(option_name, "minseq") == 0) { min = (int)atof(option_value); } else if (strcmp(option_name, "maxseq") == 0) { max = (int)atof(option_value); } else if (strcmp(option_name, "order") == 0) { use_order = atoi(option_value); } else if (strcmp(option_name, "type") == 0) { type = atoi(option_value); } else if (strcmp(option_name, "dummy") == 0) { dummy = TRUE; } } if (option_index + 1 != argc) { fprintf(stderr, "%s", usage); exit(EXIT_FAILURE); } int nseqs = atoi(argv[option_index]); fprintf(stderr, "Generating %d sequences.\n", nseqs); // Print dummy sequence if asked to. if (dummy) { printf(">SEQ_0 n= %d s= %d l= %d seed= %d\n", nseqs, min, max, seed); } gendb( stdout, type, bfile, use_order, NULL, // 0-order model. -- WSN 8/13/03 nseqs, min, max, seed ); exit(0); } // main #endif // MAIN