/* * $Id: background.c 6008 2011-09-22 02:21:24Z james_johnson $ * * $Log$ */ /**************************************************************************** * * * MEME++ * * Copyright 1994-1999, The Regents of the University of California * * Author: Timothy L. Bailey * * * ****************************************************************************/ #include "macros.h" #include "background.h" #include "hash_alph.h" /* index from ASCII to integer and back */ static int a2i_index[MAXASCII]; static int i2a_index[MAXASCII]; static int index_alen; /* length of alphabet */ #define a2i(a) a2i_index[(int)(a)] #define i2a(i) i2a_index[(int)(i)] static int setup_index( const char *alpha /* the alphabet */ ); static char *check_prob( BOOLEAN add_x, /* add x-tuples if TRUE */ double *a_p, /* tuple->prob assoc. array */ int n, /* widest tuple */ char *tuple, /* call with "" */ int tuplew, /* tuple width; call with 0 */ char *alpha /* alphabet */ ); static void average_rc( BOOLEAN add_x, /* add x-tuples if TRUE */ double *a_p, /* tuple->prob assoc. array */ int n, /* widest tuple */ char *tuple, /* call with "" */ int tuplew, /* tuple width; call with 0 */ const char *alpha /* alphabet */ ); static void add_x_tuples( double *a_p, /* tuple->prob assoc. array */ char *ntuple, /* non-X-containing tuple */ int ntuplew, /* ntuple width */ int ntuplei, /* ntuple index */ char *xtuple, /* X-containing tuple */ int xtuplew, /* xtuple width */ int xcnt /* count of X's in xtuple */ ); static void print_prob( double *a_p, /* tuple->prob assoc. array */ int n, /* widest tuple */ char *tuple, /* call with "" */ int tuplew, /* tuple width; call with 0 */ char *alpha /* alphabet */ ); static double *get_cond_prob ( double *a_p, /* tuple->prob array */ int n /* length of array */ ); /***************************************************************************/ /* read_markov_model Read a file of tuple probabilities defining a Markov model. Check that all required tuples are present. File must have format: [

]+ and may have comment lines beginning with "#" in column 1. If the freq is not NULL, uses the 0-order frequencies in freq as the model, after adding X if requested. Sets the order of the model. Returns array: cp[s2i(wa)] = Pr(a | w) where "a" is a character and "w" is a string. The 0-order probabilities are in positions 0..alength-1 of the array. */ /***************************************************************************/ extern double *read_markov_model( char *pfile, /* name of probability file */ double *freq, /* letter frequencies */ char *alpha, /* alphabet expected */ BOOLEAN add_x, /* add x-tuples if TRUE */ BOOLEAN rc, /* average reverse complements*/ int *order /* order of model read */ ) { int i; /* index into array */ double a_p[MAX_BACK_SIZE]; /* tuple-prob array */ double *a_cp=NULL; /* conditional prob. array */ FILE *pfilep; /* file pointer to file */ char *line=NULL; /* line buffer */ char **fields=NULL; /* fields of line */ int nfields; /* number of fields in line */ int line_no=0; /* line number */ char *tuple; /* the tuple */ double p; /* the probability */ int maxw=0; /* maximum tuple width */ int alen=strlen(alpha); /* length of alphabet */ int ntuples; /* number of tuples */ /* check input */ if (!pfile && !freq) { fprintf(stderr, "read_markov_model error: specify pfile or freq\n"); exit(1); } /* add 'X 'to the alphabet if requested */ if (add_x) { char *tmp = NULL; Resize(tmp, alen+2, char); strcpy(tmp, alpha); tmp[alen] = 'X'; tmp[alen+1] = '\0'; alpha = tmp; alen++; } /* setup the mapping from ascii to integer and back */ setup_index(alpha); /* use the frequencies if given */ if (freq) { /* frequencies given */ Resize(a_cp, alen, double); for (i=0; i1) { fprintf(stderr, "Illegal probability in file %s line %d: %s\n", pfile, line_no, line); } len = strlen(tuple); maxw = MAX(len, maxw); index = s2i(tuple); if (index < 0) { fprintf(stderr, "Illegal character in word `%s' in file %s line %d: %s\n", tuple, pfile, line_no, line); exit(1); } if (index >= MAX_BACK_SIZE) { for (i=1, ntuples=0; i<=maxw; i++) ntuples+= pow(alen, i); fprintf(stderr, "Background model too large. Use smaller model or increase \nMAX_BACK_SIZE to at least %d in background.h and recompile.\n", ntuples); exit(1); } a_p[index] = p; /* store probability */ } fclose(pfilep); /* check that all necessary probabilities are defined */ tuple = check_prob(add_x, a_p, maxw, "", 0, alpha); if (tuple) { fprintf(stderr, "File %s gives no probability for %s.\n", pfile, tuple); exit(1); } *order = maxw - 1; /* order of Markov model */ /* average reverse complement probabilities together if requested */ if (rc) average_rc(add_x, a_p, maxw, "", 0, alpha); /* get conditional probabilities */ for (i=1, ntuples=0; i<=maxw; i++) ntuples+= pow(alen, i); a_cp = get_cond_prob(a_p, ntuples); /* print the probabilities */ #ifdef DEBUG print_prob(a_cp, maxw, "", 0, alpha); #endif return(a_cp); /* return conditionals */ } /* read_markov_model */ /***************************************************************************/ /* get_markov_from_sequence Compute a Markov model from the letters in a sequence. Returns array: cp[s2i(wa)] = Pr(a | w) where "a" is a character and "w" is a string. The 0-order probabilities are in positions 0..alength-1 of the array. Warning this function modifies the seq parameter. */ /***************************************************************************/ extern double *get_markov_from_sequence( char *seq, // the raw ASCII sequence const char *alpha, // alphabet expected BOOLEAN rc, // average reverse complements if TRUE int order, // order of Markov model to create double epsilon // pseudocount ) { int i, w; /* setup the mapping from ascii to integer and back */ setup_index(alpha); /* initialize probability array */ int alen=strlen(alpha); /* length of alphabet */ int ntuples = 0; // size of array int maxw = order + 1; for (i=1, ntuples=0; i<=maxw; i++) ntuples+= pow(alen, i); double *a_p = NULL; Resize(a_p, ntuples, double); /* tuple-prob array */ for (i=0; i= 0) { a_p[index]++; // update count of tuple total_count[w]++; // update count for width w } tuple[w] = save; // remove null } // sequence position } // w // // Convert counts to probabilities // int index = 0; // array index for (w=1; w<=maxw; w++) { int j; // tuple index for (j=0; jprob assoc. array */ int n, /* widest tuple */ char *tuple, /* call with "" */ int tuplew, /* tuple width; call with 0 */ char *alpha /* alphabet */ ) { int i; char *t = NULL, *x = NULL; /* tuple and x-tuple */ char *missing; /* missing tuple */ int ti; /* index of tuple */ if (n==0) return(NULL); /* everything is OK */ Resize(t, tuplew+2, char); Resize(x, tuplew+2, char); for(i=0; alpha[i+add_x]; i++) { /* ignore last letter (X) */ /* append letter to tuple */ strcpy(t, tuple); t[tuplew] = alpha[i]; t[tuplew+1] = '\0'; /* check that tuple exists */ ti = s2i(t); /* index of tuple */ if (a_p[ti] < 0) return(t); /* add the current tuple's probability to matching X-containing tuples */ if (add_x) add_x_tuples(a_p, t, tuplew+1, ti, x, 0, 0); /* recur */ missing = check_prob(add_x, a_p, n-1, t, tuplew+1, alpha); if (missing) return(missing); } /* letter */ myfree(t); if (add_x) myfree(x); return(NULL); /* all found */ } /* check_prob */ /***************************************************************************/ /* average_rc Recursively average the probabilities of reverse complement tuples. */ /***************************************************************************/ static void average_rc( BOOLEAN add_x, /* add x-tuples if TRUE */ double *a_p, /* tuple->prob assoc. array */ int n, /* widest tuple */ char *tuple, /* call with "" */ int tuplew, /* tuple width; call with 0 */ const char *alpha /* alphabet */ ) { int i, j; char *t = NULL, *rc = NULL; /* tuple and rc-tuple */ int ti, rci; /* index of tuple */ if (n==0) return; /* everything is OK */ Resize(t, tuplew+2, char); Resize(rc, tuplew+2, char); for(i=0; alpha[i+add_x]; i++) { /* ignore last letter (X) */ /* append letter to tuple */ strcpy(t, tuple); t[tuplew] = alpha[i]; t[tuplew+1] = '\0'; /* make reverse complement of tuple */ for (j=0; j<=tuplew; j++) rc[j] = comp_dna(t[tuplew-j]); rc[tuplew+1] = '\0'; /* get tuple and rc indices */ ti = s2i(t); /* index of tuple */ rci = s2i(rc); /* index of reverse complement*/ /* average their probabilites */ a_p[ti] = a_p[rci] = (a_p[ti] + a_p[rci]) / 2.0; /* recur */ average_rc(add_x, a_p, n-1, t, tuplew+1, alpha); } /* letter */ myfree(t); myfree(rc); return; } /* average_rc */ /***************************************************************************/ /* print_prob Recursively print the probabilities for all tuples up to width n. */ /***************************************************************************/ static void print_prob( double *a_p, /* tuple->prob assoc. array */ int n, /* widest tuple */ char *tuple, /* call with "" */ int tuplew, /* tuple width; call with 0 */ char *alpha /* alphabet */ ) { int i; char *t = NULL; /* tuple */ if (n==0) return; /* everything is OK */ Resize(t, tuplew+2, char); for(i=0; alpha[i]; i++) { /* include last letter (X) */ /* append letter to tuple */ strcpy(t, tuple); t[tuplew] = alpha[i]; t[tuplew+1] = '\0'; /* print tuple and probability */ if (a_p[s2i(t)]) printf("%10s %8.4f\n", t, a_p[s2i(t)]); /* recur */ print_prob(a_p, n-1, t, tuplew+1, alpha); } /* letter */ myfree(t); } /* print_prob */ /***************************************************************************/ /* add_x_tuples Compute the probability of all X-containing tuples by adding the probability of the current non-X-containing tuple to each X-containing tuple it matches. When this has been done for all non-X-containing tuples, the X-containing tuple probabilities will be correct. */ /***************************************************************************/ static void add_x_tuples( double *a_p, /* tuple->prob assoc. array */ char *ntuple, /* non-X-containing tuple */ int ntuplew, /* ntuple width */ int ntuplei, /* ntuple index */ char *xtuple, /* X-containing tuple */ int xtuplew, /* xtuple width */ int xcnt /* count of X's in xtuple */ ) { if (xtuplew == ntuplew) { /* add prob. of non-X-tuple */ int xtuplei; xtuple[xtuplew] = '\0'; /* end tuple */ xtuplei = s2i(xtuple); if (a_p[xtuplei] == -1) a_p[xtuplei] = 0; /* first time for x-tuple */ a_p[xtuplei] += a_p[ntuplei]; } else { /* continue building x-tuple */ xtuple[xtuplew] = 'X'; /* add X to xtuple */ add_x_tuples(a_p, ntuple, ntuplew, ntuplei, xtuple, xtuplew+1, xcnt+1); if (xcnt || xtuplew < ntuplew-1) { /* OK to add non-X to xtuple */ xtuple[xtuplew] = ntuple[xtuplew]; /* add X to xtuple */ add_x_tuples(a_p, ntuple, ntuplew, ntuplei, xtuple, xtuplew+1, xcnt); } } } /* add_x_tuples */ /***************************************************************************/ /* get_cond_prob Compute Pr(a | w) where "w" is a word and "a" is a character and store in an array with index "s2i(wa)". */ /***************************************************************************/ static double *get_cond_prob ( double *a_p, /* tuple->prob array */ int n /* length of array */ ) { double *a_cp=NULL; Resize(a_cp, n, double); /* create array */ double total_w_prob = 0; int wa; for (wa=0; wa 0); Resize(string, l + 1, char) string[l] = '\0'; for (i=0; i