/* * $Id: llr.c 5826 2011-07-04 04:57:47Z tbailey $ * * $Log$ * Revision 1.2 2006/03/08 20:50:11 nadya * merge chamges from v3_5_2 branch * * Revision 1.1.1.1.4.2 2006/01/26 09:16:27 tbailey * Rename local function getline() to getline2() to avoid conflict with * function defined in stdio.h. * * Revision 1.1.1.1.4.1 2006/01/24 20:44:08 nadya * update copyright * * Revision 1.1.1.1 2005/07/29 00:22:18 nadya * Importing from meme-3.0.14, and adding configure/make * */ /************************************************************************ * Copyright * * (1999-2006) The Regents of the University of California. * * All Rights Reserved. * * Author: Timothy L. Bailey * * * Permission to use, copy, modify, and distribute any part of * * this software for educational, research and non-profit purposes,* * without fee, and without a written agreement is hereby granted, * * provided that the above copyright notice, this paragraph and * * the following three paragraphs appear in all copies. * * * * Those desiring to incorporate this software into commercial * * products or use for commercial purposes should contact the * * Technology Transfer Office, University of California, San Diego,* * 9500 Gilman Drive, La Jolla, California, 92093-0910, * * Ph: (619) 534 5815. * * * * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO * * ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR * * CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF * * THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA * * HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE * * UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE * * MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. * * THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND * * EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT * * THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT, * * TRADEMARK OR OTHER RIGHTS. * ************************************************************************/ /***********************************************************************/ /* Information Content Statistics Routines */ /***********************************************************************/ #ifdef MAIN #define DEFINE_GLOBALS #endif #include "llr.h" #include "macros.h" #include "general.h" #include "io.h" #include #include "logs.h" #include "message.h" /* probability distribution */ typedef struct distr { int w; /* maximum width of alignment */ double alpha; /* scale factor for distribution */ int *range; /* range of distributions w = [1..w] */ int *offset; /* offset of distributions w = [1..w] */ double **d; /* array of (log) distributions w = [1..w] */ double **cdf; /* array of llr (log) 1 - CDFs w = [1..w] */ double mean; /* mean of llr for w = 1 */ } DISTR; static int ndistrs = -1; /* largest N for which distr known */ static DISTR *distrs = NULL; /* array of distributions for N = [1..ndistrs]*/ /************************************************************************/ /* init_llr_pv_tables Initialize the single-column log likelihood ratio p-value tables. Note: Palindromes have effectively double the apparent sites but half the width. */ /************************************************************************/ extern void init_llr_pv_tables( int min, /* minimum number of sites */ int max, /* maximum number of sites */ int alength, /* alphabet length */ double *back, /* background frequencies */ BOOLEAN pal /* sites are palindromes */ ) { int nsites; /* number of sites */ /* set effective number of sites to double if pal */ if (pal) { min *= 2; max *= 2; } if (!NO_STATUS) fprintf(stderr, "Initializing the motif probability tables for %d to %d sites...\n", min, max); /* make sure the distr table gets initialized on all nodes */ (void) get_llr_pv(0, 1, 1, LLR_RANGE, 1.0, alength, back); for (nsites=min; nsites<=max; nsites += pal ? 2 : 1) { /* nsites */ /* allocate space for table */ (void) get_llr_pv(0, nsites, 0, LLR_RANGE, 1.0, alength, back); if (!load_balance_llr(nsites, pal)) { continue; /* for parallel */ } /* create table */ if (!NO_STATUS) { fprintf(stderr, "nsites = %d\r", nsites); } (void) get_llr_pv(0, nsites, 1, LLR_RANGE, 1.0, alength, back); } /* nsites */ broadcast_llr(min, max, pal); /* for parallel; collect the tables */ #ifdef DEBUG /* print results */ int n; for (n=min; n<=max; n++) { int I; int w = 1; printf("# N I llr 1-cdf\n"); for (I=0; I<=distrs[n].range[w]; I++) { /* LLR */ double m2, e2; if (distrs[n].cdf[w][I] == LOGZERO) { m2 = e2 = 0; } else { exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1); } printf("%3d %3d %5.1f %3.1fe%+05.0f\n", n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m2, e2); } /* LLR */ } #endif if (!NO_STATUS)fprintf(stderr, "\nDone initializing\n"); } /* init_llr_pv_tables */ /******************************************************************************/ /* llr_distr Compute the probability distribution of the log-likelihood ratio (llr) statistic for a discrete distribution. Because of roundoff, LLR may be as small as -1 (instead of 0). Returns an array p where p[i] = log(Pr(LLR=i)) */ /******************************************************************************/ double *llr_distr( int A, /* dimension of discrete distribution */ double *dd, /* discrete distribution */ int N, /* number of samples */ int desired_range, /* desired range for scaled LLR */ double frac, /* fraction of scores to use */ double *alpha, /* scale factor for scaled LLR */ int *offset, /* prob[0] = prob(offset) */ int *range /* range for scaled LLR */ ) { int i; /* index over alphabet */ int n; /* index over samples */ int k; /* other index */ int I; /* LLR */ double dd_sum; /* sum of dd */ int **IP; /* I'_i[n] */ int *minI=NULL; /* minimum intermediate value of I */ int *maxI=NULL; /* maximum intermediate value of I */ int Irange; /* maxI-minI+1 */ double logNfact; /* log N! */ double **logP; /* log P_i[n] */ double **logSP; /* log script_P[# samples][LLR] */ double *prob=NULL; /* final probability distribution */ double min, max, min_dd; /* create space for IP, P, minI and maxI */ create_2array(IP, int, A, N+1); create_2array(logP, double, A, N+1); Resize(minI, N+1, int); Resize(maxI, N+1, int); /* make sure distribution sums to 1.0 and has no 0's */ for (i=dd_sum=0; i=0; n--) { /* index over samples */ for (k=1; k<=n; k++) { /* index over samples of new letter */ int min = minI[n-k]; int max = MAX(min, maxI[n-k] - (1-frac)*(maxI[n-k]-minI[n-k]+1)); /*printf("min %d maxI %d max %d\n", min, maxI[n-k], max);*/ for (I=min; I<=max; I++) { /* index over I */ if (logSP[n-k][I] > LOGZERO) { /*printf("i %d old: %d %d new: %d %d\n", i, n-k, I, n,I+IP[i][k]);*/ logSP[n][I+IP[i][k]] = LOGL_SUM(logSP[n][I+IP[i][k]], logP[i][k] + logSP[n-k][I]); } } /* get current minimum and maximum I in intermediate arrays */ minI[n] = MIN(minI[n], minI[n-k]+IP[i][k]); maxI[n] = MAX(maxI[n], maxI[n-k]+IP[i][k]); } if (n==N && i==A-1) break; /* all done */ } } /* compute range */ /*printf("minI[N] %d maxI[N] %d\n", minI[N], maxI[N]);*/ *range = maxI[N] - minI[N]; /* move to probability array with prob(offset) in position 0 */ *offset += minI[N]; /* prob[0] = prob(offset) */ Resize(prob, *range+2, double); for (I=minI[N]; I<=maxI[N]; I++) prob[I-minI[N]] = logSP[N][I]; /*fprintf(stderr, "N= %d range= %d offset= %d alpha= %f\n", N, *range, *offset, *alpha);*/ /* free up space */ free_2array(IP, A); free_2array(logP, A); free_2array(logSP, N+1); myfree(minI); myfree(maxI); return prob; } /* llr_distr */ /******************************************************************************/ /* sum_distr Compute the distribution of the sum of two integer-valued random variables whose ranges are [0..r1] and [0..r2], respectively, given the log of their distributions. */ /******************************************************************************/ double *sum_distr( double *d1, /* (log) distribution of RV1 */ int r1, /* range of RV1 */ double *d2, /* (log) distribution of RV2 */ int r2, /* range of RV2 */ int *r_sum /* range of sum of RV1 and RV2 */ ) { int i, j, k; int range = r1 + r2; /* potential range of sum */ double *d_sum = NULL; /* distribution of sum */ Resize(d_sum, range+1, double); /* space for distribution */ for (i=0; i<=range; i++) { /* value of sum */ d_sum[i] = LOGZERO; } for (i=0; i<=r1; i++) { /* range of RV1 */ if (d1[i]==LOGZERO) continue; for (j=0, k=i; j<=r2; j++, k++) { /* range of RV2 */ if (d2[j]==LOGZERO) continue; d_sum[k] = LOGL_SUM(d_sum[k], d1[i]+d2[j]); } /* RV2 */ } /* RV1 */ for (i=range; i>=0; i--) { /* value of sum */ if (d_sum[i] > LOGZERO) break; } *r_sum = i; /* non-zero range */ return d_sum; } /* sum_distr */ /******************************************************************************/ /* cdf Compute (log) 1-CDF of an integer-valued (log) distribution. Smooths the CDF by linear interpolation so that adjacent positions in the table will have different values if possible. */ /******************************************************************************/ static double *cdf( double *d, /* integer valued distribution */ int r /* range [0..r] */ ) { double *cdf=NULL, slope=0; int I, i, j, k; Resize(cdf, r+1, double); cdf[r] = d[r]; for (I=r-1; I>=0; I--) { cdf[I] = LOGL_SUM(cdf[I+1], d[I]); } /* smooth cdf by linear interpolation in logs */ for (i=r; i>0; i=j) { for (j=i-1; j>0 && d[j]==LOGZERO; j--) ; /* find next non-zero p */ if (i!=j) slope = (cdf[i]-cdf[j])/(i-j); /* slope */ for (k=j+1; k= 0 ? llr : 0; } /* get_scaled_llr */ /******************************************************************************/ /* brute Calculate the distribution of scaled LLR using a brute force algorithm (for small values of n only). Used for checking get_llr. */ /******************************************************************************/ void brute1( int A, /* dimension of discrete distribution */ double *dd, /* discrete distribution */ int N, /* number of samples of distribution */ double alpha, /* scale factor for llr */ double *o, double *e, double *p, double prob ){ int i; int llr; if (N==0) { llr = get_scaled_llr(alpha, A, o, e); p[llr] += prob; } else { for (i=0; i= distrs[N].range[w]) { /* upper bound I */ logpv = distrs[N].cdf[w][distrs[N].range[w]]; } else { /* lin. interpolate */ logpv = distrs[N].cdf[w][I0] + (I-I0)*(distrs[N].cdf[w][I1]-distrs[N].cdf[w][I0]); } /* return log p-value */ return logpv; } /* get_llr_pv */ /******************************************************************************/ /* get_llr_mean Returns the mean of the LLR distribution for w=1. Assumes get_llr_pv has already been called. */ /******************************************************************************/ extern double get_llr_mean( double n /* wgt. number sequences in alignment */ ) { return(distrs[NINT(n)].mean); } /* get_llr_mean */ #ifdef MAIN /************************************************************************/ /* llr Compute the probability distribution for the scaled information content (LLR) of N letters. Only the most probable LLR values are used to speed the calculation. */ /************************************************************************/ extern int main( int argc, char** argv ) { int i, j, I, n; int A=0; /* length of alphabet */ char *ffreq=NULL; /* name of frequency file */ FILE *ffile; /* frequency file */ double *dd = NULL; /* discrete distribution */ char *alphabet = NULL; /* alphabet */ int N=2, minN=0, maxN=0; /* number of observations */ int w=1; /* maximum width of alignment */ double frac = 1; /* fraction of scores to use */ int range = 100; /* desired range per N */ double *p2=NULL; /* LLR statistic distribution */ double *pv2=NULL; int brute_maxN = 0; /* maximum N for checking llr_distr */ char *line; /* input buffer line */ double alpha; /* scale factor used */ i = 1; argv[0] = "llr"; DO_STANDARD_COMMAND_LINE(2, USAGE( [options]); USAGE(\n\t\tnumber of letters in alphabet); USAGE(\t\tfile containing alphabet and letter frequencies); USAGE(\t\t\twhere each line is: ); NON_SWITCH(1,\r, switch (i++) { case 1: A = atoi(_OPTION_); break; case 2: ffreq = _OPTION_; break; default: COMMAND_LINE_ERROR; } ); DATA_OPTN(1, N, , \tnumber of observations; default=2, N = atof(_OPTION_)); DATA_OPTN(1, minN, , \tminimum number of observations; default=2, minN = atoi(_OPTION_)); DATA_OPTN(1, maxN, , \tmaximum number of observations; default=2, maxN = atoi(_OPTION_)); DATA_OPTN(1, w, , \tmaximum width of alignment; default=1, w = atoi(_OPTION_)); DATA_OPTN(1, frac, , \tfraction of possible scores to use; default=1, frac = atoi(_OPTION_)); DATA_OPTN(1, range, , scale llr to have values; default=100, range = atoi(_OPTION_)); USAGE(\n\tCompute the probability distribution for the log-likelihood); USAGE(\tratio (LLR) of N letters. If a range for N is given); USAGE(\tthe distributions for each value in the range are computed.); USAGE(\tIf is given only the most probable fraction of); USAGE(\tvalues are used to speed the calculation.); USAGE(\n\tCopyright); USAGE(\t(2000-2006) The Regents of the University of California); USAGE(\tAll Rights Reserved.); USAGE(\tAuthor: Timothy L. Bailey); ); init_log(); init_exp(); /* set up range for N */ if (minN==0) minN = N; if (maxN==0) maxN = N; if (minN > maxN) { fprintf(stderr, "You must specify -maxN larger than -minN.\n"); exit(1); } /* get alphabet and frequencies */ ffile = fopen(ffreq, "r"); Resize(dd, A, double); Resize(alphabet, A, char); i = 0; while ((line = getline2(ffile))) { char c; double f; if (line[0] == '#') continue; /* skip comments */ if (sscanf(line, "%c %lf", &c, &f) != 2) break; alphabet[i] = c; dd[i++] = f; myfree(line); } if (i != A) { fprintf(stderr, "%d frequencies were found; %d were expected.\n", i,A); for (j=0; j