/* * $Id: tcm.c 3885 2009-07-16 10:02:35Z tbailey $ * */ /*********************************************************************** * * * MEME++ * * Copyright 1994, The Regents of the University of California * * Author: Timothy L. Bailey * * * ***********************************************************************/ /* 9-30-99 tlb; remove computation of cum. background probability */ /* 7-02-99 tlb; remove clobbering of theta */ /* 6-23-99 tlb; rewrite to support alternate DNA strands */ /* EM algorithm. Two component mixture model. */ #include "meme.h" static double smooth( int w, /* width to smooth over */ MODEL *model, /* the model */ DATASET *dataset /* the dataset */ ); /**********************************************************************/ /* tcm_e_step Do the E step of EM. Estimate the expectation of model 1 for each position in the data. In other words, calculate E[z_ij] in z. Updates z. Returns log pr(X | theta, lambda). Time: O(n_samples*lseq*w) */ /**********************************************************************/ double tcm_e_step( MODEL *model, /* the model */ DATASET *dataset /* the dataset */ ) { int i, j, k, ii; THETA logtheta1 = model->logtheta; /* motif log(theta) */ THETA logtheta1_rc = model->logtheta_rc;// motif log(theta) reverse complement int w = model->w; /* motif width */ int n_samples = dataset->n_samples; /* number of sequences */ BOOLEAN invcomp = model->invcomp; /* use reverse complement strand, too */ double log_sigma = invcomp ? log(0.5) : 0; /* log \sigma */ double lambda = model->lambda; /* \lambda of tcm model */ double log_lambda = LOG(lambda); /* log \lambda */ double log_1mlambda = LOG(1-lambda); /* log (1 - \lambda) */ double log_pX; /* log likelihood; no erase or smooth */ double log_Pij = 0; // position-specific prior log_Pij = log_sigma; /* E step */ convert_theta_to_log(model, dataset); /* calculate all the posterior offset probabilities */ log_pX = 0; for (i=0; i < n_samples; i++) { /* sequence */ SAMPLE *s = dataset->samples[i]; int lseq = s->length; double *zi = s->z; /* Pr(z_ij=1 | X_i, \theta) */ double *not_o = s->not_o; /* Pr(V_ij = 1) */ double *lcb = s->logcumback; /* cumulative background probability */ double log_pXij; // log Pr(X_ij | \phi) double log_pXi = 0; // log Pr(X_i | \phi) if (lseq < w) continue; /* sequence too short for motif */ int m = lseq - w + 1; /* number of possible sites */ for (k=0; kres+k; if (invcomp) { for (ii=0; ii 1.0. Winner-take-all: the largest value of z_ij is never reduced. Returns the total expected number of sites of motif. */ /***********************************************************************/ static double smooth( int w, /* width to smooth over */ MODEL *model, /* the model */ DATASET *dataset /* the dataset */ ) { int i, j, p; int n_samples = dataset->n_samples; SAMPLE **samples = dataset->samples; double p_sum = 0.0; BOOLEAN invcomp = model->invcomp; /* use reverse complement strand, too */ for (i=0; ilength; double *zi= sample->z; /* z */ int max_o = lseq - w + 1; /* largest possible offset */ if (lseq < w) continue; /* sequence too short for motif */ /* normalize adjacent windows of length w, then shift and repeat */ for (ioff = 0; ioff < MIN(w, max_o); ioff+=2) { /* window start */ for (j=ioff; j max_z) { max_z = z; /* largest z in window */ max_p = p; /* position with largest z */ } } /* normalize if necessary; leave largest z in window unchanged */ if (local_z > 1.0) { /* normalize */ double scale = (1 - max_z) / (local_z - max_z); for (p=j; p