/* * $Id: subseq7.c 4278 2009-12-23 09:58:37Z james_johnson $ * */ /*********************************************************************** * * * MEME++ * * Copyright 1994-2006, the Regents of the University of California* * Author: Timothy L. Bailey * * * ***********************************************************************/ /* subseq7.c */ /* 5-23-00 tlb; change objective function to be log likelihood ratio for all models */ /* 7-12-99 tlb; move not_o out of pY calculation and move to local/global_max */ /* 7-12-99 tlb; multiply sw * not_o in m_step of align_top_subsequences so that erased starts will have low likelihood */ /* 7-01-99 tlb; multiply background counts by motif width for Tcm model */ /* 6-29-99 tlb; add reverse complement DNA strand support */ #include "calculate_p_y.h" #include "heap.h" #include "meme.h" #include "psp.h" #include "seed.h" #include "sp_matrix.h" #include "macros.h" #include #include // For the "log" function; temporary. #define trace(Y) fprintf(stderr,Y) /* minimum probability of NOT overlapping a previous site for starting points */ #define MIN_NOT_O .1 /* use logs and integers */ #define LOG_THETA_TYPE(V) int *V[2][MAXSITE] #define LOG_THETAG_TYPE(V) int *V[MAXSITE] /* print a subsequence from its integer encoded form */ #define Print_res(out, res, w) \ { \ int i; \ for (i=0; i<(w); i++) { fprintf(out, "%c", unhash((res)[i])); } \ fprintf(out, "\n"); \ } /* local functions */ static void next_pY( DATASET *dataset, /* the dataset */ LOG_THETAG_TYPE(theta_1), /* integer log theta_1 */ int w, /* width of motif */ int *theta_0, /* first column of previous theta_1 */ int pYindex /* which pY array to use */ ); static void init_theta_1( int w, /* width of site */ char *res, /* (encoded) letters of subsequence */ LOG_THETAG_TYPE(theta_1), /* theta_1 */ int lmap[MAXALPH][MAXALPH] /* matrix of frequency vectors */ ); static int global_max( DATASET *dataset, /* the dataset */ int w, /* length of sites */ P_PROB maxima, /* array of encoded site starts of local maxima */ BOOLEAN ic /* use reverse complement, too */ ); static int local_max( DATASET *dataset, /* the dataset */ int w, /* length of sites */ P_PROB maxima, /* array of encoded site starts of local maxima */ BOOLEAN ic /* use reverse complement, too */ ); /**********************************************************************/ /* subseq7 Try subsequences as starting points and choose the one which yields the highest score. Score is computed by: 1) computing log p(Y | theta_1) 2) finding the (sorted) postions of maximum log pY 3) aligning the top NSITES0 scores for each value of NSITES0 4) computing the expected likelihood for each value of NSITES0 The computing of p(Y | theta_1) for successive subsequences (values of theta_1) is optimized by using p_i to calculate p_{i+1}. Returns number of starting points updated in s_points array. Updates s_points, array of starting points, one for each value of NSITES0 tried-- finds one \theta for each value of nsites0 specified in the input. NOTE: (22/11/06) The approach used to facilitate dynamic programming was left unchanged. HOWEVER, it could be "unified" with the dynamic programming approach used in branching_search, in which "SEED_DIFF" objects are used. This has not yet been done because "if it ain't broke, don't fix it". */ /**********************************************************************/ extern void subseq7( MODEL *model, // the model DATASET *dataset, /* the dataset */ int w, // w to use int n_nsites0, /* number of nsites0 values to try */ S_POINT s_points[], /* array of starting points: 1 per nsites0 */ HASH_TABLE evaluated_seed_ht /* A hash table used for remembering which seeds have been evaluated previously */ ) { MOTYPE mtype = model->mtype; /* type of model */ BOOLEAN ic = model->invcomp; /* use reverse complement strand of DNA, too */ THETA map = dataset->map; /* freq x letter map */ LOG_THETA_TYPE(ltheta); /* integer encoded log theta */ int iseq, ioff; int alength = dataset->alength; /* length of alphabet */ int n_samples = dataset->n_samples; /* number of samples in dataset */ SAMPLE **samples = dataset->samples; /* samples in dataset */ int n_starts = 0; /* number of sampled start subseq */ int n_maxima = ps(dataset, w); /* upper bound on # maxima */ /* the local maxima positions */ P_PROB maxima = (P_PROB) mymalloc(n_maxima * sizeof(p_prob)); int lmap[MAXALPH][MAXALPH]; /* consensus letter vs. log frequency matrix */ double col_scores[MAXSITE]; /* not used */ #ifdef PARALLEL int start_seq, start_off=0, end_seq, end_off=0; #endif char *str_seed; // A string representation of a seed. // PRECONDITIONS: // 1. If the sequence model is oops, then n_nsites0 is exactly 1: if (mtype == Oops) { assert(n_nsites0 == 1); } convert_to_lmap(map, lmap, alength); if (TRACE) { printf("w= %d\n", w); } /* get the probability that a site starting at position x_ij would NOT overlap a previously found motif. */ get_not_o(dataset, w); // Set up log_not_o: log_not_o[site] is: // log ( Pr(site not overlapped) * scaled_to_one_Pr(site) ) if (model->mtype != Tcm) { add_psp_to_log_not_o(dataset, w, model->invcomp, model->mtype); } /* score all the sampled positions saving the best position for each value of NSITES0 */ #ifdef PARALLEL /* Retrieve the previously-calculated starting and ending points. */ get_start_n_end(&start_seq, &start_off, &end_seq, &end_off); /* Divide the various samples among processors. */ for (iseq = start_seq; iseq <= end_seq; iseq++) { /* sequence */ #else /* not PARALLEL */ for (iseq = 0; iseq < n_samples; iseq++) { /* sequence */ #endif /* PARALLEL */ SAMPLE *s = samples[iseq]; int lseq = s->length; char *res = s->res; /* left to right */ char *name = s->sample_name; double *not_o = s->not_o; int max_off, init_off; if (lseq < w) continue; /* shorter than motif */ #ifdef PARALLEL if (mpMyID() == 0) #endif if ((!NO_STATUS) && ((iseq % 5) == 0)) { fprintf(stderr, "starts: w=%d, seq=%d, l=%d \r", w, iseq, lseq); } /* Set the appropriate starting and ending points. */ #ifdef PARALLEL if (iseq == start_seq) init_off = start_off; else #endif init_off = 0; #ifdef PARALLEL if (iseq == end_seq) max_off = MIN(end_off, lseq - w); else #endif max_off = lseq - w; /* Loop over all subsequences in the current sequence testing them each as "starting points" (inital values) for theta */ for (ioff = init_off; ioff <= max_off; ioff++) {/* subsequence */ /* warning: always do the next step; don't ever "continue" or the value of pY will not be correct since it is computed based the previous value */ /* convert subsequence in dataset to starting point for EM */ init_theta_1(w, res+ioff, <heta[1][0], lmap); if (ioff == init_off) { /* new sequence */ /* Compute p(Y_ij | theta_1^0) */ if (!ic) { get_pY(dataset, <heta[1][0], w, 0); } else { get_pY(dataset, <heta[1][0], w, 1); get_pY(dataset, <heta[1][0], w, 2); } } else { /* same sequence */ /* get theta[0][0]^{k-1} */ init_theta_1(1, res+ioff-1, <heta[0][0], lmap); /* compute p(Y_ij | theta_1^k) */ if (!ic) { next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 0); } else { next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 1); next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 2); } } /* same sequence */ /* skip if there is a high probability that this subsequence is part of a site which has already been found */ if (not_o[ioff] < MIN_NOT_O) continue; /*fprintf(stderr, "subseq: %d %d\r", iseq+1, ioff+1);*/ // Put highest pY into first scratch array if using both DNA strands: if (ic) { combine_strands(samples, n_samples, w); } /* get a sorted list of the maxima of pY */ n_maxima = get_max(mtype, dataset, w, maxima, ic, TRUE); /* "fake out" align_top_subsequences by setting each of the scores in the s_points objects to LITTLE, thereby forcing align_top_subsequences to record the attributes for the current seed in the s_points, rather than the seed with the highest respective scores: */ int sp_idx; for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) { s_points[sp_idx].score = LITTLE; } /* align the top nsites0 subsequences for each value of nsites0 and save the alignments with the highest likelihood */ n_starts += align_top_subsequences( mtype, w, dataset, iseq, ioff, res+ioff, name, n_nsites0, n_maxima, maxima, col_scores, s_points ); /* A string version of the current seed is required for updating the S_POINT heaps: */ str_seed = to_str_seed(res+ioff, w); /* For each of the S_POINT objects, add the current seed to that S_POINT'S heap. Also, branching search will require a hash_table of all seeds that have been evaluated prior to when branching search is called. Hence also record the current seed (string, nsites0) combination in the hash_table, for all nsites0, unless that seed was already in the hash_table: */ hash_insert_str(str_seed, evaluated_seed_ht); update_s_point_heaps(s_points, str_seed, n_nsites0); myfree(str_seed); } /* subsequence */ } /* sequence */ #ifdef PARALLEL reduce_across_heaps(s_points, n_nsites0); #endif // PARALLEL // Print the sites predicted using the seed after subsequence search, for // each of the starting points, if requested: if (dataset->print_pred) { int sp_idx; for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) { // Retrieve the best seed, from the heap: HEAP *heap = s_points[sp_idx].seed_heap; // Only print sites for the s_point if its heap was non-empty: if (get_num_nodes(heap) > 0) { SEED *best_seed = (SEED *)get_node(heap, get_best_node(heap)); char *seed = get_str_seed(best_seed); /* Print the sites predicted using the motif corresponding to that seed, according to the sequence model being used: */ int nsites0 = s_points[sp_idx].nsites0; fprintf(stdout, "PREDICTED SITES AFTER SUBSEQUENCE SEARCH WITH W = %i " "NSITES = %i MOTIF = %i\n", w, nsites0, dataset->imotif); int n_maxima = ps(dataset, w); // upper bound on number of maxima P_PROB psites = (P_PROB) mymalloc(n_maxima * sizeof(p_prob)); n_maxima = get_pred_sites(psites, mtype, w, seed, ltheta[1], lmap, dataset, ic); print_site_array(psites, nsites0, stdout, w, dataset); myfree(psites); } // get_num_nodes > 0 } //sp_idx } // print_pred if (TRACE){ printf("Tested %d possible starts...\n", n_starts); } myfree(maxima); } // subseq7 /**********************************************************************/ /* next_pY Compute the value of p(Y_ij | theta_1^{k+1}) from p(Y_ij | theta_1^{k} and the probability of first letter of Y_ij given theta_1^k, p(Y_ij^0 | theta_1^k). */ /**********************************************************************/ static void next_pY( DATASET *dataset, /* the dataset */ LOG_THETAG_TYPE(theta_1), /* integer log theta_1 */ int w, /* width of motif */ int *theta_0, /* first column of previous theta_1 */ int pYindex /* which pY array to use */ ) { int i, k; int *theta_last = theta_1[w-1]; /* last column of theta_1 */ int n_samples = dataset->n_samples; SAMPLE **samples = dataset->samples; for (i=0; i < n_samples; i++) { /* sequence */ SAMPLE *s = samples[i]; /* sequence */ int lseq = s->length; /* length of sequence */ char *res = pYindex<2 ? s->res : s->resic; /* integer sequence */ int *pY = s->pY[pYindex]; /* log p(Y_j | theta_1) */ char *r = res+lseq-1; /* last position in sequence */ char *r0 = res+lseq-w-1; /* prior to start of last subsequence */ int j, p; if (lseq < w) continue; /* skip if sequence too short */ /* calculate p(Y_ij | theta_1) */ int *pY_shifted_1 = pY - 1; for (j=lseq-w; j>0; j--) { pY[j] = pY_shifted_1[j] + theta_last[(int)(*r--)] - theta_0[(int)(*r0--)]; } /* calculate log p(Y_i0 | theta_1) */ p = 0; r = res; for (k=0; ksamples; /* datset samples */ int n_samples = dataset->n_samples; /* number samples in dataset */ int n_maxima = 0; /* number of maxima found */ /* find the position with maximum pY in each sequence */ for (i=0; ilength; int last_j = lseq-w; /* start of last subseq */ int *pY = s->pY[0]; /* log p(Y_j | theta_1) */ char *pYic = s->pYic; /* site on - strand */ int *log_not_o = s->log_not_o; // Pr(site) * Pr(no overlap) int max_j = 0; /* best offset */ int max = pY[0] + log_not_o[0]; /* initial maximum */ if (lseq < w) continue; /* skip if too short */ for (j=0; j<=last_j; j++) { /* subsequence */ if (pY[j] + log_not_o[j] > max) { /* new maximum found */ max_j = j; // FIXME: We are assumming that priors are always symmetrical here. max = pY[j] + log_not_o[j]; // log (pY * Pr(site) * Pr(no overlap)) } /* new maximum */ } /* subsequence */ /* record the maximum for this sequence */ maxima[n_maxima].x = i; maxima[n_maxima].y = max_j; maxima[n_maxima].ic = ic && pYic[max_j]; /* on - strand */ maxima[n_maxima].prob = max; n_maxima++; } /* sequence */ return n_maxima; } /* global_max */ /**********************************************************************/ /* local_max Find the local maxima of pY * log_not_o subject to the constraint that they are separated by at least w positions. Returns the number of local maxima found and the updated array of maxima positions. */ /**********************************************************************/ static int local_max( DATASET *dataset, /* the dataset */ int w, /* length of sites */ P_PROB maxima, /* array of encoded site starts of local maxima */ BOOLEAN ic /* use reverse complement, too */ ) { int i, j, k, next_j, n_maxima; SAMPLE **samples = dataset->samples; /* datset samples */ int n_samples = dataset->n_samples; /* number samples in dataset */ /* Find the non-overlapping local maxima of p(Y_ij | theta_1) */ n_maxima = 0; for (i=0; ilength; /* length of sequence */ int *pY = s->pY[0]; /* log p(Y_j | theta_1) */ int *log_not_o = s->log_not_o; // Pr(site) * Pr(no overlap) int last_j = lseq-w; /* last possible site */ int max = pY[0]+log_not_o[0]; /* initial maximum */ if (lseq < w) continue; /* skip if too short */ maxima[n_maxima].x = i; /* candidate */ maxima[n_maxima].y = 0; /* candidate site */ maxima[n_maxima].prob = max; next_j = MIN(w, last_j+1); /* next possible maximum */ for (j=0; j<=last_j; j++) { /* subsequence */ // FIXME: We are assumming that priors are always symmetrical here. int prob = pY[j]+log_not_o[j]; /* log (pY * Pr(site) * Pr(no overlap)) */ if (j==next_j) n_maxima++; /* candidate not exceeded */ if (j==next_j || prob>max) { /* create/overwrite */ max = prob; /* new max */ maxima[n_maxima].x = i; /* overwrite the candidate */ maxima[n_maxima].y = j; /* site */ maxima[n_maxima].prob = max; /* max */ next_j = MIN(j+w, last_j+1); /* next possible maximum */ } /* create/overwrite candidate */ } n_maxima++; /* record last maxima */ } /* set the strand and position */ for (k=0; kpYic[j]; /* on - strand */ } /* n_maxima */ return n_maxima; } /* local_max */ /**********************************************************************/ /* pY_compare Compare the pY of two start sequences. Return <0 0 >0 if the first pY is <, =, > the first pY. */ /**********************************************************************/ extern int pY_compare( const void *v1, const void *v2 ) { double result; const struct p_prob * s1 = (const struct p_prob *) v1; const struct p_prob * s2 = (const struct p_prob *) v2; if ((result = s2->prob - s1->prob) != 0) { return (result<0) ? -1 : +1; } else if ((result = s2->x - s1->x) != 0) { return result; } else { return s2->y - s1->y; } } /**********************************************************************/ /* init_theta_1 Convert a subsequence to a motif. Uses globals: */ /**********************************************************************/ static void init_theta_1( int w, /* width of site */ char *res, /* (encoded) letters of subsequence */ LOG_THETAG_TYPE(theta_1), /* theta_1 */ int lmap[MAXALPH][MAXALPH] /* matrix of frequency vectors */ ) { int m; for (m=0; malength; /* lenght of alphabet */ double *back = dataset->back; /* background frequencies */ SAMPLE **samples = dataset->samples; /* the sequences */ double counts[MAXSITE][MAXALPH]; /* array to hold observed counts */ double wN; /* weighted number of sites */ double log_pop; /* log product of p-values */ double min_ic = dataset->min_ic; /* min. per-column IC */ /* initialize letter counts to 0 */ wN = 0; /* weighted number of sites */ for (i=0; ilength-w-y : y; /* - strand offset from rgt. */ char *res = ic ? s->resic+off : s->res+off; /* integer sequence */ double sw = s->sw; /* sequence weight */ // // TLB: Note that log_not_o contains Pr(site) scaled to have max=1 // when called from subseq7() but not when called from discretize(). // // Why not revert to not_o[y] here? TLB: Because the other one works // much better, although its kind of a hack. //double esw = sw * s->not_o[y]; // Pr(site not overlapped) // // FIXME: We are assumming that priors are always symmetrical here. double esw = sw * INT_DELOG(s->log_not_o[y]); // Pr(site not overlapped) * Pr(site) wN += esw; /* total sequence wgt */ /* residue counts */ for (j=0; jpal) palindrome(counts, counts, w, alength); // Updated on 13-12-06: Only calculate objective function score if the // current s_point is supposed to be evaluated: if (s_points[i_nsites0].evaluate) { /* convert COUNTS to FREQUENCIES and calculate log likelihood ratio */ log_pop = 0; /* product of p-values */ for (i=0; iuse_llr) { // Using llr instead of pop: col_scores[i] = log_pv; log_pop -= llr; } else { log_pop += col_scores[i] = log_pv; } } /* position in site */ RND(log_pop, RNDDIG, log_pop); /* print the start sequence and other stuff */ if (TRACE) { if (eseq) { char seq[MAXSITE+1]; r2seq(seq, eseq, w); fprintf(stdout, "( %3d %3d ) ( %*.*s ) %.*s logpop %8.3f nsites0 %6d\n", iseq+1, ioff+1, MSN, MSN, name, w, seq, -log_pop, nsites0); } else { fprintf(stdout, "l_off %3d w %d logpop %8.3f nsites0 %6d\n", iseq, w, -log_pop, nsites0); } } /* save the best start */ if (-log_pop > s_points[i_nsites0].score) { /* Save the starting point and offset so we can re-calculate eseq later. */ s_points[i_nsites0].iseq = iseq; s_points[i_nsites0].ioff = ioff; s_points[i_nsites0].e_cons0 = eseq; s_points[i_nsites0].wgt_nsites = wN; s_points[i_nsites0].score = -log_pop; } } // Evaluating only if told to do so. } /* nsites0 */ return n_starts; } /* score_llr_pop */ /**********************************************************************/ /* align_top_subsequences Align the top nsites0 subsequences for each value of nsites0 and save the alignments with the best values according to the objective function. Returns number of values of nsites0 tried. */ /**********************************************************************/ extern int align_top_subsequences( MOTYPE mtype, /* type of model */ int w, /* width of motif */ DATASET *dataset, /* the dataset */ int iseq, /* sequence number of starting point */ int ioff, /* sequence offset of starting point */ char *eseq, /* integer encoded subsequence */ char *name, /* name of sequence */ int n_nsites0, /* number of nsites0 values to try */ int n_maxima, /* number of local maxima */ P_PROB maxima, /* sorted local maxima indices */ double *col_scores, /* column scores for last start point */ S_POINT s_points[] /* array of starting points */ ) { return(score_llr_pop(mtype, w, dataset, iseq, ioff, eseq, name, n_nsites0, n_maxima, maxima, col_scores, s_points)); } /* align_top_subsequences */ /* * Local Variables: * mode: c * c-basic-offset: 2 * End: */