/** * @file calculate_p_y.c * * This module relates to generic pY calculations. "pY" is Pr(Y_ij | theta). * The pY calculation functions contained can be applied in multiple scenarios. * In contrast, other modules (eg branching_search.c and subseq7.c) might include * pY calculation functions specific to a process. Eg. branching search uses * dynamic programming, which dictates a specific approach to some pY * calculations. * * $Id: calculate_p_y.c 4278 2009-12-23 09:58:37Z james_johnson $ * * $Log$ * Revision 1.1.2.2 2006/02/09 23:51:15 twhitington * Removed compile-time errors from branching_search module. * * Revision 1.1.2.1 2006/02/07 02:02:52 twhitington * Added calculate_p_y module to repository. * * */ #include "calculate_p_y.h" #include #include #include "meme.h" #include "seed.h" /** * get_pY * * Compute log Pr(Y_ij | theta_1) for all positions (i, j) in the dataset. * */ void get_pY( DATASET *dataset, ///< the dataset int *theta_1[MAXSITE], ///< integer log theta_1 int w, ///< width of motif int pYindex ///< which pY array to use ) { int i, j, k; int n_samples = dataset->n_samples; SAMPLE **samples = dataset->samples; for (i=0; ilength; char *res = pYindex<2 ? s->res : s->resic; /* integer sequence */ int *pY = s->pY[pYindex]; /* log p(Y_j | theta_1) */ if (lseq < w) continue; /* skip if sequence too short */ for (j=0; j<=lseq-w; j++) { /* site start */ char *r = res + j; int p = 0; for (k=0; klength; int *pY = s->pY[0]; /* log p(Y_j | theta_1) both */ int *pY1 = s->pY[1]; /* log p(Y_j | theta_1) + strand */ int *pY2 = s->pY[2]; /* log p(Y_j | theta_1) - strand */ char *pYic = s->pYic; /* site on - strand */ int last_j = lseq-w; /* last possible site */ if (lseq < w) continue; /* skip if sequence too short */ int j, k; for (j=0, k=last_j; j<=last_j; j++, k--) { /* site start */ if (pY2[k] > pY1[j]) { /* site on - strand */ pYic[j] = '\1'; pY[j] = pY2[k]; } else { /* site on + strand */ pYic[j] = '\0'; pY[j] = pY1[j]; } } /* site start */ } /* sequence */ } // Combine strands /* NOTE: The concept of an "lmotif" is somewhat redundant with the * already-existing concept of an "ltheta". Merging these concepts and * adapting them to a more "object-oriented" philosophy could lead to clearer * code. However, this would require substantial revision of the existing code * where "ltheta" was already used (before I started working on MEME). */ /** * create_lmotif * * Create a motif (of iteger-converted log letter probabilities) corresponding * to the specified seed (in string format). This function "fills in" the array * of int pointers, lmotif, so that each pointer points to an array in lmap. * */ extern void create_lmotif ( char *seed_str, ///< The seed as a string int lmap[MAXALPH][MAXALPH], ///< Matrix representing a seq-to-theta mapping fn int *lmotif[MAXSITE], ///< The resulting lmotif - OUT int *mot_width ///< The width of the motif - OUT ) { // Retrieve the integer-encoded representation of the seed: char *seed_enc = NULL; int seed_len; seed_enc = to_e_seed(seed_str, &seed_len); // Allocate space for the lmotif data structure: /* int **columns = NULL; */ /* Resize(columns, seed_len, int *); */ /* int col_idx; */ /* for (col_idx = 0; col_idx < seed_len; col_idx++) { */ /* int *curr_col; */ /* Resize(curr_col, ALPH, int); */ /* columns[col_idx] = curr_col; */ /* } */ // Convert the encoded seed to an lmotif, thus filling in the memory // allocated for the "lmotif" data structure: init_lmotif(seed_len, seed_enc, lmotif, lmap); *mot_width = seed_len; myfree(seed_enc); } // create_lmotif /** * init_lmotif * * Convert an integer-encoded subsequence to a motif. * * Memory must be pre-allocated for theta_1. * */ extern void init_lmotif ( int w, ///< width of site char *res, ///< Integer encoded letters of subsequence int *theta_1[MAXSITE], ///< theta_1 int lmap[MAXALPH][MAXALPH] ///< matrix of frequency vectors ) { int m; for (m=0; m Obtain it now: int *lmotif[MAXSITE]; int mot_width; create_lmotif(new_seed, lmap, lmotif, &mot_width); assert(mot_width == new_seed_len); // Update the array of pY values given the new seed. This is where the // dynamic programming occurs: if (!ic) { //fprintf(stderr, "NPY....\n"); next_pY_branching(lmotif, s_diffs, dataset, 0); //get_pY(dataset, lmotif, s_diffs->length_new, 0); } else { next_pY_branching(lmotif, s_diffs, dataset, 1); next_pY_branching(lmotif, s_diffs, dataset, 2); } /* Combine the log Pr(Y_ij | theta_1) values from each strand if using * the inverse complement: */ if (ic) { combine_strands(dataset->samples, dataset->n_samples, new_seed_len); } // Get a list of "maxima" under the specified sequence model. These maxima // will indicate where the pY values are highest: int n_maxima = ps(dataset, new_seed_len); // upper bound on # maxima P_PROB maxima = (P_PROB) mymalloc(n_maxima * sizeof(p_prob)); n_maxima = get_max(mtype, dataset, new_seed_len, maxima, ic, TRUE); // Obtain the S_POINT array of the new seed's length, from the s_point matrix: S_POINT *sp_arr = get_sp_arr(sp_mat, new_seed_len); int n_nsites = sp_get_num_cols(sp_mat); /* Evaluate the objective function, LLR_POP, given the alignment of "maxima". This will result in a score being entered in each S_POINT from the above array... */ int iseq = -1; // Dummy int ioff = -1; // Dummy char *eseq = NULL; // Dummy char *name = NULL; // Dummy double col_scores[MAXSITE]; // Dummy // Force align_top_subsequences to set the score of every s_point: int sp_idx; for (sp_idx = 0; sp_idx < n_nsites; sp_idx++) { sp_arr[sp_idx].score = LITTLE; } align_top_subsequences( mtype, new_seed_len, dataset, iseq, ioff, eseq, name, n_nsites, n_maxima, maxima, col_scores, sp_arr ); // Maxima not required anymore => delete them: myfree(maxima); // Update the heap in each of the relevant S_POINTs, adding a new "SEED" // object with the appropriate score the the S_POINT's heap in each case: update_s_point_heaps(sp_arr, new_seed, n_nsites); } // evaluate_seed_DP /** * next_pY_branching * * Next_pY_branching computes log Pr(Y_ij | theta_1) for all i and j in the * dataset. * It uses dynamic program to perform this computation, reusing the pY results * obtained for the previous seed evaluated. * * This function updates the "pY" arrays by considering the DIFFERENCES of * the seed of interest with respect to the seed for which pY is currently * evaluated. * * NOTE: Readability of this function is slightly compromised in order * to optimise the speed of the function. * * NOTE: This function was developed in order to support the dynamic programming * performed in branching search. HOWEVER, it can equally be applied to * substring search, although this approach has not yet been implemented. * */ extern void next_pY_branching( int *lmotif_new[MAXSITE], ///< The lmotif corresponding to the new seed. SEED_DIFFS *s_diffs, ///< The differences between the new and old seeds. DATASET *dataset, ///< The dataset of sequences. Contains the pY arrays. int pYindex ///< Variable indicating which pY array to use. ) { // Retrieve the information about the differences that are specifically // required by the "guts" of this function: int n_diffs = get_n_diffs(s_diffs); // Currently this function only caters for 1 difference or 2 differences: assert ((n_diffs == 1) || (n_diffs == 2)); int *diff_idxs = get_diff_idxs(s_diffs); int **diff_cols = get_diff_cols(s_diffs); int diff_idx1 = 0; int diff_idx2 = 0; int *diff_col1 = NULL; int *diff_col2 = NULL; if (n_diffs == 1) { diff_idx1 = diff_idxs[0]; diff_col1 = diff_cols[0]; } else { diff_idx1 = diff_idxs[0]; diff_col1 = diff_cols[0]; diff_idx2 = diff_idxs[1]; diff_col2 = diff_cols[1]; } int new_old_shift = get_seed_shift(s_diffs); int old_seed_len, new_seed_len; get_seed_lengths(s_diffs, &old_seed_len, &new_seed_len); /* Currently, if there is a shift between the new seed and the old seed, then ALL the differences between the old and new seed must be accounted for by gaps. Otherwise this function is unable to compute pY for the new seed. I don't yet know how to improve on this, but I doubt it is necessary to do so anyway: */ if (new_old_shift != 0) { int n_gaps = abs(new_seed_len + new_old_shift - old_seed_len) + abs(new_old_shift); assert(n_gaps == n_diffs); } SAMPLE **samples = dataset->samples; int n_samples = dataset->n_samples; /* If the new and old seeds are not of equal length and with zero offset, then there will be some pY values at the end(s) of each sequence, which must be calculated "from scrath". Figure out how many such positions there are: */ int n_start_pos = MAX(0, new_old_shift); int n_end_pos = MAX(0, (old_seed_len - new_seed_len - new_old_shift)); /* If the new seed is unshifted with respect to the previous seed, then * set each pY by simply iterating through dataset and using * Pr(Yij|old theta) in order to determine Pr(Yij|new theta) in each case: */ if (new_old_shift == 0) { int seq_idx; for (seq_idx=0; seq_idx < n_samples; seq_idx++) { // Sequences in dataset SAMPLE *curr_seq = samples[seq_idx]; // The current sequence int lseq = curr_seq->length; // Length of current sequence // Integer encoding of the sequence: char *res = pYindex<2 ? curr_seq->res : curr_seq->resic; int *pY = curr_seq->pY[pYindex]; // log p(Y_j | theta_1) if (lseq < new_seed_len) continue; // Skip if sequence is too short // Shift the sequence (adjusting according to the indeces at which the // current and previous motifs differ) now, rather than inside inner // loop... // Residues - for the current sequence position - at which the new and old // seeds differ: char *diff_1 = res+diff_idx1; char *diff_2 = NULL; if (n_diffs == 2) { diff_2 = res+diff_idx2; } int lett_idx; for (lett_idx=0; lett_idx <= (lseq - new_seed_len - n_end_pos); lett_idx++) { if (n_diffs == 1) { /* Only one column differs compared to the previous motif for which pY * was evaluated. Evaluate pY (for all i,j in the dataset) * appropriately. It is found by adjusting the previous * pY value by the amount indicated in the difference column for the * letter at that column in the current subsequence... */ pY[lett_idx] = pY[lett_idx] + diff_col1[(int)(*diff_1++)]; } else { /* There are two columns differing between this motif and the previous * motif evaluated. Evaluate each pY taking this into consideration. * It is found by adjusting the previous pY value by the amounts * indicated in the TWO difference columns for the letters at those * columns in the current subsequence... */ pY[lett_idx] = pY[lett_idx] + diff_col1[(int)(*diff_1++)] + diff_col2[(int)(*diff_2++)]; } // examining col_b_idx } // offset lett_idx } // sequence seq_idx } else if (new_old_shift < 0) { /* The start of the new seed is shifted "to the left" of the start of the old seed => iterate forwards from the beggining of the sequence, and then deal with remaining positions (that don't have a valid score from the old sequence) explicitly: */ int seq_idx; for (seq_idx=0; seq_idx < n_samples; seq_idx++) { SAMPLE *curr_seq = samples[seq_idx]; // The current sequence int lseq = curr_seq->length; // Length of current sequence // Integer encoding of the sequence: char *res = pYindex<2 ? curr_seq->res : curr_seq->resic; int *pY = curr_seq->pY[pYindex]; // p(Y_j | theta_1) if (lseq < new_seed_len) continue; // Skip if sequence is too short // Shift the sequence (adjusting according to the indeces at which the // current and previous motifs differ) now, rather than inside inner // loop... // Residues - for the current sequence position - at which the new and old // seeds differ: char *diff_1 = res+diff_idx1; char *diff_2 = NULL; if (n_diffs == 2) { diff_2 = res+diff_idx2; } int *pY_shifted = pY - new_old_shift; int lett_idx; for(lett_idx=0; lett_idx < (lseq - old_seed_len + new_old_shift); lett_idx++) { if (n_diffs == 1) { /* Only one column differs compared to the previous motif for which pY * was evaluated. Evaluate pY (for all i,j in the dataset) * appropriately. It is found by adjusting the previous * pY value by the amount indicated in the difference column for the * letter at that column in the current subsequence... */ // (n_diffs == 1) && (shift < 0) => The difference is the single // shift. pY[ij] is set using pY[i(j+1)] under the old seed: pY[lett_idx] = pY_shifted[lett_idx] + diff_col1[(int)(*diff_1++)]; } else { /* There are two columns differing between this motif and the previous * motif evaluated. Evaluate each pY taking this into consideration. * It is found by adjusting the previous pY value by the amounts * indicated in the TWO difference columns for the letters at those * columns in the current subsequence... */ // Remember we have assumed that if there is a shift, all differences // (between seeds) are accounted for by the shift: pY[lett_idx] = pY_shifted[lett_idx] + diff_col1[(int)(*diff_1++)] + diff_col2[(int)(*diff_2++)]; } // examining n_diffs } // offset lett_idx } // sequence seq_idx } else { /* The start of the new seed is shifted "to the right" of the start of the old seed => iterate backwards from the end of the sequence, and then deal with remaining positions (that don't have a valid score from the old seed) explicitly: */ int seq_idx; for (seq_idx=0; seq_idx < n_samples; seq_idx++) { SAMPLE *curr_seq = samples[seq_idx]; // The current sequence int lseq = curr_seq->length; // Length of current sequence // Integer encoding of the sequence: char *res = pYindex<2 ? curr_seq->res : curr_seq->resic; int *pY = curr_seq->pY[pYindex]; // p(Y_j | theta_1) if (lseq < new_seed_len) continue; // Skip if sequence is too short // Shift the sequence (adjusting according to the indeces at which the // current and previous motifs differ) now, rather than inside inner // loop... int first_char_pos = (lseq-new_seed_len-n_end_pos); // Residues - for the current sequence position - at which the new and old // seeds differ: char *diff_1 = res+diff_idx1+first_char_pos; char *diff_2 = NULL; if (n_diffs == 2) { diff_2 = res+diff_idx2+first_char_pos; } int *pY_shifted = pY - new_old_shift; // Scan backwards (because we don't want to "overwrite" the previous // pY values prematurely): int lett_idx; for(lett_idx=first_char_pos; lett_idx >= (n_start_pos); lett_idx--) { if (n_diffs == 1) { // Only 1 column differs compared with the previous motif... pY[lett_idx] = pY_shifted[lett_idx] + diff_col1[(int)(*diff_1--)]; } else { pY[lett_idx] = pY_shifted[lett_idx] + diff_col1[(int)(*diff_1--)] + diff_col2[(int)(*diff_2--)]; } // examining n_diffs } // lett_idx } // seq_idx } // Considering new_old_shift /* pY has now been calculated for all positions that dynamic programming can be used for. However, some positions will remain un-calculated if the new and old seeds were not of equal length with zero offset. These pY values will be calculated now... */ // NOTE: Optimisation by pointer arithmetic is NOT carried out below, as // the number of iterations in the following loops should be low. int seq_idx; for (seq_idx=0; seq_idx < n_samples; seq_idx++) { SAMPLE *curr_seq = samples[seq_idx]; // The current sequence int lseq = curr_seq->length; // Length of current sequence // Integer encoding of the sequence: char *res = pYindex<2 ? curr_seq->res : curr_seq->resic; int *pY = curr_seq->pY[pYindex]; // p(Y_j | theta_1) if (lseq < new_seed_len) continue; // Skip if sequence is too short // Calculate pY values at the start of the sequence: int lett_idx; for (lett_idx = 0; lett_idx < n_start_pos; lett_idx++) { int extra_pY_value = 0; int mot_idx; // Index within the lmotif for the new seed. // Retrieve the log prob from the letter for each position in the site: for (mot_idx = 0; mot_idx < new_seed_len; mot_idx++) { /* for (mot_idx = 0; mot_idx < (lett_idx + new_seed_len); mot_idx++) { */ char curr_lett = res[lett_idx + mot_idx]; // Current letter in seq extra_pY_value += lmotif_new[mot_idx][(int)curr_lett]; } pY[lett_idx] = extra_pY_value; } // lett_idx // Calculate pY values at the end of the sequence: for (lett_idx = (lseq - new_seed_len - n_end_pos + 1); lett_idx <= (lseq - new_seed_len); lett_idx++) { int extra_pY_value = 0; int mot_idx; // Index within the lmotif for the new seed. // Retrieve the log prob from the letter for each position in the site: for (mot_idx = 0; mot_idx < new_seed_len; mot_idx++) { char curr_lett = res[lett_idx + mot_idx]; // Current letter in seq extra_pY_value += lmotif_new[mot_idx][(int)curr_lett]; } pY[lett_idx] = extra_pY_value; } // lett_idx }// seq_idx //trace("Exited npyb...\n"); } // next_pY_branching /* * Local Variables: * mode: c * c-basic-offset: 2 * End: */