/* * $Id: dpalign.c 61 2005-07-29 00:16:55Z nadya $ * * $Log$ * Revision 1.1 2005/07/29 00:18:53 nadya * Initial revision * */ /*#define DEBUG*/ /* dpalign.c */ /************************************************************************ * Copyright * * (1999) The Regents of the University of California. * * All Rights Reserved. * * Author: Timothy L. Bailey * * * Permission to use, copy, modify, and distribute any part of * * this software for educational, research and non-profit purposes,* * without fee, and without a written agreement is hereby granted, * * provided that the above copyright notice, this paragraph and * * the following three paragraphs appear in all copies. * * * * Those desiring to incorporate this software into commercial * * products or use for commercial purposes should contact the * * Technology Transfer Office, University of California, San Diego,* * 9500 Gilman Drive, La Jolla, California, 92093-09m, * * Ph: (619) 534 5815. * * * * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO * * ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR * * CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF * * THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA * * HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE * * UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE * * MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. * * THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND * * EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT * * THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT, * * TRADEMARK OR OTHER RIGHTS. * ************************************************************************/ /* 10-3-99 tlb; replace nrdb with back */ #ifdef DPALIGN_SO #define DEFINE_GLOBALS #endif #include "meme.h" #include "general.h" /* data structures */ typedef struct dp_table_entry { /* entry in dp_table */ double v; /* score: max(e, f, g) */ double e; /* score when gap in first seq here */ double f; /* score when gap in second seq here */ double g; /* score when no gap at this position */ } DP_TABLE_ENTRY; typedef DP_TABLE_ENTRY **DP_TABLE; /* rows: seq1 colums: seq2 */ #define MATCH(i, j) logodds((i)-1, (int)eseq[(j)-1]) /******************************************************************************/ /* dp_align Align a sequence to a logodds matrix using dynamic programming. Returns a string containing the two aligned sequences each terminated by a null. */ /******************************************************************************/ extern char *dp_align( int alength, /* length of alphabet */ int n, /* length of logodds matrix */ LOGODDS logodds, /* logodds matrix */ char *seq1, /* consensus of logodds */ int m, /* length of sequence */ char *eseq, /* integer encoded sequence */ double wg, /* gap cost (initialization) */ double ws, /* space cost (extension) */ BOOLEAN endgaps /* penalize end gaps if TRUE */ ) { int i, j, k, ii; int gaptype; /* used in traceback */ DP_TABLE dpt; /* dyn. prog. table */ int best_i, best_j; /* best endpoint */ char *align1=NULL, *align2=NULL; /* create the dynamic programming table */ create_2array(dpt, DP_TABLE_ENTRY, n+1, m+1); /* initialize the table */ dpt[0][0].v = dpt[0][0].f = 0; for (i=0; i<=n; i++)dpt[i][0].v = dpt[i][0].f = dpt[i][0].g = dpt[i][0].e = 0; for (j=0; j<=m; j++)dpt[0][j].v = dpt[0][j].f = dpt[0][j].g = dpt[0][j].e = 0; if (endgaps) { /* penalize end gaps */ for (i=1; i<=n; i++) dpt[i][0].v = dpt[i][0].e = -wg - i*ws; for (j=1; j<=m; j++) dpt[0][j].v = dpt[0][j].f = -wg - j*ws; } else { /* don't penalize end gaps */ for (i=1; i<=n; i++) dpt[i][0].v = dpt[i][0].e = 0; for (j=1; j<=m; j++) dpt[0][j].v = dpt[0][j].f = 0; } /* fill up the table row by row; note: row in logodds matrix corresponds to a position in seq1 */ best_i = 1; best_j = m; for (i=1; i<=n; i++) { /* row: logodds matrix col */ for (j=1; j<=m; j++) { /* column: seq2 */ dpt[i][j].e = MAX(dpt[i][j-1].e, dpt[i][j-1].v-wg)-ws; dpt[i][j].f = MAX(dpt[i-1][j].f, dpt[i-1][j].v-wg)-ws; dpt[i][j].g = dpt[i-1][j-1].v + MATCH(i,j); dpt[i][j].v = MAX(dpt[i][j].e, MAX(dpt[i][j].f, dpt[i][j].g)); if (!endgaps && (j==m || i==n) && dpt[i][j].v>dpt[best_i][best_j].v) { best_i = i; best_j = j; } } /* column */ } /* row */ #ifdef DEBUG /* print the dp table */ printf("dp table\n"); for(i=-1; i<=m; i++) printf("%-3c %3.3s ", i==0 ? '-' : i>0 ? unhash(eseq[i-1]) : ' ', " "); printf("\n"); for (i=0; i<=n; i++) { printf("%3c ", i>0 ? seq1[i-1] : '-'); for (j=0; j<=m; j++) { printf("%3.0f %3.0f ", dpt[i][j].g, dpt[i][j].f); } printf("\n"); printf("%3.3s ", " "); for (j=0; j<=m; j++) { printf("%3.0f %3.0f ", dpt[i][j].e, dpt[i][j].v); } printf("\n\n"); } printf("best_i = %d best_j = %d\n", best_i, best_j); #endif /* trace the optimal alignment and put into arrays align1 and align2 */ Resize(align1, m+n+1, char); Resize(align2, m+n+1, char); k = 0; /* pointer into alignment */ gaptype = 0; if (!endgaps) { /* print end gap */ for (i=n; i>best_i;) { align1[k] = seq1[--i]; align2[k++] = ' '; gaptype = (dpt[i][m].v-wg-ws == dpt[i+1][m].f) ? 0 : 2; } for (j=m; j>best_j;) { align1[k] = ' '; align2[k++] = unhash(eseq[--j]); gaptype = (dpt[n][j].v-wg-ws == dpt[n][j+1].e) ? 0 : 1; } } /* endgap */ for (i=endgaps?n:best_i, j=endgaps?m:best_j; i>0 && j>0 && k<=m+n; ) { #ifdef DEBUG printf("i %d j %d k %d e %f f %f g %f v %f gt %d %c %c\n", i,j,k, dpt[i][j].e, dpt[i][j].f, dpt[i][j].g, dpt[i][j].v, gaptype, seq1[i-1], unhash(eseq[j-1])); #endif switch (gaptype) { case 0: /* not in gap */ if (dpt[i][j].g > dpt[i][j].e) { if (dpt[i][j].g > dpt[i][j].f) { /* match */ align1[k] = seq1[--i]; align2[k++] = unhash(eseq[--j]); } else { /* gap in seq2 */ align1[k] = seq1[--i]; align2[k++] = ' '; gaptype = (dpt[i][j].v-wg-ws == dpt[i+1][j].f) ? 0 : 2; } } else { if (dpt[i][j].e > dpt[i][j].f) { /* gap in seq1 */ align1[k] = ' '; align2[k++] = unhash(eseq[--j]); gaptype = (dpt[i][j].v-wg-ws == dpt[i][j+1].e) ? 0 : 1; } else { /* gap in seq2 */ align1[k] = seq1[--i]; align2[k++] = ' '; gaptype = (dpt[i][j].v-wg-ws == dpt[i+1][j].f) ? 0 : 2; } } break; case 1: /* in seq1 gap */ align1[k] = ' '; align2[k++] = unhash(eseq[--j]); gaptype = (dpt[i][j].v-wg-ws == dpt[i][j+1].e) ? 0 : 1; break; case 2: /* in seq2 gap */ align1[k] = seq1[--i]; align2[k++] = ' '; gaptype = (dpt[i][j].v-wg-ws == dpt[i+1][j].f) ? 0 : 2; break; } /* gaptype */ } /* i, j, k */ /* fill in remainder of seq1 */ for (ii=i; ii>0; ii--) { align1[k] = seq1[ii-1]; align2[k++] = ' '; } for (ii=j; ii>0; ii--) { align2[k] = unhash(eseq[ii-1]); align1[k++] = ' '; } /* reverse the alignment */ for (i=0; ifinal_pos */ alignment[i+1][l] = seq2[i][j]; } } /* seq2 */ #ifdef DEBUG /* print alignment */ printf("alignment:\n"); printf("%10.10s .%s\n", "motif", alignment[0]); for (i=1; i=left && ma[0][i] != ' ') lalign++; /* alignment length */ if (ngaps[i] > g) lalign = 0; /* bad column--too many gaps */ if (pos>=left && lalign>best_w) { /* within motif and better */ best_end = pos; best_w = lalign; } if (ma[0][i] != ' ') pos++; /* next position in motif */ if (pos>right) break; /* at right edge */ } /* position in alignment & motif */ if (best_w >= minw) break; /* found a legal one */ } /* g */ best_w = MAX(best_w, minw); /* make sure at least min_w */ /* set up return values */ *w = best_w; *off = best_end-best_w+1-left; /* relative to leftmost start */ /* printf("g_align: Old w %d nsites %d Best w = %d offset = %d with g %d\n", right-left+1, nseq, *w, *off, g); */ /* free space */ myfree(ngaps); return(g); } /* g_align */ /******************************************************************************/ /* dp_multi_align Perform a multiple alignment of a motif versus a set of sequences. Uses MEME data structures. Logodds matrix uses back as background frequencies. Returns an array of null terminated, aligned sequences. */ /******************************************************************************/ extern char **dp_multi_align( P_PROB sites, /* the sites */ int nsites, /* the number of sites */ int w, /* width of sites */ int flank, /* add flank cols on left+rgt */ DATASET *dataset /* the dataset */ ) { int i; SAMPLE **samples = dataset->samples; /* sequences */ int alength = dataset->alength; /* dataset */ double wg = dataset->wg; /* gap cost (initialization) */ double ws = dataset->ws; /* space cost (extension) */ BOOLEAN endgaps = dataset->endgaps; /* penalize end gaps if TRUE */ THETA lomap = dataset->lomap; /* seq to theta logodds map */ char **eseqs=NULL; /* encoded sequences */ int *lengths=NULL; /* lengths of sequences */ char **names=NULL; /* names of sequences */ LOGODDS lo=NULL; /* logodds matrix */ char *seq = NULL; /* ascii for first sequence */ char **ma; /* multiple aligment */ /* create the array of sequences */ Resize(eseqs, nsites, char *); Resize(lengths, nsites, int); Resize(names, nsites, char *); /* prepare input for dp_ma_motif_seqs */ for (i=0; ilength; /* length of sequence */ char *eseq = ic ? s->resic : s->res; /* encoded sequence */ int left = MAX(0, y-flank); /* left edge of subsequence */ int right = MIN(lseq-1, y+w+flank-1); /* right edge of subsequence */ eseqs[i] = eseq+left; /* subsequence */ lengths[i] = right-left+1; /* subsequence length */ names[i] = s->sample_name; /* sequence name */ } /* site */ /* convert the first site to a logodds matrix */ create_2array(lo, double, lengths[0], alength+1); init_theta(lo, eseqs[0], lengths[0], lomap, alength); /* get the ascii version of the first site */ Resize(seq, lengths[0]+1, char); for (i=0; i [options]); USAGE(\n\t\t\tfirst sequence to align); USAGE(\t\t\tsecond sequence to align); NON_SWITCH(1, \r, switch (i++) { case 1: seq1 = _OPTION_; break; case 2: seq2 = _OPTION_; break; default: COMMAND_LINE_ERROR; } ); DATA_OPTN(1, wg, , \tgap cost, wg = atof(_OPTION_)); DATA_OPTN(1, ws, , \tspace cost, ws = atof(_OPTION_)); DATA_OPTN(1, m, , \tmatch score, m = atof(_OPTION_)); FLAG_OPTN(1, eg, \tpenalize end gaps, endgaps = TRUE); USAGE(\tGlobally align two sequences.); USAGE(\n\tCopyright); USAGE(\t(1999) The Regents of the University of California); USAGE(\tAll Rights Reserved.); USAGE(\tAuthor: Timothy L. Bailey); ); /* Set up the hashing functions for mapping between letters or codons and alphabet positions. */ setup_hash_alph(PROTEINB); /* PROTEINB to position hash */ alen = strlen(alphabet); /* convert seq1 to logodds matrix */ w = strlen(seq1); create_2array(logodds, LOGODDSB, w, alen); for (i=0; i