/***************************************************************************** * FILE: karlin.c * AUTHOR: Timothy Bailey (adapted from karlin.c, see below) * CREATE DATE: 4/23/2002 * PROJECT: shared * COPYRIGHT: 2002, University of Queensland * VERSION: $Revision: 1.1.1.1 $ * DESCRIPTION: Substitution matrices: scoring matrices and target frequencies *****************************************************************************/ #include "karlin.h" /**************** Statistical Significance Parameter Subroutine **************** Version 1.0 February 2, 1990 Version 2.0 March 18, 1993 Program by: Stephen Altschul Address: National Center for Biotechnology Information National Library of Medicine National Institutes of Health Bethesda, MD 20894 Internet: altschul@ncbi.nlm.nih.gov See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical Significance of Molecular Sequence Features by Using General Scoring Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268. Computes the parameters lambda and K for use in calculating the statistical significance of high-scoring segments or subalignments. The scoring scheme must be integer valued. A positive score must be possible, but the expected (mean) score must be negative. A program that calls this routine must provide the value of the lowest possible score, the value of the greatest possible score, and a pointer to an array of probabilities for the occurence of all scores between these two extreme scores. For example, if score -2 occurs with probability 0.7, score 0 occurs with probability 0.1, and score 3 occurs with probability 0.2, then the subroutine must be called with low = -2, high = 3, and pr pointing to the array of values { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide pointers to lambda and K; the subroutine will then calculate the values of these two parameters. In this example, lambda=0.330 and K=0.154. The parameters lambda and K can be used as follows. Suppose we are given a length N random sequence of independent letters. Associated with each letter is a score, and the probabilities of the letters determine the probability for each score. Let S be the aggregate score of the highest scoring contiguous segment of this sequence. Then if N is sufficiently large (greater than 100), the following bound on the probability that S is greater than or equal to x applies: P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ]. In other words, the p-value for this segment can be written as 1-exp[-KN*exp(-lambda*S)]. This formula can be applied to pairwise sequence comparison by assigning scores to pairs of letters (e.g. amino acids), and by replacing N in the formula with N*M, where N and M are the lengths of the two sequences being compared. In addition, letting y = KN*exp(-lambda*S), the p-value for finding m distinct segments all with score >= S is given by: 2 m-1 -y 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e Notice that for m=1 this formula reduces to 1-exp(-y), which is the same as the previous formula. long low; * Lowest score (must be negative) * long high; * Highest score (must be positive) * double *pr; * Probabilities for various scores * double *lambda; * Pointer to parameter lambda * double *K; * Pointer to parmeter K * double *H; * Pointer to parmeter H * *******************************************************************************/ Boolean karlin(long low,long high,double *pr,double *lambda,double *K,double *H) { long i,j,range,lo,hi,first,last; double up,new,sum,Sum,av,beta,oldsum,ftemp; // tlb; set ratio so SGI compiler won't complain double ratio = 1; double *p,*P,*ptrP,*ptr1,*ptr2; /* Check that scores and their associated probabilities are valid */ if (low>=0) { fprintf(stderr,"Lowest score must be negative.\n"); return FALSE; } for (i=range=high-low;i> -low && !pr[i];--i); if (i<= -low) { fprintf(stderr,"A positive score must be possible.\n"); return FALSE; } for (sum=i=0;i<=range;sum+=pr[i++]) if (pr[i]<0) { fprintf(stderr,"Negative probabilities not allowed.\n"); return FALSE; } if (sum<0.99995 || sum>1.00005) fprintf(stderr,"Probabilities sum to %.4f. Normalizing.\n",sum); (void)NEW(p,range+1,double); for (Sum=low,i=0;i<=range;++i) Sum+=i*(p[i]=pr[i]/sum); if (Sum>=0) { fprintf(stderr,"Invalid (non-negative) expected score: %.3f\n",Sum); free(p); return FALSE; } /* Calculate the parameter lambda */ up=0.5; do { up*=2; ptr1=p; beta=exp(up); ftemp=exp(up*(low-1)); for (sum=i=0;i<=range;++i) sum+= *ptr1++ * (ftemp*=beta); } while (sum<1.0); for (*lambda=j=0;j<25;++j) { new=(*lambda+up)/2.0; beta=exp(new); ftemp=exp(new*(low-1)); ptr1=p; for (sum=i=0;i<=range;++i) sum+= *ptr1++ * (ftemp*=beta); if (sum>1.0) up=new; else *lambda=new; } /* Calculate the pamameter K */ ptr1=p; ftemp=exp(*lambda*(low-1)); for (av=0,i=low;i<=high;++i) av+= *ptr1++ *i*(ftemp*=beta); *H= *lambda*av/log(2.0); Sum=lo=hi=0; (void)NEW(P,KARLINMAXIT*range+1,double); for (*P=sum=oldsum=j=1;j<=KARLINMAXIT && sum>0.001;Sum+=sum/=j++) { first=last=range; for (ptrP=P+(hi+=high)-(lo+=low);ptrP>=P;*ptrP-- =sum) { ptr1=ptrP-first; ptr2=p+first; for (sum=0,i=first;i<=last;++i) sum+= *ptr1-- * *ptr2++; if (first) --first; if (ptrP-P<=range) --last; } ftemp=exp(*lambda*(lo-1)); for (sum=0,i=lo;i;++i) sum+= *++ptrP * (ftemp*=beta); for (;i<=hi;++i) sum+= *++ptrP; ratio=sum/oldsum; oldsum=sum; } for (;j<=200;Sum+=oldsum/j++) oldsum*=ratio; for (i=low;!p[i-low];++i); for (j= -i;i1;) if (p[++i-low]) j=karlin_gcd(j,i); *K = (j*exp(-2*Sum))/(av*(1.0-exp(- *lambda*j))); free(p); free(P); return TRUE; /* Parameters calculated successfully */ } double ExpectedInformation(a_type A, double lambda, double *freq) { long i,j; double sum,tot,fij,eij,mij; sum = tot = 0.0; for (i=1; i<=20; i++){ for (j=1; j<=20; j++) { mij = valAlphaR(i,j,A); fij = freq[i]*freq[j]; tot += fij; eij = mij*fij*exp(lambda*mij); sum += eij; } } return(lambda*sum/tot); } long karlin_gcd(long a,long b) { long c; if (b<0) b= -b; if (b>a) { c=a; a=b; b=c; } for (;b;b=c) { c=a%b; a=b; } return a; }