#include #include #include #include #include #include #include #include "cl.h" #define DEBUG 0 #define KMAX 1000000 typedef struct { int col1, col2, df; double p0, q0, D, Dprime, R2, CS, CP; } RESULT; typedef struct { int nrow, ncol; char **data; double *y; char **cname; char **rname; } DATA_MATRIX; void FlushResult ( RESULT *result, int k, char *resultfile, DATA_MATRIX *d1, DATA_MATRIX *d2 ); int LD ( DATA_MATRIX *d1, DATA_MATRIX *d2, double pthresh, RESULT *result, char *resultfile ); double EM( int** freq, double tol, int NMAX, double *D, double *Dprime, double *R2, double *P0, double *Q0 ); int chisquare( int ** freq, double *CS, double *P, int *df ); void FreqTable( char *genos1, char *genos2, int N, int **freq ); DATA_MATRIX *ReadData( char *infile ); /* int debug =0; */ int main( int argc, char **argv ) { double *y; char **genos; int Ncol; int N; int col1, col2; int k = 0; int do_LD=0; char resultfile[256]; char infile1[256]; char infile2[256]; char pfile[256]; char phenotype[256]; double pthresh=0.000001; float p=1.0e-6; DATA_MATRIX *d1=NULL, *d2=NULL; RESULT *result = (RESULT*)calloc(KMAX+1, sizeof(RESULT)); phenotype[0] = resultfile[0] = infile1[0] = infile2[0] = pfile[0] = 0; getarg( "-chr1", infile1, argc, argv); getarg( "-chr2", infile2, argc, argv ); getarg( "-pfile", pfile, argc, argv); getarg( "-phenotype", phenotype, argc, argv); getarg( "-output", resultfile, argc, argv ); getfloat( "-pthresh", &p, argc, argv ); getbool( "-LD", &do_LD, argc, argv ); gethelp(argc, argv); d1 = ReadData( infile1 ); pthresh = p; if ( strlen(infile2) && strcmp( infile1, infile2 ) ) d2 = ReadData( infile2 ); else d2 = d1; if ( do_LD ) { if ( d1->nrow == d2->nrow) { if ( d1->nrow > 0 ) { FILE *fp = NULL; if ( fp = fopen(resultfile, "w") ) { fprintf(fp, "%s %s\nSNP1 SNP2 p1 p2 D Dprime R2\n", infile1, infile2); fclose(fp); LD( d1,d2, pthresh, result, resultfile ); fp = fopen(resultfile, "a"); fprintf(fp, "done\n"); fclose(fp); return 0; } else { fprintf(stderr, "ERROR - could not open file %s\n", resultfile ); } } else { fprintf(stderr, "ERROR - no matching subject names\n"); } } else { fprintf(stderr, "ERROR, incompatible inputs %s %s\n", infile1, infile2 ); exit(1); } return 1; } else { if ( d1->nrow == d2->nrow) { if ( d1->nrow > 0 ) { FILE *fp = NULL; if ( fp = fopen(resultfile, "w") ) { fprintf(fp, "%s %s\n", infile1, infile2); fclose(fp); LD( d1,d2, pthresh, result, resultfile ); fp = fopen(resultfile, "a"); fprintf(fp, "done\n"); fclose(fp); return 0; } else { fprintf(stderr, "ERROR - could not open file %s\n", resultfile ); } } else { fprintf(stderr, "ERROR - no matching subject names\n"); } } else { fprintf(stderr, "ERROR, incompatible inputs %s %s\n", infile1, infile2 ); exit(1); } return 1; } } void FreqTable( char *genos1, char *genos2, int N, int **freq ) { int j, i; int tmp[4][4]; for(j=0;j<4;j++) tmp[j][0] = tmp[j][1] = tmp[j][2] = tmp[j][3] = 0; i = N; while(i--) { tmp[*(genos1++)][*(genos2++)] ++; } for(j=0;j<3;j++) for(i=0;i<3;i++) freq[i][j] = tmp[i+1][j+1]; } int LD ( DATA_MATRIX *d1, DATA_MATRIX *d2, double pthresh, RESULT *result, char *resultfile ) { char **genos1 = d1->data; char **genos2 = d2->data; double *y = d1->y; int col1, col2; int N = d1->nrow; int i, ok=0; double missing = nan("char-sequence"); int M=0; int k = 0; int **freq; double tol = 1.0e-6; int NMAX = 20; freq = (int**)calloc(3,sizeof(int*)); for(i=0;i<3;i++) freq[i] = (int*)calloc( 3, sizeof(int)); for( col1=0; col1ncol;col1++) { int col2_start, col2_end; if ( d1 == d2 ) { col2_start = col1+1; col2_end = d2->ncol; } else { col2_start=0; col2_end = d2->ncol; } for(col2=col2_start;col2cname[col1], "rs3683945") && ! strcmp( d2->cname[col2], "rs3661215")) { debug = 1; int n; printf( "%s %s\n", d1->cname[col1], d2->cname[col2] ); for(n=0;n<3;n++) printf("%5d %5d %5d\n", freq[n][0], freq[n][1], freq[n][2] ); } else debug = 0; */ if ( EM( freq, tol, NMAX, &D, &Dprime, &R2, &p0, &q0 ) > 0 ) { result[k].p0 = p0; result[k].q0 = q0; result[k].D = D; result[k].Dprime = Dprime; result[k].R2 = R2; ok = 1; } else { result[k].col1 = col1; result[k].col2 = col2; result[k].p0 = 0.0; result[k].q0 = 0.0; result[k].D = 0.0; result[k].Dprime = 0.0; result[k].R2 = 0.0; } if ( chisquare( freq, &CS, &CP, &df ) ) { /* if ( ! strcmp( d1->cname[col1], "rs3683945") && ! strcmp( d2->cname[col2], "rs3661215")) { int n; printf( "%s %s\n", d1->cname[col1], d2->cname[col2] ); for(n=0;n<3;n++) printf("%5d %5d %5d\n", freq[n][0], freq[n][1], freq[n][2] ); printf( "CS %e CP %e df %d\n", CS, CP, df ); } */ result[k].CS = CS; result[k].CP = CP; result[k].df = df; ok = 1; } else { result[k].CS = 0.0; result[k].CP = 0.0; result[k].df = 0; } /* printf( "%d %d %e %e %e %e\n", k, KMAX, result[k].p0, result[k].D, result[k].Dprime, result[k].R2); */ if ( ok ) k++; if ( k >= KMAX-10 ) { FlushResult( result, k, resultfile, d1, d2 ); k = 0; } } } FlushResult( result, k, resultfile, d1, d2 ); k = 0; return ok; } DATA_MATRIX *ReadData( char *infile ) { FILE *fp; int nrow, ncol; char **data = NULL; char *buffer; char name[256]; int n, m; double x; DATA_MATRIX *d = NULL; if ( fp = fopen( infile, "r" ) ) { if ( fscanf( fp, "%d %d", &nrow, &ncol ) ==2 ) { int i; char tmp[256]; char **cname = (char**)calloc(ncol,sizeof(char*)); char **rname = (char**)calloc(nrow,sizeof(char*)); for(i=0;inrow = n; d->ncol = ncol; d->data = data; d->cname = cname; d->rname = rname; } } return d; } void FlushResult ( RESULT *result, int k, char *resultfile, DATA_MATRIX *d1, DATA_MATRIX *d2 ) { FILE *fp; fprintf(stderr, "flushing to %s\n", resultfile ); if ( fp = fopen( resultfile, "a" ) ) { int j; for(j=0;jcname[result[j].col1], d2->cname[result[j].col2], result[j].p0, result[j].q0, result[j].D, result[j].Dprime, result[j].R2, result[j].CS, result[j].CP, result[j].df ); } fclose(fp); } } int chisquare( int ** freq, double *CS, double *P, int *df ) { double margin1[3], margin2[3]; double expected[3][3]; double total=0.0, X = 0.0; int i, j, n=0, n1=0, n2=0; margin1[0] = margin1[1] = margin1[2] = 0.0; margin2[0] = margin2[1] = margin2[2] = 0.0; for(i=0;i<3;i++) expected[i][0] = expected[i][1] = expected[i][2] = 0.0; for(i=0;i<3;i++) for(j=0;j<3;j++) { margin1[i] += freq[i][j]; margin2[j] += freq[i][j]; total += freq[i][j]; } for(i=0;i<3;i++) { n1 += (margin1[i]>0); n2 += (margin2[i]>0); } *df = (n1-1)*(n2-1); *CS = 0.0; if ( total > 0 && *df > 0 ) { for(i=0;i<3;i++) for(j=0;j<3;j++) { expected[i][j] = margin1[i]*margin2[j]/total; if ( expected[i][j] > 0 ) { X = freq[i][j] - margin1[i]*margin2[j]/total; *CS += X*X/expected[i][j]; n++; /* if ( debug ) printf( "%d %d %5d %e %e %e\n", i, j, freq[i][j], expected[i][j], X, *CS ); */ } } } if ( *df > 0 ) { if ( *CS < 100 ) *P = -log10( 1.0e-20 + gsl_cdf_chisq_Q( *CS, *df )); else *P = 20; return 1; } else { *P = 0.0; *CS = 0.0; *df = 0; return 0; } } double EM( int** freq, double tol, int NMAX, double *D, double *Dprime, double *R2, double *P0, double *Q0 ) { int i, j; double total = freq[0][0] + freq[0][1] + freq[0][2] + freq[1][0] + freq[1][1] + freq[1][2] + freq[2][0] + freq[2][1] + freq[2][2]; /* for(j=0;j<3;j++) { printf ( "%5d %5d %5d\n", freq[j][0],freq[j][1],freq[j][2]); } */ if ( total > 0 ) { double p0 = (freq[0][0] + freq[0][1] + freq[0][2] + ( freq[1][0] + freq[1][1] + freq[1][2])/2.0)/total; double p1 = 1.0 - p0; double q0 = (freq[0][0] + freq[1][0] + freq[2][0] + ( freq[0][1] + freq[1][1] + freq[2][1] )/2.0)/total; double q1 = 1.0 - q0; *P0 = p0 < p1 ? p0 : p1; *Q0 = q0 < q1 ? q0 : q1; /* printf( "p0 %g q0 %g total %g\n", p0, q0, total ); */ if ( p0 > 0 && p1 > 0 && q0 > 0 && q1 > 0 ) { double Dmax ; double total2 = 2.0*total; double p00 = p0*q0; double p01 = p0*q1; double p10 = p1*q0; double p11 = p1*q1; double x = p00*p11/(p00*p11 + p01*p10); double y = -1.0; double r00 = 2*freq[0][0] + freq[0][1] + freq[1][0]; double r01 = freq[0][1] + 2*freq[0][2] + freq[1][2]; double r10 = 2*freq[2][0] + freq[1][0] + freq[2][1]; double r11 = freq[2][1] + freq[1][2] + 2*freq[2][2]; int N = 0; /* printf( " x-y %g tol %g N %d NMAX %d\n", fabs(x-y), tol, N, NMAX ); */ while( fabs( x-y ) > tol && N < NMAX ) { double x1 = freq[1][1]*x; double x2 = (1-x)*freq[1][1]; p00 = ( r00 + x1)/total2; p01 = ( r01 + x2)/total2; p10 = ( r10 + x2)/total2; p11 = ( r11 + x1)/total2; y = x; x = p00*p11/(p00*p11 + p01*p10); /* printf( "N %d x %g total %g\n", N, x, p00+p01+p10+p11); */ N++; } *D = p00*p11 - p01*p10; if ( *D > 0 ) Dmax = p0*q1 < p1*q0 ? p0*q1 : p1*q0; else Dmax = - (p0*q0 < p1*q1 ? p0*q0 : p1*q1) ; *Dprime = *D/Dmax; *R2 = (*D)*(*D)/( p0*p1*q0*q1 ); /* printf( "p0 %g q0 %g Dmax %g\n", p0, q0,Dmax ); printf( "D %g Dprime %g R2 %g\n", *D, *Dprime, *R2 ); */ return(x); } } return(-1); }