#include #include #include #include #include #include #include #include #include #include #include #include #include "cl.h" #define DEBUG 0 #define KMAX 1000000 typedef struct { int col1, col2; double P1, P2, PAdd, PFull, Ppartial; } RESULT; typedef struct { int nrow, ncol; char **data; double *y; char **cname; char **rname; } DATA_MATRIX; void FlushResult ( RESULT *result, int k, char *resultfile, DATA_MATRIX *d1, DATA_MATRIX *d2 ); int TwoWay ( DATA_MATRIX *d1, DATA_MATRIX *d2, double pthresh, RESULT *result, char *resultfile ); int LD ( DATA_MATRIX *d1, DATA_MATRIX *d2, double pthresh, RESULT *result, char *resultfile ); int my_gsl_multifit_wlinear (const gsl_matrix * X, const gsl_vector * w, const gsl_vector * y, gsl_vector * c, gsl_matrix * cov, double *chisq, gsl_multifit_linear_workspace * work); DATA_MATRIX *ReadData( char *infile ); int ReadPhenotypes( char *filename, char *phenotype, DATA_MATRIX *d1, DATA_MATRIX *d2 ); int main( int argc, char **argv ) { double *y; char **genos; int Ncol; int N; int col1, col2; int k = 0; int do_LD=0; char resultfile[256]; char infile1[256]; char infile2[256]; char pfile[256]; char phenotype[256]; double pthresh=0.000001; float p=1.0e-6; DATA_MATRIX *d1=NULL, *d2=NULL; RESULT *result = (RESULT*)calloc(KMAX+1, sizeof(RESULT)); phenotype[0] = resultfile[0] = infile1[0] = infile2[0] = pfile[0] = 0; getarg( "-chr1", infile1, argc, argv); getarg( "-chr2", infile2, argc, argv ); getarg( "-pfile", pfile, argc, argv); getarg( "-phenotype", phenotype, argc, argv); getarg( "-output", resultfile, argc, argv ); getfloat( "-pthresh", &p, argc, argv ); getbool( "-LD" &do_LD, argc, argv ); gethelp(argc, argv); d1 = ReadData( infile1 ); pthresh = p; if ( strcmp( infile1, infile2 ) ) d2 = ReadData( infile2 ); else d2 = d1; if ( do_LD ) { if ( d1->nrow == d2->nrow) { if ( d1->nrow > 0 ) { FILE *fp = NULL; if ( fp = fopen(resultfile, "w") ) { fprintf(fp, "%s %s\n", infile1, infile2); fclose(fp); LD( d1,d2, pthresh, result, resultfile ); fp = fopen(resultfile, "a"); fprintf(fp, "done\n"); fclose(fp); return 0; } else { fprintf(stderr, "ERROR - could not open file %s\n", resultfile ); } } else { fprintf(stderr, "ERROR - no matching subject names\n"); } } else { fprintf(stderr, "ERROR, incompatible inputs %s %s\n", infile1, infile2 ); exit(1); } return 1; } else { if ( d1->nrow == d2->nrow) { ReadPhenotypes( pfile, phenotype, d1, d2 ); if ( d1->nrow > 0 ) { FILE *fp = NULL; if ( fp = fopen(resultfile, "w") ) { fprintf(fp, "%s %s\n", infile1, infile2); fclose(fp); TwoWay( d1,d2, pthresh, result, resultfile ); fp = fopen(resultfile, "a"); fprintf(fp, "done\n"); fclose(fp); return 0; } else { fprintf(stderr, "ERROR - could not open file %s\n", resultfile ); } } else { fprintf(stderr, "ERROR - no matching subject names\n"); } } else { fprintf(stderr, "ERROR, incompatible inputs %s %s\n", infile1, infile2 ); exit(1); } return 1; } } int TwoWay ( DATA_MATRIX *d1, DATA_MATRIX *d2, double pthresh, RESULT *result, char *resultfile ) { char **genos1 = d1->data; char **genos2 = d2->data; double *y = d1->y; int col1, col2; int N = d1->nrow; int i, ok=0; double missing = nan("char-sequence"); int *geno1 = (int*)malloc(N*sizeof(int)); int *geno2 = (int*)malloc(N*sizeof(int)); int *geno12 = (int*)malloc(N*sizeof(int)); double *residual = (double*)malloc(N*sizeof(double)); double *weight = (double*)malloc(N*sizeof(double)); int M=0; int k = 0; gsl_matrix *X = gsl_matrix_alloc(N,5); gsl_vector *W = gsl_vector_alloc(N); gsl_vector *Y = gsl_vector_alloc(N); gsl_vector *c = gsl_vector_alloc(5); gsl_matrix *cov = gsl_matrix_alloc(5,5); gsl_multifit_linear_workspace *work = gsl_multifit_linear_alloc(N,5); for(i=0;incol;col1++) { int col2_start, col2_end; if ( d1 == d2 ) { col2_start = col1+1; col2_end = d2->ncol; } else { col2_start=0; col2_end = d2->ncol; } for(i=0;i= KMAX-10 ) { FlushResult( result, k, resultfile, d1, d2 ); k = 0; } if (DEBUG) fprintf(stderr, "col1 %d col2 %d\n", col1, col2 ); for( i=0;i0 && g2>0 ) ? g1+3*(g2-1) : 0; /* if ( DEBUG ) fprintf(stderr, "i %d g1 %d g2 %d g %d\n", i, g1, g2, g);*/ if ( g > 0 ) { geno1[i] = g1; geno2[i] = g2; geno12[i] = g; weight[i] = 1; } } gsl_matrix_set(X,i,3,geno2[i]==1); gsl_matrix_set(X,i,4,geno2[i]==2); gsl_vector_set(W,i,weight[i]); } for(g=0;g<=3;g++) { count1[g] = count2[g] = countAdd[g] = 0; sum1[g] = sum2[g] = sumAdd[g] = 0.0; } for(g=0;g<=9;g++) { countFull[g] = 0; sumFull[g] = 0.0; } for( i=0;i 0 ) { double x = y[i]; g1 = geno1[i]; g2 = geno2[i]; count1[g1]++; count2[g2]++; countFull[g]++; sum1[g1] += x; sum2[g2] += x; sumFull[g] += x; tss += x*x; } } mean = sum1[1] + sum1[2] + sum1[3]; M = count1[1] + count1[2] + count1[3]; if (M > 0 ) { mean /= M; tss -= mean*mean*M; fss1 = fss2 = 0.0; for(g=1;g<=3;g++) { if ( count1[g] > 0 ) { sum1[g] /= count1[g]; fss1 += count1[g]*(sum1[g]*sum1[g]-mean*mean); df1++; } if ( count2[g] > 0 ) { sum2[g] /= count2[g]; fss2 += count2[g]*(sum2[g]*sum2[g]-mean*mean); df2++; } } if ( df1 > 1 && df2 > 1 ) { rss1 = tss - fss1; if ( df1 > 0 ) { F1 = (fss1/rss1)*((M-df1)/(df1-1.0)); P1 = gsl_cdf_fdist_Q( F1, df1-1, M-df1); } rss2 = tss - fss2; if ( df2 > 0 ) { F2 = (fss2/rss2)*((M-df2)/(df2-1.0)); P2 = gsl_cdf_fdist_Q( F2, df2-1, M-df2); } if ( DEBUG) fprintf(stderr, "N %d M %d\n", N, M ); if ( DEBUG) fprintf(stderr, "fss1 %f rss1 %f F1 %f P1 %f df1 %d\n", fss1, rss1, F1, P1, df1 ); if ( DEBUG) fprintf(stderr, "fss2 %f rss2 %f F2 %f P2 %f df2 %d\n", fss2, rss2, F2, P2, df2 ); for( i=0;iS,i); dfAdd += (alpha>1.0e-7); if ( DEBUG ) fprintf( stderr, "i=%d alpha %e rank %d\n", i, alpha, dfAdd); } dfAdd--; if ( DEBUG ) fprintf( stderr, "additivemodel rank %d rss %f\n", dfAdd, rssAdd ); fssAdd = tss - rssAdd; if ( dfAdd > 0 ) { FAdd = (fssAdd/rssAdd)*((M-dfAdd)/(dfAdd-1.0)); if ( DEBUG ) fprintf( stderr, "Fadd %e\n", FAdd ); PAdd = gsl_cdf_fdist_Q( FAdd, dfAdd, M-dfAdd); } if ( DEBUG) fprintf(stderr, "fssAdd %f rssAdd %f FAdd %f PAdd %f dfAdd %d\n", fssAdd, rssAdd, FAdd, PAdd, dfAdd ); /* full model */ fssFull = 0.0; for(g=1;g<=9;g++) { if ( countFull[g] > 0 ) { sumFull[g] /= countFull[g]; fssFull += countFull[g]*(sumFull[g]*sumFull[g]-mean*mean); dfFull++; } } dfFull--; rssFull = tss - fssFull; if ( dfFull > 0 ) { FFull = (fssFull/rssFull)*((M-dfFull)/(dfFull)); PFull = gsl_cdf_fdist_Q( FFull, dfFull, M-dfFull); } if ( DEBUG) fprintf(stderr, "fssFull %f rssFull %f FFull %f PFull %f dfFull %d\n", fssFull, rssFull, FFull, PFull, dfFull ); /* Partial F of full vs additive */ pfss = fssFull-fssAdd; if ( dfFull-dfAdd > 0 ) { Fpartial = (pfss/rssFull)*((M-dfFull)/(dfFull-dfAdd)); Ppartial = gsl_cdf_fdist_Q(Fpartial, dfFull-dfAdd, M-dfFull); } if ( DEBUG) fprintf(stderr, "pfss %f rsspartial %f Fpartial %f Ppartial %f dfrss %d dfpartial %d\n", pfss, rssFull, Fpartial, Ppartial,M-dfFull, dfFull-dfAdd ); if ( PFull < pthresh ) { result[k].col1 = col1; result[k].col2 = col2; result[k].P1 = P1; result[k].P2 = P2; result[k].PFull = PFull; result[k].PAdd = PAdd; result[k].Ppartial = Ppartial; k++; } } } } } FlushResult( result, k, resultfile, d1, d2 ); k = 0; free(geno1); free(geno2); free(geno12), free(residual); gsl_multifit_linear_free(work); gsl_matrix_free(X); gsl_vector_free(Y); gsl_vector_free(W); gsl_vector_free(c); gsl_matrix_free(cov); return ok; } DATA_MATRIX *ReadData( char *infile ) { FILE *fp; int nrow, ncol; char **data = NULL; char *buffer; char name[256]; int n, m; double x; double *y; DATA_MATRIX *d = NULL; if ( fp = fopen( infile, "r" ) ) { if ( fscanf( fp, "%d %d", &nrow, &ncol ) ==2 ) { int i; char tmp[256]; char **cname = (char**)calloc(ncol,sizeof(char*)); char **rname = (char**)calloc(nrow,sizeof(char*)); for(i=0;inrow = n; d->ncol = ncol; d->data = data; d->y = y; d->cname = cname; d->rname = rname; } } return d; } void FlushResult ( RESULT *result, int k, char *resultfile, DATA_MATRIX *d1, DATA_MATRIX *d2 ) { FILE *fp; if ( fp = fopen( resultfile, "a" ) ) { int j; for(j=0;jcname[result[j].col1], d2->cname[result[j].col2], result[j].P1, result[j].P2, result[j].PAdd, result[j].PFull, result[j].Ppartial ); } fclose(fp); } } int my_gsl_multifit_wlinear (const gsl_matrix * X, const gsl_vector * w, const gsl_vector * y, gsl_vector * c, gsl_matrix * cov, double *chisq, gsl_multifit_linear_workspace * work) { if (X->size1 != y->size) { GSL_ERROR ("number of observations in y does not match rows of matrix X", GSL_EBADLEN); } else if (X->size2 != c->size) { GSL_ERROR ("number of parameters c does not match columns of matrix X", GSL_EBADLEN); } else if (w->size != y->size) { GSL_ERROR ("number of weights does not match number of observations", GSL_EBADLEN); } else if (cov->size1 != cov->size2) { GSL_ERROR ("covariance matrix is not square", GSL_ENOTSQR); } else if (c->size != cov->size1) { GSL_ERROR ("number of parameters does not match size of covariance matrix", GSL_EBADLEN); } else if (X->size1 != work->n || X->size2 != work->p) { GSL_ERROR ("size of workspace does not match size of observation matrix", GSL_EBADLEN); } else { const size_t n = X->size1; const size_t p = X->size2; size_t i, j; gsl_matrix *A = work->A; gsl_matrix *Q = work->Q; gsl_matrix *QSI = work->QSI; gsl_vector *S = work->S; gsl_vector *t = work->t; gsl_vector *xt = work->xt; gsl_vector *D = work->D; /* Scale X, A = sqrt(w) X */ gsl_matrix_memcpy (A, X); for (i = 0; i < n; i++) { double wi = gsl_vector_get (w, i); if (wi < 0) wi = 0; { gsl_vector_view row = gsl_matrix_row (A, i); gsl_vector_scale (&row.vector, sqrt (wi)); } } /* Balance the columns of the matrix A */ gsl_linalg_balance_columns (A, D); /* Decompose A into U S Q^T */ gsl_linalg_SV_decomp_mod (A, QSI, Q, S, xt); /* Solve sqrt(w) y = A c for c, by first computing t = sqrt(w) y */ for (i = 0; i < n; i++) { double wi = gsl_vector_get (w, i); double yi = gsl_vector_get (y, i); if (wi < 0) wi = 0; gsl_vector_set (t, i, sqrt (wi) * yi); } gsl_blas_dgemv (CblasTrans, 1.0, A, t, 0.0, xt); /* Scale the matrix Q, Q' = Q S^-1 */ gsl_matrix_memcpy (QSI, Q); for (j = 0; j < p; j++) { gsl_vector_view column = gsl_matrix_column (QSI, j); double alpha = gsl_vector_get (S, j); if (alpha >1.0e-7) alpha = 1.0 / alpha; else { alpha = 0.0; gsl_vector_set(S,j,0.0); } gsl_vector_scale (&column.vector, alpha); } gsl_vector_set_zero (c); /* Solution */ gsl_blas_dgemv (CblasNoTrans, 1.0, QSI, xt, 0.0, c); /* Unscale the balancing factors */ gsl_vector_div (c, D); /* Form covariance matrix cov = (Q S^-1) (Q S^-1)^T */ for (i = 0; i < p; i++) { gsl_vector_view row_i = gsl_matrix_row (QSI, i); double d_i = gsl_vector_get (D, i); for (j = i; j < p; j++) { gsl_vector_view row_j = gsl_matrix_row (QSI, j); double d_j = gsl_vector_get (D, j); double s; gsl_blas_ddot (&row_i.vector, &row_j.vector, &s); gsl_matrix_set (cov, i, j, s / (d_i * d_j)); gsl_matrix_set (cov, j, i, s / (d_i * d_j)); } } /* Compute chisq, from residual r = y - X c */ { double r2 = 0; for (i = 0; i < n; i++) { double yi = gsl_vector_get (y, i); double wi = gsl_vector_get (w, i); gsl_vector_const_view row = gsl_matrix_const_row (X, i); double y_est, ri; gsl_blas_ddot (&row.vector, c, &y_est); ri = yi - y_est; r2 += wi * ri * ri; } *chisq = r2; } return GSL_SUCCESS; } } int ReadPhenotypes( char *filename, char *phenotype, DATA_MATRIX *d1, DATA_MATRIX *d2 ) { FILE *fp; char buffer[5000]; int pindex=-1; int sindex=-1; int n = 0; int i, j; double missing = nan("char-sequence"); if ( fp = fopen( filename, "r" ) ) { hcreate( d1->nrow ); for(i=0;inrow;i++) { ENTRY e; e.key = d1->rname[i]; e.data = (void*)i; hsearch( e, ENTER ); d1->y[i] = missing; } while(skip_comments( fp, buffer ) != EOF ) { if ( n == 0 ) { int k = 0; char *tok = strtok( buffer, "\t" ); while ( tok ) { if ( !strcmp( tok, "SUBJECT.NAME" ) ) sindex = k; if ( !strcmp( tok, phenotype ) ) pindex = k; tok = strtok( NULL, "\t" ); k++; } if ( sindex < 0 || pindex < 0 ) { fprintf( stderr, "ERROR, missing index for phenotype %s %d %d\n", phenotype, pindex, sindex); exit(1); } else { if ( DEBUG ) fprintf(stderr, "pindex %d sindex %d\n", pindex, sindex ); } } else { int k = 0; int idx = -1; int max = pindex > sindex ? pindex : sindex; double p = missing; char *tok = strtok( buffer, "\t" ); while( tok && k <= max ) { if ( k == sindex ) { ENTRY e, *f; e.key = tok; if ( f = hsearch( e, FIND ) ) { idx = (int)f->data; if ( DEBUG ) fprintf(stderr, "%s %d\n", f->key, (int)f->data ); } else { /* fprintf( stderr, "Missing subject %s\n", tok ); */ } } if ( k == pindex ) { sscanf( tok, "%lf", &p ); d1->y[idx] = p; if ( DEBUG) fprintf( stderr, "%d = %s = %e\n", idx, tok, p ); } k++; tok = strtok(NULL, "\t"); } } n++; } j = 0; for(i=0;inrow;i++) { if ( ! isnan(d1->y[i]) ) { d1->y[j] = d2->y[j] = d1->y[i]; d1->data[j] = d1->data[i]; d2->data[j] = d2->data[i]; d1->rname[j] = d1->rname[i]; d2->rname[j] = d2->rname[i]; if ( DEBUG ) fprintf(stderr, "%d %s %e\n", i, d1->rname[j], d1->y[j]); j++; } else { if ( DEBUG ) fprintf(stderr, "skipping row %d %d\n", i, j); } } if ( DEBUG ) fprintf( stderr, "nrow = %d\n", j ); d1->nrow = d2->nrow = j; if ( DEBUG ) fprintf( stderr, "nrow = %d\n", j ); hdestroy(); } else { fprintf(stderr, "ERROR could not open file %s\n", filename ); exit(1); } return d1->nrow; }