#include #include #include #include #define _GNU_SOURCE #include #include #include #include #include #include #include #include "cl.h" #include "search.h" #define DEBUG 0 #define KMAX 1000000 typedef struct { int col1, col2; double P1, P2, PAdd, PFull, Ppartial; } RESULT; typedef struct { int nrow, ncol; char **data; double *y; char **cname; char **rname; } DATA_MATRIX; typedef struct { int chrom; int pos; } MARKER_INDEX; typedef struct { char *snp; char *cluster; int chrom; int pos; } IDATA; void FlushResult ( RESULT *result, int k, char *resultfile, DATA_MATRIX *d1, DATA_MATRIX *d2 ); int TwoWay ( DATA_MATRIX **d, IDATA *id1, IDATA *id2 ); int LD ( DATA_MATRIX *d1, DATA_MATRIX *d2, double pthresh, RESULT *result, char *resultfile ); IDATA * ReadIntervalData ( char *filename, char *phenofile, char *phenotype, int *intervals, DATA_MATRIX **d ); int my_gsl_multifit_wlinear (const gsl_matrix * X, const gsl_vector * w, const gsl_vector * y, gsl_vector * c, gsl_matrix * cov, double *chisq, gsl_multifit_linear_workspace * work); DATA_MATRIX *ReadData( char *infile ); int ReadPhenotypes( char *filename, char *phenotype, DATA_MATRIX **d, int nchrom ); int main( int argc, char **argv ) { double *y; char **genos; int Ncol; int N; int col1, col2, i, j; int k = 0, c; int do_LD=0; char resultfile[256]; char infile[256]; char pfile[256]; char phenotype[256]; char genotypedir[256]; char phenotypedir[256]; char peakfile[256]; char tmpfile[256]; char suffix[256]; int autosomes=19; double pthresh=0.000001; float p=1.0e-6; DATA_MATRIX **d; RESULT *result = (RESULT*)calloc(KMAX+1, sizeof(RESULT)); IDATA *idata; int nintervals; phenotype[0] = resultfile[0] = infile[0] = genotypedir[0] = suffix[0] = pfile[0] = 0; getarg( "-genotypedir", genotypedir, argc, argv); getarg( "-phenotypedir", phenotypedir, argc, argv); getarg( "-suffix", suffix, argc, argv ); getint( "-autosomes", &autosomes, argc, argv ); getarg( "-peakfile", peakfile, argc, argv); getarg( "-phenotype", phenotype, argc, argv); getarg( "-output", resultfile, argc, argv ); getfloat( "-pthresh", &p, argc, argv ); getbool( "-LD" ,&do_LD, argc, argv ); gethelp(argc, argv); d = (DATA_MATRIX**)calloc(autosomes,sizeof(DATA_MATRIX*)); hcreate( autosomes*1000 ); for( c=0; cncol, sizeof(MARKER_INDEX)); for(m=0;mncol;m++) { ENTRY e, *entry; mi[m].chrom = c; mi[m].pos = m; e.key = strdup( d[c]->cname[m] ); e.data = (void *)&(mi[m]); entry = hsearch( e, ENTER ); } } idata = ReadIntervalData( peakfile, pfile, phenotype, &nintervals, d ); hdestroy(); pthresh = p; sprintf( tmpfile, "%s/%s", phenotypedir, pfile ); ReadPhenotypes( tmpfile, phenotype, d, autosomes ); for(i=0;inrow > 0 ) { TwoWay( d, &idata[i], &idata[j] ); } else { /* fprintf(stderr, "ERROR - no matching subject names\n"); */ } } } } int TwoWay ( DATA_MATRIX **d, IDATA *id1, IDATA *id2 ) { DATA_MATRIX *d1 = d[id1->chrom]; DATA_MATRIX *d2 = d[id2->chrom]; char **genos1 = d1->data; char **genos2 = d2->data; double *y = d1->y; int col1 = id1->pos; int col2 = id2->pos; int N = d1->nrow; int i, ok=0; double missing = nan("char-sequence"); int *geno1 = (int*)malloc(N*sizeof(int)); int *geno2 = (int*)malloc(N*sizeof(int)); int *geno12 = (int*)malloc(N*sizeof(int)); double *residual = (double*)malloc(N*sizeof(double)); double *weight = (double*)malloc(N*sizeof(double)); int M=0; int k = 0; gsl_matrix *X = gsl_matrix_alloc(N,5); gsl_vector *W = gsl_vector_alloc(N); gsl_vector *Y = gsl_vector_alloc(N); gsl_vector *c = gsl_vector_alloc(5); gsl_matrix *cov = gsl_matrix_alloc(5,5); gsl_multifit_linear_workspace *work = gsl_multifit_linear_alloc(N,5); /* printf( "d1 %u genos1 %u\n", d1, genos1 ); */ for(i=0;i0 && g2>0 ) ? g1+3*(g2-1) : 0; /* if ( DEBUG ) fprintf(stderr, "i %d g1 %d g2 %d g %d\n", i, g1, g2, g);*/ if ( g > 0 ) { geno1[i] = g1; geno2[i] = g2; geno12[i] = g; weight[i] = 1; } } gsl_matrix_set(X,i,3,geno2[i]==1); gsl_matrix_set(X,i,4,geno2[i]==2); gsl_vector_set(W,i,weight[i]); } /* printf( "\n"); for( i=0;i 0 ) { double x = y[i]; g1 = geno1[i]; g2 = geno2[i]; count1[g1]++; count2[g2]++; countFull[g]++; sum1[g1] += x; sum2[g2] += x; sumFull[g] += x; tss += x*x; } } mean = sum1[1] + sum1[2] + sum1[3]; M = count1[1] + count1[2] + count1[3]; if (M > 0 ) { mean /= M; tss -= mean*mean*M; fss1 = fss2 = 0.0; for(g=1;g<=3;g++) { if ( count1[g] > 0 ) { sum1[g] /= count1[g]; fss1 += count1[g]*(sum1[g]*sum1[g]-mean*mean); df1++; } if ( count2[g] > 0 ) { sum2[g] /= count2[g]; fss2 += count2[g]*(sum2[g]*sum2[g]-mean*mean); df2++; } } if ( df1 > 1 && df2 > 1 ) { rss1 = tss - fss1; if ( df1 > 0 ) { F1 = (fss1/rss1)*((M-df1)/(df1-1.0)); P1 = gsl_cdf_fdist_Q( F1, df1-1, M-df1); } rss2 = tss - fss2; if ( df2 > 0 ) { F2 = (fss2/rss2)*((M-df2)/(df2-1.0)); P2 = gsl_cdf_fdist_Q( F2, df2-1, M-df2); } if ( DEBUG) fprintf(stderr, "N %d M %d\n", N, M ); if ( DEBUG) fprintf(stderr, "fss1 %f rss1 %f F1 %f P1 %f df1 %d\n", fss1, rss1, F1, P1, df1 ); if ( DEBUG) fprintf(stderr, "fss2 %f rss2 %f F2 %f P2 %f df2 %d\n", fss2, rss2, F2, P2, df2 ); for( i=0;iS,i); dfAdd += (alpha>1.0e-7); if ( DEBUG ) fprintf( stderr, "i=%d alpha %e rank %d\n", i, alpha, dfAdd); } dfAdd--; if ( DEBUG ) fprintf( stderr, "additivemodel rank %d rss %f\n", dfAdd, rssAdd ); fssAdd = tss - rssAdd; if ( dfAdd > 0 ) { FAdd = (fssAdd/rssAdd)*((M-dfAdd)/(dfAdd-1.0)); if ( DEBUG ) fprintf( stderr, "Fadd %e\n", FAdd ); PAdd = gsl_cdf_fdist_Q( FAdd, dfAdd, M-dfAdd); } if ( DEBUG) fprintf(stderr, "fssAdd %f rssAdd %f FAdd %f PAdd %f dfAdd %d\n", fssAdd, rssAdd, FAdd, PAdd, dfAdd ); /* full model */ fssFull = 0.0; for(g=1;g<=9;g++) { if ( countFull[g] > 0 ) { sumFull[g] /= countFull[g]; fssFull += countFull[g]*(sumFull[g]*sumFull[g]-mean*mean); dfFull++; } } dfFull--; rssFull = tss - fssFull; if ( dfFull > 0 ) { FFull = (fssFull/rssFull)*((M-dfFull)/(dfFull)); PFull = gsl_cdf_fdist_Q( FFull, dfFull, M-dfFull); } if ( DEBUG) fprintf(stderr, "fssFull %f rssFull %f FFull %f PFull %f dfFull %d\n", fssFull, rssFull, FFull, PFull, dfFull ); /* Partial F of full vs additive */ pfss = fssFull-fssAdd; if ( dfFull-dfAdd > 0 ) { Fpartial = (pfss/rssFull)*((M-dfFull)/(dfFull-dfAdd)); Ppartial = gsl_cdf_fdist_Q(Fpartial, dfFull-dfAdd, M-dfFull); } if ( DEBUG) fprintf(stderr, "pfss %f rsspartial %f Fpartial %f Ppartial %f dfrss %d dfpartial %d\n", pfss, rssFull, Fpartial, Ppartial,M-dfFull, dfFull-dfAdd ); printf( "%20s %20s %20s %20s %e %e %e %e %e\n" , id1->snp, id1->cluster, id2->snp, id2->cluster, P1, P2, PAdd, PFull, Ppartial ); } } } free(geno1); free(geno2); free(geno12), free(residual); gsl_multifit_linear_free(work); gsl_matrix_free(X); gsl_vector_free(Y); gsl_vector_free(W); gsl_vector_free(c); gsl_matrix_free(cov); return ok; } DATA_MATRIX *ReadData( char *infile ) { FILE *fp; int nrow, ncol; char **data = NULL; char *buffer; char name[256]; int n, m; double x; double *y; DATA_MATRIX *d = NULL; if ( fp = fopen( infile, "r" ) ) { if ( fscanf( fp, "%d %d", &nrow, &ncol ) ==2 ) { int i; char tmp[256]; char **cname = (char**)calloc(ncol,sizeof(char*)); char **rname = (char**)calloc(nrow,sizeof(char*)); for(i=0;inrow = n; d->ncol = ncol; d->data = data; d->y = y; d->cname = cname; d->rname = rname; } } /* printf( "read from %s %d %d %u %u\n", infile, d->nrow, d->ncol, d, d->data ); */ return d; } void FlushResult ( RESULT *result, int k, char *resultfile, DATA_MATRIX *d1, DATA_MATRIX *d2 ) { FILE *fp; if ( fp = fopen( resultfile, "a" ) ) { int j; for(j=0;jcname[result[j].col1], d2->cname[result[j].col2], result[j].P1, result[j].P2, result[j].PAdd, result[j].PFull, result[j].Ppartial ); } fclose(fp); } } int my_gsl_multifit_wlinear (const gsl_matrix * X, const gsl_vector * w, const gsl_vector * y, gsl_vector * c, gsl_matrix * cov, double *chisq, gsl_multifit_linear_workspace * work) { if (X->size1 != y->size) { GSL_ERROR ("number of observations in y does not match rows of matrix X", GSL_EBADLEN); } else if (X->size2 != c->size) { GSL_ERROR ("number of parameters c does not match columns of matrix X", GSL_EBADLEN); } else if (w->size != y->size) { GSL_ERROR ("number of weights does not match number of observations", GSL_EBADLEN); } else if (cov->size1 != cov->size2) { GSL_ERROR ("covariance matrix is not square", GSL_ENOTSQR); } else if (c->size != cov->size1) { GSL_ERROR ("number of parameters does not match size of covariance matrix", GSL_EBADLEN); } else if (X->size1 != work->n || X->size2 != work->p) { GSL_ERROR ("size of workspace does not match size of observation matrix", GSL_EBADLEN); } else { const size_t n = X->size1; const size_t p = X->size2; size_t i, j; gsl_matrix *A = work->A; gsl_matrix *Q = work->Q; gsl_matrix *QSI = work->QSI; gsl_vector *S = work->S; gsl_vector *t = work->t; gsl_vector *xt = work->xt; gsl_vector *D = work->D; /* Scale X, A = sqrt(w) X */ gsl_matrix_memcpy (A, X); for (i = 0; i < n; i++) { double wi = gsl_vector_get (w, i); if (wi < 0) wi = 0; { gsl_vector_view row = gsl_matrix_row (A, i); gsl_vector_scale (&row.vector, sqrt (wi)); } } /* Balance the columns of the matrix A */ gsl_linalg_balance_columns (A, D); /* Decompose A into U S Q^T */ gsl_linalg_SV_decomp_mod (A, QSI, Q, S, xt); /* Solve sqrt(w) y = A c for c, by first computing t = sqrt(w) y */ for (i = 0; i < n; i++) { double wi = gsl_vector_get (w, i); double yi = gsl_vector_get (y, i); if (wi < 0) wi = 0; gsl_vector_set (t, i, sqrt (wi) * yi); } gsl_blas_dgemv (CblasTrans, 1.0, A, t, 0.0, xt); /* Scale the matrix Q, Q' = Q S^-1 */ gsl_matrix_memcpy (QSI, Q); for (j = 0; j < p; j++) { gsl_vector_view column = gsl_matrix_column (QSI, j); double alpha = gsl_vector_get (S, j); if (alpha >1.0e-7) alpha = 1.0 / alpha; else { alpha = 0.0; gsl_vector_set(S,j,0.0); } gsl_vector_scale (&column.vector, alpha); } gsl_vector_set_zero (c); /* Solution */ gsl_blas_dgemv (CblasNoTrans, 1.0, QSI, xt, 0.0, c); /* Unscale the balancing factors */ gsl_vector_div (c, D); /* Form covariance matrix cov = (Q S^-1) (Q S^-1)^T */ for (i = 0; i < p; i++) { gsl_vector_view row_i = gsl_matrix_row (QSI, i); double d_i = gsl_vector_get (D, i); for (j = i; j < p; j++) { gsl_vector_view row_j = gsl_matrix_row (QSI, j); double d_j = gsl_vector_get (D, j); double s; gsl_blas_ddot (&row_i.vector, &row_j.vector, &s); gsl_matrix_set (cov, i, j, s / (d_i * d_j)); gsl_matrix_set (cov, j, i, s / (d_i * d_j)); } } /* Compute chisq, from residual r = y - X c */ { double r2 = 0; for (i = 0; i < n; i++) { double yi = gsl_vector_get (y, i); double wi = gsl_vector_get (w, i); gsl_vector_const_view row = gsl_matrix_const_row (X, i); double y_est, ri; gsl_blas_ddot (&row.vector, c, &y_est); ri = yi - y_est; r2 += wi * ri * ri; } *chisq = r2; } return GSL_SUCCESS; } } IDATA * ReadIntervalData ( char *filename, char *phenofile, char *phenotype, int *intervals, DATA_MATRIX **d ) { char line[256]; char snp[256]; char cluster[256]; int nMAX = 20000; IDATA *data = (IDATA*)calloc(nMAX, sizeof(IDATA)); int m=0; FILE *fp; *intervals = 0; if ( fp = fopen( filename, "r" ) ) { skip_comments( fp, phenotype ); skip_comments( fp, phenofile ); while( skip_comments( fp, line ) != EOF ) { if ( sscanf( line, "%s %s", snp, cluster ) == 2 ) { ENTRY item, *entry; item.key = snp; if ( entry = hsearch( item, FIND ) ) { MARKER_INDEX *mi = (MARKER_INDEX*)entry->data; data[m].snp = strdup(snp); data[m].cluster = strdup(cluster); data[m].chrom = mi->chrom; data[m].pos = mi->pos; m++; printf( "%s %s %d %d %s\n", snp, cluster, mi->chrom, mi->pos, d[mi->chrom]->cname[mi->pos] ); if ( strcmp( snp, d[mi->chrom]->cname[mi->pos] ) ) printf( "WARNING: mismatch %s\n", d[mi->chrom]->cname[mi->pos] ); /* { int k; for(k=0;kchrom]->ncol;k++) printf( "%d", (int)(d[mi->chrom]->data[k][mi->pos])); printf( "\n"); } */ } } } close(fp); *intervals = m; return data; } return NULL; } int ReadPhenotypes( char *filename, char *phenotype, DATA_MATRIX **d, int nchrom ) { FILE *fp; char buffer[5000]; int pindex=-1; int sindex=-1; int n = 0; int i, j, c; double missing = nan("char-sequence"); DATA_MATRIX *d1 = d[0]; if ( fp = fopen( filename, "r" ) ) { hcreate( d1->nrow ); for(i=0;inrow;i++) { ENTRY e, *f; long j = i; e.key = d1->rname[i]; e.data = (void*)j; f = hsearch( e, ENTER ); d1->y[i] = missing; } while(skip_comments( fp, buffer ) != EOF ) { if ( n == 0 ) { int k = 0; char *tok = strtok( buffer, "\t" ); while ( tok ) { if ( !strcmp( tok, "SUBJECT.NAME" ) ) sindex = k; if ( !strcmp( tok, phenotype ) ) pindex = k; tok = strtok( NULL, "\t" ); k++; } if ( sindex < 0 || pindex < 0 ) { fprintf( stderr, "ERROR, missing index for phenotype %s %d %d\n", phenotype, pindex, sindex); exit(1); } else { if ( DEBUG ) fprintf(stderr, "pindex %d sindex %d\n", pindex, sindex ); } } else { int k = 0; int idx = -1; int max = pindex > sindex ? pindex : sindex; double p = missing; char *tok = strtok( buffer, "\t" ); while( tok && k <= max ) { if ( k == sindex ) { ENTRY e, *f; e.key = tok; if ( f = hsearch( e, FIND ) ) { idx = (int)f->data; if ( DEBUG ) fprintf(stderr, "%s %d\n", f->key, (int)f->data ); } else { /* fprintf( stderr, "Missing subject %s\n", tok ); */ } } if ( k == pindex ) { sscanf( tok, "%lf", &p ); d1->y[idx] = p; if ( DEBUG) fprintf( stderr, "%d = %s = %e\n", idx, tok, p ); } k++; tok = strtok(NULL, "\t"); } } n++; } j = 0; for(i=0;inrow;i++) { if ( ! isnan(d1->y[i]) ) { for(c=0;cy[j] = d1->y[i]; d[c]->data[j] = d[c]->data[i]; d[c]->rname[j] = d[c]->rname[i]; } if ( DEBUG ) fprintf(stderr, "%d %s %e\n", i, d1->rname[j], d1->y[j]); j++; } else { if ( DEBUG ) fprintf(stderr, "skipping row %d %d\n", i, j); } } if ( DEBUG ) fprintf( stderr, "nrow = %d\n", j ); for(c=0;cnrow = j; } if ( DEBUG ) fprintf( stderr, "nrow = %d\n", j ); hdestroy(); } else { fprintf(stderr, "ERROR could not open file %s\n", filename ); exit(1); } return d1->nrow; }