#include #include #include #include #include #include #include #include #include"reconstruction.h" static SCAN_DATA *Scan; int idx_cmp( const void *A, const void *B) { CHAR_INDEX *a = (CHAR_INDEX*) A; CHAR_INDEX *b = (CHAR_INDEX*) B; return( strcmp( a->id, b->id )); } int scan_data_cmp( const void *A, const void *B) { int *a = (int*) A; int *b = (int*) B; double x = Scan->logP[*b]-Scan->logP[*a]; int d; if ( x < 0 ) d = -1; else if ( x > 0 ) d = 1; else { d = Scan->chr[*a] - Scan->chr[*b]; if ( d == 0 ) d = Scan->bp[*a]-Scan->bp[*b]; } return( d ); } int scan_data_cmp2( const void *A, const void *B) { int *a = (int*) A; int *b = (int*) B; int d = Scan->chr[*a] - Scan->chr[*b]; if ( d == 0 ) d = Scan->bp[*a]-Scan->bp[*b]; return( d ); } int IntersectIDs ( char **id1, char **id2, int N1, int N2, int**Index1, int **Index2 ) { CHAR_INDEX *idx1, *idx2; int *index1, *index2, nobs=0, i, j, k, N; idx1 = (CHAR_INDEX*)calloc(N1, sizeof(CHAR_INDEX)); for(k=0;k N2 ? N2 : N1; index1 = (int*)calloc(N,sizeof(int)); index2 = (int*)calloc(N,sizeof(int)); i=0; j=0; k=0; while( i 0 ) j++; else i++; } *Index1 = index1; *Index2 = index2; free(idx1); free(idx2); return(k); } void WriteGenomeScan( SCAN_STATISTICS **stats, int nchr, BINARY_DATA **genome, char *filename, double threshold ) { FILE *fp = fopen(filename, "w"); if ( fp ) { int k; printf("writing %s\n", filename ); fprintf(fp, "chr bp logP\n"); for(k=0;knsite); for(i=0;insite;i++) { if ( stats[k][i].logP >=threshold) fprintf(fp, "%d %d %e\n", genome[k]->chr, genome[k]->site[i], stats[k][i].logP ); } } fclose(fp); } } void WriteGenomeScanEstimate( SCAN_STATISTICS **stats, int nchr, BINARY_DATA **genome, ALLELE_DATA **ad, char *filename, double threshold ) { FILE *fp = fopen(filename, "w"); if ( fp ) { int k; printf("writing %s\n", filename ); fprintf(fp, "chr bp logP sigma2 nalleles allele count estimate\n"); for(k=0;knsite); for(i=0;insite;i++) { if ( stats[k][i].logP >=threshold) { int n; fprintf(fp, "%d %d %e %d %e", genome[k]->chr, genome[k]->site[i], stats[k][i].logP, stats[k][i].nalleles, stats[k][i].sigma ); char *sdp = ad[k]->sdp[i]; // strain distribution pattern in founders; int *idx = (int*)calloc(ad[k]->Ncol,sizeof(int)); int j, m; for(j=0;jNcol;j++) { int a = sdp[j]; // idx maps allele to the first strain with that allele idx[a] = j; for(m=0;mcomplete[i][idx[a]], stats[k][i].count[n], stats[k][i].estimate[n]); } fprintf(fp, "\n"); } } } fclose(fp); } } void WriteHaplotypeGenomeScanEstimate( SCAN_STATISTICS **stats, int nchr, BINARY_DATA **genome, char *filename, double threshold ) { FILE *fp = fopen(filename, "w"); if ( fp ) { int k; printf("writing %s\n", filename ); fprintf(fp, "chr bp logP sigma2 nalleles allele count estimate\n"); for(k=0;knsite); for(i=0;insite;i++) { if ( stats[k][i].logP >=threshold) { int n; fprintf(fp, "%d %d %e %d %e", genome[k]->chr, genome[k]->site[i], stats[k][i].logP, stats[k][i].nalleles, stats[k][i].sigma ); for(n=0;nid, genome[0]->id, pt->nobs, genome[0]->nseq, &Index1, &Index2 ); printf("writing %s threshold %lf\n", filename, threshold ); fprintf(fp, "SUBJECT.NAME\t%s", pt->name ); for(k=0;knsite,sizeof(int)); // printf("%d %d\n", k+1, genome[k]->nsite); for(i=0,j=0;insite;i++) { if ( stats[k][i].logP >=threshold ) { use[k][j++] = i; nuse[k]++; tot++; fprintf(fp, "\tc%d.%d", k+1, genome[k]->site[i] ); } } } fprintf(fp, "\n"); for(n=0;nid[Index1[n]], pt->x[Index1[n]] ); // printf( "%s\t%g\n", pt->id[Index1[n]], pt->x[Index1[n]] ); for(k=0;kmat[i][Index2[n]]); } } fprintf( fp, "\n"); } printf("wrote %d sites at threshold %lf\n", tot, threshold); free(nuse); free(Index1); free(Index2); for(k=0;knsite;i++) *mx = (*mx < logP[chr][i]) ? logP[chr][i] : *mx; } if ( *mx > permlogP[nperm-1] ) p=0; else { i=nperm-1; while( i>0 && *mx < permlogP[i] ) i--; p = i/(double)nperm; } return(p); } double *PermutedGenomeScans( PHENOTYPE_TABLE *pt, BINARY_DATA **genome, int nchr, int nperm ) { int n, k, chr; double *maxlogP = (double*)calloc(nperm,sizeof(double)); // double **logP = (double**)calloc(nchr,sizeof(double*)); PHENOTYPE_TABLE *ptable = PermutePhenotypes(pt, 0 ); SCAN_STATISTICS **stats = (SCAN_STATISTICS**)calloc(nchr,sizeof(SCAN_STATISTICS*)); for( n=0;nnsite;k++) maxlogP[n] = maxlogP[n] < stats[chr][k].logP ? stats[chr][k].logP : maxlogP[n]; FreeScanStatistics(stats[chr], genome[chr]->nsite);// free(logP[chr]); printf( "perm %d logP %e\n", n, maxlogP[n]); } qsort(maxlogP, nperm, sizeof(double), double_cmp ); free(stats); // free(logP); } return(maxlogP); } void FreeScanStatistics( SCAN_STATISTICS *stats, int len ) { int k; for(k=0;kmaxlogP, perm->nperm, sizeof(double), double_cmp ); perm->q90 = perm->maxlogP[(int)(0.90*perm->nperm)]; perm->q95 = perm->maxlogP[(int)(0.95*perm->nperm)]; perm->q50 = perm->maxlogP[(int)(0.50*perm->nperm)]; if ( perm->mx > perm->maxlogP[perm->nperm-1] ) perm->pval=0; else { int i=perm->nperm-1; while( i>0 && perm->mx < perm->maxlogP[i] ) i--; perm->pval = i/(double)perm->nperm; } } printf("done all\n"); return(stats); } SCAN_STATISTICS *ChromosomeScan( PHENOTYPE_TABLE *pt, BINARY_DATA *bd, PERMUTATIONS *perm ){ int *Index1, *Index2; int N = IntersectIDs ( pt->id, bd->id, pt->nobs, bd->nseq, &Index1, &Index2 ); if ( bd->chr == 1 ) init_Fcache( N-2 ); // printf( "scanning chromsome N= %d\n", N); if ( N > 10 ) { int n, k; double *y = (double*)calloc(N, sizeof(double)); char *genotype = (char*)calloc(N, sizeof(char)); SCAN_STATISTICS *stat=NULL; ANOVA_TABLE an; for(k=0;kx[Index1[k]]; // printf("%d %s %f\n", k, pt->id[Index1[k]], y[k]); } if ( bd->nsdp>0 ) { //use compressed sdp double *pval = (double*)calloc(bd->nsdp, sizeof(double)); // printf("compressed %d\n", bd->nsdp==0); SCAN_STATISTICS *stat2 = (SCAN_STATISTICS*)calloc(bd->nsite,sizeof(SCAN_STATISTICS)); for( n=0;nnsdp;n++) { int df=0, m; for(k=0;kmat[n][Index2[k]]; stat2[n].logP = OneWayAnovaLog10pval( N, y, genotype, NULL, &df, &an ); stat2[n].sigma = an.RSS/an.rdf; stat2[n].nalleles = an.nlevels; stat2[n].estimate = (double*)calloc(stat[n].nalleles,sizeof(double)); stat2[n].allele = (int*)calloc(stat[n].nalleles,sizeof(int)); stat2[n].count = (int*)calloc(stat[n].nalleles,sizeof(int)); for(m=0,k=0;k<256;k++) { if ( an.N[k] > 0 ) { stat2[n].estimate[m] = an.mean[k]; stat2[n].allele[m] = k; stat2[n].count[m] = an.N[k]; m++; } } stat2[n].nalleles = m; if ( perm != NULL ) { int m; double plogP; for (m=0;mnperm;m++) { df = 0; plogP = OneWayAnovaLog10pvalFast( perm->y[m], genotype, &an ); perm->maxlogP[m] = perm->maxlogP[m] > plogP ? perm->maxlogP[m] : plogP; } } } for(n=0;nnsite;n++) stat[n] = stat2[bd->sdp[n]]; } else { //use uncompressed // printf("uncompressed %d\n", bd->nsdp==0); stat = (SCAN_STATISTICS*)calloc(bd->nsite,sizeof(SCAN_STATISTICS)); for( n=0;nnsite;n++) { int df=0, m; for(k=0;kmat[n][Index2[k]]; // if ( bd->site[n] == 270264 && bd->chr==4 ) an.debug = 1; // if ( bd->site[n] == 260617 && bd->chr==4 ) an.debug = 1; // else an.debug = 0; an.debug=0; // logP[n] = OneWayAnovaLog10pval( N, y, genotype, NULL, &df, &an ); // if ( bd->site[n] == 260617 ) { // for(k=0;kid[Index1[k]], y[k],bd->id[Index2[k]], genotype[k]); // } // } stat[n].logP = OneWayAnovaLog10pval( N, y, genotype, NULL, &df, &an ); stat[n].sigma = an.RSS/an.rdf; stat[n].nalleles = an.nlevels; stat[n].estimate = (double*)calloc(stat[n].nalleles,sizeof(double)); stat[n].allele = (int*)calloc(stat[n].nalleles,sizeof(int)); stat[n].count = (int*)calloc(stat[n].nalleles,sizeof(int)); for(m=0,k=0;k<256;k++) { if ( an.N[k] > 0 ) { stat[n].estimate[m] = an.mean[k]; stat[n].allele[m] = k; stat[n].count[m] = an.N[k]; m++; } } stat[n].nalleles = m; if ( perm != NULL ) { int m; double plogP; // printf("permuting...\n"); for (m=0;mnperm;m++) { df = 0; plogP = OneWayAnovaLog10pvalFast( perm->y[m], genotype, &an ); perm->maxlogP[m] = perm->maxlogP[m] > plogP ? perm->maxlogP[m] : plogP; if ( m < 0 ) { printf("%d %e %e\n", m, plogP, perm->maxlogP[m]); } } // printf("done\n"); } } } if ( perm != NULL ) { for( n=0;nnsite;n++) { perm->mx = perm->mx > stat[n].logP ?perm->mx : stat[n].logP; } } free(y); free(genotype); free(Index1); free(Index2); return(stat); } return(NULL); } void FreePermutations( PERMUTATIONS *perm ) { int i; if ( perm != NULL ) { for(i=0;inperm;i++) free(perm->y[i]); free(perm->maxlogP); free(perm); } } PERMUTATIONS *MakePermutations( PHENOTYPE_TABLE *pt, BINARY_DATA *bd, int nperm ) { PERMUTATIONS *perm = (PERMUTATIONS*)calloc(1,sizeof(PERMUTATIONS)); int *Index1, *Index2; int N = IntersectIDs ( pt->id, bd->id, pt->nobs, bd->nseq, &Index1, &Index2 ); int i, j, *idx; double *Y; perm->nperm = nperm; perm->nobs = N; perm->y = (double**)calloc(nperm,sizeof(double*)); perm->maxlogP = (double*)calloc(nperm,sizeof(double)); idx = (int*)calloc(N,sizeof(int)); Y = (double*)calloc(N,sizeof(double)); for(i=0;ix[Index1[i]]; } for(i=0;iy[i] = (double*)calloc(N, sizeof(double)); PermuteArray(idx, N); for(j=0;jy[i][j] = Y[idx[j]]; } free(Y); free(Index1); free(Index2); free(idx); printf("made permutations\n"); return(perm); } double OneWayAnovaLog10pval( int nobs, double *phenotype, char *genotype, double *residuals , int *Df, ANOVA_TABLE *an) { // set *Df=0 on first call - repeated calls for multipe QTL mapping will increment it, used for forward selection int i, k; for(k=0;k<256;k++) { an->N[k]=an->mean[k]=an->SS[k]=0.0; } an->gm = an->nlevels = an->F = an->log10pval = an->TSS = an->FSS = an->df = 0; an->nobs = nobs; for(i=0;iN[g]++; an->mean[g] += x; an->gm += x; an->SS[g] += x2; an->TSS += x2; } an->gm /= nobs; for(k=0;k<256;k++) if ( an->N[k]>0 ) { an->nlevels = k+1; an->df++; an->mean[k] = an->mean[k]/an->N[k]-an->gm; an->SS[k] -= an->N[k]*an->mean[k]*an->mean[k]; if (an->debug ) printf("k%d mean %lg SS %lg\n", k, an->mean[k], an->SS[k]); } an->df -= 1; if ( an->df > 0 ) { an->TSS -= an->nobs*an->gm*an->gm; for(i=0;inlevels;i++) { an->FSS += an->mean[i]*an->mean[i]*an->N[i]; } an->RSS = an->TSS-an->FSS; an->rdf = an->nobs-an->df-1- *Df; *Df += an->df; //increment *Df an->F = (an->FSS / an->df)/(an->RSS/an->rdf); //Note this is the partial F test an->log10pval = Log10Fpval( an->F, an->df, an->rdf ); // {-log10(comp_F_Distribution(F,(int)df, (int)rdf)); if ( an->debug ) printf( "TSS %lf RSS %lf FSS %lf F %lf logp %lf df %lf rdf %lf\n", an->TSS, an->RSS, an->FSS, an->F, an->log10pval, an->df, an->rdf); } if ( an->debug ) { for(i=0;igm-an->mean[genotype[i]]; } return(an->log10pval); } double OneWayAnovaLog10pvalFast( double *phenotype, char *genotype, ANOVA_TABLE *an) { // Used for anova with a permuted set of phenotypes, after a call to OneWayAnovaLog10pval() to set up quantities that do not vary under permutation int i, k; an->log10pval = 0.0; if ( an->df > 0 ) { for(k=0;knlevels;k++) { an->mean[k]=an->SS[k]=0.0; } for(i=0;inobs;i++) { int g = genotype[i]; double x = phenotype[i]; double x2 = x*x; an->mean[g] += x; an->SS[g] += x2; } for(k=0;knlevels;k++) if ( an->N[k]>0 ) { an->mean[k] = an->mean[k]/an->N[k]-an->gm; an->SS[k] -= an->N[k]*an->mean[k]*an->mean[k]; } an->FSS=0.0; for(i=0;inlevels;i++) { an->FSS += an->mean[i]*an->mean[i]*an->N[i]; } an->RSS = an->TSS-an->FSS; an->F = (an->FSS / an->df)/(an->RSS/an->rdf); //Note this is the partial F test an->log10pval = Log10Fpval( an->F, an->df, an->rdf ); // {-log10(comp_F_Distribution(F,(int)df, (int)rdf)); } return(an->log10pval); } SCAN_DATA *NewScanData(int N ) { SCAN_DATA *sc = (SCAN_DATA*)calloc(1,sizeof(SCAN_DATA)); sc->N = N; sc->chr = (int*)calloc(sc->N,sizeof(int)); sc->bp = (int*)calloc(sc->N,sizeof(int)); sc->site = (int*)calloc(sc->N,sizeof(int)); sc->logP = (double*)calloc(sc->N,sizeof(double)); return(sc); } SCAN_DATA *PopulateScanData ( BINARY_DATA **gd, SCAN_STATISTICS **stats, int nchr, double thresh ) { int i, j=0, k; SCAN_DATA *sc; for(k=0;knsite;i++) j += stats[k][i].logP>=thresh; sc = NewScanData(j); j=0; for(k=0;knsite;i++) if ( stats[k][i].logP>=thresh) { sc->chr[j] = k; sc->bp[j] = gd[k]->site[i]; sc->site[j] = i; sc->logP[j] = stats[k][i].logP; j++; } } return(sc); } SCAN_DATA *ForwardSelection( PHENOTYPE_TABLE *pt, int nchr, BINARY_DATA **gd, ALLELE_DATA **ad, SCAN_STATISTICS **stats, double thresh, char *outfile ) { int i, j, k, ok=1, max_qtl=1000, n_qtl=0, df=0; SCAN_DATA *sc=NULL, *qtl=NULL; ANOVA_TABLE an, *anova = (ANOVA_TABLE*)calloc(max_qtl,sizeof(ANOVA_TABLE)); int *max = (int*)calloc(max_qtl, sizeof(int)); int *Index1, *Index2; int N = IntersectIDs ( pt->id, gd[0]->id, pt->nobs, gd[0]->nseq, &Index1, &Index2 ); double *residuals = (double*)calloc(N, sizeof(double)); char *genotype = (char*)calloc(N, sizeof(char)); FILE *fp = ( outfile != NULL ) ? fopen(outfile, "w") : stdout; double TSS=0, H=0; for(k=0;kx[Index1[k]]; sc = PopulateScanData ( gd, stats, nchr, thresh ); qtl = NewScanData(max_qtl); fprintf(fp, "trait n.qtl df chr bp logP H cum.H\n"); while(n_qtlN;j++) { if ( sc->logP[j] > mx ) { mx = sc->logP[j]; w = j; } } // printf ("w %d chr %d site %d mx %lf sc->N %d\n", w, sc->chr[w], sc->site[w], mx, sc->N ); if ( mx < thresh ) { ok = 0; } else { int chr = sc->chr[w]; int m=0; double h; int n; if ( gd[chr]->nsdp>0 ) n = gd[chr]->sdp[sc->site[w]]; // index in GENOME_DATA else n = sc->site[w]; for(j=0;jmat[n][Index2[j]]; qtl->logP[n_qtl] = OneWayAnovaLog10pval( N, residuals, genotype, residuals, &df, &anova[n_qtl] ); qtl->chr[n_qtl] = chr; qtl->bp[n_qtl] = sc->bp[w]; qtl->site[n_qtl] = sc->site[w]; if ( n_qtl == 0 ) TSS = anova[n_qtl].FSS + anova[n_qtl].RSS; h = anova[n_qtl].FSS/TSS; H = h + H; printf("n_qtl %d df %d chr %d n %d bp %d logP %.3f H %.4f cum_H %.4f\n", n_qtl, df, chr+1, n, sc->bp[w], qtl->logP[n_qtl], h, H); fprintf(fp, "%.20s %5d %3d %2d %12d %.3f %.4f %.4f\n", pt->name, n_qtl+1, df, chr+1, sc->bp[w], qtl->logP[n_qtl], h, H); n_qtl++; for( j=0;jN;j++) { int chr = sc->chr[j], Df=df, n; double lp; if ( gd[chr]->nsdp>0 ) n = gd[chr]->sdp[sc->site[j]]; else n = sc->site[j]; for(i=0;imat[n][Index2[i]]; an.debug=0; lp = OneWayAnovaLog10pval( N, residuals, genotype, NULL, &Df, &an ); if ( lp > thresh ) { sc->chr[m] = sc->chr[j]; sc->bp[m] = sc->bp[j]; sc->logP[m] = lp; sc->site[m] = sc->site[j]; m++; } } sc->N = m; } } { int *Index1, *Index2; int N = IntersectIDs ( pt->id, gd[0]->id, pt->nobs, gd[0]->nseq, &Index1, &Index2 ); int n; fprintf(fp, "\n\nID\t%s", pt->name); for(n=0;nchr[n]; int bp = qtl->bp[n]; int idx = ad[chr]->index[bp]; fprintf( fp, "\t%d-%d", chr, bp); } fprintf(fp, "\n"); for(k=0;kid[idx1], pt->x[idx1]); for(n=0;nchr[n]; int bp = qtl->bp[n]; int idx = ad[chr]->index[bp]; int g = gd[chr]->mat[idx][Index2[k]]; fprintf( fp, "\t%d", g); } fprintf( fp, "\n"); } } fclose(fp); return(qtl); } void WriteAnnotatedScan( PHENOTYPE_TABLE *pt, int nchr,SCAN_STATISTICS **stats, BINARY_DATA **bd, ALLELE_DATA **ad, double thresh, int max, char *filename ) { FILE *fp; fp = fopen(filename, "w"); if ( fp ) { SCAN_DATA *sc = PopulateScanData( bd, stats, nchr, thresh ); int k=0, i, n; int *Index1, *Index2; int N = IntersectIDs ( pt->id, bd[0]->id, pt->nobs, bd[0]->nseq, &Index1, &Index2 ); int *sc_idx = (int*)calloc(sc->N,sizeof(int)); for(i=0;iN;i++) sc_idx[i] = i; Scan = sc; qsort( sc_idx, sc->N, sizeof(int), scan_data_cmp ); while( k< sc->N && klogP[sc_idx[k]]>=thresh ) k++; qsort( sc_idx, k, sizeof(int), scan_data_cmp2 ); max = k; printf("writing annotations to %s\n", filename ); fprintf(fp, "trait\tchr\tbp\tlogP"); for(i=0;iNcol;i++) { fprintf( fp, "\t%s", ad[0]->col[i]); } fprintf(fp, "\n"); for(n=0;nchr[k]; int bp = sc->bp[k]; int idx = ad[chr]->index[bp]; fprintf( fp, "%s\t%d\t%d\t%.3f", pt->name, chr+1, bp, sc->logP[k]); for(i=0;iNcol;i++) { fprintf( fp, "\t%s", ad[chr]->complete[idx][i]); } fprintf(fp, "\n"); } fclose(fp); } } PHENOTYPE_TABLE **ReadPhenotypes( char *filename, char **phenotypes, int nphen ) { PHENOTYPE_TABLE **ptable = (PHENOTYPE_TABLE**)calloc(nphen,sizeof(PHENOTYPE_TABLE*)); int n; for(n=0;n=nobs ) { nobs *=2; id = (char**)realloc( id, nobs*sizeof(char*)); y = (double*)realloc( y, sizeof(double)); } while( tok ) { if ( idx == id_idx ) id[k] = strdup(tok); if ( idx == p_idx ) { double x=0; if ( sscanf( tok, "%lf", &x ) == 1) { y[k] = x; ok = 1; } } idx++; tok = strtok(NULL, sep); } if ( ok ) k++; } fclose(fp); ptable->id = id; ptable->x = y; ptable->nobs = k; ptable->name = strdup(phenotype); free(buffer); /* for(k=0;knobs;k++) printf("%s %lf\n", ptable->id[k], ptable->x[k]); */ return(ptable); } } free(buffer); } return(NULL); }