#include #include #include #include #include #include #include #include #include #include"reconstruction.h" // Compute the optimal mosaic of a MAGIC genome in terms of its founders, using dynamic programming int ReconstructMosaic( SEQ_CALL *sq, int penalty, int chr, char *chrom, FILE *fp, int backcross ) { int ns = sq->chr_end[chr]-sq->chr_start[chr]+1; //ns = number of sites if ( ns < 10 ) return(1); // too few sites int nf = sq->Nfounders; // nf = number of founders int offset = sq->chr_start[chr]; int i, j, k, p, y; char **mat = (char**)calloc(ns,sizeof(char*)); char **path = (char**)calloc(ns,sizeof(char*)); int **opt = (int**)calloc(ns,sizeof(int*)); int *step = (int*)calloc(ns+1,sizeof(int)); int *jump = (int*)calloc(ns+1,sizeof(int)); for(i=0;i= 0 ) { // data are from a backcross to the strain indexed by backcross. this means each allele can match either the backcross strain or another strain (a aspcial case of diploid) set backcross=-1 to disable this for(i=0;iallele[k] == sq->founder[k][j]) || (sq->allele[k] == sq->founder[k][backcross]); // mat[i][j] = indicates if allele at site i matches founder j or founder backcross } } else { // inbred for(i=0;iallele[k] == sq->founder[k][j]; // mat[i][j] = indicates if allele at site i matches founder j } } for(j=0;j opt[i][j] ) { opt[i][j] = x; path[i][j] = k; } } } } y = opt[ns-1][0]; p = 0; for(j=0;j y ) { y = opt[ns-1][j]; p = j; } } step[ns] = p; i = ns; while(i>0) { p = path[--i][p]; step[i] = p; } jump[0] = 0; k = 1; for(i=1;i<=ns;i++) { if ( step[i] != step[i-1] ) jump[k++] = i; } jump[k] = ns; sq->mosaic[chr] = (SEGMENTS*)calloc(k,sizeof(SEGMENTS)); for(i=0;ibp[offset+jump[i+1]-1]-sq->bp[offset+jump[i]]; int qlen = jump[i+1]-jump[i]; double q, r; int m=0; for(j=jump[i];j0 ? m/(double)qlen : 0; r = len >0 ? m/(double)len : 0; j=jump[i]; printf( "%s %s %s %10d %0d %10d %8d %8d %8d %8d %e %e\n", sq->name, chrom, sq->founder_name[step[j]], sq->bp[offset+j], sq->bp[offset+jump[i+1]-1], j, jump[i+1]-1, len, qlen, m, q, r ); fprintf( fp, "%s %s %s %10d %0d %10d %8d %8d %8d %8d %e %e\n", sq->name, chrom, sq->founder_name[step[j]], sq->bp[offset+j], sq->bp[offset+jump[i+1]-1], j, jump[i+1]-1, len, qlen, m, q, r ); sq->mosaic[chr][i].chr = chr; sq->mosaic[chr][i].from = sq->bp[offset+j]; sq->mosaic[chr][i].to = sq->bp[offset+jump[i+1]-1]; sq->mosaic[chr][i].id = step[j]; sq->mosaic[chr][i].error = q; } sq->mosaic[chr][0].from = 1; sq->nseg[chr] = k; free(jump); free(step); for(i=0;ichr_end[chr]-sq->chr_start[chr]+1; if ( ns > 10 ) { int nf = sq->Nfounders; int offset = sq->chr_start[chr]; int g, h, i, j, k, m, n, p, y, nf2=nf*(nf+1)/2; char **name = (char**)calloc(nf2,sizeof(char*)); float **tr = (float**)calloc(nf2, sizeof(float*)); float **mat = (float**)calloc(ns,sizeof(float*)); float **path = (float**)calloc(ns,sizeof(float*)); float **opt = (float**)calloc(ns,sizeof(float*)); int *step = (int*)calloc(ns+1,sizeof(int)); int *jump = (int*)calloc(ns+1,sizeof(int)); for(i=0;iallele[k] == sq->founder[k][j]); } else { mat[i][h] = ((sq->allele[k] == sq->founder[k][j]) || (sq->allele[k] == sq->founder[k][g])); //mat[i][h] = 0.5*((sq->allele[k] == sq->founder[k][j]) && (sq->allele[k] == sq->founder[k][g])) + ((sq->allele[k] == sq->founder[k][j]) && (sq->allele[k] != sq->founder[k][g])) + ((sq->allele[k] != sq->founder[k][j]) && (sq->allele[k] == sq->founder[k][g])); } // if ( i < 10 ) printf( " , %d %d %d = %g %c %c %c", h,j,g, mat[i][h], sq->allele[k] , sq->founder[k][j], sq->founder[k][g] ); } } // if( i < 10 ) printf ("\n" ); } for(j=0,h=0;jfounder_name[j], sq->founder_name[g]); /* printf("%d %d %d %s\n", j, g, h, name[h]);*/ } } for(j=0;j opt[i][j] ) { opt[i][j] = x; path[i][j] = k; } } } } y = opt[ns-1][0]; p = 0; for(j=0;j y ) { y = opt[ns-1][j]; p = j; } } step[ns] = p; i = ns; while(i>0) { p = path[--i][p]; step[i] = p; } jump[0] = 0; k = 1; for(i=1;i<=ns;i++) { if ( step[i] != step[i-1] ) jump[k++] = i; } jump[k] = ns; for(i=0;ibp[offset+jump[i+1]-1]-sq->bp[offset+jump[i]]; int qlen = jump[i+1]-jump[i]; double q, r; int m=0; for(j=jump[i];j0 ? m/(double)qlen : 0; r = len >0 ? m/(double)len : 0; j=jump[i]; fprintf( fp, "%s %s %s\t %10d %0d %10d %8d %8d %8d %8d %e %e\n", sq->name, chrom, name[step[j]], sq->bp[offset+j], sq->bp[offset+jump[i+1]-1], offset+j, offset+jump[i+1]-1, len, qlen, m, q, r ); } free(jump); free(step); for(i=0;icol = col; ad->Ncol = ncol; ad->Nsites = 0; ad->site = (int*)calloc(Nbuf,sizeof(int)); //base -pair coordinate of site ad->ok = (char*)calloc(Nbuf,sizeof(char)); //indicator that variant is a snp with no missing or het alleles ad->allele = (char**)calloc(Nbuf,sizeof(char*)); //allele truncated to first charcter (used for single-nucleotide chages) ad->complete = (char***)calloc(Nbuf,sizeof(char**)); //allele as full length ad->sdp = (char**)calloc(Nbuf,sizeof(char*)); //numerically coded alleles, represented as an array of chars (since there are never more than 127 alleles) } else { char *chr, *pse, *bp, *nalleles, *maf, *allele; int Pse, Bp=0, Nalleles=0, Maf=0, k, ok; double x; int sdp; chr = strtok( buffer, " \t"); if ( format==1) pse = strtok( NULL, " \t"); bp = strtok( NULL, " \t"); nalleles = strtok( NULL, " \t"); maf = strtok( NULL, " \t"); if ( nalleles != NULL && sscanf( nalleles, "%d", &Nalleles ) == 1 ) { sscanf( bp, "%lf", &x ); Bp = (int)x; if ( format ==1 ) { sscanf( pse, "%lf", &x ); Pse = (int)x; } else { Pse = Bp; } sscanf( maf, "%d", &Maf ); if ( n >= Nbuf ) { Nbuf *= 1.5; ad->site = (int*)realloc(ad->site,sizeof(int)*Nbuf); ad->ok = (char*)realloc(ad->ok,sizeof(char)*Nbuf); ad->allele = (char**)realloc(ad->allele,sizeof(char*)*Nbuf); ad->complete = (char***)realloc(ad->complete,Nbuf*sizeof(char**)); ad->sdp = (char**)realloc(ad->sdp,Nbuf*sizeof(char*)); } if ( n==1 ) ad->chr = strdup(chr); ad->complete[n] = (char**)calloc(ncol,sizeof(char*)); ad->allele[n] = (char*)calloc(ncol+1, sizeof(char)); ok = Nalleles == 2 ? 1: 0; //diallelic for(k=0;k 1) || !good[allele[0]]) ok = 0; ad->complete[n][k] = strdup(allele); ad->allele[n][k] = allele[0]; } ad->sdp[n] = (char*)calloc(ncol,sizeof(char)); ad->sdp[n][0] = 1; sdp = 1; for(k=1;ksdp[n][k] = -1; for(j=0;jcomplete[n][k],ad->complete[n][j]) ) { ad->sdp[n][k] = ad->sdp[n][j]; // sdp codes alleles in the order that the strains are encountered break; } } if ( ad->sdp[n][k] < 0 ) ad->sdp[n][k] = ++sdp; } ad->ok[n] = ok; ad->site[n] = Bp; n++; } nrow++; } } ad->Nsites = n; ad->max = ad->site[ad->Nsites-1]; ad->index = (int*)calloc(ad->max+1,sizeof(int)); // map bp to array index for(k=0;k<=ad->max;k++) ad->index[k] = -1; // initilise to -1 so nonpolymorphic sites map to -1 n=0; for(k=0;kNsites;k++) if ( ad->ok[k] ) { n++; ad->index[ad->site[k]] = k; } printf ( "read %d SNPs / %d variants for %d cols from %s\n", n, ad->Nsites, ad->Ncol, filename ); free(buffer); return(ad); } } // read in a genome's worth of founder allele data ALLELE_DATA **ReadGenomeAlleleData( char *dir, char *maskfile, int format, int nchr, char **chroms ) { int k=1, nseg=0; char filename[256]; ALLELE_DATA** add = (ALLELE_DATA**)calloc(100,sizeof(ALLELE_DATA)); SEGMENTS *mask = ReadGFF( maskfile, &nseg ); for( k=0;kindex[j] = -1; } } return(add); } int WriteSeqCalls ( SEQ_CALL **sq, int n, char *dir ) { int i; printf("in WriteSeqCalls n=%d\n", n); for( i=0;iname); WriteSeqCall( filename, sq[i]); } } // Write a text file of allele calls for one genome int WriteSeqCall( char *filename, SEQ_CALL *sq) { FILE *fp = fopen( filename, "w"); if ( NULL == fp ) { fprintf(stderr, "ERROR Could not open %s\n", filename); return(1); } else { int i; for (i=0;ilen;i++) { fprintf( fp, "Chr%d %d %c %c\n", sq->chr[i], sq->bp[i], sq->ref[i], sq->allele[i]); } fclose(fp); printf("wrote %d alleles to %s\n", sq->len, filename ); return(0); } } // Read in allele calls, filtering on the known alleles SEQ_CALL *ReadSeqCall( char *filename, ALLELE_DATA **add, int nchr, char **chroms ) { FILE *fp = fopen(filename, "r"); if ( NULL == fp ) { fprintf(stderr, "ERROR Could not open %s\n", filename); } else { SEQ_CALL *sq = (SEQ_CALL*)calloc(1,sizeof(SEQ_CALL)); int chr, bp, n=0, m=0; char r, a, p; sq->name = strdup(suffix(filename, '/')); sq->len=100000; sq->nchr = nchr; sq->chroms = chroms; sq->bp=(int*)calloc(sq->len,sizeof(int)); sq->chr=(int*)calloc(sq->len,sizeof(int)); sq->allele=(char*)calloc(sq->len,sizeof(char)); sq->ref=(char*)calloc(sq->len,sizeof(char)); sq->chr_start=(int*)calloc(1000,sizeof(int)); sq->chr_end=(int*)calloc(1000,sizeof(int)); sq->founder = (char**)calloc(sq->len,sizeof(char*)); sq->mosaic = (SEGMENTS**)calloc(nchr+1,sizeof(SEGMENTS*)); sq->nseg = (int*)calloc(nchr+1,sizeof(int)); for(chr=0;chr< nchr;chr++) { sq->chr_start[chr] = -1; sq->chr_end[chr] = -1; } int nn=0; char Chr[256]; while(fscanf( fp, "%chr%s %d %c %c\n", &p, Chr, &bp, &r, &a )==5 ) { if ( Chr[0] == 'X' ) chr = nchr-2; else if ( Chr[0] == 'Y' ) chr = nchr-1; else if ( sscanf(Chr, "%d", &chr ) ==1 ) chr--; // if ( nn++ < 100 ) printf( "%d %d %d %c %c %d %d\n", n, chr, bp, r, a, isalpha(a), add[chr]->max ); if ( isalpha(a) && bp <= add[chr]->max ) { int idx = add[chr]->index[bp] ; m++; if ( idx >= 0 ) { if ( n >= sq->len ) { sq->len *= 1.5; sq->bp=(int*)realloc(sq->bp, sq->len*sizeof(int)); sq->chr=(int*)realloc(sq->chr, sq->len*sizeof(int)); sq->allele=(char*)realloc(sq->allele, sq->len*sizeof(char)); sq->ref=(char*)realloc(sq->ref, sq->len*sizeof(char)); sq->founder = (char**)realloc(sq->founder, sq->len*sizeof(char*)); int j; for(j=n;jlen;j++) { sq->bp[j] = 0; sq->chr[j] = 0; sq->allele[j] = 0; sq->ref[j] = 0; sq->founder[j] = 0; } } sq->bp[n] = bp; sq->chr[n] = chr; sq->allele[n] = a; sq->ref[n] = r; sq->founder[n] = add[chr]->allele[idx]; if ( sq->chr_start[chr] == -1 || sq->chr_start[chr] > n ) sq->chr_start[chr] = n; if ( sq->chr_end[chr] < n ) sq->chr_end[chr] = n; if ( n==0 ) { sq->Nfounders = add[chr]->Ncol; sq->founder_name =add[chr]->col; } n++; } } } fclose(fp); sq->len = n; printf( "read %d/%d alleles from %s\n", n, m, filename ); if ( n==0 ) sq=NULL; return(sq); } } SEQ_CALL **ReadSeqCalls( char *dir, char *suffix, ALLELE_DATA **add, int nchr, char **chroms, int *nseq ) { glob_t globbuf; char pattern[256]; struct dirent **d; sprintf(pattern, "%s/*%s", dir, suffix ); printf("pattern %s\n", pattern); glob( pattern, 0, NULL, &globbuf ); int n = globbuf.gl_pathc; int chr; // if ( n > 21 ) n =21; if (n < 0) perror("scandir"); else { SEQ_CALL **seqs = (SEQ_CALL**)calloc(n,sizeof(SEQ_CALL)); int k=0,i, penalty=100; for(i=0;iname); free(sq->bp); free(sq->chr); free(sq->allele); free(sq->ref); free(sq->chr_start); free(sq->chr_end); free(sq->founder); for(c=0;cnchr;c++) free(sq->mosaic[c]); free(sq->mosaic); free(sq->nseg); free(sq); } SEGMENTS *ReadGFF( char *filename, int *nsegments ) { FILE *fp = fopen(filename, "r"); if ( fp == NULL ) { fprintf(stderr, "ERROR Could not open %s\n", filename); } else { int chr, from, to, nseg=100000, n=0; char build[256], type[256]; SEGMENTS *segs = (SEGMENTS*)calloc(nseg,sizeof(SEGMENTS)); while( fscanf( fp, "Chr%d %s %s %d %d", &chr, build, type, &from, &to ) ==5 ) { if ( n >= nseg ) { nseg *= 1.5; segs = (SEGMENTS*)realloc( segs, sizeof(SEGMENTS)*nseg); } segs[n].chr = chr; segs[n].from = from; segs[n].to = to; n++; } *nsegments = n; return(segs); } } static char **Mat; static int Nseq; static int sdpcmp( const void *a, const void *b ) { const int *A = (int* const)(a); const int *B = (int* const)(b); char *MA = Mat[*A]; char *MB = Mat[*B]; int k=0, d=0; while( kNsites,sizeof(char*)); char **hmat = (char**)calloc(ad->Nsites,sizeof(char*)); int *ptr = (int*)calloc(ad->Nsites,sizeof(int)); int *sdp = (int*)calloc(ad->Nsites,sizeof(int)); int *use = (int*)calloc(ad->Nsites,sizeof(int)); int *hbreak = (int*)calloc(ad->Nsites, sizeof(int)); int *hsite = (int*)calloc(ad->Nsites, sizeof(int)); int breaks = 0; for(i=0;iNsites;i++) { mat[i] = (char*)calloc(nseq+1,sizeof(char)); hmat[i] = (char*)calloc(nseq+1,sizeof(char)); ptr[i] = i; } for(s=0;snseg[c];k++) { int t = sq->mosaic[c][k].id; //strain id of the mosiac segment int from_idx = add[c]->index[sq->mosaic[c][k].from]; // start index int to_idx = add[c]->index[sq->mosaic[c][k].to]; //end index if ( k==0 ) from_idx = 1; if ( k==sq->nseg[c]-1) to_idx = ad->Nsites-1; for(m=from_idx; m<=to_idx;m++) { mat[m][s] = ad->sdp[m][t]; // mat[site][line] = coded genotype = numeric allele of strain t at site m hmat[m][s] = t; // hmat[site][line] = founder strain at site m in line s is t } } } // find all recombination breakpoints hbreak[0] = 0; breaks=0; hsite[0] = 1; for(m=1;mNsites;m++) { int k=0, d=0; while( ksite[m]; } } breaks++; printf("breaks %d c %d %lu\n", breaks, c, breaks*(nseq*sizeof(char)+sizeof(int))); // write the haplotype data sprintf( type, "type=uncompressed binary haplotype"); fwrite( type, sizeof(char), 40, hubfp); fwrite( &nseq, sizeof(int), 1, hubfp ); fwrite( &breaks, sizeof(int), 1, hubfp ); fwrite( &c, sizeof(int), 1, hubfp ); for(m=0;mname)+1; fwrite( &n, sizeof(int), 1, hubfp); fwrite( sqq[m]->name, sizeof(char), strlen(sqq[m]->name)+1, hubfp ); } for(m=0;mNsites, sizeof(int), 1, ubfp ); fwrite( &c, sizeof(int), 1, ubfp ); for(m=0;mname)+1; fwrite( &n, sizeof(int), 1, ubfp); fwrite( sqq[m]->name, sizeof(char), strlen(sqq[m]->name)+1, ubfp ); } for(m=0;mNsites;m++) fwrite( mat[m], sizeof(char), nseq, ubfp ); fwrite( ad->site, sizeof(int), ad->Nsites, ubfp ); fclose(ubfp); //ptr is a sorted vector of indices into mat, sorting sites so that those with the same sdp are adjacent qsort( &ptr[1], ad->Nsites-1, sizeof(int), sdpcmp ); sdp_n = 0; sdp[0] = sdp_n; for(m=1;mNsites;m++) { int idx = ptr[m]; int different=sdpcmp(&(ptr[m]),&(ptr[m-1]))!=0; sdp_n += different; //increment the sdp_n counter if different sdp[idx] = sdp_n; use[sdp_n] = idx; // use[sdp] = index of last occurence of that sdp in mat } // binary output // output format of binary files: // (0) 40 char text header // (i) four integers giving: nseq sdp_n NSites chr_id // (ii) for each sequence, the length(name) and name of the sequence // (iii) for each distinct sdp its strain distribution pattern across the sequences // the bp positions of all Nsites variable sites // the sdp id at each variable site for(i=0;i<256;i++) type[i] = 0; sprintf( type, "type=compressed binary"); fwrite( type, sizeof(char), 40, bfp); fwrite( &nseq, sizeof(int), 1, bfp ); fwrite( &sdp_n, sizeof(int), 1, bfp ); fwrite( &ad->Nsites, sizeof(int), 1, bfp ); fwrite( &c, sizeof(int), 1, bfp ); for(m=0;mname)+1; fwrite( &n, sizeof(int), 1, bfp); fwrite( sqq[m]->name, sizeof(char), strlen(sqq[m]->name)+1, bfp ); } for(m=0;m<=sdp_n;m++) fwrite( mat[use[m]], sizeof(char), nseq, bfp ); fwrite( ad->site, sizeof(int), ad->Nsites, bfp ); fwrite( sdp, sizeof(int), ad->Nsites, bfp ); fclose(bfp); // ascii output fprintf( fp, "SDP"); for(m=1;m<=sdp_n;m++) fprintf(fp, " %d", m); fprintf(fp, "\n"); for(m=1;mNsites;m++) fprintf(sdp_fp, "%d %d %d\n", c, ad->site[m], sdp[m] ); fclose(sdp_fp); for(s=0;sname ); for(m=1;m<=sdp_n;m++) fprintf(fp," %d", mat[use[m]][s]); fprintf(fp, "\n"); } fclose(fp); } } } //char *suffix( char *s, char sep ) { // int n = strlen(s)-1; // while(n>0 && s[n] != sep ) n--; // if ( s[n] == sep ) n++; // return(&(s[n])); // } BINARY_DATA *ReadBinaryData( char *filename) { FILE *fp = fopen( filename, "r" ); double bytes=0; if ( fp != NULL ) { int nsdp=0, nsite, nseq, chr; char type[50]; BINARY_DATA *bd=(BINARY_DATA*)calloc(1,sizeof(BINARY_DATA)); int n = fread( type, sizeof(char), 40, fp ); if ( !strcmp( type, "type=compressed binary") ) { n += fread( &nseq, sizeof(int), 1, fp ); n += fread( &nsdp, sizeof(int), 1, fp ); n += fread( &nsite, sizeof(int), 1, fp ); n += fread( &chr, sizeof(int), 1, fp ); type[40] = 0; printf( "reading from %s %s nsdp %d nsite %d nseq %d chr %d Gb %.3f\n", filename, type, nsdp, nsite, nseq, chr, nsdp*nseq/1.0e9 ); if ( nseq > 0 && nsdp >0 && nsite >0 && chr > 0 ) { int s, m; bd->nseq = nseq; bd->nsdp = nsdp; bd->nsite = nsite; bd->chr = chr; bd->uncompressed = 0; bd->haplotype = 0; bd->mat = (char**)calloc( nsdp+1, sizeof(char*)); bytes += (nsdp+1)*sizeof(char*); bd->id = (char**)calloc( nseq, sizeof(char*)); bytes += nseq*sizeof(char*); bd->sdp = (int*)calloc( nsite, sizeof(int)); bytes += nsite*sizeof(int); bd->site = (int*)calloc( nsite, sizeof(int)); bytes += nsite*sizeof(int); for(s=0;sid[s] = t; } for(m=0;m<=nsdp;m++) { char *buf = (char*)calloc(nseq, sizeof(char)); bytes += nseq*sizeof(char); n += fread( buf, sizeof(char), nseq, fp); bd->mat[m] = buf; } n += fread(bd->site, sizeof(int), nsite, fp); n += fread(bd->sdp, sizeof(int), nsite, fp); } } else if( (! strcmp( type, "type=uncompressed binary haplotype")) || (! strcmp( type, "type=uncompressed binary") ) ){ n += fread( &nseq, sizeof(int), 1, fp ); n += fread( &nsite, sizeof(int), 1, fp ); n += fread( &chr, sizeof(int), 1, fp ); type[30] = 0; printf( "reading from %s %s nsdp %d nsite %d nseq %d chr %d Gb %.3f\n", filename, type, nsdp, nsite, nseq, chr, nsite*nseq/1.0e9 ); if ( nseq > 0 && nsite >0 && chr > 0 ) { int s, m; bd->nseq = nseq; bd->nsdp = 0; bd->nsite = nsite; bd->chr = chr; bd->uncompressed = 1; bd->haplotype = (strcmp( type, "type=uncompressed binary haplotype") == 0 ); bd->mat = (char**)calloc( nsite+1, sizeof(char*)); bytes += (nsite+1)*sizeof(char*); bd->id = (char**)calloc( nseq, sizeof(char*)); bytes += nseq*sizeof(char*); bd->site = (int*)calloc( nsite, sizeof(int)); bytes += nsite*sizeof(int); for(s=0;sid[s] = t; } for(m=0;mmat[m] = buf; } n += fread(bd->site, sizeof(int), nsite, fp); } } fclose(fp); printf(" %lf Gb allocated, nseq=%d\n", bytes/1.0e9, bd->nseq); return(bd); } fprintf(stderr, "ERROR, could not open %s\n", filename ); return(NULL); } BINARY_DATA **ReadBinaryGenome( int nchr, char *dir, int uncompressed, int haplotype ) { char filename[256]; int chr; BINARY_DATA **bd = (BINARY_DATA**)calloc(nchr, sizeof(BINARY_DATA*)); for(chr=1;chr<=nchr;chr++) { if ( haplotype ) { sprintf( filename, "%s/chr%d.haplotype.Data", dir, chr); } else if ( uncompressed ) sprintf( filename, "%s/chr%d.uncompressed.Data", dir, chr); else sprintf( filename, "%s/chr%d.imputed.Data", dir, chr); bd[chr-1] = ReadBinaryData( filename ); } return(bd); } int WriteBinaryAlleleData( ALLELE_DATA *ad, char *filename ) { FILE *fp = fopen( filename, "w"); if ( fp ) { int n = strlen(ad->chr)+1; int i, j; printf("writing %s\n", filename ); fwrite( &ad->Nsites, 1, sizeof(int), fp ); fwrite( &ad->Ncol, 1, sizeof(int), fp); fwrite( &n, 1, sizeof(int), fp ); fwrite( ad->chr, n, sizeof(char), fp); for(i=0;iNcol;i++) { n = strlen(ad->col[i])+1; fwrite(&n,1,sizeof(int), fp); fwrite(ad->col[i], n, sizeof(char), fp); } fwrite( ad->site, ad->Nsites, sizeof(int), fp ); fwrite( ad->ok, ad->Nsites, sizeof(char), fp ); for(i=1;iNsites;i++) { fwrite( ad->allele[i], ad->Ncol, sizeof(char), fp); fwrite( ad->sdp[i], ad->Ncol, sizeof(char), fp); for(j=0;jNcol;j++) { n = strlen(ad->complete[i][j])+1; fwrite(ad->complete[i][j], n, sizeof(char), fp); } } fclose(fp); return(1); } return(0); } ALLELE_DATA **ReadAllBinaryAlleleData( char *dir, int index_all, int nchr, char **chroms ) { char filename[256]; int k; ALLELE_DATA **ad = (ALLELE_DATA**)calloc(nchr, sizeof(ALLELE_DATA*)); for(k=0;kNsites, 1, sizeof(int), fp); br += fread( &ad->Ncol, 1, sizeof(int), fp); br += fread( &n, 1, sizeof(int), fp); bytes = ad->Nsites*ad->Ncol*5; bytes100 = bytes-100; buffer = (char*)calloc(bytes,sizeof(char)); br += fread( buffer, n, sizeof(char), fp); // fprintf(stderr, "here 1\n"); ad->chr = strdup(buffer); ad->col = (char**)calloc(ad->Ncol,sizeof(char*)); for(i=0;iNcol;i++) { int nn; br += fread( &nn, 1, sizeof(int), fp); br += fread( buffer, nn, sizeof(char), fp); ad->col[i] = strdup(buffer); // fprintf(stderr, "%d %s\n", i, ad->col[i]); } // fprintf(stderr, "here 2\n"); ad->site = (int*)calloc(ad->Nsites+1, sizeof(int)); br += fread( ad->site, ad->Nsites, sizeof(int), fp); ad->ok = (char*)calloc(ad->Nsites+1, sizeof(char)); br += fread( ad->ok, ad->Nsites, sizeof(char), fp); // fprintf(stderr, "here 3\n"); ad->allele = (char**)calloc(ad->Nsites,sizeof(char*)); ad->sdp = (char**)calloc(ad->Nsites,sizeof(char*)); ad->complete = (char***)calloc(ad->Nsites,sizeof(char**)); buf_ptr = buffer; idx = 0; for(i=1;iNsites;i++) { ad->allele[i] = (char*)calloc(ad->Ncol, sizeof(char)); br += fread(ad->allele[i], ad->Ncol, sizeof(char), fp); ad->sdp[i] = (char*)calloc(ad->Ncol, sizeof(char)); br += fread(ad->sdp[i], ad->Ncol, sizeof(char), fp); ad->complete[i] = (char**)calloc(ad->Ncol, sizeof(char*)); for(j=0;jNcol;j++) { int m; buf_ptr = &buffer[idx]; while( m=getc(fp) ) { if ( idx > bytes100 ) { char *tmp; bytes = 10000000; bytes100 = bytes-100; tmp = (char*)calloc( bytes, sizeof(char)); strncpy(tmp,buf_ptr,n); buf_ptr = buffer = tmp; idx = n-1; // printf("calloc %d\n", bytes); } buffer[idx++] = m; } buffer[idx++] = 0; ad->complete[i][j] = buf_ptr; } } // fprintf(stderr, "here 6\n"); fclose(fp); ad->max = ad->site[ad->Nsites-1]; ad->index = (int*)calloc(ad->max+1,sizeof(int)); // map bp to array index for(i=0;i<=ad->max;i++) ad->index[i] = -1; // initilise to -1 so nonpolymorphic sites map to -1 for(i=0;iNsites;i++) if ( index_all || ad->ok[i] ) { n++; ad->index[ad->site[i]] = i; } printf ( "read %d sites / %d rows for %d cols from %s\n", n, ad->Nsites, ad->Ncol, filename ); return(ad); } return(NULL); } MOSAIC *ReadMosaic( char *mosaicfile, char **strings, int nstrings ) { FILE *fp = fopen( mosaicfile, "r" ); if ( fp ) { MOSAIC *mosaic = (MOSAIC*)calloc(1, sizeof(MOSAIC)); int max_segs = mosaic->nsegmax = 1000000; int chr, frombp, tobp, fromsite, tosite, lenbp, sites, errors, last_id=-1, i; char id[256], acc[256], buf[1024], *bu; double errorsite, errorbp, x, y; printf("reading from mosaicfile %s\n", mosaicfile); mosaic->seg = (MOSAIC_SEGMENT*)calloc(max_segs, sizeof(MOSAIC_SEGMENT)); mosaic->idx = (int*)calloc(max_segs, sizeof(int)); mosaic->dict_ids = (DICTIONARY*)NewDictionary( 10000); //assumes this is big enough not to check! mosaic->dict_accs = (DICTIONARY*)NewDictionary( 10000); for(i=0;idict_accs ); for(i=0;idict_accs ); printf( "%d %d %s\n", i, id, strings[i]); } bu = fgets(buf, 1024, fp); // skip the header while( fgets(buf, 1024, fp) ) { // printf("%s\n", buf); if ( sscanf(buf, "%s %d %s %d %d %d %d", id, &chr, acc, &frombp, &tobp, &fromsite, &tosite) == 7 ) { int seq_id = UpdateDictionary( id, mosaic->dict_ids); int acc_id = UpdateDictionary( acc, mosaic->dict_accs); printf( "%s %d %s %d\n", id, seq_id, acc, acc_id); if ( mosaic->nsegs >= max_segs ) { mosaic->nsegmax = max_segs *= 1.5; mosaic->seg = (MOSAIC_SEGMENT*)realloc(mosaic->seg, max_segs*sizeof(MOSAIC_SEGMENT)); } mosaic->seg[mosaic->nsegs].id = seq_id; mosaic->seg[mosaic->nsegs].acc = acc_id; mosaic->seg[mosaic->nsegs].chr = chr; mosaic->seg[mosaic->nsegs].frombp = frombp; mosaic->seg[mosaic->nsegs].tobp = tobp; mosaic->seg[mosaic->nsegs].fromsite = fromsite; mosaic->seg[mosaic->nsegs].tosite = tosite; mosaic->seg[mosaic->nsegs].errors = errors; if ( seq_id != last_id ) mosaic->idx[seq_id] = mosaic->nsegs; last_id = seq_id; mosaic->nsegs++; } } mosaic->idx[last_id+1] = mosaic->nsegs; fclose(fp); mosaic->nseqs = mosaic->dict_ids->nwords; printf("read %d segments %d sequences\n", mosaic->nsegs, mosaic->dict_ids->nwords); return(mosaic); } return(NULL); } void ImputeFromMosaic( MOSAIC *mosaic, ALLELE_DATA **ad, int nchr, char *outdir ) { int chr; for( chr=1;chr<=nchr;chr++) { ALLELE_DATA *a = ad[chr]; int nseqs = mosaic->nseqs, nsites=a->Nsites, i, j, k, jj; DICTIONARY *dict = mosaic->dict_ids; char **mat = (char**)calloc(nsites,sizeof(char*)); FILE *ubfp, *fp; char binaryfile[256], textfile[256], type[256]; int *inv = (int*)calloc(nseqs,sizeof(int)); for(i=0;iwords[i].id] = i; } for(i=0;iwords[jj].id; int idx_from = mosaic->idx[j]; int idx_to = mosaic->idx[j+1]; int k, n=0, m, q; int *right = (int*)calloc(idx_to-idx_from+2,sizeof(int)); int *state = (int*)calloc(idx_to-idx_from+2,sizeof(int)); printf("j=%d check %d from %d to %d\n", j, mosaic->seg[idx_from].id, idx_from, idx_to); right[0] = 1; for(k=idx_from;kseg[k].chr==chr ) { state[n] = mosaic->seg[k].acc; right[n++] = mosaic->seg[k].tobp; printf( "k=%d n=%d state=%d right=%d\n", k, n-1, state[n-1], right[n-1]); } } right[n] = right[n-1] + 1000000; state[n] = state[n-1]; n++; q = 1; //site index m = 0; //segment index printf("here j %d nsites %d\n", j, nsites); while( msite[q]; while ( bp > right[m] && m < n ) m++; mat[q][j] = a->sdp[q][state[m]]; if ( bp==260617 ) printf("bp=%d q=%d j=%d id=%s state=%d %s mat=%d\n", bp, q, j, mosaic->dict_ids->words[j].word, state[m],mosaic->dict_accs->words[state[m]].word, mat[q][j]); q++; } free(state); free(right); } sprintf( binaryfile, "%s/chr%d.uncompressed.Data", outdir, chr); sprintf( textfile, "%s/chr%d.uncompressed.txt", outdir, chr); ubfp = fopen(binaryfile, "w"); fp = fopen(textfile, "w"); sprintf( type, "type=uncompressed binary"); fwrite( type, sizeof(char), 40, ubfp); fwrite( &nseqs, sizeof(int), 1, ubfp ); fwrite( &nsites, sizeof(int), 1, ubfp ); fwrite( &chr, sizeof(int), 1, ubfp ); int m; for(m=0;mwords[m].word; int n = strlen(w)+1; fwrite( &n, sizeof(int), 1, ubfp); fwrite( w, sizeof(char), strlen(w)+1, ubfp ); fprintf( fp, "%s ", w); } fprintf(fp, "\n"); for(m=0;mwords[mm].id], ubfp); } // fwrite( mat[m], sizeof(char), 1nseqs, ubfp ); fprintf(fp, "%d ", a->site[m]); for( k=0;ksite, sizeof(int), nsites, ubfp ); fclose(ubfp); fclose(fp); } } char **TheChromosomes( char *species, int *nchr ) { static char *human_chrom[24] = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"}; static char *mouse_chrom[22] = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "X", "Y"}; static char *arabidopsis_chrom[5] = {"1", "2", "3", "4", "5"}; if ( ! strcmp( species, "human" )) { *nchr = 24; return( human_chrom ); } else if ( ! strcmp( species, "mouse" )) { *nchr = 21; return( mouse_chrom ); } else if ( ! strcmp( species, "arabidopsis" )) { *nchr = 5; return( arabidopsis_chrom ); } else { *nchr = 0; return(NULL); } } double NewRecombination( double bp_per_recomb ) { int r = random(); double y = (double)r/(double)RAND_MAX; return(-log(y)*bp_per_recomb); } int NewState( int old_state, int states ) { int n=-1; do { int r = random(); double y = states*(double)r/(double)RAND_MAX; n = (int)y; } while( n == old_state); return(n); } int BernouilliTrial( double p ) { int r = random(); double y = (double)r/(double)RAND_MAX; return(y<=p); } void Simulate( int nchr, ALLELE_DATA **add, int diploid, double bp_per_recomb, int N, char *filename) { FILE *fp = fopen(filename, "w"); char mosaicfile[256]; sprintf(mosaicfile, "%s.mosaic", filename ); FILE *mosaicfp = fopen(mosaicfile, "w"); MOSAIC_SEGMENT *mosaic = (MOSAIC_SEGMENT*)calloc(N, sizeof(MOSAIC_SEGMENT)); if ( fp && mosaicfp) { time_t seconds = time(NULL); int sites=0, i; double p, r; srandom(seconds); for(i=0;iNsites; p = (double)N/(double)sites; for(i=0;iNcol; double len = a->site[a->Nsites-1]-a->site[0]+1; int last = a->site[a->Nsites-1]; if ( diploid ) { char rel; int state1, last_state1=0; int state2, last_state2=0; int x=a->site[1], last_x = a->site[1], m=0, last_m, k=1; state1 = NewState( state1, states); state2 = NewState( state2, states); while( x < last ) { last_state1 = state1; last_state2 = state2; rel = ( state1 == state2 ) ? '=' : '*'; r = NewRecombination(bp_per_recomb/2.0); last_x = x; x += r; if ( BernouilliTrial(0.5) ) state1 = NewState(state1, states); else state2 = NewState(state2, states); last_m = m; while ( a->site[k] < x && k < a->Nsites ) { if ( BernouilliTrial( p ) ) { m++; if ( BernouilliTrial(0.5)) fprintf( fp, "chr%s %d %s %s\n", a->chr, a->site[k], a->complete[k][0], a->complete[k][last_state1] ); else fprintf( fp, "chr%s %d %s %s\n", a->chr, a->site[k], a->complete[k][0], a->complete[k][last_state2] ); } k++; } fprintf( mosaicfp, "%s %s %s%c%s %d %d %d %d %d %d 0 0 0\n", filename, a->chr, a->col[last_state1], rel, a->col[last_state2], last_x+1, x, last_m+1, m, x-last_x, m-last_m ); } } else { int j = 0, k=1, x=a->site[1], last_x=a->site[1]; int m=0, last_m = 0; int state, last_state=0; state = NewState( state, states ); while ( x < last ) { last_x = x; x += NewRecombination(bp_per_recomb); last_state = state; state = NewState(state, states); last_m = m; while ( a->site[k] < x && k < a->Nsites ) { if ( BernouilliTrial( p ) ) { m++; fprintf( fp, "chr%s %d %s %s\n", a->chr, a->site[k], a->complete[k][0], a->complete[k][last_state] ); } k++; } fprintf( mosaicfp, "%s %s %s %d %d %d %d %d %d 0 0 0\n", filename, a->chr, a->col[last_state], last_x+1, x, last_m+1, m, x-last_x, m-last_m ); } } } fclose(fp); fclose(mosaicfp); } }