/* File: glfCheckGenotype.c * Author: Richard Durbin (rd@sanger.ac.uk) * Copyright (C) Genome Research Limited, 2008 *------------------------------------------------------------------- * Description: * Exported functions: * HISTORY: * Last edited: Dec 20 09:38 2008 (rd) * Created: Tue Dec 9 17:52:22 2008 (rd) *------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include "glf.h" #include "array.h" #include "bgzf.h" #include "tools.h" #ifdef BCF int bcfCheckGenotype (int argc, char *argv[]); #endif void usage(void) { fprintf(stderr, "Usage: glf checkGenotype [OPTIONS] \n" #ifdef BCF " glf checkGenotype [OPTIONS] -s \n" #endif "Options:\n" " -h, --homs-only Homozygous positions only\n" " -s, --sample One of the BCF samples or \"-\" for the first sample. Required for BCF file, if not present, assuming GLF file\n" " -v, --verbose\n" "\n" "Comments:\n" " The binary genotype file hapmap.bin is made with hapmap2bin.\n" ); exit(-1); } typedef struct { char *name; int likelihood; int depthsum; int Nsites; float score; } SAMPLE ; int sampleSort (const void* vx, const void* vy) { SAMPLE *x = (SAMPLE*)vx, *y = (SAMPLE*)vy ; // The original approach does not work when samples have different number of positions, the // ones with fewer positions are always listed first. Try different approach, weight the likelihood // by Nsites. // // return x->likelihood - y->likelihood float stat = x->score - y->score; if ( stat<0 ) return -1; if ( stat>0 ) return 1; return 0; } int glfCheckGenotype (int argc, char *argv[]) { char name2[256] ; char *name; char *genotype_fname=NULL, *glf_fname=NULL; glfFile fpIn; FILE *genotypeFile=NULL, *glfIn=NULL; DICT *chromDict; /* chromosome names */ int i, j, n, len, chrom, nsites_total; size_t nread; int *gList; double sum, entropy = 0 ; Array chrArray, posArray, lArray, starts, lengths; // not used:, glfChrArray ; Array samples ; SAMPLE *s ; glf3_t *g3 ; glf3_header_t *h; int homs_only = 0; for (i=1; imax, int) ; lengths = arrayCreate (chromDict->max, int) ; chromosome_ranges(starts, lengths, chrArray, posArray); /* read the likelihoods from the GLF file */ lArray = arrayCreate(chromDict->max,Array); for (i=0; imax; i++) array(lArray,i,Array) = arrayCreate(arr(lengths,i,int),glf3_t); Array current_array; entropy = 0 ; /* initialise glf3 objects */ h = glf3_header_read(fpIn); g3 = glf3_init1(); // for (i = 0; i < n; i++) printf("%s\t%d\tN\tN\tM\n",dictName(chromDict,arr(chrArray,i,int)),arr(posArray,i,int)); /* Read the GLF file and pick up the sites listed in the genotype file. */ i = 0; j = 0; while ((name = glf3_ref_read(fpIn, &len)) != 0) { //debug("%s %d\n",name,n); if (!dictFind (chromDict, name, &chrom)) { // Skip this chromosome - it is not listed in the dictionary of the genotype file while (glf3_read1(fpIn, g3) && g3->rtype != GLF3_RTYPE_END); continue; } // The array with the positions for this chromosome current_array = arr(lArray,chrom,Array); int pos = 0, chromStart, chromEnd, j = 0, chromStartIndex, chromEndIndex; chromStartIndex = arr(starts,chrom,int); chromEndIndex = chromStartIndex + arr(lengths,chrom,int) -1; chromStart = arr(posArray,chromStartIndex,int); chromEnd = arr(posArray, chromEndIndex, int); i = chromStartIndex; int k, read_status; read_status = glf3_read1(fpIn, g3); pos += g3->offset; while (read_status && g3->rtype != GLF3_RTYPE_END) { if (pos + 1 < chromStart || pos + 1 > chromEnd || i > chromEndIndex || g3->rtype == GLF3_RTYPE_INDEL || pos + 1 < arr(posArray,i,int)) { // Either the position pos in the glf file is outside of the sites contained in the genotype file, // or the site i in the genotype is on different chromosome already (but we still must read the rest of the glf file) // or it is an indel, // or the next site i in the genotype file is bigger than the position. // -> ignore the position pos in the glf file and go to the next one. read_status = glf3_read1(fpIn, g3); pos += g3->offset; continue; } if (pos + 1 > arr(posArray,i,int)) { // The position pos in the glf file is ahead of the site i in the genotype file. i++; continue; } g3->offset = pos; array(current_array, j, glf3_t) = *g3 ; double entropyBit = 0 ; // int ii; // debug("%s:%d %d %d %d\t", dictName(chromDict,chrom), arr(posArray,i,int),pos,i,j); // for (ii=0; ii<10; ii++) // debug(" %d", g3->lk[ii]); // debug("\n"); if (g3->depth) { for (k=0; k<10; k++) { double x = expPhred(g3->lk[k]); entropyBit += x*g3->lk[k] ; sum += x ; } entropy += entropyBit / sum ; } i++; j++; } free(name); } /* clean everything up */ glf3_header_destroy(h); glf3_destroy1(g3); bgzf_close(fpIn); /* read and check the samples from the genotype file */ samples = arrayCreate (1024, SAMPLE) ; while (freadString(genotypeFile, name2, 256)) { debug("Evaluating %s...\n", name2); nread = fread(gList, sizeof(int), n, genotypeFile); if ( nread!=n ) error("Could not read from %s: fread returned %d\n", genotype_fname,nread); s = arrayp(samples, arrayMax(samples), SAMPLE) ; s->name = strdup (name2) ; s->likelihood = 0; s->score = INT_MAX; j=0; i=0; for (chrom=0; chrom < chromDict->max; chrom++) { int imax = arr(starts,chrom,int) + arr(lengths,chrom,int); if ( imax>=arrayMax(posArray) ) // >= or =?? error("FIXME: chrom=%d imax=%d (%d+%d) posArray max=%d\n", chrom,imax,arr(starts,chrom,int),arr(lengths,chrom,int),arrayMax(posArray)); current_array = arr(lArray,chrom,Array); i = arr(starts,chrom,int); // arr(posArray,i,int) now gives the position j = 0; while ( i arr(current_array,j,glf3_t).offset + 1) { // The site i in the genotype file is ahead of the position j in the glf file j++; continue; } if (arr(posArray,i,int) < arr(current_array,j,glf3_t).offset + 1) { // The site i in the genotype file is behind the position j in the glf file i++; continue; } #if 0 int ii; for (ii=0; ii<10; ii++) debug(" %d", arr(current_array,j,glf3_t).lk[ii]); debug("\t"); debug("pos .. %s:%d gList[%d]=%d depth=%d \t", dictName(chromDict,chrom), arr(posArray,i,int), i,gList[i],arr(current_array,j,glf3_t).depth); #endif glf3_t *g = arrp(current_array,j,glf3_t); if ( g->depth && ( (homs_only && (gList[i]==GT_AA || gList[i]==GT_CC || gList[i]==GT_GG || gList[i]==GT_TT)) || (!homs_only && gList[i]likelihood += g->lk[gList[i]]; s->depthsum += g->depth; s->Nsites++; if ( s->score==INT_MAX ) s->score=0; s->score += g->lk[gList[i]]; } //debug(" .. min_lk=%d\n", g->min_lk); i++;j++; } } } for (i = 0, s=arrp(samples, 0, SAMPLE) ; i < arrayMax (samples) ; ++i, ++s) if ( s->Nsites!=0 ) s->score /= s->Nsites; /* sort and find the best match */ arraySort (samples, sampleSort) ; printf ("entropy %.1f, hets included %d\n", entropy,homs_only?0:1); for (i = 0, s=arrp(samples, 0, SAMPLE) ; i < arrayMax (samples) ; ++i, ++s) printf ("sample %s likelihood %d over %d sites, score %f, avg depth %.6f\n", s->name, s->likelihood,s->Nsites,s->score,((float)s->depthsum)/nsites_total); return 0; } /*************** enf of file *****************/