/* Author: petr.danecek@sanger Modified glfCheckGenotype to work with BCF. */ #include #include #include #include #include #include #include #include "glf.h" #include "array.h" #include "bgzf.h" #include "tools.h" #ifdef BCF #include #include #define debug(...) int get_sample_idx(bcf_hdr_t *header, const char *sample) { int i; if ( !strcmp(sample,"-") ) { return 0; } for (i=0; in_smpl; i++) { if ( !strcmp(sample,header->sns[i]) ) return i; } return -1; } uint8_t *get_PL(bcf1_t *bcf_line, int isample, int *n_pl_items) { *n_pl_items = bcf_line->n_alleles * (bcf_line->n_alleles + 1) / 2; int idat; for (idat=0; idatn_gi; idat++) { if (bcf_line->gi[idat].fmt != bcf_str2int("PL", 2)) continue; return (uint8_t*)bcf_line->gi[idat].data + isample*(*n_pl_items); } *n_pl_items = 0; return NULL; } uint16_t get_DP(bcf1_t *bcf_line, int isample) { int idat; for (idat=0; idatn_gi; idat++) { if (bcf_line->gi[idat].fmt != bcf_str2int("DP", 2)) continue; return ((uint16_t*)bcf_line->gi[idat].data)[isample]; } return (uint16_t)-1; } #define ACGTX_bits(x) x=='A'?1:(x=='C'?2:(x=='G'?4:(x=='T'?8:(x=='X'?15:0)))); void fill_lks(int nitems,uint8_t *items,bcf1_t *bcf_line,uint8_t *lks) { // Using the mapping A=1 C=2 G=4 T=8 X=15 // The likelihoods lks are ordered as AA AC AG AT CC CG CT GG GT TT // The BCF likelihoods are for REF=A ALT=G ordered as AA AG GG AX GX XX // Code the ref and alt alleles into bits int nalleles = 1, alleles[5] = {0,0,0,0,0}; alleles[0] = ACGTX_bits(*(bcf_line->ref)); char *p = bcf_line->alt; while (*p) { if ( *p==',' ) { p++; continue; } alleles[nalleles++] = ACGTX_bits(*p); p++; if ( nalleles>5 ) { fprintf(stderr,"FIXME: not ready for this, more than 5 alleles %s,%s\n",bcf_line->ref,bcf_line->alt); exit(1); } } // Which bases are "X"? int i,j,xi=-1; int xbits = 15; for (i=0; imax, int) ; lengths = arrayCreate (chromDict->max, int) ; chromosome_ranges(starts, lengths, chrArray, posArray); /* initialize the likelihoods structures */ lArray = arrayCreate(chromDict->max,Array); for (i=0; imax; i++) array(lArray,i,Array) = arrayCreate(arr(lengths,i,int),glf3_t); Array current_array; entropy = 0 ; /* Read the BCF file and pick up the sites listed in the genotype file. */ int ipos_gtfile = 0; int jpos_records = 0; bcf1_t *bcf_line = calloc(1, sizeof(bcf1_t)); int skip_chrom_id = -1; int prev_chrom_id = -1; int pos = 0, chromStart, chromEnd, chromStartIndex, chromEndIndex; while (vcf_read(bcf, bcf_header, bcf_line) > 0) { int chrom_id = bcf_line->tid; debug("BCF reading: %s %d [this=%d, skip=%d]\n", bcf_header->ns[chrom_id],bcf_line->pos+1,chrom_id,skip_chrom_id); if ( chrom_id==skip_chrom_id ) continue; skip_chrom_id = -1; // Consider only one-base records and ignore indels if ( bcf_is_indel(bcf_line) ) continue; if ( strlen(bcf_line->ref) > 1 ) continue; // New chromosome or first time here? if ( prev_chrom_id != chrom_id ) { // Get the chromosome name name = bcf_header->ns[chrom_id]; if (!dictFind (chromDict, name, &chrom)) { skip_chrom_id = chrom_id; continue; } // The array with the positions for this chromosome current_array = arr(lArray,chrom,Array); chromStartIndex = arr(starts,chrom,int); chromEndIndex = chromStartIndex + arr(lengths,chrom,int) -1; chromStart = arr(posArray,chromStartIndex,int); chromEnd = arr(posArray, chromEndIndex, int); ipos_gtfile = chromStartIndex; } prev_chrom_id = chrom_id; pos = bcf_line->pos+1; while (1) { if (pos < chromStart || pos > chromEnd || ipos_gtfile > chromEndIndex || pos < arr(posArray,ipos_gtfile,int)) { // Either the position pos in the bcf file is outside of the block of sites contained in the genotype file, // or the site i in the genotype is on different chromosome already (but we still must read the rest of the bcf file) // or the next site i in the genotype file is bigger than the position. // -> ignore the position pos in the bcf file and go to the next one. debug("break: %s %d (chromStart=%d,chromEnd=%d ipos_gtfile=%d chromEndIndex=%d, arrpos=%d)\n", bcf_header->ns[chrom_id],pos,chromStart,chromEnd,ipos_gtfile,chromEndIndex,arr(posArray,ipos_gtfile,int)); break; } if (pos > arr(posArray,ipos_gtfile,int)) { // The position pos in the bcf file is ahead of the site i in the genotype file. ipos_gtfile++; continue; } g3 = arrayp(current_array, jpos_records, glf3_t); g3->depth = get_DP(bcf_line,isample); g3->offset = pos; if ( bcf_line->ref[0] == 'A' ) g3->ref_base = 1; else if ( bcf_line->ref[0] == 'C' ) g3->ref_base = 2; else if ( bcf_line->ref[0] == 'G' ) g3->ref_base = 4; else if ( bcf_line->ref[0] == 'T' ) g3->ref_base = 8; else error("TODO: fixme %s:%d [%s]\n", name,pos,bcf_line->ref); int nitems = 0; uint8_t *items = get_PL(bcf_line,isample,&nitems); fill_lks(nitems,items,bcf_line,g3->lk); debug("Reading BCF: %s %d",name,g3->offset); double entropyBit = 0 ; int k; for (k=0; k<10; k++) { double x = expPhred(g3->lk[k]); entropyBit += x*g3->lk[k] ; sum += x ; debug(" %d",g3->lk[k]); } debug("\n"); entropy += entropyBit / sum ; ipos_gtfile++; jpos_records++; break; } } /* read and check the samples from the genotype file */ samples = arrayCreate (1024, SAMPLE) ; while (freadString(genotypeFile, name2, 256)) { debug("Evaluating %s...\n", name2); nread = fread(gList, sizeof(int), n, genotypeFile); if ( nread!=n ) error("Could not read from %s: fread returned %d\n", genotype_fname,nread); s = arrayp(samples, arrayMax(samples), SAMPLE) ; s->name = strdup (name2) ; s->likelihood = 0; s->score = INT_MAX; j=0; i=0; for (chrom=0; chrom < chromDict->max; chrom++) { int imax = arr(starts,chrom,int) + arr(lengths,chrom,int); if ( imax>=arrayMax(posArray) ) // >= or =?? error("FIXME: chrom=%d imax=%d (%d+%d) posArray max=%d\n", chrom,imax,arr(starts,chrom,int),arr(lengths,chrom,int),arrayMax(posArray)); current_array = arr(lArray,chrom,Array); i = arr(starts,chrom,int); // arr(posArray,i,int) now gives the position j = 0; while ( i arr(current_array,j,glf3_t).offset ) { // The site i in the genotype file is ahead of the position j in the glf file j++; continue; } if (arr(posArray,i,int) < arr(current_array,j,glf3_t).offset ) { // The site i in the genotype file is behind the position j in the glf file i++; continue; } #if 0 int ii; for (ii=0; ii<10; ii++) debug(" %d", arr(current_array,j,glf3_t).lk[ii]); debug("\t"); debug("pos .. %s:%d gList[%d]=%d depth=%d \t", dictName(chromDict,chrom), arr(posArray,i,int), i,gList[i],arr(current_array,j,glf3_t).depth); #endif glf3_t *g = arrp(current_array,j,glf3_t); if ( g->depth && ( (homs_only && (gList[i]==GT_AA || gList[i]==GT_CC || gList[i]==GT_GG || gList[i]==GT_TT)) || (!homs_only && gList[i]likelihood += g->lk[gList[i]]; if ( g->depth != (uint16_t)-1 ) s->depthsum += g->depth; s->Nsites++; if ( s->score==INT_MAX ) s->score=0; s->score += g->lk[gList[i]]; } debug(" .. min_lk=%d\n", g->min_lk); i++;j++; } } } for (i = 0, s=arrp(samples, 0, SAMPLE) ; i < arrayMax (samples) ; ++i, ++s) if ( s->Nsites!=0 ) s->score /= s->Nsites; /* sort and find the best match */ arraySort (samples, sampleSort) ; printf ("entropy %.1f, hets included %d\n", entropy,homs_only?0:1); for (i = 0, s=arrp(samples, 0, SAMPLE) ; i < arrayMax (samples) ; ++i, ++s) printf ("sample %s likelihood %d over %d sites, score %f, avg depth %.6f\n", s->name, s->likelihood,s->Nsites,s->score,((float)s->depthsum)/nsites_total); return 0; } #endif