/* File: glSubCall.c * Author: Richard Durbin (rd@sanger.ac.uk) * Copyright (C) Genome Research Limited, 2008 *------------------------------------------------------------------- * Description: call variants at predetermined list of sites from a .glf file * Exported functions: glfSubCall * HISTORY: * Last edited: May 4 20:40 2009 (lj4@sanger.ac.uk) * * April 30 15:24 2009 (lj4@sanger.ac.uk): updated code to handle GLF version 3 files * Created: Sun Sep 7 17:10:09 2008 (rd) *------------------------------------------------------------------- */ #include #include #include #include #include #include "glf.h" #include "bgzf.h" typedef struct { char *name ; int pos ; char c1, c2, c3 ; } SITE ; /****************************************************/ /* output format spec from http://maq.sourceforge.net/maq-manpage.shtml 080901 Extract SNP sites. Each line consists of chr chromosome, x position, r reference base, b consensus base, q Phred-like consensus quality, d read depth, n the average number of hits of reads covering this position, max_mapQ the highest mapping quality of the reads covering the position, nqs the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total), b2 the second best call, q2 log likelihood ratio of the second best and the third best call, b3 and the third best call. Heng uses q2 and b23 to give the SNP detection quality as follows: if b2 is the reference, then SNP quality is q, else if b3 is the reference then SNP quality is q + q2 (or if none of b, b2, b3 are the reference) I think the SNP detection quality should be logPhred(p_ref). So I will set q2 to be this - q, so that whenever snpfilter uses it the right value will be obtained. */ /****************************************************/ static void report (char *name, int pos, glf3_t *g, glf3_t *buf, SITE *site) { int i, k ; int k1, v1 = 256 ; int q, nqs = 255 ; /* the quality of the best call at this site, and of a SNP call */ q = quality(g) ; /* find best call */ for (k = 0 ; k < 10 ; ++k) if (g->lk[k] < v1) { k1 = k ; v1 = g->lk[k] ; } /* calculate nqs */ for (i = 0 ; i < 7 ; ++i, ++buf) if (buf != g) { int q = quality(buf) ; if (q < nqs) nqs = q ; } printf ("%s\t%d\t%c\t", name, pos+1, iupac[g->ref_base]) ; printf ("%c\t%d\t", iupac[glfBase[k1]], q) ; printf ("%d\t%.2f\t%d\t%d\t", g->depth, 0.00, g->rms_mapQ, nqs) ; /* can't give av_hits, so set to 0 */ printf ("%c\t%c\t%c\n", site->c1, site->c2, site->c3) ; } /**********************************************/ static SITE *readSite (FILE *fp) { static SITE s ; if (!s.name) s.name = malloc (1024) ; if (fscanf (fp, "%s\t%d\t%c\t%c\t%c\n", s.name, &s.pos, &s.c1, &s.c2, &s.c3) != 5) if (feof (fp)) return 0 ; else { fprintf (stderr, "Failed to read site properly\n") ; exit (-1) ; } else { --s.pos ; /* change to 0 offset */ return &s ; } } /**********************************************/ int glfSubCall (int argc, char *argv[]){ int nameLen, chrLen ; glfFile fpIn ; FILE *fpSites = 0 ; SITE *site ; --argc ; ++argv ; /* remove program command */ while (argc) if (!strcmp (*argv, "-sites") && argc >= 2) { if (!(fpSites = fopen (argv[1], "r"))) { fprintf (stderr, "failed to open sites file %s\n", argv[1]) ; return -1 ; } argc -= 2 ; argv += 2 ; } else if (!strcmp (*argv, "-help") || !strcmp (*argv, "-h")) { fprintf (stderr, "usage: glf subCall -sites in.refgeno < in.glz > out.geno\n") ; fprintf (stderr, "subCall calls specified bases from .glf file in maq geno.snp format.\n") ; fprintf (stderr, "Any priors information needs to be applied beforehand; this just normalises the likelihoods,\n") ; fprintf (stderr, "treating them as posteriors.\n") ; return 1 ; } else { fprintf (stderr, "unknown argument %s : type \"glf subCall -help\" for help\n", *argv) ; return -1 ; } if (!fpSites) { fprintf (stderr, "no sites file - type glf subCall -help for help\n") ; return -1 ; } if (!(fpIn = bgzf_fdopen(fileno(stdin), "r"))) { fprintf (stderr, "failed to open stdin as .glz file\n") ; return 1 ; } if (!(site = readSite (fpSites))) { fprintf (stderr, "sites file is empty\n") ; return 0 ; } while (site->pos < 3) if (!(site = readSite (fpSites))) return 0 ; glf3_header_t *h; char *name; glf3_t *g3, *g3Buff[7]; int len, pos[7]; h = glf3_header_read(fpIn); g3 = glf3_init1(); int i; for (i = 0; i < 7; i++) g3Buff[i] = glf3_init1(); while ((name = glf3_ref_read(fpIn, &len)) != 0) { for (i = 0; i < 7; i++) pos[i] = -1; if (strcmp (name, site->name)) while (glf3_read1(fpIn, g3) && g3 -> rtype != GLF3_RTYPE_END); // This doesn't really work with GLFv3 /* if (site->pos >= chrLen-3) */ /* { fprintf (stderr, "site pos %d is bigger than chrlen %d - 3 for chrom %s\n", site->pos, chrLen, name) ; */ /* return -1 ; */ /* } */ i = 0; while (glf3_read1(fpIn, g3Buff[i%7]) && g3Buff[i%7]->rtype != GLF3_RTYPE_END) { g3 = g3Buff[abs((i-3)%7)]; pos[abs((i-3)%7)] = pos[abs((i-4)%7)] + g3->offset; while (site->pos < pos[abs((i-3)%7)]) if (!(site = readSite(fpSites))) return 0; if (i >= 6 && pos[(i-3)%7] == site->pos) { report (name, pos[(i-3)%7], g3, *g3Buff,site) ; if (!(site = readSite (fpSites))) return 0 ; if (strcmp (name, site->name)){ while (glf3_read1(fpIn, g3Buff[i%7]) && g3Buff[i%7]->rtype != GLF3_RTYPE_END); break; } } i++; } free(name); } glf3_header_destroy(h); glf3_destroy1(g3); bgzf_close(fpIn); for (i = 0; i < 7; i++) glf3_destroy1(g3Buff[i]); return 0 ; } /************** end of file ************/