/* File: glfStats.c * Author: Richard Durbin (rd@sanger.ac.uk) * Copyright (C) Genome Research Limited, 2008 *------------------------------------------------------------------- * Description: Calculates per-chromosome statistics from a .glf file * Exported functions: glfStats * HISTORY: * Last edited: May 4 16:36 2009 (lj4@sanger.ac.uk) * * April 30 15:24 2009 (lj4@sanger.ac.uk): updated code to handle GLF version 3 files * Created: Tue Aug 5 00:37:14 2008 (rd) *------------------------------------------------------------------- */ #include #include #include #include #include #include #include "glf.h" #include "bgzf.h" static int HIST = 1 ; static int IGNORE_N = 0 ; static int nBase = 0 ; /******************* reporting *******************/ static char topLines[3][128] ; static char histLines[256][128] ; static void report (char *label, unsigned int *hist) { int i ; unsigned int n = 0 ; double sum = 0, sumsq = 0 ; static int isFirst = 1 ; if (isFirst) { isFirst = 0 ; for (i = 0 ; i < 3 ; ++i) *topLines[i] = 0 ; for (i = 0 ; i < 256 ; ++i) sprintf (histLines[i], "%d", i) ; } for (i = 0 ; i < 256 ; ++i) { n += hist[i] ; sum += i*hist[i] ; sumsq += i*i*hist[i] ; } if (strlen(label) > 15) label[15] = 0 ; /* prevents ugliness*/ sprintf (&topLines[0][strlen(topLines[0])], "\t%15s", label) ; if (nBase != n) sprintf (&topLines[1][strlen(topLines[1])], "\t%10d %4.1f", n, (100*n)/(double)nBase) ; else sprintf (&topLines[1][strlen(topLines[1])], "\t%10d 100", n) ; if (n > 1) sprintf (&topLines[2][strlen(topLines[2])], "\t %5.1f %5.1f", sum/n, sqrt((sumsq - sum*sum/n)/(n-1))) ; else sprintf (&topLines[2][strlen(topLines[2])], " ") ; for (i = 0 ; i < 256 ; ++i) sprintf (&histLines[i][strlen(histLines[i])], "\t%10d %4.1f", hist[i], n ? 100*hist[i]/(double)n : 0) ; } static void printReport (void) { int i ; printf ("%d bases total\n", nBase) ; for (i = 0 ; i < 3 ; ++i) printf ("%s\n", topLines[i]) ; if (HIST) { printf ("\n") ; for (i = 0 ; i < 256 ; ++i) printf ("%s\n", histLines[i]) ; } } /***************** main function ***************/ int glfStats (int argc, char *argv[]) { int i, j, nameLen, chrLen ; glf3_header_t *h; char *name; glfFile fpIn; glf3_t *g3 ; static unsigned int depthHist[256] ; static unsigned int maxQHist[256] ; static unsigned int callQHist[256] ; static unsigned int lkHist[256] ; static unsigned int hetHist[256] ; static unsigned int homHist[256] ; --argc ; ++argv ; /* remove program command */ if (argc >= 1) if (!strcmp(argv[0], "-h")) { fprintf (stderr, "Usage: glf stats [-ignoreN] [-max ] < x.glz\n") ; return 1 ; } if (!(fpIn = bgzf_fdopen(fileno(stdin), "r"))) // argv[0] { fprintf (stderr, "failed to open in .glz file %s\n", argv[0]) ; return 1 ; } for (i = 0 ; i < 256 ; ++i) { depthHist[i] = 0 ; maxQHist[i] = 0 ; callQHist[i] = 0 ; } /* initialise glf3 objects */ h = glf3_header_read(fpIn); g3 = glf3_init1(); while ((name = glf3_ref_read(fpIn, &chrLen)) != 0) { int pos = 0; fprintf (stderr, "chr %s length %d\n", name, chrLen) ; while (glf3_read1(fpIn, g3) && g3->rtype != GLF3_RTYPE_END) { pos += g3->offset; ++nBase ; ++depthHist[g3->depth] ; ++maxQHist[g3->rms_mapQ] ; if (g3->lk[baseGlf[g3->ref_base]]) ++callQHist[g3->rms_mapQ] ; } free(name); } if (nBase) { report ("depth", depthHist) ; report ("maxQ", maxQHist) ; report ("callQ", callQHist) ; } printReport () ; /* clean up */ glf3_header_destroy(h); glf3_destroy1(g3); bgzf_close(fpIn); return 0 ; } /***************** end of file ****************/