/* File: glfExtract.c * Author: Richard Durbin (rd@sanger.ac.uk) * Copyright (C) Genome Research Limited, 2008 *------------------------------------------------------------------- * Description: extract chromosomes, sites or regions from a .glf file * Exported functions: glfExtract * HISTORY: * Last edited: May 4 15:27 2009 (lj4@sanger.ac.uk) * * April 30 15:24 2009 (lj4@sanger.ac.uk): updated code to handle GLF version 3 files * * Jan 26 14:24 2009 (rd): changed "-file" to "-sites" for consistency with subCall and fixed off by one error for ; until now we reported at pos+1 * Created: Mon Aug 4 21:58:50 2008 (rd) *------------------------------------------------------------------- */ #include #include #include #include #include "glf.h" #include "bgzf.h" #include "dict.h" #include "array.h" static char* nameExtract = 0 ; static int start = 0 ; static int end = 1000000000 ; static DICT *dict = 0 ; Array seqArray ; int nChrom = 0; static void parseFile (FILE *fil) ; int glfExtract (int argc, char *argv[]) { glfFile fpIn, fpOut; glf3_header_t *h; char *name; glf3_t *g3; int chrLen; char* fileName = 0 ; FILE *fil ; --argc ; ++argv ; /* remove program command */ while (argc) if (argc > 1 && !strcmp (*argv, "-sites")) { --argc ; ++argv ; fileName = *argv ; --argc ; ++ argv ; } else if (argc > 1 && !strcmp (*argv, "-name")) { --argc ; ++argv ; nameExtract = *argv ; --argc ; ++ argv ; } else if (argc > 1 && !strcmp (*argv, "-start")) { --argc ; ++argv ; start = atoi (*argv) - 1 ; --argc ; ++ argv ; } else if (argc > 1 && !strcmp (*argv, "-end")) { --argc ; ++argv ; end = atoi (*argv) ; --argc ; ++ argv ; } else if (!strcmp (*argv, "-help") || !strcmp (*argv, "-h")) { fprintf (stderr, "usage: glfExtract {-sites SITE_FILE | -name CHR_NAME} [-start x] [-end y] < in.glz > out.glz\n") ; fprintf (stderr, "extracts (part of) a single chromosome from a .glf file.\n") ; exit (1) ; } else { fprintf (stderr, "unknown argument %s : type \"glfExtract -help\" for help\n", *argv) ; exit (-1) ; } if (fileName) if (fil = fopen (fileName, "r")) parseFile (fil) ; else { fprintf (stderr, "failed to open site file %s\n", fileName) ; exit (-1) ; } else if (!nameExtract) { fprintf (stderr, "you must give a site file or the name of a chromosome/contig to extract from\n") ; exit (-1) ; } if (!(fpIn = bgzf_fdopen(fileno(stdin), "r"))) { fprintf (stderr, "failed to open stdin as .glz file\n") ; exit (-1) ; } if (!(fpOut = bgzf_fdopen(fileno(stdout), "w"))) { fprintf (stderr, "failed to open stdout as .glz file\n") ; exit (-1) ; } //printf("one\n"); /* initialise glf3 objects */ h = glf3_header_read(fpIn); glf3_header_write(fpOut,h); g3 = glf3_init1(); int readChrom = 0; while ((name = glf3_ref_read(fpIn, &chrLen)) != 0) { //printf("two1 - %s\n",name); int isExtract ; int len, n, maxArray, test; Array posArray ; if (nameExtract && !strcmp (name, nameExtract) && (len = (end > chrLen) ? (chrLen - start) : (end - start)) || fileName && dictFind (dict, name, &n) && (len = arrayMax (posArray = arr(seqArray, n, Array)))) { glf3_ref_write(fpOut,name,len); isExtract = 1 ; n = 0 ; readChrom++; } else isExtract = 0 ; //printf("reading %d of %d\n",readChrom,nChrom); /* if we have read all the specified chroms, exit */ if (fileName && readChrom > nChrom || nameExtract && readChrom > 1) { free(name); break; } //printf("two2 - %s, status=%d\n",name,isExtract); int pos = 0, lastpos = 0; //test = posArray; //maxArray = posArray->max; //printf("two3, posArray=%d, maxAray=%d - %s\n",posArray,maxArray, name); if (isExtract == 0){ //printf("clearing..."); while (glf3_read1(fpIn, g3) && g3->rtype != GLF3_RTYPE_END); //printf("done\n"); continue; } if (fileName) maxArray = posArray->max; //printf("two3, posArray=%d, maxAray=%d - %s\n",posArray,maxArray, name); while (glf3_read1(fpIn, g3) && g3->rtype != GLF3_RTYPE_END) { if (g3->rtype == GLF3_RTYPE_INDEL) continue; pos += g3->offset; //printf("three - pos=%d, nChrom=%d, readChrom=%d, chromname=%s, maxpos=%d\n",pos,nChrom,readChrom,name,arr(posArray,maxArray - 1,int)); if (fileName && g3->offset > 1){ while(array(posArray,n,int) < pos && n < maxArray){ ++n; //printf("%d < %d\n",n,arrayMax(posArray)); } } if (fileName && nChrom == readChrom && pos > arr(posArray,maxArray - 1,int)) { g3->rtype = GLF3_RTYPE_END; glf3_write1(fpOut,g3); goto finishUp;} if (isExtract){ if (nameExtract && pos >= start && pos < end || fileName && (pos+1 == arr(posArray, n, int)) && ++n){ g3->offset = pos - lastpos; glf3_write1 (fpOut, g3) ; //fprintf(stderr,"writing pos=%d\n",pos); //glf3_view1(name,g3,pos); if (nChrom == readChrom && pos >= arr(posArray,maxArray - 1,int)) { fprintf(stderr,"ending, pos=%d, max=%d",pos,arr(posArray,maxArray - 1,int)); g3->rtype = GLF3_RTYPE_END; glf3_write1(fpOut,g3); goto finishUp;} lastpos = pos; } else if (nameExtract && pos >= end) { g3->rtype = GLF3_RTYPE_END; glf3_write1(fpOut,g3); goto finishUp ;} } } if (isExtract){ g3->rtype = GLF3_RTYPE_END; glf3_write1(fpOut,g3); } free(name); if (fileName && readChrom == nChrom || nameExtract && readChrom == 1) { break; } } finishUp: glf3_header_destroy(h); glf3_destroy1(g3); bgzf_close(fpIn); bgzf_close(fpOut); return 0 ; } /*********************************************************/ static int posOrder (const void* x, const void* y) { return *(int*)x - *(int*)y ; } static void parseFile (FILE *fil) { char line[4096], name[256] ; int n, p ; Array a ; dict = dictCreate (64) ; seqArray = arrayCreate (64, Array) ; while (!feof (fil) && fgets (line, 4096, fil) && (sscanf (line, "%s %d", name, &p) == 2)) { if (dictAdd (dict, name, &n)) { array(seqArray, n, Array) = arrayCreate (1024, int) ; nChrom++; }//printf("Adding %s\n",name);} a = arr(seqArray, n, Array) ; array(a, arrayMax(a), int) = p ; } for (n = 0 ; n < arrayMax (seqArray) ; ++n) arraySort (arr(seqArray, n, Array), posOrder) ; } /***************** end of file ****************/