/* File: glfExtract.c * Author: Richard Durbin (rd@sanger.ac.uk) * Copyright (C) Genome Research Limited, 2008 *------------------------------------------------------------------- * Description: extract chromosomes, sites or regions from a .glf file * Exported functions: glfExtract * HISTORY: * Last edited: May 4 15:27 2009 (lj4@sanger.ac.uk) * * April 30 15:24 2009 (lj4@sanger.ac.uk): updated code to handle GLF version 3 files * * Jan 26 14:24 2009 (rd): changed "-file" to "-sites" for consistency with subCall and fixed off by one error for ; until now we reported at pos+1 * Created: Mon Aug 4 21:58:50 2008 (rd) *------------------------------------------------------------------- */ #define MAXSAMPLES 10000 #include #include #include #include #include "glf.h" #include "bgzf.h" int glfImport (int argc, char *argv[]) { FILE *fpIn; glfFile fpOut; glfFile fpOuts[MAXSAMPLES]; glf3_header_t *h; glf3_t *g3; int i; int qcall = 0; --argc ; ++argv ; /* remove program command */ while (argc > 1) if (argc > 1 && !strcmp (*argv, "-Q")) { --argc ; ++argv ; qcall = 1; } else if (!strcmp (*argv, "-help") || !strcmp (*argv, "-h")) { fprintf (stderr, "usage: glfImport [-Q] < in.txt > out.glf\n") ; fprintf (stderr, "converts a glf dump to a glf file. NOTE: the -Q argument signifies that the input is in multisample QCall format. In this case, GLF files {samplename}.glf will be created for each sample.\n") ; exit (1) ; } else { fprintf (stderr, "unknown argument %s : type \"glfExtract -help\" for help\n", argv[1]) ; exit (-1) ; } if (!(fpIn = fopen(*argv, "r"))) { fprintf (stderr, "failed to open file %s\n",*argv) ; exit (-1) ; } if (!qcall) if (!(fpOut = bgzf_fdopen(fileno(stdout), "w"))) { fprintf (stderr, "failed to open stdout as .glz file\n") ; exit (-1) ; } //if (qcall){ // fprintf (stderr, "Will get -Q working in a bit\n"); // exit (-1); //} //printf("one\n"); int AA,AC,AG,AT,CC,CG,CT,GG,GT,TT; int depth, mapq, minlk; long int pos; char ref; char chr[256]; int indel_AA,indel_AB,indel_BB; int indel_len1, indel_len2; char indel_seq1[256]; char indel_seq2[256]; char past_chr[256] = "XXXXXXNOTACHROMOSOMEUNLESSYOUAREMADXXXXX"; long int chr_count = 0; char * chr_names[100]; long int chr_counts[100]; int Nchr = 0; char sampleid[256]; char filename[256]; int firstpass = 2; int Nsamples = 0; long int firstpos; //go through once, finding chromosome lengths while (fscanf(fpIn,"%s %ld %c %d %d %d",chr,&pos,&ref,&depth, &mapq, &minlk) == 6) { //printf("%ld, %c\n",pos,ref); if (firstpass == 2){ firstpos = pos; firstpass = 1; } //printf("one!\n"); if (firstpass == 1 && pos != firstpos){ //printf("Done!\n"); firstpass = 0; } //printf("two!\n"); if (strcmp(past_chr,chr)){ //printf("what? %s\n",chr); //printf("%s %ld\n",past_chr,chr_count); chr_names[Nchr] = (char *) malloc(100); strcpy(chr_names[Nchr],past_chr); chr_counts[Nchr] = chr_count; Nchr++; chr_count = 1; strcpy(past_chr,chr); //printf("whatnow? %s\n",past_chr); } else { chr_count++; } //printf("three!\n"); if (ref == '*'){ //printf("four.1!\n"); //printf("A\n"); //fscanf(fpIn,"%d %d %d %d %d %s %s",&indel_AA,&indel_AB,&indel_BB,&indel_len1,&indel_len2,indel_seq1,indel_seq2); fscanf(fpIn,"%*d %*d %*d %*d %*d %*s %*s"); } else { //printf("four.2!\n"); //printf("B\n"); //fscanf(fpIn,"%d %d %d %d %d %d %d %d %d %d",&AA,&AC,&AG,&AT,&CC,&CG,&CT,&GG,>,&TT); fscanf(fpIn,"%*d %*d %*d %*d %*d %*d %*d %*d %*d %*d"); } //printf("five!\n"); if (qcall) { //printf("five.1\n"); fscanf(fpIn,"%s",sampleid); //printf("five.2\n"); } if (firstpass && qcall){ sprintf(filename,"%s.glf",sampleid); fpOuts[Nsamples] = bgzf_open(filename, "w"); Nsamples++; } //printf("six!\n"); } chr_names[Nchr] = (char *) malloc(100); strcpy(chr_names[Nchr],past_chr); chr_counts[Nchr] = chr_count; Nchr++; if (qcall) for (i = 0; i < Nchr; i++) chr_counts[i] = chr_counts[i]/Nsamples; //for (i = 0; i < Nchr; i++) fprintf(stderr,"%s:%ld ",chr_names[i],chr_counts[i]); //fprintf(stderr,"Nsamples: %d\n",Nsamples); //fprintf(stderr,"One\n"); /* initialise glf3 objects */ h = glf3_header_init(); if (!qcall){ glf3_header_write(fpOut,h); } else { for (i = 0; i < Nsamples; i++) glf3_header_write(fpOuts[i],h); } g3 = glf3_init1(); //fprintf(stderr,"Two\n"); int curr_chr = 1; if (!qcall){ glf3_ref_write(fpOut,chr_names[curr_chr],chr_counts[curr_chr]); } else { for (i = 0; i < Nsamples; i++){glf3_ref_write(fpOuts[i],chr_names[curr_chr],chr_counts[curr_chr]);} } //fprintf(stderr,"\n%s\n",chr_names[curr_chr]); curr_chr++; long int old_pos = 1, last_pos = 1, Nsam; int refnum; fclose(fpIn); if (!(fpIn = fopen(*argv, "r"))) { fprintf (stderr, "failed to open stdin\n") ; exit (-1) ; } while (fscanf(fpIn,"%s %ld %c %d %d %d",chr,&pos,&ref,&depth, &mapq, &minlk) == 6) { //fprintf(stderr,"%s %s %ld %c %d %d %d\n",past_chr,chr,pos,ref,depth, mapq, minlk); if (strcmp(chr_names[curr_chr - 1],chr)){ g3->rtype = GLF3_RTYPE_END; if (!qcall){ glf3_write1(fpOut,g3); glf3_ref_write(fpOut,chr_names[curr_chr],chr_counts[curr_chr]); } else { for (i = 0; i < Nsamples; i++){glf3_write1(fpOuts[i],g3); glf3_ref_write(fpOuts[i],chr_names[curr_chr],chr_counts[curr_chr]);} curr_chr++; old_pos = 1; } } if (pos != last_pos){ old_pos = last_pos; Nsam = 0; } for (i = 0; i <= 16; i++){ if (iupac[i] == ref) refnum = i; } g3->depth = depth; g3->rms_mapQ = mapq; g3->min_lk = minlk; g3->ref_base = refnum; g3->offset = pos - old_pos; if (ref == '*'){ //printf("A\n"); fscanf(fpIn,"%d %d %d %d %d %s %s",&indel_AA,&indel_AB,&indel_BB,&indel_len1,&indel_len2,indel_seq1,indel_seq2); g3->rtype = GLF3_RTYPE_INDEL; g3->lk[0] = indel_AA; g3->lk[1] = indel_AB; g3->lk[2] = indel_BB; g3->indel_len[0] = indel_len1; g3->indel_len[1] = indel_len2; strcpy(g3->indel_seq[0],indel_seq1); strcpy(g3->indel_seq[1],indel_seq2); } else { //printf("B\n"); fscanf(fpIn,"%d %d %d %d %d %d %d %d %d %d",&AA,&AC,&AG,&AT,&CC,&CG,&CT,&GG,>,&TT); g3->rtype = GLF3_RTYPE_SUB; g3->lk[0] = AA; g3->lk[1] = AC; g3->lk[2] = AG; g3->lk[3] = AT; g3->lk[4] = CC; g3->lk[5] = CG; g3->lk[6] = CT; g3->lk[7] = GG; g3->lk[8] = GT; g3->lk[9] = TT; } if (!qcall){ glf3_write1(fpOut,g3); } else { fscanf(fpIn,"%s",sampleid); glf3_write1(fpOuts[Nsam],g3); Nsam++; } last_pos = pos; } //fprintf(stderr,"end\n"); g3->rtype = GLF3_RTYPE_END; if (!qcall){ glf3_write1(fpOut,g3); } else { for (i = 0; i < Nsamples; i++) glf3_write1(fpOuts[i],g3); } finishUp: glf3_header_destroy(h); glf3_destroy1(g3); fclose(fpIn); if (!qcall){ bgzf_close(fpOut); } else { for (i = 0; i < Nsamples; i++) bgzf_close(fpOuts[i]); } return 0 ; } /*********************************************************/ /***************** end of file ****************/