#include #include #include #include #include #include #include #include #include #include"dictionary.h" typedef struct { int bp; char ref, alt; int type; } VARIANT; typedef struct { int nchr; DICTIONARY *chrom; DICTIONARY *mutation; int *clen, *cmax; VARIANT **vl; } VARIANT_CLASS; typedef struct { int start, end; int gene_id; int variants; int *tabulated_variant; double reads, mapping_qual, base_qual; char *gene; } INTERVAL; typedef struct { char *chr; int N, Ngenes, Nmax; INTERVAL *interval; INTERVAL *aggregate; DICTIONARY *dict; } EXONS; typedef struct { int nchr; DICTIONARY *chroms; EXONS **chromosome; } EXON_GENOME; INTERVAL *FindExonOverlap( int bp, EXONS *ex ); int interval_cmp( const void *a, const void *b ); int start_cmp( const void *a, const void *b ); VARIANT_CLASS *ReadVariants( char *filename, EXON_GENOME *eg ); VARIANT *FindVariant( int chr, int bp, int alt, VARIANT_CLASS *v ); int v_cmp( const void *a, const void *b ); int ExonicVariants( char *filename, EXON_GENOME *eg, VARIANT_CLASS *v ); void ProcessVariantFile( char *variantfile, EXON_GENOME *eg, VARIANT_CLASS *v, char *outputfile, int aggregate ); void ProcessAndClassifyVariantFile( char *variantfile, EXON_GENOME *eg, VARIANT_CLASS *v, char *outputfile); EXON_GENOME *ReadExons( char *filename); int v_cmp( const void *a, const void *b ) { const VARIANT *A = (const VARIANT*)a; const VARIANT *B = (const VARIANT*)b; int n = A->bp - B->bp; if ( n == 0 ) n = A->alt-B->alt; return(n); } VARIANT *FindVariant( int chr, int bp, int alt, VARIANT_CLASS *v ) { if ( v->clen[chr]>0 ) { VARIANT vv; vv.bp = bp; vv.alt = alt; VARIANT *vt = (VARIANT*)bsearch( &vv, v->vl[chr], v->clen[chr], sizeof(VARIANT), v_cmp); return(vt); } return(NULL); } int WriteBinaryVariants( char *filename, VARIANT_CLASS *v ) { FILE *fp = fopen(filename, "w"); if ( fp ) { int c; printf("writing binary variants to %s nchr %d\n", filename, v->nchr ); fprintf( fp, "VARIANT_CLASS nchrom %d\n", v->nchr ); WriteBinaryDictionary( fp, v->chrom ); WriteBinaryDictionary( fp, v->mutation ); fwrite( v->clen, sizeof(int), v->nchr,fp); for( c=0;cnchr;c++) { fwrite( v->vl[c], sizeof(VARIANT), v->clen[c], fp); printf( "wrote %d %d\n", c, v->clen[c]); } fclose(fp); return(1); } return(0); } VARIANT_CLASS *ReadBinaryVariants( char *filename ) { FILE *fp = fopen(filename, "r"); if ( fp ) { printf("reading binary variants from %s\n", filename ); int c, nchr=0, t=0; char buffer[256], *s; s=fgets( buffer, 256, fp); printf( "buffer = %s\n", buffer ); if ( sscanf( buffer, "VARIANT_CLASS nchrom %d", &nchr ) == 1 ) { VARIANT_CLASS *v = (VARIANT_CLASS*)calloc(1,sizeof(VARIANT_CLASS)); v->nchr = nchr; v->chrom = ReadBinaryDictionary( fp ); v->mutation = ReadBinaryDictionary( fp ); v->clen = (int*)calloc(v->nchr, sizeof(int)); v->cmax = (int*)calloc(v->nchr, sizeof(int)); t += fread( v->clen, sizeof(int), v->nchr, fp); for(c=0;ccmax[c] = v->clen[c]; v->vl = (VARIANT**)calloc(v->nchr,sizeof(VARIANT*)); for( c=0;cnchr;c++) { v->vl[c] = (VARIANT*)calloc(v->clen[c], sizeof(VARIANT)); t += fread( v->vl[c], sizeof(VARIANT), v->clen[c], fp); } printf( "read %d\n", t); for(c=1;cnchr;c++) { if ( v->vl[c] ) { printf( "chr %d %s variants %d\n", c, v->chrom->words[c].word, v->clen[c] ); } } fclose(fp); return(v); } fclose(fp); return(NULL); } return(NULL); } VARIANT_CLASS *ReadVariants( char *filename, EXON_GENOME *eg ) { char *s = suffix( filename, '.' ); if ( !strcmp(s, "binary") ) return( ReadBinaryVariants( filename )); else { printf("reading variants from %s nchr %d\n", filename,eg->chroms->nwords); /* FILE *fp = fopen( filename, "r"); */ gzFile fp = gzopen((const char*)filename, "r" ); if ( fp ) { VARIANT_CLASS *v = (VARIANT_CLASS*)calloc(1, sizeof(VARIANT_CLASS)); v->chrom = eg->chroms; v->mutation = NewDictionary( 100 ); v->nchr = v->chrom->nwords; int buflen = 100000; char *buffer = (char*)calloc(buflen, sizeof(char)); char *dummy2 = (char*)calloc(buflen, sizeof(char)); char class[256], type[256], chr[256],dummy[256], ref, alt; int bp, bp2, c; int t, nlines=0; v->vl = (VARIANT**)calloc(v->nchr, sizeof(VARIANT*)); v->clen = (int*)calloc(v->nchr, sizeof(int)); v->cmax = (int*)calloc(v->nchr, sizeof(int)); // while( fgets( buffer, buflen, fp ) ) { while( gzgets( fp, buffer, buflen )) { if ( (sscanf( buffer, "%s %s %s %s %s %d %d %c %c", dummy, class, type, dummy2, chr, &bp, &bp2, &ref, &alt ) == 9 ) || (sscanf( buffer, "%s %s %s %s %d %d %c %c", dummy, class, type, chr, &bp, &bp2, &ref, &alt ) == 8 ) ) { t = UpdateDictionary( class, v->mutation ); c = UpdateDictionary( chr, eg->chroms ); if (! v->vl[c] ) { v->cmax[c] = 1000000; v->vl[c] = (VARIANT*)calloc(v->cmax[c],sizeof(VARIANT)); } if ( v->clen[c] >= v->cmax[c] ) { v->cmax[c] *= 2; v->vl[c] = (VARIANT*)realloc(v->vl[c], v->cmax[c]*sizeof(VARIANT)); } v->vl[c][v->clen[c]].bp = bp; v->vl[c][v->clen[c]].ref = (char)ref; v->vl[c][v->clen[c]].alt = (char)alt; v->vl[c][v->clen[c]].type = t; v->clen[c]++; nlines++; } } printf("nlines %d read\n", nlines); for(c=1;cnchr;c++) { if ( v->vl[c] ) { printf( "chr %d variants %d\n", c, v->clen[c] ); } } gzclose(fp); // attempt to write a binary file char binaryfile[256]; FILE *bfp; sprintf( binaryfile, "%s.binary", filename ); if ( bfp = fopen( binaryfile, "r" ) ) fclose(bfp); else WriteBinaryVariants( binaryfile, v ); return(v); } } return(NULL); } EXON_GENOME *ReadExons( char *filename) { // FILE *fp = fopen(filename, "r"); gzFile fp = gzopen(filename, "r" ); if ( !fp ) { fprintf(stderr, "ERROR could not open %s\n", filename ); exit(1); } else { printf( "reading exons from %s\n", filename ); int nchr_max=1000, i, start, end; EXON_GENOME* eg = (EXON_GENOME*)calloc(1, sizeof(EXON_GENOME)); EXONS **exons = (EXONS**)calloc( nchr_max, sizeof(EXONS*)); char chrom[256], gene[256]; int buflen = 10000; char *buffer = (char*)calloc(buflen, sizeof(char)); eg->nchr = 0; eg->chromosome = exons; eg->chroms = NewDictionary( nchr_max ); while( gzgets( fp, buffer, buflen) && sscanf( buffer, "%s %d %d %s", chrom, &start, &end, gene ) == 4 ) { int id = UpdateDictionary( chrom, eg->chroms ); EXONS *exons=NULL; INTERVAL *in, *ag; if ( eg->chromosome[id] == NULL ) { exons = eg->chromosome[id] = (EXONS*)calloc( 1, sizeof(EXONS)); exons->chr = strdup(chrom); //eg->chroms->words[id].word; exons->Nmax = 200000; exons->interval = (INTERVAL*)calloc( exons->Nmax , sizeof(INTERVAL)); exons->aggregate = (INTERVAL*)calloc( exons->Nmax , sizeof(INTERVAL)); exons->dict = NewDictionary( exons->Nmax ); eg->nchr++; } exons = eg->chromosome[id]; if ( exons->N >= exons->Nmax ) { exons->Nmax *= 1.5; exons->interval = (INTERVAL*)realloc(exons->interval, exons->Nmax*sizeof(INTERVAL)); exons->aggregate = (INTERVAL*)realloc(exons->aggregate, exons->Nmax*sizeof(INTERVAL)); } in = &(exons->interval[exons->N]); in->gene_id = UpdateDictionary( gene, exons->dict ); in->gene = strdup(gene); in->start = start; in->end = end; exons->N++; ag = &(exons->aggregate[in->gene_id]); if ( ag->gene == NULL) { ag->gene = strdup(gene); exons->Ngenes++; } if ( ag->start == 0 || ag->start > start ) ag->start = start; if ( ag->end == 0 || ag->end < end ) ag->end = end; } for( i=0;inchr;i++) { EXONS *exons = eg->chromosome[i]; int j, jmax = exons->N <20 ? exons->N : 20; printf("%d\t%s\t%d exons %d genes %d\n", i, exons->chr, exons->N, exons->dict->nwords, exons->Ngenes); qsort( exons->interval, exons->N, sizeof(INTERVAL), start_cmp); // qsort( exons->aggregate, exons->Ngenes, sizeof(INTERVAL), interval_cmp); // for( j=0;jinterval[j].start, exons->interval[j].end, exons->interval[j].gene ); // } } gzclose(fp); return(eg); } return(NULL); } int interval_cmp( const void *a, const void *b ) { const INTERVAL *A = (const INTERVAL*)a; const INTERVAL *B = (const INTERVAL*)b; if ( A->end < B->start ) return(-1); else if ( A->start > B->end ) return(1); else return(0); } int start_cmp( const void *a, const void *b ) { const INTERVAL *A = (const INTERVAL*)a; const INTERVAL *B = (const INTERVAL*)b; return ( A->start-B->start ); } int inrange( int n ) { n = n < 0 ? 0 : n; n = n > 60 ? 60 : n; return(n); } int ExonicVariants( char *filename, EXON_GENOME *eg, VARIANT_CLASS *v ) { // FILE *fp = fopen(filename, "r"); gzFile fp = gzopen(filename, "r" ); int total = 0, exonic=0; int calledsnps_format=0; char *extension = suffix( filename, '.'); if ( !strcmp( extension, "calledsnps") ) calledsnps_format = 1; if ( fp == NULL ) { fprintf(stderr, "ERROR Could not open %s\n", filename ); } else { char *chrom, *last, *alt, *base_qual, *map_qual, *read_pos, ref, *buffer; int cov, bp, bp2, chr_id=-1; EXONS *exons=NULL; int buflen=1000000; // chromosome // position // reference allele // number of reads // alternative allele(s) // base quality // mapping quality // read position(s) buffer = (char*)calloc(buflen,sizeof(char)); chrom = (char*)calloc(10000,sizeof(char)); last = (char*)calloc(10000,sizeof(char)); alt = (char*)calloc(10000,sizeof(char)); base_qual = (char*)calloc(10000,sizeof(char)); map_qual = (char*)calloc(10000,sizeof(char)); read_pos = (char*)calloc(10000,sizeof(char)); last[0] = 0; // while(fgets( buffer, 100000, fp ) ) { while( gzgets( fp, buffer, buflen )) { if ( ( calledsnps_format && sscanf( buffer, "%s %d %d %s %s", chrom, &bp, &bp2, &ref, alt) == 5 ) || ( !calledsnps_format && sscanf( buffer, "%s %d %c %d %s %s %s %s", chrom, &bp, &ref, &cov, alt, base_qual, map_qual, read_pos ) == 8 ) ) { if ( strcmp( last, chrom ) ) { chr_id = LookupDictionary(chrom, eg->chroms ); strcpy(last, chrom); } if ( chr_id == -1 ) { // fprintf(stderr, "ERROR unknown chromosome %s\n", chrom ); // exit(1); } else { INTERVAL *e = FindExonOverlap( bp, eg->chromosome[chr_id] ); total++; // if ( bp >= 9206 && bp <= 9990 ) printf( "here %d\n", bp); if ( e != NULL ) { if ( calledsnps_format ) { e->variants++; e->reads++; exonic++; } else { int len1 = strlen(map_qual), k; int len2 = strlen(base_qual); double mq=0, bq=0; e->variants++; e->reads += cov; exonic++; for(k=0;kmapping_qual += mq; e->base_qual += bq; } if ( v ) { if (v->vl[chr_id]) { VARIANT *var = FindVariant( chr_id, bp, alt[0], v ); if ( var ) {// variant is annotated { if ( ! e->tabulated_variant ) e->tabulated_variant = (int*)calloc(v->mutation->nwords,sizeof(int)); e->tabulated_variant[var->type]++; } } } } } } } printf("read %d exonic / %d total variants\n", exonic, total); return(total); } return(0); } INTERVAL *FindExonOverlap( int bp, EXONS *ex ) { INTERVAL in; in.start = in.end = bp; INTERVAL *e = bsearch( &in, ex->interval, ex->N, sizeof(INTERVAL), interval_cmp); // printf( "searching %d %u\n", bp, e); return(e); } int main( int argc, char **argv ) { int c; char exonfile[256]; char variantfile[256]; char outputfile[256]; char classfile[256]; char extension[256]; char dir[256]; FILE *fp=NULL; int aggregate=0; exonfile[0] = 0; outputfile[0] = 0; variantfile[0] = 0; classfile[0] = 0; dir[0] = 0; extension[0] = 0; strcpy(extension, ".calledsnps"); while ((c = getopt (argc, argv, "ac:d:e:s:v:o:h")) != -1) switch (c) { case 'a': aggregate=1; break; case 'c': strcpy(classfile, optarg); // file classifying variants into synonmous etc break; case 'd': // directory of input variant files strcpy(dir, optarg); break; case 'e': // file of exon coords strcpy(exonfile, optarg); break; case 's': // input file extension strcpy(extension, optarg); break; case 'v': // name of single input file (alternative to -d) strcpy(variantfile, optarg); break; case 'o': // name of output file , or name of output directory if in directory mode strcpy(outputfile, optarg); break; case 'h': printf( "variant_count \n -a [switch to agggregate results by gene, default is off]\n -c [input variant classfile, if extension is \"binary\" then assumes binary format]\n -d [directory of input files]\n -e [input exon file of coordinates]\n -s input file extension default is \"variant_output\"]\n -v [input variant file (alternative to specifying input dir)]\n -o [output file, or output directory if -d used]\n"); break; } if ( exonfile[0] && (variantfile[0] || dir[0] ) ) { EXON_GENOME *eg = ReadExons( exonfile ); VARIANT_CLASS *v = ReadVariants( classfile, eg ); if ( variantfile[0] ) ProcessVariantFile( variantfile, eg, v, outputfile, aggregate ) ; else { glob_t globbuf; char pattern[256]; struct dirent **d; sprintf(pattern, "%s/*%s", dir, extension ); printf("pattern %s\n", pattern); glob( pattern, 0, NULL, &globbuf ); int n = globbuf.gl_pathc; if (n < 0) fprintf(stderr, "no files to process in %s\n", dir ); else { int i; for(i=0;inchr;i++) { EXONS *exons = eg->chromosome[i]; // printf("%d\t%s\t%d exons %d genes\n", i, exons->chr, exons->N, exons->dict->nwords); INTERVAL *in = exons->interval; for(j=0;jN;j++) { // printf( "int %d v %d gene %d %s\n", j, in[j].variants, in[j].gene_id, in[j].gene ); if ( in[j].variants>0 ) { INTERVAL *ag = &(exons->aggregate[in[j].gene_id]); if ( aggregate ) { ag->mapping_qual += in[j].mapping_qual; ag->base_qual += in[j].base_qual; ag->variants += in[j].variants; ag->reads += in[j].reads; // printf( "ag %d in %d %s\n", ag->variants, in[j].variants, ag->gene ); } else { in[j].mapping_qual /= in[j].variants; in[j].base_qual /= in[j].variants; in[j].reads /= in[j].variants; fprintf( fp, "%s %s %s %d %d %d %.2f %.2f %.2f\n", variantfile, in[j].gene, exons->chr, in[j].start, in[j].end, in[j].variants, in[j].reads, in[j].mapping_qual, in[j].base_qual); // printf( "%s %s %s %d %d %d %.2f %.2f %.2f\n", variantfile, in[j].gene, exons->chr, in[j].start, in[j].end, in[j].variants, in[j].reads, in[j].mapping_qual, in[j].base_qual); } } } if ( aggregate ) { int k; qsort( exons->aggregate, exons->Ngenes, sizeof(INTERVAL), start_cmp); for(k=0;kNgenes;k++) { INTERVAL *in = exons->aggregate; if ( in[k].variants > 0 ) { in[k].mapping_qual /= in[k].variants; in[k].base_qual /= in[k].variants; in[k].reads /= in[k].variants; } fprintf( fp, "%s %s %s %d %d %d %.2f %.2f %.2f\n", variantfile, in[k].gene, exons->chr, in[k].start, in[k].end, in[k].variants, in[k].reads, in[k].mapping_qual, in[k].base_qual); } } } fclose(fp); } void ProcessAndClassifyVariantFile( char *variantfile, EXON_GENOME *eg, VARIANT_CLASS *v, char *outputfile){ int hits = ExonicVariants(variantfile, eg, v ); int i=0, j, len = strlen(variantfile); char *suf = suffix(variantfile, '/'); char *pref = prefix(suf, '.'); int allvar = UpdateDictionary( "allvariants", v->mutation); printf("pref %s v %s\n", pref, variantfile); FILE *fp = fopen(outputfile, "w"); if ( fp==NULL) { fprintf(stderr, "ERROR Could not open outputfile %s\n", outputfile); exit(1); } int variant_types = v->mutation->nwords; int total_genes=0, t; for( i=0;inchr;i++) { EXONS *exons = eg->chromosome[i]; total_genes += exons->Ngenes; } int **count_genes = (int**)calloc( variant_types+1, sizeof(int*)); char **gene_names = (char**)calloc( total_genes, sizeof(char*)); for( t=0;tnchr;i++) { EXONS *exons = eg->chromosome[i]; INTERVAL *in = exons->interval; for(j=0;jN;j++) { if ( in[j].variants>0 ) { INTERVAL *ag = &(exons->aggregate[in[j].gene_id]); if ( ! ag->tabulated_variant ) ag->tabulated_variant = (int*)calloc(variant_types,sizeof(int)); if ( in[j].tabulated_variant) { for( t=0;ttabulated_variant[t] += in[j].tabulated_variant[t]; in[j].tabulated_variant[t] = 0; } } ag->tabulated_variant[allvar] += in[j].variants; in[j].variants = 0; } } qsort( exons->aggregate, exons->Ngenes, sizeof(INTERVAL), start_cmp); for(k=0;kNgenes;k++) { INTERVAL *ag = &(exons->aggregate[k]); gene_names[gene_index] = ag->gene; for( t=0;ttabulated_variant) { count_genes[t][gene_index] = ag->tabulated_variant[t]; ag->tabulated_variant[t] = 0; } gene_index++; } } fprintf(fp, "ID type"); for(k=0;kmutation->words[t].word); for(k=0;kmutation->words[t].id][k]); } fprintf(fp, "\n"); } for( t=0;t