#include //char *strdup(const char *s); #include #include #include #include #include #include #include #include #include #include "dictionary.h" typedef struct { int bp; char ref, alt; int type; } VARIANT; typedef struct { int nchr; DICTIONARY *chrom; DICTIONARY *mutation; int *clen, *cmax; VARIANT **vl; } VARIANT_CLASS; typedef struct { int start, end; int gene_id; int variants; int *tabulated_variant; double reads, mapping_qual, base_qual; char *gene; } INTERVAL; typedef struct { char *chr; int N, Ngenes, Nmax; INTERVAL *interval; INTERVAL *aggregate; DICTIONARY *dict; } EXONS; typedef struct { int nchr; DICTIONARY *chroms; EXONS **chromosome; } EXON_GENOME; INTERVAL* FindExonOverlap(int bp, EXONS *ex); int interval_cmp(const void *a, const void *b); int start_cmp(const void *a, const void *b); VARIANT_CLASS* ReadVariants(char *filename, EXON_GENOME *eg); VARIANT* FindVariant(int chr, int bp, int alt, VARIANT_CLASS *v); int v_cmp(const void *a, const void *b); int ExonicVariants(char *filename, EXON_GENOME *eg, VARIANT_CLASS *v); void ProcessVariantFile(char *variantfile, EXON_GENOME *eg, VARIANT_CLASS *v, char *outputfile, int aggregate); void ProcessAndClassifyVariantFile(char *variantfile, EXON_GENOME *eg, VARIANT_CLASS *v, char *outputfile); EXON_GENOME* ReadExons(char *filename); int v_cmp(const void *a, const void *b) { const VARIANT *A = (const VARIANT*)a; const VARIANT *B = (const VARIANT*)b; int n = A->bp - B->bp; if (n == 0) n = A->alt - B->alt; return (n); } VARIANT* FindVariant(int chr, int bp, int alt, VARIANT_CLASS *v) { if (v->clen[chr] > 0) { VARIANT vv; vv.bp = bp; vv.alt = alt; VARIANT *vt = (VARIANT*)bsearch(&vv, v->vl[chr], v->clen[chr], sizeof(VARIANT), v_cmp); return (vt); } return (NULL); } int WriteBinaryVariants(char *filename, VARIANT_CLASS *v) { FILE *fp = fopen(filename, "w"); fprintf(stderr, "fopen %s\n", filename); if (fp) { int c; printf("writing binary variants to %s nchr %d\n", filename, v->nchr); fprintf(fp, "VARIANT_CLASS nchrom %d\n", v->nchr); WriteBinaryDictionary(fp, v->chrom); WriteBinaryDictionary(fp, v->mutation); fwrite(v->clen, sizeof(int), v->nchr, fp); for (c = 0; c < v->nchr; c++) { fwrite(v->vl[c], sizeof(VARIANT), v->clen[c], fp); printf("wrote %d %d\n", c, v->clen[c]); } fclose(fp); fprintf(stderr, "fclose %s\n", filename); return (1); } return (0); } VARIANT_CLASS* ReadBinaryVariants(char *filename) { FILE *fp = fopen(filename, "r"); fprintf(stderr, "fopen %s\n", filename); if (fp) { printf("reading binary variants from %s\n", filename); int c, nchr = 0, t = 0; char buffer[256], *s; s = fgets(buffer, 256, fp); printf("buffer = %s\n", buffer); if (sscanf(buffer, "VARIANT_CLASS nchrom %d", &nchr) == 1) { VARIANT_CLASS *v = (VARIANT_CLASS*)calloc(1, sizeof(VARIANT_CLASS)); v->nchr = nchr; v->chrom = ReadBinaryDictionary(fp); v->mutation = ReadBinaryDictionary(fp); v->clen = (int*)calloc(v->nchr, sizeof(int)); v->cmax = (int*)calloc(v->nchr, sizeof(int)); t += fread(v->clen, sizeof(int), v->nchr, fp); for (c = 0; c < nchr; c++) v->cmax[c] = v->clen[c]; v->vl = (VARIANT**)calloc(v->nchr, sizeof(VARIANT*)); for (c = 0; c < v->nchr; c++) { v->vl[c] = (VARIANT*)calloc(v->clen[c], sizeof(VARIANT)); t += fread(v->vl[c], sizeof(VARIANT), v->clen[c], fp); } printf("read %d\n", t); for (c = 1; c < v->nchr; c++) { if (v->vl[c]) { printf("chr %d %s variants %d\n", c, v->chrom->words[c].word, v->clen[c]); } } fclose(fp); fprintf(stderr, "fclose %s\n", filename); return (v); } fclose(fp); fprintf(stderr, "fclose %s\n", filename); return (NULL); } return (NULL); } VARIANT_CLASS* ReadVariants(char *filename, EXON_GENOME *eg) { char *s = suffix(filename, '.'); char binaryfilename[256]; sprintf(binaryfilename, "%s.binary", filename); VARIANT_CLASS *ret = ReadBinaryVariants(binaryfilename); if (ret) { fprintf(stderr, "read cached data from binary file %s\n", binaryfilename); return ret; } printf("reading variants from %s nchr %d\n", filename, eg->chroms->nwords); /* FILE *fp = fopen( filename, "r"); */ gzFile fp = gzopen((const char*)filename, "r"); fprintf(stderr, "gzopen %s\n", filename); if (!fp) return NULL; VARIANT_CLASS *v = (VARIANT_CLASS*)calloc(1, sizeof(VARIANT_CLASS)); v->chrom = eg->chroms; v->mutation = NewDictionary(100); v->nchr = v->chrom->nwords; int buflen = 100000; char *buffer = (char*)calloc(buflen, sizeof(char)); char *dummy2 = (char*)calloc(buflen, sizeof(char)); char class[256], type[256], chr[256], dummy[256], ref, alt; int bp, bp2, c; int t, nlines = 0; v->vl = (VARIANT**)calloc(v->nchr, sizeof(VARIANT*)); v->clen = (int*)calloc(v->nchr, sizeof(int)); v->cmax = (int*)calloc(v->nchr, sizeof(int)); // while( fgets( buffer, buflen, fp ) ) { while (gzgets(fp, buffer, buflen)) { if ((sscanf(buffer, "%s %s %s %s %s %d %d %c %c", dummy, class, type, dummy2, chr, &bp, &bp2, &ref, &alt) == 9) || (sscanf(buffer, "%s %s %s %s %d %d %c %c", dummy, class, type, chr, &bp, &bp2, &ref, &alt) == 8)) { t = UpdateDictionary(class, v->mutation); c = UpdateDictionary(chr, eg->chroms); if (!v->vl[c]) { v->cmax[c] = 1000000; v->vl[c] = (VARIANT*)calloc(v->cmax[c], sizeof(VARIANT)); } if (v->clen[c] >= v->cmax[c]) { v->cmax[c] *= 2; v->vl[c] = (VARIANT*)realloc(v->vl[c], v->cmax[c] * sizeof(VARIANT)); } v->vl[c][v->clen[c]].bp = bp; v->vl[c][v->clen[c]].ref = (char)ref; v->vl[c][v->clen[c]].alt = (char)alt; v->vl[c][v->clen[c]].type = t; v->clen[c]++; nlines++; } } printf("nlines %d read\n", nlines); for (c = 1; c < v->nchr; c++) { if (v->vl[c]) { printf("chr %d variants %d\n", c, v->clen[c]); } } gzclose(fp); fprintf(stderr, "gzopen %s\n", filename); // attempt to write a binary file FILE *bfp; // Do not over write: if we can read successful, then skip writing if (bfp = fopen(binaryfilename, "r")) fclose(bfp); else WriteBinaryVariants(binaryfilename, v); return (v); } EXON_GENOME* ReadExons(char *filename) { // FILE *fp = fopen(filename, "r"); gzFile fp = gzopen(filename, "r"); fprintf(stderr, "gzopen %s\n", filename); if (!fp) { fprintf(stderr, "ERROR could not open %s\n", filename); exit(1); } else { printf("reading exons from %s\n", filename); int nchr_max = 1000, i, start, end; EXON_GENOME *eg = (EXON_GENOME*)calloc(1, sizeof(EXON_GENOME)); EXONS * *exons = (EXONS**)calloc(nchr_max, sizeof(EXONS*)); char chrom[256], gene[256]; int buflen = 10000; char *buffer = (char*)calloc(buflen, sizeof(char)); eg->nchr = 0; eg->chromosome = exons; eg->chroms = NewDictionary(nchr_max); while (gzgets(fp, buffer, buflen) && sscanf(buffer, "%s %d %d %s", chrom, &start, &end, gene) == 4) { int id = UpdateDictionary(chrom, eg->chroms); EXONS *exons = NULL; INTERVAL * in, *ag; if (eg->chromosome[id] == NULL) { exons = eg->chromosome[id] = (EXONS*)calloc(1, sizeof(EXONS)); exons->chr = strdup(chrom); //eg->chroms->words[id].word; exons->Nmax = 200000; exons->interval = (INTERVAL*)calloc(exons->Nmax, sizeof(INTERVAL)); exons->aggregate = (INTERVAL*)calloc(exons->Nmax, sizeof(INTERVAL)); exons->dict = NewDictionary(exons->Nmax); eg->nchr++; } exons = eg->chromosome[id]; if (exons->N >= exons->Nmax) { exons->Nmax *= 1.5; exons->interval = (INTERVAL*)realloc(exons->interval, exons->Nmax * sizeof(INTERVAL)); exons->aggregate = (INTERVAL*)realloc(exons->aggregate, exons->Nmax * sizeof(INTERVAL)); } in = &(exons->interval[exons->N]); in->gene_id = UpdateDictionary(gene, exons->dict); in->gene = strdup(gene); in->start = start; in->end = end; exons->N++; ag = &(exons->aggregate[in->gene_id]); if (ag->gene == NULL) { ag->gene = strdup(gene); exons->Ngenes++; } if (ag->start == 0 || ag->start > start) ag->start = start; if (ag->end == 0 || ag->end < end) ag->end = end; } for (i = 0; i < eg->nchr; i++) { EXONS *exons = eg->chromosome[i]; int j, jmax = exons->N < 20 ? exons->N : 20; printf("%d\t%s\t%d exons %d genes %d\n", i, exons->chr, exons->N, exons->dict->nwords, exons->Ngenes); qsort(exons->interval, exons->N, sizeof(INTERVAL), start_cmp); // qsort( exons->aggregate, exons->Ngenes, sizeof(INTERVAL), interval_cmp); // for( j=0;jinterval[j].start, exons->interval[j].end, exons->interval[j].gene ); // } } gzclose(fp); fprintf(stderr, "gzclose %s\n", filename); return (eg); } return (NULL); } int interval_cmp(const void *a, const void *b) { const INTERVAL *A = (const INTERVAL*)a; const INTERVAL *B = (const INTERVAL*)b; if (A->end < B->start) return (-1); else if (A->start > B->end) return (1); else return (0); } int start_cmp(const void *a, const void *b) { const INTERVAL *A = (const INTERVAL*)a; const INTERVAL *B = (const INTERVAL*)b; return (A->start - B->start); } int inrange(int n) { n = n < 0 ? 0 : n; n = n > 60 ? 60 : n; return (n); } int ExonicVariants(char *filename, EXON_GENOME *eg, VARIANT_CLASS *v) { // FILE *fp = fopen(filename, "r"); gzFile fp = gzopen(filename, "r"); fprintf(stderr, "gzopen %s\n", filename); int total = 0, exonic = 0; int calledsnps_format = 0; char *extension = suffix(filename, '.'); int filename_len = strlen(filename); if (filename_len > 14) { if (!strcmp(filename + filename_len - 13, "calledsnps.gz")) calledsnps_format = 1; fprintf(stderr, "N.B. calledsnps_format = %d, filename = %s , extension = %s\n", calledsnps_format, filename, filename + filename_len - 13); } if (fp == NULL) { fprintf(stderr, "ERROR Could not open %s\n", filename); } else { char *chrom, *last, *alt, *base_qual, *map_qual, *read_pos, ref, *buffer; int cov, bp, bp2, chr_id = -1; EXONS *exons = NULL; int buflen = 1000000; // chromosome // position // reference allele // number of reads // alternative allele(s) // base quality // mapping quality // read position(s) buffer = (char*)calloc(buflen, sizeof(char)); chrom = (char*)calloc(10000, sizeof(char)); last = (char*)calloc(10000, sizeof(char)); alt = (char*)calloc(10000, sizeof(char)); base_qual = (char*)calloc(10000, sizeof(char)); map_qual = (char*)calloc(10000, sizeof(char)); read_pos = (char*)calloc(10000, sizeof(char)); last[0] = 0; // while(fgets( buffer, 100000, fp ) ) { while (gzgets(fp, buffer, buflen)) { if ((calledsnps_format && sscanf(buffer, "%s %d %d %s %s", chrom, &bp, &bp2, &ref, alt) == 5) || (!calledsnps_format && sscanf(buffer, "%s %d %c %d %s %s %s %s", chrom, &bp, &ref, &cov, alt, base_qual, map_qual, read_pos) == 8)) { if (strcmp(last, chrom)) { chr_id = LookupDictionary(chrom, eg->chroms); strcpy(last, chrom); } if (chr_id == -1) { // fprintf(stderr, "ERROR unknown chromosome %s\n", chrom ); // exit(1); } else { INTERVAL *e = FindExonOverlap(bp, eg->chromosome[chr_id]); total++; // if ( bp >= 9206 && bp <= 9990 ) printf( "here %d\n", bp); if (e != NULL) { if (calledsnps_format) { e->variants++; e->reads++; exonic++; } else { int len1 = strlen(map_qual), k; int len2 = strlen(base_qual); double mq = 0, bq = 0; e->variants++; e->reads += cov; exonic++; for (k = 0; k < len1; k++) mq += inrange(map_qual[k] - 33); for (k = 0; k < len2; k++) bq += inrange(base_qual[k] - 33); mq /= len1; bq /= len2; e->mapping_qual += mq; e->base_qual += bq; } if (v) { if (v->vl[chr_id]) { VARIANT *var = FindVariant(chr_id, bp, alt[0], v); if (var) // variant is annotated { { if (!e->tabulated_variant) e->tabulated_variant = (int*)calloc(v->mutation->nwords, sizeof(int)); e->tabulated_variant[var->type]++; } } } } } } } printf("read %d exonic / %d total variants\n", exonic, total); gzclose(fp); fprintf(stderr, "gzclose %s\n", filename); return (total); } return (0); } INTERVAL* FindExonOverlap(int bp, EXONS *ex) { INTERVAL in; in.start = in.end = bp; INTERVAL *e = bsearch(&in, ex->interval, ex->N, sizeof(INTERVAL), interval_cmp); // printf( "searching %d %u\n", bp, e); return (e); } int main(int argc, char **argv) { int c; char exonfile[256]; char variantfile[256]; char outputfile[256]; char classfile[256]; char extension[256]; char dir[256]; FILE *fp = NULL; int aggregate = 0; exonfile[0] = 0; outputfile[0] = 0; variantfile[0] = 0; classfile[0] = 0; dir[0] = 0; extension[0] = 0; strcpy(extension, ".calledsnps"); while ((c = getopt(argc, argv, "ac:d:e:s:v:o:h")) != -1) switch (c) { case 'a': aggregate = 1; break; case 'c': strcpy(classfile, optarg); // file classifying variants into synonmous etc break; case 'd': // directory of input variant files strcpy(dir, optarg); break; case 'e': // file of exon coords strcpy(exonfile, optarg); break; case 's': // input file extension strcpy(extension, optarg); break; case 'v': // name of single input file (alternative to -d) strcpy(variantfile, optarg); break; case 'o': // name of output file , or name of output directory if in directory mode strcpy(outputfile, optarg); break; case 'h': printf("variant_count \n -a [switch to agggregate results by gene, default is off]\n -c [input variant classfile, if extension is \"binary\" then assumes binary format]\n -d [directory of input files]\n -e [input exon file of coordinates]\n -s input file extension default is \"variant_output\"]\n -v [input variant file (alternative to specifying input dir)]\n -o [output file, or output directory if -d used]\n"); break; } if (exonfile[0] && (variantfile[0] || dir[0])) { EXON_GENOME *eg = ReadExons(exonfile); VARIANT_CLASS *v = ReadVariants(classfile, eg); if (variantfile[0]) ProcessVariantFile(variantfile, eg, v, outputfile, aggregate); else { glob_t globbuf; char pattern[256]; int flags = 0 | GLOB_APPEND; struct dirent **d; sprintf(pattern, "%s/*%s", dir, extension); printf("pattern %s\n", pattern); glob(pattern, 0, NULL, &globbuf); sprintf(pattern, "%s/*%s.gz", dir, extension); glob(pattern, flags, NULL, &globbuf); int n = globbuf.gl_pathc; if (n < 0) fprintf(stderr, "no files to process in %s\n", dir); else { fprintf(stderr, "%d files to process in %s\n", n, dir); int i; for (i = 0; i < n; i++) { printf("%s\n", globbuf.gl_pathv[i]); int k = strlen(globbuf.gl_pathv[i]); int n = 0; char outfile[256]; char *basename = suffix(globbuf.gl_pathv[i], '/'); basename = prefix(basename, '.'); sprintf(outfile, "%s/%s.variant_counts", outputfile, basename); ProcessAndClassifyVariantFile(globbuf.gl_pathv[i], eg, v, outfile); } } } return (0); } return (1); } void ProcessVariantFile(char *variantfile, EXON_GENOME *eg, VARIANT_CLASS *v, char *outputfile, int aggregate) { int hits = ExonicVariants(variantfile, eg, v); int i, j; FILE *fp = fopen(outputfile, "w"); fprintf(stderr, "fopen %s\n", outputfile); if (fp == NULL) { fprintf(stderr, "ERROR Could not open outputfile %s\n", outputfile); exit(1); } for (i = 0; i < eg->nchr; i++) { EXONS *exons = eg->chromosome[i]; // printf("%d\t%s\t%d exons %d genes\n", i, exons->chr, exons->N, exons->dict->nwords); INTERVAL *in = exons->interval; for (j = 0; j < exons->N; j++) { // printf( "int %d v %d gene %d %s\n", j, in[j].variants, in[j].gene_id, in[j].gene ); if (in[j].variants > 0) { INTERVAL *ag = &(exons->aggregate[in[j].gene_id]); if (aggregate) { ag->mapping_qual += in[j].mapping_qual; ag->base_qual += in[j].base_qual; ag->variants += in[j].variants; ag->reads += in[j].reads; // printf( "ag %d in %d %s\n", ag->variants, in[j].variants, ag->gene ); } else { in[j].mapping_qual /= in[j].variants; in[j].base_qual /= in[j].variants; in[j].reads /= in[j].variants; fprintf(fp, "%s %s %s %d %d %d %.2f %.2f %.2f\n", variantfile, in[j].gene, exons->chr, in[j].start, in[j].end, in[j].variants, in[j].reads, in[j].mapping_qual, in[j].base_qual); // printf( "%s %s %s %d %d %d %.2f %.2f %.2f\n", variantfile, in[j].gene, exons->chr, in[j].start, in[j].end, in[j].variants, in[j].reads, in[j].mapping_qual, in[j].base_qual); } } } if (aggregate) { int k; qsort(exons->aggregate, exons->Ngenes, sizeof(INTERVAL), start_cmp); for (k = 0; k < exons->Ngenes; k++) { INTERVAL *in = exons->aggregate; if (in[k].variants > 0) { in[k].mapping_qual /= in[k].variants; in[k].base_qual /= in[k].variants; in[k].reads /= in[k].variants; } fprintf(fp, "%s %s %s %d %d %d %.2f %.2f %.2f\n", variantfile, in[k].gene, exons->chr, in[k].start, in[k].end, in[k].variants, in[k].reads, in[k].mapping_qual, in[k].base_qual); } } } fclose(fp); fprintf(stderr, "fclose %s\n", outputfile); } void ProcessAndClassifyVariantFile(char *variantfile, EXON_GENOME *eg, VARIANT_CLASS *v, char *outputfile) { int hits = ExonicVariants(variantfile, eg, v); int i = 0, j, len = strlen(variantfile); char *suf = suffix(variantfile, '/'); char *pref = prefix(suf, '.'); int allvar = UpdateDictionary("allvariants", v->mutation); printf("pref %s v %s\n", pref, variantfile); FILE *fp = fopen(outputfile, "w"); fprintf(stderr, "fopen %s\n", outputfile); if (fp == NULL) { fprintf(stderr, "ERROR Could not open outputfile %s\n", outputfile); exit(1); } int variant_types = v->mutation->nwords; int total_genes = 0, t; for (i = 0; i < eg->nchr; i++) { EXONS *exons = eg->chromosome[i]; total_genes += exons->Ngenes; } int **count_genes = (int**)calloc(variant_types + 1, sizeof(int*)); char **gene_names = (char**)calloc(total_genes, sizeof(char*)); for (t = 0; t < variant_types + 1; t++) { count_genes[t] = (int*)calloc(total_genes, sizeof(int)); } int gene_index = 0, k; for (i = 0; i < eg->nchr; i++) { EXONS *exons = eg->chromosome[i]; INTERVAL *in = exons->interval; for (j = 0; j < exons->N; j++) { if (in[j].variants > 0) { INTERVAL *ag = &(exons->aggregate[in[j].gene_id]); if (!ag->tabulated_variant) ag->tabulated_variant = (int*)calloc(variant_types, sizeof(int)); if (in[j].tabulated_variant) { for (t = 0; t < variant_types; t++) { ag->tabulated_variant[t] += in[j].tabulated_variant[t]; in[j].tabulated_variant[t] = 0; } } ag->tabulated_variant[allvar] += in[j].variants; in[j].variants = 0; } } qsort(exons->aggregate, exons->Ngenes, sizeof(INTERVAL), start_cmp); for (k = 0; k < exons->Ngenes; k++) { INTERVAL *ag = &(exons->aggregate[k]); gene_names[gene_index] = ag->gene; for (t = 0; t < variant_types; t++) if (ag->tabulated_variant) { count_genes[t][gene_index] = ag->tabulated_variant[t]; ag->tabulated_variant[t] = 0; } gene_index++; } } fprintf(fp, "ID type"); for (k = 0; k < gene_index; k++) { fprintf(fp, " %s", gene_names[k]); } fprintf(fp, "\n"); for (t = 0; t < variant_types; t++) { fprintf(fp, "%s %s", pref, v->mutation->words[t].word); for (k = 0; k < gene_index; k++) { fprintf(fp, " %d", count_genes[v->mutation->words[t].id][k]); } fprintf(fp, "\n"); } for (t = 0; t < variant_types + 1; t++) free(count_genes[t]); free(count_genes); free(gene_names); fclose(fp); fprintf(stderr, "fclose %s\n", outputfile); }