//version for large files - creates a buffer and appends to the h5 file #include "hdf5.h" #include "hdf5_hl.h" #include #include #include #define SAMPLES 20 #define MAXLINES 100 #define CHROMLEN 6 #define IDLEN 12 #define FILTERLEN 8 #define INFOLEN 300 #define FORMATLEN 50 #define SITES "Sites" #define NGENOFIELDS (hsize_t) 9 #define NSITEFIELDS (hsize_t) 9 #define NRECORDS (hsize_t) 1000000 //#define NRECORDS (hsize_t) 512122 typedef struct Geno { int GT_0; int GT_1; int AD_ref; int AD_alt; int DP; float GQ; int AA_like; int AB_like; int BB_like; } Geno; typedef struct Site{ char chrom[CHROMLEN]; int pos; char id[IDLEN]; char ref; char alt; float qual; char filter[FILTERLEN]; char info[INFOLEN]; char format[FORMATLEN]; } Site; int main(int argc, char *argv[]){ if (argc<3){ printf("no filename. %i args\n",argc); return -1; } Geno* dst_buf=malloc(sizeof(Geno)*NRECORDS); /* Calculate the size and the offsets of our struct members in memory */ size_t dst_size = sizeof(Geno); size_t dst_offset[NGENOFIELDS] = { HOFFSET(Geno, GT_0 ), HOFFSET(Geno, GT_1 ), HOFFSET(Geno, AD_ref ), HOFFSET(Geno, AD_alt ), HOFFSET(Geno, DP ), HOFFSET(Geno, GQ ), HOFFSET(Geno, AA_like ), HOFFSET(Geno, AB_like ), HOFFSET(Geno, BB_like ), }; size_t dst_sizes[NGENOFIELDS] = { sizeof(dst_buf[0].GT_0), sizeof(dst_buf[0].GT_1), sizeof(dst_buf[0].AD_ref), sizeof(dst_buf[0].AD_ref), sizeof(dst_buf[0].DP), sizeof(dst_buf[0].GQ), sizeof(dst_buf[0].AA_like), sizeof(dst_buf[0].AB_like), sizeof(dst_buf[0].BB_like), }; /* Define field information */ const char *geno_field_names[NGENOFIELDS] = { "GT_0", "GT_1", "AD_ref", "AD_alt", "DP", "GQ", "AA_like", "AB_like", "BB_like" }; hid_t geno_field_type[NGENOFIELDS]; hid_t file_id; hsize_t chunk_size = 10; int *fill_data = NULL; int compress = 0; herr_t status; /* Initialize field_type */ geno_field_type[0] = H5T_NATIVE_INT; geno_field_type[1] = H5T_NATIVE_INT; geno_field_type[2] = H5T_NATIVE_INT; geno_field_type[3] = H5T_NATIVE_INT; geno_field_type[4] = H5T_NATIVE_INT; geno_field_type[5] = H5T_NATIVE_FLOAT; geno_field_type[6] = H5T_NATIVE_INT; geno_field_type[7] = H5T_NATIVE_INT; geno_field_type[8] = H5T_NATIVE_INT; /* Create a new file using default properties. */ file_id = H5Fcreate(argv[1], H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT ); hid_t group_id; /* identifiers */ /* Create a group */ group_id = H5Gcreate(file_id, "/Genotypes", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); Site * site_data=malloc(sizeof(Site)*NRECORDS); /* Calculate the size and the offsets of our struct members in memory */ size_t site_size = sizeof(Site); /* Initialize field_type */ hid_t string_chrom = H5Tcopy( H5T_C_S1 ); H5Tset_size( string_chrom, CHROMLEN); hid_t string_id = H5Tcopy( H5T_C_S1 ); H5Tset_size( string_id, IDLEN); hid_t string_filter = H5Tcopy( H5T_C_S1 ); H5Tset_size( string_filter, FILTERLEN); hid_t string_info = H5Tcopy( H5T_C_S1 ); H5Tset_size( string_info, INFOLEN); hid_t string_format = H5Tcopy( H5T_C_S1 ); H5Tset_size( string_format, FORMATLEN); const char *site_field_names[NSITEFIELDS] = {"chrom","pos","id","ref","alt","qual","filter","info","format" }; hid_t site_field_type[NSITEFIELDS]; site_field_type[0] = string_chrom; site_field_type[1] = H5T_NATIVE_INT; site_field_type[2] = string_id; site_field_type[3] = H5T_NATIVE_CHAR; site_field_type[4] = H5T_NATIVE_CHAR; site_field_type[5] = H5T_NATIVE_FLOAT; site_field_type[6] = string_filter; site_field_type[7] = string_info; site_field_type[8] = string_format; size_t site_offset[NSITEFIELDS] = { HOFFSET(Site, chrom ), HOFFSET(Site, pos ), HOFFSET(Site, id ), HOFFSET(Site, ref ), HOFFSET(Site, alt), HOFFSET(Site, qual ), HOFFSET(Site, filter ), HOFFSET(Site, info ), HOFFSET(Site, format ), }; Site site; size_t site_sizes[NSITEFIELDS] = { sizeof(site.chrom ), sizeof(site.pos ), sizeof(site.id), sizeof(site.ref ), sizeof(site.alt), sizeof(site.qual ), sizeof(site.filter ), sizeof(site.info ), sizeof(site.format ), }; //array of array of genotypes, one per sample Geno * genos[SAMPLES]; int p; for (p = 0; p < SAMPLES; p++){ Geno *p_data=malloc(NRECORDS*sizeof(Geno)); genos[p] = p_data; } fflush(stdout); char line[10000]; int j = 0; char * header[100]; char headerline[10000]; int numscanned; //number of site columns FILE *fp; if((fp=fopen(argv[2], "r"))==NULL) { printf("Cannot open file.\n"); exit(1); } printf ("input file opened %s\n", argv[2]); fflush(stdout); status=H5TBmake_table( "Site Data", group_id, SITES, NSITEFIELDS, 0, site_size, site_field_names, site_offset, site_field_type, chunk_size, fill_data, compress, NULL); while(fgets(line, 10000, fp) !=NULL){ /* printf ("%i\n", j); */ fflush(stdout); //check for comment line[strlen(line) - 1] = '\0'; if (line[0] == '#'){ //check if headerline if (line[1] == '#'){ continue; } else { memcpy(headerline, line, sizeof(char) * strlen(line)); char *thiscol = NULL; thiscol=strtok(headerline, "\t"); thiscol=thiscol+1; int h = 0; while (thiscol != NULL ){ header[h] = thiscol; printf("header %i: %s\n", h, header[h]); h++; thiscol = strtok(NULL, "\t"); } //make the geno tables int i; for (i = 0; i < SAMPLES; i++){ char tablename[30] = "sample"; strcat(tablename, header[i + h -SAMPLES ]); printf ("table name %s\n", tablename); fflush(stdout); status=H5TBmake_table( tablename, group_id, tablename, NGENOFIELDS, 0, dst_size, geno_field_names, dst_offset, geno_field_type, chunk_size, fill_data, compress, NULL); } } continue; } //get columns describing site Site site; numscanned = sscanf(line, "%s %i %s %c %c %f %s %s %s" , site.chrom, &site.pos, site.id, &site.ref, &site.alt, &site.qual, site.filter, site.info, site.format); site_data[j%NRECORDS]=site; /* printf ("chrom is %s and int is %i \n", site.chrom, site.pos); */ /* printf ("quality is %f and filter is %s \n", site.qual, site.filter); */ /* printf ("format is %s \n", site.format); */ char * thisgenotype; thisgenotype=strtok(line, "\t"); int i; //move past the site columns for (i = 0; i < numscanned - 1; i++){ strtok(NULL, "\t"); } //read the genotype columns for (i = 0; i < SAMPLES; i++){ thisgenotype = strtok(NULL, "\t"); Geno thisgeno; if (strncmp(thisgenotype, "./.", 3) == 0){ //printf("no genotype \n"); Geno geno = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; thisgeno = geno; } else { //printf("genotype %s \n", thisgenotype); sscanf(thisgenotype, "%i/%i:%i,%i:%i:%f:%i,%i,%i", &thisgeno.GT_0, &thisgeno.GT_1, &thisgeno.AD_ref, &thisgeno.AD_alt, &thisgeno.DP, &thisgeno.GQ, &thisgeno.AA_like, &thisgeno.AB_like, &thisgeno.BB_like ); } genos[i][j%NRECORDS]=thisgeno; } if (j > 0 && (j+1) % NRECORDS == 0){ printf("load a batch of data %ld \n", j); fflush(stdout); /* load a batch of data */ status=H5TBappend_records(group_id, SITES, NRECORDS, site_size, site_offset, site_sizes, site_data); for (i = 0; i < SAMPLES; i++){ char tablename[30] = "sample"; /* printf(" numscanned %i\n", numscanned); */ strcat(tablename, header[i + numscanned]); status=H5TBappend_records(group_id, tablename, NRECORDS, dst_size, dst_offset, dst_sizes, genos[i]); /* printf("status %i \n", status); */ /* fflush(stdout); */ } } j++; } /* XXX if not a whole number of records need to load the last incomplete batch */ //if haven't just completed loadthen just load j % NRECORDS records if (j % NRECORDS != 0){ printf("load the last batch of data %ld \n", j); fflush(stdout); /* load a batch of data */ status=H5TBappend_records(group_id, SITES, j%NRECORDS, site_size, site_offset, site_sizes, site_data); int i; for (i = 0; i < SAMPLES; i++){ char tablename[30] = "sample"; strcat(tablename, header[i + numscanned]); status=H5TBappend_records(group_id, tablename, j%NRECORDS, dst_size, dst_offset, dst_sizes, genos[i]); } } /*------------------------------------------------------------------------- * H5TBmake_table *------------------------------------------------------------------------- */ /* Close the group. */ status = H5Gclose(group_id); /* close the file */ H5Fclose( file_id ); return 0; }