#define _GNU_SOURCE #include #include #include #include #include #include "glf.h" #include "array.h" #include "dict.h" #include "tools.h" void usage(void) { fprintf(stderr, "Usage: hapmap2bin [OPTIONS] ...\n" "Options:\n" " -f, --file-list \n" " -i, --iupac \n" " -m, --matching-positions \n" " -v, --verbose\n" "\n" ); exit(-1); } static int posOrder (void* x, void* y) { return *(int*)x - *(int*)y ; } // In the end, we want to print the dictionary merged for all files with positions for each chromosome // in ascending order. The countArray contains the number of positions for each chromosome and // posArrays contain for each chromosome a sorted list of positions. // void dict_add_file(char *fname, DICT *ChromDict, Array posArrays, Array countArray) { // If you need to change the dimensions here, adjust also the scanf() call. char chromName[256]; int chrom, pos; FILE *fd; char * line = NULL; size_t len = 0; ssize_t read; debug("adding data from %s .. ", fname); fd = fopen(fname,"r"); if ( !fd ) error("%s: %s\n", fname, strerror(errno)); // Print some numbers for the user to check sanity int nchrom_added = 0; int npos_added = 0; // Use getline instead of fscanf, as the latter cannot handle variable field widths. // It could write outside the allocated memory or would spend much time with malloc (with the 'a' parameter) while ((read = getline(&line, &len, fd)) != -1) { if ( sscanf(line, "%255s %d\n", chromName, &pos) != 2) error("%s: Could not parse the line %s", fname, line); // debug("\n%s\n", line); // check chromName against the dictionary if (dictAdd(ChromDict, chromName, &chrom)) { // debug("chromName %s added at pos %d\n", chromName, chrom); // This is a new chromosome - create new array where to put positions for it. array(posArrays,chrom,Array) = arrayCreate(100000,int); array(countArray,chrom,int) = 0; nchrom_added++; } if (arrayInsertFast(array(posArrays,chrom,Array), &pos, posOrder)) { // debug("adding position %d\n", pos); npos_added++; array(countArray,chrom,int)++; // debug("Size of countarray now %d\n", array(countArray,chrom,int)); } // else // debug("position %d already exists\n", pos); } fclose(fd); if ( line ) free(line); debug("added %d chrms and %d positions %s\n", nchrom_added, npos_added, ((!nchrom_added||!npos_added) ? "(0 is OK)" : "")); } int parseLine_iupac (char *line, char *name, int *x, int *g) { char c; static int map[26], isFirst = 1 ; static regex_t preg; regmatch_t pmatch[4]; int len; if (isFirst) { if ( regcomp(&preg, "^([^[:space:]]+)[[:space:]]+([[:digit:]]+)[[:space:]]+([ACGTMRWSYKN0])[[:space:]]*$", REG_EXTENDED) ) error("FIXME: parseLine_iupac regcomp\n"); int A = 0, C = 'C'-'A', G = 'G'-'A', T='T'-'A', Y = 'Y' - 'A', S = 'S' - 'A', W = 'W' - 'A', K = 'K' - 'A', M = 'M' - 'A', R = 'R' - 'A'; map[A] = GT_AA; map[M] = GT_AC; map[R] = GT_AG; map[W] = GT_AT; map[C] = GT_CC; map[S] = GT_CG; map[Y] = GT_CT; map[G] = GT_GG; map[K] = GT_GT; map[T] = GT_TT; isFirst = 0 ; } if ( regexec(&preg,line, 4, pmatch, 0) != 0 ) error("\n[parseLine_iupac]: Expected chromosome, position and one IUPAC-coded base [ACGTMRWSYKN0]:\n\t[%s]\n", line); len = pmatch[1].rm_eo-pmatch[1].rm_so; if ( len>255 ) error("Chromosome name too long?\n\t%s\n", line); strncpy(name,line+pmatch[1].rm_so,len); name[len] = 0; sscanf(line+pmatch[2].rm_so,"%d", x); c = line[pmatch[3].rm_so]; if (c == '0' || c == 'N') *g = GT_NN; else *g = map[c-'A'] ; return 1 ; } int parseLine_noniupac (char *line, char *name, int *x, int *g) { char c1,c2; static int map[26][26], isFirst = 1 ; static regex_t preg; regmatch_t pmatch[5]; int len; if (isFirst) { if ( regcomp(&preg, "^([^[:space:]]+)[[:space:]]+([[:digit:]]+)[[:space:]]+([ACGTN0])([ACGTN0])[[:space:]]*$", REG_EXTENDED) ) error("FIXME: parseLine_noniupac regcomp\n"); int A = 0, C = 'C'-'A', G = 'G'-'A', T='T'-'A' ; map[A][A] = GT_AA; map[A][C] = map[C][A] = GT_AC; map[A][G] = map[G][A] = GT_AG; map[A][T] = map[T][A] = GT_AT; map[C][C] = GT_CC; map[C][G] = map[G][C] = GT_CG; map[C][T] = map[T][C] = GT_CT; map[G][G] = GT_GG; map[G][T] = map[T][G] = GT_GT; map[T][T] = GT_TT; isFirst = 0 ; } if ( regexec(&preg,line, 5, pmatch, 0) != 0 ) error("\n[parseLine_noniupac]: Expected chromosome, position and two bases [ACGTN0]:\n\t%s\n", line); len = pmatch[1].rm_eo-pmatch[1].rm_so; if ( len>255 ) error("Chromosome name too long?\n\t%s\n", line); strncpy(name,line+pmatch[1].rm_so,len); name[len] = 0; sscanf(line+pmatch[2].rm_so,"%d", x); c1 = line[pmatch[3].rm_so]; c2 = line[pmatch[4].rm_so]; if (c1 == '0' || c2 == '0' || c1 == 'N' || c2 == 'N') *g = GT_NN; else *g = map[c1-'A'][c2-'A'] ; return 1 ; } void process_file(char *fname,DICT *chromDict,Array chrArray,Array countArray,Array posArrays, Array gArray, int iupac) { FILE *fd; int offset=0, prev_chrom=-1, chrom, pos, gtype, iline=0, isite, nsites, ichrom; char chromName[256]; // If you need to change the dimensions here, adjust also the scanf() call in parseline_*. char * line = NULL; size_t len = 0; ssize_t written; debug("processing file %s .. ", fname); fd = fopen(fname,"r"); if ( !fd ) error("%s: %s\n", fname, strerror(errno)); nsites = arrayMax(chrArray); for (isite=0; isite= arrayMax(posArrays) ) error("FIXME: chrom too lage %d %d\n", chrom,arrayMax(posArrays)); if ( !arrayFind(arr(posArrays,chrom,Array), &pos, &isite, posOrder) ) { error( "\nError: The position %s:%d is not in the dictionary. Please run the program without the\n" "-m option. If the problem persists, you have encountered a bug.\n%s:%d\n", chromName,pos,fname,iline); } if ( isite+offset > nsites ) error("FIXME: the position is wrong %d %d %d\n", isite,offset,nsites); arr(gArray, isite+offset, int) = gtype; //debug("adding %s: %s:%d %d\n", fname,chromName,pos,gtype); } fclose(fd); if (line) free(line); fwriteString(stdout,fname); if (( written=fwrite(arrp(gArray, 0, int), sizeof(int), nsites, stdout)) != nsites ) error("process_file: Write error - wanted %d, written %d\n", nsites,written); debug("\n"); } int main (int argc, char *argv[]) { FILE *flist_fd=NULL; // If you need to change the dimensions here, adjust also the scanf() call. char fileName[256], **file_list=NULL, *flist_fname=NULL; int nfiles_cmdline=0, nfiles=0, c, k, i, nsites=0, iupac=0, matching_positions=0; Array posArrays, posArray, chrArray, countArray, gArray; DICT *ChromDict ; ssize_t written; ChromDict = dictCreate (64) ; posArrays = arrayCreate(64,Array); countArray = arrayCreate(64,int); for (i=1; i= argc ) { error("Expected the file list after -f\n"); } flist_fname = argv[i]; continue; } // There are files on the command line - save the names to the flist array if ( !file_list ) file_list = malloc(sizeof(char *)*argc); file_list[nfiles_cmdline++] = argv[i]; } if ( !flist_fname & !file_list ) usage(); // Read file names from a file (-f) if ( flist_fname ) { debug("Reading from file %s\n", flist_fname); flist_fd = fopen(flist_fname,"r"); if ( !flist_fd ) error("%s: %s\n", flist_fname,strerror(errno)); while ( fscanf(flist_fd, "%255s\n", fileName) == 1 ) { dict_add_file(fileName, ChromDict,posArrays,countArray); nfiles++; if ( matching_positions ) break; } fclose(flist_fd); } // Read file names from command line if ( file_list ) { for (i=0; i1 ) break; dict_add_file(file_list[i], ChromDict,posArrays,countArray); nfiles++; } } // Concatenate them together in the order of the dictionary debug("Concatenating...\n"); chrArray = arrayCreate(1000000, int); posArray = arrayCreate(1000000, int); nsites = 0; for (c=0; cmax; c++) { int npositions; if ( countArray->max <= c ) error("FIXME! c too large for countArray: %d %d\n", countArray->max,c); npositions = arr(countArray,c,int); for (k=0; kmax <= c ) error("FIXME! c too large: %d %d\n", posArrays->max,c); if ( arr(posArrays,c,Array)->max <= k ) error("FIXME! k too large: %d %d\n", arr(posArrays,c,Array)->max,k); array(posArray,nsites,int) = arr(arr(posArrays,c,Array),k,int); array(chrArray,nsites,int) = c; nsites++; } } // Write the magic, the dictionary and the two arrays fwriteDict (stdout, ChromDict) ; if ( (written=fwrite(&nsites, sizeof(int),1, stdout))!=1 ) error("IO error: written %d, wanted %d\n", written,1); if ( (written=fwrite(arrp(chrArray, 0, int), sizeof(int),nsites,stdout))!=nsites ) error("IO error: written %d, wanted %d\n", written,nsites); if ( (written=fwrite(arrp(posArray, 0, int), sizeof(int),nsites,stdout))!=nsites ) error("IO error: written %d, wanted %d\n", written,nsites); debug("Wrote the chromosome dictionary .. %d sites on %d chromosomes from %d files\n",nsites,ChromDict->max,nfiles); gArray = arrayCreate (1000000, int); if ( flist_fname ) { flist_fd = fopen(flist_fname,"r"); while ( fscanf(flist_fd, "%255s\n", fileName) == 1 ) { process_file(fileName, ChromDict,chrArray,countArray,posArrays,gArray,iupac); } fclose(flist_fd); } if ( file_list ) { for (i=0; imax; k++) arrayDestroy(arr(posArrays,k,Array)); arrayDestroy(posArrays); arrayDestroy(countArray); return(0); }