/* * vcf_file_output.cpp * * Created on: Aug 28, 2009 * Author: Adam Auton * ($Revision: 249 $) */ #include "vcf_file.h" void vcf_file::output_as_plink(const string &output_file_prefix) { // Output as PLINK formatted PED/MAP files. if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output as PLINK."); LOG.printLOG("Writing PLINK PED file ... \n"); string ped_file = output_file_prefix + ".ped"; string map_file = output_file_prefix + ".map"; vector tmp_files(N_indv); vector tmp_filenames(N_indv); for (unsigned int ui=0; uigood()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n" "Alternatively, try the --plink-tped command.", 12); (*tmp_file) << indv[ui] << "\t" << indv[ui] << "\t" << 0 << "\t" << 0 << "\t" << 0 << "\t" << 0; tmp_files[ui] = tmp_file; tmp_filenames[ui] = filename; } vector alleles; char phase; pair genotype; string vcf_line; vcf_entry e(N_indv); ofstream *tmp_file; for (unsigned int s=0; s 2) { LOG.one_off_warning("\tPLINK: Only outputting biallelic loci."); continue; } e.get_alleles_vector(alleles); for (unsigned int ui=0; uiclose(); ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n" "Alternatively, try the --plink-tped command.", 12); getline(read_file, tmp_line); PED << tmp_line << endl; read_file.close(); remove(tmp_filenames[ui].c_str()); } PED.close(); LOG.printLOG("Writing PLINK MAP file ... "); ofstream MAP(map_file.c_str()); if (!MAP.is_open()) LOG.error("Could not open output file: " + map_file, 12); int POS; string ID, CHROM, CHROM2; map CHROM_to_PLINK; for (int i=1; i<23; i++) { ostringstream convert; convert << i; CHROM_to_PLINK["chr" + convert.str()] = convert.str(); CHROM_to_PLINK[convert.str()] = convert.str(); } CHROM_to_PLINK["chrX"] = "X"; CHROM_to_PLINK["chrY"] = "Y"; CHROM_to_PLINK["chrXY"] = "XY"; CHROM_to_PLINK["chrMT"] = "MT"; CHROM_to_PLINK["X"] = "X"; CHROM_to_PLINK["Y"] = "Y"; CHROM_to_PLINK["XY"] = "XY"; CHROM_to_PLINK["MT"] = "MT"; for (unsigned int s=0; s 2) continue; POS = e.get_POS(); ID = e.get_ID(); CHROM = e.get_CHROM(); if (CHROM_to_PLINK.find(CHROM) == CHROM_to_PLINK.end()) { LOG.one_off_warning("\nUnrecognized values used for CHROM: " + CHROM + " - Replacing with 0.\n"); CHROM_to_PLINK[CHROM] = "0"; } CHROM2 = CHROM_to_PLINK[CHROM]; if (ID == ".") MAP << CHROM2 << "\t" << CHROM << ":" << POS << "\t0\t" << POS << endl; else MAP << CHROM2 << "\t" << ID << "\t0\t" << POS << endl; } MAP.close(); LOG.printLOG("Done.\n"); } // Output as Plink Transposed file void vcf_file::output_as_plink_tped(const string &output_file_prefix) { // Output as PLINK formatted PED/MAP files. if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output as PLINK TPED."); LOG.printLOG("Writing PLINK TPED file ... "); string tped_file = output_file_prefix + ".tped"; string tfam_file = output_file_prefix + ".tfam"; ofstream TPED(tped_file.c_str()); if (!TPED.is_open()) LOG.error("Could not open output file: " + tped_file, 12); string ID, CHROM, CHROM2; map CHROM_to_PLINK; for (int i=1; i<23; i++) { ostringstream convert; convert << i; CHROM_to_PLINK["chr" + convert.str()] = convert.str(); CHROM_to_PLINK[convert.str()] = convert.str(); } CHROM_to_PLINK["chrX"] = "X"; CHROM_to_PLINK["chrY"] = "Y"; CHROM_to_PLINK["chrXY"] = "XY"; CHROM_to_PLINK["chrMT"] = "MT"; CHROM_to_PLINK["X"] = "X"; CHROM_to_PLINK["Y"] = "Y"; CHROM_to_PLINK["XY"] = "XY"; CHROM_to_PLINK["MT"] = "MT"; vector alleles; char phase; pair genotype; string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s 2) // Only output sites with at most one alternative allele { LOG.one_off_warning("\tPLINK-TPED: Only outputting biallelic loci."); continue; } CHROM = e.get_CHROM(); if (CHROM_to_PLINK.find(CHROM) == CHROM_to_PLINK.end()) { LOG.one_off_warning("\nUnrecognized values used for CHROM: " + CHROM + " - Replacing with 0.\n"); CHROM_to_PLINK[CHROM] = "0"; } CHROM2 = CHROM_to_PLINK[CHROM]; if (e.get_ID() == ".") TPED << CHROM2 << "\t" << e.get_CHROM() << ":" << e.get_POS() << "\t0\t" << e.get_POS(); else TPED << CHROM2 << "\t" << e.get_ID() << "\t0\t" << e.get_POS(); e.get_alleles_vector(alleles); for (unsigned int ui=0; ui genotype; string vcf_line; vcf_entry e(N_indv); for (unsigned int ui=0; ui tmp_files(N_indv); vector tmp_filenames(N_indv); for (unsigned int ui=0; uigood()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); (*tmp_file) << ui; tmp_files[ui] = tmp_file; tmp_filenames[ui] = filename; } FAM.close(); vector alleles; char phase; pair genotype; string vcf_line; vcf_entry e(N_indv); ofstream *tmp_file; for (unsigned int s=0; s 2) { LOG.one_off_warning("\t012: Only outputting biallelic loci."); continue; } e.get_alleles_vector(alleles); for (unsigned int ui=0; uiclose(); ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); PED << tmp_line << endl; read_file.close(); remove(tmp_filenames[ui].c_str()); } PED.close(); LOG.printLOG("Writing 012 positions file ... "); ofstream MAP(map_file.c_str()); if (!MAP.is_open()) LOG.error("Could not open output file: " + map_file, 12); for (unsigned int s=0; s alleles; string vcf_line; vcf_entry e(N_indv); for (s=0; s 2) { LOG.one_off_warning("\tIMPUTE: Only outputting biallelic loci."); continue; } // Exclude entries with missing data and/or unphased bool missing = false; for (ui=0; ui alleles; sites << n_indv*2 << "\t" << n_sites << "\t1" << endl; // Note - this is incorrect for the X-chr. vector tmp_files(2*N_indv); vector tmp_filenames(2*N_indv); for (unsigned int ui=0; uigood()) LOG.error("Could not open temp file #" + output_log::int2str(ui) + ".\n", 12); tmp_files[2*ui] = tmp_file; tmp_filenames[2*ui] = filename; string filename2(tmpnam(NULL)); ofstream *tmp_file2 = new ofstream(filename2.c_str()); if (!tmp_file2->good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); tmp_files[2*ui+1] = tmp_file2; tmp_filenames[2*ui+1] = filename2; } string vcf_line; vcf_entry e(N_indv); ofstream *tmp_file; for (unsigned int s=0; sclose(); ifstream read_file(tmp_filenames[2*ui+k].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); sites << ">" << indv[ui] << "-" << k << endl; sites << tmp_line << endl; read_file.close(); remove(tmp_filenames[2*ui+k].c_str()); } } sites.close(); } void vcf_file::output_as_LDhat_unphased(const string &output_file_prefix) { if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output LDhat format."); LOG.printLOG("Outputting in unphased LDhat format\n"); unsigned int n_sites; output_LDhat_locs_file(output_file_prefix, n_sites); string sites_file = output_file_prefix + ".ldhat.sites"; ofstream sites(sites_file.c_str()); if (!sites.is_open()) LOG.error("Could not open LDhat sites Output File: " + sites_file, 2); unsigned int n_indv = N_kept_individuals(); pair alleles; sites << n_indv << "\t" << n_sites << "\t2" << endl; vector tmp_files(N_indv); vector tmp_filenames(N_indv); for (unsigned int ui=0; uigood()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); tmp_files[ui] = tmp_file; tmp_filenames[ui] = filename; } string vcf_line; vcf_entry e(N_indv); ofstream *tmp_file; for (unsigned int s=0; sclose(); ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); sites << ">" << indv[ui] << endl; sites << tmp_line << endl; read_file.close(); remove(tmp_filenames[ui].c_str()); } sites.close(); } // Output INFO fields in tab-delimited format void vcf_file::output_INFO_for_each_site(const string &output_file_prefix, const vector &INFO_to_extract) { if (INFO_to_extract.size() == 0) return; LOG.printLOG("Outputting INFO for each site\n"); string output = output_file_prefix + ".INFO"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open INFO Output File: " + output, 3); out << "CHROM\tPOS\tREF\tALT"; for (unsigned int ui=0; ui