/* * vcf_file_merge.cpp * * Created on: Oct 30, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "vcf_file.h" void vcf_file::return_site_union(vcf_file &file2, map, pair > &CHROMPOS_to_filepos_pair) { unsigned int s; int POS; string CHROM; string vcf_line; for (s=0; s(CHROM, POS)] = make_pair(s, -1); } } for (s=0; s(CHROM, POS)) != CHROMPOS_to_filepos_pair.end()) { CHROMPOS_to_filepos_pair[make_pair(CHROM, POS)].second = s; } else { CHROMPOS_to_filepos_pair[make_pair(CHROM, POS)] = make_pair(-1, s); } } } } void vcf_file::return_indv_union(vcf_file &file2, map > &combined_individuals) { for (unsigned int ui=0; ui(ui, -1); for (unsigned int ui=0; ui(-1, ui); } } void vcf_file::output_sites_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file) { LOG.printLOG("Comparing sites in VCF files...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair); string vcf_line; string CHROM; int POS; string output_file = output_file_prefix + ".diff.sites_in_files"; ofstream sites_in_files(output_file.c_str()); sites_in_files << "CHROM\tPOS\tIN_FILE\tREF\tALT1\tALT2" << endl; int s1, s2; int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it!=CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; vcf_entry e1(N_indv); vcf_entry e2(diff_vcf_file.N_indv); // Read entries from file (if available) if (s1 != -1) { get_vcf_entry(s1, vcf_line); e1.reset(vcf_line); } if (s2 != -1) { diff_vcf_file.get_vcf_entry(s2, vcf_line); e2.reset(vcf_line); } e1.parse_basic_entry(true); e2.parse_basic_entry(true); // Set the reference to the non-missing entry (if available) string REF = e1.get_REF(); string REF2 = e2.get_REF(); if ((REF == "N") || (REF == ".")) REF = REF2; if ((REF2 == "N") || (REF2 == ".")) REF2 = REF; if ((REF != REF2) && (REF2 != "N") && (REF != "N") && (REF != ".") && (REF2 != ".")) LOG.warning("Non-matching REF at " + CHROM + ":" + output_log::int2str(POS) + " " + REF + "/" + REF2 + ". Diff results may be unreliable."); sites_in_files << CHROM << "\t" << POS << "\t"; if ((s1 != -1) && (s2 != -1)) { N_common_SNPs++; sites_in_files << "B"; } else if ((s1 != -1) && (s2 == -1)) { N_SNPs_file1_only++; sites_in_files << "1"; } else if ((s1 == -1) && (s2 != -1)) { N_SNPs_file2_only++; sites_in_files << "2"; } else LOG.error("SNP in neither file!?"); sites_in_files << "\t" << REF << "\t" << e1.get_ALT() << "\t" << e2.get_ALT() << endl; } sites_in_files.close(); LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " SNPs common to both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " SNPs only in main file.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " SNPs only in second file.\n"); } void vcf_file::output_indv_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file) { LOG.printLOG("Comparing individuals in VCF files...\n"); string output_file = output_file_prefix + ".diff.indv_in_files"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open Indv Differences File: " + output_file, 3); out << "INDV\tFILES" << endl; // Build a list of individuals contained in each file map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_vcf_file, combined_individuals); unsigned int N_combined_indv = combined_individuals.size(); unsigned int N[3]={0,0,0}; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1)) { N[0]++; out << combined_individuals_it->first << "\tB" << endl; } else if (combined_individuals_it->second.first != -1) { N[1]++; out << combined_individuals_it->first << "\t1" << endl; } else if (combined_individuals_it->second.second != -1) { N[2]++; out << combined_individuals_it->first << "\t2" << endl; } else LOG.error("Unhandled case"); } out.close(); LOG.printLOG("N_combined_individuals:\t" + output_log::int2str(N_combined_indv) + "\n"); LOG.printLOG("N_individuals_common_to_both_files:\t" + output_log::int2str(N[0]) + "\n"); LOG.printLOG("N_individuals_unique_to_file1:\t" + output_log::int2str(N[1]) + "\n"); LOG.printLOG("N_individuals_unique_to_file2:\t" + output_log::int2str(N[2]) + "\n"); } void vcf_file::output_discordance_by_indv(const string &output_file_prefix, vcf_file &diff_vcf_file) { LOG.printLOG("Outputting Discordance By Individual...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair); map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_vcf_file, combined_individuals); map > indv_sums; string vcf_line, CHROM; int POS; int s1, s2, indv1, indv2; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; vcf_entry e1(N_indv); vcf_entry e2(diff_vcf_file.N_indv); // Read entries from file (if available) if (s1 != -1) { get_vcf_entry(s1, vcf_line); e1.reset(vcf_line); } if (s2 != -1) { diff_vcf_file.get_vcf_entry(s2, vcf_line); e2.reset(vcf_line); } e1.parse_basic_entry(true); e2.parse_basic_entry(true); // Set the reference to the non-missing entry (if available) string REF = e1.get_REF(); string REF2 = e2.get_REF(); if (REF == "N") REF = REF2; if (REF2 == "N") REF2 = REF; if (REF.size() != REF2.size()) { LOG.warning("REF sequences at " + CHROM + ":" + output_log::int2str(POS) + " are not comparable. Skipping site"); continue; } if ((REF != REF2) && (REF2 != "N") && (REF != "N")) LOG.warning("Non-matching REF " + CHROM + ":" + output_log::int2str(POS) + " " + REF + "/" + REF2); // Do the alternative alleles match? string ALT, ALT2; ALT = e1.get_ALT(); ALT2 = e2.get_ALT(); bool alleles_match = (ALT == ALT2) && (REF == REF2); e1.parse_full_entry(true); e1.parse_genotype_entries(true); e2.parse_full_entry(true); e2.parse_genotype_entries(true); pair genotype1, genotype2; pair geno_ids1, geno_ids2; pair missing_genotype(".","."); pair missing_id(-1,-1); for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files if (alleles_match) { // Alleles match, so can compare ids instead of strings e1.get_indv_GENOTYPE_ids(indv1, geno_ids1); e2.get_indv_GENOTYPE_ids(indv2, geno_ids2); if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id)) { indv_sums[combined_individuals_it->first].first++; if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) || ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) ) { // Match // Don't do anything } else { // Mismatch indv_sums[combined_individuals_it->first].second++; } } else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id)) { // Both missing // Don't do anything. } else if (geno_ids1 != missing_id) { // Genotype 1 is not missing, genotype 2 is. // Don't do anything. } else if (geno_ids2 != missing_id) { // Genotype 2 is not missing, genotype 1 is. // Don't do anything. } else LOG.error("Unknown condition"); } else { // Alleles don't match, so need to be more careful and compare strings e1.get_indv_GENOTYPE_strings(indv1, genotype1); e2.get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data indv_sums[combined_individuals_it->first].first++; if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Match // Don't do anything } else { // Mismatch indv_sums[combined_individuals_it->first].second++; } } else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype)) { // Both missing // Don't do anything } else if (genotype1 != missing_genotype) { // Genotype 1 is not missing, genotype 2 is. // Don't do anything } else if (genotype2 != missing_genotype) { // Genotype 2 is not missing, genotype 1 is. // Don't do anything } else LOG.error("Unknown condition"); } } } string output_file = output_file_prefix + ".diff.indv"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open Sites Differences File: " + output_file, 3); out << "INDV\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl; int N, N_discord; double discordance; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { out << combined_individuals_it->first; N = indv_sums[combined_individuals_it->first].first; N_discord = indv_sums[combined_individuals_it->first].second; discordance = N_discord / double(N); out << "\t" << N << "\t" << N_discord << "\t" << discordance << endl; } out.close(); } void vcf_file::output_discordance_by_site(const string &output_file_prefix, vcf_file &diff_vcf_file) { LOG.printLOG("Outputting Discordance By Site...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair); map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_vcf_file, combined_individuals); string CHROM, vcf_line; int POS; int s1, s2, indv1, indv2; string output_file = output_file_prefix + ".diff.sites"; ofstream diffsites(output_file.c_str()); if (!diffsites.is_open()) LOG.error("Could not open Sites Differences File: " + output_file, 3); diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALLELES\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; diffsites << CHROM << "\t" << POS; s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; vcf_entry e1(N_indv); vcf_entry e2(diff_vcf_file.N_indv); bool data_in_both = true; // Read entries from file (if available) if (s1 != -1) { get_vcf_entry(s1, vcf_line); e1.reset(vcf_line); } else data_in_both = false; if (s2 != -1) { diff_vcf_file.get_vcf_entry(s2, vcf_line); e2.reset(vcf_line); } else data_in_both = false; if (data_in_both) diffsites << "\tB"; else if ((s1 != -1) && (s2 == -1)) diffsites << "\t1"; else if ((s1 == -1) && (s2 != -1)) diffsites << "\t2"; else LOG.error("Unhandled condition"); e1.parse_basic_entry(true); e2.parse_basic_entry(true); // Set the reference to the non-missing entry (if available) string REF = e1.get_REF(); string REF2 = e2.get_REF(); if (REF == "N") REF = REF2; if (REF2 == "N") REF2 = REF; if (REF.size() != REF2.size()) { LOG.warning("REF sequences at " + CHROM + ":" + output_log::int2str(POS) + " are not comparable. Skipping site"); continue; } if ((REF != REF2) && (REF2 != "N") && (REF != "N")) LOG.warning("Non-matching REF " + CHROM + ":" + output_log::int2str(POS) + " " + REF + "/" + REF2); // Do the alternative alleles match? string ALT, ALT2; ALT = e1.get_ALT(); ALT2 = e2.get_ALT(); bool alleles_match = ((ALT == ALT2) && (REF == REF2)); diffsites << "\t" << alleles_match; e1.parse_full_entry(true); e1.parse_genotype_entries(true); e2.parse_full_entry(true); e2.parse_genotype_entries(true); pair genotype1, genotype2; pair geno_ids1, geno_ids2; pair missing_genotype(".","."); pair missing_id(-1,-1); unsigned int N_common_called=0; // Number of genotypes called in both files unsigned int N_missing_1=0, N_missing_2=0; unsigned int N_discord=0; unsigned int N_concord_non_missing=0; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files if (alleles_match) { // Alleles match, so can compare ids instead of strings e1.get_indv_GENOTYPE_ids(indv1, geno_ids1); e2.get_indv_GENOTYPE_ids(indv2, geno_ids2); if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id)) { N_common_called++; if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) || ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) ) { // Match N_concord_non_missing++; } else { // Mismatch N_discord++; } } else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id)) { // Both missing N_missing_1++; N_missing_2++; } else if (geno_ids1 != missing_id) { // Genotype 1 is not missing, genotype 2 is. N_missing_2++; } else if (geno_ids2 != missing_id) { // Genotype 2 is not missing, genotype 1 is. N_missing_1++; } else LOG.error("Unknown condition"); } else { // Alleles don't match, so need to be more careful and compare strings e1.get_indv_GENOTYPE_strings(indv1, genotype1); e2.get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data N_common_called++; if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Match N_concord_non_missing++; } else { // Mismatch N_discord++; } } else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype)) { // Both missing N_missing_1++; N_missing_2++; } else if (genotype1 != missing_genotype) { // Genotype 1 is not missing, genotype 2 is. N_missing_2++; } else if (genotype2 != missing_genotype) { // Genotype 2 is not missing, genotype 1 is. N_missing_1++; } else LOG.error("Unknown condition"); } } double discordance = N_discord / double(N_common_called); diffsites << "\t" << N_common_called << "\t" << N_discord << "\t" << discordance; diffsites << endl; } diffsites.close(); } void vcf_file::output_discordance_matrix(const string &output_file_prefix, vcf_file &diff_vcf_file) { LOG.printLOG("Outputting Discordance Matrix\n\tFor bi-allelic loci, called in both files, with matching alleles only...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair); map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_vcf_file, combined_individuals); string vcf_line; int s1, s2, indv1, indv2; vector > discordance_matrix(4, vector(4, 0)); for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; vcf_entry e1(N_indv); vcf_entry e2(diff_vcf_file.N_indv); // Read entries from file (if available) if (s1 != -1) { get_vcf_entry(s1, vcf_line); e1.reset(vcf_line); } if (s2 != -1) { diff_vcf_file.get_vcf_entry(s2, vcf_line); e2.reset(vcf_line); } e1.parse_basic_entry(true); e2.parse_basic_entry(true); if ((e1.get_N_alleles() != 2) || (e2.get_N_alleles() != 2)) continue; // Set the reference to the non-missing entry (if available) string REF = e1.get_REF(); string REF2 = e2.get_REF(); if (REF == "N") REF = REF2; if (REF2 == "N") REF2 = REF; if (REF.size() != REF2.size()) continue; if ((REF != REF2) && (REF2 != "N") && (REF != "N")) continue; // Do the alternative alleles match? string ALT, ALT2; ALT = e1.get_ALT(); ALT2 = e2.get_ALT(); bool alleles_match = (ALT == ALT2) && (REF == REF2); if (alleles_match == false) continue; e1.parse_full_entry(true); e1.parse_genotype_entries(true); e2.parse_full_entry(true); e2.parse_genotype_entries(true); pair geno_ids1, geno_ids2; int N1, N2; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files // Alleles match, so can compare ids instead of strings e1.get_indv_GENOTYPE_ids(indv1, geno_ids1); e2.get_indv_GENOTYPE_ids(indv2, geno_ids2); if (((geno_ids1.first != -1) && (geno_ids1.second == -1)) || ((geno_ids2.first != -1) && (geno_ids2.second == -1))) { // Haploid LOG.one_off_warning("***Warning: Haploid chromosomes not counted!***"); continue; } N1 = geno_ids1.first + geno_ids1.second; N2 = geno_ids2.first + geno_ids2.second; if ((N1 == -1) || (N1 < -2) || (N1 > 2)) LOG.error("Unhandled case"); if ((N2 == -1) || (N2 < -2) || (N2 > 2)) LOG.error("Unhandled case"); if (N1 == -2) N1 = 3; if (N2 == -2) N2 = 3; discordance_matrix[N1][N2]++; } } string output_file = output_file_prefix + ".diff.discordance_matrix"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open Discordance Matrix File: " + output_file, 3); out << "-\tN_0/0_file1\tN_0/1_file1\tN_1/1_file1\tN_./._file1" << endl; out << "N_0/0_file2\t" << discordance_matrix[0][0] << "\t" << discordance_matrix[1][0] << "\t" << discordance_matrix[2][0] << "\t" << discordance_matrix[3][0] << endl; out << "N_0/1_file2\t" << discordance_matrix[0][1] << "\t" << discordance_matrix[1][1] << "\t" << discordance_matrix[2][1] << "\t" << discordance_matrix[3][1] << endl; out << "N_1/1_file2\t" << discordance_matrix[0][2] << "\t" << discordance_matrix[1][2] << "\t" << discordance_matrix[2][2] << "\t" << discordance_matrix[3][2] << endl; out << "N_./._file2\t" << discordance_matrix[0][3] << "\t" << discordance_matrix[1][3] << "\t" << discordance_matrix[2][3] << "\t" << discordance_matrix[3][3] << endl; out.close(); } void vcf_file::output_switch_error(const string &output_file_prefix, vcf_file &diff_vcf_file) { LOG.printLOG("Outputting Phase Switch Errors...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair); map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_vcf_file, combined_individuals); string CHROM, vcf_line; int POS; int s1, s2, indv1, indv2; string output_file = output_file_prefix + ".diff.switch"; ofstream switcherror(output_file.c_str()); if (!switcherror.is_open()) LOG.error("Could not open Switch Error file: " + output_file, 4); switcherror << "CHROM\tPOS\tINDV" << endl; unsigned int N_combined_indv = combined_individuals.size(); vector N_phased_het_sites(N_combined_indv, 0); vector N_switch_errors(N_combined_indv, 0); pair missing_genotype(".","."); vector > prev_geno_file1(N_combined_indv, missing_genotype); vector > prev_geno_file2(N_combined_indv, missing_genotype); pair file1_hap1, file1_hap2, file2_hap1; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; vcf_entry e1(N_indv); vcf_entry e2(diff_vcf_file.N_indv); // Read entries from file (if available) if (s1 != -1) { get_vcf_entry(s1, vcf_line); e1.reset(vcf_line); } if (s2 != -1) { diff_vcf_file.get_vcf_entry(s2, vcf_line); e2.reset(vcf_line); } e1.parse_basic_entry(true); e2.parse_basic_entry(true); e1.parse_full_entry(true); e1.parse_genotype_entries(true); e2.parse_full_entry(true); e2.parse_genotype_entries(true); pair genotype1, genotype2; pair missing_genotype(".","."); unsigned int N_common_called=0; // Number of genotypes called in both files unsigned int indv_count=0; // Bug fix applied (#3354189) - July 5th 2011 for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it, indv_count++) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files e1.get_indv_GENOTYPE_strings(indv1, genotype1); e2.get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data N_common_called++; if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Have a matching genotypes in files 1 and 2 if (genotype1.first != genotype1.second) { // It's a heterozgote char phase1, phase2; phase1 = e1.get_indv_PHASE(indv1); phase2 = e2.get_indv_PHASE(indv2); if ((phase1 == '|') && (phase2 == '|')) { // Calculate Phasing error (switch error) N_phased_het_sites[indv_count]++; file1_hap1 = make_pair(prev_geno_file1[indv_count].first, genotype1.first); file1_hap2 = make_pair(prev_geno_file1[indv_count].second, genotype1.second); file2_hap1 = make_pair(prev_geno_file2[indv_count].first, genotype2.first); if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2)) { // Must be a switch error string indv_id; N_switch_errors[indv_count]++; if (indv1 != -1) indv_id = indv[indv1]; else indv_id = diff_vcf_file.indv[indv2]; switcherror << CHROM << "\t" << POS << "\t" << indv_id << endl; } prev_geno_file1[indv_count] = genotype1; prev_geno_file2[indv_count] = genotype2; } } } } } } switcherror.close(); output_file = output_file_prefix + ".diff.indv.switch"; ofstream idiscord(output_file.c_str()); if (!idiscord.is_open()) LOG.error("Could not open Individual Discordance File: " + output_file, 3); idiscord << "INDV\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl; unsigned int indv_count=0; double switch_error; string indv_id; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if (indv1 != -1) indv_id = indv[indv1]; else indv_id = diff_vcf_file.indv[indv2]; if (N_phased_het_sites[indv_count] > 0) switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count]; else switch_error = 0; idiscord << indv_id << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl; indv_count++; } idiscord.close(); } /* void vcf_file::output_concensus_statistics(const string &output_file_prefix, vcf_file &diff_vcf_file) { LOG.printLOG("Outputting Consensus Statistics... \n"); unsigned int ui; string output_file; string vcf_line; // Build a list of individuals contained in each file map > combined_individuals; map >::iterator combined_individuals_it; for (ui=0; ui(ui, -1); for (ui=0; ui(-1, ui); } unsigned int N_combined_indv = combined_individuals.size(); unsigned int N[3]={0,0,0}; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1)) N[0]++; else if (combined_individuals_it->second.first != -1) N[1]++; else N[2]++; } vector indv_N_discord(N_combined_indv, 0); vector indv_N_called_sites(N_combined_indv, 0); LOG.printLOG("N_combined_individuals:\t" + output_log::int2str(N_combined_indv) + "\n"); LOG.printLOG("N_individuals_common_to_both_files:\t" + output_log::int2str(N[0]) + "\n"); LOG.printLOG("N_individuals_unique_to_file1:\t" + output_log::int2str(N[1]) + "\n"); LOG.printLOG("N_individuals_unique_to_file2:\t" + output_log::int2str(N[2]) + "\n"); // Build a table of included entries in both files map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; string CHROM; int POS; return_site_union(diff_vcf_file, CHROMPOS_to_filepos_pair); output_file = output_file_prefix + ".diff.sites_in_files"; ofstream sites_in_files(output_file.c_str()); sites_in_files << "CHROM\tPOS\tIN_FILE" << endl; int s1, s2; int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it!=CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; sites_in_files << CHROMPOS_to_filepos_pair_it->first.first << "\t" << CHROMPOS_to_filepos_pair_it->first.second << "\t"; if ((s1 != -1) && (s2 != -1)) { N_common_SNPs++; sites_in_files << "B" << endl; } else if ((s1 != -1) && (s2 == -1)) { N_SNPs_file1_only++; sites_in_files << "1" << endl; } else if ((s1 == -1) && (s2 != -1)) { N_SNPs_file2_only++; sites_in_files << "2" << endl; } else error("SNP in neither file!?"); } sites_in_files.close(); LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " SNPs common to both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " SNPs only in main file.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " SNPs only in second file.\n"); output_file = output_file_prefix + ".diff.sites"; ofstream diffsites(output_file.c_str()); if (!diffsites.is_open()) error("Could not open Sites Differences File: " + output_file, 3); diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALT\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE\tN_FILE1_NONREF_GENOTYPES\tNON_REF_DISCORDANCE" << endl; output_file = output_file_prefix + ".diff.switch"; ofstream switcherror(output_file.c_str()); if (!switcherror.is_open()) error("Could not open Switch Error file: " + output_file, 4); switcherror << "CHROM\tPOS\tINDV" << endl; // Now try and merge the entries. unsigned int N_common_genotypes = 0; unsigned int N_common_discordant_genotypes = 0; unsigned int N_sites_with_mismatching_ALT = 0; unsigned int N_non_ref_genotypes = 0; unsigned int N_discordant_non_ref_genotypes = 0; pair genotype1, genotype2; pair geno_ids1, geno_ids2; pair missing_genotype(".","."); pair missing_HQUAL(0,0); pair homo_ref(0, 0); vector > prev_geno_file1(N_combined_indv, missing_genotype); vector > prev_geno_file2(N_combined_indv, missing_genotype); pair file1_hap1, file1_hap2, file2_hap1; vector N_phased_het_sites(N_combined_indv, 0); vector N_switch_errors(N_combined_indv, 0); vector > indv_depth_at_common_sites(N_combined_indv, make_pair(0,0)); vector > indv_count_at_common_sites(N_combined_indv, make_pair(0,0)); vector > genotype_concord_matrix(4, vector(4, 0)); for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; vcf_entry e1(N_indv); vcf_entry e2(diff_vcf_file.N_indv); bool data_in_both = true; // Read entries from file (if available) if (s1 != -1) { get_vcf_entry(s1, vcf_line); e1.reset(vcf_line); } else data_in_both = false; if (s2 != -1) { diff_vcf_file.get_vcf_entry(s2, vcf_line); e2.reset(vcf_line); } else data_in_both = false; e1.parse_basic_entry(true, true, true); e2.parse_basic_entry(true, true, true); // Set the reference to the non-missing entry (if available) string REF = e1.get_REF(); string REF2 = e2.get_REF(); if (REF == "N") REF = REF2; if (REF2 == "N") REF2 = REF; if (REF.size() != REF2.size()) { warning("REF sequences at " + CHROM + ":" + output_log::int2str(POS) + " are not comparable. Skipping site"); continue; } if ((REF != REF2) && (REF2 != "N") && (REF != "N")) warning("Non-matching REF " + CHROM + ":" + output_log::int2str(POS) + " " + REF + "/" + REF2); // Do the alternative alleles match? set ALT1, ALT2; for (ui=0; ui<(e1.get_N_alleles()-1); ui++) ALT1.insert(e1.get_ALT_allele(ui)); for (ui=0; ui<(e2.get_N_alleles()-1); ui++) ALT2.insert(e2.get_ALT_allele(ui)); bool matching_ALT=true; if ((data_in_both) && (ALT1 != ALT2) && (ALT1.size() > 0) && (ALT2.size() > 0)) { N_sites_with_mismatching_ALT++; matching_ALT = false; } if (data_in_both) { diffsites << CHROM << "\t" << POS << "\t"; diffsites << "B\t" << matching_ALT << "\t"; } else { continue; } if (s1 != -1) { e1.parse_full_entry(true); e1.parse_genotype_entries(true, true, true, true, true); } if (s2 != -1) { e2.parse_full_entry(true); e2.parse_genotype_entries(true, true, true, true, true); } // Now merge the genotypes. unsigned int indv_count=0; int indv1, indv2; unsigned int N_discordant_site_counter=0; unsigned int N_indvs_with_data=0; unsigned int site_N_non_ref_genotypes=0; unsigned int site_N_discordant_non_ref_genotypes = 0; int depth; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) && (indv2 == -1)) { // Genotype is completely missing... should never happen error("Missing genotype!?", 83); } else if ((indv1 == -1) && (indv2 != -1)) { // Data is missing from first file, so just use second file. } else if ((indv1 != -1) && (indv2 == -1)) { // Data is missing from second file, so just use first file. } else { // Data from both files, so figure out what to do bool non_ref_genotype = false; if (data_in_both) { e1.get_indv_GENOTYPE_strings(indv1, genotype1); e2.get_indv_GENOTYPE_strings(indv2, genotype2); e1.get_indv_GENOTYPE_ids(indv1, geno_ids1); e2.get_indv_GENOTYPE_ids(indv2, geno_ids2); N_common_genotypes++; N_indvs_with_data++; if (geno_ids1 != homo_ref) { // First file is not a hom ref N_non_ref_genotypes++; site_N_non_ref_genotypes++; non_ref_genotype = true; } depth = e1.get_indv_DEPTH(indv1); if (depth >= 0) { indv_depth_at_common_sites[indv_count].first += depth; indv_count_at_common_sites[indv_count].first++; } depth = e2.get_indv_DEPTH(indv2); if (depth >= 0) { indv_depth_at_common_sites[indv_count].second += depth; indv_count_at_common_sites[indv_count].second++; } } if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype)) { genotype_concord_matrix[3][3]++; } if ((genotype1 == missing_genotype) && (genotype2 != missing_genotype)) { // Missing data, Favour second file if (matching_ALT && (ALT2.size() <= 1)) { unsigned int idx2 = geno_ids2.first + geno_ids2.second; genotype_concord_matrix[3][idx2]++; } } if ((genotype2 == missing_genotype) && (genotype1 != missing_genotype)) { // Favour first file if (matching_ALT && (ALT1.size() <= 1)) { unsigned int idx1 = geno_ids1.first + geno_ids1.second; genotype_concord_matrix[idx1][3]++; } } if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { if (data_in_both) { if (matching_ALT && (ALT1.size() <= 1) && (ALT2.size() <= 1)) { unsigned int idx1 = geno_ids1.first + geno_ids1.second; unsigned int idx2 = geno_ids2.first + geno_ids2.second; genotype_concord_matrix[idx1][idx2]++; } indv_N_called_sites[indv_count]++; if (!vcf_entry::genotypes_equal(genotype1, genotype2)) { N_common_discordant_genotypes++; N_discordant_site_counter++; indv_N_discord[indv_count]++; if (non_ref_genotype) { N_discordant_non_ref_genotypes++; site_N_discordant_non_ref_genotypes++; } } else { // Have a matching genotype in files 1 and 2 if (geno_ids1.first != geno_ids1.second) { // It's a heterozgote char phase1, phase2; phase1 = e1.get_indv_PHASE(indv1); phase2 = e2.get_indv_PHASE(indv2); if ((phase1 == '|') && (phase2 == '|')) { // Calculate Phasing error (switch error) N_phased_het_sites[indv_count]++; file1_hap1 = make_pair(prev_geno_file1[indv_count].first, genotype1.first); file1_hap2 = make_pair(prev_geno_file1[indv_count].second, genotype1.second); file2_hap1 = make_pair(prev_geno_file2[indv_count].first, genotype2.first); if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2)) { // Must be a switch error string indv_id; N_switch_errors[indv_count]++; if (indv1 != -1) indv_id = indv[indv1]; else indv_id = diff_vcf_file.indv[indv2]; switcherror << CHROM << "\t" << POS << "\t" << indv_id << endl; } prev_geno_file1[indv_count] = genotype1; prev_geno_file2[indv_count] = genotype2; } } } } } } indv_count++; } double discordance = 0.0; if (N_indvs_with_data > 0) discordance = double(N_discordant_site_counter) / N_indvs_with_data; double non_ref_discordance = 0.0; if (site_N_non_ref_genotypes > 0) non_ref_discordance = double(site_N_discordant_non_ref_genotypes) / site_N_non_ref_genotypes; diffsites << N_indvs_with_data << "\t" << N_discordant_site_counter << "\t" << discordance; diffsites << "\t" << site_N_non_ref_genotypes << "\t" << non_ref_discordance; diffsites << endl; } output_file = output_file_prefix + ".diff.4x4"; ofstream four_by_four(output_file.c_str()); if (!four_by_four.is_open()) error("Could not open 3x3 File: " + output_file, 3); four_by_four << "-\tN00_file1\tN01_file1\tN11_file1\tN.._file1" << endl; four_by_four << "N00_file2\t" << genotype_concord_matrix[0][0] << "\t" << genotype_concord_matrix[1][0] << "\t" << genotype_concord_matrix[2][0] << "\t" << genotype_concord_matrix[3][0] << endl; four_by_four << "N01_file2\t" << genotype_concord_matrix[0][1] << "\t" << genotype_concord_matrix[1][1] << "\t" << genotype_concord_matrix[2][1] << "\t" << genotype_concord_matrix[3][1] << endl; four_by_four << "N11_file2\t" << genotype_concord_matrix[0][2] << "\t" << genotype_concord_matrix[1][2] << "\t" << genotype_concord_matrix[2][2] << "\t" << genotype_concord_matrix[3][2] << endl; four_by_four << "N.._file2\t" << genotype_concord_matrix[0][3] << "\t" << genotype_concord_matrix[1][3] << "\t" << genotype_concord_matrix[2][3] << "\t" << genotype_concord_matrix[3][3] << endl; four_by_four.close(); output_file = output_file_prefix + ".diff.indv.discord"; ofstream idiscord(output_file.c_str()); if (!idiscord.is_open()) error("Could not open Individual Discordance File: " + output_file, 3); idiscord << "INDV\tMEAN_DP_1\tMEAN_DP_2\tN_COMMON_CALLED\tN_DISCORD\tDISCORD\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl; unsigned int indv_count=0; double discordance, switch_error; int indv1, indv2; string indv_id; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if (indv1 != -1) indv_id = indv[indv1]; else indv_id = diff_vcf_file.indv[indv2]; if (indv_N_called_sites[indv_count] > 0) discordance = double(indv_N_discord[indv_count]) / indv_N_called_sites[indv_count]; else discordance = 0.0; idiscord << indv_id; double mean_depth1 = 0, mean_depth2=0; if (indv_count_at_common_sites[indv_count].first > 0) { mean_depth1 = double(indv_depth_at_common_sites[indv_count].first) / indv_count_at_common_sites[indv_count].first; } if (indv_count_at_common_sites[indv_count].second > 0) { mean_depth2 = double(indv_depth_at_common_sites[indv_count].second) / indv_count_at_common_sites[indv_count].second; } idiscord << "\t" << mean_depth1 << "\t" << mean_depth2; idiscord << "\t" << indv_N_called_sites[indv_count] << "\t" << indv_N_discord[indv_count] << "\t" << discordance; if (N_phased_het_sites[indv_count] > 0) switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count]; else switch_error = 0; idiscord << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl; indv_count++; } idiscord.close(); LOG.printLOG("Found " + output_log::int2str(N_sites_with_mismatching_ALT) + " sites with mismatching ALT alleles.\n"); LOG.printLOG("Found " + output_log::int2str(N_non_ref_genotypes) + " non-reference genotypes called in both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_discordant_non_ref_genotypes) + " discordant non-reference genotypes.\n"); double concordance = 1.0 - (double(N_discordant_non_ref_genotypes)) / N_non_ref_genotypes; LOG.printLOG("Concordance rate: " + dbl2str_fixed(concordance * 100,2) + "%\n"); LOG.printLOG("Found " + output_log::int2str(N_common_genotypes) + " genotypes called in both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_common_discordant_genotypes) + " discordant genotypes.\n"); concordance = 1.0 - (double(N_common_discordant_genotypes)) / N_common_genotypes; LOG.printLOG("Overall Concordance rate: " + dbl2str_fixed(concordance * 100,2) + "%\n"); diffsites.close(); switcherror.close(); LOG.printLOG("Done\n"); } */