/* * vcf_file_filters.cpp * * Created on: Aug 28, 2009 * Author: Adam Auton * ($Revision: 148 $) */ #include "vcf_file.h" void vcf_file::apply_filters(const parameters ¶ms) { LOG.printLOG("Applying Required Filters.\n"); // Apply all filters in turn. filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_file, params.indv_exclude_file); filter_sites_by_allele_type(params.keep_only_indels, params.remove_indels); filter_sites(params.snps_to_keep, params.snps_to_keep_file, params.snps_to_exclude_file); filter_sites_by_filter_status(params.site_filter_flags_to_exclude, params.site_filter_flags_to_keep, params.remove_all_filtered_sites); string chr_to_keep = ""; if (params.chrs_to_keep.size() == 1) chr_to_keep = *(params.chrs_to_keep.begin()); // Get first chromosome in list (there should only be one). filter_sites_by_position(chr_to_keep, params.start_pos, params.end_pos); filter_sites_by_positions(params.positions_file, params.exclude_positions_file); filter_sites_by_BED_file(params.BED_file, params.BED_exclude); filter_sites_by_number_of_alleles(params.min_alleles, params.max_alleles); filter_sites_by_INFO_flags(params.site_INFO_flags_to_remove, params.site_INFO_flags_to_keep); filter_sites_by_quality(params.min_quality); filter_sites_by_mean_depth(params.min_mean_depth, params.max_mean_depth); filter_sites_by_mask(params.mask_file, params.invert_mask, params.min_kept_mask_value); filter_individuals_by_mean_depth(params.min_indv_mean_depth, params.max_indv_mean_depth); if (params.phased_only == true) { filter_individuals_by_phase(); filter_sites_by_phase(); } filter_genotypes_by_quality(params.min_genotype_quality); filter_genotypes_by_depth(params.min_genotype_depth, params.max_genotype_depth); filter_genotypes_by_filter_flag(params.geno_filter_flags_to_exclude, params.remove_all_filtered_genotypes); filter_individuals_by_call_rate(params.min_indv_call_rate); filter_individuals_randomly(params.max_N_indv); filter_sites_by_frequency_and_call_rate(params.min_maf, params.max_maf, params.min_non_ref_af, params.max_non_ref_af, params.min_site_call_rate); filter_sites_by_allele_count(params.min_mac, params.max_mac, params.min_non_ref_ac, params.max_non_ref_ac, params.max_missing_call_count); filter_sites_by_HWE_pvalue(params.min_HWE_pvalue); filter_sites_by_thinning(params.min_interSNP_distance); } void vcf_file::filter_genotypes_by_quality(double min_genotype_quality) { // Filter genotypes by quality if ((min_genotype_quality <= 0) || (has_genotypes == false)) return; if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to filter genotypes by Quality."); LOG.printLOG("Filtering out Genotypes with Quality less than " + output_log::dbl2str(min_genotype_quality,0) + "\n"); string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s::max())) return; if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to filter genotypes by Depth."); LOG.printLOG("Filtering out Genotypes with Depth less than " + output_log::dbl2str(min_depth,0) + " and greater than " + output_log::dbl2str(max_depth, 0) + "\n"); string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s &filter_flags_to_remove, bool remove_all) { // Filter genotypes by Filter Flags if ((remove_all == false) && (filter_flags_to_remove.size() == 0)) return; if (remove_all == true) LOG.printLOG("Filtering out all genotypes with FILTER flag.\n"); else LOG.printLOG("Filtering out genotypes by Filter Status.\n"); if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to filter genotypes by Filter Flag."); string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s &indv_to_keep, const set &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude) { // Filter individuals by user provided lists if (keep_then_exclude) { filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename); filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename); } else { filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename); filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename); } } void vcf_file::filter_individuals_by_keep_list(const set &indv_to_keep, const string &indv_to_keep_filename) { // Filter individuals by user provided list if ((indv_to_keep_filename == "") && (indv_to_keep.size() == 0)) return; LOG.printLOG("Keeping individuals in 'keep' list\n"); set indv_to_keep_copy = indv_to_keep; if (indv_to_keep_filename != "") { ifstream infile(indv_to_keep_filename.c_str()); if (!infile.is_open()) LOG.error("Could not open Individual file:" + indv_to_keep_filename, 1); string line; string tmp_indv; stringstream ss; while (!infile.eof()) { getline(infile, line); ss.str(line); ss >> tmp_indv; indv_to_keep_copy.insert(tmp_indv); ss.clear(); } infile.close(); } for (unsigned int ui=0; ui &indv_to_exclude, const string &indv_to_exclude_filename) { // Filter individuals by user provided list if ((indv_to_exclude_filename == "") && (indv_to_exclude.size() == 0)) return; LOG.printLOG("Excluding individuals in 'exclude' list\n"); set indv_to_exclude_copy = indv_to_exclude; if (indv_to_exclude_filename != "") { ifstream infile(indv_to_exclude_filename.c_str()); if (!infile.is_open()) { LOG.error("Could not open Individual file:" + indv_to_exclude_filename, 1); } string line; string tmp_indv; stringstream ss; while (!infile.eof()) { getline(infile, line); ss.str(line); ss >> tmp_indv; indv_to_exclude_copy.insert(tmp_indv); ss.clear(); } infile.close(); } for (unsigned int ui=0; ui genotype; vector N_sites_included(N_indv, 0); vector N_missing(N_indv, 0); string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s::max())) return; if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to filter individuals by mean depth"); LOG.printLOG("Filtering individuals by mean depth\n"); unsigned int ui; vector N_sites_included(N_indv, 0); vector depth_sum(N_indv,0.0); int depth; string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s= 0) { depth_sum[ui] += depth; N_sites_included[ui]++; } } } } for (ui=0; ui max_mean_depth)) include_indv[ui] = false; } } void vcf_file::filter_individuals_by_phase() { // Filter individuals that are completely unphased. // TODO: Alter this to allow for a max/min level of unphased-ness. LOG.printLOG("Filtering Unphased Individuals\n"); if (has_genotypes == false) LOG.error("Require Genotypes in VCF file to filter by Phase."); unsigned int ui, s; vector indv_count(N_indv, 0); vector indv_count_unphased(N_indv, 0); string vcf_line; vcf_entry e(N_indv); for (s=0; s keep_index(N_kept_indv); int count = 0; for (unsigned int ui=0; ui &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude) { // Filter sites by user provided lists if (keep_then_exclude) { filter_sites_to_keep(snps_to_keep, snps_to_keep_file); filter_sites_to_exclude(snps_to_exclude_file); } else { filter_sites_to_exclude(snps_to_exclude_file); filter_sites_to_keep(snps_to_keep, snps_to_keep_file); } } void vcf_file::filter_sites_to_keep(const set &snps_to_keep, const string &snps_to_keep_file) { // Filter sites by user provided list if ((snps_to_keep.size() == 0) && (snps_to_keep_file == "")) return; set local_snps_to_keep = snps_to_keep; LOG.printLOG("Keeping sites by user-supplied list\n"); if (snps_to_keep_file != "") { ifstream in(snps_to_keep_file.c_str()); string tmp; if (!in.is_open()) { LOG.error("Could not open SNPs to Keep file" + snps_to_keep_file, 0); } while (!in.eof()) { in >> tmp; local_snps_to_keep.insert(tmp); in.ignore(numeric_limits::max(), '\n'); } in.close(); } string vcf_line; for (unsigned int s=0; s snps_to_exclude; if (snps_to_exclude_file != "") { ifstream in(snps_to_exclude_file.c_str()); string tmp; if (!in.is_open()) { LOG.error("Could not open SNPs to Exclude file" + snps_to_exclude_file, 0); } while (!in.eof()) { in >> tmp; snps_to_exclude.insert(tmp); in.ignore(numeric_limits::max(), '\n'); } in.close(); } string vcf_line; for (unsigned int s=0; s::max())) return; if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to filter sites by mean depth"); LOG.printLOG("Filtering sites by mean depth\n"); int depth; string vcf_line; for (unsigned int s=0; s= 0) { depth_sum += depth; } N_indv_included++; } } double mean_depth = depth_sum / N_indv_included; if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth)) include_entry[s] = false; } } void vcf_file::filter_sites_by_position(const string &chr, int start_pos, int end_pos) { // Filter sites by user provided position range if ((chr == "") || ((start_pos == -1) && (end_pos==numeric_limits::max()))) return; LOG.printLOG("Filtering sites by chromosome and/or position\n"); string vcf_line; string chrom; int pos1; for (unsigned int s=0; s end_pos)) include_entry[s] = false; } else include_entry[s] = false; } } void vcf_file::filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file) { // Filter sites by a user defined file containing a list of positions if ((positions_file == "") && (exclude_positions_file == "")) return; LOG.printLOG("Filtering sites by include/exclude positions files\n"); string chr; int pos1, idx; unsigned int N_chr=0; map chr_to_idx; bool keep=false, exclude=false; vector< set > keep_positions, exclude_positions; stringstream ss; string line, vcf_line; if (positions_file != "") { ifstream BED(positions_file.c_str()); if (!BED.is_open()) LOG.error("Could not open Positions file: " + positions_file); keep = true; // Skip header BED.ignore(numeric_limits::max(), '\n'); while (!BED.eof()) { getline(BED, line); if (line[0] == '#') continue; ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); keep_positions.resize(N_chr); } idx = chr_to_idx[chr]; keep_positions[idx].insert(pos1); } BED.close(); } if (exclude_positions_file != "") { ifstream BED(exclude_positions_file.c_str()); if (!BED.is_open()) LOG.error("Could not open Positions file: " + exclude_positions_file); exclude = true; // Skip header BED.ignore(numeric_limits::max(), '\n'); while (!BED.eof()) { getline(BED, line); if (line[0] == '#') continue; ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); exclude_positions.resize(N_chr); } idx = chr_to_idx[chr]; exclude_positions[idx].insert(pos1); } BED.close(); } for (unsigned int s=0; s chr_to_idx; vector< deque > > lims; BED.ignore(numeric_limits::max(), '\n'); // Ignore header unsigned int N_BED_entries=0; while (!BED.eof()) { BED >> chr >> pos1 >> pos2; BED.ignore(numeric_limits::max(), '\n'); if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); lims.resize(N_chr); } idx = chr_to_idx[chr]; lims[idx].push_back(make_pair(pos1,pos2)); N_BED_entries++; } BED.close(); LOG.printLOG("\tRead " + output_log::int2str(N_BED_entries) + " BED file entries.\n"); for (unsigned int ui=0; ui range; string vcf_line; vector min_ui(lims.size(), 0); for (unsigned int s=0; s lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) || // Start pos inside bin ((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) || // End pos inside bin ((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second))) // Variant spans bin { found=true; break; } else if (pos1 > lims[idx][ui].second) min_ui[idx] = ui+1; } if (found == false) include_entry[s] = false; } } else { // Exclude sites in BED file if (chr_to_idx.find(chr) != chr_to_idx.end()) { idx = chr_to_idx[chr]; bool found=false; unsigned int max_ui = lims[idx].size(); for (unsigned int ui=min_ui[idx]; ui lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) || // Start pos inside bin ((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) || // End pos inside bin ((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second))) // Variant spans bin { found=true; break; } else if (pos1 > lims[idx][ui].second) min_ui[idx] = ui+1; } if (found == true) include_entry[s] = false; } } } } void vcf_file::filter_sites_by_mask(const string &mask_file, bool invert_mask, int min_kept_mask_value) { // Filter sites on the basis of a fasta-like mask file. if (mask_file == "") return; if (invert_mask == false) LOG.printLOG("Filtering sites by mask file\n"); else LOG.printLOG("Filtering sites by inverted mask file\n"); ifstream mask(mask_file.c_str()); if (!mask.is_open()) LOG.error("Could not open mask file: " + mask_file); string line; string next_chr="", vcf_line; unsigned int next_pos = 0; unsigned int next_s = 0; unsigned int current_pos = 1; string current_header = ""; bool keep; while (!mask.eof()) { getline(mask, line); line.erase( line.find_last_not_of(" \t") + 1); if (line[0] == '>') { // Header current_header = line.substr(1, line.find_first_of(" \t")-1); current_pos = 1; for (unsigned int s=0; s= next_pos) && (next_chr == current_header)) { for (unsigned int ui=0; ui::max())) return; LOG.printLOG("Filtering sites by number of alleles\n"); int N_alleles; string vcf_line; for (unsigned int s=0; s max_alleles)) { include_entry[s] = false; } } } void vcf_file::filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_site_call_rate) { // Filter sites so that all allele frequencies are between limits if ((min_maf <= 0.0) && (max_maf >= 1.0) && (min_site_call_rate <= 0) && (min_non_ref_af <= 0.0) && (max_non_ref_af >= 1.0)) return; if (has_genotypes == false) LOG.error("Require Genotypes in VCF file to filter by frequency and/or call rate"); LOG.printLOG("Filtering sites by allele frequency and call rate\n"); unsigned int N_alleles; unsigned int N_non_missing_chr; string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s allele_counts; e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); double freq; double maf=numeric_limits::max(); for (unsigned int ui=0; ui 0) && ((freq < min_non_ref_af) || (freq > max_non_ref_af))) include_entry[s] = false; } if ((maf < min_maf) || (maf > max_maf)) include_entry[s] = false; //unsigned int N_geno_included = e.get_N_chr(); double call_rate = N_non_missing_chr / double(e.get_N_chr(include_indv, include_genotype[s])); if (call_rate < min_site_call_rate) include_entry[s] = false; } } void vcf_file::filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels) { if ((keep_only_indels == false) && (remove_indels == false)) return; if ((keep_only_indels == true) && (remove_indels == true)) LOG.error("Can't both keep and remove all indels!"); LOG.printLOG("Filtering sites by allele type\n"); string vcf_line; vcf_entry e(N_indv); string allele; unsigned int ref_len, N_alleles; bool is_indel; for (unsigned int s=0; s::max()) && (min_non_ref_ac <= 0) && (max_non_ref_ac == numeric_limits::max()) && (max_missing_call_count == numeric_limits::max())) return; // Filter sites so that all allele counts are between limits if (has_genotypes == false) LOG.error("Require Genotypes in VCF file to filter by allele counts and/or missing data"); LOG.printLOG("Filtering sites by allele count and missing data\n"); unsigned int N_alleles, N_chr, N_non_missing_chr; string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s allele_counts; e.get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); N_chr = e.get_N_chr(include_indv, include_genotype[s]); int mac = numeric_limits::max(); for (unsigned int ui=0; ui 0) && ((allele_counts[ui] < min_non_ref_ac) || (allele_counts[ui] > max_non_ref_ac))) include_entry[s] = false; } if ((mac < min_mac) || (mac > max_mac)) include_entry[s] = false; if ((N_chr-N_non_missing_chr) > max_missing_call_count) include_entry[s] = false; } } void vcf_file::filter_sites_by_HWE_pvalue(double min_HWE_pvalue) { // Filter sites by HWE p-value if (min_HWE_pvalue <= 0) return; if (has_genotypes == false) LOG.error("Require Genotypes in VCF file to filter sites by HWE."); // Note this assumes Biallelic SNPs. LOG.printLOG("Filtering sites by HWE p-value (only including bi-allelic sites)\n"); unsigned int b11, b12, b22; double p; string vcf_line; for (unsigned int s=0; s &filter_flags_to_remove, const set &filter_flags_to_keep, bool remove_all) { // Filter sites by entries in the FILTER field. if ((remove_all == false) && (filter_flags_to_remove.size() == 0) && (filter_flags_to_keep.size() == 0)) return; LOG.printLOG("Filtering sites by FILTER Status.\n"); vector FILTERs; string vcf_line; unsigned int N_to_remove = filter_flags_to_remove.size(); unsigned int N_to_keep = filter_flags_to_keep.size(); for (unsigned int s=0; s 0) { bool keep = false; for (unsigned int ui=0; ui 0)) include_entry[s] = false; else if (N_to_remove > 0) { for (unsigned int ui=0; ui 0) include_entry[s] = false; } } void vcf_file::filter_sites_by_thinning(int min_SNP_distance) { // Filter sites so that no two SNPs are within some minimum distance if (min_SNP_distance < 1) return; LOG.printLOG("Filtering sites so that no two sites are within " + output_log::int2str(min_SNP_distance) + "bp\n"); string vcf_line; vcf_entry e(N_indv); map CHROM_to_idx; string CHROM, last_CHROM=""; int POS, last_POS = -1; int distance_from_last_SNP; for (unsigned int s=0; s &flags_to_remove, const set &flags_to_keep) { // Filter sites by entries in the INFO field. if ((flags_to_remove.size() == 0) && (flags_to_keep.size() == 0)) return; LOG.printLOG("Filtering sites by INFO flags.\n"); vector INFOs; string vcf_line; string value; unsigned int N_to_remove = flags_to_remove.size(); unsigned int N_to_keep = flags_to_keep.size(); for (unsigned int s=0; s 0) { bool keep = false; for (set::iterator it=flags_to_keep.begin(); it != flags_to_keep.end(); ++it) { value = e.get_INFO_value(*it); if (value == "1") keep = true; } include_entry[s] = keep; } if (include_entry[s]==false) continue; if (N_to_remove > 0) { for (set::iterator it=flags_to_remove.begin(); it != flags_to_remove.end(); ++it) { value = e.get_INFO_value(*it); if (value == "1") { include_entry[s] = false; continue; } } } } }