#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); merge_vcf_files($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak join('',@msg); } die "About: Merges VCF files by position, creating multi-sample VCFs from fewer-sample VCFs.\n", " The tool requires bgzipped and tabix indexed VCF files on input. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n", " If you need to concatenate VCFs (e.g. files split by chromosome), look at vcf-concat instead.\n", "Usage: vcf-merge [OPTIONS] file1.vcf file2.vcf.gz ... > out.vcf\n", "Options:\n", " -c, --collapse treat as identical sites with differing alleles [any]\n", " -d, --remove-duplicates If there should be two consecutive rows with the same chr:pos, print only the first one.\n", " -H, --vcf-header Use the VCF header\n", " -h, -?, --help This help message.\n", " -r, --regions Do only the given regions (comma-separated list or one region per line in a file).\n", " -R, --ref-for-missing Use the REF allele instead of the default missing genotype. Because it is not obvious\n", " what ploidy should be used, a user-defined string is used instead (e.g. 0/0).\n", " -s, --silent Try to be a bit more silent, no warnings about duplicate lines.\n", " -t, --trim-ALTs If set, redundant ALTs will be removed\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args => [$0, @ARGV], joiner => { 'DP4' => \&joiner_dp4, 'DP' => \&joiner_sum, }, trim_redundant_ALTs => 0, collapse_any => 1, }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-d' || $arg eq '--remove-duplicates' ) { $$opts{rm_dups}=1; next; } if ( $arg eq '-R' || $arg eq '--ref-for-missing' ) { $$opts{ref_for_missing}=shift(@ARGV); next; } if ( $arg eq '-t' || $arg eq '--trim-ALTs' ) { $$opts{trim_redundant_ALTs}=1; next; } if ( $arg eq '-s' || $arg eq '--silent' ) { $$opts{silent_dups}=1; next; } if ( $arg eq '-H' || $arg eq '--vcf-header' ) { $$opts{vcf_header}=shift(@ARGV); next; } if ( $arg eq '-c' || $arg eq '--collapse' ) { $$opts{collapse_any} = 1; my $c = shift(@ARGV); if ( $c eq 'snps' ) { $$opts{collapse_snps}=1; } elsif ( $c eq 'indels' ) { $$opts{collapse_indels}=1; } elsif ( $c eq 'both' ) { $$opts{collapse_snps}=1; $$opts{collapse_indels}=1; } elsif ( $c eq 'any' ) { $$opts{collapse_any}=1; } elsif ( $c eq 'none' ) { $$opts{collapse_any}=0; $$opts{collapse_snps}=0; $$opts{collapse_indels}=0; } else { error("Expected one of with -c"); } next; } if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { push @{$$opts{files}},$arg; next; } error("Unknown parameter or non-existent file \"$arg\". Run -? for help.\n"); } if ( !exists($$opts{files}) ) { error() } return $opts; } # Returns the common prefix of the files. sub common_prefix { my ($files) = @_; my @paths; my $len = -1; for my $file (@$files) { my @path = split(m{/+},$file); if ( $len<0 || $len>scalar @path ) { $len=scalar @path; } push @paths, \@path; } my @common; for (my $i=0; $i<$len; $i++) { my $identical=1; for (my $ifile=1; $ifile[$i] ne $paths[0]->[$i] ) { $identical=0; last; } } if ( !$identical ) { last; } push @common, $paths[0]->[$i]; } return join('/+',@common); } sub read_region_list { my ($opts) = @_; my @regions = (); if ( exists($$opts{regions_list}) ) { if ( -e $$opts{regions_list} ) { open(my $rgs,'<',$$opts{regions_list}) or error("$$opts{regions_list}: $!"); while (my $line=<$rgs>) { chomp($line); push @regions, $line; } close($rgs); } else { @regions = split(/,/,$$opts{regions_list}); } } return (@regions); } sub check_AGtags_definition { my ($vcf) = @_; if ( $$vcf{version} >= 4.1 ) { return; } # Whatever is the value set to, the user takes the responsibility for the merging strategy used if ( exists($ENV{DONT_FIX_VCF40_AG_TAGS}) ) { return; } my @tags; if ( exists($$vcf{header}{INFO}{PL}) && $$vcf{header}{INFO}{PL}{Number} != -1 ) { push @tags, 'PL'; } if ( exists($$vcf{header}{INFO}{GL}) && $$vcf{header}{INFO}{GL}{Number} != -1 ) { push @tags, 'GL'; } if ( exists($$vcf{header}{INFO}{AC}) && $$vcf{header}{INFO}{AC}{Number} != -1 ) { push @tags, 'AC'; } if ( exists($$vcf{header}{INFO}{AF}) && $$vcf{header}{INFO}{AF}{Number} != -1 ) { push @tags, 'AF'; } if ( !@tags ) { return; } $ENV{DONT_FIX_VCF40_AG_TAGS} = 1; my $tags = join(',',@tags); print STDERR "Warning: The $tags tag(s) will not be merged correctly for multiallelic sites.\n", " To be handled correctly, please redefine with Number=. or set the environment\n", " variable DONT_FIX_VCF40_AG_TAGS=0.\n"; } sub init_cols { my ($opts,$vcf_out) = @_; my $prefix; my @regions = read_region_list($opts); my @vcfs; my @cols; my %has_chrom; my %col_names; my $icol = 9; my $ncols_total = 0; if ( !$$opts{has_col_names} ) { $prefix = common_prefix($$opts{files}); } # Go through all files and read header, obtain list of chromosomes. The file names will be used for columns, unless # they were read from the header. for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $$vcf{line_buffer} = []; $vcf->parse_header(); check_AGtags_definition($vcf); $vcf->close(); push @vcfs, $vcf; # Precompute the weighting factor for the QUAL column my $ncols = scalar @{$$vcf{columns}} - 9; if ( $ncols<=0 ) { $ncols = 1; } $$vcf{qual_weight} = 1.0*$ncols; $ncols_total += $ncols; # Update the list of known chromosomes if ( !exists($$opts{regions_list}) ) { my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrom{$chr}) ) { next; } $has_chrom{$chr} = 1; push @regions, $chr; } } my $col_prefix = ''; if ( !$$opts{has_col_names} ) { # Make the column names nice - strip common prefix and the suffix .vcf.gz $col_prefix = $file; $col_prefix =~ s{^/*$prefix/*}{}; $col_prefix =~ s/\.gz$//i; $col_prefix =~ s/\.vcf$//i; $col_prefix .= '_'; } if ( !exists($$vcf{columns}) ) { error("No header present? $file\n"); } # Create good names for the columns in the merged vcf file my @vcf_cols = @{$$vcf{columns}}; $$vcf{__col_names} = []; for my $col (@vcf_cols[9..$#vcf_cols]) { my $col_name = $col; if ( $$opts{has_col_names} ) { if ( $icol >= @{$$vcf_out{columns}} ) { error("Fewer columns in the header than in the VCF files total.\n"); } $col_name = $$vcf_out{columns}[$icol]; $icol++; if ( exists($col_names{$col_name}) ) { error("The column names not unique in the header: $col_name\n"); } } else { if ( exists($col_names{$col_name}) ) { $col_name = $col_prefix.$col; } if ( exists($col_names{$col_name}) ) { warn("FIXME: the column name [$col_name] not unique.\n"); } } warn("Using column name '$col_name' for $file:$col\n"); $col_names{$col_name} = 1; push @cols, $col_name; push @{$$vcf{__col_names}}, $col_name; } } if ( $$opts{has_col_names} && $icol!=@{$$vcf_out{columns}} ) { error("More columns in the header than in the VCF files total.\n"); } # QUAL weighting for my $vcf (@vcfs) { $$vcf{qual_weight} /= $ncols_total; } $$opts{vcfs} = \@vcfs; $$opts{cols} = \@cols; $$opts{regions} = \@regions; } sub merge_vcf_files { my ($opts) = @_; # Create output VCF my $vcf_out; if ( $$opts{vcf_header} ) { $vcf_out = Vcf->new(file=>$$opts{vcf_header}); $vcf_out->parse_header(); if ( $$vcf_out{columns} && @{$$vcf_out{columns}} ) { $$opts{has_col_names}=1; } } else { $vcf_out = Vcf->new(); } $$vcf_out{trim_redundant_ALTs} = $$opts{trim_redundant_ALTs}; init_cols($opts,$vcf_out); my @regions = @{$$opts{regions}}; my @cols = @{$$opts{cols}}; my @vcfs = @{$$opts{vcfs}}; # Get the header of the output VCF ready $vcf_out->add_columns(@cols); if ( !$$vcf_out{has_header} ) { for my $vcf (@vcfs) { # To get the missig fields filled by the default values for my $hline (@{$$vcf{header_lines}}) { if ( $$hline{key} eq 'fileformat' ) { next; } $vcf_out->add_header_line($hline,silent=>1); } } } # List source files my $source; for (my $i=0; $i<@vcfs; $i++) { if ( $i ) { $source .= ','; } $source .= "$i:$vcfs[$i]{file}"; } $vcf_out->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcf_out->add_header_line({key=>'sourceFiles',value=>$source},append=>'timestamp'); $vcf_out->add_header_line({key=>'INFO',ID=>'SF',Number=>-1,Type=>'String',Description=>'Source File (index to sourceFiles, f when filtered)'}); my $have_samples = @{$$vcf_out{columns}}>9 ? 1 : 0; $vcf_out->recalc_ac_an($have_samples ? 2 : 0); $vcf_out->add_header_line({key=>'INFO',ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf_out->add_header_line({key=>'INFO',ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); print $vcf_out->format_header(); # Go through all VCF files simultaneously and output each line, one region at a time. for my $region (@regions) { # Open files for my $vcf (@vcfs) { delete($$vcf{done}); $vcf->open(region=>$region); } while ( my $pos=advance_position($opts,\@vcfs) ) { my %out; $out{POS} = $pos; $out{ID} = '.'; $out{ALT} = []; $out{FORMAT} = []; my %format; my %info; my @src_files; my %filters; my (@quals,@qual_weights,$qual_weights_sum,%ac,$an); my %ref_alt_map = (); # Find out the REFs and ALTs: in VCFv4.0, the REFs can differ and ALTs must be converted for my $vcf (@vcfs) { my $line = $$vcf{last_line}; if ( !$line ) { next; } if ( !exists($out{CHROM}) ) { $out{CHROM} = $$line{CHROM}; } my $ref = $$line{REF}; for my $alt (@{$$line{ALT}}) { $ref_alt_map{$ref}{$alt}=$alt; } } # Do the REF,ALT conversion only when necessary my $new_ref; if ( scalar keys %ref_alt_map > 1 ) { $new_ref = $vcf_out->fill_ref_alt_mapping(\%ref_alt_map); } if ( !$have_samples or !$$opts{trim_redundant_ALTs} ) { # Do not loose information from the ALT column when samples are not present my %alts; for my $vcf (@vcfs) { my $line = $$vcf{last_line}; if ( !$line ) { next; } my $ref = $$line{REF}; for my $alt (@{$$line{ALT}}) { $alts{$ref_alt_map{$ref}{$alt}}=1; } delete($alts{'.'}); $out{ALT} = [ keys %alts ]; } } for (my $ivcf=0; $ivcf<@vcfs; $ivcf++) { my $vcf = $vcfs[$ivcf]; my $line = $$vcf{last_line}; # If this file does not have a record for this position, then for all its columns output undef gtype if ( !$line ) { for (my $i=0; $i<@{$$vcf{__col_names}}; $i++) { my $name = $$vcf{__col_names}->[$i]; $out{gtypes}{$name}{GT} = exists($$opts{ref_for_missing}) ? $$opts{ref_for_missing} : $$vcf_out{defaults}{GT}; } next; } # Check if the site has been filtered if ( scalar @{$$line{FILTER}}>1 or ($$line{FILTER}[0] ne $$vcf{filter_passed} && $$line{FILTER}[0] ne $$vcf{defaults}{default}) ) { push @src_files,$ivcf.'f'; } else { push @src_files,$ivcf; } # Collect information for the FILTER field for my $flt (@{$$line{FILTER}}) { if ( $flt eq $$vcf{filter_passed} ) { $filters{$$vcf_out{filter_passed}} = 1; } elsif ( $flt ne $$vcf{defaults}{default} ) { $filters{$flt} = 1; } } # Collect information for the QUAL field if ( $$line{QUAL} ne $$vcf{defaults}{QUAL} && $$line{QUAL} ne $$vcf{defaults}{default} && $$line{QUAL}>0 ) { push @quals,$$line{QUAL}; push @qual_weights,$$vcf{qual_weight}; $qual_weights_sum += $$vcf{qual_weight}; } if ( $$line{ID} ne '.' && $out{ID} eq '.' ) { $out{ID}=$$line{ID}; } # Remember the FORMAT fields for my $field (@{$$line{FORMAT}}) { $format{$field} = 1; } # VCF without genotypes: calculate AC,AN if present if ( !$have_samples ) { if ( exists($$line{INFO}{AN}) ) { $an += $$line{INFO}{AN}; } if ( exists($$line{INFO}{AC}) ) { my (@acs) = split(/,/,$$line{INFO}{AC}); for (my $i=0; $i<@acs; $i++) { my $alt = $ref_alt_map{$$line{REF}}{$$line{ALT}[$i]}; $ac{$alt} += $acs[$i]; } } } # Join the INFO field for my $inf (keys %{$$line{INFO}}) { # When conflicting INFO fields are present, use the first one, unless a joining method exists if ( exists($info{$inf}) ) { if ( exists($$opts{joiner}{$inf}) ) { &{$$opts{joiner}{$inf}}(\$info{$inf},$$line{INFO}{$inf}); } next; } $info{$inf} = $$line{INFO}{$inf}; } my $ref = $$line{REF}; # The ALT column may change after the merge, take care of ALT dependent tags such as GL. if ( $have_samples ) { if ( defined $new_ref ) { $vcf->parse_AGtags($line,\%ref_alt_map,$$line{REF}); } else { $vcf->parse_AGtags($line); } } # Now fill in the genotype information for each column for (my $i=0; $i<@{$$vcf{__col_names}}; $i++) { my $ori_name = $$vcf{columns}->[$i+9]; my $out_name = $$vcf{__col_names}->[$i]; $out{gtypes}{$out_name} = $$line{gtypes}{$ori_name}; # This is to convert 0/1 to G/C my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($line,$ori_name); if ( defined $new_ref ) { my @als; for my $al (@$alleles) { push @als, exists($ref_alt_map{$ref}{$al}) ? $ref_alt_map{$ref}{$al} : '.'; } $out{gtypes}{$out_name}{GT} = $vcf->format_haplotype(\@als,$seps); } else { $out{gtypes}{$out_name}{GT} = $vcf->format_haplotype($alleles,$seps); } } $out{REF} = defined $new_ref ? $new_ref : $ref; } $out{INFO} = { %info }; $out{INFO}{SF} = join(',',@src_files); # Output the QUAL information my $qual; for (my $i=0; $i<@quals; $i++) { $qual += $quals[$i] * $qual_weights[$i] * (1.0 / $qual_weights_sum); } $out{QUAL} = defined $qual ? sprintf("%.2f",$qual) : $$vcf_out{defaults}{QUAL}; # Output the FILTER information: remove PASS or missing value if some other information # is present. delete($filters{$$vcf_out{defaults}{default}}); if ( exists($filters{$$vcf_out{filter_passed}}) && scalar keys %filters > 1 ) { delete($filters{$$vcf_out{filter_passed}}); } $out{FILTER} = [ keys %filters ]; if ( !@{$out{FILTER}} ) { push @{$out{FILTER}},$$vcf_out{defaults}{default}; } # The GT field must come as first delete($format{GT}); $out{FORMAT} = ['GT']; for my $key (keys %format) { push @{$out{FORMAT}},$key; } if ( $have_samples ) { $vcf_out->format_genotype_strings(\%out); } else { if ( defined $an ) { $out{INFO}{AN}=$an; } if ( scalar keys %ac ) { my @acs; for my $alt (@{$out{ALT}}) { # Some of the files may not have AC, the AC count can be undefined in such a case. push @acs, exists($ac{$alt}) ? $ac{$alt} : 0; } $out{INFO}{AC} = join(',',@acs); } } print $vcf_out->format_line(\%out); } } for my $vcf (@vcfs) { $vcf->close() or error("close failed: $$vcf{file}\n"); } } sub advance_position { my ($opts,$vcfs) = @_; my $min_pos; for my $vcf (@$vcfs) { fill_buffer($opts,$vcf) unless $$vcf{done}; if ( @{$$vcf{line_buffer}} && (!defined $min_pos or $min_pos>$$vcf{line_buffer}[0]{POS}) ) { $min_pos = $$vcf{line_buffer}[0]{POS}; } } if ( !defined $min_pos ) { return undef; } my ($first,$has_snp,$has_indel); for my $vcf (@$vcfs) { delete($$vcf{last_line}); if ( @{$$vcf{line_buffer}} && $min_pos ne $$vcf{line_buffer}[0]{POS} ) { next; } if ( !defined $first ) { $$vcf{last_line} = shift @{$$vcf{line_buffer}}; $first = $$vcf{last_line}; next; } my $irec; for (my $i=0; $i<@{$$vcf{line_buffer}}; $i++) { my $line = $$vcf{line_buffer}[$i]; if ( $$line{POS} ne $$first{POS} ) { last; } if ( $$opts{collapse_any} ) { $irec=$i; last; } # checking position only if ( $$opts{collapse_snps} && $$first{variant_type}&1 && $$line{variant_type}&1 ) { $irec=$i; last; } if ( $$opts{collapse_indels} && $$first{variant_type}&2 && $$line{variant_type}&2 ) { $irec=$i; last; } if ( $$vcf{line_buffer}[$i]{REF} ne $$first{REF} ) { next; } # refs do not match for my $al1 (@{$$line{ALT}}) { for my $al2 (@{$$first{ALT}}) { if ( $al1 eq $al2 ) { $irec=$i; last; } } if ( defined $irec ) { last; } } if ( defined $irec ) { last; } } if ( defined $irec ) { $$vcf{last_line} = splice(@{$$vcf{line_buffer}},$irec,1); } } return $min_pos; } sub fill_buffer { my ($opts,$vcf) = @_; if ( @{$$vcf{line_buffer}} && $$vcf{line_buffer}[0]{POS}!=$$vcf{line_buffer}[-1]{POS} ) { return; } while ( 1 ) { my $line = $vcf->next_data_hash(); if ( !$line ) { $$vcf{done} = 1; return; } if ( !$$opts{collapse_any} ) { for my $al (@{$$line{ALT}}) { my ($type,$len,$ht) = $vcf->event_type($$line{REF},$al); if ( $type eq 's' or $type eq 'r' ) { $$line{variant_type} |= 1; } if ( $type eq 'i' or $type eq 'o' ) { $$line{variant_type} |= 2; } } } push @{$$vcf{line_buffer}}, $line; if ( $$vcf{line_buffer}[0]{POS} != $$vcf{line_buffer}[-1]{POS} ) { return; } } } # Field joiner methods sub joiner_sum { my ($ori,$new) = @_; $$ori += $new; } sub joiner_dp4 { my ($ori,$new) = @_; my @vals1 = split(/,/,$$ori); my @vals2 = split(/,/,$new); if ( @vals1 != @vals2 ) { error("Cannot join: $$ori vs $new\n"); } for (my $i=0; $i<@vals1; $i++) { $vals1[$i] += $vals2[$i]; } $$ori = join(',',@vals1); }