#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); vcf_subset($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-subset [OPTIONS] in.vcf.gz > out.vcf\n", "Options:\n", " -a, --trim-alt-alleles Remove alternate alleles if not found in the subset\n", " -c, --columns File or comma-separated list of columns to keep in the vcf file. If file, one column per row\n", " -e, --exclude-ref Exclude rows not containing variants.\n", " -f, --force Proceed anyway even if VCF does not contain some of the samples.\n", " -p, --private Print only rows where only the subset columns carry an alternate allele.\n", " -r, --replace-with-ref Replace the excluded types with reference allele instead of dot.\n", " -t, --type Comma-separated list of variant types to include: ref,SNPs,indels,MNPs,other.\n", " -u, --keep-uncalled Do not exclude rows without calls.\n", " -h, -?, --help This help message.\n", "Examples:\n", " cat in.vcf | vcf-subset -r -t indels -e -c SAMPLE1 > out.vcf\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { exclude_ref=>0, keep_uncalled=>0, replace_with_ref=>0, private=>0, args=>[$0, @ARGV] }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-t' || $arg eq '--type' ) { my %known = ( ref=>'r', SNPs=>'s', indels=>'i', MNPs=>'m', other=>'o' ); my $types = shift(@ARGV); for my $t (split(/,/,$types)) { if ( !(exists($known{$t})) ) { error("Unknown type [$t] with -t [$types]\n"); } $$opts{types}{$known{$t}} = 1; } next; } if ( $arg eq '-a' || $arg eq '--trim-alt-alleles' ) { $$opts{'trim_alts'} = 1; next } if ( $arg eq '-e' || $arg eq '--exclude-ref' ) { $$opts{'exclude_ref'} = 1; next } if ( $arg eq '-f' || $arg eq '--force' ) { $$opts{'force'} = 1; next } if ( $arg eq '-p' || $arg eq '--private' ) { $$opts{'private'} = 1; next } if ( $arg eq '-r' || $arg eq '--replace-with-ref' ) { $$opts{'replace_with_ref'} = 1; next } if ( $arg eq '-u' || $arg eq '--keep-uncalled' ) { $$opts{'keep_uncalled'} = 1; next } if ( $arg eq '-c' || $arg eq '--columns' ) { $$opts{'columns_file'} = shift(@ARGV); next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { $$opts{file} = $arg; next } if ( -e $arg or $arg=~m{^(?:ftp|http)://} ) { $$opts{file}=$arg; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !$$opts{exclude_ref} && !$$opts{'columns_file'} && !exists($$opts{'types'}) && !exists($$opts{trim_alts}) ) { error("Missing the -c or -t or -r option.\n") } return $opts; } sub read_columns { my ($fname) = @_; my @columns; if ( !-e $fname ) { @columns = split(/,/,$fname); return \@columns; } open(my $fh,'<',$fname) or error("$fname: $!"); while (my $line=<$fh>) { chomp($line); $line=~s/\s+//g; push @columns, $line; } close($fh); return \@columns; } sub check_columns { my ($opts,$vcf,$columns) = @_; my @out; for my $col (@$columns) { if ( exists($$vcf{has_column}{$col}) ) { push @out, $col; next; } my $msg = qq[No such column in the VCF file: "$col"\n]; if ( $$opts{force} ) { warn($msg); } else { error($msg); } } return \@out; } sub vcf_subset { my ($opts) = @_; my $vcf = $$opts{file} ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header(); my $AGtags; if ( $$opts{trim_alts} ) { $$vcf{trim_redundant_ALTs} = 1; $AGtags = $vcf->has_AGtags(); } # Init requested column info. If not present, include all columns. my $columns = exists($$opts{columns_file}) ? read_columns($$opts{columns_file}) : []; $columns = check_columns($opts,$vcf,$columns); if ( !@$columns && (my $ncols=@{$$vcf{columns}})>9 ) { push @$columns, @{$$vcf{columns}}[9..($ncols-1)]; } my $columns_to_keep = { map { $_ => 1 } @$columns }; my %has_col = map { $_ => 1 } @$columns; $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcf->set_samples(include=>$columns) unless $$opts{private}; print $vcf->format_header($columns); my $check_private = $$opts{private}; while (my $x=$vcf->next_data_hash()) { my $site_has_call = 0; my $site_has_nonref = 0; my $site_is_private = 1; my $ref = $$x{REF}; for my $col (keys %{$$x{gtypes}}) { if ( !$has_col{$col} && ($site_is_private==0 || !$check_private) ) { # This column is not to be printed delete($$x{gtypes}{$col}); next; } my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,$col); my $sample_has_call = 0; my $sample_has_nonref = 0; my @out_alleles; for (my $i=0; $i<@$alleles; $i++) { my ($type,$len,$ht) = $vcf->event_type($ref,$$alleles[$i]); $out_alleles[$i] = $$alleles[$i]; # Exclude unwanted variant types if requested if ( exists($$opts{types}) ) { if ( $type eq 's' && $len>1 ) { $type = 'm'; } elsif ( $type eq 'b' or $type eq 'u' ) { $type = 'o'; } if ( !exists($$opts{types}{$type}) ) { $out_alleles[$i] = $$opts{replace_with_ref} ? $ref : '.'; next; } $sample_has_call = 1; } elsif ( !$is_empty ) { $sample_has_call = 1; } if ( $type ne 'r' ) { $site_has_nonref = 1; $sample_has_nonref = 1; } } if ( $check_private && !$has_col{$col} ) { if ( $sample_has_nonref ) { $site_is_private=0; } delete($$x{gtypes}{$col}); next; } if ( !$sample_has_call ) { if ( $$opts{replace_with_ref} ) { for (my $i=0; $i<@$alleles; $i++) { $out_alleles[$i] = $ref; } } else { for (my $i=0; $i<@$alleles; $i++) { $out_alleles[$i] = '.'; } } } else { $site_has_call = 1; } $$x{gtypes}{$col}{GT} = $vcf->format_haplotype(\@out_alleles,$seps); } if ( !$site_has_call && !$$opts{keep_uncalled} ) { next; } if ( !$site_has_nonref && $$opts{exclude_ref} ) { next; } if ( $check_private && (!$site_is_private || !$site_has_nonref) ) { next; } if ( $$opts{trim_alts} && defined $AGtags ) { $vcf->remove_columns($x, keep=>$columns_to_keep); $vcf->parse_AGtags($x); } $vcf->format_genotype_strings($x); print $vcf->format_line($x,$columns); } }