#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use FaSlice; my $opts = parse_params(); flanking_sequence($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: Annotate VCF with flanking sequence (INFO/FS tag)\n", "Usage: fill-fs [OPTIONS] file.vcf\n", "Options:\n", " -b, --bed-mask Regions to mask (tabix indexed), multiple files can be given\n", " -c, --cluster Do self-masking of clustered variants within this range.\n", " -l, --length Flanking sequence length [100]\n", " -m, --mask-char The character to use or \"lc\" for lowercase. This option must preceed\n", " -b, -v or -c in order to take effect. With multiple files works\n", " as a switch on the command line, see the example below [N]\n", " -r, --refseq The reference sequence.\n", " -v, --vcf-mask Mask known variants in the flanking sequence, multiple files can be given (tabix indexed)\n", " -h, -?, --help This help message.\n", "Example:\n", " # Mask variants from the VCF file with N's and use lowercase for the bed file regions\n", " fill-fs file.vcf -v mask.vcf -m lc -b mask.bed\n", "\n"; } sub parse_params { my $opts = { length=>100, mask=>[], cluster=>0 }; my $mask = $$opts{mask_char}{default} = 'N'; my $mask_changed = 0; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-c' || $arg eq '--cluster' ) { $$opts{cluster}=shift(@ARGV); $$opts{mask_char}{default}=$mask; $mask_changed=0; next; } if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-l' || $arg eq '--length' ) { $$opts{length}=shift(@ARGV); next; } if ( $arg eq '-m' || $arg eq '--mask' ) { $mask=shift(@ARGV); check_mask_char($mask); $mask_changed=1; next; } if ( $arg eq '-b' || $arg eq '--bed-mask' ) { $arg=shift(@ARGV); push @{$$opts{bed_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; } if ( $arg eq '-v' || $arg eq '--vcf-mask' ) { $arg=shift(@ARGV); push @{$$opts{vcf_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg && !exists($$opts{file}) ) { $$opts{file}=$arg; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !($$opts{length}=~/^\d+$/) ) { error("Expected integer after -l, got $$opts{length}\n"); } if ( !exists($$opts{refseq}) ) { error("Missing the -r option.\n"); } if ( $mask_changed ) { error("The -m parameter must preceed -b, -v, or the file in order to take effect.\n"); } return $opts; } sub check_mask_char { my ($mask) = @_; if ( $mask eq 'lc' ) { return; } if ( length($mask) eq 1 ) { return; } error("Currently only \"lc\" or one-character mask is supported, got \"$mask\".\n"); } sub flanking_sequence { my ($opts) = @_; $$opts{faref} = FaSlice->new(file=>$$opts{refseq},size=>1_024,oob=>'N'); my $vcf = $$opts{vcf} = exists($$opts{file}) ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header; print $vcf->format_header; my (@lines,@mask); while (my $line=$vcf->next_data_array) { my $chr = $$line[0]; my $pos = $$line[1]; my $ref = $$line[3]; my $alt = $$line[4]; my $off; $alt =~ s/,.+$//; # first allele is used at multiallelic sites ($off,$ref,$alt) = $vcf->normalize_alleles_pos($ref,$alt); $pos += $off; push @lines, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line }; push @mask, { chr=>$chr, pos=>$pos, ref=>$ref }; flush_buffers($opts,\@lines,\@mask); } flush_buffers($opts,\@lines,\@mask,1); } sub flush_buffers { my ($opts,$lines,$mask,$force) = @_; if ( !@$lines ) { return; } if ( !$$opts{cluster} ) { shift(@$mask); output_line($opts,shift(@$lines),$mask); return; } while ( @$lines && ($force or $$mask[0]{chr} ne $$lines[-1]{chr} or $$mask[0]{pos}+2*$$opts{cluster}<=$$lines[-1]{pos}) ) { output_line($opts,$$lines[0],$mask); shift(@$lines); while ( @$mask && @$lines && ($$mask[0]{chr} ne $$lines[0]{chr} or $$mask[0]{pos}+$$opts{cluster}<=$$lines[0]{pos}) ) { shift(@$mask); } } } sub output_line { my ($opts,$hline,$mask) = @_; my $chr = $$hline{chr}; my $pos = $$hline{pos}; my $ref = $$hline{ref}; my $alt = $$hline{alt}; my $line = $$hline{line}; my $seq_pos = $$opts{length}; my $reflen = length($ref); my $from = $pos-$$opts{length}; my $to = $pos+($reflen-1)+$$opts{length}; my $seq = $$opts{faref}->get_slice($chr,$from,$to); $seq = mask_sequence($opts,$seq,$chr,$from,$to,$mask); my $reflen_ori = $reflen; my ($len,$indel,$off) = $$opts{vcf}->is_indel($ref,$alt); if ( $len<0 ) { $seq_pos += $off; $ref = $indel; $reflen = abs($len); $alt = '-'; } elsif ( $len>0 ) { $seq_pos += $off; $ref = '-'; $alt = $indel; $reflen = $off-1; } substr($seq,$seq_pos,$reflen,"[$ref/$alt]"); if ( $reflen_ori - $reflen > 0 ) { # for redundant pad bases which cannot be removed without changing the position, e.g. ACGT AC $seq = substr($seq,$reflen_ori-$reflen); } if ( $$line[7] eq '.' or !defined $$line[7] ) { $$line[7] = ''; } else { $$line[7] .= ';'; } $$line[7] .= "FS=$seq"; print join("\t",@$line),"\n"; } sub mask_sequence { my ($opts,$seq,$chr,$from,$to,$mask) = @_; for my $m (@$mask) { my $reflen = length($$m{ref}); if ( $$m{chr} ne $chr or $$m{pos}+$reflen<$from or $$m{pos}>$to ) { next; } apply_mask($opts,\$seq,$$m{pos}-$from,$$m{ref},$$opts{mask_char}{default}); } for my $file (@{$$opts{vcf_mask}}) { my @tabix = `tabix $file $chr:$from-$to`; for my $ret (@tabix) { my $items = $$opts{vcf}->split_mandatory($ret); # In different situations one may want to treat indels differently. For # now, mask the whole REF string as for primer design it is safer to # mask the whole thing; for example, a 2bp deletion can be reported by # samtools as REF=GACACACA ALT=GACACA, the script will mask it all. apply_mask($opts,\$seq,$$items[1]-$from,$$items[3],$$opts{mask_char}{$file}); } } for my $file (@{$$opts{bed_mask}}) { my @tabix = `tabix $file $chr:$from-$to`; for my $ret (@tabix) { my @items = split(/\t/,$ret); apply_mask($opts,\$seq,$items[1]-$from+1,$items[2]-$from,$$opts{mask_char}{$file}); } } return $seq; } sub apply_mask { my ($opts,$seq,$from,$ref,$mask_char) = @_; if ( $from<0 ) { $from=0; } my $ref_len = $ref=~/^\d+$/ ? $ref-$from+1 : length($ref); my $seq_len = length($$seq); if ( $from+$ref_len>=$seq_len ) { $ref_len = $seq_len - $from; } if ( $ref_len<0 ) { return; } if ( $ref_len==1 ) { my $rpl = substr($$seq,$from,1); $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,1)) : $mask_char; substr($$seq,$from,1,$rpl); return; } my $rpl = substr($$seq,$from,$ref_len); $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,$ref_len)) : ($mask_char x $ref_len); substr($$seq,$from,$ref_len,$rpl); }