=head1 LICENSE Copyright (c) 1999-2013 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/legal/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at . Questions may also be sent to the Ensembl help desk at . =cut # EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence # # =head1 NAME Bio::EnsEMBL::Variation::Utils::VEP - Methods used by the Variant Effect Predictor =head1 SYNOPSIS use Bio::EnsEMBL::Variation::Utils::VEP qw(configure); my $config = configure(); =head1 METHODS =cut use strict; use warnings; package Bio::EnsEMBL::Variation::Utils::VEP; # module list use Getopt::Long; use FileHandle; use File::Path qw(mkpath); use Storable qw(nstore_fd fd_retrieve freeze thaw); use Scalar::Util qw(weaken); use Digest::MD5 qw(md5_hex); use IO::Socket; use IO::Select; use Bio::EnsEMBL::Registry; use Bio::EnsEMBL::Variation::VariationFeature; use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor; use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap); use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code SO_variation_class); use Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3; use Bio::EnsEMBL::Variation::StructuralVariationFeature; use Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor; use Bio::EnsEMBL::Variation::TranscriptStructuralVariation; # we need to manually include all these modules for caching to work use Bio::EnsEMBL::CoordSystem; use Bio::EnsEMBL::Transcript; use Bio::EnsEMBL::Translation; use Bio::EnsEMBL::Exon; use Bio::EnsEMBL::ProteinFeature; use Bio::EnsEMBL::Analysis; use Bio::EnsEMBL::DBSQL::GeneAdaptor; use Bio::EnsEMBL::DBSQL::SliceAdaptor; use Bio::EnsEMBL::DBSQL::TranslationAdaptor; use Bio::EnsEMBL::DBSQL::TranscriptAdaptor; use Bio::EnsEMBL::DBSQL::MetaContainer; use Bio::EnsEMBL::DBSQL::CoordSystemAdaptor; use Exporter; use vars qw(@ISA @EXPORT_OK); @ISA = qw(Exporter); @EXPORT_OK = qw( &detect_format &parse_line &vf_to_consequences &validate_vf &read_cache_info &dump_adaptor_cache &load_dumped_adaptor_cache &load_dumped_variation_cache &get_all_consequences &get_slice &build_slice_cache &build_full_cache ®ions_from_hash &get_time &debug &convert_to_vcf &progress &end_progress @REG_FEAT_TYPES @OUTPUT_COLS @VEP_WEB_CONFIG %FILTER_SHORTCUTS ); our @OUTPUT_COLS = qw( Uploaded_variation Location Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation Extra ); our @REG_FEAT_TYPES = qw( RegulatoryFeature MotifFeature ); our @VEP_WEB_CONFIG = qw( format check_existing coding_only core_type symbol protein hgvs terms check_frequency freq_filter freq_gt_lt freq_freq freq_pop filter_common sift polyphen regulatory ); our @VAR_CACHE_COLS = qw( variation_name failed start end allele_string strand minor_allele minor_allele_freq ); our %FILTER_SHORTCUTS = ( upstream => { '5KB_upstream_variant' => 1, '2KB_upstream_variant' => 1, }, downstream => { '5KB_downstream_variant' => 1, '2KB_downstream_variant' => 1, '500B_downstream_variant' => 1, }, utr => { '5_prime_UTR_variant' => 1, '3_prime_UTR_variant' => 1, }, splice => { splice_donor_variant => 1, splice_acceptor_variant => 1, splice_region_variant => 1, }, coding_change => { stop_lost => 1, stop_gained => 1, missense_variant => 1, frameshift_variant => 1, inframe_insertion => 1, inframe_deletion => 1, }, regulatory => { regulatory_region_variant => 1, TF_binding_site_variant => 1, }, ); # parses a line of input, returns VF object(s) sub parse_line { my $config = shift; my $line = shift; # find out file format - will only do this on first line if(!defined($config->{format}) || (defined($config->{format}) && $config->{format} eq 'guess')) { $config->{format} = &detect_format($line); debug("Detected format of input file as ", $config->{format}) unless defined($config->{quiet}); # HGVS and ID formats need DB die("ERROR: Can't use ".uc($config->{format})." format in offline mode") if $config->{format} =~ /id|hgvs/ && defined($config->{offline}); # force certain options if format is VEP output if($config->{format} eq 'vep') { $config->{no_consequence} = 1; delete $config->{regulatory}; debug("Forcing no consequence calculation") unless defined($config->{quiet}); } } # check that format is vcf when using --individual die("ERROR: --individual only compatible with VCF input files\n") if defined($config->{individual}) && $config->{format} ne 'vcf'; my $parse_method = 'parse_'.$config->{format}; $parse_method =~ s/vep_//; my $method_ref = \&$parse_method; my $vfs = &$method_ref($config, $line); $vfs = add_lrg_mappings($config, $vfs) if defined($config->{lrg}); return $vfs; } # sub-routine to detect format of input sub detect_format { my $line = shift; my @data = split /\s+/, $line; # HGVS: ENST00000285667.3:c.1047_1048insC if ( scalar @data == 1 && $data[0] =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i ) { return 'hgvs'; } # variant identifier: rs123456 elsif ( scalar @data == 1 ) { return 'id'; } # VCF: 20 14370 rs6054257 G A 29 0 NS=58;DP=258;AF=0.786;DB;H2 GT:GQ:DP:HQ elsif ( $data[0] =~ /(chr)?\w+/ && $data[1] =~ /^\d+$/ && $data[3] =~ /^[ACGTN-]+$/i && $data[4] =~ /^([\.ACGTN-]+\,?)+$/i ) { return 'vcf'; } # pileup: chr1 60 T A elsif ( $data[0] =~ /(chr)?\w+/ && $data[1] =~ /^\d+$/ && $data[2] =~ /^[\*ACGTN-]+$/i && $data[3] =~ /^[\*ACGTNRYSWKM\+\/-]+$/i ) { return 'pileup'; } # ensembl: 20 14370 14370 A/G + elsif ( $data[0] =~ /\w+/ && $data[1] =~ /^\d+$/ && $data[2] =~ /^\d+$/ && $data[3] =~ /(ins|dup|del)|([ACGTN-]+\/[ACGTN-]+)/i ) { return 'ensembl'; } # vep output: ID 1:142849179 - - - - INTERGENIC elsif ( $data[0] =~ /\w+/ && $data[1] =~ /^\w+?\:\d+(\-\d+)*$/ && scalar @data == 14 ) { return 'vep'; } else { die("ERROR: Could not detect input file format\n"); } } # parse a line of Ensembl format input into a variation feature object sub parse_ensembl { my $config = shift; my $line = shift; my ($chr, $start, $end, $allele_string, $strand, $var_name) = split /\s+/, $line; my $vf; # sv? if($allele_string !~ /\//) { my $so_term; # convert to SO term my %terms = ( INS => 'insertion', DEL => 'deletion', TDUP => 'tandem_duplication', DUP => 'duplication' ); $so_term = defined $terms{$allele_string} ? $terms{$allele_string} : $allele_string; $vf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({ start => $start, end => $end, strand => $strand, adaptor => $config->{svfa}, variation_name => $var_name, chr => $chr, class_SO_term => $so_term, }); } # normal vf else { $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $start, end => $end, allele_string => $allele_string, strand => $strand, map_weight => 1, adaptor => $config->{vfa}, variation_name => $var_name, chr => $chr, }); } return [$vf]; } # parse a line of VCF input into a variation feature object sub parse_vcf { my $config = shift; my $line = shift; my @data = split /\s+/, $line; # non-variant my $non_variant = 0; if($data[4] eq '.') { if(defined($config->{allow_non_variant})) { $non_variant = 1; } else { return []; } } # get relevant data my ($chr, $start, $end, $ref, $alt) = ($data[0], $data[1], $data[1], $data[3], $data[4]); # some VCF files have a GRCh37 pos defined in GP flag in INFO column # if user has requested, we can use that as the position instead if(defined $config->{gp}) { $chr = undef; $start = undef; foreach my $pair(split /\;/, $data[7]) { my ($key, $value) = split /\=/, $pair; if($key eq 'GP') { ($chr, $start) = split /\:/, $value; $end = $start; } } unless(defined($chr) and defined($start)) { warn "No GP flag found in INFO column" unless defined $config->{quiet}; return []; } } # adjust end coord $end += (length($ref) - 1); # structural variation if((defined($data[7]) && $data[7] =~ /SVTYPE/) || $alt =~ /\<|\[|\]|\>/) { # parse INFO field my %info = (); foreach my $bit(split /\;/, $data[7]) { my ($key, $value) = split /\=/, $bit; $info{$key} = $value; } # like indels, SVs have the base before included for reference $start++; # work out the end coord if(defined($info{END})) { $end = $info{END}; } elsif(defined($info{SVLEN})) { $end = $start + abs($info{SVLEN}) - 1; } # check for imprecise breakpoints my ($min_start, $max_start, $min_end, $max_end); if(defined($info{CIPOS})) { my ($low, $high) = split /\,/, $info{CIPOS}; $min_start = $start + $low; $max_start = $start + $high; } if(defined($info{CIEND})) { my ($low, $high) = split /\,/, $info{CIEND}; $min_end = $end + $low; $max_end = $end + $high; } # get type my $type; if($alt =~ /\<|\[|\]|\>/) { $type = $alt; $type =~ s/\<|\>//g; $type =~ s/\:.+//g; if($start >= $end && $type =~ /del/i) { warn "WARNING: VCF line on line ".$config->{line_number}." looks incomplete, skipping:\n$line\n"; return []; } } else { $type = $info{SVTYPE}; } my $so_term; if(defined($type)) { # convert to SO term my %terms = ( INS => 'insertion', DEL => 'deletion', TDUP => 'tandem_duplication', DUP => 'duplication' ); $so_term = defined $terms{$type} ? $terms{$type} : $type; } my $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({ start => $start, inner_start => $max_start, outer_start => $min_start, end => $end, inner_end => $min_end, outer_end => $max_end, strand => 1, adaptor => $config->{svfa}, variation_name => $data[2] eq '.' ? undef : $data[2], chr => $chr, class_SO_term => $so_term, }); return [$svf]; } # normal variation else { # find out if any of the alt alleles make this an insertion or a deletion my ($is_indel, $is_sub, $ins_count, $total_count); foreach my $alt_allele(split /\,/, $alt) { $is_indel = 1 if $alt_allele =~ /D|I/; $is_indel = 1 if length($alt_allele) != length($ref); $is_sub = 1 if length($alt_allele) == length($ref); $ins_count++ if length($alt_allele) > length($ref); $total_count++; } # multiple alt alleles? if($alt =~ /\,/) { if($is_indel) { my @alts; if($alt =~ /D|I/) { foreach my $alt_allele(split /\,/, $alt) { # deletion (VCF <4) if($alt_allele =~ /D/) { push @alts, '-'; } elsif($alt_allele =~ /I/) { $alt_allele =~ s/^I//g; push @alts, $alt_allele; } } } else { $ref = substr($ref, 1) || '-'; $start++; foreach my $alt_allele(split /\,/, $alt) { $alt_allele = substr($alt_allele, 1); $alt_allele = '-' if $alt_allele eq ''; push @alts, $alt_allele; } } $alt = join "/", @alts; } else { # for substitutions we just need to replace ',' with '/' in $alt $alt =~ s/\,/\//g; } } elsif($is_indel) { # deletion (VCF <4) if($alt =~ /D/) { my $num_deleted = $alt; $num_deleted =~ s/\D+//g; $end += $num_deleted - 1; $alt = "-"; $ref .= ("N" x ($num_deleted - 1)) unless length($ref) > 1; } # insertion (VCF <4) elsif($alt =~ /I/) { $ref = '-'; $alt =~ s/^I//g; $start++; } # insertion or deletion (VCF 4+) elsif(substr($ref, 0, 1) eq substr($alt, 0, 1)) { # chop off first base $ref = substr($ref, 1) || '-'; $alt = substr($alt, 1) || '-'; $start++; } } # create VF object my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $start, end => $end, allele_string => $non_variant ? $ref : $ref.'/'.$alt, strand => 1, map_weight => 1, adaptor => $config->{vfa}, variation_name => $data[2] eq '.' ? undef : $data[2], chr => $chr, }); # flag as non-variant $vf->{non_variant} = 1 if $non_variant; # individuals? if(defined($config->{individual})) { my @alleles = split /\//, $ref.'/'.$alt; my @return; foreach my $ind(keys %{$config->{ind_cols}}) { # get alleles present in this individual my @bits; my $gt = (split /\:/, $data[$config->{ind_cols}->{$ind}])[0]; my $phased = ($gt =~ /\|/ ? 1 : 0); foreach my $bit(split /\||\/|\\/, $gt) { push @bits, $alleles[$bit] unless $bit eq '.'; } # shallow copy VF my $vf_copy = { %$vf }; bless $vf_copy, ref($vf); # get non-refs my %non_ref = map {$_ => 1} grep {$_ ne $ref} @bits; # construct allele_string if(scalar keys %non_ref) { $vf_copy->{allele_string} = $ref."/".(join "/", keys %non_ref); } else { $vf_copy->{allele_string} = $ref; $vf_copy->{hom_ref} = 1; if(defined($config->{process_ref_homs})) { $vf_copy->{allele_string} .= "/".$ref; } else { $vf_copy->{non_variant} = 1; } } # store phasing info $vf_copy->{phased} = defined($config->{phased} ? 1 : $phased); # store GT $vf_copy->{genotype} = \@bits; # store individual name $vf_copy->{individual} = $ind; push @return, $vf_copy; } return \@return; } else { return [$vf]; } } } # parse a line of pileup input into variation feature objects sub parse_pileup { my $config = shift; my $line = shift; my @data = split /\s+/, $line; # pileup can produce more than one VF per line my @return; # normal variant if($data[2] ne "*"){ my $var; if($data[3] =~ /^[A|C|G|T]$/) { $var = $data[3]; } else { ($var = unambiguity_code($data[3])) =~ s/$data[2]//ig; } for my $alt(split //, $var){ push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $data[1], end => $data[1], allele_string => $data[2].'/'.$alt, strand => 1, map_weight => 1, adaptor => $config->{vfa}, chr => $data[0], }); } } # in/del else { my %tmp_hash = map {$_ => 1} split /\//, $data[3]; my @genotype = keys %tmp_hash; foreach my $allele(@genotype){ if(substr($allele,0,1) eq "+") { #ins push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $data[1] + 1, end => $data[1], allele_string => '-/'.substr($allele, 1), strand => 1, map_weight => 1, adaptor => $config->{vfa}, chr => $data[0], }); } elsif(substr($allele,0,1) eq "-"){ #del push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $data[1] + 1, end => $data[1] + length(substr($allele, 1)), allele_string => substr($allele, 1).'/-', strand => 1, map_weight => 1, adaptor => $config->{vfa}, chr => $data[0], }); } elsif($allele ne "*"){ warn("WARNING: invalid pileup indel genotype: $line\n") unless defined $config->{quiet}; } } } return \@return; } # parse a line of HGVS input into a variation feature object sub parse_hgvs { my $config = shift; my $line = shift; my $vf; # not all hgvs notations are supported yet, so we have to wrap it in an eval eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{sa}, $config->{ta}) }; if((!defined($vf) || (defined $@ && length($@) > 1)) && defined($config->{coordinator})) { eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{ofsa}, $config->{ofta}) }; } if(!defined($vf) || (defined $@ && length($@) > 1)) { warn("WARNING: Unable to parse HGVS notation \'$line\'\n$@") unless defined $config->{quiet}; return []; } # get whole chromosome slice my $slice = $vf->slice->adaptor->fetch_by_region($vf->slice->coord_system->name, $vf->slice->seq_region_name); $vf = $vf->transfer($slice); # name it after the HGVS $vf->{variation_name} = $line; # add chr attrib $vf->{chr} = $vf->slice->seq_region_name; return [$vf]; } # parse a variation identifier e.g. a dbSNP rsID sub parse_id { my $config = shift; my $line = shift; my $v_obj = $config->{va}->fetch_by_name($line); return [] unless defined $v_obj; my @vfs = @{$v_obj->get_all_VariationFeatures}; for(@vfs) { delete $_->{dbID}; delete $_->{overlap_consequences}; $_->{chr} = $_->seq_region_name; $config->{slice_cache}->{$_->{chr}} = $_->slice; } return \@vfs; } # parse a line of VEP output sub parse_vep { my $config = shift; my $line = shift; my @data = split /\t/, $line; my ($chr, $start, $end) = split /\:|\-/, $data[1]; $end ||= $start; # might get allele string from ID my $allele_string; if($data[0] =~ /^\w\_\w\_\w$/) { my @split = split /\_/, $data[0]; $allele_string = $split[-1] if $split[-1] =~ /[ACGTN-]+\/[ACGTN-]+/; } $allele_string ||= 'N/'.($data[6] =~ /intergenic/ ? 'N' : $data[2]); my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $start, end => $end, allele_string => $allele_string, strand => 1, map_weight => 1, adaptor => $config->{vfa}, chr => $chr, variation_name => $data[0], }); return [$vf]; } # converts to VCF format sub convert_to_vcf { my $config = shift; my $vf = shift; # look for imbalance in the allele string if($vf->isa('Bio::EnsEMBL::Variation::VariationFeature')) { my %allele_lengths; my @alleles = split /\//, $vf->allele_string; map {reverse_comp(\$_)} @alleles if $vf->strand < 0; foreach my $allele(@alleles) { $allele =~ s/\-//g; $allele_lengths{length($allele)} = 1; } # in/del/unbalanced if(scalar keys %allele_lengths > 1) { # we need the ref base before the variation # default to N in case we can't get it my $prev_base = 'N'; unless(defined($config->{cache})) { my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1); $prev_base = $slice->seq if defined($slice); } for my $i(0..$#alleles) { $alleles[$i] =~ s/\-//g; $alleles[$i] = $prev_base.$alleles[$i]; } return [ $vf->{chr} || $vf->seq_region_name, $vf->start - 1, $vf->variation_name, shift @alleles, (join ",", @alleles), '.', '.', '.' ]; } # balanced sub else { return [ $vf->{chr} || $vf->seq_region_name, $vf->start, $vf->variation_name, shift @alleles, (join ",", @alleles), '.', '.', '.' ]; } } # SV else { # convert to SO term my %terms = ( 'insertion' => 'INS', 'deletion' => 'DEL', 'tandem_duplication' => 'TDUP', 'duplication' => 'DUP' ); my $alt = '<'.($terms{$vf->class_SO_term} || $vf->class_SO_term).'>'; return [ $vf->{chr} || $vf->seq_region_name, $vf->start, $vf->variation_name, '.', $alt, '.', '.', '.' ]; } } # tries to map a VF to the LRG coordinate system sub add_lrg_mappings { my $config = shift; my $vfs = shift; my @new_vfs; foreach my $vf(@$vfs) { # add the unmapped VF to the array push @new_vfs, $vf; # make sure the VF has an attached slice $vf->{slice} ||= get_slice($config, $vf->{chr}); next unless defined($vf->{slice}); # transform LRG <-> chromosome my $new_vf = $vf->transform($vf->{slice}->coord_system->name eq 'lrg' ? 'chromosome' : 'lrg'); # add it to the array if transformation worked if(defined($new_vf)) { # update new VF's chr entry $new_vf->{chr} = $new_vf->seq_region_name; push @new_vfs, $new_vf; } } return \@new_vfs; } # wrapper for whole_genome_fetch and vf_to_consequences # takes config and a listref of VFs, returns listref of line hashes for printing sub get_all_consequences { my $config = shift; my $listref = shift; if ($config->{extra}) { eval "use Plugin qw($config);" } # check we can use MIME::Base64 if(defined($config->{fork})) { eval q{ use MIME::Base64; }; if($@) { debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); delete $config->{fork}; } } my (@temp_array, @return, %by_pid, @pids); my $active_forks = 0; if(defined($config->{fork})) { my $total_size = scalar @$listref; my $done_vars = 0; # this variable stores child process references my $sel = IO::Select->new; debug("Calculating consequences") unless defined($config->{quiet}); progress($config, 0, 1); # loop while variants in $listref or forks running while (scalar @$listref or $active_forks ) { # only spawn new forks if we have space if ($active_forks <= $config->{fork} ) { my $delta = 0.5; my $minForkSize = 5; my $maxForkSize = 200; my $numLines = scalar @$listref; my $forkSize = int($numLines / ($config->{fork} + $delta*$config->{fork}) + $minForkSize ) + 1; $forkSize = $maxForkSize if $forkSize > $maxForkSize; while($active_forks <= $config->{fork} && scalar @$listref) { my $tmp_vf = shift @$listref; push @temp_array, $tmp_vf; # fork if(scalar @temp_array >= $forkSize || scalar @$listref == 0) { my ($child,$parent); socketpair($child, $parent, AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "ERROR: Failed to open socketpair: $!"; $child->autoflush(1); $parent->autoflush(1); $sel->add($child); my $pid = fork; if(!defined($pid)) { die("WARNING: Failed to fork -") unless defined($config->{quiet}); last; } elsif($pid) { push @pids, $pid; $active_forks++; @temp_array = (); } elsif($pid == 0) { $config->{forked} = $$; $config->{quiet} = 1; $config->{stats} = {}; *PARENT = $parent; *STDERR = *PARENT; my $cons = vf_list_to_cons($config, \@temp_array); # what we're doing here is sending a serialised hash of the # results through to the parent process through the socket. # This is then thawed by the parent process. # $$, or the PID, is added so that the input can be sorted # back into the correct order for output print PARENT $$." ".encode_base64(freeze($_), "\t")."\n" for @$cons; # some plugins may cache stuff, check for this and try and # reconstitute it into parent's plugin cache foreach my $plugin(@{$config->{plugins}}) { next unless defined($plugin->{has_cache}); # delete unnecessary stuff and stuff that can't be serialised delete $plugin->{$_} for qw(config feature_types variant_feature_types version feature_types_wanted variant_feature_types_wanted params); print PARENT $$." PLUGIN ".ref($plugin)." ".encode_base64(freeze($plugin), "\t")."\n"; } # tell parent about stats print PARENT $$." STATS ".encode_base64(freeze($config->{stats}), "\t")."\n" if defined($config->{stats}); # we need to tell the parent this child is finished # otherwise it keeps listening print PARENT "DONE $$\n"; exit(0); } } } } my $fh = $config->{out_file_handle}; my $done_processes = 0; # read child input while(my @ready = $sel->can_read and $done_processes < scalar @pids ) { foreach $fh (@ready) { my $line = $fh->getline(); next unless defined($line) && $line; # child finished if($line =~ /^DONE/) { $sel->remove($fh); $fh->close; $done_processes++; $active_forks--; last; } # variant finished / progress indicator elsif($line =~ /^BUMP/) { $line =~ m/BUMP ?(\d*)/; $done_vars += $1 || 1; progress($config, $done_vars, $total_size); } # output elsif($line =~ /^\-?\d+ /) { # plugin if($line =~ /^\-?\d+ PLUGIN/) { $line =~ m/^(\-?\d+) PLUGIN (\w+) /; my ($pid, $plugin) = ($1, $2); # remove the PID $line =~ s/^\-?\d+ PLUGIN \w+ //; chomp $line; my $tmp = thaw(decode_base64($line)); next unless defined($plugin); # copy data to parent plugin my ($parent_plugin) = grep {ref($line) eq $plugin} @{$config->{plugins}}; next unless defined($parent_plugin); merge_hashes($parent_plugin, $tmp); } # filtered count elsif($line =~ /^\-?\d+ STATS/) { $line =~ s/^\-?\d+\sSTATS\s//; my $tmp = thaw(decode_base64($line)); $config->{stats} ||= {}; # special case chr lengths my %chr_lengths; if(defined($config->{stats}->{chr_lengths})) { merge_hashes($config->{stats}->{chr_lengths}, $tmp->{chr_lengths}); %chr_lengths = %{$config->{stats}->{chr_lengths}}; } merge_hashes($config->{stats}, $tmp, 1); $config->{stats}->{chr_lengths} = \%chr_lengths; } else { # grab the PID $line =~ m/^(\-?\d+)\s/; my $pid = $1; die "ERROR: Could not parse forked PID from line $line" unless defined($pid); # remove the PID $line =~ s/^\-?\d+\s//; chomp $line; # decode and thaw "output" from forked process push @{$by_pid{$pid}}, thaw(decode_base64($line)); } } # something's wrong else { print STDERR "\n$line\n"; # kill the other pids #foreach my $pid(@pids) { # kill(15, $pid); #} #die("\nERROR: Forked process failed\n$line\n"); } } last if $active_forks < $config->{fork}; } } end_progress($config); debug("Writing output") unless defined($config->{quiet}); waitpid($_, 0) for @pids; # add the sorted data to the return array push @return, @{$by_pid{$_} || []} for @pids; } # no forking else { push @return, @{vf_list_to_cons($config, $listref)}; } if(defined($config->{debug})) { eval q{use Devel::Size qw(total_size)}; my $mem = memory(); my $tot; $tot += $_ for @$mem; if($tot > 1000000) { $tot = sprintf("%.2fGB", $tot / (1024 * 1024)); } elsif($tot > 1000) { $tot = sprintf("%.2fMB", $tot / 1024); } my $mem_diff = mem_diff($config); debug( "LINES ", $config->{line_number}, "\tMEMORY $tot ", (join " ", @$mem), "\tDIFF ", (join " ", @$mem_diff), ); #exit(0) if grep {$_ < 0} @$mem_diff; } return \@return; } sub vf_list_to_cons { my $config = shift; my $listref = shift; # initialize caches $config->{$_.'_cache'} ||= {} for qw(tr rf slice); # build hash my %vf_hash; push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for @$listref; # get regions my $regions = ®ions_from_hash($config, \%vf_hash); my $trim_regions = $regions; # prune caches if(!defined($config->{forked})) { prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr}); prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf}); } # get chr list my @chrs = sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %{{map {$_->{chr} => 1} @$listref}}; my $fetched_tr_count = 0; $fetched_tr_count = fetch_transcripts($config, $regions, $trim_regions) unless defined($config->{no_consequences}); my $fetched_rf_count = 0; $fetched_rf_count = fetch_regfeats($config, $regions, $trim_regions) if defined($config->{regulatory}) && !defined($config->{no_consequences}); # get non-variants my @non_variants = grep {$_->{non_variant}} @$listref; # check existing VFs if(defined($config->{'cache_var_type'}) && $config->{'cache_var_type'} eq 'tabix') { check_existing_tabix($config, $listref) if defined($config->{check_existing}); } else { check_existing_hash($config, \%vf_hash) if defined($config->{check_existing}); } my $new_listref = []; # skip any based on frequency checks? if(defined($config->{check_frequency})) { foreach my $vf(@$listref) { if(defined($vf->{existing}) && scalar @{$vf->{existing}}) { my @passed = grep {$_} map {check_frequencies($config, $_)} reverse @{$vf->{existing}}; push @$new_listref, $vf if scalar @passed == scalar @{$vf->{existing}}; $vf->{freqs} = $config->{filtered_freqs}; } else { push @$new_listref, $vf; } } } else { $new_listref = $listref; } # if using consequence filter, we're not interested in how many remain yet $config->{stats}->{filter_count} += scalar @$new_listref unless defined($config->{filter}); # remake hash without non-variants %vf_hash = (); push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$new_listref; # get overlapping SVs &check_svs_hash($config, \%vf_hash) if defined($config->{check_svs}); my @return; foreach my $chr(@chrs) { my $finished_vfs = whole_genome_fetch($config, $chr, \%vf_hash); # non-variants? if(scalar @non_variants) { push @$finished_vfs, grep {$_->{chr} eq $chr} @non_variants; # need to re-sort @$finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @$finished_vfs; } debug("Calculating consequences") unless defined($config->{quiet}); my $vf_count = scalar @$finished_vfs; my $vf_counter = 0; while(my $vf = shift @$finished_vfs) { progress($config, $vf_counter++, $vf_count) unless $vf_count == 1; my $filter_ok = 1; # filtered output if(defined($config->{filter})) { $filter_ok = filter_by_consequence($config, $vf); $config->{stats}->{filter_count} += $filter_ok; } # skip filtered lines next unless $filter_ok; # original output if(defined($config->{original})) { push @return, \$vf->{_line}; } # GVF output elsif(defined($config->{gvf})) { $vf->source("User"); $config->{gvf_id} ||= 1; # get custom annotation my $custom_annotation = defined($config->{custom}) ? get_custom_annotation($config, $vf) : {}; $custom_annotation->{ID} = $config->{gvf_id}++; my $tmp = $vf->to_gvf( include_consequences => defined($config->{no_consequences}) ? 0 : 1, extra_attrs => $custom_annotation, ); push @return, \$tmp; } # VCF output elsif(defined($config->{vcf})) { # convert to VCF, otherwise get line my $line = $config->{format} eq 'vcf' ? [split /\s+/, $vf->{_line}] : convert_to_vcf($config, $vf); if(!defined($line->[7]) || $line->[7] eq '.') { $line->[7] = ''; } # nuke existing CSQ field if($line->[7] =~ /CSQ\=/ && !defined($config->{keep_csq})) { $line->[7] =~ s/CSQ\=\S+?\;?(\s|$)/$1/; } # get all the lines the normal way # and process them into VCF-compatible string my $string = 'CSQ='; foreach my $line(@{vf_to_consequences($config, $vf)}) { # use the field list (can be user-defined by setting --fields) for my $col(@{$config->{fields}}) { # skip fields already represented in the VCF next if $col eq 'Uploaded_variation' or $col eq 'Location' or $col eq 'Extra'; # search for data in main line hash as well as extra field my $data = defined $line->{$col} ? $line->{$col} : $line->{Extra}->{$col}; reverse_comp(\$data) if $vf->strand < 0 and $col eq 'Allele'; # "-" means null for everything except the Allele field (confusing...) $data = undef if defined($data) and $data eq '-' and $col ne 'Allele'; $data =~ s/\,/\&/g if defined $data; $string .= defined($data) ? $data : ''; $string .= '|'; } $string =~ s/\|$//; $string .= ','; } $string =~ s/\,$//; if(!defined($config->{no_consequences}) && $string ne 'CSQ=') { $line->[7] .= ($line->[7] ? ';' : '').$string; } # get custom annotation if(defined($config->{custom}) && scalar @{$config->{custom}}) { my $custom_annotation = get_custom_annotation($config, $vf); foreach my $key(keys %{$custom_annotation}) { $line->[7] .= ($line->[7] ? ';' : '').$key.'='.$custom_annotation->{$key}; } } $_ ||= '.' for @$line; my $tmp = join "\t", @$line; push @return, \$tmp; } # no consequence output from vep input elsif(defined($config->{no_consequences}) && $config->{format} eq 'vep') { my $line = [split /\s+/, $vf->{_line}]; if($line->[13] eq '-') { $line->[13] = ''; } # get custom annotation if(defined($config->{custom})) { my $custom_annotation = get_custom_annotation($config, $vf); foreach my $key(keys %{$custom_annotation}) { $line->[13] .= ($line->[13] ? ';' : '').$key.'='.$custom_annotation->{$key}; } } my $tmp = join "\t", @$line; push @return, \$tmp; } # XML output for Solr elsif(defined($config->{solr})) { eval q{ use CGI qw(escape); }; foreach my $con(@{vf_to_consequences($config, $vf)}) { my $line = "\n"; # create unique ID $line .= sprintf(qq{ %s_%i_%i_%s_%s\n}, $vf->{chr}, $vf->{start}, $vf->{end}, $con->{Allele} || '-', $con->{Feature} || '-'); # add proper location fields that can be indexed $line .= sprintf(qq{ %s\n}, $vf->{chr}); $line .= sprintf(qq{ %s\n}, $vf->{start}); $line .= sprintf(qq{ %s\n}, $vf->{end}); foreach my $col(@{$config->{fields}}) { # search for data in main line hash as well as extra field my $val = defined $con->{$col} ? $con->{$col} : $con->{Extra}->{$col}; next unless defined($val) && $val ne '-'; # some have multiple values foreach my $data(split(',', $val)) { # split SIFT and PolyPhen if($col eq 'SIFT' || $col eq 'PolyPhen') { if($data =~ m/([a-z\_]+)?\(?([\d\.]+)?\)?/i) { my ($pred, $score) = ($1, $2); $line .= sprintf(qq{ %s\n}, $col.'_pred', $pred) if $pred; $line .= sprintf(qq{ %s\n}, $col.'_score', $score) if defined($score); } } # GMAF elsif($col eq 'GMAF') { if($data =~ m/([\d\.]+)/) { $line .= sprintf(qq{ %s\n}, $col, $1) if defined($1); } } else { $line .= sprintf(qq{ %s\n}, $col, escape($data)) if defined($data); } } } $line .= "\n"; push @return, \$line; } } # normal output else { push @return, @{vf_to_consequences($config, $vf)}; } print PARENT "BUMP\n" if defined($config->{forked}) && !defined($config->{no_progress}); } end_progress($config) unless scalar @$listref == 1; } return \@return; } # takes a variation feature and returns ready to print consequence information sub vf_to_consequences { my $config = shift; my $vf = shift; # force empty hash into object's transcript_variations if undefined from whole_genome_fetch # this will stop the API trying to go off and fill it again $vf->{transcript_variations} ||= {} if defined $config->{whole_genome}; # pos stats $config->{stats}->{chr}->{$vf->{chr}}->{1e6 * int($vf->start / 1e6)}++; $config->{stats}->{var_cons}->{$vf->display_consequence}++; # use a different method for SVs return svf_to_consequences($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature'); my @return = (); # get allele nums if(defined($config->{allele_number})) { my @alleles = split /\//, $vf->allele_string || ''; %{$vf->{_allele_nums}} = map {$alleles[$_] => $_} (0..$#alleles); } # method name for consequence terms my $term_method = $config->{terms}.'_term'; # method name stub for getting *VariationAlleles my $allele_method = defined($config->{process_ref_homs}) ? 'get_all_' : 'get_all_alternate_'; # find any co-located existing VFs $vf->{existing} ||= find_existing($config, $vf) if defined $config->{check_existing}; # get stats my $so_term = SO_variation_class($vf->allele_string, 1); if(defined($so_term)) { $config->{stats}->{classes}->{$so_term}++; $config->{stats}->{allele_changes}->{$vf->allele_string}++ if $so_term eq 'SNV'; } # stats $config->{stats}->{existing}++ if defined($vf->{existing}) && scalar @{$vf->{existing}}; # regulatory stuff if(!defined $config->{coding_only} && defined $config->{regulatory}) { for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) { my $rf = $rfv->regulatory_feature; my $base_line = { Feature_type => 'RegulatoryFeature', Feature => $rf->stable_id, }; $config->{stats}->{regfeats}->{$rf->stable_id} = 1; if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { $base_line->{Extra}->{CELL_TYPE} = join ",", map {$_.':'.$rf->{cell_types}->{$_}} grep {$rf->{cell_types}->{$_}} @{$config->{cell_type}}; $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g; } # this currently always returns 'RegulatoryFeature', so we ignore it for now #$base_line->{Extra}->{REG_FEAT_TYPE} = $rf->feature_type->name; my $method = $allele_method.'RegulatoryFeatureVariationAlleles'; for my $rfva (@{ $rfv->$method }) { my $line = init_line($config, $vf, $base_line); $line->{Allele} = $rfva->variation_feature_seq; $line->{Consequence} = join ',', map { $_->$term_method || $_->SO_term } @{ $rfva->get_all_OverlapConsequences }; map {$config->{stats}->{consequences}->{$_->$term_method}++} @{$rfva->get_all_OverlapConsequences}; $line = run_plugins($rfva, $line, $config); push @return, $line; } } for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) { my $mf = $mfv->motif_feature; # check that the motif has a binding matrix, if not there's not # much we can do so don't return anything next unless defined $mf->binding_matrix; my $matrix = $mf->binding_matrix->description.' '.$mf->display_label; $matrix =~ s/\s+/\_/g; my $base_line = { Feature_type => 'MotifFeature', Feature => $mf->binding_matrix->name, Extra => { MOTIF_NAME => $matrix, } }; if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { $base_line->{Extra}->{CELL_TYPE} = join ",", map {$_.':'.$mf->{cell_types}->{$_}} grep {$mf->{cell_types}->{$_}} @{$config->{cell_type}}; $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g; } my $method = $allele_method.'MotifFeatureVariationAlleles'; for my $mfva (@{ $mfv->$method }) { my $line = init_line($config, $vf, $base_line); $line->{Extra}->{MOTIF_POS} = $mfva->motif_start if defined $mfva->motif_start; $line->{Extra}->{HIGH_INF_POS} = ($mfva->in_informative_position ? 'Y' : 'N'); my $delta = $mfva->motif_score_delta if $mfva->variation_feature_seq =~ /^[ACGT]+$/; $line->{Extra}->{MOTIF_SCORE_CHANGE} = sprintf("%.3f", $delta) if defined $delta; $line->{Allele} = $mfva->variation_feature_seq; $line->{Consequence} = join ',', map { $_->$term_method || $_->SO_term } @{ $mfva->get_all_OverlapConsequences }; map {$config->{stats}->{consequences}->{$_->$term_method}++} @{$mfva->get_all_OverlapConsequences}; $line = run_plugins($mfva, $line, $config); push @return, $line; } } } # get TVs my $tvs = $vf->get_all_TranscriptVariations; # only most severe if(defined($config->{most_severe}) || defined($config->{summary})) { my $line = init_line($config, $vf); if(defined($config->{summary})) { $line->{Consequence} = join ",", @{$vf->consequence_type($config->{terms}) || $vf->consequence_type}; } else { $line->{Consequence} = $vf->display_consequence($config->{terms}) || $vf->display_consequence; } push @return, $line; } # pass a true argument to get_IntergenicVariation to stop it doing a reference allele check # (to stay consistent with the rest of the VEP) elsif ((my $iv = $vf->get_IntergenicVariation(1)) && !defined($config->{no_intergenic})) { my $method = $allele_method.'IntergenicVariationAlleles'; for my $iva (@{ $iv->$method }) { my $line = init_line($config, $vf); $line->{Allele} = $iva->variation_feature_seq; my $cons = $iva->get_all_OverlapConsequences->[0]; $line->{Consequence} = $cons->$term_method || $cons->SO_term; $config->{stats}->{consequences}->{$cons->$term_method || $cons->SO_term}++; $line = run_plugins($iva, $line, $config); push @return, $line; } } # user wants only one conseqeunce per gene elsif(defined($config->{per_gene})) { # sort the TVA objects into a hash by gene my %by_gene; foreach my $tv(@$tvs) { next if(defined $config->{coding_only} && !($tv->affects_cds)); my $gene = $tv->transcript->{_gene_stable_id} || $config->{ga}->fetch_by_transcript_stable_id($tv->transcript->stable_id)->stable_id; my $method = $allele_method.'TranscriptVariationAlleles'; push @{$by_gene{$gene}}, @{$tv->$method}; } foreach my $gene(keys %by_gene) { my ($lowest, $lowest_tva); # at the moment this means the one that comes out last will be picked # if there is more than one TVA with the same rank of consequence foreach my $tva(@{$by_gene{$gene}}) { foreach my $oc(@{$tva->get_all_OverlapConsequences}) { if(!defined($lowest) || $oc->rank < $lowest) { $lowest = $oc->rank; $lowest_tva = $tva; } } } push @return, tva_to_line($config, $lowest_tva); } } else { foreach my $tv(@$tvs) { next if(defined $config->{coding_only} && !($tv->affects_cds)); my $method = $allele_method.'TranscriptVariationAlleles'; push @return, map {tva_to_line($config, $_)} @{$tv->$method}; } } return \@return; } # get consequences for a structural variation feature sub svf_to_consequences { my $config = shift; my $svf = shift; my @return = (); # stats $config->{stats}->{classes}->{$svf->{class_SO_term}}++; my $term_method = $config->{terms}.'_term'; if(defined $config->{whole_genome}) { $svf->{transcript_structural_variations} ||= []; $svf->{regulation_structural_variations}->{$_} ||= [] for @REG_FEAT_TYPES; } if ((my $iv = $svf->get_IntergenicStructuralVariation(1)) && !defined($config->{no_intergenic})) { for my $iva (@{ $iv->get_all_alternate_IntergenicStructuralVariationAlleles }) { my $line = init_line($config, $svf); $line->{Allele} = '-'; my $cons = $iva->get_all_OverlapConsequences->[0]; $line->{Consequence} = $cons->$term_method || $cons->SO_term; $config->{stats}->{consequences}->{$cons->$term_method || $cons->SO_term}++; $line = run_plugins($iva, $line, $config); push @return, $line; } } foreach my $svo(@{$svf->get_all_StructuralVariationOverlaps}) { next if $svo->isa('Bio::EnsEMBL::Variation::IntergenicStructuralVariation'); my $feature = $svo->feature; # get feature type my $feature_type = (split '::', ref($feature))[-1]; my $base_line = { Feature_type => $feature_type, Feature => $feature->stable_id, Allele => $svf->class_SO_term, }; if($svo->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariation')) { $base_line->{cDNA_position} = format_coords($svo->cdna_start, $svo->cdna_end). (defined($config->{total_length}) ? '/'.$feature->length : ''); $base_line->{CDS_position} = format_coords($svo->cds_start, $svo->cds_end). (defined($config->{total_length}) && $feature->{_variation_effect_feature_cache}->{translateable_seq} ? '/'.length($feature->{_variation_effect_feature_cache}->{translateable_seq}) : '' ); $base_line->{Protein_position} = format_coords($svo->translation_start, $svo->translation_end). (defined($config->{total_length}) && $feature->{_variation_effect_feature_cache}->{peptide} ? '/'.length($feature->{_variation_effect_feature_cache}->{peptide}) : '' ); } foreach my $svoa(@{$svo->get_all_StructuralVariationOverlapAlleles}) { my $line = init_line($config, $svf, $base_line); $line->{Consequence} = join ",", #map {s/feature/$feature_type/e; $_} map {$_->$term_method} sort {$a->rank <=> $b->rank} @{$svoa->get_all_OverlapConsequences}; map {$config->{stats}->{consequences}->{$_->$term_method}++} @{$svoa->get_all_OverlapConsequences}; # work out overlap amounts my $overlap_start = (sort {$a <=> $b} ($svf->start, $feature->start))[-1]; my $overlap_end = (sort {$a <=> $b} ($svf->end, $feature->end))[0]; my $overlap_length = ($overlap_end - $overlap_start) + 1; my $overlap_pc = 100 * ($overlap_length / (($feature->end - $feature->start) + 1)); $line->{Extra}->{OverlapBP} = $overlap_length if $overlap_length > 0; $line->{Extra}->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0; add_extra_fields($config, $line, $svoa); push @return, $line; } } return \@return; } # run all of the configured plugins on a VariationFeatureOverlapAllele instance # and store any results in the provided line hash sub run_plugins { my ($bvfoa, $line_hash, $config) = @_; my $skip_line = 0; for my $plugin (@{ $config->{plugins} || [] }) { # check that this plugin is interested in this type of variation feature if ($plugin->check_variant_feature_type(ref $bvfoa->base_variation_feature)) { # check that this plugin is interested in this type of feature if ($plugin->check_feature_type(ref $bvfoa->feature || 'Intergenic')) { eval { my $plugin_results = $plugin->run($bvfoa, $line_hash); if (defined $plugin_results) { if (ref $plugin_results eq 'HASH') { for my $key (keys %$plugin_results) { $line_hash->{Extra}->{$key} = $plugin_results->{$key}; } } else { warn "Plugin '".(ref $plugin)."' did not return a hashref, output ignored!\n"; } } else { # if a plugin returns undef, that means it want to filter out this line $skip_line = 1; } }; if ($@) { warn "Plugin '".(ref $plugin)."' went wrong: $@"; } # there's no point running any other plugins if we're filtering this line, # because the first plugin to skip the line wins, so we might as well last # out of the loop now and avoid any unnecessary computation last if $skip_line; } } } return $skip_line ? undef : $line_hash; } # turn a TranscriptVariationAllele into a line hash sub tva_to_line { my $config = shift; my $tva = shift; my $tv = $tva->transcript_variation; my $t = $tv->transcript; # method name for consequence terms my $term_method = $config->{terms}.'_term'; my $base_line = { Feature_type => 'Transcript', Feature => (defined $t ? $t->stable_id : undef), cDNA_position => format_coords($tv->cdna_start, $tv->cdna_end). (defined($config->{total_length}) ? '/'.$t->length : ''), CDS_position => format_coords($tv->cds_start, $tv->cds_end). (defined($config->{total_length}) && $t->{_variation_effect_feature_cache}->{translateable_seq} ? '/'.length($t->{_variation_effect_feature_cache}->{translateable_seq}) : '' ), Protein_position => format_coords($tv->translation_start, $tv->translation_end). (defined($config->{total_length}) && $t->{_variation_effect_feature_cache}->{peptide} ? '/'.length($t->{_variation_effect_feature_cache}->{peptide}) : '' ), Allele => $tva->variation_feature_seq, Amino_acids => $tva->pep_allele_string, Codons => $tva->display_codon_allele_string, Consequence => join ",", map {$_->$term_method || $_->SO_term} sort {$a->rank <=> $b->rank} @{$tva->get_all_OverlapConsequences}, }; # update stats $config->{stats}->{transcripts}->{$t->stable_id} = 1 if defined($t); map {$config->{stats}->{consequences}->{$_}++} map {$_->$term_method || $_->SO_term} @{$tva->get_all_OverlapConsequences}; if(defined($tv->translation_start)) { $config->{stats}->{protein_pos}->{int(10 * ($tv->translation_start / ($t->{_variation_effect_feature_cache}->{peptide} ? length($t->{_variation_effect_feature_cache}->{peptide}) : $t->translation->length)))}++; } my $line = init_line($config, $tva->variation_feature, $base_line); # HGVS if(defined $config->{hgvs}) { my $hgvs_t = $tva->hgvs_transcript; my $hgvs_p = $tva->hgvs_protein; # URI encode "=" $hgvs_p =~ s/\=/\%3D/g if $hgvs_p && !defined($config->{no_escape}); $line->{Extra}->{HGVSc} = $hgvs_t if $hgvs_t; $line->{Extra}->{HGVSp} = $hgvs_p if $hgvs_p; } foreach my $tool (qw(SIFT PolyPhen)) { my $lc_tool = lc($tool); if (my $opt = $config->{$lc_tool}) { my $want_pred = $opt =~ /^p/i; my $want_score = $opt =~ /^s/i; my $want_both = $opt =~ /^b/i; if ($want_both) { $want_pred = 1; $want_score = 1; } next unless $want_pred || $want_score; my $pred_meth = $lc_tool.'_prediction'; my $score_meth = $lc_tool.'_score'; my $pred = $tva->$pred_meth; if($pred) { if ($want_pred) { $pred =~ s/\s+/\_/; $line->{Extra}->{$tool} = $pred; } if ($want_score) { my $score = $tva->$score_meth; if(defined $score) { if($want_pred) { $line->{Extra}->{$tool} .= "($score)"; } else { $line->{Extra}->{$tool} = $score; } } } } # update stats $config->{stats}->{$tool}->{$tva->$pred_meth}++ if $tva->$pred_meth; } } $line = add_extra_fields($config, $line, $tva); # allele number if(defined($config->{allele_number})) { $line->{Extra}->{ALLELE_NUM} = $tv->variation_feature->{_allele_nums}->{$tva->variation_feature_seq} || '?' if $tv->variation_feature->{_allele_nums}; } return $line; } sub add_extra_fields { my $config = shift; my $line = shift; my $bvfoa = shift; # overlapping SVs if(defined $config->{check_svs} && defined $bvfoa->base_variation_feature->{overlapping_svs}) { $line->{Extra}->{SV} = $bvfoa->base_variation_feature->{overlapping_svs}; } # add transcript-specific fields $line = add_extra_fields_transcript($config, $line, $bvfoa) if $bvfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele'); # run plugins $line = run_plugins($bvfoa, $line, $config); return $line; } sub add_extra_fields_transcript { my $config = shift; my $line = shift; my $tva = shift; my $tv = $tva->base_variation_feature_overlap; my $tr = $tva->transcript; # get gene my $gene; $line->{Gene} = $tr->{_gene_stable_id}; if(!defined($line->{Gene})) { $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); $line->{Gene} = $gene ? $gene->stable_id : '-'; } $config->{stats}->{genes}->{$line->{Gene}}++ if defined($line->{Gene}); # exon/intron numbers if ($config->{numbers}) { $line->{Extra}->{EXON} = $tv->exon_number if defined $tv->exon_number; $line->{Extra}->{INTRON} = $tv->intron_number if defined $tv->intron_number; } if ($config->{domains}) { my $feats = $tv->get_overlapping_ProteinFeatures; my @strings; for my $feat (@$feats) { my $label = $feat->analysis->display_label.':'.$feat->hseqname; # replace any special characters $label =~ s/[\s;=]/_/g; push @strings, $label; } $line->{Extra}->{DOMAINS} = join ',', @strings if @strings; } # distance to transcript if($line->{Consequence} =~ /(up|down)stream/i) { $line->{Extra}->{DISTANCE} = $tv->distance_to_transcript; } # gene symbol if(defined $config->{symbol}) { my $symbol; $symbol = $tr->{_gene_symbol} || $tr->{_gene_hgnc}; if(!defined($symbol)) { if(!defined($gene)) { $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); } $symbol = $gene->display_xref ? $gene->display_xref->display_id : undef; } $symbol = undef if defined($symbol) && $symbol eq '-'; $line->{Extra}->{SYMBOL} = $symbol if defined($symbol); } # CCDS if(defined($config->{ccds})) { my $ccds = $tr->{_ccds}; if(!defined($ccds)) { my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; $ccds = $entries[0]->display_id if scalar @entries; } $ccds = undef if defined($ccds) && $ccds eq '-'; $line->{Extra}->{CCDS} = $ccds if defined($ccds); } # refseq xref if(defined($config->{xref_refseq})) { my $refseq = $tr->{_refseq}; if(!defined($refseq)) { my @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries}; if(scalar @entries) { $refseq = join ",", map {$_->display_id."-".$_->database} @entries; } } $refseq = undef if defined($refseq) && $refseq eq '-'; $line->{Extra}->{RefSeq} = $refseq if defined($refseq); } # protein ID if(defined $config->{protein}) { my $protein = $tr->{_protein}; if(!defined($protein)) { $protein = $tr->translation->stable_id if defined($tr->translation); } $protein = undef if defined($protein) && $protein eq '-'; $line->{Extra}->{ENSP} = $protein if defined($protein); } # canonical transcript if(defined $config->{canonical}) { $line->{Extra}->{CANONICAL} = 'YES' if $tr->is_canonical; } # biotype if(defined $config->{biotype}) { $line->{Extra}->{BIOTYPE} = $tr->biotype; } return $line; } # initialize a line hash sub init_line { my $config = shift; my $vf = shift; my $base_line = shift; my $line = { Uploaded_variation => $vf->variation_name, Location => ($vf->{chr} || $vf->seq_region_name).':'.format_coords($vf->start, $vf->end), Existing_variation => defined $vf->{existing} && scalar @{$vf->{existing}} ? join ",", map {$_->{variation_name}} @{$vf->{existing}} : '-', Extra => {}, }; # add custom info if(defined($config->{custom}) && scalar @{$config->{custom}}) { # merge the custom hash with the extra hash my $custom = get_custom_annotation($config, $vf); for my $key (keys %$custom) { $line->{Extra}->{$key} = $custom->{$key}; } } # individual? if(defined($vf->{individual})) { $line->{Extra}->{IND} = $vf->{individual}; # zygosity if(defined($vf->{genotype})) { my %unique = map {$_ => 1} @{$vf->{genotype}}; $line->{Extra}->{ZYG} = (scalar keys %unique > 1 ? 'HET' : 'HOM').(defined($vf->{hom_ref}) ? 'REF' : ''); } } # frequencies? $line->{Extra}->{FREQS} = join ",", @{$vf->{freqs}} if defined($vf->{freqs}); # gmaf? if(defined($config->{gmaf}) && defined($vf->{existing}) && scalar @{$vf->{existing}}) { my @gmafs = map {$_->{minor_allele}.':'.$_->{minor_allele_freq}} grep {defined($_->{minor_allele}) && $_->{minor_allele_freq} =~ /\d/} @{$vf->{existing}}; $line->{Extra}->{GMAF} = join ",", @gmafs if scalar @gmafs; } # existing var stuff if(defined($vf->{existing}) && scalar @{$vf->{existing}}) { # 1KG MAFs? if(defined($config->{maf_1kg})) { my @pops = qw(AFR AMR ASN EUR); foreach my $var(@{$vf->{existing}}) { foreach my $pop(grep {defined($var->{$_})} @pops) { my $freq = $var->{$pop}; $freq = '-' unless defined($freq); $line->{Extra}->{$pop.'_MAF'} = exists($line->{Extra}->{$pop.'_MAF'}) ? $line->{Extra}->{$pop.'_MAF'}.','.$freq : $freq; } } } # ESP MAFs? if(defined($config->{maf_esp})) { my @pops = qw(AA EA); foreach my $var(@{$vf->{existing}}) { foreach my $pop(grep {defined($var->{$_})} @pops) { my $freq = $var->{$pop}; $freq = '-' unless defined($freq); $line->{Extra}->{$pop.'_MAF'} = exists($line->{Extra}->{$pop.'_MAF'}) ? $line->{Extra}->{$pop.'_MAF'}.','.$freq : $freq; } } } # clin sig and pubmed? foreach my $var(@{$vf->{existing}}) { if(defined($var->{clin_sig}) && $var->{clin_sig}) { $line->{Extra}->{CLIN_SIG} = exists($line->{Extra}->{CLIN_SIG}) ? $line->{Extra}->{CLIN_SIG}.','.$var->{clin_sig} : $var->{clin_sig}; } if(defined($config->{pubmed}) && defined($var->{pubmed}) && $var->{pubmed}) { $line->{Extra}->{PUBMED} = exists($line->{Extra}->{PUBMED}) ? $line->{Extra}->{PUBMED}.','.$var->{pubmed} : $var->{pubmed}; } } } # copy entries from base_line merge_hashes($line, $base_line) if defined($base_line); return $line; } # get custom annotation for a single VF sub get_custom_annotation { my $config = shift; my $vf = shift; my $cache = shift; return $vf->{custom} if defined($vf->{custom}); my $annotation = {}; my $chr = $vf->{chr}; if(!defined($cache)) { # spoof regions my $regions; $regions->{$chr} = [$vf->{start}.'-'.$vf->{end}]; $cache = cache_custom_annotation($config, $regions, $chr); } foreach my $custom(@{$config->{custom}}) { next unless defined($cache->{$chr}->{$custom->{name}}); # exact type must match coords of variant exactly if($custom->{type} eq 'exact') { foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$vf->{start}}}) { next unless $feature->{chr} eq $chr && $feature->{start} eq $vf->{start} && $feature->{end} eq $vf->{end}; $annotation->{$custom->{name}} .= $feature->{name}.','; foreach my $field(@{$custom->{fields}}) { $annotation->{$custom->{name}."_".$field} .= $feature->{$field}.',' if defined($feature->{$field}); } } } # overlap type only needs to overlap, but we need to search the whole range elsif($custom->{type} eq 'overlap') { foreach my $pos(keys %{$cache->{$chr}->{$custom->{name}}}) { foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$pos}}) { next unless $feature->{chr} eq $chr && $feature->{end} >= $vf->{start} && $feature->{start} <= $vf->{end}; $annotation->{$custom->{name}} .= $feature->{name}.','; foreach my $field(@{$custom->{fields}}) { $annotation->{$custom->{name}."_".$field} = $feature->{$field} if defined($feature->{$field}); } } } } # trim off trailing commas $annotation->{$custom->{name}} =~ s/\,$//g if defined($annotation->{$custom->{name}}); foreach my $field(@{$custom->{fields}}) { $annotation->{$custom->{name}."_".$field} =~ s/\,$//g if defined($annotation->{$custom->{name}."_".$field}); } } return $annotation; } # decides whether to print a VF based on user defined consequences sub filter_by_consequence { my $config = shift; my $vf = shift; my $filters = $config->{filter}; # find it if we only have "no"s my $only_nos = 0; $only_nos = 1 if (sort {$a <=> $b} values %$filters)[-1] == 0; my ($yes, $no) = (0, 0); # get all consequences across all term types my @types = ('SO', 'display'); my @cons; push @cons, @{$vf->consequence_type($_)} for @types; my $method_mod = $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature') ? 'Structural' : ''; # add regulatory consequences if(defined($config->{regulatory})) { foreach my $term_type(@types) { my $term_method = $term_type.'_term'; my $m1 = 'get_all_RegulatoryFeature'.$method_mod.'Variations'; my $m2 = 'get_all_RegulatoryFeature'.$method_mod.'VariationAlleles'; for my $rfv (@{ $vf->$m1 }) { for my $rfva(@{$rfv->$m2}) { push @cons, map {$_->$term_method} @{ $rfva->get_all_OverlapConsequences }; } } $m1 = 'get_all_MotifFeature'.$method_mod.'Variations'; $m2 = 'get_all_MotifFeature'.$method_mod.'VariationAlleles'; for my $mfv (@{ $vf->$m1 }) { for my $mfva(@{$mfv->$m2}) { push @cons, map {$_->$term_method} @{ $mfva->get_all_OverlapConsequences }; } } } } foreach my $con(grep {defined($_) && defined($filters->{$_})} @cons) { if($filters->{$con} == 1) { $yes = 1; } else { $no = 1; } } # check special case, coding if(defined($filters->{coding})) { my $method = 'get_all_Transcript'.$method_mod.'Variations'; if(grep {$_->affects_cds} @{$vf->$method}) { if($filters->{coding} == 1) { $yes = 1; } else { $no = 1; } } } my $ok = 0; if($only_nos) { $ok = 1 if !$no; } else { $ok = 1 if $yes && !$no; } return $ok; } # takes VFs created from input, fixes and checks various things sub validate_vf { my $config = shift; my $vf = shift; # user specified chr skip list return 0 if defined($config->{chr}) && !$config->{chr}->{$vf->{chr}}; # fix inputs $vf->{chr} =~ s/^chr//ig unless $vf->{chr} =~ /^chromosome$/i; $vf->{chr} = 'MT' if $vf->{chr} eq 'M'; $vf->{strand} ||= 1; $vf->{strand} = ($vf->{strand} =~ /\-/ ? "-1" : "1"); # sanity checks unless($vf->{start} =~ /^\d+$/ && $vf->{end} =~ /^\d+$/) { warn("WARNING: Start ".$vf->{start}." or end ".$vf->{end}." coordinate invalid on line ".$config->{line_number}."\n") unless defined $config->{quiet}; return 0; } # structural variation? return validate_svf($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature'); # uppercase allele string $vf->{allele_string} =~ tr/[a-z]/[A-Z]/; unless($vf->{allele_string} =~ /([ACGT-]+\/*)+/) { warn("WARNING: Invalid allele string ".$vf->{allele_string}." on line ".$config->{line_number}." or possible parsing error\n") unless defined $config->{quiet}; return 0; } # insertion should have start = end + 1 if($vf->{allele_string} =~ /^\-\// && $vf->{start} != $vf->{end} + 1) { warn( "WARNING: Alleles look like an insertion (". $vf->{allele_string}. ") but coordinates are not start = end + 1 (START=". $vf->{start}.", END=".$vf->{end}. ") on line ".$config->{line_number}."\n" ) unless defined($config->{quiet}); return 0; } # check start <= end + 1 if($vf->{start} > $vf->{end} + 1) { warn( "WARNING: start > end+1 : (START=".$vf->{start}. ", END=".$vf->{end}. ") on line ".$config->{line_number}."\n" ) unless defined($config->{quiet}); return 0; } # check length of reference matches seq length spanned my @alleles = split /\//, $vf->{allele_string}; my $ref_allele = shift @alleles; my $tmp_ref_allele = $ref_allele; $tmp_ref_allele =~ s/\-//g; #if(($vf->{end} - $vf->{start}) + 1 != length($tmp_ref_allele)) { # warn( # "WARNING: Length of reference allele (".$ref_allele. # " length ".length($tmp_ref_allele).") does not match co-ordinates ".$vf->{start}."-".$vf->{end}. # " on line ".$config->{line_number} # ) unless defined($config->{quiet}); # return 0; #} # flag as unbalanced foreach my $allele(@alleles) { $allele =~ s/\-//g; $vf->{indel} = 1 unless length($allele) == length($tmp_ref_allele); } # check reference allele if requested if(defined $config->{check_ref}) { my $ok = 0; my $slice_ref_allele; # insertion, therefore no ref allele to check if($ref_allele eq '-') { $ok = 1; } else { my $slice_ref = $vf->{slice}->sub_Slice($vf->{start}, $vf->{end}, $vf->{strand}); if(!defined($slice_ref)) { warn "WARNING: Could not fetch sub-slice from ".$vf->{start}."\-".$vf->{end}."\(".$vf->{strand}."\) on line ".$config->{line_number} unless defined $config->{quiet}; } else { $slice_ref_allele = $slice_ref->seq; $ok = ($slice_ref_allele eq $ref_allele ? 1 : 0); } } if(!$ok) { warn "WARNING: Specified reference allele $ref_allele ", "does not match Ensembl reference allele", ($slice_ref_allele ? " $slice_ref_allele" : ""), " on line ".$config->{line_number} unless defined $config->{quiet}; return 0; } } return 1; } # validate a structural variation sub validate_svf { my $config = shift; my $svf = shift; return 1; } # takes a hash of VFs and fetches consequences by pre-fetching overlapping transcripts # from database and/or cache sub whole_genome_fetch { my $config = shift; my $chr = shift; my $vf_hash = shift; my (%vf_done, @finished_vfs, %seen_rfs); if(defined($config->{offline}) && !-e $config->{dir}.'/'.$chr) { debug("No cache found for chromsome $chr") unless defined($config->{quiet}); foreach my $chunk(keys %{$vf_hash->{$chr}}) { foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}}; } } return \@finished_vfs; } my $slice_cache = $config->{slice_cache}; $slice_cache = build_slice_cache($config, $config->{tr_cache}) unless defined($slice_cache->{$chr}); $slice_cache = build_slice_cache($config, $config->{rf_cache}) unless defined($slice_cache->{$chr}); debug("Analyzing chromosome $chr") unless defined($config->{quiet}); # custom annotations whole_genome_fetch_custom($config, $vf_hash, $chr) if defined($config->{custom}); # split up normal variations from SVs my ($tmp_vf_hash, @svfs); foreach my $chunk(keys %{$vf_hash->{$chr}}) { foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { if($vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { push @svfs, $vf; } else { push @{$tmp_vf_hash->{$chr}{$chunk}{$pos}}, $vf; } } } } $vf_hash = $tmp_vf_hash; # transcript annotations whole_genome_fetch_transcript($config, $vf_hash, $chr) unless defined($config->{no_consequences}); # regulatory annotations whole_genome_fetch_reg($config, $vf_hash, $chr) if defined($config->{regulatory}) && !defined($config->{no_consequences}); # structural variations @finished_vfs = @{whole_genome_fetch_sv($config, \@svfs, $chr)} if scalar @svfs; # sort results into @finished_vfs array foreach my $chunk(keys %{$vf_hash->{$chr}}) { foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { # pinch slice from slice cache if we don't already have it $_->{slice} ||= $slice_cache->{$chr} for @{$vf_hash->{$chr}{$chunk}{$pos}}; if(defined($config->{regulatory})) { foreach my $type(@REG_FEAT_TYPES) { $_->{regulation_variations}->{$type} ||= [] for @{$vf_hash->{$chr}{$chunk}{$pos}}; } } if(defined($config->{custom})) { $_->{custom} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}}; } $_->{transcript_variations} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}}; # add to final array push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}}; } } # sort @finished_vfs = sort { ($a->{_line_number} || 1) <=> ($b->{_line_number} || 1) || $a->{start} <=> $b->{start} || $a->{end} <=> $b->{end} } @finished_vfs; # clean hash delete $vf_hash->{$chr}; return \@finished_vfs; } sub whole_genome_fetch_custom { my $config = shift; my $vf_hash = shift; my $chr = shift; return unless scalar @{$config->{custom}}; # create regions based on VFs instead of chunks my $tmp_regions; foreach my $chunk(keys %{$vf_hash->{$chr}}) { foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { push @{$tmp_regions->{$chr}}, ($vf->{start}-1).'-'.($vf->{end}+1); } } } return unless defined($tmp_regions->{$chr}); # cache annotations my $annotation_cache = cache_custom_annotation($config, $tmp_regions, $chr); # count and report my $total_annotations = 0; $total_annotations += scalar keys %{$annotation_cache->{$chr}->{$_}} for keys %{$annotation_cache->{$chr}}; debug("Retrieved $total_annotations custom annotations (", (join ", ", map {(scalar keys %{$annotation_cache->{$chr}->{$_}}).' '.$_} keys %{$annotation_cache->{$chr}}), ")") unless defined($config->{quiet}); # compare annotations to variations in hash debug("Analyzing custom annotations") unless defined($config->{quiet}); my $total = scalar keys %{$vf_hash->{$chr}}; my $i = 0; foreach my $chunk(keys %{$vf_hash->{$chr}}) { progress($config, $i++, $total); foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { $vf->{custom} = get_custom_annotation($config, $vf, $annotation_cache); } } } end_progress($config); } sub whole_genome_fetch_transcript { my $config = shift; my $vf_hash = shift; my $chr = shift; my $tr_cache = $config->{tr_cache}; my $slice_cache = $config->{slice_cache}; my $up_down_size = MAX_DISTANCE_FROM_TRANSCRIPT; # check we have defined regions return unless defined($vf_hash->{$chr}) && defined($tr_cache->{$chr}); # copy slice from transcript to slice cache $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr}); debug("Analyzing variants") unless defined($config->{quiet}); my $tr_counter = 0; my $tr_count = scalar @{$tr_cache->{$chr}}; while($tr_counter < $tr_count) { progress($config, $tr_counter, $tr_count); my $tr = $tr_cache->{$chr}->[$tr_counter++]; # do each overlapping VF my $s = $tr->start - $up_down_size; my $e = $tr->end + $up_down_size; # get the chunks this transcript overlaps my %chunks; $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size})); map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks; # pointer to previous VF # used to tell plugins this is the last variant analysed in this transcript my $previous_vf; foreach my $chunk(keys %chunks) { foreach my $vf( grep {$_->{start} <= $e && $_->{end} >= $s} map {@{$vf_hash->{$chr}{$chunk}{$_}}} keys %{$vf_hash->{$chr}{$chunk}} ) { # pinch slice from slice cache if we don't already have it $vf->{slice} ||= $slice_cache->{$chr}; $vf->{slice} = $slice_cache->{$chr} if defined($vf->{slice}->{is_fake}); my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new( -transcript => $tr, -variation_feature => $vf, -adaptor => $config->{tva}, -no_ref_check => 1, -no_transfer => 1 ); # prefetching stuff here prevents doing loads at the # end and makes progress reporting more useful $tv->_prefetch_for_vep; $vf->add_TranscriptVariation($tv); # cache VF on the transcript if it is an unbalanced sub push @{$tr->{indels}}, $vf if defined($vf->{indel}); if(defined($config->{individual})) { # store VF on transcript, weaken reference to avoid circularity push @{$tr->{vfs}->{$vf->{individual}}}, $vf; weaken($tr->{vfs}->{$vf->{individual}}->[-1]); delete $previous_vf->{last_in_transcript}->{$tr->stable_id}; $vf->{last_in_transcript}->{$tr->stable_id} = 1; } $previous_vf = $vf; } } } end_progress($config); } sub whole_genome_fetch_reg { my $config = shift; my $vf_hash = shift; my $chr = shift; my $rf_cache = $config->{rf_cache}; my $slice_cache = $config->{slice_cache}; foreach my $type(keys %{$rf_cache->{$chr}}) { debug("Analyzing ".$type."s") unless defined($config->{quiet}); my $constructor = 'Bio::EnsEMBL::Variation::'.$type.'Variation'; my $rf_counter = 0; my $rf_count = scalar @{$rf_cache->{$chr}->{$type}}; while($rf_counter < $rf_count) { progress($config, $rf_counter, $rf_count); my $rf = $rf_cache->{$chr}->{$type}->[$rf_counter++]; # do each overlapping VF my $s = $rf->{start}; my $e = $rf->{end}; # get the chunks this transcript overlaps my %chunks; $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size})); map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks; foreach my $chunk(keys %chunks) { foreach my $vf( grep {$_->{start} <= $e && $_->{end} >= $s} map {@{$vf_hash->{$chr}{$chunk}{$_}}} keys %{$vf_hash->{$chr}{$chunk}} ) { push @{$vf->{regulation_variations}->{$type}}, $constructor->new( -variation_feature => $vf, -feature => $rf, -no_ref_check => 1, -no_transfer => 1 ); } } } end_progress($config); } } sub whole_genome_fetch_sv { my $config = shift; my $svfs = shift; my $chr = shift; my $tr_cache = $config->{tr_cache}; my $rf_cache = $config->{rf_cache}; my $slice_cache = $config->{slice_cache}; debug("Analyzing structural variations") unless defined($config->{quiet}); my($i, $total) = (0, scalar @$svfs); my @finished_vfs; foreach my $svf(@$svfs) { progress($config, $i++, $total); my %done_genes = (); if(defined($tr_cache->{$chr})) { foreach my $tr(grep {overlap($_->{start} - MAX_DISTANCE_FROM_TRANSCRIPT, $_->{end} + MAX_DISTANCE_FROM_TRANSCRIPT, $svf->{start}, $svf->{end})} @{$tr_cache->{$chr}}) { my $svo = Bio::EnsEMBL::Variation::TranscriptStructuralVariation->new( -transcript => $tr, -structural_variation_feature => $svf, -no_transfer => 1 ); $svf->add_TranscriptStructuralVariation($svo); } } $svf->{transcript_structural_variations} ||= {}; # do regulatory features if(defined($config->{regulatory}) && defined($rf_cache->{$chr})) { foreach my $rf_type(qw/RegulatoryFeature/) {#keys %{$rf_cache->{$chr}}) { foreach my $rf(grep {$_->{start} <= $svf->{end} && $_->end >= $svf->{end}} @{$rf_cache->{$chr}->{$rf_type}}) { my $svo = Bio::EnsEMBL::Variation::StructuralVariationOverlap->new( -feature => $rf, -structural_variation_feature => $svf, -no_transfer => 1 ); push @{$svf->{regulation_structural_variations}->{$rf_type}}, $svo; } $svf->{regulation_structural_variations}->{$rf_type} ||= []; } } # sort them #$svf->_sort_svos; push @finished_vfs, $svf; } end_progress($config); return \@finished_vfs; } # retrieves transcripts given region list sub fetch_transcripts { my $config = shift; my $regions = shift; my $trim_regions = shift; my $tr_cache = $config->{tr_cache}; my $slice_cache = $config->{slice_cache}; my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0); my %seen_trs; $count_from_mem = 0; my $region_count = 0; foreach my $chr(keys %{$regions}) { $count_from_mem += scalar @{$tr_cache->{$chr}} if defined($tr_cache->{$chr}) && ref($tr_cache->{$chr}) eq 'ARRAY'; $region_count += scalar @{$regions->{$chr}}; } my $counter; debug("Reading transcript data from cache and/or database") unless defined($config->{quiet}); foreach my $chr(keys %{$regions}) { foreach my $region(sort {(split /\-/, $a)[0] <=> (split /\-/, $b)[1]} @{$regions->{$chr}}) { progress($config, $counter++, $region_count); # skip regions beyond the end of the chr next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length; next if defined($config->{loaded_tr}->{$chr}->{$region}); # force quiet so other methods don't mess up the progress bar my $quiet = $config->{quiet}; $config->{quiet} = 1; # try and load cache from disk if using cache my $tmp_cache; if(defined($config->{cache})) { #$tmp_cache = ( # defined($config->{cache_tr_type}) && $config->{cache_tr_type} eq 'tabix' ? # load_dumped_transcript_cache_tabix($config, $chr, $region) : # load_dumped_transcript_cache($config, $chr, $region) #); $tmp_cache = load_dumped_transcript_cache($config, $chr, $region); $count_from_cache += scalar @{$tmp_cache->{$chr}} if defined($tmp_cache->{$chr}); $config->{loaded_tr}->{$chr}->{$region} = 1; } # no cache found on disk or not using cache if(!defined($tmp_cache->{$chr})) { if(defined($config->{offline})) { # restore quiet status $config->{quiet} = $quiet; debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet}); next; } # spoof temporary region hash my $tmp_hash; push @{$tmp_hash->{$chr}}, $region; $tmp_cache = cache_transcripts($config, $tmp_hash); # make it an empty arrayref that gets cached # so we don't get confused and reload next time round $tmp_cache->{$chr} ||= []; $count_from_db += scalar @{$tmp_cache->{$chr}}; # dump to disk if writing to cache (defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $region) : dump_transcript_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache}); $config->{loaded_tr}->{$chr}->{$region} = 1; } # add loaded transcripts to main cache if(defined($tmp_cache->{$chr})) { while(my $tr = shift @{$tmp_cache->{$chr}}) { # track already added transcripts by dbID my $dbID = $tr->dbID; if($seen_trs{$dbID}) { $count_duplicates++; next; } # trim out? #if(defined($trim_regions) && defined($trim_regions->{$chr})) { # my $tmp_count = scalar grep { # overlap( # (split /\-/, $_)[0], (split /\-/, $_)[1], # $tr->{start}, $tr->{end} # ) # } @{$trim_regions->{$chr}}; # # if(!$tmp_count) { # $count_trimmed++; # next; # } #} $seen_trs{$dbID} = 1; push @{$tr_cache->{$chr}}, $tr; } } $tr_cache->{$chr} ||= []; undef $tmp_cache; # restore quiet status $config->{quiet} = $quiet; # build slice cache $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr}); } } end_progress($config); my $tr_count = 0; $tr_count += scalar @{$tr_cache->{$_}} for keys %$tr_cache; debug("Retrieved $tr_count transcripts ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet}); return $tr_count; } sub fetch_regfeats { my $config = shift; my $regions = shift; my $trim_regions = shift; my $rf_cache = $config->{rf_cache}; my $slice_cache = $config->{slice_cache}; my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0); my %seen_rfs; $count_from_mem = 0; my $region_count = 0; foreach my $chr(keys %$regions) { if(defined($rf_cache->{$chr}) && ref($rf_cache->{$chr}) eq 'HASH') { $count_from_mem += scalar @{$rf_cache->{$chr}->{$_}} for keys %{$rf_cache->{$chr}}; } $region_count += scalar @{$regions->{$chr}}; } my $counter = 0; debug("Reading regulatory data from cache and/or database") unless defined($config->{quiet}); foreach my $chr(keys %$regions) { foreach my $region(sort {(split /\-/, $a)[0] cmp (split /\-/, $b)[1]} @{$regions->{$chr}}) { progress($config, $counter++, $region_count); next if defined($config->{loaded_rf}->{$chr}->{$region}); # skip regions beyond the end of the chr next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length; # force quiet so other methods don't mess up the progress bar my $quiet = $config->{quiet}; $config->{quiet} = 1; # try and load cache from disk if using cache my $tmp_cache; if(defined($config->{cache})) { $tmp_cache = load_dumped_reg_feat_cache($config, $chr, $region); #$tmp_cache = # defined($config->{tabix}) ? # load_dumped_reg_feat_cache_tabix($config, $chr, $region, $trim_regions) : # load_dumped_reg_feat_cache($config, $chr, $region); if(defined($tmp_cache->{$chr})) { $count_from_cache += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}}; } # flag as loaded $config->{loaded_rf}->{$chr}->{$region} = 1; } # no cache found on disk or not using cache if(!defined($tmp_cache->{$chr})) { if(defined($config->{offline})) { # restore quiet status $config->{quiet} = $quiet; debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet}); next; } # spoof temporary region hash my $tmp_hash; push @{$tmp_hash->{$chr}}, $region; $tmp_cache = cache_reg_feats($config, $tmp_hash); # make it an empty arrayref that gets cached # so we don't get confused and reload next time round $tmp_cache->{$chr} ||= {}; $count_from_db += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}}; # dump to disk if writing to cache #dump_reg_feat_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache}); (defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $tmp_cache, $chr, $region) : dump_reg_feat_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache}); # restore deleted coord_system adaptor foreach my $type(keys %{$tmp_cache->{$chr}}) { $_->{slice}->{coord_system}->{adaptor} = $config->{csa} for @{$tmp_cache->{$chr}->{$type}}; } # flag as loaded $config->{loaded_rf}->{$chr}->{$region} = 1; } # add loaded reg_feats to main cache if(defined($tmp_cache->{$chr})) { foreach my $type(keys %{$tmp_cache->{$chr}}) { while(my $rf = shift @{$tmp_cache->{$chr}->{$type}}) { # filter on cell type if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { next unless grep {$rf->{cell_types}->{$_}} @{$config->{cell_type}}; } # trim out? #if(defined($trim_regions) && defined($trim_regions->{$chr})) { # my $tmp_count = scalar grep { # overlap( # (split /\-/, $_)[0], (split /\-/, $_)[1], # $rf->{start}, $rf->{end} # ) # } @{$trim_regions->{$chr}}; # # if(!$tmp_count) { # $count_trimmed++; # next; # } #} # track already added reg_feats by dbID my $dbID = $rf->{dbID}; if($seen_rfs{$dbID}) { $count_duplicates++; next; } $seen_rfs{$dbID} = 1; push @{$rf_cache->{$chr}->{$type}}, $rf; } } } undef $tmp_cache; # restore quiet status $config->{quiet} = $quiet; } } end_progress($config); my $rf_count = 0; foreach my $chr(keys %$rf_cache) { foreach my $type(keys %{$rf_cache->{$chr}}) { $rf_count += scalar @{$rf_cache->{$chr}->{$type}}; } } debug("Retrieved $rf_count regulatory features ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet}); return $rf_count; } # gets existing VFs for a vf_hash sub check_existing_hash { my $config = shift; my $vf_hash = shift; my $variation_cache; # we only care about non-SVs here my %new_hash; foreach my $chr(keys %{$vf_hash}) { foreach my $chunk(keys %{$vf_hash->{$chr}}) { foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { foreach my $var(grep {$_->isa('Bio::EnsEMBL::Variation::VariationFeature')} @{$vf_hash->{$chr}->{$chunk}->{$pos}}) { push @{$new_hash{$chr}->{$chunk}->{$pos}}, $var; } } } } $vf_hash = \%new_hash; debug("Checking for existing variations") unless defined($config->{quiet}); my ($chunk_count, $counter); $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash}; foreach my $chr(keys %{$vf_hash}) { my %loaded_regions; foreach my $chunk(keys %{$vf_hash->{$chr}}) { progress($config, $counter++, $chunk_count); # get the VFs for this chunk my ($start, $end); # work out start and end using chunk_size $start = $config->{chunk_size} * $chunk; $end = $config->{chunk_size} * ($chunk + 1); # using cache? if(defined($config->{cache})) { my $tmp_regions; push @{$tmp_regions->{$chr}}, $start.'-'.$end; my $converted_regions = convert_regions($config, $tmp_regions); foreach my $region(@{$converted_regions->{$chr}}) { unless($loaded_regions{$region}) { my $tmp_cache = load_dumped_variation_cache($config, $chr, $region); # load from DB if not found in cache if(!defined($tmp_cache->{$chr})) { if(defined($config->{offline})) { debug("WARNING: Could not find variation cache for $chr\:$region") unless defined($config->{quiet}); next; } $tmp_cache->{$chr} = get_variations_in_region($config, $chr, $region); dump_variation_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache}); } # merge tmp_cache with the main cache foreach my $key(keys %{$tmp_cache->{$chr}}) { $variation_cache->{$chr}->{$key} = $tmp_cache->{$chr}->{$key}; delete $tmp_cache->{$chr}->{$key}; } # clear memory undef $tmp_cache; # record this region as fetched $loaded_regions{$region} = 1; } } } # no cache, get all variations in region from DB else { my ($min, $max); # we can fetch smaller region when using DB foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { foreach my $coord(qw(start end)) { $min = $var->{$coord} if !defined($min) || $var->{$coord} < $min; $max = $var->{$coord} if !defined($max) || $var->{$coord} > $max; } } } $variation_cache->{$chr} = get_variations_in_region($config, $chr, $min.'-'.$max); } # now compare retrieved vars with vf_hash foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { my @found; if(defined($variation_cache->{$chr})) { if(my $existing_vars = $variation_cache->{$chr}->{$pos}) { foreach my $existing_var(grep {$_->{failed} <= $config->{failed}} @$existing_vars) { unless(is_var_novel($config, $existing_var, $var)) { push @found, $existing_var; } } } } $var->{existing} = \@found; $var->{existing} ||= []; } } } delete $variation_cache->{$chr}; } end_progress($config); } # gets existing VFs for a list of VFs sub check_existing_tabix { my $config = shift; my $listref = shift; debug("Checking for existing variations") unless defined($config->{quiet}); # we only care about non-SVs here my %by_chr; push @{$by_chr{$_->{chr}}}, $_ for grep {$_->isa('Bio::EnsEMBL::Variation::VariationFeature')} @$listref; my $max = 200; my $total = scalar @$listref; my $i = 0; foreach my $chr(keys %by_chr) { my $list = $by_chr{$chr}; while(scalar @$list) { my @tmp_list = sort {$a->{start} <=> $b->{start}} splice @$list, 0, $max; progress($config, $i, $total); $i += scalar @tmp_list; my $region_string = join " ", map {$_->{chr}.':'.($_->{start} > $_->{end} ? $_->{end}.'-'.$_->{start} : $_->{start}.'-'.$_->{end})} @tmp_list; my $file = get_dump_file_name($config, $chr, "all", "vars"); die("ERROR: Could not read from file $file\n") unless -e $file; open VARS, "tabix $file $region_string 2>&1 |" or die "\nERROR: Could not open tabix pipe for $file\n"; my $i = 0; VAR: while() { chomp; my $existing = parse_variation($config, $_); #print STDERR "EX ".$existing->{variation_name}." ".$existing->{start}."\n"; # compare to current indexed var my $input = $tmp_list[$i]; last if !$input; while($existing->{start} >= $input->{start}) { #print STDERR "IN ".$input->{variation_name}." ".$input->{start}."\n"; if($existing->{start} == $input->{start}) { push @{$input->{existing}}, $existing unless is_var_novel($config, $existing, $input); next VAR; } else { $i++; $input = $tmp_list[$i]; last if !$input; } } } close VARS; $_->{existing} ||= [] for @tmp_list; } } end_progress($config); } # gets overlapping SVs for a vf_hash sub check_svs_hash { my $config = shift; my $vf_hash = shift; debug("Checking for overlapping structural variations") unless defined($config->{quiet}); my ($chunk_count, $counter); $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash}; foreach my $chr(keys %{$vf_hash}) { foreach my $chunk(keys %{$vf_hash->{$chr}}) { progress($config, $counter++, $chunk_count); # work out start and end using chunk_size my ($start, $end); $start = $config->{chunk_size} * $chunk; $end = $config->{chunk_size} * ($chunk + 1); # check for structural variations if(defined($config->{sa})) { my $slice = $config->{sa}->fetch_by_region(undef, $chr, $start, $end); if(defined($slice)) { my $svs = $config->{svfa}->fetch_all_by_Slice($slice); foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { my $string = join ",", map {$_->variation_name} grep {$_->seq_region_start <= $var->end && $_->seq_region_end >= $var->start} @$svs; $var->{overlapping_svs} = $string if $string; } } } } } } end_progress($config); } # gets a slice from the slice adaptor sub get_slice { my $config = shift; my $chr = shift; my $otherfeatures = shift; $otherfeatures ||= ''; return $config->{slice_cache}->{$chr} if defined($config->{slice_cache}) && defined($config->{slice_cache}->{$chr}); my $slice; # with a FASTA DB we can just spoof slices if(defined($config->{fasta_db})) { $slice = Bio::EnsEMBL::Slice->new( -COORD_SYSTEM => $config->{coord_system}, -START => 1, -END => $config->{fasta_db}->length($chr), -SEQ_REGION_NAME => $chr, -SEQ_REGION_LENGTH => $config->{fasta_db}->length($chr) ); $slice->{is_fake} = 1; return $slice; } return undef unless defined($config->{sa}) && defined($chr); # first try to get a chromosome eval { $slice = $config->{$otherfeatures.'sa'}->fetch_by_region(undef, $chr); }; return $slice; } # METHODS THAT DEAL WITH "REGIONS" ################################## # gets regions from VF hash sub regions_from_hash { my $config = shift; my $vf_hash = shift; my %include_regions; # if using cache we just want the regions of cache_region_size # since that's what we'll get from the cache (or DB if no cache found) if(defined($config->{cache})) { my $region_size = $config->{cache_region_size}; foreach my $chr(keys %$vf_hash) { $include_regions{$chr} = []; my %temp_regions; foreach my $chunk(keys %{$vf_hash->{$chr}}) { foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { my @tmp = sort {$a <=> $b} map {($_->{start}, $_->{end})} @{$vf_hash->{$chr}{$chunk}{$pos}}; my ($s, $e) = ($tmp[0] - MAX_DISTANCE_FROM_TRANSCRIPT, $tmp[-1] + MAX_DISTANCE_FROM_TRANSCRIPT); my $low = int ($s / $region_size); my $high = int ($e / $region_size) + 1; for my $i($low..($high - 1)) { $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1; } } } @{$include_regions{$chr}} = keys %temp_regions; } } # if no cache we don't want to fetch more than is necessary, so find the # minimum covered region of the variations in the hash else { foreach my $chr(keys %$vf_hash) { $include_regions{$chr} = []; foreach my $chunk(keys %{$vf_hash->{$chr}}) { foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { add_region($_->start, $_->end, $include_regions{$chr}) for @{$vf_hash->{$chr}{$chunk}{$pos}}; } } } # merge regions merge_regions(\%include_regions, $config); } return \%include_regions; } # adds a region to region list, expanding existing one if overlaps sub add_region { my $start = shift; my $end = shift; my $region_list = shift; # fix end for insertions $end = $start if $end < $start; my $added = 0; my $i = 0; while ($i < scalar @$region_list) { my ($region_start, $region_end) = split /\-/, $region_list->[$i]; if($start <= $region_end && $end >= $region_start) { my $new_region_start = ($start < $end ? $start : $end) - MAX_DISTANCE_FROM_TRANSCRIPT; my $new_region_end = ($start > $end ? $start : $end) + MAX_DISTANCE_FROM_TRANSCRIPT; $new_region_start = 1 if $new_region_start < 1; $region_start = $new_region_start if $new_region_start < $region_start; $region_end = $new_region_end if $new_region_end > $region_end; $region_list->[$i] = $region_start.'-'.$region_end; $added = 1; } $i++; } unless($added) { my $s = $start - MAX_DISTANCE_FROM_TRANSCRIPT; $s = 1 if $s < 1; push @{$region_list}, $s.'-'.($end + MAX_DISTANCE_FROM_TRANSCRIPT); } } # merges overlapping regions from scans sub merge_regions { my $include_regions = shift; my $config = shift; my $consecutive = shift; $consecutive ||= 0; # now merge overlapping regions foreach my $chr(keys %$include_regions) { my $max_index = $#{$include_regions->{$chr}}; my (@new_regions, %skip); for my $i(0..$max_index) { next if $skip{$i}; my ($s, $e) = split /\-/, $include_regions->{$chr}[$i]; for my $j(($i+1)..$max_index) { next if $skip{$j}; my ($ns, $ne) = split /\-/, $include_regions->{$chr}[$j]; if($s <= ($ne + $consecutive) && $e >= ($ns - $consecutive)) { $s = $ns if $ns < $s; $e = $ne if $ne > $e; $skip{$j} = 1; } } push @new_regions, $s.'-'.$e; } # replace original $include_regions->{$chr} = \@new_regions; $config->{region_count} += scalar @new_regions; } return $include_regions; } # converts regions as determined by scan_file to regions loadable from cache sub convert_regions { my $config = shift; my $regions = shift; return undef unless defined $regions; my $region_size = $config->{cache_region_size}; my %new_regions; foreach my $chr(keys %$regions) { my %temp_regions; foreach my $region(@{$regions->{$chr}}) { my ($s, $e) = split /\-/, $region; my $low = int ($s / $region_size); my $high = int ($e / $region_size) + 1; for my $i($low..($high - 1)) { $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1; } } @{$new_regions{$chr}} = keys %temp_regions; } return \%new_regions; } # CACHE METHODS ############### # prunes a cache to get rid of features not in regions in use sub prune_cache { my $config = shift; my $cache = shift; my $regions = shift; my $loaded = shift; # delete no longer in use chroms foreach my $chr(keys %$cache) { delete $cache->{$chr} unless defined $regions->{$chr} && scalar @{$regions->{$chr}}; } my $new_count = 0; foreach my $chr(keys %$cache) { # get total area spanned by regions my ($min, $max); foreach my $region(@{$regions->{$chr}}) { my ($s, $e) = split /\-/, $region; $min = $s if !defined($min) or $s < $min; $max = $e if !defined($max) or $e > $max; } # transcript cache if(ref($cache->{$chr}) eq 'ARRAY') { $cache->{$chr} = prune_min_max($cache->{$chr}, $min, $max); $new_count += scalar @{$cache->{$chr}}; } # regfeat cache elsif(ref($cache->{$chr}) eq 'HASH') { for(keys %{$cache->{$chr}}) { $cache->{$chr}->{$_} = prune_min_max($cache->{$chr}->{$_}, $min, $max); $new_count += scalar @{$cache->{$chr}->{$_}}; } } # update loaded regions my %have_regions = map {$_ => 1} @{$regions->{$chr}}; foreach my $region(keys %{$loaded->{$chr}}) { delete $loaded->{$chr}->{$region} unless defined $have_regions{$region}; } } return $new_count; } # does the actual pruning sub prune_min_max { my $array = shift; my $min = shift; my $max = shift; # splice out features not in area spanned by min/max my $i = 0; my $f_count = scalar @{$array}; my @new_cache; while($i < $f_count) { my $f = $array->[$i]; $i++; if($max - $f->start() > 0 && $f->end - $min > 0) { push @new_cache, $f; } # do some cleaning for transcripts elsif(defined $f->{translation}) { delete $f->{translation}->{transcript}; delete $f->{translation}; } } undef $array; return \@new_cache; } # get transcripts for slices sub cache_transcripts { my $config = shift; my $include_regions = shift; my $tr_cache; my $i; debug("Caching transcripts") unless defined($config->{quiet}); foreach my $chr(keys %$include_regions) { my $slice = get_slice($config, $chr); next unless defined $slice; # prefetch some things $slice->is_circular; # trim bumf off the slice delete $slice->{coord_system}->{adaptor} if defined($config->{write_cache}); # no regions? if(!scalar @{$include_regions->{$chr}}) { my $start = 1; my $end = $config->{cache_region_size}; while($start < $slice->end) { push @{$include_regions->{$chr}}, $start.'-'.$end; $start += $config->{cache_region_size}; $end += $config->{cache_region_size}; } } my $region_count; if(scalar keys %$include_regions == 1) { my ($chr) = keys %$include_regions; $region_count = scalar @{$include_regions->{$chr}}; debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet}); } foreach my $region(@{$include_regions->{$chr}}) { progress($config, $i++, $region_count || $config->{region_count}); my ($s, $e) = split /\-/, $region; # sanity check start and end $s = 1 if $s < 1; $e = $slice->end if $e > $slice->end; # get sub-slice my $sub_slice = $slice->sub_Slice($s, $e); # add transcripts to the cache, via a transfer to the chrom's slice if(defined($sub_slice)) { # for some reason unless seq is called here the sequence becomes Ns later $sub_slice->seq; foreach my $gene(map {$_->transfer($slice)} @{$sub_slice->get_all_Genes(undef, undef, 1)}) { my $gene_stable_id = $gene->stable_id; my $canonical_tr_id = $gene->{canonical_transcript_id}; my @trs; foreach my $tr(@{$gene->get_all_Transcripts}) { $tr->{_gene_stable_id} = $gene_stable_id; $tr->{_gene} = $gene; # indicate if canonical $tr->{is_canonical} = 1 if defined $canonical_tr_id and $tr->dbID eq $canonical_tr_id; if(defined($config->{prefetch})) { prefetch_transcript_data($config, $tr); } # CCDS elsif(defined($config->{ccds})) { my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; $tr->{_ccds} = $entries[0]->display_id if scalar @entries; } # strip some unnecessary data from the transcript object clean_transcript($tr) if defined($config->{write_cache}); push @trs, $tr; } # sort the transcripts by translation so we can share sift/polyphen stuff # between transcripts and save cache space if(defined($config->{write_cache}) && (defined($config->{sift}) || defined($config->{polyphen}))) { my $prev_tr; # sort them by peptide seqeuence as transcripts with identical peptides # will have identical SIFT/PolyPhen prediction strings foreach my $tr(sort {$a->{_variation_effect_feature_cache}->{peptide} cmp $b->{_variation_effect_feature_cache}->{peptide}} grep {$_->{_variation_effect_feature_cache}->{peptide}} @trs) { if( defined($prev_tr) && $prev_tr->{_variation_effect_feature_cache}->{peptide} eq $tr->{_variation_effect_feature_cache}->{peptide} ) { foreach my $analysis(qw(sift polyphen)) { next unless defined($config->{$analysis}); $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis} = $prev_tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis}; } } $prev_tr = $tr; } } # clean the gene clean_gene($gene); push @{$tr_cache->{$chr}}, @trs; } } } } end_progress($config); return $tr_cache; } # gets rid of extra bits of info attached to the transcript that we don't need sub clean_transcript { my $tr = shift; foreach my $key(qw(display_xref external_db external_display_name external_name external_status created_date status description edits_enabled modified_date dbentries is_current analysis transcript_mapper)) { delete $tr->{$key} if defined($tr->{$key}); } # clean all attributes but miRNA if(defined($tr->{attributes})) { my @new_atts; foreach my $att(@{$tr->{attributes}}) { push @new_atts, $att if $att->{code} eq 'miRNA'; } $tr->{attributes} = \@new_atts; } # clean the translation if(defined($tr->translation)) { # sometimes the translation points to a different transcript? $tr->{translation}->{transcript} = $tr; weaken($tr->{translation}->{transcript}); for my $key(qw(attributes protein_features created_date modified_date)) { delete $tr->translation->{$key}; } } } # gets rid of extra bits of info attached to genes. At the moment this is almost # everything as genes are only used for their locations when looking at # structural variations sub clean_gene { my $gene = shift; # delete almost everything in the gene map {delete $gene->{$_}} grep { $_ ne 'start' && $_ ne 'end' && $_ ne 'strand' && $_ ne 'stable_id' } keys %{$gene}; } # build slice cache from transcript cache sub build_slice_cache { my $config = shift; my $tr_cache = shift; my %slice_cache; foreach my $chr(keys %$tr_cache) { my $tmp = $tr_cache->{$chr}; if(ref($tmp) eq 'HASH') { foreach my $type(keys %$tmp) { $slice_cache{$chr} ||= scalar @{$tmp->{$type}} ? $tmp->{$type}->[0]->slice : &get_slice($config, $chr); } } else { $slice_cache{$chr} ||= scalar @$tmp ? $tmp->[0]->slice : &get_slice($config, $chr); } if(!defined($slice_cache{$chr})) { delete $slice_cache{$chr} } else { # reattach adaptor to the coord system $slice_cache{$chr}->{coord_system}->{adaptor} ||= $config->{csa}; # log length for stats $config->{stats}->{chr_lengths}->{$chr} ||= $slice_cache{$chr}->end; } } return \%slice_cache; } # pre-fetches per-transcript data sub prefetch_transcript_data { my $config = shift; my $tr = shift; # introns my $introns = $tr->get_all_Introns; if(defined($introns)) { foreach my $intron(@$introns) { foreach my $key(qw(adaptor analysis dbID next prev seqname)) { delete $intron->{$key}; } } } $tr->{_variation_effect_feature_cache}->{introns} ||= $introns; # translateable_seq, mapper $tr->{_variation_effect_feature_cache}->{translateable_seq} ||= $tr->translateable_seq; $tr->{_variation_effect_feature_cache}->{mapper} ||= $tr->get_TranscriptMapper; # peptide unless ($tr->{_variation_effect_feature_cache}->{peptide}) { my $translation = $tr->translate; $tr->{_variation_effect_feature_cache}->{peptide} = $translation ? $translation->seq : undef; } # protein features if(defined($config->{domains}) || defined($config->{write_cache})) { my $pfs = $tr->translation ? $tr->translation->get_all_ProteinFeatures : []; # clean them to save cache space foreach my $pf(@$pfs) { # remove everything but the coord, analysis and ID fields foreach my $key(keys %$pf) { delete $pf->{$key} unless $key eq 'start' || $key eq 'end' || $key eq 'analysis' || $key eq 'hseqname'; } # remove everything from the analysis but the display label foreach my $key(keys %{$pf->{analysis}}) { delete $pf->{analysis}->{$key} unless $key eq '_display_label'; } } $tr->{_variation_effect_feature_cache}->{protein_features} = $pfs; } # codon table unless ($tr->{_variation_effect_feature_cache}->{codon_table}) { # for mithocondrial dna we need to to use a different codon table my $attrib = $tr->slice->get_all_Attributes('codon_table')->[0]; $tr->{_variation_effect_feature_cache}->{codon_table} = $attrib ? $attrib->value : 1; } # sift/polyphen if(defined($config->{pfpma}) && defined($tr->{_variation_effect_feature_cache}->{peptide})) { foreach my $analysis(qw(sift polyphen)) { next unless defined($config->{$analysis}); my $a = $analysis; $a .= '_humvar' if $a eq 'polyphen'; $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$a} ||= $config->{pfpma}->fetch_by_analysis_translation_md5($a, md5_hex($tr->{_variation_effect_feature_cache}->{peptide})) } } # gene $tr->{_gene} ||= $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); # gene symbol if(defined $config->{symbol}) { # get from gene cache if found already if(defined($tr->{_gene}->{_symbol})) { $tr->{_gene_symbol} = $tr->{_gene}->{_symbol}; } else { $tr->{_gene_symbol} ||= $tr->{_gene}->display_xref ? $tr->{_gene}->display_xref->display_id : undef; # cache it on the gene object too $tr->{_gene}->{_symbol} = $tr->{_gene_symbol}; } } # CCDS my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; $tr->{_ccds} = $entries[0]->display_id if scalar @entries; $tr->{_ccds} ||= '-'; # refseq @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries}; if(scalar @entries) { $tr->{_refseq} = join ",", map {$_->display_id} @entries; } else { $tr->{_refseq} = '-'; } # protein stable ID $tr->{_protein} = $tr->translation ? $tr->translation->stable_id : '-'; return $tr; } sub get_dump_file_name { my $config = shift; my $chr = shift; my $region = shift; my $type = shift; $type ||= 'transcript'; if($type eq 'transcript') { $type = ''; } else { $type = '_'.$type; } #my ($s, $e) = split /\-/, $region; #my $subdir = int($s / 1e6); # #my $dir = $config->{dir}.'/'.$chr.'/'.$subdir; my $dir = $config->{dir}.'/'.$chr; my $dump_file = $dir.'/'.$region.$type.'.gz'; # make directory if it doesn't exist if(!(-e $dir) && defined($config->{write_cache})) { mkpath($dir); } return $dump_file; } # dumps out transcript cache to file sub dump_transcript_cache { my $config = shift; my $tr_cache = shift; my $chr = shift; my $region = shift; debug("Dumping cached transcript data") unless defined($config->{quiet}); # clean the slice adaptor before storing clean_slice_adaptor($config); strip_transcript_cache($config, $tr_cache); $config->{reg}->disconnect_all; delete $config->{sa}->{dbc}->{_sql_helper}; my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript'); debug("Writing to $dump_file") unless defined($config->{quiet}); # storable open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; nstore_fd($tr_cache, $fh); close $fh; } #sub dump_transcript_cache_tabix { # my $config = shift; # my $tr_cache = shift; # my $chr = shift; # my $region = shift; # # debug("Dumping cached transcript data") unless defined($config->{quiet}); # # # clean the slice adaptor before storing # clean_slice_adaptor($config); # # strip_transcript_cache($config, $tr_cache); # # $config->{reg}->disconnect_all; # # my $dir = $config->{dir}.'/'.$chr; # my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz'; # # # make directory if it doesn't exist # if(!(-e $dir)) { # mkpath($dir); # } # # debug("Writing to $dump_file") unless defined($config->{quiet}); # # use Storable qw(nfreeze); # use MIME::Base64 qw(encode_base64); # #open NEW, "| bgzip -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; # # # #foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) { # # print NEW join "\t", ( # # $chr, # # $tr->start, # # $tr->end, # # encode_base64(freeze($tr), "") # # ); # # print NEW "\n"; # #} # #close NEW; # # # ## tabix it # #my $output = `tabix -s 1 -b 2 -e 3 -f $dump_file 2>&1`; # #die("ERROR: Failed during tabix indexing\n$output\n") if $output; # open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; # # foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) { # print NEW join "\t", ( # $chr, # $tr->start, # $tr->end, # encode_base64(freeze($tr), "") # ); # print NEW "\n"; # } # close NEW; #} # loads in dumped transcript cache to memory sub load_dumped_transcript_cache { my $config = shift; my $chr = shift; my $region = shift; my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript'); return undef unless -e $dump_file; debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); open my $fh, $config->{compress}." ".$dump_file." |" or return undef; my $tr_cache; $tr_cache = fd_retrieve($fh); close $fh; # reattach adaptors foreach my $t(@{$tr_cache->{$chr}}) { if(defined($t->{translation})) { $t->{translation}->{adaptor} = $config->{tra} if defined $config->{tra}; $t->{translation}->{transcript} = $t; weaken($t->{translation}->{transcript}); } $t->{slice}->{adaptor} = $config->{sa}; $_->{slice} ||= $t->{slice} for @{$t->{_trans_exon_array}}; } return $tr_cache; } #sub load_dumped_transcript_cache_tabix { # my $config = shift; # my $chr = shift; # my $region = shift; # # my $dir = $config->{dir}.'/'.$chr; # my $dump_file = $dir.'/all_trs.gz'; # # #print STDERR "Reading from $dump_file\n"; # # return undef unless -e $dump_file; # # debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); # # my $tr_cache; # # use MIME::Base64 qw(decode_base64); # use Storable qw(thaw); # # $DB::single = 1; # # my ($s, $e) = split /\-/, $region; # #my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}}; # my $regions = ""; # $regions .= " $chr\:$region"; # # #print STDERR "tabix $dump_file $regions |\n"; # open IN, "tabix $dump_file $regions |"; # #open IN, "gzip -dc $dump_file |"; # while() { # chomp; # my ($chr, $start, $end, $blob) = split /\t/, $_; # #next unless grep {overlap($start, $end, (split /\-/, $_))} @regions; # my $tr = thaw(decode_base64($blob)); # push @{$tr_cache->{$chr}}, $tr; # } # close IN; # # # reattach adaptors # foreach my $t(@{$tr_cache->{$chr}}) { # if(defined($t->{translation})) { # $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor}; # $t->{translation}->{transcript} = $t; # weaken($t->{translation}->{transcript}); # } # # $t->{slice}->{adaptor} = $config->{sa}; # } # # # add empty array ref so code doesn't try and fetch from DB too # $tr_cache->{$chr} ||= []; # # return $tr_cache; #} # strips cache before writing to disk sub strip_transcript_cache { my $config = shift; my $cache = shift; foreach my $chr(keys %$cache) { foreach my $tr(@{$cache->{$chr}}) { foreach my $exon(@{$tr->{_trans_exon_array}}) { delete $exon->{slice}->{adaptor}; for(qw(adaptor created_date modified_date is_current version is_constitutive _seq_cache dbID slice)) { delete $exon->{$_}; } } delete $tr->{adaptor}; delete $tr->{slice}->{adaptor}; } } } # cleans slice adaptor before storing in cache sub clean_slice_adaptor{ my $config = shift; # clean some stuff off the slice adaptor delete $config->{sa}->{asm_exc_cache}; $config->{sa}->{sr_name_cache} = {}; $config->{sa}->{sr_id_cache} = {}; delete $config->{sa}->{db}->{seq_region_cache}; delete $config->{sa}->{db}->{name_cache}; } # dump adaptors to cache sub dump_adaptor_cache { my $config = shift; $config->{reg}->disconnect_all; delete $config->{sa}->{dbc}->{_sql_helper}; my $dir = $config->{dir}; my $dump_file = $dir.'/adaptors.gz'; # make directory if it doesn't exist if(!(-e $dir)) { mkpath($dir); } open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; nstore_fd($config, $fh); close $fh; } # load dumped adaptors sub load_dumped_adaptor_cache { my $config = shift; my $dir = $config->{dir}; my $dump_file = $dir.'/adaptors.gz'; return undef unless -e $dump_file; debug("Reading cached adaptor data") unless defined($config->{quiet}); open my $fh, $config->{compress}." ".$dump_file." |" or return undef; my $cached_config; $cached_config = fd_retrieve($fh); close $fh; $config->{$_} = $cached_config->{$_} for qw(sa ga ta vfa svfa tva pfpma mca csa RegulatoryFeature_adaptor MotifFeature_adaptor); return 1; } # dumps cached variations to disk sub dump_variation_cache { my $config = shift; my $v_cache = shift; my $chr = shift; my $region = shift; my $dump_file = get_dump_file_name($config, $chr, $region, 'var'); open DUMP, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to adaptor dump file $dump_file"; foreach my $pos(keys %{$v_cache->{$chr}}) { foreach my $v(@{$v_cache->{$chr}->{$pos}}) { my @tmp = ( $v->{variation_name}, $v->{failed} == 0 ? '' : $v->{failed}, $v->{start}, $v->{end} == $v->{start} ? '' : $v->{end}, $v->{allele_string}, $v->{strand} == 1 ? '' : $v->{strand}, $v->{minor_allele} || '', defined($v->{minor_allele_freq}) ? sprintf("%.4f", $v->{minor_allele_freq}) : '', ); if(have_clin_sig($config) && defined($config->{clin_sig})) { push @tmp, $config->{clin_sig}->{$v->{variation_name}} || ''; } if(have_pubmed($config) && defined($config->{pubmed})) { push @tmp, $config->{pubmed}->{$v->{variation_name}} || ''; } if(defined($config->{freqs})) { push @tmp, $config->{'freqs'}->{$v->{variation_name}} || ''; } print DUMP join(" ", @tmp); print DUMP "\n"; } } close DUMP; } # loads dumped variation cache sub load_dumped_variation_cache { my $config = shift; my $chr = shift; my $region = shift; my $dump_file = get_dump_file_name($config, $chr, $region, 'var'); return undef unless -e $dump_file; open DUMP, $config->{compress}." ".$dump_file." |" or return undef; my $v_cache; while() { chomp; my $v = parse_variation($config, $_); push @{$v_cache->{$chr}->{$v->{start}}}, $v; } close DUMP; return $v_cache; } sub parse_variation { my $config = shift; my $line = shift; my @cols = @{get_variation_columns($config)}; my @data = split / |\t/, $line; # assumption fix for old cache files if(scalar @data > scalar @cols) { push @cols, ('AFR', 'AMR', 'ASN', 'EUR'); } my %v = map {$cols[$_] => $data[$_] eq '.' ? undef : $data[$_]} (0..$#data); $v{failed} ||= 0; $v{end} ||= $v{start}; $v{strand} ||= 1; # hack for odd frequency data foreach my $pop(qw(AFR AMR ASN EUR)) { $v{$pop} = 1 - $v{$pop} if defined($v{$pop}) && $v{$pop} =~ /\d+/ && $v{$pop} > 0.5; } return \%v; } # gets variation cache columns sub get_variation_columns { my $config = shift; if(!defined($config->{cache_variation_cols})) { $config->{cache_variation_cols} = \@VAR_CACHE_COLS; push @{$config->{cache_variation_cols}}, 'clin_sig' if have_clin_sig($config) && defined($config->{clin_sig}); push @{$config->{cache_variation_cols}}, 'pubmed' if have_pubmed($config) && defined($config->{pubmed}); push @{$config->{cache_variation_cols}}, @{$config->{freq_file_pops}} if defined($config->{freq_file_pops}); } return $config->{cache_variation_cols}; } # caches regulatory features sub cache_reg_feats { my $config = shift; my $include_regions = shift; my $rf_cache; my $i; debug("Caching regulatory features") unless defined($config->{quiet}); foreach my $chr(keys %$include_regions) { my $slice = get_slice($config, $chr); next unless defined $slice; # prefetch some things $slice->is_circular; # no regions? if(!scalar @{$include_regions->{$chr}}) { my $start = 1; my $end = $config->{cache_region_size}; while($start < $slice->end) { push @{$include_regions->{$chr}}, $start.'-'.$end; $start += $config->{cache_region_size}; $end += $config->{cache_region_size}; } } my $region_count; if(scalar keys %$include_regions == 1) { my ($chr) = keys %$include_regions; $region_count = scalar @{$include_regions->{$chr}}; debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet}); } foreach my $region(@{$include_regions->{$chr}}) { progress($config, $i++, $region_count || $config->{region_count}); my ($s, $e) = split /\-/, $region; # sanity check start and end $s = 1 if $s < 1; $e = $slice->end if $e > $slice->end; # get sub-slice my $sub_slice = $slice->sub_Slice($s, $e); next unless defined($sub_slice); $sub_slice->{coord_system}->{adaptor} = $config->{csa}; foreach my $type(@REG_FEAT_TYPES) { my $features = $config->{$type.'_adaptor'}->fetch_all_by_Slice($sub_slice); next unless defined($features); # cell types if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { foreach my $rf(@$features) { my %cl; # get cell type by fetching all from stable ID if($type eq 'RegulatoryFeature') { %cl = map { $_->feature_set->cell_type->name => $_->feature_type->name } @{$rf->adaptor->fetch_all_by_stable_ID($rf->stable_id)}; } # get cell type by fetching regfeats that contain this MotifFeature elsif($type eq 'MotifFeature') { %cl = map { $_->feature_set->cell_type->name => $_->feature_type->name } @{$config->{'RegulatoryFeature_adaptor'}->fetch_all_by_attribute_feature($rf)}; } $rf->{cell_types} = \%cl; } } push @{$rf_cache->{$chr}->{$type}}, map { clean_reg_feat($_) } map { $_->transfer($slice) } @{$features}; } } } end_progress($config); return $rf_cache; } # cleans reg feats for caching sub clean_reg_feat { my $rf = shift; foreach my $key(qw/adaptor binary_string bound_start bound_end attribute_cache feature_type feature_set analysis/) { delete $rf->{$key}; } if(defined($rf->{binding_matrix})) { $rf->{_variation_effect_feature_cache}->{seq} = $rf->seq; foreach my $key(qw/adaptor feature_type analysis dbID/) { delete $rf->{binding_matrix}->{$key}; } } return $rf; } # dumps out reg feat cache to file sub dump_reg_feat_cache { my $config = shift; my $rf_cache = shift; my $chr = shift; my $region = shift; debug("Dumping cached reg feat data for $chr:$region") unless defined($config->{quiet}); # clean the slice adaptor before storing clean_slice_adaptor($config); $config->{reg}->disconnect_all; delete $config->{sa}->{dbc}->{_sql_helper}; foreach my $chr(keys %{$rf_cache}) { foreach my $type(keys %{$rf_cache->{$chr}}) { delete $_->{slice}->{coord_system}->{adaptor} for @{$rf_cache->{$chr}->{$type}}; } } my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); debug("Writing to $dump_file") unless defined($config->{quiet}); # storable open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; nstore_fd($rf_cache, $fh); close $fh; } #sub dump_reg_feat_cache_tabix { # my $config = shift; # my $rf_cache = shift; # my $chr = shift; # my $region = shift; # # debug("Dumping cached reg feat data") unless defined($config->{quiet}); # # # clean the slice adaptor before storing # clean_slice_adaptor($config); # # $config->{reg}->disconnect_all; # delete $config->{sa}->{dbc}->{_sql_helper}; # # $config->{reg}->disconnect_all; # # my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); # # debug("Writing to $dump_file") unless defined($config->{quiet}); # # use Storable qw(nfreeze); # use MIME::Base64 qw(encode_base64); # open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; # # foreach my $type(keys %{$rf_cache->{$chr}}) { # foreach my $rf(sort {$a->start <=> $b->start} @{$rf_cache->{$chr}->{$type}}) { # print NEW join "\t", ( # $chr, # $rf->start, # $rf->end, # $type, # encode_base64(freeze($rf), "") # ); # print NEW "\n"; # } # } # close NEW; #} # loads in dumped transcript cache to memory sub load_dumped_reg_feat_cache { my $config = shift; my $chr = shift; my $region = shift; my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); return undef unless -e $dump_file; debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); open my $fh, $config->{compress}." ".$dump_file." |" or return undef; my $rf_cache; $rf_cache = fd_retrieve($fh); close $fh; return $rf_cache; } #sub load_dumped_reg_feat_cache_tabix { # my $config = shift; # my $chr = shift; # my $region = shift; # my $trim_regions = shift; # # my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); # # #print STDERR "Reading from $dump_file\n"; # # return undef unless -e $dump_file; # # debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); # # my $rf_cache; # # use MIME::Base64 qw(decode_base64); # use Storable qw(thaw); # # my ($s, $e) = split /\-/, $region; # my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}}; # my $regions = ""; # $regions .= " $chr\:$_" for @regions; # # #print STDERR "tabix $dump_file $regions |\n"; # #open IN, "tabix $dump_file $regions |"; # open IN, "gzip -dc $dump_file |"; # while() { # my ($chr, $start, $end, $type, $blob) = split /\t/, $_; # next unless grep {overlap($start, $end, (split /\-/, $_))} @regions; # my $rf = thaw(decode_base64($blob)); # push @{$rf_cache->{$chr}->{$type}}, $rf; # } # close IN; # # $rf_cache->{$chr}->{$_} ||= [] for @REG_FEAT_TYPES; # # return $rf_cache; #} # get custom annotation for a region sub cache_custom_annotation { my $config = shift; my $include_regions = shift; my $chr = shift; #$include_regions = merge_regions($include_regions, $config, 1); my $annotation = {}; my $total = scalar @{$config->{custom}} * scalar @{$include_regions->{$chr}}; my $counter = 0; my $max_regions_per_tabix = 1000; debug("Caching custom annotations") unless defined($config->{quiet}); foreach my $custom(@{$config->{custom}}) { my @regions = @{$include_regions->{$chr}}; while(scalar @regions) { my $got_features = 0; my @tmp_regions = splice @regions, 0, $max_regions_per_tabix; progress($config, $counter, $total); $counter += scalar @tmp_regions; # some files may have e.g. chr10 instead of 10 for my $tmp_chr($chr, 'chr'.$chr) { # bigwig needs to use bigWigToWig utility if($custom->{format} eq 'bigwig') { my @tmp_files; die "\nERROR: Could not find temporary directory ".$config->{tmpdir}." - use --tmpdir [dir] to define an existing directory\n" unless -d $config->{tmpdir}; foreach my $region(@tmp_regions) { my ($s, $e) = split /\-/, $region; my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$.'_'.$tmp_chr.'_'.$s.'_'.$e; push @tmp_files, $tmp_file; my $bigwig_file = $custom->{file}; my $bigwig_output = `bigWigToWig -chrom=$tmp_chr -start=$s -end=$e $bigwig_file $tmp_file 2>&1`; die "\nERROR: Problem using bigwig file $bigwig_file\n$bigwig_output" if $bigwig_output; } # concatenate all the files together my $string = join(" ", @tmp_files); my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$; `cat $string > $tmp_file`; open CUSTOM, $tmp_file or die "\nERROR: Could not read from temporary WIG file $tmp_file\n"; # unlink smaller files unlink($_) for @tmp_files; } # otherwise use tabix else { # tabix can fetch multiple regions, so construct a string my $region_string = join " ", map {$tmp_chr.':'.$_} @tmp_regions; open CUSTOM, "tabix ".$custom->{file}." $region_string 2>&1 |" or die "\nERROR: Could not open tabix pipe for ".$custom->{file}."\n"; } # set an error flag so we don't have to check every line my $error_flag = 1; # create a hash for storing temporary params (used by bigWig) my %tmp_params = (); while() { chomp; # check for errors if($error_flag) { die "\nERROR: Problem using annotation file ".$custom->{file}."\n$_\n" if /invalid pointer|tabix|get_intv/; $error_flag = 0; } my @data = split /\t/, $_; my $feature; if($custom->{format} eq 'bed') { $feature = { chr => $chr, start => $data[1], end => $data[2], name => $data[3], }; } elsif($custom->{format} eq 'vcf') { my $tmp_vf = parse_vcf($config, $_)->[0]; $feature = { chr => $chr, start => $tmp_vf->{start}, end => $tmp_vf->{end}, name => $tmp_vf->{variation_name}, }; foreach my $field(@{$custom->{fields}}) { if(m/$field\=(.+?)(\;|\s|$)/) { $feature->{$field} = $1; } } } elsif($custom->{format} eq 'gff' || $custom->{format} eq 'gtf') { my $name; # try and get a feature name from the attributes column foreach my $attrib(split /\s*\;\s*/, $data[8]) { my ($key, $value) = split /\=/, $attrib; $name = $value if $key eq 'ID'; } $name ||= $data[2]."_".$data[0].":".$data[3]."-".$data[4]; $feature = { chr => $chr, start => $data[3], end => $data[4], name => $name, }; } elsif($custom->{format} eq 'bigwig') { # header line from wiggle file if(/^(fixed|variable)Step|^\#bedGraph/i) { my @split = split /\s+/; $tmp_params{type} = shift @split; $tmp_params{type} =~ s/^\#//g; foreach my $pair(@split) { my ($key, $value) = split /\=/, $pair; $tmp_params{$key} = $value; } # default to span of 1 $tmp_params{span} ||= 1; } elsif(defined($tmp_params{type})) { if($tmp_params{type} eq 'fixedStep') { $feature = { chr => $chr, start => $tmp_params{start}, end => ($tmp_params{start} + $tmp_params{span}) - 1, name => $data[0], }; $tmp_params{start} += $tmp_params{step}; } elsif($tmp_params{type} eq 'variableStep') { $feature = { chr => $chr, start => $data[0], end => ($data[0] + $tmp_params{span}) - 1, name => $data[1] }; } elsif($tmp_params{type} eq 'bedGraph') { $feature = { chr => $chr, start => $data[1] + 1, end => $data[2], name => $data[3] }; } } else { die("ERROR: Cannot parse line from bigWigtoWig output: \n$_\n"); } } if(defined($feature)) { $got_features = 1; if(!defined($feature->{name}) || $custom->{coords}) { $feature->{name} = $feature->{chr}.":".$feature->{start}."-".$feature->{end}; } # add the feature to the cache $annotation->{$chr}->{$custom->{name}}->{$feature->{start}}->{$feature->{name}} = $feature; } } close CUSTOM; # unlink temporary wig files unlink($config->{tmpdir}.'/vep_tmp_'.$$) if $custom->{format} eq 'bigwig'; # no need to fetch e.g. "chr21" features if just "21" worked last if $got_features; } } } end_progress($config); return $annotation; } # builds a full cache for this species sub build_full_cache { my $config = shift; my @slices; if($config->{build} =~ /all/i) { @slices = @{$config->{sa}->fetch_all('toplevel')}; push @slices, map {$_->alternate_slice} map {@{$_->get_all_AssemblyExceptionFeatures}} @slices; push @slices, @{$config->{sa}->fetch_all('lrg', undef, 1, undef, 1)} if defined($config->{lrg}); } else { foreach my $val(split /\,/, $config->{build}) { my @nnn = split /\-/, $val; foreach my $chr($nnn[0]..$nnn[-1]) { my $slice = get_slice($config, $chr); push @slices, $slice if defined($slice); } } } # check and load clin_sig $config->{clin_sig} = get_clin_sig($config) if have_clin_sig($config); # check and load pubmed $config->{pubmed} = get_pubmed($config) if have_pubmed($config); foreach my $slice(@slices) { my $chr = $slice->seq_region_name; # check for features, we don't want a load of effectively empty dirs my $dbc = $config->{sa}->db->dbc; my $sth = $dbc->prepare("SELECT COUNT(*) FROM transcript WHERE seq_region_id = ?"); $sth->execute($slice->get_seq_region_id); my $count; $sth->bind_columns(\$count); $sth->fetch; $sth->finish; next unless $count > 0; my $regions; # for progress my $region_count = int($slice->end / $config->{cache_region_size}) + 1; my $counter = 0; # initial region my $start = 1 + ($config->{cache_region_size} * int($slice->start / $config->{cache_region_size})); my $end = ($start - 1) + $config->{cache_region_size}; debug((defined($config->{rebuild}) ? "Rebuild" : "Creat")."ing cache for chromosome $chr") unless defined($config->{quiet}); while($start < $slice->end) { progress($config, $counter++, $region_count); # store quiet status my $quiet = $config->{quiet}; $config->{quiet} = 1; # spoof regions $regions->{$chr} = [$start.'-'.$end]; # store transcripts my $tmp_cache = (defined($config->{rebuild}) ? load_dumped_transcript_cache($config, $chr, $start.'-'.$end) : cache_transcripts($config, $regions)); $tmp_cache->{$chr} ||= []; #(defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $start.'-'.$end) : dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end)); dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end); undef $tmp_cache; # store reg feats if(defined($config->{regulatory})) { my $rf_cache = cache_reg_feats($config, $regions); $rf_cache->{$chr} ||= {}; dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end); #(defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $rf_cache, $chr, $start.'-'.$end) : dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end)); undef $rf_cache; # this gets cleaned off but needs to be there for the next loop $slice->{coord_system}->{adaptor} = $config->{csa}; } # store variations my $variation_cache; $variation_cache->{$chr} = get_variations_in_region($config, $chr, $start.'-'.$end); $variation_cache->{$chr} ||= {}; dump_variation_cache($config, $variation_cache, $chr, $start.'-'.$end); undef $variation_cache; # restore quiet status $config->{quiet} = $quiet; # increment by cache_region_size to get next region $start += $config->{cache_region_size}; $end += $config->{cache_region_size}; } end_progress($config); undef $regions; } write_cache_info($config); } # write an info file that defines what is in the cache sub write_cache_info { my $config = shift; my $info_file = $config->{dir}.'/info.txt'; open OUT, ">>$info_file" or die "ERROR: Could not write to cache info file $info_file\n"; print OUT "# CACHE UPDATED ".get_time()."\n"; foreach my $param(qw( host port user build regulatory sift polyphen )) { print OUT "$param\t".(defined $config->{$param} ? $config->{$param} : '-')."\n"; } # cell types if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor(); print OUT "cell_types\t".(join ",", map {$_->name} @{$cta->fetch_all}); print OUT "\n"; } # sift/polyphen versions foreach my $tool(qw(sift polyphen)) { if(defined($config->{$tool})) { my $var_mca = $config->{reg}->get_adaptor($config->{species}, 'variation', 'metacontainer'); if(defined($var_mca)) { my $version = $var_mca->list_value_by_key($tool.'_version'); print OUT "$tool\_version\t".$version->[0]."\n" if defined($version) and scalar @$version; } } } # variation columns print OUT "variation_cols\t".(join ",", @{get_variation_columns($config)}); print OUT "\n"; close OUT; } # reads in cache info file sub read_cache_info { my $config = shift; my $info_file = $config->{dir}.'/info.txt'; open IN, $info_file or return 0; while() { next if /^#/; chomp; my ($param, $value) = split /\t/; if($param =~ /variation_col/) { $config->{'cache_'.$param} = [split /\,/, $value]; } else { $config->{'cache_'.$param} = $value unless defined $value && $value eq '-'; } } close IN; return 1; } # format coords for printing sub format_coords { my ($start, $end) = @_; if(defined($start)) { if(defined($end)) { if($start > $end) { return $end.'-'.$start; } elsif($start == $end) { return $start; } else { return $start.'-'.$end; } } else { return $start.'-?'; } } elsif(defined($end)) { return '?-'.$end; } else { return '-'; } } # METHODS TO FIND CO-LOCATED / EXISTING VARIATIONS ################################################## # finds an existing VF in the db sub find_existing { my $config = shift; my $new_vf = shift; if(defined($config->{vfa}) && defined($config->{vfa}->db)) { my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL'; my $sth = $config->{vfa}->db->dbc->prepare(qq{ SELECT variation_name, IF(fv.variation_id IS NULL, 0, 1), seq_region_start, seq_region_end, allele_string, seq_region_strand, $maf_cols FROM variation_feature vf LEFT JOIN failed_variation fv ON vf.variation_id = fv.variation_id WHERE vf.seq_region_id = ? AND vf.seq_region_start = ? AND vf.seq_region_end = ? ORDER BY vf.source_id ASC }); $new_vf->{slice} ||= get_slice($config, $new_vf->{chr}); $sth->execute($new_vf->slice->get_seq_region_id, $new_vf->start, $new_vf->end); my %v; $v{$_} = undef for @VAR_CACHE_COLS; my %vars_by_id; $sth->bind_col($_+1, \$v{$VAR_CACHE_COLS[$_]}) for (0..4); my @found; while($sth->fetch) { my %v_copy = %v; $v_copy{allele_string} =~ s/\s+/\_/g; push @found, \%v_copy unless is_var_novel($config, \%v_copy, $new_vf) || $v_copy{failed} > $config->{failed}; } $sth->finish(); return (scalar @found ? \@found : []); } return undef; } # compare a new vf to one from the cache / DB sub is_var_novel { my $config = shift; my $existing_var = shift; my $new_var = shift; my $is_novel = 1; $is_novel = 0 if $existing_var->{start} == $new_var->start && $existing_var->{end} == $new_var->end; if(defined($config->{check_alleles})) { my %existing_alleles; $existing_alleles{$_} = 1 for split /\//, $existing_var->{allele_string}; my $seen_new = 0; foreach my $a(split /\//, ($new_var->allele_string || "")) { reverse_comp(\$a) if $new_var->strand ne $existing_var->{strand}; $seen_new = 1 unless defined $existing_alleles{$a}; } $is_novel = 1 if $seen_new; } return $is_novel; } # check frequencies of existing var against requested params sub check_frequencies { my $config = shift; my $var = shift; my $var_name = $var->{variation_name}; my $freq_pop = $config->{freq_pop}; my $freq_freq = $config->{freq_freq}; my $freq_gt_lt = $config->{freq_gt_lt}; my $pass = 0; my $checked_cache = 0; delete $config->{filtered_freqs}; # if we can, check using cached frequencies as this is way quicker than # going to the DB if($freq_pop =~ /1kg|esp/i) { my $freq; my $sub_pop = uc((split /\_/, $freq_pop)[-1]); $freq = $var->{minor_allele_freq} if $sub_pop =~ /all/i; if(!defined($freq)) { $freq = $var->{$sub_pop} if defined($var->{$sub_pop}); } if(defined($freq) && $freq =~ /\d/) { $pass = 1 if $freq >= $freq_freq and $freq_gt_lt eq 'gt'; $pass = 1 if $freq <= $freq_freq and $freq_gt_lt eq 'lt'; push @{$config->{filtered_freqs}}, $freq_pop.':'.$freq; } $checked_cache = 1; } if(defined($config->{va}) && $checked_cache == 0) { my $v = $config->{va}->fetch_by_name($var_name); my $freq_pop_name = (split /\_/, $freq_pop)[-1]; $freq_pop_name = undef if $freq_pop_name =~ /1kg|hap|any/; foreach my $a(@{$v->get_all_Alleles}) { next unless defined $a->{population} || defined $a->{'_population_id'}; next unless defined $a->frequency; next if $a->frequency > 0.5; my $pop_name = $a->population->name; if($freq_pop =~ /1kg/) { next unless $pop_name =~ /^1000.+(low|phase).+/i; } if($freq_pop =~ /hap/) { next unless $pop_name =~ /^CSHL-HAPMAP/i; } if($freq_pop =~ /any/) { next unless $pop_name =~ /^(CSHL-HAPMAP)|(1000.+(low|phase).+)/i; } if(defined $freq_pop_name) { next unless $pop_name =~ /$freq_pop_name/i; } $pass = 1 if $a->frequency >= $freq_freq and $freq_gt_lt eq 'gt'; $pass = 1 if $a->frequency <= $freq_freq and $freq_gt_lt eq 'lt'; $pop_name =~ s/\:/\_/g; push @{$config->{filtered_freqs}}, $pop_name.':'.$a->frequency; #warn "Comparing allele ", $a->allele, " ", $a->frequency, " for $var_name in population ", $a->population->name, " PASS $pass"; } } return 0 if $config->{freq_filter} eq 'exclude' and $pass == 1; return 0 if $config->{freq_filter} eq 'include' and $pass == 0; return 1; } # gets all variations in a region sub get_variations_in_region { my $config = shift; my $chr = shift; my $region = shift; my ($start, $end) = split /\-/, $region; my %variations; if(defined($config->{vfa}->db)) { my $sr_cache = $config->{seq_region_cache}; if(!defined($sr_cache)) { $sr_cache = cache_seq_region_ids($config); $config->{seq_region_cache} = $sr_cache; } # no seq_region_id? return {} unless defined($sr_cache) && defined($sr_cache->{$chr}); my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL'; my $sth = $config->{vfa}->db->dbc->prepare(qq{ SELECT vf.variation_id, vf.variation_name, IF(fv.variation_id IS NULL, 0, 1), vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand, $maf_cols FROM variation_feature vf LEFT JOIN failed_variation fv ON fv.variation_id = vf.variation_id WHERE vf.seq_region_id = ? AND vf.seq_region_start >= ? AND vf.seq_region_start <= ? }); $sth->execute($sr_cache->{$chr}, $start, $end); my %v; $v{$_} = undef for @VAR_CACHE_COLS; my ($var_id, %vars_by_id); $sth->bind_col(1, \$var_id); if(have_maf_cols($config)) { $sth->bind_col($_+2, \$v{$VAR_CACHE_COLS[$_]}) for (0..$#VAR_CACHE_COLS); } else { $sth->bind_col($_+2, \$v{$VAR_CACHE_COLS[$_]}) for (0..4); } while($sth->fetch) { my %v_copy = %v; $v_copy{allele_string} =~ s/\s+/\_/g; push @{$variations{$v{start}}}, \%v_copy; # store by var_id too to get stuff from variation table $vars_by_id{$var_id} = \%v_copy; } $sth->finish(); # now get stuff from variation table #if(scalar keys %vars_by_id) { # my $max_size = 200; # my @id_list = keys %vars_by_id; # # while(@id_list) { # my @ids; # if(@id_list > $max_size) { # @ids = splice(@id_list, 0, $max_size); # } # else { # @ids = splice(@id_list, 0); # } # # my $id_str; # if(@ids > 1) { # $id_str = " IN (" . join(',', @ids). ")"; # } # else { # $id_str = " = ".$ids[0]; # } # # $sth = $config->{vfa}->db->dbc->prepare(qq{ # SELECT variation_id, ancestral_allele # FROM variation # WHERE variation_id $id_str # }); # # my $ancestral_allele; # $sth->execute(); # $sth->bind_columns(\$var_id, \$ancestral_allele); # # while($sth->fetch) { # $vars_by_id{$var_id}->{ancestral_allele} = $ancestral_allele; # } # # $sth->finish(); # } #} } return \%variations; } sub cache_seq_region_ids { my $config = shift; my (%cache, $chr, $id); my $sth = $config->{vfa}->db->dbc->prepare(qq{ SELECT seq_region_id, name FROM seq_region }); $sth->execute(); $sth->bind_columns(\$id, \$chr); $cache{$chr} = $id while $sth->fetch(); $sth->finish; return \%cache; } sub have_maf_cols { my $config = shift; if(!defined($config->{have_maf_cols})) { if(defined($config->{vfa}) && defined($config->{vfa}->db)) { my $sth = $config->{vfa}->db->dbc->prepare(qq{ DESCRIBE variation_feature }); $sth->execute(); my @cols = map {$_->[0]} @{$sth->fetchall_arrayref}; $sth->finish(); $config->{have_maf_cols} = 0; $config->{have_maf_cols} = 1 if grep {$_ eq 'minor_allele'} @cols; } else { $config->{have_maf_cols} = 0; } } return $config->{have_maf_cols}; } sub have_clin_sig { my $config = shift; if(!defined($config->{have_clin_sig})) { if(defined($config->{vfa}) && defined($config->{vfa}->db)) { my $sth = $config->{vfa}->db->dbc->prepare(qq{ SELECT COUNT(*) FROM variation WHERE clinical_significance IS NOT NULL }); $sth->execute; my $count; $sth->bind_columns(\$count); $sth->fetch(); $sth->finish(); $config->{have_clin_sig} = $count; } else { $config->{have_clin_sig} = 0; } } return $config->{have_clin_sig}; } sub get_clin_sig { my $config = shift; my $sth = $config->{vfa}->db->dbc->prepare(qq{ SELECT name, clinical_significance FROM variation WHERE clinical_significance IS NOT NULL }); $sth->execute; my ($v, $c, %cs); $sth->bind_columns(\$v, \$c); $cs{$v} = $c while $sth->fetch(); $sth->finish(); return \%cs; } sub have_pubmed { my $config = shift; if(!defined($config->{have_pubmed})) { if(defined($config->{vfa}) && defined($config->{vfa}->db)) { my $sth = $config->{vfa}->db->dbc->prepare(qq{ SELECT COUNT(*) FROM variation_citation }); $sth->execute; my $count; $sth->bind_columns(\$count); $sth->fetch(); $sth->finish(); $config->{have_pubmed} = $count; } else { $config->{have_pubmed} = 0; } } return $config->{have_pubmed}; } sub get_pubmed { my $config = shift; my $sth = $config->{vfa}->db->dbc->prepare(qq{ SELECT v.name, GROUP_CONCAT(p.pmid) FROM variation v, variation_citation c, publication p WHERE v.variation_id = c.variation_id AND c.publication_id = p.publication_id AND p.pmid IS NOT NULL GROUP BY v.variation_id }); $sth->execute; my ($v, $p, %pm); $sth->bind_columns(\$v, \$p); $pm{$v} = $p while $sth->fetch(); $sth->finish(); return \%pm; } sub merge_hashes { my ($x, $y, $add) = @_; foreach my $k (keys %$y) { if (!defined($x->{$k})) { $x->{$k} = $y->{$k}; } else { if(ref($x->{$k}) eq 'ARRAY') { $x->{$k} = merge_arrays($x->{$k}, $y->{$k}); } elsif(ref($x->{$k}) eq 'HASH') { $x->{$k} = merge_hashes($x->{$k}, $y->{$k}, $add); } else { $x->{$k} = ($add && $x->{$k} =~ /^[0-9\.]+$/ && $y->{$k} =~ /^[0-9\.]+$/ ? $x->{$k} + $y->{$k} : $y->{$k}); } } } return $x; } sub merge_arrays { my ($x, $y) = @_; my %tmp = map {$_ => 1} (@$x, @$y); return [keys %tmp]; } # DEBUG AND STATUS METHODS ########################## # gets time sub get_time() { my @time = localtime(time()); # increment the month (Jan = 0) $time[4]++; # add leading zeroes as required for my $i(0..4) { $time[$i] = "0".$time[$i] if $time[$i] < 10; } # put the components together in a string my $time = ($time[5] + 1900)."-". $time[4]."-". $time[3]." ". $time[2].":". $time[1].":". $time[0]; return $time; } # prints debug output with time sub debug { my $text = (@_ ? (join "", @_) : "No message"); my $time = get_time; print $time." - ".$text.($text =~ /\n$/ ? "" : "\n"); } # finds out memory usage sub memory { my @mem; open IN, "ps -o rss,vsz $$ |"; while() { next if $_ =~ /rss/i; chomp; @mem = split; } close IN; return \@mem; } sub mem_diff { my $config = shift; my $mem = memory(); my @diffs; if(defined($config->{memory})) { for my $i(0..(scalar @{$config->{memory}} - 1)) { push @diffs, $mem->[$i] - $config->{memory}->[$i]; } } else { @diffs = @$mem; } $config->{memory} = $mem; return \@diffs; } # update or initiate progress bar sub progress { my ($config, $i, $total) = @_; return if defined($config->{quiet}) || defined($config->{no_progress}); $i = $total if $i > $total; my $width = $config->{terminal_width} || 60; my $percent = int(($i/$total) * 100); my $numblobs = int((($i/$total) * $width) - 2); # this ensures we're not writing to the terminal too much return if(defined($config->{prev_prog})) && $numblobs.'-'.$percent eq $config->{prev_prog}; $config->{prev_prog} = $numblobs.'-'.$percent; #printf("\r%s of %s", $i, $total); printf("\r% -${width}s% 1s% 10s", '['.('=' x $numblobs).($numblobs == $width - 2 ? '=' : '>'), ']', "[ " . $percent . "% ]"); } # end progress bar sub end_progress { my $config = shift; return if defined($config->{quiet}) || defined($config->{no_progress}); progress($config, 1,1); print "\n"; delete $config->{prev_prog}; } 1;