=head1 LICENSE Copyright (c) 1999-2013 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/legal/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at . Questions may also be sent to the Ensembl help desk at . =cut # # Ensembl module for Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor # # Copyright (c) 2004 Ensembl # # You may distribute this module under the same terms as perl itself # # =head1 NAME Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor =head1 SYNOPSIS $reg = 'Bio::EnsEMBL::Registry'; $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); $va = $reg->get_adaptor("human","variation","variation"); $pa = $reg->get_adaptor("human","variation","population"); $va = $db->get_VariationAdaptor(); $pa = $db->get_PopulationAdaptor(); # Get a Variation by its internal identifier $var = $va->fetch_by_dbID(145); # fetch a variation by its name $var = $va->fetch_by_name('rs100'); # check if the variation is failed and if so, get the failed description if ($var->is_failed()) { $desc = $var->failed_description(); } # fetch all variations from a population $pop = $pa->fetch_by_name('PACIFIC'); @vars = {$va->fetch_all_by_Population($pop)}; # Modify the include_failed_variations flag in DBAdaptor to also return variations that have been flagged as failed $va->db->include_failed_variations(1); # Fetch all variations from a population, including variations flagged as failed @vars = {$va->fetch_all_by_Population($pop)}; =head1 DESCRIPTION This adaptor provides database connectivity for Variation objects. Variations (SNPs, etc.) may be retrieved from the Ensembl variation database by several means using this module. =head1 METHODS =cut use strict; use warnings; package Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor; use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref wrap_array); use Bio::EnsEMBL::Variation::Variation; use Bio::EnsEMBL::Variation::Allele; use Bio::EnsEMBL::Utils::Iterator; use DBI qw(:sql_types); our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); my $DEFAULT_ITERATOR_CACHE_SIZE = 10000; sub store { my ($self, $var) = @_; my $dbh = $self->dbc->db_handle; # look up source_id if(!defined($var->{source_id})) { my $sth = $dbh->prepare(q{ SELECT source_id FROM source WHERE name = ? }); $sth->execute($var->{source}); my $source_id; $sth->bind_columns(\$source_id); $sth->fetch(); $sth->finish(); $var->{source_id} = $source_id; } throw("No source ID found for source name ", $var->{source}) unless defined($var->{source_id}); my $sth = $dbh->prepare(q{ INSERT INTO variation ( source_id, name, validation_status, ancestral_allele, flipped, class_attrib_id, somatic, minor_allele, minor_allele_freq, minor_allele_count, clinical_significance, evidence ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?) }); $sth->execute( $var->{source_id}, $var->name, (join ",", @{$var->get_all_validation_states}) || undef, $var->ancestral_allele, $var->{flipped}, $var->{class_attrib_id} || ( $var->{class_SO_term} && $self->db->get_AttributeAdaptor->attrib_id_for_type_value('SO_term', $var->{class_SO_term})) || 18, $var->is_somatic, $var->minor_allele, $var->minor_allele_frequency, $var->minor_allele_count, $var->{clinical_significance} ? (join ",", @{$var->{clinical_significance}}) : undef, $var->{evidence} ? (join ",", @{$var->{evidence}}) : undef ); $sth->finish; # get dbID my $dbID = $dbh->last_insert_id(undef, undef, 'variation', 'variation_id'); $var->{dbID} = $dbID; $var->{adaptor} = $self; } sub update { my ($self, $var) = @_; my $dbh = $self->dbc->db_handle; # look up source_id if(!defined($var->{source_id})) { my $sth = $dbh->prepare(q{ SELECT source_id FROM source WHERE name = ? }); $sth->execute($var->{source}); my $source_id; $sth->bind_columns(\$source_id); $sth->fetch(); $sth->finish(); $var->{source_id} = $source_id; } throw("No source ID found for source name ", $var->{source}) unless defined($var->{source_id}); my $sth = $dbh->prepare(q{ UPDATE variation SET source_id = ?, name = ?, validation_status = ?, ancestral_allele = ?, flipped = ?, class_attrib_id = ?, somatic = ?, minor_allele = ?, minor_allele_freq = ?, minor_allele_count = ?, clinical_significance = ?, evidence = ? WHERE variation_id = ? }); $sth->execute( $var->{source_id}, $var->name, (join ",", @{$var->get_all_validation_states}) || undef, $var->ancestral_allele, $var->{flipped}, $var->{class_attrib_id} || $var->adaptor->db->get_AttributeAdaptor->attrib_id_for_type_value('SO_term', $var->{class_SO_term}) || 18, $var->is_somatic, $var->minor_allele, $var->minor_allele_frequency, $var->minor_allele_count, join(",",@{$var->{clinical_significance}}) || undef, join(",",@{$var->{evidence}}) || undef, $var->dbID ); $sth->finish; } =head2 fetch_all Description: Returns a listref of all germline variations Returntype : listref of Variations Status : Stable =cut sub fetch_all { my $self = shift; my $constraint = 'v.somatic = 0'; return $self->generic_fetch($constraint); } =head2 fetch_all_somatic Description: Returns a listref of all somatic variations Returntype : listref of Variations Status : Stable =cut sub fetch_all_somatic { my $self = shift; my $constraint = 'v.somatic = 1'; return $self->generic_fetch($constraint); } =head2 fetch_Iterator Arg [1] : int $cache_size (optional) Example : $var_iterator = $var_adaptor->fetch_Iterator; Description: returns an iterator over all germline variations in the database Returntype : Bio::EnsEMBL::Utils::Iterator Exceptions : none Caller : general Status : Experimental =cut sub fetch_Iterator { my ($self, $cache_size) = @_; return $self->_generic_fetch_Iterator($cache_size, 'v.somatic = 0'); } =head2 fetch_Iterator_somatic Arg [1] : int $cache_size (optional) Example : $var_iterator = $var_adaptor->fetch_Iterator; Description: returns an iterator over all somatic variations in the database Returntype : Bio::EnsEMBL::Utils::Iterator Exceptions : none Caller : general Status : Experimental =cut sub fetch_Iterator_somatic { my ($self, $cache_size) = @_; return $self->_generic_fetch_Iterator($cache_size, 'v.somatic = 1'); } sub _generic_fetch_Iterator { my ($self, $cache_size, $constraint) = @_; my $full_constraint = $constraint ? $constraint.' AND '.$self->db->_exclude_failed_variations_constraint : $self->db->_exclude_failed_variations_constraint; # prepare and execute a query to fetch all dbIDs my $sth = $self->prepare(qq{ SELECT v.variation_id FROM (variation v, source s) LEFT JOIN failed_variation fv on v.variation_id = fv.variation_id WHERE v.source_id = s.source_id AND $full_constraint }); $sth->execute; my $var_id; $sth->bind_columns(\$var_id); # we probably can't fit all of these into memory at once though, # so create an iterator that fetches $cache_size dbIDs from the # statement handle at a time and then fetches these objects, # storing them in a cache. We then return variation objects # from this cache one by one, before filling it again if # necessary $cache_size ||= $DEFAULT_ITERATOR_CACHE_SIZE; my @cache; my $items_to_fetch = 1; return Bio::EnsEMBL::Utils::Iterator->new(sub{ if (@cache == 0 && $items_to_fetch) { # our cache is empty, and there are still items to fetch, so # fetch the next chunk of dbIDs and create objects from them my @dbIDs; my $item_count = 0; while( $sth->fetch ) { push @dbIDs, $var_id; if (++$item_count == $cache_size) { # we have fetched a cache's worth of dbIDs, so flag that # there are still items to fetch and last out of the loop $items_to_fetch = 1; last; } # if this is the last row, this flag will be 0 outside the loop $items_to_fetch = 0; } $sth->finish unless $items_to_fetch; @cache = @{ $self->fetch_all_by_dbID_list(\@dbIDs) } if @dbIDs; } return shift @cache; }); } =head2 fetch_by_dbID Arg [1] : int $dbID Example : $var = $var_adaptor->fetch_by_dbID(5526); Description: Retrieves a Variation object via its internal identifier. If no such variation exists undef is returned. Returntype : Bio::EnsEMBL::Variation::Variation Exceptions : throw if dbID arg is not defined Caller : general, IndividualAdaptor Status : Stable =cut sub fetch_by_dbID { my $self = shift; my $dbID = shift; # Now, it should be fine to just use the superclass method my $result = $self->SUPER::fetch_by_dbID($dbID); return $result; } sub _columns { my $self = shift; my @cols = ( "v.variation_id", "v.name AS v_name", "v.validation_status AS v_validation_status", "v.class_attrib_id AS v_class_attrib_id", "s1.name AS v_source_name", "s1.description AS v_source_description", "s1.url AS v_source_url", "s1.type AS v_source_type", "v.somatic AS v_somatic", "v.flipped AS v_flipped", "v.ancestral_allele AS v_ancestral_allele", "vs.moltype AS vs_moltype", "vs.name AS vs_name", "s2.name AS vs_source_name", "v.minor_allele", "v.minor_allele_freq", "v.minor_allele_count", "v.clinical_significance", "v.evidence" ); return @cols; } sub _tables { my $self = shift; my @tables = ( ['variation', 'v'], ['source', 's1'], ['variation_synonym', 'vs'], ['source', 's2'] ); # If we are constraining on population_id, add the allele table push(@tables,['allele', 'a']) if ($self->{'_constrain_population'}); # Add the failed_variation table if we are filtering on those push(@tables,['failed_variation', 'fv']) unless ($self->db->include_failed_variations()); return @tables; } sub _left_join { my $self = shift; my @left_join = ( ['variation_synonym', 'v.variation_id = vs.variation_id'], ['source s2', 'vs.source_id = s2.source_id'] ); # If we are filtering on failed variations, left join push(@left_join,['failed_variation', 'v.variation_id = fv.variation_id']) unless ($self->db->include_failed_variations()); return @left_join; } sub _default_where_clause { my $self = shift; my $constraint = qq{ s1.source_id = v.source_id }; # If we are constraining on population_id, we should have a constraint on the allele tables as well $constraint .= qq{ AND a.variation_id = v.variation_id } if ($self->{'_constrain_population'}); return $constraint; } =head2 fetch_by_name Arg [1] : string $name Arg [2] : string $source (optional) Example : $var = $var_adaptor->fetch_by_name('rs1453'); Description: Retrieves a variation object via its name Returntype : Bio::EnsEMBL::Variation::Variation Exceptions : throw if name argument is not defined Caller : general Status : Stable =cut sub fetch_by_name { my $self = shift; my $name = shift; my $source = shift; throw('name argument expected') if(!defined($name)); # hack fix for phencode names with '+' in getting encoded/decoded by the webcode $name =~ s/\s/\+/ if $self->db->species =~ /homo_sapiens/i; # Add a constraint on the name my $constraint = qq{v.name = ?}; # If a source is given, add a constraint on the source $constraint .= qq{ AND s1.name = ?} if ( defined $source ); # Bind the parameters $self->bind_param_generic_fetch($name,SQL_VARCHAR); $self->bind_param_generic_fetch($source,SQL_VARCHAR) if ( defined $source ); # Get the results from generic fetch method my $result = $self->generic_fetch($constraint); # We need to check the synonym table in case the name was not found in the variation table unless (scalar(@{$result})) { # Call the fetch by synonym method $result = wrap_array($self->fetch_by_synonym($name,$source)); } # If we still can't find any result and the name looks like an ssId, try fetching by subsnp_id instead unless (scalar(@{$result}) || $name !~ m/^ss\d+$/) { $result = wrap_array($self->fetch_by_subsnp_id($name)); } # Return the result return undef unless (scalar(@{$result})); return $result->[0]; } # alias for fetch_by_name sub fetch_by_stable_id { my $self = shift; return $self->fetch_by_name(@_); } # Internal method for getting the internal dbIDs for a list of names. Will also query the variation_synonym and allele (for subsnp_ids) tables sub _name_to_dbID { my $self = shift; my $name_list = shift; my $synonym = shift; my $subsnp = shift; $name_list = wrap_array($name_list); throw ("A list of names is required") unless (scalar(@{$name_list})); # Use a hash to store the name to dbID mapping my %dbIDs; # Determine which columns we are returning my $cols = ($synonym && $subsnp ? "CONCAT('ss',v.subsnp_id) AS name, v.variation_id" : "v.name, v.variation_id"); # Determine which table we are querying my $table = ($synonym ? ($subsnp ? 'allele' : 'variation_synonym') : 'variation'); # Statement to get the dbIDs from variation or variation_synonym my $stmt = qq{ SELECT $cols FROM $table v WHERE }; my $sth; # Work on batches of $batch_size; my $batch_size = 200; # Make a local copy of the list to work on my $local_list = [@{$name_list}]; while (scalar(@{$local_list})) { # Get the next batch and construct the constraint my @names = splice(@{$local_list},0,$batch_size); my $constraint = "('" . join("','",@names) . "')"; $constraint = ($synonym && $subsnp ? "v.subsnp_id" : "v.name") . qq{ IN $constraint}; # Prepare a statement $sth = $self->prepare($stmt . qq{ $constraint }); $sth->execute(); # Fetch the results and populate the hash my ($name,$dbID); $sth->bind_columns(\$name,\$dbID); while ($sth->fetch()) { $dbIDs{$name} = $dbID; } } # If we are querying variation and have unmapped names, also query variation_synonym and allele if (!$synonym || !$subsnp) { # Get the unmapped names my @unmapped = grep {!exists($dbIDs{$_})} @{$name_list}; # If we are going to query for subsnp_ids, get the names that look like ssIds and strip the ss prefix if ($synonym) { @unmapped = grep {$_ =~ s/^ss(\d+)$/$1/} @unmapped; } # Get the dbID mapping from the synonym table and add them to the hash if there are unmapped if (scalar(@unmapped)) { my $names = $self->_name_to_dbID(\@unmapped,1,$synonym); map {$dbIDs{$_} = $names->{$_}} keys(%{$names}); } } # Return a hashref with the name -> dbID mapping return \%dbIDs; } sub fetch_by_synonym { my $self = shift; my $name = shift; my $source = shift; # Do a query to get the current variation for the synonym and call a fetch method on this variation my $constraint = qq{vs.name = ?}; $constraint .= qq{ AND s.name = ?} if (defined($source)); # This statement will only return 1 row which is consistent with the behaviour of fetch_by_name. # However, the synonym name is only guaranteed to be unique in combination with the source my $stmt = qq{ SELECT vs.variation_id FROM variation_synonym vs JOIN source s ON ( s.source_id = vs.source_id ) WHERE $constraint LIMIT 1 }; my $sth = $self->prepare($stmt); $sth->bind_param(1,$name,SQL_VARCHAR); $sth->bind_param(2,$source,SQL_VARCHAR) if (defined($source)); $sth->execute(); # Bind the results my $dbID; $sth->bind_columns(\$dbID); # Fetch the results $sth->fetch(); # Return undef in case no data could be found return undef unless (defined($dbID)); # Call the fetch_by_name method using the updated name for the synonym return $self->fetch_by_dbID($dbID); } =head2 fetch_by_subsnp_id Arg [1] : string $subsnp_id Example : $var = $var_adaptor->fetch_by_subsnp_id('ss123'); Description: Retrieves a variation object via a component subsnp ID Returntype : Bio::EnsEMBL::Variation::Variation Exceptions : throw if name argument is not defined Caller : general Status : Stable =cut sub fetch_by_subsnp_id { my $self = shift; my $name = shift; throw('name argument expected') if(!defined($name)); # Strip away any ss prefix $name =~ s/^ss//gi; $self->{'_constrain_population'} = 1; # Add a constraint on the subsnp_id my $constraint = qq{a.subsnp_id = ?}; # Bind the parameters $self->bind_param_generic_fetch($name,SQL_INTEGER); # Get the results from generic fetch method my $result = $self->generic_fetch($constraint); delete($self->{'_constrain_population'}); # Return the result return undef unless (scalar(@{$result})); return $result->[0]; } =head2 fetch_all_by_source Arg [1] : string $source_name Arg [2] : int $primary Example : $var = $var_adaptor->fetch_all_by_source(); Description: Retrieves all Variation objects associated with a source. By default ($primary=0) returns variations that have the source or variation_synonym that have the source. If primary set to 1, it returns only variations where the primary name is associated with the source Returntype : listref of Bio::EnsEMBL::Variation::Variation Exceptions : thrown if source_name not provided Caller : general Status : Stable =cut sub fetch_all_by_source { my $self = shift; my $source_name = shift; my $primary = shift; throw('name argument expected') unless (defined($source_name)); # By default, returns ALL variation and variation_synonyms where source = $name $primary ||= 0; # Add the constraint on the source name. If primary is true, only the variation source will be queried, otherwise the variation_synonym source will be as well my $constraint = qq{s1.name = ?}; $constraint = qq{($constraint OR s2.name = ?)} unless ($primary); # If necessary, add the constraint for filtering failed variations $constraint .= qq{ AND } . $self->db->_exclude_failed_variations_constraint(); # Bind the source name parameter $self->bind_param_generic_fetch($source_name,SQL_VARCHAR); $self->bind_param_generic_fetch($source_name,SQL_VARCHAR); # Execute the superclass generic fetch return $self->generic_fetch($constraint); } =head2 fetch_all_by_source_type Arg [1] : string $source_type Arg [2] : int $primary Example : $var = $var_adaptor->fetch_all_by_source_type('chip'); Description: Retrieves all Variation objects associated with a type of source. By default ($primary=0) returns variations that have the source or variation_synonym that have the source. If primary set to 1, it returns only variations where the primary name is associated with the source Returntype : listref of Bio::EnsEMBL::Variation::Variation Exceptions : thrown if source_type not provided Caller : general Status : At risk =cut sub fetch_all_by_source_type { my $self = shift; my $source_type = shift; my $primary = shift; throw('name argument expected') unless (defined($source_type)); # Get the source names that match the type my $stmt = qq{ SELECT name FROM source WHERE type = ? }; my $sth = $self->prepare($stmt); $sth->bind_param(1,$source_type,SQL_VARCHAR); $sth->execute(); # Fetch the results from one source at a time. This should probably be implemented in a more efficient manner. my $name; $sth->bind_columns(\$name); # Store in a hash to avoid duplicates my %out; while ($sth->fetch()) { my $result = $self->fetch_all_by_source($name,$primary); next unless (scalar(@{$result})); # Add the fetched results to the total hash map {$out{$_->dbID()} = $_} @{$result}; } # Return the reference to the list of variations return [values(%out)]; } =head2 fetch_all_by_dbID_list Arg [1] : reference to list of ints $list Example : @vars = @{$va->fetch_all_by_dbID_list([124, 56, 90])}; Description: Retrieves a set of variations via their internal identifiers. This is faster than repeatedly calling fetch_by_dbID if there are a large number of variations to retrieve Returntype : reference to list of Bio::EnsEMBL::Variation::Variation objects Exceptions : throw on bad argument Caller : general, IndividualGenotypeAdaptor, PopulationGenotypeAdaptor Status : Stable =cut sub fetch_all_by_dbID_list { my $self = shift; my $list = shift; # Get an Iterator for the list my $iterator = $self->fetch_Iterator_by_dbID_list($list); # Get an arrayref representing the contents of the Iterator my $result = $iterator->to_arrayref(); return $result; } =head2 fetch_Iterator_by_dbID_list Arg [1] : reference to list of ints $list Example : $variation_iterator = $va->fetch_Iterator_by_dbID_list([124, 56, 90]); Description: Retrieves an iterator over a set of variations via their internal identifiers. Returntype : Bio::EnsEMBL::Utils::Iterator Exceptions : throw on bad argument Caller : general Status : Experimental =cut sub fetch_Iterator_by_dbID_list { my ($self, $dbid_list, $cache_size) = @_; unless ((defined $dbid_list) && (ref $dbid_list eq 'ARRAY')) { throw("list reference argument is required"); } $cache_size ||= $DEFAULT_ITERATOR_CACHE_SIZE; # create an iterator that fetches variations in blocks of # $cache_size and returns them in turn my @object_cache; return Bio::EnsEMBL::Utils::Iterator->new(sub { if (@object_cache == 0 && @$dbid_list > 0 ) { my @dbids = splice @$dbid_list, 0, $cache_size; # Create a constraint on the dbIDs my $id_str = "(" . join(",",@dbids) . ")"; my $constraint = qq{v.variation_id IN $id_str}; @object_cache = @{ $self->generic_fetch($constraint) }; } return shift @object_cache; } ); } =head2 fetch_all_by_name_list Arg [1] : reference to list of names $list Example : @vars = @{$va->fetch_all_by_name_list(["rs3", "rs1333049"])}; Description: Retrieves a set of variations via their names. This is faster than repeatedly calling fetch_by_name if there are a large number of variations to retrieve Returntype : reference to list of Bio::EnsEMBL::Variation::Variation objects Exceptions : throw on bad argument Caller : general Status : Stable =cut sub fetch_all_by_name_list { my $self = shift; my $list = shift; # Get a list of dbIDs for the names my $dbIDs = $self->_name_to_dbID($list); # Then fetch the variations by dbID list instead my @dbID_list = values(%{$dbIDs}); return $self->fetch_all_by_dbID_list(\@dbID_list); } =head2 get_all_sources Args : none Example : $sources = $va->get_all_sources(); Description : Retrieves from the database all sources in the Source table ReturnType : array ref of string Exceptions : none Caller : web Status : Stable =cut sub get_all_sources{ my $self = shift; my @sources; my $source_name; my $sth = $self->prepare(qq{SELECT name from source }); $sth->execute(); $sth->bind_columns(\$source_name); while ($sth->fetch()){ push @sources, $source_name } $sth->finish(); return \@sources; } =head2 get_default_source Args : none Example : $default_source = $va->get_default_source(); Description : Retrieves from the database the default source used for display purposes ReturnType : string Exceptions : none Caller : web Status : At Risk =cut sub get_default_source{ my $self = shift; my $source_name; my $sth = $self->prepare(qq{SELECT meta_value from meta where meta_key = ? }); $sth->bind_param(1,'source.default_source',SQL_VARCHAR); $sth->execute(); $sth->bind_columns(\$source_name); $sth->fetch(); $sth->finish(); return $source_name; } =head2 get_source_version Arg[1] : string $name Example : $version = $va->get_source_version('dbSNP'); Description : Retrieves from the database the version for the source given as an argument ReturnType : int Exceptions : none Caller : general Status : Stable =cut sub get_source_version{ my $self = shift; my $name = shift; my $version; my $sth = $self->prepare(qq{SELECT version from source where name = ? }); $sth->bind_param(1,$name,SQL_VARCHAR); $sth->execute(); $sth->bind_columns(\$version); $sth->fetch(); $sth->finish(); return $version; } =head2 get_flanking_sequence Arg[1] : int $variationID Example : $flankinq_sequence = $va->get_flanking_sequence('652'); Description : Retrieves the appropriate flanking sequence (five,three) for the variation position from the genomic reference sequence ReturnType : reference to a list containing (three_flank,five_flank) Exceptions : throw when not possible to obtain sequence Caller : general, Variation Status : Stable =cut sub get_flanking_sequence{ my $self = shift; my $variationID = shift; my $length = shift ||325; my $flanking_sequence; #reference to an array for the three_prime and five_prime seqs my ($seq_region_id, $seq_region_strand, $seq_region_start, $seq_region_end, $up_seq_region_end, $down_seq_region_start, $match ); ## variant may have multiple mappings, some perfect (alignment_quality = 1) others not (alignment_quality =0) my $sth = $self->prepare(qq{ SELECT seq_region_id, seq_region_start, seq_region_end, seq_region_strand, alignment_quality FROM variation_feature WHERE variation_id = ? ORDER BY alignment_quality DESC }); $sth->bind_param(1,$variationID,SQL_INTEGER); $sth->execute(); #retrieve the flank from the variation database $sth->bind_columns(\($seq_region_id, $seq_region_start, $seq_region_end, $seq_region_strand,$match )); $sth->fetch(); $sth->finish(); unless( $seq_region_id){ warn( "*****[ERROR]: No seq_region_id for SNP with dbID: $variationID. ". "Cannot retrieve flanking region******\n" ); return; } unless( $seq_region_start && $seq_region_end){ warn( "*****[ERROR]: No mapping for SNP with dbID: $variationID. ". "Cannot retrieve flanking region******\n" ); return; } if($seq_region_start > $seq_region_end){ ##insertion - re-order start and end coordinates ($up_seq_region_end,$down_seq_region_start) = ($seq_region_end, $seq_region_start ); } else{ ## deletion or substitution - increment start and end coordinates to avoid variant base(s) ($up_seq_region_end,$down_seq_region_start) = ($seq_region_start, $seq_region_end); $up_seq_region_end--; $down_seq_region_start++; } my $up_seq_region_start = $up_seq_region_end - $length; my $down_seq_region_end = $down_seq_region_start + $length; my $down_seq = $self->_get_flank_from_core( $seq_region_id, $down_seq_region_start, $down_seq_region_end, $seq_region_strand ); my $up_seq = $self->_get_flank_from_core( $seq_region_id, $up_seq_region_start, $up_seq_region_end, $seq_region_strand ); push @{$flanking_sequence},$down_seq,$up_seq; #add to the array the 3 and 5 prime sequences return $flanking_sequence; } =head2 fetch_all_by_Population Arg [1] : Bio::EnsEMBL::Variation::Population Arg [2] : $minimum_frequency (optional) Example : $pop = $pop_adaptor->fetch_by_dbID(1345); @vars = @{$va_adaptor->fetch_all_by_Population($pop)}; Description: Retrieves all variations which are stored for a specified population. If $minimum_frequency is supplied, only variations with a minor allele frequency (MAF) greater than $minimum_frequency will be returned. Returntype : listref of Bio::EnsEMBL::Variation::Variation Exceptions : throw on incorrect argument Caller : general Status : At Risk =cut sub fetch_all_by_Population { my $self = shift; my $pop = shift; my $freq = shift; assert_ref($pop,'Bio::EnsEMBL::Variation::Population'); if(!defined($pop->dbID())) { warning("Cannot retrieve genotypes for population without set dbID"); return []; } # Constraint the query using the population_id for the population my $constraint = qq{a.population_id = ?}; # adjust frequency if given a percentage if (defined($freq)) { $freq /= 100 if $freq > 1; $constraint .= qq{ AND (IF(a.frequency > 0.5, 1-a.frequency, a.frequency) > ?) }; } # Add the constraint for failed variations $constraint .= " AND " . $self->db->_exclude_failed_variations_constraint(); # Bind the parameters $self->bind_param_generic_fetch($pop->dbID(),SQL_INTEGER); $self->bind_param_generic_fetch($freq,SQL_DOUBLE) if (defined($freq)); # Set the flag to indicate that we are constraining on population and should not left join to allele $self->{'_constrain_population'} = 1; # Execute the generic fetch my $result = $self->generic_fetch($constraint); # Unset the flag delete($self->{'_constrain_population'}); return $result; } =head2 fetch_all_by_VariationSet Arg [1] : Bio::EnsEMBL::Variation::VariationSet Example : @vars = @{$va_adaptor->fetch_all_by_VariationSet($vs)}; Description: Retrieves all variations which are present in a specified variation set and its subsets. Returntype : listref of Bio::EnsEMBL::Variation::Variation Exceptions : throw on incorrect argument Caller : general Status : Stable =cut sub fetch_all_by_VariationSet { my $self = shift; return $self->_generic_fetch_by_VariationSet(0, @_); } =head2 fetch_Iterator_by_VariationSet Arg [1] : Bio::EnsEMBL::Variation::VariationSet Example : $var_iterator = $va_adaptor->fetch_Iterator_by_VariationSet($vs); Description: Retrieves an iterator for all variations which are present in a specified variation set and its subsets. Returntype : Bio::EnsEMBL::Utils::Iterator object Exceptions : throw on incorrect argument Caller : general Status : Experimental =cut sub fetch_Iterator_by_VariationSet { my $self = shift; my $set = shift; my $cache_size = shift || $DEFAULT_ITERATOR_CACHE_SIZE; # First, get ids for all subsets, my @var_set_ids = ($set->dbID); map {push(@var_set_ids,$_->dbID())} @{$set->adaptor->fetch_all_by_super_VariationSet($set)}; my $var_set_id = join(",",@var_set_ids); # Prepare a query for getting the span of variation_ids my $stmt = qq{ FROM variation_set_variation vsv LEFT JOIN failed_variation fv ON ( fv.variation_id = vsv.variation_id ) WHERE vsv.variation_set_id IN ($var_set_id) }; # Add the constraint for failed variations my $constraint = " AND " . $self->db->_exclude_failed_variations_constraint(); my $sth = $self->prepare(qq{SELECT MIN(vsv.variation_id), MAX(vsv.variation_id) $stmt $constraint}); $sth->execute(); my ($min_variation_id,$max_variation_id); $sth->bind_columns(\$min_variation_id,\$max_variation_id); $sth->fetch(); $max_variation_id ||= 0; $min_variation_id ||= 1; # Prepare a statement for getting the ids in a range $sth = $self->prepare(qq{SELECT vsv.variation_id $stmt AND vsv.variation_id BETWEEN ? AND ? $constraint}); # Internally, we keep an Iterator that works on the dbID span we're at my $iterator; return Bio::EnsEMBL::Utils::Iterator->new(sub { # If the iterator is empty, get a new chunk of dbIDs, unless we've fetched all dbIDs unless (defined($iterator) && $iterator->has_next() && $min_variation_id <= $max_variation_id) { ## check there are ids in the range to return my $count_sth = $self->prepare(qq{SELECT count(vsv.variation_id) $stmt AND vsv.variation_id BETWEEN ? AND ? $constraint}); my $done = 0; while( $min_variation_id < $max_variation_id && $done == 0 ){ $count_sth->execute($min_variation_id, $min_variation_id+$cache_size); my $count = $count_sth->fetchall_arrayref(); if ($count->[0]->[0] > 0){ $done =1; } else{ $min_variation_id += ($cache_size + 1); } } # Get the next chunk of dbIDs $sth->execute($min_variation_id,$min_variation_id+$cache_size); $min_variation_id += ($cache_size + 1); # Use a hash to keep track of the seen dbIDs my %seen; # Loop over the dbIDs and avoid duplicates my $dbID; my @dbIDs; $sth->bind_columns(\$dbID); while ($sth->fetch()) { push (@dbIDs,$dbID) unless ($seen{$dbID}++); } # Get a new Iterator based on the new dbID span $iterator = $self->fetch_Iterator_by_dbID_list(\@dbIDs); } return $iterator->next(); }); } sub _generic_fetch_by_VariationSet { my $self = shift; my $want_iterator = shift; my $set = shift; assert_ref($set,'Bio::EnsEMBL::Variation::VariationSet'); if(!defined($set->dbID())) { warning("Cannot retrieve variations for variation set without a dbID"); return []; } # Get the unique dbIDs for all variations in this set and all of its subsets my $dbid_list = $self->_fetch_all_dbIDs_by_VariationSet($set); my $num_vars = @$dbid_list; if ($num_vars > 100_000 && !$want_iterator) { warn "This set contains a large number ($num_vars) of variations, these may not fit". "into memory at once, considering using fetch_Iterator_by_VariationSet instead"; } # Use the dbIDs to get all variations and return them return $want_iterator ? $self->fetch_Iterator_by_dbID_list($dbid_list) : $self->fetch_all_by_dbID_list($dbid_list); } sub _fetch_all_dbIDs_by_VariationSet { my $self = shift; my $set = shift; # First, get ids for all subsets, my @var_set_ids = ($set->dbID); foreach my $var_set (@{$set->adaptor->fetch_all_by_super_VariationSet($set)}) { push @var_set_ids, $var_set->dbID; } my $set_str = "(" . join(",",@var_set_ids) .")"; # Add the constraint for failed variations my $constraint = " AND " . $self->db->_exclude_failed_variations_constraint(); # Then get the dbIDs for all these sets my $stmt = qq{ SELECT DISTINCT vsv.variation_id FROM variation_set_variation vsv LEFT JOIN failed_variation fv ON ( fv.variation_id = vsv.variation_id ) WHERE vsv.variation_set_id in $set_str $constraint }; my $sth = $self->prepare($stmt); $sth->execute(); my @result; my $dbID; $sth->bind_columns(\$dbID); while ($sth->fetch()) { push @result, $dbID; } return \@result; } =head2 fetch_all_by_publication Arg [1] : Bio::EnsEMBL::Variation::Publication Example : @vars = @{$va_adaptor->fetch_all_by_publication($pub)}; Description: Retrieves all variations which are cited in a specific publication Returntype : listref of Bio::EnsEMBL::Variation::Variation Exceptions : throw on incorrect argument Caller : general Status : At Risk =cut sub fetch_all_by_publication{ my $self = shift; my $pub_obj = shift; my @var; my $variation_id; my $sth = $self->prepare(qq{ SELECT variation_id from variation_citation where publication_id = ? }); $sth->execute( $pub_obj->dbID() ); $sth->bind_columns(\$variation_id); while ($sth->fetch()){ push @var, $self->fetch_by_dbID($variation_id); } $sth->finish; return \@var; } =head2 is_failed Description : DEPRECATED. The appropriate subroutine on the Variation/Allele object should be used instead. Exceptions : Thrown on invocation. Status : DEPRECATED =cut sub is_failed { my $self = shift; throw("The is_failed subroutine in VariationAdaptor is deprecated. Use the appropriate subroutine on the Variation/Allele object instead"); } =head2 has_failed_subsnps Description : DEPRECATED. Use has_failed_alleles on the Variation object instead. Exceptions : Thrown on invocation. Status : DEPRECATED =cut sub has_failed_subsnps { my $self = shift; throw("The has_failed_subsnps subroutine in VariationAdaptor is deprecated. Use has_failed_alleles on the Variation object instead"); } =head2 get_all_failed_descriptions Arg[1] : Bio::EnsEMBL::Variation::Variation $variation The variation object to get the failed descriptions for Example : my $failed_descriptions = $adaptor->get_all_failed_descriptions($var); if (scalar(@{$failed_descriptions})) { print "The variation '" . $var->name() . "' has been flagged as failed because '" . join("' and '",@{$failed_descriptions}) . "'\n"; } Description : Gets the unique descriptions for the reasons why the supplied variation has failed. ReturnType : reference to a list of strings Exceptions : none Caller : general Status : Stable =cut sub get_all_failed_descriptions { my $self = shift; my $variation = shift; # Call the internal get method without any constraints my $description = $self->_internal_get_failed_descriptions($variation) || []; return $description; } =head2 get_failed_description Description : DEPRECATED. The appropriate subroutine on the Variation/Allele object should be used instead. Exceptions : Thrown on invocation. Status : DEPRECATED =cut sub get_failed_description { my $self = shift; throw("The get_failed_description subroutine in VariationAdaptor is deprecated. Use the appropriate subroutine on the Variation/Allele object instead"); } # API-internal method for getting failed descriptions for a variation sub _internal_get_failed_descriptions { my $self = shift; my $variation = shift; my $constraint = shift; # Assert that the object passed is a Variation assert_ref($variation,'Bio::EnsEMBL::Variation::Variation'); my $stmt = qq{ SELECT DISTINCT fd.description FROM failed_variation fv JOIN failed_description fd ON ( fd.failed_description_id = fv.failed_description_id ) WHERE fv.variation_id = ? }; $stmt .= qq{ AND $constraint } if (defined($constraint)); my $sth = $self->prepare($stmt); $sth->execute($variation->dbID()); return [map {$_->[0]} @{$sth->fetchall_arrayref([0])}]; } sub _get_flank_from_core{ my $self = shift; my $seq_region_id = shift; my $seq_region_start = shift; my $seq_region_end = shift; my $seq_region_strand = shift; my $flanking_sequence; if (defined $self->db()->dnadb()){ my $slice_adaptor = $self->db()->dnadb()->get_SliceAdaptor(); my $slice = $slice_adaptor->fetch_by_seq_region_id($seq_region_id); if (!$slice){ throw("Not possible to obtain slice for seq_region_id \"$seq_region_id\"\n"); } my $flank = $slice->subseq($seq_region_start,$seq_region_end,$seq_region_strand); return $slice->subseq($seq_region_start,$seq_region_end,$seq_region_strand); } return ''; } sub _objs_from_sth { my ($self, $sth) = @_; my %row; # Create the row hash using column names as keys $sth->bind_columns( \( @row{ @{$sth->{NAME_lc} } } )); while ($sth->fetch) { # we don't actually store the returned object because # the _obj_from_row method stores them in a temporary # hash _temp_objs in $self $self->_obj_from_row(\%row); } # Get the created objects from the temporary hash my @objs = values %{ $self->{_temp_objs} }; delete $self->{_temp_objs}; # If the flag for attaching allele objects was set, trigger loading of them by calling the getter method on the variation object if ($self->{_load_alleles}) { map {$_->get_all_Alleles()} @objs; } # Return the created objects return \@objs; } sub _obj_from_row { my ($self, $row) = @_; return undef unless $row->{variation_id}; # If the variation for this variation_id hasn't already been created, do that my $obj = $self->{_temp_objs}{$row->{variation_id}}; unless (defined($obj)) { # Get the validation status my @states; if (defined($row->{v_validation_status})) { @states = split(/,/,$row->{v_validation_status}); } my @evidence; if (defined($row->{evidence})) { @evidence = split(/,/,$row->{evidence}); } my @clin_sig; if (defined($row->{clinical_significance})) { @clin_sig = split(/,/,$row->{clinical_significance}); } # Create the variation object $obj = Bio::EnsEMBL::Variation::Variation->new( -dbID => $row->{variation_id}, -ADAPTOR => $self, -NAME => $row->{v_name}, -SOURCE => $row->{v_source_name}, -SOURCE_DESCRIPTION => $row->{v_source_description}, -SOURCE_URL => $row->{v_source_url}, -SOURCE_TYPE => $row->{v_source_type}, -IS_SOMATIC => $row->{v_somatic}, -FLIPPED => $row->{v_flipped}, -ANCESTRAL_ALLELE => $row->{v_ancestral_allele}, -MOLTYPE => $row->{vs_moltype}, -VALIDATION_STATES => \@states, -FLANK_FLAG => $row->{fs_flank_flag}, -CLASS_SO_TERM => $self->AttributeAdaptor()->attrib_value_for_id($row->{v_class_attrib_id}), -CLINICAL_SIGNIFICANCE => \@clin_sig, -MINOR_ALLELE => $row->{minor_allele}, -MINOR_ALLELE_FREQUENCY => $row->{minor_allele_freq}, -MINOR_ALLELE_COUNT => $row->{minor_allele_count}, -EVIDENCE => \@evidence, ); $self->{_temp_objs}{$row->{variation_id}} = $obj; } # Add a synonym if available if (defined($row->{vs_source_name}) && defined($row->{vs_name})) { $obj->add_synonym($row->{vs_source_name},$row->{vs_name}); } } =head2 load_alleles Arg[1] : boolean $load A flag determining whether to load alleles at the same time as Variations are created (1) or to lazy-load them (0 = default) Example : # Tell the VariationAdaptor to load alleles when creating variations my $variation_adaptor->load_alleles(1); my $variation = $variation_adaptor->fetch_by_dbID(1); Description : Sets the behaviour when it comes to fetching alleles at the time of object creation or on demand. This setting will be in effect for all fetch methods for the lifespan of the adaptor or until reset. ReturnType : none Exceptions : none Caller : general Status : At Risk =cut sub load_alleles { my $self = shift; $self->{_load_alleles} = defined(shift); } 1;