# # EnsEMBL module for Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor # =head1 LICENSE Copyright (c) 1999-2013 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at . Questions may also be sent to the Ensembl help desk at . =head1 NAME Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor - A Base class for all Funcgen FeatureAdaptors =head1 SYNOPSIS Abstract class - should not be instantiated. Implementation of abstract methods must be performed by subclasses. =head1 DESCRIPTION This is a base adaptor for Funcgen feature adaptors. This base class is simply a way to redefine some methods to use with the Funcgen DB. =cut package Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor; use vars qw(@ISA @EXPORT); use strict; use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; use Bio::EnsEMBL::Utils::Cache; use Bio::EnsEMBL::Utils::Exception qw(warning throw deprecate stack_trace_dump); use Bio::EnsEMBL::Utils::Argument qw(rearrange); #@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); #re-ordered this as core BaseAdaptor methods were over-riding Funcgen BaseAdaptor methods @ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); ### To do ### #Add Translation method for fetch by FeatureSets methods? #Test and implement feature mapping between coord systems #Correct/Document methods!!! #Implement externaldb db_name version registry my %warnings; =head2 generic_fetch Arg [1] : (optional) string $constraint An SQL query constraint (i.e. part of the WHERE clause) Arg [2] : (optional) Bio::EnsEMBL::AssemblyMapper $mapper A mapper object used to remap features as they are retrieved from the database Arg [3] : (optional) Bio::EnsEMBL::Slice $slice A slice that features should be remapped to Example : $fts = $a->generic_fetch('contig_id in (1234, 1235)', 'Swall'); Description: Wrapper method for core BaseAdaptor, build seq_region cache for features Returntype : ARRAYREF of Bio::EnsEMBL::SeqFeature in contig coordinates Exceptions : none Caller : FeatureAdaptor classes Status : at risk =cut sub generic_fetch { my $self = shift; #need to wrap _generic_fetch to always generate the #seq_region_cache other wise non-slice based fetch methods fail #build seq_region cache here once for entire query #Using default schema_build here #So would need to set dnadb appropriately #This is cleaning tmp cache values such that #nested feature fetches cause failure e.g. when regfeats retrieve their reg_attrs #We need a way of always generating the tmp cache, or having it persist? #This is because we haven't built the tmp cache in non_slice based methods i.e. we haven't run get_seq_region_id_by_Slice #This is the same for all non-Slice based methods #And the is no way around it as we are not providing that info about the new DB by passing a slice! #The only way to get around this is to make the tmp_cache persistant $self->build_seq_region_cache(); return $self->SUPER::generic_fetch(@_); } =head2 fetch_all_by_Slice_constraint Arg [1] : Bio::EnsEMBL::Slice $slice the slice from which to obtain features Arg [2] : (optional) string $constraint An SQL query constraint (i.e. part of the WHERE clause) Arg [3] : (optional) string $logic_name the logic name of the type of features to obtain Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5'); Description: Returns a listref of features created from the database which are on the Slice defined by $slice and fulfill the SQL constraint defined by $constraint. If logic name is defined, only features with an analysis of type $logic_name will be returned. Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates Exceptions : thrown if $slice is not defined Caller : Bio::EnsEMBL::Slice Status : Stable =cut sub fetch_all_by_Slice_constraint { my($self, $slice, $constraint, $logic_name) = @_; my @result; if (! (ref($slice) && $slice->isa('Bio::EnsEMBL::Slice'))) { throw('Bio::EnsEMBL::Slice argument expected.'); } $constraint ||= ''; my $fg_cs = $self->db->get_FGCoordSystemAdaptor->fetch_by_name ( $slice->coord_system->name(), $slice->coord_system->version() ); if (! defined $fg_cs) { warn "No CoordSystem present for ".$slice->coord_system->name().":".$slice->coord_system->version(); return \@result; } #build seq_region cache here once for entire query $self->build_seq_region_cache($slice); my $syn = $self->_main_table->[1]; $constraint = $self->_logic_name_to_constraint($constraint, $logic_name); #if the logic name was invalid, undef was returned return [] if(!defined($constraint)); #check the cache and return if we have already done this query #Added schema_build to feature cache for EFG my $key; my $cache; if ( !$self->db->no_cache() ) { my $bind_params = $self->bind_param_generic_fetch(); $key = uc(join(':', $slice->name, $constraint, $self->db->_get_schema_build($slice->adaptor->db()))); if ( defined($bind_params) ) { $key .= ':' . join( ':', map { $_->[0] . '/' . $_->[1] } @{$bind_params} ); } $cache = $self->_slice_feature_cache(); if (exists($cache->{$key})) { $self->{'_bind_param_generic_fetch'} = (); return $cache->{$key}; } } my $sa = $slice->adaptor(); # Hap/PAR support: retrieve normalized 'non-symlinked' slices my @proj = @{$sa->fetch_normalized_slice_projection($slice)}; if (@proj == 0) { throw('Could not retrieve normalized Slices. Database contains ' . 'incorrect assembly_exception information.'); } # Want to get features on the FULL original slice # as well as any symlinked slices # Filter out partial slices from projection that are on # same seq_region as original slice my $sr_id = $slice->get_seq_region_id(); @proj = grep { $_->to_Slice->get_seq_region_id() != $sr_id } @proj; my $segment = bless([1,$slice->length(),$slice ], 'Bio::EnsEMBL::ProjectionSegment'); push( @proj, $segment ); # construct list of Hap/PAR boundaries for entire seq region my @bounds; my $ent_slice = $sa->fetch_by_seq_region_id($sr_id); $ent_slice = $ent_slice->invert() if($slice->strand == -1); my @ent_proj = @{$sa->fetch_normalized_slice_projection($ent_slice)}; shift @ent_proj; # skip first @bounds = map {$_->from_start - $slice->start() + 1} @ent_proj; # fetch features for the primary slice AND all symlinked slices foreach my $seg (@proj) { my $offset = $seg->from_start(); my $seg_slice = $seg->to_Slice(); my $features = $self->_slice_fetch($seg_slice, $constraint); ## NO RESULTS? This is a problem with the cs->equals method? # if this was a symlinked slice offset the feature coordinates as needed if ($seg_slice->name() ne $slice->name()) { FEATURE: foreach my $f (@$features) { if ($offset != 1) { $f->{start} += $offset-1; $f->{end} += $offset-1; } # discard boundary crossing features from symlinked regions foreach my $bound (@bounds) { if ($f->{'start'} < $bound && $f->{'end'} >= $bound) { next FEATURE; } } $f->{'slice'} = $slice; push @result, $f; } } else { push @result, @$features; } } if (defined $key) { $cache->{$key} = \@result; } return \@result; } =head2 build_seq_region_cache Arg [1] : optional - Bio::EnsEMBL::Slice the slice from which to obtain features Example : $self->build_seq_region_cache(); Description: Builds the seq_region_id caches based on the species and schema_build Returntype : None Exceptions : thrown if optional Slice argument is not valid Caller : self Status : At risk - should be private _build_seq_region_cache? Change arg to DBAdaptor? Or remove if we are building the full cache? =cut #do we even need to have the coord system? #so long as we are only using one schema build i.e. one dnadb defualt = current #slice and fg_cs are optional #need to look at this sub build_seq_region_cache{ my ($self, $slice) = @_; #instead of building this for each query #we need to store the current cached coord_system #and test whether it contains the query coord_system #No as this then restricts us to one coord_system #where as we want to cover all from one DB/schema_build if (defined $slice) { throw('Optional argument must be a Bio::EnsEMBL::Slice') if(! ( ref($slice) && $slice->isa('Bio::EnsEMBL::Slice'))); } my $dnadb = (defined $slice) ? $slice->adaptor->db() : $self->db->dnadb(); my $schema_build = $self->db->_get_schema_build($dnadb); my $sql = 'select sr.core_seq_region_id, sr.seq_region_id from seq_region sr'; my @args = ($schema_build); if ($self->is_multispecies()) { $sql.= ', coord_system cs where sr.coord_system_id = cs.coord_system_id and cs.species_id=? and'; unshift(@args, $self->species_id()); } else { $sql.= ' where'; } $sql.=' sr.schema_build =?'; #Check we have not already got the right cache my $cache_key = join(':', @args); #Do we already have a valid cache? return if($self->cache_key eq $cache_key); #Can't maintain these caches as we may be adding to them when storing #we should just maintain the cache with the cache_key #$self->{'seq_region_cache'}{$cache_key} ||= {}; #$self->{'core_seq_region_cache'} = {}; my $sth = $self->prepare($sql); $sth->execute(@args); while (my $ref = $sth->fetchrow_arrayref()) { $self->{seq_region_cache}->{$cache_key}->{$ref->[0]} = $ref->[1]; $self->{core_seq_region_cache}->{$cache_key}->{$ref->[1]} = $ref->[0]; } $sth->finish(); $self->cache_key($cache_key); return; } =head2 cache_key Arg [1] : optional string - species_id.schema_build e.g. 1.55_37 the slice from which to obtain features Example : $self->build_seq_region_cache(); Description: Getter/Setter for the seq_region cache_key Returntype : string Exceptions : None Caller : self Status : At risk =cut sub cache_key{ my ($self, $key) = @_; $self->{'cache_key'} = $key if $key; return $self->{'cache_key'} || ''; } #Need to separate these into by slice and by cs? #We could just use an old cs/schema_build to grab the correct slice based on the name #however we want to at least warn that the db needs updating #The problem is that we want to detect whether the seq_region_id is present for _pre_store #but automatically use a comparable seq_region for the normal fetch_methods. #So we need to split or add $? sub get_seq_region_id_by_Slice{ my ($self, $slice, $fg_cs, $test_present) = @_; if (! ($slice && ref($slice) && $slice->isa("Bio::EnsEMBL::Slice"))) { throw('You must provide a valid Bio::EnsEMBL::Slice'); } #We really need to validate the schema_build of the slice to make sure it #present in the current default coord_system i.e. the one which was used to #generate the seq_region_cache #This is set with the dnadb or with a slice query #This may not always have been done. #Now all we have to do is test the cache_key #Or can we just build_seq_region_cache as this checks and rebuilds if not correct #This may generate a mistmach between the dnadb and the schema_build used to generate the cache #This will be reset if required for subesquent queries using the cache key $self->build_seq_region_cache($slice); my ($core_sr_id, $fg_sr_id); #Slice should always have an adaptor, no? if ( $slice->adaptor() ) { $core_sr_id = $slice->adaptor()->get_seq_region_id($slice); } else { $core_sr_id = $self->db->dnadb->get_SliceAdaptor()->get_seq_region_id($slice); } #This does not work!! When updating for a new schema_build we get the first #seq_region stored, than for each subsequent one, it arbitrarily assigns a value from the hash even tho the #the exists condition isn't met! #my $fg_sr_id = $self->{'seq_region_cache'}{$core_sr_id} if exists $self->{'seq_region_cache'}{$core_sr_id}; #Can't replicate this using a normal hash #This cache has been built based on the schema_build if (exists $self->{'seq_region_cache'}{$self->cache_key}{$core_sr_id}) { $fg_sr_id = $self->{'seq_region_cache'}{$self->cache_key}{$core_sr_id}; } if (! $fg_sr_id && ref($fg_cs)) { #This is used to store new seq_region info along side previous stored seq_regions of the same version if ( ! $fg_cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem')) { throw('Must pass as valid Bio::EnsEMBL::Funcgen:CoordSystem to retrieve seq_region_ids for forwards compatibility, passed '.$fg_cs); } my $sql = 'select seq_region_id from seq_region where coord_system_id =? and name =?'; my $sth = $self->prepare($sql); $sth->execute($fg_cs->dbID(), $slice->seq_region_name()); #This may not exist, so we need to catch it here? ($fg_sr_id) = $sth->fetchrow_array(); $sth->finish(); #if we are providing forward compatability #Then we know the eFG DB doesn't have the core seq_region_ids in the DB #Hence retrieving the slice will fail in _obj_from_sth #So we need to set it internally here #Then pick it up when get_core_seq_region_id is called for the first time(from where?) #and populate the cache with the value #This only works if there is a comparable slice #If we are dealing with a new assembly, then no $fg_sr_id will be returned #So need to catch this in the caller #Can we remove this now the cache is persistant? #i.e. Cache is not regenerated everytime, hence we don't lose the data? if ($fg_sr_id) { $self->{'_tmp_core_seq_region_cache'}{$self->cache_key} = {( $fg_sr_id => $core_sr_id )}; } } elsif (! $fg_sr_id && ! $test_present) { #This generally happens when using a new core db with a efg db that hasn't been updated #Default to name match or throw if not present in DB my $schema_build = $self->db->_get_schema_build($slice->adaptor->db()); my $core_cs = $slice->coord_system; #Avoids mapping of core to efg seq_region_ids #via schema_build(of the new core db) as we are matching directly to the seq_name my $sql = 'select distinct(seq_region_id) from seq_region sr, coord_system cs where sr.coord_system_id=cs.coord_system_id and sr.name=? and cs.name =?'; my @args = ($slice->seq_region_name(), $core_cs->name()); if ($core_cs->version()) { $sql.= ' and cs.version =?'; push(@args, $core_cs->version()); } if ($self->is_multispecies()) { $sql.=' and cs.species_id=?'; push(@args, $self->species_id()); } my $sth = $self->prepare($sql); $sth->execute(@args); ($fg_sr_id) = $sth->fetchrow_array(); $sth->finish(); if (! $fg_sr_id) { #Warn instead of throw so we can catch absent seq_region without eval #warn('Cannot find previously stored seq_region for: '.$core_cs->name.':'.$core_cs->version.':'.$slice->seq_region_name. # "\nYou need to update your eFG seq_regions to match your core DB using: update_DB_for_release.pl\n"); } #Only warn first time this is seen my $warning_key = $core_cs->name.':'.$core_cs->version.':'.$slice->seq_region_name; if (! exists $warnings{$warning_key}) { warn 'Defaulting to previously store seq_region for: '.$warning_key. "\nYou need to update your eFG seq_regions to match your core DB using: update_DB_for_release.pl\n"; $warnings{$warning_key} = 1; } } return $fg_sr_id; } sub get_core_seq_region_id{ my ($self, $fg_sr_id) = @_; #This is based on what the current schema_build is #to enable multi-schema/assmelby look up #we will have to nest these in schema_build caches #and use the schema_build of the slice which is passed to acquire the core_seq_region_id #likewise for reverse, i.e. when we store. my $core_sr_id = $self->{'core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}; if (! defined $core_sr_id && exists $self->{'_tmp_core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}) { #Do we need to test the cache_key here, might it have changed since get_seq_region_id_by_Slice? #Or will build_seq_region_cache handle this? #Why do we reset this in the main cache here if we are returning the value #Is this not corrupting the main cache? But we always want this value? $self->{'core_seq_region_cache'}{$self->cache_key}{$fg_sr_id} = $self->{'_tmp_core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}; #Delete here so we don't have schema_build specific sr_ids hanging around for another query? #delete $self->{'_tmp_core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}; #These are valid values and would most likely be required for subsequent queries. #Cache key is now also schema_build specific so this is no longer a problem #Removed this as this causes RegFeat retrieval to fail #As all non slice based methods would fail due to the lack of info about unstored DB i.e. we need a slice $core_sr_id = $self->{'core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}; } elsif (! defined $core_sr_id) { #No cache entry, so get from dnadb my $sql = "select distinct(name) from seq_region where seq_region_id=$fg_sr_id"; my $slice_name = $self->db->dbc->db_handle->selectall_arrayref($sql); ($slice_name) = @{$slice_name->[0]}; #if(scalar(@names) != 1){#This should never happen #We should really grab the coord sys name above too. my $slice_adaptor = $self->db->dnadb->get_SliceAdaptor; $core_sr_id = $slice_adaptor->get_seq_region_id($slice_adaptor->fetch_by_region(undef, $slice_name)); #Set this in the cache so we don't have to retrieve it again $self->{'core_seq_region_cache'}{$self->cache_key}{$fg_sr_id} = $core_sr_id; } return $core_sr_id; } =head2 _pre_store Arg [1] : Bio::EnsEMBL::Feature Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5'); Description: Helper function containing some common feature storing functionality Given a Feature this will return a copy (or the same feature if no changes to the feature are needed) of the feature which is relative to the start of the seq_region it is on. The seq_region_id of the seq_region it is on is also returned. This method will also ensure that the database knows which coordinate systems that this feature is stored in. This supercedes the core method, to trust the slice the feature has been generated on i.e. from the dnadb. Also handles multi-coordsys aspect, generating new coord_system_ids as appropriate and assembly projection. Returntype : Bio::EnsEMBL::Feature and the seq_region_id it is mapped to Exceptions : thrown if $slice is not defined Caller : Bio::EnsEMBL::"Type"FeatureAdaptors Status : At risk =cut sub _pre_store { my ($self, $feature, $new_assembly) = @_; #May want to add cs_level arg? #What about ignore length flag? if (!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { throw('Expected Feature argument.'); } $self->_check_start_end_strand($feature->start(),$feature->end(), $feature->strand()); my $db = $self->db(); my $slice = $feature->slice(); if (!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { throw('Feature must be attached to Slice to be stored.'); } # make sure feature coords are relative to start of entire seq_region if ($slice->start != 1 || $slice->strand != 1) { #throw("You must generate your feature on a slice starting at 1 with strand 1"); #We did remove this transfer it uses direct hash access which #did not work with old array based ResultFeatures #move feature onto a slice of the entire seq_region $slice = $slice->adaptor->fetch_by_region($slice->coord_system->name(), $slice->seq_region_name(), undef, #start undef, #end undef, #strand $slice->coord_system->version()); $feature = $feature->transfer($slice); if (!$feature) { throw('Could not transfer Feature to slice of ' . 'entire seq_region prior to storing'); } } #Project here before we start building sr caches and storing CSs if ($new_assembly) { #warn "Projecting ".$feature->start.'-'.$feature->end." on "..$feature->slice->name." to $new_assembly"; my @segments = @{$feature->feature_Slice->project($slice->coord_system->name, $new_assembly)}; # do some sanity checks on the projection results: # discard the projected feature if # 1. it doesn't project at all (no segments returned) # 2. the projection is fragmented (more than one segment) # 3. the projection doesn't have the same length as the original # feature # this tests for (1) and (2) if (scalar(@segments) == 0) { warn "Feature doesn't project to $new_assembly\n"; return; } elsif (scalar(@segments) > 1) { warn "Feature projection is fragmented in $new_assembly\n"; return; } # test (3) my $proj_slice = $segments[0]->to_Slice; if ($feature->length != $proj_slice->length) { #if(! $conf->param('ignore_length')){ warn "Feature projection is wrong length in $new_assembly\n"; return; # } } #warn "proj ".$proj_slice->name; # everything looks fine, so adjust the coords of your feature #Have to generate new_slice here as we are not sure it is going to be #on the same slice as the old assembly $slice = $proj_slice->adaptor->fetch_by_region($proj_slice->coord_system->name, $proj_slice->seq_region_name); #These are just callers for ResultFeature! #For speed. $feature->start($proj_slice->start); $feature->end($proj_slice->end); $feature->slice($slice); #warn "new feature ".$feature->feature_Slice->name; } # Ensure this type of feature is known to be stored in this coord system. my $cs = $slice->coord_system; #from core/dnadb #retrieve corresponding Funcgen coord_system and set id in feature my $csa = $self->db->get_FGCoordSystemAdaptor(); #had to call it FG as we were getting the core adaptor my $fg_cs = $csa->validate_and_store_coord_system($cs); $fg_cs = $csa->fetch_by_name($cs->name(), $cs->version()); #Why are we refetching this? my $tabname = $self->_main_table->[0]; #Need to do this for Funcgen DB my $mcc = $db->get_MetaCoordContainer(); $mcc->add_feature_type($fg_cs, $tabname, $feature->length); #build seq_region cache here once for entire query $self->build_seq_region_cache($slice); #Now need to check whether seq_region is already stored #1 is test present flag my $seq_region_id = $self->get_seq_region_id_by_Slice($slice, undef, 1); if (! $seq_region_id) { #check whether we have an equivalent seq_region_id $seq_region_id = $self->get_seq_region_id_by_Slice($slice, $fg_cs); my $schema_build = $self->db->_get_schema_build($slice->adaptor->db()); my $sql; my $core_sr_id = $slice->get_seq_region_id; my @args = ($slice->seq_region_name(), $fg_cs->dbID(), $core_sr_id, $schema_build); #Add to comparable seq_region if ($seq_region_id) { $sql = 'insert into seq_region(seq_region_id, name, coord_system_id, core_seq_region_id, schema_build) values (?,?,?,?,?)'; unshift(@args, $seq_region_id); } #No compararble seq_region else { $sql = 'insert into seq_region(name, coord_system_id, core_seq_region_id, schema_build) values (?,?,?,?)'; } my $sth = $self->prepare($sql); #Need to eval this eval{$sth->execute(@args);}; if (!$@) { $seq_region_id = $sth->{'mysql_insertid'}; } #Now we need to add this to the seq_region caches #As we are not regenerating them every time we query. $self->{seq_region_cache}{$self->cache_key}{$core_sr_id} = $seq_region_id; $self->{core_seq_region_cache}{$self->cache_key}{$seq_region_id} = $core_sr_id; } #Need to return seq_region_id as they are not stored #in the slice retrieved from slice adaptor return ($feature, $seq_region_id); } #This is causing problems with remapping #Need to merge this back into the core code, subbing out the relevant parts #required for funcgen support sub _slice_fetch { my $self = shift; my $slice = shift; my $orig_constraint = shift; my $slice_start = $slice->start(); my $slice_end = $slice->end(); my $slice_strand = $slice->strand(); my $slice_cs = $slice->coord_system(); #my $slice_seq_region = $slice->seq_region_name(); #We Need to translate the seq_regions IDs to efg seq_region_ids #we need to fetch the seq_region ID based on the coord_system id and the name #we don't want to poulate with the eFG seq_region_id, jsut the core one, as we need to maintain core info in the slice. #we need to cache the seq_region_id mappings for the coord_system and schema_build, #so we don't do it for every feature in a slice #should we just reset it in temporarily in fetch above, other wise we lose cache between projections #should we add seq_region cache to coord_system/adaptor? #Then we can access it from here and never change the slice seq_region_id #how are we accessing the eFG CS? As we only have the core CS from the slice here? #No point in having in eFG CS if we're just having to recreate it for everyquery #CSA already has all CS's cached, so query overhead would be mininal #But cache overhead for all CS seq_region_id caches could be quite large. #we're going to need it in the _pre_store, so having it in the CS would be natural there #can we just cache dynamically instead? #we're still calling for csa rather than accessing from cache #could we cache the csa in the base feature adaptor? #my $slice_seq_region_id = $slice->get_seq_region_id(); #get the synonym and name of the primary_table my ($tab_name, $tab_syn) = @{$self->_main_table}; #find out what coordinate systems the features are in my $mcc = $self->db->get_MetaCoordContainer(); my @feat_css=(); my $mca = $self->db->get_MetaContainer(); my $value_list = $mca->list_value_by_key( $tab_name."build.level" ); if ( @$value_list and $slice->is_toplevel()) { push @feat_css, $slice_cs; } else { @feat_css = @{$mcc->fetch_all_CoordSystems_by_feature_type($tab_name)}; } my $asma = $self->db->get_AssemblyMapperAdaptor(); my @features; # fetch the features from each coordinate system they are stored in COORD_SYSTEM: foreach my $feat_cs (@feat_css) { my $mapper; my @coords; my @ids; #This is not returning true if the feat_cs is no default # if ($feat_cs->equals($slice_cs)) { # no mapping is required if this is the same coord system #eFG change #We want to set this to undef if we are dealing with result_features which #are not wsize==0! #This is only required if we load array data at the natural resolution! #This test will effect ever single feature query, can we omit this somehow? my $max_len; if (! (($self->can('_window_size') && $self->_window_size))) { $max_len = $self->_max_feature_length() || $mcc->fetch_max_length_by_CoordSystem_feature_type($feat_cs,$tab_name); } my $constraint = $orig_constraint; my $sr_id = $self->get_seq_region_id_by_Slice($slice, $feat_cs); #If we have no slice id here, we know we don't have anything in the DB for this slice return [] if ! $sr_id; #Only add the slice clause if we are not forcing reslice #force_reslice means we have dbID in the query for reg attrs #which maybe of the end of the query slice, but we still need them for #building reg features which overlap the end of the query slice if (! $self->force_reslice) { $constraint .= " AND " if($constraint); $constraint .= "${tab_syn}.seq_region_id = $sr_id AND " . "${tab_syn}.seq_region_start <= $slice_end AND " . "${tab_syn}.seq_region_end >= $slice_start"; if ($max_len) { my $min_start = $slice_start - $max_len; $constraint .= " AND ${tab_syn}.seq_region_start >= $min_start"; } } my $fs = $self->generic_fetch($constraint,undef,$slice); # features may still have to have coordinates made relative to slice # start $fs = $self->_remap($fs, $mapper, $slice); push @features, @$fs; } else { #Table contains some feature on a CS that differs from the Slice CS #can't do CS remapping yet as AssemblyMapper expects a core CS #change AssemblyMapper? #or do we just create a core CS just for the remap and convert back when done? # $mapper = $asma->fetch_by_CoordSystems($slice_cs, $feat_cs); # next unless defined $mapper; # Get list of coordinates and corresponding internal ids for # regions the slice spans # @coords = $mapper->map($slice_seq_region, $slice_start, $slice_end, # $slice_strand, $slice_cs); # @coords = grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @coords; # next COORD_SYSTEM if(!@coords); # @ids = map {$_->id()} @coords; ##coords are now id rather than name ## @ids = @{$asma->seq_regions_to_ids($feat_cs, \@ids)}; # When regions are large and only partially spanned by slice # it is faster to to limit the query with start and end constraints. # Take simple approach: use regional constraints if there are less # than a specific number of regions covered. # if(@coords > $MAX_SPLIT_QUERY_SEQ_REGIONS) { # my $constraint = $orig_constraint; # my $id_str = join(',', @ids); # $constraint .= " AND " if($constraint); # $constraint .= "${tab_syn}.seq_region_id IN ($id_str)"; # my $fs = $self->generic_fetch($constraint, $mapper, $slice); # $fs = _remap($fs, $mapper, $slice); # push @features, @$fs; # } else { # do multiple split queries using start / end constraints # my $max_len = $self->_max_feature_length() || # $mcc->fetch_max_length_by_CoordSystem_feature_type($feat_cs, # $tab_name); # my $len = @coords; # for(my $i = 0; $i < $len; $i++) { # my $constraint = $orig_constraint; # $constraint .= " AND " if($constraint); # $constraint .= # "${tab_syn}.seq_region_id = " . $ids[$i] . " AND " . # "${tab_syn}.seq_region_start <= " . $coords[$i]->end() . " AND ". # "${tab_syn}.seq_region_end >= " . $coords[$i]->start(); # if($max_len) { # my $min_start = $coords[$i]->start() - $max_len; # $constraint .= # " AND ${tab_syn}.seq_region_start >= $min_start"; # } # my $fs = $self->generic_fetch($constraint,$mapper,$slice); # $fs = _remap($fs, $mapper, $slice); # push @features, @$fs; # } # } } } #COORD system loop return \@features; } # # Given a list of features checks if they are in the correct coord system # by looking at the first features slice. If they are not then they are # converted and placed on the slice. # #We have to have this here as this is a sub not a method, hence the _remap #in the core BaseFeatureAdaptor is not available here #Need to over-ride this for Iterator and remap per feature? #Can we skip this altogether if the check test is done before hand? #Would have to account for each CS set of features #but we are currently only handling the query slice CS, so we can skip this completely for now sub _remap { my ($self, $features, $mapper, $slice) = @_; #check if any remapping is actually needed if (@$features && (!$features->[0]->isa('Bio::EnsEMBL::Feature') || $features->[0]->slice == $slice)) { return $features; } #remapping has not been done, we have to do our own conversion from #to slice coords my @out; my $slice_start = $slice->start(); my $slice_end = $slice->end(); my $slice_strand = $slice->strand(); my $slice_cs = $slice->coord_system(); my ($seq_region, $start, $end, $strand); #my $slice_seq_region_id = $slice->get_seq_region_id(); my $slice_seq_region = $slice->seq_region_name(); foreach my $f (@$features) { #since feats were obtained in contig coords, attached seq is a contig my $fslice = $f->slice(); if (!$fslice) { throw("Feature does not have attached slice.\n"); } my $fseq_region = $fslice->seq_region_name(); my $fseq_region_id = $fslice->get_seq_region_id(); my $fcs = $fslice->coord_system(); if (!$slice_cs->equals($fcs)) { #slice of feature in different coord system, mapping required ($seq_region, $start, $end, $strand) = $mapper->fastmap($fseq_region_id,$f->start(),$f->end(),$f->strand(),$fcs); # undefined start means gap next if(!defined $start); } else { $start = $f->start(); $end = $f->end(); $strand = $f->strand(); $seq_region = $f->slice->seq_region_name(); } # maps to region outside desired area next if ($start > $slice_end) || ($end < $slice_start) || ($slice_seq_region ne $seq_region); #shift the feature start, end and strand in one call if ($slice_strand == -1) { $f->move( $slice_end - $end + 1, $slice_end - $start + 1, $strand * -1 ); } else { $f->move( $start - $slice_start + 1, $end - $slice_start + 1, $strand ); } $f->slice($slice); push @out,$f; } return \@out; } =head2 fetch_all_by_stable_Storable_FeatureSets Arg [1] : Bio::EnsEMBL::Storable Arg [5] : arrayref - Bio::EnsEMBL::Funcgen::FeatureSet Example : ($start, $end) = $self->_set_bounds_by_regulatory_feature_xref ($trans_chr, $start, $end, $transcript, $fsets); Description: Internal method to set an xref Slice bounds given a list of FeatureSets. Returntype : List - ($start, $end); Exceptions : throw if incorrent args provided Caller : self Status : at risk =cut #We really only want this method to work on storables #Namely all SetFeatures, could extend to FeatureTypes #This is a reimplementation of what has been used in the #Funcgen::SliceAdaptor::_set_bounds_by_xref_FeatureSet #we could have a fourth arg here which is a coderef to filter the feature returned? #This could be used by the SliceAdaptor to only return features which exceed a current slice #We would still have to test the start and end in the caller, so not a massive win #for too much code obfucation. Altho we are maintaining two versions of this code #which could get out of sync, epsecially with respect to external_db names #This is currently feature generic #So can call from one adaptor but will return features from all adaptors? #Where can we put this? #DBEntry adaptor? #The only logical place for this is really a convinience method in the core BaseFeatureAdaptor #This removes the problem of the generic nature of the returned data, as the focus is with the #caller which is a singular feature, therefore, not mixing of data types #restrict to one feature type for now. but this will be slower as we will have to make the list call #for each type of feature_set passed, so if these are mixed they will be redundant #Let's just put a warning in #Keep this structure so we can port it to a convinience method somewhere #and slim down later? #This does not descend Gene>Transcript>Translation #Now we have conflicting standards #FeatureSets was kep as a list to prevent having to pass [$fset] for one FeatureSet #This also elimated the need to test ref == ARRAY #But now we want an extra optional descend flag #Should we change back? #Or should we wrap this method in one which will decend? #Write wrappers for now, as we will have to write the descend loop anyway #But this does not resolve the @fsets vs [$fset] issue #Will have to do this in the wrappers anyway, so change back #and live with the dichotomy of FeatureSet implementations for now sub fetch_all_by_stable_Storable_FeatureSets{ my ($self, $obj, $fsets) = @_; my ($extdb_name); my $dbe_adaptor = $self->db->get_DBEntryAdaptor; #Do we need a central registry for ensembl db names, what about versions? if (ref($obj) && $obj->isa('Bio::EnsEMBL::Storable') && $obj->can('stable_id')) { my @tmp = split/::/, ref($obj); my $obj_type = pop @tmp; my $group = $obj->adaptor->db->group; #Could sanity check for funcgen here? if (! defined $group) { throw('You must pass a stable Bio::EnsEMBL::Feature with an attached DBAdaptor with the group attribute set'); } $extdb_name = 'ensembl_'.$group.'_'.$obj_type; #warn "ext db name is $extdb_name"; } else { throw('Must pass a stable Bio::EnsEMBL::Feature, you passed a '.$obj); } #Set which eFG features we want to look at. if (ref($fsets) ne 'ARRAY' || scalar(@$fsets) == 0) { throw('Must define an array of Bio::EnsEMBL::FeatureSets to extend xref Slice bound. You passed: '.$fsets); } my %feature_set_types; foreach my $fset (@$fsets) { $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset); $feature_set_types{$fset->feature_class} ||= []; push @{$feature_set_types{$fset->feature_class}}, $fset; } #We can list the outer loop here and put in the BaseFeatureAdaptor, or possible storable as we do have FeatureType xrefs. #This would be useful for fetching all the efg features for a given xref and FeatureSets #Don't implement as a parent sub and call from here as this would mean looping through array twice. #Altho we could pass a code ref to do the filtering? #Just copy and paste for now to avoid obfuscation my @features; #Get xrefs for each eFG feature set type foreach my $fset_type (keys %feature_set_types) { #my $xref_method = 'list_'.$fset_type.'_feature_ids_by_extid'; #e.g. list_regulatory_feature_ids_by_extid #Do type test here my $adaptor_type = ucfirst($fset_type).'FeatureAdaptor'; #remove this for generic method next if ref($self) !~ /$adaptor_type/; #This is for generic method #my $adaptor_method = 'get_'.ucfirst($fset_type).'FeatureAdaptor'; #my $adaptor = $self->db->$adaptor_method; my %feature_set_ids; map $feature_set_ids{$_->dbID} = 1, @{$feature_set_types{$fset_type}}; my $cnt = 0; #Change this self to adaptor for generic method foreach my $efg_feature (@{$self->fetch_all_by_external_name($obj->stable_id, $extdb_name)}) { #Skip if it is not in one of our FeatureSets next if ! exists $feature_set_ids{$efg_feature->feature_set->dbID}; push @features, $efg_feature; } } return \@features; } sub fetch_all_by_Gene_FeatureSets{ my ($self, $gene, $fsets, $dblinks) = @_; if (! ( ref($gene) && $gene->isa('Bio::EnsEMBL::Gene'))) { throw("You must pass a valid Bio::EnsEMBL::Gene object"); } my @features = @{$self->fetch_all_by_stable_Storable_FeatureSets($gene, $fsets)}; if ($dblinks) { foreach my $transcript (@{$gene->get_all_Transcripts}) { push @features, @{$self->fetch_all_by_Transcript_FeatureSets($transcript, $fsets, $dblinks)}; } } return \@features; } sub fetch_all_by_Transcript_FeatureSets{ my ($self, $transc, $fsets, $dblinks) = @_; if (! ( ref($transc) && $transc->isa('Bio::EnsEMBL::Transcript'))) { throw("You must pass a valid Bio::EnsEMBL::Transcript object"); } my @features = @{$self->fetch_all_by_stable_Storable_FeatureSets($transc, $fsets)}; if ($dblinks) { my $translation = $transc->translation; push @features, @{$self->fetch_all_by_stable_Storable_FeatureSets($translation, $fsets)} if $translation; } return \@features; } =head2 fetch_all_by_display_label Arg [1] : String $label - display label of feature to fetch Example : my $feat = $adaptor->fetch_all_by_display_label('abd-A_dpp:REDFLY:TF000092'); Description: Returns the features with the given display label. Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Feature objects Exceptions : none Caller : general Status : At risk =cut #There is no result_feature.display_label attribute. #Move this to individual feature adaptors to avoid over-riding? sub fetch_all_by_display_label { my ($self, $label) = @_; throw('You must defined a display_label argument') if ! defined $label; my $table_syn = $self->_main_table->[1]; my $constraint = "${table_syn}.display_label = ?"; $self->bind_param_generic_fetch($label, SQL_VARCHAR); return $self->generic_fetch($constraint); } =head2 _get_coord_system_ids Arg [1] : optional - ARRAYREF of Bio::EnsEMBL::Funcgen::CoordSystem objects Example : my $cs_ids = $self->_get_coord_system_ids([$cs1, $cs2], ...); Description: Returns the IDs of the CoordSystems specified or all default CoordSystems. Returntype : ARRAYREF Exceptions : Throws if CoordSystem object are not stored and valid Caller : BaseFeatureAdaptor.pm Status : At risk =cut #Can we remove the need for this by restricting the sr cache to default entries? sub _get_coord_system_ids{ my ($self, $coord_systems) = @_; my @cs_ids; if ($coord_systems) { if (ref($coord_systems) eq 'ARRAY' && (scalar(@$coord_systems) >0)) { foreach my $cs (@$coord_systems) { $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::CoordSystem', $cs); push @cs_ids, $cs->dbID; } } else { throw('CoordSystems parameter must be an arrayref of one or more Bio::EnsEMBL::Funcgen::CoordSystems'); } } else { #Get current default cs's foreach my $cs (@{$self->db->get_FGCoordSystemAdaptor->fetch_all($self->db->dnadb, 'DEFAULT')}) { push @cs_ids, $cs->dbID; } } #This should never happen if (scalar(@cs_ids) == 0) { throw('Could not find any default CoordSystems'); } return \@cs_ids; } =head2 count_features_by_field_id Arg [1] : string - table field to count Arg [2] : string/int - id to count Example : my $probe_feature_count = $pfa->count_feature_by_field_id('probe_id', $probe_id); Description: Returns a count of the features with the acciated field id Returntype : string/int - count of features Exceptions : Throws args are not set Caller : FeatureAdaptors Status : At risk =cut #This does not assume one record per feature #But does assume primary key if ${table_name}_id #Can't move to core due to cs issues, but could mirror implementation. sub count_features_by_field_id{ my ($self, $count_field, $count_id) = @_; #Any other params here? if (! ($count_field && $count_id)) { throw('You must provide a count name and a count id to count by'); } my ($table_name, $table_syn) = @{$self->_main_table}; my $table_id = "${table_name}_id"; my @cs_ids = @{$self->_get_coord_system_ids}; my $sql = "SELECT count(distinct($table_id)) from $table_name $table_syn, seq_region sr where ${table_syn}.${count_field}=? and ${table_syn}.seq_region_id in(select distinct(seq_region_id) from seq_region where coord_system_id in(".join(',', @cs_ids).'))'; my $sth = $self->prepare($sql); $sth->bind_param(1, $count_id, SQL_INTEGER); $sth->execute; return $sth->fetchrow_array; } =head2 force_reslice Arg [1] : Optional - Boolean Example : if($self->force_reslice){ # Reslice features past ends of destination Slice } Description: Sets/Returns force_reslice boolean Returntype : Boolean Exceptions : None Caller : FeatureAdaptors::_objs_from_sth Status : At risk =cut sub force_reslice{ my ($self, $force) = @_; if (defined $force) { $self->{force_reslice} = $force; } return $self->{force_reslice}; } 1;