# # Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::ResultFeatureAdaptor # # You may distribute this module under the same terms as Perl itself =head1 NAME Bio::EnsEMBL::DBSQL::Funcgen::ResultFeatureAdaptor - adaptor for fetching and storing ResultFeature objects =head1 SYNOPSIS my $rfeature_adaptor = $db->get_ResultFeatureAdaptor(); my @result_features = @{$rfeature_adaptor->fetch_all_by_ResultSet_Slice($rset, $slice)}; =head1 DESCRIPTION The ResultFeatureAdaptor is a database adaptor for storing and retrieving ResultFeature objects. This will automatically query the web optimised result_feature table if a data is present, else it will query the underlying raw data tables. =head1 LICENSE Copyright (c) 1999-2013 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at . Questions may also be sent to the Ensembl help desk at . =cut #How are we going to track the association between raw experimental_chip and collection result set? #ExperimentalChip/Channel ResultSets will probably go away, so ignore this association problem for now. #Association between FeatureSet and Input/ResultSet to be handled in import pipeline #ResultFeature ResultSets now have a RESULT_FEATURE_SET status entry. #Add some details about file based collections package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor; use strict; use warnings; use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); use Bio::EnsEMBL::Funcgen::ResultSet; use Bio::EnsEMBL::Funcgen::ResultFeature; use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(mean median); #Bareword SQL_TYPES not exported from BaseFeatureAdpator unless it is 'use'd first use Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor; use Bio::EnsEMBL::Funcgen::Collector::ResultFeature; use base qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor Bio::EnsEMBL::Funcgen::Collector::ResultFeature); #Private vars to used to maintain simple implementation of Collector #Should be set in each method to enable trimmingof the start and end bins. #Cannot depend on $dest_slice_start/end in _objs_from_sth #As _collection_start/end are adjusted to the nearest bin my ($_scores_field, $_collection_start, $_collection_end); #my ($_window_size);#Need to be a method as it is used by the BaseFeatureAdaptor. or our? =head2 _tables Args : None Example : None Description: PROTECTED implementation of superclass abstract method. Returns the names and aliases of the tables to use for queries. Returntype : List of listrefs of strings Exceptions : None Caller : Internal Status : At Risk =cut sub _tables { my $self = shift; return ([ 'result_feature', 'rf' ]); } =head2 _columns Args : None Example : None Description: PROTECTED implementation of superclass abstract method. Returns a list of columns to use for queries. Returntype : List of strings Exceptions : None Caller : Internal Status : At Risk =cut sub _columns { my $self = shift; return ('rf.seq_region_start', 'rf.seq_region_end', 'rf.seq_region_strand', "$_scores_field", 'rf.result_set_id'); } =head2 _objs_from_sth Arg [1] : DBI statement handle object Example : None Description: PROTECTED implementation of superclass abstract method. Creates Array objects from an executed DBI statement handle. Returntype : Listref of Bio::EnsEMBL::Funcgen::Experiment objects Exceptions : None Caller : Internal Status : At Risk - Moving to DBFile implementation =cut sub _objs_from_sth { my ($self, $sth, $mapper, $dest_slice) = @_; if(! $dest_slice){ throw('ResultFeatureAdaptor always requires a dest_slice argument'); #Is this correct? #ExperimentalChip based normalisation? #Currently does not use this method #Never have non-Slice based fetchs, so will always have dest_slice and seq_region info. } my (%rfeats, $start, $end, $strand, $scores, $rset_id); my $window_size = 0; #Always natural resolution from table #Could dynamically define simple obj hash dependant on whether feature is stranded and new_fast? #We never call _obj_from_sth for extended queries #This is only for result_feature table queries i.e. standard/new queries $sth->bind_columns(\$start, \$end, \$strand, \$scores, \$rset_id); #Test slice is loaded in eFG? my $slice_adaptor = $self->db->get_SliceAdaptor; if(! $slice_adaptor->get_seq_region_id($dest_slice)){ warn "Cannot get eFG slice for:".$dest_slice->name. "\nThe region you are using is not present in the current dna DB"; return; } if($mapper) { throw('Cannot dynamically assembly map Collections yet'); #See GeneAdaptor for mapping code if required } my $dest_slice_start; my $dest_slice_end; my $dest_slice_strand; my $dest_slice_length; my $dest_slice_sr_name; my $dest_slice_sr_id; $dest_slice_start = $dest_slice->start(); $dest_slice_end = $dest_slice->end(); $dest_slice_strand = $dest_slice->strand(); $dest_slice_length = $dest_slice->length(); $dest_slice_sr_name = $dest_slice->seq_region_name(); $dest_slice_sr_id = $dest_slice->get_seq_region_id(); my (@scores, $slice, $start_pad, $end_pad); #Set up the %rfeats arrays here to prevent having to test in loop #This will speed up 0 wsize, but most likely slow others? FEATURE: while ( $sth->fetch() ) { if($window_size == 0){ warn "0bp window size array based result_features are no longer supported"; #Remove completely or re-instate? } else{ #Cannot have a collection which does not start at 1 #As we cannot compute the bin start/ends correctly? #Actually we can do so long as they have been stored correctly #i.e. start and end are valid bin bounds(extending past the end of the seq_region if needed) #Let's keep it simple for now throw("Collections with a window size > 0 must start at 1, not ($start)") if $start !=1; #Can remove this if we test start and end are valid bin bounds for the given wsize #Account for oversized slices #This is if the slice seq_region_start/end are outside of the range of the record #As collections should really represent a complete seq_region #This should only happen if a slice is defined outside the the bounds of a seq_region #i.e. seq_region_start < collection_start or seq_region_end > slice length #if a test slice has been stored which does not represent the complete seq_region #We don't need to pad at all, just adjust the $_collection_start/ends!!! #Don't need to account for slice start < 1 #Start and end should always be valid bin bounds #These could be removed if we force only full length seq_region collections $_collection_start = $start if($_collection_start < $start); $_collection_end = $end if($_collection_end > $end); #warn "col start now $_collection_start"; #warn "col end now $_collection_end"; # If the dest_slice starts at 1 and is foward strand, nothing needs doing # else convert coords # These need to use $_collection_start/end rather than dest_slice_start/end if($dest_slice_start != 1 || $dest_slice_strand != 1) { if($dest_slice_strand == 1) { $start = $_collection_start - $dest_slice_start + 1; $end = $_collection_end - $dest_slice_start + 1; } else { my $tmp_seq_region_start = $_collection_start; $start = $dest_slice_end - $_collection_end + 1; $end = $dest_slice_end - $tmp_seq_region_start + 1; $strand *= -1; } } #What about 0 strand slices? #throw away features off the end of the requested slice or on different seq_region if($end < 1 || $start > $dest_slice_length){# || #( $dest_slice_sr_id ne $seq_region_id )) { #This would only happen if assembly mapper had placed it on a different seq_region #Dynamically mapped features are not guaranteed to come back in correct order? next FEATURE; } @scores = unpack('('.$self->pack_template.')'.(($_collection_end - $_collection_start + 1)/$window_size ), $scores); } push @{$rfeats{$rset_id}}, Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast({ start => $start, end => $end, strand =>$strand, scores => [@scores], #undef, #undef, window_size => $window_size, slice => $dest_slice, }); } return \%rfeats; } =head2 store Args[0] : List of Bio::EnsEMBL::Funcgen::ResultFeature objects Args[1] : Bio::EnsEMBL::Funcgen::ResultSet Args[2] : Optional - Assembly to project to e.g. GRCh37 Example : $rfa->store(@rfeats); Description: Stores ResultFeature objects in the result_feature table. Returntype : None Exceptions : Throws if a List of ResultFeature objects is not provided or if any of the attributes are not set or valid. Caller : General Status : At Risk - Moving to DBFile implementation =cut sub store{ my ($self, $rfeats, $rset, $new_assembly) = @_; #We can't project collections to a new assembly after they have been generated #as this will mess up the standardised bin bounds. #Just project the 0 window size and then rebuild other window_sizes form there $self->set_collection_defs_by_ResultSet($rset); throw("Must provide a list of ResultFeature objects") if(scalar(@$rfeats == 0)); #These are in the order of the ResultFeature attr array(excluding probe_id, which is the result/probe_feature query only attr)) my $sth = $self->prepare('INSERT INTO result_feature (result_set_id, seq_region_id, seq_region_start, seq_region_end, seq_region_strand, scores) VALUES (?, ?, ?, ?, ?, ?)'); my $db = $self->db(); my ($pack_template, $packed_string); #my @max_allowed_packet = $self->dbc->db_handle->selectrow_array('show variables like "max_allowed_packet"'); #warn "@max_allowed_packet"; FEATURE: foreach my $rfeat (@$rfeats) { if( ! (ref($rfeat) && $rfeat->isa('Bio::EnsEMBL::Funcgen::Collection::ResultFeature'))) { throw('Must be a Bio::EnsEMBL::Funcgen::Collection::ResultFeature object to store'); } if($rfeat->window_size == 0){ throw('Non 0bp window_size ResultFeatures cannot be stored in the result_feature table, write a col file instead'); } #This is the only validation! So all the validation must be done in the caller as we are simply dealing with ints? #Remove result_feature_set from result_set and set as status? my $seq_region_id; ($rfeat, $seq_region_id) = $self->_pre_store($rfeat, $new_assembly); next if ! $rfeat;#No projection to new assembly #Is there a way of logging which ones don't make it? #This captures non full length collections at end of seq_region $pack_template = '('.$self->pack_template.')'.scalar(@{$rfeat->scores}); #Check that we have non-0 values in compressed collections if($rfeat->window_size != 0){ if(! grep { /[^0]/ } @{$rfeat->scores} ){ warn('Collection contains no non-0 scores. Skipping store for '. $rfeat->slice->name.' '.$rfeat->window_size." window_size\n"); next; } } $packed_string = pack($pack_template, @{$rfeat->scores}); $sth->bind_param(1, $rfeat->result_set_id, SQL_INTEGER); $sth->bind_param(2, $seq_region_id, SQL_INTEGER); $sth->bind_param(3, $rfeat->start, SQL_INTEGER); $sth->bind_param(4, $rfeat->end, SQL_INTEGER); $sth->bind_param(5, $rfeat->strand, SQL_INTEGER); $sth->bind_param(7, $packed_string, SQL_BLOB); $sth->execute(); } return $rfeats; } =head2 _list_dbIDs Description: Re-implementation of parent method, as we now store in flat files and do not have internal DB IDs ResultFeature Returntype : None Exceptions : Warns Caller : General Status : At risk =cut sub _list_dbIDs { warn('_list_dbIDs is not appropriate for the ResultFeatureAdaptor as it is likely all data is now stored in flat files'); return $_[0]->_list_dbIDs; } =head2 _window_size Args : None Example : my $wsize = $self->_window_size Description: Gets the window_size of the current ResultFeature query. This needs to be a method rather than just a private variable as it is used by the BaseFeatureAdaptor. Returntype : int Exceptions : None Caller : Bio::EnsEMBL::BaseFeatureAdaptor Status : At risk - ??? Is this required anymore =cut sub _window_size{ my $self = shift; return $self->{'window_size'}; } =head2 set_collection_config_by_Slice_ResultSets Args[0] : Bio::EnsEMBL::Slice Args[1] : ARRAYREF of Bio::EnsEMBL::Funcgen::ResultSet object Args[2] : int - Maximum number of bins required i.e. number of pixels in drawable region Example : $self->set_collection_defs_by_ResultSet([$rset]); Description: Similar to set_collection_defs_by_ResultSet, but used to set a config hash used for multi-ResultSet fetches. Returntype : None Exceptions : throws is args are not valid throws if supporting InputSet is not of type result (i.e. short reads import) throws if supporting InputSet format is not SEQUENCING (i.e. short reads import) throws if ResultSet is not and input_set or experimental_chip based ResultSet (i.e. channel etc) Caller : ResultFeatureAdaptor::fetch_all_by_Slice_ResultSets Status : At Risk =cut sub set_collection_config_by_Slice_ResultSets{ my ($self, $slice, $rsets, $max_bins, $window_size) = @_; if(ref($rsets) ne 'ARRAY'){ throw('You must pass an ARRAYREF of Bio::EnsEMBL::ResultSet objects.'); } if(! (ref($slice) && $slice->isa('Bio::EnsEMBL::Slice'))){ throw('You must pass a valid Bio::EnsEMBL::Slice'); } my ($wsize, $window_element, @rsets, %wsize_config); my ($collection_start, $collection_end); if($window_size && $max_bins){ warn "Over-riding max_bins with specific window_size, omit window_size to calculate window_size using max_bins"; } my ($is_rf_set); my $rf_source = 'file'; foreach my $rset(@{$rsets}){ $is_rf_set = $rset->has_status('RESULT_FEATURE_SET') || 0; $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', $rset); if(! ($rset->table_name eq 'input_set' || $is_rf_set)){ warn("Skipping non-ResultFeature ResultSet:\t".$rset->name); next; } #Eventually will want to completely remove DB sets? #Depends on how we handle 0bp window sets, BigWig? #Leave 0bp sets in DB for v62 which means we need an extra status #Assume all input_set rsets are file based #NOTE: #Don't need to check for packed_size and packed_template differences as they are now the #same for all collections i.e. float. These would need to be separate queries otherwise. #This method makes assumptions about the window_sizes array structure #If this is to change more the the 0 - 30 bp change below then the config hash generation #needs to be reviewed if($rset->table_name eq 'experimental_chip'){ #Array Intensities i.e. single float $Bio::EnsEMBL::Utils::Collector::window_sizes->[0] = 0;#Can have natural resolution for low density array data } elsif($rset->table_name eq 'input_set'){ $Bio::EnsEMBL::Utils::Collector::window_sizes->[0] = 30; #Currently only expecting int from InputSet my @isets = @{$rset->get_InputSets}; my @tmp_isets = grep { !/result/ } (map { $_->feature_class } @isets ); if(@tmp_isets){ throw("Bio::EnsEMBL::Funcgen::Collector::ResultFeature only supports result type InputSets, not @tmp_isets types"); } #We still have no way of encoding pack_type for result_feature InputSets @tmp_isets = grep { !/SEQUENCING/ } (map { $_->format } @isets); if(@tmp_isets){ throw("Bio::EnsEMBL::Funcgen::Collector::ResultFeature only supports SEQUENCING format InputSets, not @tmp_isets formats"); } } else{ throw('Bio::EnsEMBL::Funcgen::Collector:ResultFeature does not support ResultSets of type'.$rset->table_name); } ### SET ResultSet CONFIG BASED ON OPTIMAL WINDOW SIZE # This sets all based on $rf_source=file # For wsize==0 (experimental_chip), the rf_source is fixed after this loop # This is done to prevent cyclical dependancy and improve cache checking speed if( (defined $window_element ) && exists $wsize_config{$rf_source}{$Bio::EnsEMBL::Utils::Collector::window_sizes->[$window_element]} ){ $wsize_config{$rf_source}{$Bio::EnsEMBL::Utils::Collector::window_sizes->[$window_element]}{'result_sets'}{$rset->dbID} = $rset; } else{ #We have not seen this wsize before #This is currently entirely based on the position of the first wsize. #We can't strictly rely that the same window_element will be optimal for each collection #However this will work if they are size ordered and only the first element changes e.g. 0 || 30 #Will need to do for each set if window_sizes change if(! defined $window_element){ my @sizes = @{$self->window_sizes}; #we need to remove wsize 0 if ResultSet was generated from high density seq reads #0 should always be first shift @sizes if ($sizes[0] == 0 && ($rset->table_name eq 'input_set')); $max_bins ||= 700;#This is default size of display? #CR Change to true drawable pixel width #The speed of this track is directly proportional #to the display size, unlike other tracks! #e.g #let's say we have 300000bp #700 pixels will use 450 wsize > Faster but lower resolution #2000 pixels will use 150 wsize > Slower but higher resolution if(defined $window_size){ if(! grep { /^${window_size}$/ } @sizes){ warn "The ResultFeature window_size specifed($window_size) is not valid, the next largest will be chosen from:\t".join(', ', @sizes); } else{ $wsize = $window_size; } } else{#! defined $window_size #Work out window size here based on Slice length #Select 0 wsize if slice is small enough #As loop will never pick 0 #probably half 150 max length for current wsize #Will also be proportional to display size #This depends on size ordered window sizes arrays $window_size = ($slice->length)/$max_bins; if($Bio::EnsEMBL::Utils::Collector::window_sizes->[0] == 0){ my $zero_wsize_limit = ($max_bins * $sizes[1])/2; if($slice->length <= $zero_wsize_limit){ $wsize = 0; $window_element = 0; } } } #Let's try and avoid this loop if we have already grep'd or set to 0 #In the browser this is only ever likely to speed up the 0 window if (! defined $wsize) { #default is maximum $wsize = $sizes[-1]; #Last element #Try and find the next biggest window #As we don't want more bins than there are pixels for (my $i = 0; $i <= $#sizes; $i++) { #We have problems here if we want to define just one window size #In the store methods, this resets the wsizes so we can only pick from those #specified, hence we cannot force the use of 0 #@sizes needs to always be the full range of valid windows sizes #Need to always add 0 and skip_zero window if 0 not defined in window_sizes? if ($window_size <= $sizes[$i]) { $window_element = $i; $wsize = $sizes[$i]; last; } } } } else{ #ASSUME the same window_element has the optimal window_size $wsize = $Bio::EnsEMBL::Utils::Collector::window_sizes->[$window_element]; } #Set BLOB access & collection_start/end config if ( $wsize == 0) { $wsize_config{$rf_source}{$wsize}{scores_field} = 'rf.scores'; #No need to set start end config here as these are normal features } else { #We want a substring of a whole seq_region collection #Correct to the nearest bin bounds #int rounds towards 0, not always down! #down if +ve or up if -ve #This causes problems with setting start as we round up to zero #Sub this lot as we will use it in the Collector for building indexes? my $start_bin = $slice->start/$wsize; $collection_start = int($start_bin); #Add 1 here so we avoid multiply by 0 below if ($collection_start < $start_bin) { $collection_start +=1; #Add 1 to the bin due to int rounding down } $collection_start = ($collection_start * $wsize) - $wsize + 1 ; #seq_region $collection_end = int($slice->end/$wsize) * $wsize; #This will be <= $slice->end #Add another window if the end doesn't meet the end of the slice if (($collection_end > 0) && ($collection_end < $slice->end)) { $collection_end += $wsize; } #Now correct for packed size #Substring on a blob returns bytes not 2byte ascii chars! #start at the first char of the first bin my $sub_start = (((($collection_start - 1)/$wsize) * $self->packed_size) + 1); #add first char #Default to 1 as mysql substring starts < 1 do funny things $sub_start = 1 if $sub_start < 1; my $sub_end = (($collection_end/$wsize) * ($self->packed_size)); #Set local start end config for collections $wsize_config{$rf_source}{$wsize}{collection_start} = $collection_start; $wsize_config{$rf_source}{$wsize}{collection_end} = $collection_end; if($rf_source eq 'file'){ #file BLOB access config $wsize_config{$rf_source}{$wsize}{'byte_offset'} = $sub_start -1; #offset is always start - 1 $wsize_config{$rf_source}{$wsize}{'byte_length'} = ($sub_end - $sub_start + 1); #warn "byte_offset = $sub_start"; #warn "byte_length = ($sub_end - $sub_start + 1) => ". ($sub_end - $sub_start + 1); } else{ #DB BLOB access config #Finally set scores column for fetch $wsize_config{$rf_source}{$wsize}{scores_field} = "substring(rf.scores, $sub_start, ". ($sub_end - $sub_start + 1).')'; } } #Set the result_set and scores field config #Would also need to set pack template here if this #were to change between collections $wsize_config{$rf_source}{$wsize}{result_sets}{$rset->dbID} = $rset; } } #Fix the 0 wsize source as will be set to 'file' but should be 'db' if(exists $wsize_config{'file'}{0}){ $wsize_config{'db'}{0} = $wsize_config{'file'}{0}; delete $wsize_config{'file'}{0}; if(keys(%{$wsize_config{'file'}}) == 0){ delete $wsize_config{'file'}; } } $self->{'_collection_config'} = \%wsize_config; return $self->{'_collection_config'}; } =head2 fetch_all_by_Slice_ResultSets Arg[1] : Bio::EnsEMBL::Slice - Slice to retrieve results from Arg[2] : ARRAYREF of Bio::EnsEMBL::Funcgen::ResultSets - ResultSet to retrieve results from Arg[3] : OPTIONAL int - max bins, maximum number of scores required Arg[4] : OPTIONAL int - window size, size of bin/window size in base pirs Arg[5] : OPTIONAL string - sql contraint for use only with DB collections Arg[6] : OPTIONAL hasref - config hash Example : my %rfeatures = %{$rsa->fetch_all_by_Slice_ResultSets($slice, [@rsets])}; Description: Gets a list of lightweight ResultFeature collection(s) for the ResultSets and Slice passed. Returntype : HASHREF of ResultSet dbID keys with a LISTREF of ResultFeature collection values Exceptions : Warns and skips ResultSets which are not RESULT_FEATURE_SETS. Caller : general Status : At risk =cut sub fetch_all_by_Slice_ResultSets{ my ($self, $slice, $rsets, $max_bins, $window_size, $orig_constraint) = @_; $orig_constraint .= (defined $orig_constraint) ? ' AND ' : ''; #currently does not accomodate raw experimental_chip ResultSets! my $conf = $self->set_collection_config_by_Slice_ResultSets($slice, $rsets, $max_bins, $window_size); #Loop through each wsize build constraint set private vars and query my (%rset_rfs, $constraint); #Remove this block now we don't really support table based RFs my $rf_conf = $conf->{db}; foreach my $wsize(keys(%{$rf_conf})){ $self->{'window_size'} = $wsize; $_scores_field = $rf_conf->{$wsize}->{scores_field}; $constraint = 'rf.result_set_id IN ('.join(', ', keys(%{$rf_conf->{$wsize}->{result_sets}})).')'. " AND rf.window_size=$wsize"; if ($wsize != 0){ $_collection_start = $rf_conf->{$wsize}->{collection_start}; $_collection_end = $rf_conf->{$wsize}->{collection_end}; } my ($rset_results) = @{$self->fetch_all_by_Slice_constraint($slice, $orig_constraint.$constraint)}; #This maybe undef if the slice is not present $rset_results ||= {}; #Will this work for wsize split queries? #We are not setting values for empty queries #Which is causing errors on deref %rset_rfs = (%rset_rfs, %{$rset_results}); #Account for DB rsets which return no features foreach my $rset(values %{$rf_conf->{$wsize}{result_sets}}){ if(! exists $rset_rfs{$rset->dbID}){ $rset_rfs{$rset->dbID} = []; } } } #Two blocks to prevent extra if-conditional for each rset. #Can easily remove one block if/when we deprecate DB collections #Should always be the same wsize if we are using a file i.e. no 0bp #Unless we have differing window_size ranges my $rf; $rf_conf = $conf->{file}; foreach my $wsize(keys(%{$rf_conf})){ foreach my $file_rset(values %{$rf_conf->{$wsize}{result_sets}}){ $rf = $self->_fetch_from_file_by_Slice_ResultSet($slice, $file_rset, $wsize, $rf_conf); $rset_rfs{$file_rset->dbID} = defined($rf) ? [$rf] : []; } } return \%rset_rfs; } =head2 _fetch_from_file_by_Slice_ResultSet Arg[1] : Bio::EnsEMBL::Slice - Slice to retrieve results from Arg[2] : Bio::EnsEMBL::Funcgen::ResultSets - ResultSet to retrieve results from Arg[3] : int - valid window size defined by set_collection_config_by_ResultSets Arg[4] : HASHREF - Window size config for file based ResultSets Example : my $rfeat = $self->_fetch_from_file_by_Slice_ResultSet($slice, $rset, $wsize, $conf); Description: Generates ResultFeature collection reading packed scores from a 'col' file. Returntype : Bio::EnsEMBL::Funcgen::Collection::ResultFeature Exceptions : Caller : fetch_all_by_Slice_ResultSets Status : At risk =cut #Set/store filepath in ResultSet to avoid having to regenerate? sub _fetch_from_file_by_Slice_ResultSet{ my ($self, $slice, $rset, $window_size, $conf) = @_; #private as window_size needs to ba valid i.e. generated by set_collection_config_by_ResultSets #and no class tests #Cache this as ResultSet::get_dbfile_path_prefix? #Is there any point if the rsets aren't cached? #Data is cached anyway, so no redundant calls #ResultSets are always regenerated #Key would either have to be query or dbID #Former would be hard?(constraint key, is this already done for features?) #Later would be object cache, so we would still do sql query but skip the object generation #given the dbID is enough to pull a valid object from the cache #How can we cache result/feature sets? #Would this really speed anything up? #if(! exists $conf->{$window_size}){ # throw("Specified window_size($window_size) is not present in config.\n". # "Have you called this private method directly?\n". # "Try using the fetch_by_Slice_ResultSets warpapper method\n". # "Or set the RESULT_FEATURE_FILE_SET status and window_size correctly."); # } my $rf; my $efg_sr_id = $self->get_seq_region_id_by_Slice($slice); if($efg_sr_id){ my $packed_scores = $self->read_collection_blob ( $rset->get_dbfile_path_by_window_size($window_size), #Would be in analysis object for unique analysis tracks/data $efg_sr_id, $conf->{$window_size}{'byte_offset'}, $conf->{$window_size}{'byte_length'}, ); my ($start, $end, @scores); if(defined $packed_scores){ ($start, $end) = ($conf->{$window_size}{collection_start}, $conf->{$window_size}{collection_end}); #Need to capture unpack failure here and undef the fh? #i.e. pack/unpack repeat count overflow @scores = unpack('('.$self->pack_template.')'.(($end - $start + 1)/$window_size), $packed_scores); #could validate scores size here #will only ever be <= than expected value #as unpack will discard excess #better validate length of $packed_scores $rf = Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast ({ start => $start, end => $end, strand => 0, #These are strandless features scores => [@scores], window_size => $window_size, }); } } return $rf; } =head2 fetch_all_by_Slice_ResultSet Arg[1] : Bio::EnsEMBL::Slice - Slice to retrieve results from Arg[2] : Bio::EnsEMBL::Funcgen::ResultSet - ResultSet to retrieve results from Arg[3] : OPTIONAL int - max bins, maximum number of scores required Arg[4] : OPTIONAL int - window size, size of bin/window size in base pirs Arg[5] : OPTIONAL string - sql contraint for use only with DB collections Arg[6] : OPTIONAL hasref - config hash Example : my %rfeatures = %{$rsa->fetch_all_by_Slice_ResultSets($slice, [@rsets])}; Description: Gets a list of lightweight Collection of ResultFeatures for the ResultSet and Slice passed. NOTE: ExperimentalChip/Channel based ResultFeatures was removed in version 63. Returntype : Bio::EnsEMBL::Funcgen::Collection::ResultFeature Exceptions : None Caller : general Status : At risk =cut #To do #remove Bio::EnsEMBL::Funcgen::ResultFeature in favour of Collection::ResultFeature? sub fetch_all_by_Slice_ResultSet{ my ($self, $slice, $rset, $max_bins, $window_size, $orig_constraint) = @_; #Do this first to avoid double validation of rset my $rf_hashref = $self->fetch_all_by_Slice_ResultSets($slice, [$rset], $max_bins, $window_size, $orig_constraint); return $rf_hashref->{$rset->dbID}; } sub fetch_Iterator_by_Slice_ResultSet{ my ($self, $slice, $rset, $max_bins, $window_size, $constraint, $chunk_size) = @_; return $self->fetch_collection_Iterator_by_Slice_method ($self->can('fetch_all_by_Slice_ResultSet'), [$slice, $rset, $max_bins, $window_size, $constraint], 0,#Slice idx $chunk_size #Iterator chunk length ); } =head2 fetch_collection_Iterator_by_Slice_method Arg [1] : CODE ref of Slice fetch method Arg [2] : ARRAY ref of parameters for Slice fetch method Arg [3] : Optional int: Slice index in parameters array Arg [4] : Optional int: Slice chunk size. Default=500000 Example : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice_method ($feature_adaptor->can('fetch_all_by_Slice_Arrays'), \@fetch_method_params, 0,#Slice idx #500 #chunk length ); while(my $feature = $slice_iter->next && defined $feature){ #Do something here } Description: Creates an Iterator which chunks the query Slice to facilitate large Slice queries which would have previously run out of memory Returntype : Bio::EnsEMBL::Utils::Iterator Exceptions : Throws if mandatory params not valid Caller : general Status : at risk - move to core BaseFeatureAdaptor =cut #Essentially the only difference here we have one feature with an array of 'scores' sub fetch_collection_Iterator_by_Slice_method{ my ($self, $slice_method_ref, $params_ref, $slice_idx, $chunk_size) = @_; if(! ( defined $slice_method_ref && ref($slice_method_ref) eq 'CODE') ){ throw('Must pass a valid Slice fetch method CODE ref'); } if (! ($params_ref && ref($params_ref) eq 'ARRAY')) { #Don't need to check size here so long as we have valid Slice throw('You must pass a method params ARRAYREF'); } $slice_idx = 0 if(! defined $slice_idx); my $slice = $params_ref->[$slice_idx]; $chunk_size ||= 1000000; my $collection; my $finished = 0; my $start = 1; #local coord for sub slice my $end = $slice->length; my $overlap = 0; my $coderef = sub { my $collection; if(! $finished) { my $new_end = ($start + $chunk_size - 1); if ($new_end >= $end) { # this is our last chunk $new_end = $end; $finished = 1; } #Chunk by sub slicing my $sub_slice = $slice->sub_Slice($start, $new_end); $params_ref->[$slice_idx] = $sub_slice; ($collection) = @{ $slice_method_ref->($self, @$params_ref)}; if(! $collection){ $finished = 1; } else{ #Trim score and start if overlapping found #Will only ever be on overlapping 'score' if($overlap){ shift @{$collection->scores}; $collection->{start} = $collection->start + $collection->window_size; $overlap = 0; } if ( $collection->end > $sub_slice->end ){ $overlap = 1; } $start = $new_end + 1; } } return $collection; }; return Bio::EnsEMBL::Utils::Iterator->new($coderef); } # Over-ride/deprecate generic methods # which do not work with ResultFeature Collections sub fetch_all{ deprecate('The fetch_all method has been disabled as it is not appropriate for the ResultFeatureAdaptor'); return; } sub fetch_by_dbID{ warn 'The fetch_by_dbID method has been disabled as it is not appropriate for the ResultFeatureAdaptor'; #Could use it for 0 wsize DB based data, but not useful. return; } sub fetch_all_by_dbID_list { warn 'The fetch_all_by_dbID_list method has been disabled as it is not appropriate for the ResultFeatureAdaptor'; #Could use it for 0 wsize DB based data, but not useful. return; } sub fetch_all_by_logic_name { warn 'The fetch_all_by_logic_name method has been disabled as it is not appropriate for the ResultFeatureAdaptor'; #Could use it for 0 wsize DB based data, but not useful. return; } sub _list_seq_region_ids{ warn 'The _list_seq_region_ids method has been disabled as it is not appropriate for the ResultFeatureAdaptor'; #Could use it for 0 wsize DB based data, but not useful. return } #Over-ride fetch_all_by_display_label? Or move this to the individual FeatureAdaptors? #Same with fetch_all_by_stable_Storable_FeatureSEts and wrappers (fetch_all_by_external_name)? #Basically this is not a DBAdaptor anymore so should inherit from somewhere else. #Need to separate the common utility methods and have co-inheritance e.g. #DBFile::Adaptor Utils::FeatureAdaptor #DBAdaptor::Adaptor Utils::FeatureAdaptor #Deprecated/Removed =head2 resolve_replicates_by_ResultSet Arg[0] : HASHREF - result_set_input_id => @scores pairs #Arg[1] : Bio::EnsEMBL::Funcgen::ResultSet - ResultSet to retrieve results from Example : my @rfeatures = @{$rsa->fetch_ResultFeatures_by_Slice_ResultSet($slice, $rset, 'DISPLAYABLE')}; Description: Gets a list of lightweight ResultFeatures from the ResultSet and Slice passed. Replicates are combined using a median of biological replicates based on their mean techinical replicate scores Returntype : List of Bio::EnsEMBL::Funcgen::ResultFeature Exceptions : None Caller : general Status : deprecated =cut sub resolve_replicates_by_ResultSet{ die('ExperimentalChip/Channel based ResultFeature support was removed in version 63'); } =head2 fetch_results_by_probe_id_ResultSet Arg [1] : int - probe dbID Arg [2] : Bio::EnsEMBL::Funcgen::ResultSet Example : my @probe_results = @{$ofa->fetch_results_by_ProbeFeature_ResultSet($pid, $result_set)}; Description: Gets result for a given probe in a ResultSet Returntype : ARRAYREF Exceptions : throws if args not valid Caller : General Status : deprecated =cut sub fetch_results_by_probe_id_ResultSet{ die('ExperimentalChip/Channel based ResultFeature support was removed in version 63'); } 1;