=head1 LICENSE Copyright (c) 1999-2013 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at . Questions may also be sent to the Ensembl help desk at . =head1 NAME Bio::EnsEMBL::Funcgen::Utils::Helper =head1 SYNOPSIS e.g. my $object = Bio::EnsEMBL::Object->new ( logging => 1, log_file => "/tmp/Misc.log", debug_level => 2, debug_file => "/tmp/Misc.dbg", ); $object->log("This is a log message."); $object->debug(1,"This is a debug message."); $object->system("rmdir /tmp/test"); ---------------------------------------------------------------------------- =head1 OPTIONS =over 8 =item B<-debug> Turns on and defines the verbosity of debugging output, 1-3, default = 0 = off =over 8 =item B<-log_file|l> Defines the log file, default = "${instance}.log" =item B<-help> Print a brief help message and exits. =item B<-man> Prints the manual page and exits. =back =head1 DESCRIPTION B performs several debugging and logging functions, aswell as providing several inheritable EFGUtils methods. =cut ################################################################################ package Bio::EnsEMBL::Funcgen::Utils::Helper; use Bio::Root::Root; use Data::Dumper qw(Dumper); use Bio::EnsEMBL::Utils::Exception qw (throw stack_trace); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (get_date); use Bio::EnsEMBL::Funcgen::FeatureSet; use Bio::EnsEMBL::Funcgen::DataSet; use Bio::EnsEMBL::Funcgen::ResultSet; #use Devel::Timer; use Carp; #? Can't use unless we can get it to redirect use File::Basename; use strict; use vars qw(@ISA); @ISA = qw(Bio::Root::Root); #default dump level #set this to debug level + 1? $Data::Dumper::Maxdepth = 2; my @rollback_tables = ( 'data_set', 'feature_set', 'result_set', 'input_set', 'experiment', 'array', 'array_chip', 'experimental_chip' ); #List of valid rollback levels #To be used in conjunction with -full_delete #values here used in > ops below, to implement #hierarchical rollback my %rollback_modes = ( feature_set => 1, #deletes all associated SetFeatures (etc.) and revokes_states on FeatureSet #full delete also removes feature/data/supporting_set records #data_set => 2, #This is now handled by feature_set mode result_set => 3, #revokes_states and remove dbfile_registry records #full_delete also removes result_set/_input records #add final levels input_set => 4, #This is moot as there isn't really a rollback mode for InputSets #apart from revoking the states which can be done directly #should have delete_InputSet instead? #input_subsets may be done independantly? as they may belong to >1 input_set/experiment? experiment => 5, ); #Some local filevars to avoid assigning to package typeglobs my ( $DBGFILE, $LOGFILE ); ################################################################################ =head2 new Description : Constructor method to create a new object with passed or default attributes. Arg [1] : hash containing optional attributes :- log_file - name of log file (default = undef -> STDOUT) debug_level - level of detail of debug message [1-3] (default = 0 = off) debug_file - name of debug file (default = undef -> STDERR) ReturnType : Helper Example : my $Helper = new Bio::EnsEMBL::Helper( debug_level => 3, debug_file => "/tmp/efg.debug", log_file => "/tmp/efg.log", ); Exceptions : throws exception if failed to open debug file : throws exception if failed to open log file =cut ################################################################################ #To do , change to rearrange sub new { my ( $caller, %args ) = @_; my ( $self, %attrdata, $argname ); my $class = ref($caller) || $caller; #Create object from parent class $self = $class->SUPER::new(%args); #we need to mirror ensembl behaviour here #use rearrange and set default afterwards if not defined # objects private data and default values #Not all of these need to be in main %attrdata = ( _tee => $main::_tee, _debug_level => $main::_debug_level, _debug_file => $main::_debug_file, _log_file => $main::_log_file, #default should be set in caller _no_log => $main::_no_log, #suppresses log file generation if log file not defined _default_log_dir => $main::_default_log_dir, ); # set each class attribute using passed value or default value foreach my $attrname ( keys %attrdata ) { ( $argname = $attrname ) =~ s/^_//; # remove leading underscore $self->{$attrname} = ( exists $args{$argname} ) ? $args{$argname} : $attrdata{$attrname}; } $self->{'_tee'} = 1 if $self->{'_no_log'}; #should we undef log_file here too? #This currently only turns off default logging $self->{_default_log_dir} ||= $ENV{'HOME'} . '/logs'; $self->{'_report'} = []; # DEBUG OUTPUT & STDERR #should default to lowest or highest debug level here! if ( defined $self->{_debug_level} && $self->{_debug_level} ) { $main::_debug_level = $self->{_debug_level}; if ( defined $self->{_debug_file} ) { $main::_debug_file = $self->{_debug_file}; open( $DBGFILE, '>>', $self->{_debug_file} ) or throw("Failed to open debug file : $!"); #open (DBGFILE, "{_debug_file});#Mirrors STDERR to debug file } else { open( $DBGFILE, '>&STDERR' ); } select $DBGFILE; $| = 1; # make debug file unbuffered $self->debug( 1, "Debugging started " . localtime() . " on $0 at level " . $self->{_debug_level} . " ..." ); } my $log_file = $self->{_log_file}; # LOG OUTPUT if ( defined $self->{_log_file} ) { #This causes print on unopened file as we try and log in the DESTROY throw( 'You have specified mutually exclusive parameters log_file and no_log' ) if ( $self->{'_no_log'} ); $main::_log_file = $self->{_log_file}; #we need to implment tee here if ( $self->{'_tee'} ) { open( $LOGFILE, ' | tee -a ' . $log_file ); } else { open( $LOGFILE, '>>', $log_file ) or throw("Failed to open log file : $log_file\nError: $!"); } } else { #Change this to get the name of the control script and append with PID.out #This is to ensure that we always capture output #We need to also log params #We will have to call this from the child class. #Only do this if we don't have supress default logs set #To avoid loads of loags during testing if ( !$self->{'_no_log'} ) { my @stack = stack_trace(); my $top_level = $stack[$#stack]; my ( undef, $file ) = @{$top_level}; $file =~ s/.*\///; $self->run_system_cmd( 'mkdir ' . $self->{_default_log_dir} ) if ( !-e $self->{_default_log_dir} ); $self->{'_log_file'} = $self->{_default_log_dir} . '/' . $file . '.' . $$ . '.log'; warn "No log file defined, defaulting to:\t" . $self->{'_log_file'} . "\n"; #we should still tee here if ( $self->{'_tee'} ) { open( $LOGFILE, '| tee -a ' . $self->{'_log_file'} ); } else { open( $LOGFILE, '>', $self->{'_log_file'} ) or throw( 'Failed to open log file : ' . $self->{'_log_file'} . "\nError: $!" ); } } ## end if ( !$self->{'_no_log'...}) else { #Have to include STD filehandles in operator open( $LOGFILE, '>&STDOUT' ); } } ## end else [ if ( defined $self->{_log_file...})] select $LOGFILE; $| = 1; # make log file unbuffered $self->log( "\n\nLogging started at " . localtime() . "..." ); # RESET STDOUT TO DEFAULT select STDOUT; $| = 1; $self->debug( 2, "Helper class instance created." ); return $self; } ## end sub new ################################################################################ =head2 DESTROY Description : Called by gargbage collection to enable tidy up before object deleted ReturnType : none Example : none - should not be called directly Exceptions : none =cut ################################################################################ sub DESTROY { my ($self) = @_; #This prevents having to explicitly call report $self->report; if ( $self->{_log_file} ) { $self->log( "Logging complete " . localtime() . "." ); $self->log( 'Virtual Memory ' . `ps -p $$ -o vsz |tail -1` ); $self->log( 'Resident Memory ' . `ps -p $$ -o rss |tail -1` ); # close LOGFILE; # if inherited object then cannot close filehandle !!! } if ( $self->{_debug_level} ) { $self->debug( 1, "Debugging complete " . localtime() . "." ); # close DBGFILE; # if inherited object then cannot close filehandle !!! } if ( defined $self->{'_timer'} ) { $self->{'_timer'}->report(); } $self->debug( 2, "Bio::EnsEMBL::Helper class instance destroyed." ); return; } ## end sub DESTROY ##Need generic method in here to get stack and line info ###Use Root.pm stack methods! # and replace this with caller line method for logging sub _get_stack { my ($self) = shift; #need to resolve this method with that in debug, pass log or debug arg for different format my @prog = ( caller(2) ) ? caller(2) : ( caller(1) ) ? caller(1) : ( undef, "undef", 0 ); return "[" . localtime() . " - " . basename( $prog[1] ) . ":$prog[2]]"; } ################################################################################ =head2 log Arg[0] : string - log message. Arg[1] : boolean - memory usage, appends current process memory stats Description : Method to write messages to a previously set up log file. Return type : none Example : $root->log("Processing file $filename ...", 1); Exceptions : none =cut ################################################################################ sub log { my ( $self, $message, $mem, $date, $no_return ) = @_; if ($mem) { $message .= " :: " . `ps -p $$ -o vsz |tail -1`; chomp $message; $message .= " KB"; } if ($date) { my $time = localtime(); chomp($time); $message .= ' - ' . localtime(); } $message .= "\n" if ! $no_return; print $LOGFILE "::\t$message"; if( ! $self->{'_no_log'}){ $self->debug( 1, $message ); } } ## end sub log ################################################################################ =head2 report Arg[0] : optional string - log message. Arg[1] : optional boolean - memory usage, appends current process memory stats Description : Wrapper method for log, which also stores message for summary reporting Return type : none Example : $root->report("WARNING: You have not done this or that and want it reported at the end of a script"); Exceptions : none =cut ################################################################################ sub report { my ( $self, $message, $mem ) = @_; if ( defined $message ) { $self->log( $message, $mem ); push @{ $self->{'_report'} }, $message; } elsif ( scalar( @{ $self->{'_report'} } ) ) { my $report = "\n::\tSUMMARY REPORT\t::\n". join( "\n", @{ $self->{'_report'} } ) . "\n"; print $LOGFILE $report; if($self->{report_fail}){ die($report); } $self->{'_report'} = []; } return; } sub report_fail{ my ( $self, $msg, @args ) = @_; $self->{report_fail} = 1; return $self->report("FAIL:\t".$msg, @args); } #add report_warning here #should print to STDERR or $DBGFILE? ################################################################################ =head2 log_header Arg[0] : string - log message. Arg[1] : boolean - memory usage, appends current process memory stats Description : Wrapper method to format a log as a header line Return type : none Example : $root->log("Processing file $filename ...", 1); Exceptions : none =cut ################################################################################ sub log_header { my ( $self, $message, $mem, $date ) = @_; print $LOGFILE "\n\n"; $self->log( "::\t$message\t::\t::", $mem, $date ); print $LOGFILE "\n"; } ################################################################################ =head2 log_error Description : Wrapper method to log, warns to STDERR first Return type : none Example : $root->log_error("could not find", ...); Exceptions : none =cut ################################################################################ sub log_error { my ( $self, @args ) = @_; warn $args[0] . "\n"; return $self->log(@args); } ################################################################################ =head2 debug Description : Method to write debug info to a previously set up debug file. Over-rides Root.pm on/off style debugging Args : int: debug level and string: log message. ReturnType : none Example : $root->debug(2,"dir=$dir file=$file"); Exceptions : none =cut ############################################################################### sub debug { my ( $self, $level, $message, $data, $depth ) = @_; #Can we not detect whther message is a scalar, array or hash and Dump or print accordingly? if($data){ my $tmp_depth = $Data::Dumper::Maxdepth; if($depth && ($depth != $Data::Dumper::Maxdepth) ){ $Data::Dumper::Maxdepth = $depth; } $message.= Dumper($data)."\n"; $Data::Dumper::Maxdepth = $tmp_depth;; } my ( @call, $cnt, $prog_name, $prog_line, $call_name, $call_line ); $prog_name = $call_name = "undef"; $prog_line = $call_line = $cnt = 0; # if debug on at the requested level then output the passed message if ( defined $self->{_debug_level} && $level <= $self->{_debug_level} ) { ######Replace this with Carp method? while ( @call = caller( $cnt++ ) ) { if ( $cnt == 2 ) { $call_name = basename( $call[1] ); $call_line = $call[2]; } $prog_name = basename( $call[1] ); $prog_line = $call[2]; } #This still attempts to print if file not opened print $DBGFILE "DEBUG :: $message\t: ". "[$$ - $prog_name:$prog_line $call_name:$call_line]\n"; #carp("carping $message"); } } ## end sub debug ################################################################################ ################################################################################ =head2 run_system_cmd Description : Method to control the execution of the standard system() command ReturnType : none Example : $Helper->debug(2,"dir=$dir file=$file"); Exceptions : throws exception if system command returns none zero =cut ################################################################################ #Move most of this to EFGUtils.pm #Maintain wrapper here with throws, only warn in EFGUtils sub run_system_cmd { my ( $self, $command, $no_exit ) = @_; my $redirect = ''; $self->debug( 3, "system($command)" ); # decide where the command line output should be redirected #This should account for redirects #This just sends everything to 1 no? if ( defined $self->{_debug_level} && $self->{_debug_level} >= 3 ) { if ( defined $self->{_debug_file} ) { $redirect = " >>" . $self->{_debug_file} . " 2>&1"; } else { $redirect = ""; } } else { #$redirect = " > /dev/null 2>&1"; } # execute the passed system command my $status = system("$command $redirect"); my $exit_code = $status >> 8; if ( $status == -1 ) { warn "Failed to execute: $!\n"; } elsif ( $status & 127 ) { warn sprintf( "Child died with signal %d, %s coredump\nError:\t$!", ( $status & 127 ), ( $status & 128 ) ? 'with' : 'without' ); } elsif ( $status != 0 ) { warn sprintf( "Child exited with value %d\nError:\t$!\n", $exit_code ) ; #get the true exit code } if ( $exit_code != 0 ) { if ( !$no_exit ) { throw( "System command failed:\t$command\nExit code:\t$exit_code\n$!"); } else { warn("System command returned non-zero exit code:\t$command\n". "Exit code:\t$exit_code\n$!"); } } #reverse boolean logic for perl...can't do this anymore due to tab2mage successful non-zero exit codes :/ return $exit_code; } ## end sub run_system_cmd #add sys_get method ehre to handle system calls which retrieve data? #i.e.backtick commands `find . -name *fasta` #or use want or flag with above method? #should open pipe instead to capture error? sub get_data { my ( $self, $data_type, $data_name ) = @_; #This method is just to provide standard checking for specific get_data/config methods if ( defined $data_name ) { throw( "Defs data name $data_name for type '$data_type' does not exist\n" ) if ( !exists $self->{"${data_type}"}{$data_name} ); } else { throw("Defs data type $data_type does not exist\n") if ( !exists $self->{"${data_type}"} ); } return ( defined $data_name ) ? $self->{"${data_type}"}{$data_name} : $self->{"${data_type}"}; } #sub Timer{ # my ($self) = shift; # $self->{'_timer'} = new Devel::Timer() if(! defined $self->{'_timer'}); # return $self->{'_timer'}; #} sub set_header_hash { my ( $self, $header_ref, $fields ) = @_; my %hpos; for my $x ( 0 .. $#{$header_ref} ) { $hpos{ $header_ref->[$x] } = $x; } if ($fields) { foreach my $field (@$fields) { if ( !exists $hpos{$field} ) { throw("Header does not contain mandatory field:\t${field}"); } } } return \%hpos; } #Move this to EFGUtils? sub backup_file { my ( $self, $file_path ) = @_; throw("Must define a file path to backup") if ( !$file_path ); if ( -f $file_path ) { $self->log("Backing up:\t$file_path"); system( "mv ${file_path} ${file_path}." . `date '+%T'` ); } return; } #This should move to Utils #as it is a simple string manipulation sub get_schema_and_build { my ( $self, $dbname ) = @_; my @dbname = split /_/, $dbname; return [ $dbname[ ( $#dbname - 1 ) ], $dbname[ ($#dbname) ] ]; } =head2 get_regbuild_set_states Arg [1] : Bio::EnsEMBL::DBAdaptor Example : my ($dset_states, $rset_states, $fset_states) = $helper->get_regbuild_set_states($db); Description: Returns Array refs of appropriate states for sets use din the regulatory build Returntype : Array Exceptions : Warns if cannot find chromosome CoordSystem Caller : HealthChecker & regulatory build code Status : At risk =cut sub get_regbuild_set_states { my ( $self, $db ) = @_; my $cs_a = $db->get_CoordSystemAdaptor; #These states need to be mirrored in RegulatorySets.java my $chrom_cs = $cs_a->fetch_by_name('chromosome'); my ( @dset_states, @rset_states, @fset_states ); if ( !defined $chrom_cs ) { #This species most likely does not have a regbuild #really just need to get the 'highest' level here warn "Could not find Chromosome CoordSystem. " . $db->dbc->dbname . ". most likely does not contain a RegulatoryBuild"; } else { my $imp_cs_status = 'IMPORTED_' . $cs_a->fetch_by_name('chromosome')->version; #What about non-chromosome assemblies? #top level will not return version...why not? @dset_states = ('DISPLAYABLE'); @rset_states = ( @dset_states, 'DAS_DISPLAYABLE', $imp_cs_status ); @fset_states = ( @rset_states, 'MART_DISPLAYABLE' ); } return ( \@dset_states, \@rset_states, \@fset_states ); } ## end sub get_regbuild_set_states =head2 define_ResultSet Arg [1] : Hash - set constructor parameters: -dbadaptor Bio::EnsEMBL::Funcgen::DBAdaptor -name Data/FeatureSet/ResultSet name to create -feature_set_analysis FeatureSet Bio::EnsEMBL::Analysis -result_set_analysis FeatureSet Bio::EnsEMBL::Analysis -feature_class e.g. annotated or regulatory -description FeatureSet description -recovery Allows definition of extant sets so long as they match -append Boolean - Forces import on top of previously imported data -rollback Rolls back product feature set. ####Add permitted values here! -input_sets Complete set of pre-stored supporting or input sets for this DataSet -slices ARRAYREF of Slices to rollback Example : my $dset = $self->define_ResultSet(%params); Description: Checks whether set is already in DB based on set name, feature_type, cell_type and analysis Rolls back features if -rollback is flag set appropriately, or creates new ResultSet if not present. This should only be used for creation or recovery of a ResultSet. Normal access to a pre-existing ResultSet should be via ResultSetAdaptor::fetch_by_name Returntype : Bio::EnsEMBL::Funcgen::DataSet Exceptions : Throws if DBAdaptor param not valid Caller : Importers and Parsers Status : At risk =cut #There is some overlap between this and the new migration code, can it be reused? #The migration code deals with two different DBs #our comparisons are between an unstored object and one fetched from the DB. sub define_ResultSet { my $self = shift; #Need to modify the stack here before passing to _validate_Set_config my ($rollback_level, $ctype, $ftype, $db, $inp_sets, $slices) = $self->_validate_Set_config(@_);#this could also return the following if these are generic # feature_class can be infered from the feature_type from the inp sets my ($anal, $rset_mode) = rearrange( ['RESULT_SET_ANALYSIS', 'RESULT_SET_MODE'], @_ ); #slight hack until we sort out 5mC class and db_file_registry.format #also overlaps with FeatureSet -feature_cl my $fclass = ($inp_sets->[0]->feature_type eq '5mC') ? 'dna_methylation' : 'result'; my $name = $inp_sets->[0]->name; #This will catch mandatory params my $rset = Bio::EnsEMBL::Funcgen::ResultSet->new ( -name => $name, -feature_type => $ftype, -cell_type => $ctype, -support => $inp_sets, -analysis => $anal, -feature_class => $fclass, ); my $rset_adaptor = $db->get_ResultSetAdaptor; #Can only ever be one with all arguments define unique key my ($stored_rset) = @{$rset_adaptor->fetch_all_by_name($name, $ftype, $ctype, $anal, $fclass)}; return $self->_validate_rollback_Set($stored_rset, $rset, 'result_set', $rollback_level, $rset_adaptor, $slices, $rset_mode); } ## end sub define_ResultSet sub define_FeatureSet { my $self = shift; #Need to modify the stack here before passing to _validate_Set_config my ($rollback_level, $ctype, $ftype, $db, $ssets, $slices) = $self->_validate_Set_config(@_); my ($name, $desc, $fclass, $dlabel, $anal) = rearrange(['NAME','DESCRIPTION', 'FEATURE_CLASS', 'DISPLAY_LABEL', 'FEATURE_SET_ANALYSIS'], @_ ); #This will catch mandatory params my $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new ( -name => $name, -feature_type => $ftype, -cell_type => $ctype, -analysis => $anal, -feature_class => $fclass, -description => $desc, -display_label => $dlabel, -input_set => $ssets->[0], ); my $fset_adaptor = $db->get_FeatureSetAdaptor; my $stored_fset = $fset_adaptor->fetch_by_name($name); return $self->_validate_rollback_Set($stored_fset, $fset, 'feature_set', $rollback_level, $fset_adaptor, $slices); } ## end sub define_FeatureSet sub _validate_rollback_Set { my ($self, $stored_set, $new_set, $set_type, $rollback_level, $adaptor, $slices, $set_mode) = @_; #set_type is not validated #assumes other vars are pre-validated by _validate_Set_config (my $rollback_method = ucfirst($set_type)) =~ s/_([a-z])/uc($1)/eg; $rollback_method =~ s/_//g; $rollback_method = 'rollback_'.$rollback_method; if($set_mode){ #This rollback shortcut is a bit result set specific, #but could be generic hence why it is here if($set_type ne 'result_set'){ throw('set_mode is currently only valid for result_sets'); } elsif($set_mode eq 'none'){ if($stored_set){ if($rollback_level < $rollback_modes{result_set}){ throw('Cannot import with -result_set_mode none with a pre-existing result set.'. ' Please specify -rollback result_set'); } else{ #rollback directly here as we aren't concerned with diffs $self->$rollback_method($stored_set, 'full'); } } return; #Bail out early as we don't want a ResultSet! } elsif($set_mode ne 'recover'){ throw("Invalid -result_set_mode $set_mode. ". 'Please omit or specify -result_set_mode none|recover'); } } if($stored_set && ($rollback_level >= $rollback_modes{$set_type})){ #This will throw if there are any diffs other than status entries $self->_compare_set_for_rollback($new_set, $stored_set, $set_type, $rollback_level, $slices); } #FINALLY DO ROLLBACK #Independant of whether there are diffs as there maybe some fault with the #data which has not been caught by the diffs e.g. truncated input if( ($rollback_level >= $rollback_modes{$set_type}) || ($stored_set && ! $stored_set->has_status('IMPORTED') ) || $set_mode ) { #Default full delete for an set with some differences my $delete_mode = 'full'; if ($rollback_level < $rollback_modes{$set_type}) { #Must be an identical set without IMPORTED status #We just want to rollback the features $delete_mode = $set_mode; #undef or recover (which ignores dependant sets) } elsif($set_mode && ($set_mode eq 'recover')){ #Also have rollback set for this set #currently this is always 'recover' throw('Cannot specify recover and rollback for '.$stored_set->name.' '. $set_type.'. Please omit one.'); } #else{ $delete_mode = 'full';}#$rollback >= $rollback_modes{$set_type} && $set_mode #would have to chang ethis if we add more set_modes #Handle full delete and slices clash as rollback_Data/ResultSet don't take slices arg if ($slices && ($delete_mode eq 'full')) { throw('Cannot do a \'full\' delete on a sub set of -slices'); } # DO THE ROLLBACK $self->$rollback_method($stored_set, $delete_mode, $slices); # UNDEF STORED SET FOR FULL DELETE if($delete_mode eq 'full'){ undef $stored_set; } } if(! defined $stored_set){ ($stored_set) = @{ $adaptor->store($new_set) }; } return $stored_set; } #This assmes we will always have a FeatureSet and a ResultSet sub _validate_Set_config { my $self = shift; my ($db, $rollback, $slices, $ssets, $ftype, $ctype, $fclass ) = rearrange ( ['DBADAPTOR', 'ROLLBACK', 'SLICES', 'SUPPORTING_SETS', 'FEATURE_TYPE', 'CELL_TYPE', 'FEATURE_CLASS'], @_ ); #Currently sets FeatureSet and ResultSet f/ctypes to same value if($fclass eq 'annotated'){ #Currently 1 InputSet is mandatory for FeatureSet & ResultSet if( ! ( defined $ssets && (ref($ssets) eq 'ARRAY') && (scalar(@$ssets) == 1) && (ref($ssets->[0]) eq 'Bio::EnsEMBL::Funcgen::InputSet')) ){ $self->debug(1, "Sets pass are:\n", $ssets); throw('Currently must provide 1 defined InputSet is permitted for a Result/FeatureSet'); } #This isn't full robust as we are not testing if they are defined if($ctype->name ne $ssets->[0]->cell_type->name){ throw('Found mismatch between '.ref($ssets->[0]). " CellType and CellType specified:\n\t".$ssets->[0]->cell_type->name. "\t".$ctype->name); } if($ftype->name ne $ssets->[0]->feature_type->name){ throw('Found mismatch between '.ref($ssets->[0]). " FeatureType and FeatureType specified:\n\t".$ssets->[0]->feature_type->name. "\t".$ftype->name); } } #This will be caught when calling Set::new in caller #elsif( ! ($ctype && $ftype)){#allow MultiCell and different ftypes for regbuild # throw('Must specify a -cell_type and -feature_type when creating an \''. # $fclass.'\' FeatureSet'); #} my $rollback_level = 0; if ($rollback) { if ( ! exists $rollback_modes{$rollback} ) { throw("$rollback is not a valid rollback mode, please specify one of the following:\n\t" . join( ', ', keys %rollback_modes ) . "\n" ); } $rollback_level = $rollback_modes{$rollback}; } if ( $slices && ( ref($slices) ne 'ARRAY' ) ) { throw('-slices param must be an ARRAYREF of Bio::EnsEMBL::Slice objects'); #Rest of slice validation done in other methods } #Check mandatory params if ( ! (ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))) { throw('Must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); } #set validation now done in define set methods as is conditional on feature_class # #returns rearranged vars for convenience return ( $rollback_level, $ctype, $ftype, $db, $ssets, $slices); } #This is a controlling method, which calls define_Result/FeatureSet #can we extend this for use in the registration pipeline #by adding define_InputSet & define_InputSubset etc. sub define_DataSet { my $self = shift; my ($ssets, $fclass, $name, $db) = rearrange(['SUPPORTING_SETS', 'FEATURE_CLASS' ,'NAME', 'DBADAPTOR'], @_ ); #No need to _validate_Set_config here as it will be done in the below methods #Always create the FeatureSet first #This will rollback_DataSet if required my $fset = $self->define_FeatureSet(@_); #Always create the ResultSet before the DataSet #so we can pass it as support my $rset; if($fclass eq 'annotated'){ #what about result_set_mode = none here? #we may want to rollback full delete here #this is a bit odd that define_ResultSet will delete if fully #alternate is to fetch the stored set and call _validate_rollback_Set #directly from here $rset = $self->define_ResultSet(@_); push @$ssets, $rset; } my $dset_adaptor = $db->get_DataSetAdaptor; #Generate new DataSet first to validate the parameters my $new_dset = Bio::EnsEMBL::Funcgen::DataSet->new ( -name => $name, -feature_set => $fset, -supporting_sets => $ssets, ); my $stored_dset = $dset_adaptor->fetch_by_name($name); #Could we actually cascade a compare_to from DataSet to feature_set and result_set? #This would be much harder to manage rollbacks #We are effectively doing this above, without complicating the the DataSet compare_to method #If we did it all in one go, then we would have to define eveything here #meaning the define_ResultSet and define_FeatureSet would beome redundant? #No, we could remove the _validate_rollback_Set from the return value #and let that be called in a controlling method define_validate_rollback_DataSet ? #it would also be very hard to cascade rollback of a DataSet withing _validate_rollback_Set? #Hence we would have to have a control method, which is essentially this method! #change the name of this method to define_validate_rollback_DataSet??? #should we change this other define method, such that they don't call _validate_rollback_Set? #Would we have a use for the very simple define Set methods? if(defined $stored_dset){ $self->_compare_set_for_rollback($new_dset, $stored_dset, 'data_set'); #No need to pass slices here as we can't rollback a data_slice based on a slice } else{ ($stored_dset) = @{$dset_adaptor->store($new_dset)}; } return $stored_dset; } ## end sub define_DataSet =head2 rollback_FeatureSet Arg [0] : Bio::EnsEMBL::Funcgen::FeatureSet for another DataSet. Arg [1] : Arrayref of Bio::EnsEMBL::Slice objects to rollback (Optional) Arg [2] : Boolean flag to perform full delete of feature_set and data_set records. Example : $self->rollback_FeatureSet($fset); Description: Deletes all features for the passed FeatureSet, along with associated status and xref records. Checks whether FeatureSet is a supporting set in any other DataSet. Returntype : None Exceptions : Throws if: Any deletes fails Dependant DataSets are found FeatureSet 'adaptor' method is not defined Defined Slices are not valid More than 1 sub-Slice is defined (i.e. not full length) Caller : Importers and Parsers Status : At risk =cut sub rollback_FeatureSet { my ( $self, $fset, $delete_mode, $slices ) = @_; my ( $sql, $slice_name ); my $slice_join = ''; if($delete_mode && ($delete_mode ne 'full')){ throw("Invalid delete mode defined:\t$delete_mode\n". 'Please omit of specify full'); #delete_mode is assumed to be full below } if ( ! ($fset && (ref($fset) eq 'Bio::EnsEMBL::Funcgen::FeatureSet') && defined $fset->adaptor ) ){ throw('Must provide a valid stored Bio::EnsEMBL::Funcgen::ResultSet'); } my $db = $fset->adaptor->db; #Assumes access to DBAdaptor $db->is_stored_and_valid( 'Bio::EnsEMBL::Funcgen::FeatureSet', $fset ); my $table = $fset->feature_class . '_feature'; $self->log_header('Rolling back '.$fset->feature_class. " FeatureSet:\t" . $fset->name ); #Check whether this is a supporting set for another data_set my @dsets = @{ $db->get_DataSetAdaptor->fetch_all_by_supporting_set($fset) }; if (@dsets) { my $txt = $fset->name." is a supporting set of the following DataSets:\t" . join( ', ', ( map { $_->name } @dsets ) ); throw( $txt ."\nPlease resolve by deleting dependant Feature/DataSets" ); } #Validate all slices before we commence any rollback if ($slices) { if ($delete_mode) { #Must be full throw("Cannot specify a full delete for a Slice based rollback:\t" . $fset->name ); } if ( ref($slices) ne 'ARRAY' ) { throw('Slices must be an ARRAYREF of Slice objects'); } map { throw("Must pass a valid Bio::EnsEMBL::Slice") if ( !( ref($_) && $_->isa('Bio::EnsEMBL::Slice') ) ) } @$slices; $self->log( "Restricting to slices:\n\t\t" . join( "\n\t\t", ( map { $_->name } @$slices ) ) ); } else{ #Set undef slice for no slice definition $slices ||= [undef]; } for my $slice(@$slices){ my $slice_join = ''; if(defined $slice){ my $efg_sr_id = $fset->get_FeatureAdaptor->get_seq_region_id_by_Slice($slice); if ( ! $efg_sr_id ) { $self->log("Slice is not present in eFG DB:\t" . $slice->name ); } else { #Test is not subslice my $full_slice = $slice->adaptor->fetch_by_region(undef, $slice->seq_region_name); if ( ( $slice->start != 1 ) || ( $full_slice->end != $slice->end ) ){ $slice_join = " and f.seq_region_id=$efg_sr_id and f.seq_region_start<=" . $slice->end.' and f.seq_region_end>=' . $slice->start; } else{ $slice_join = ' and f.seq_region_id = '.$efg_sr_id; } # Now do the rollback $fset->adaptor->revoke_states($fset); if ( $fset->feature_class eq 'regulatory' ) { #Rollback reg attributes $sql = "DELETE ra from regulatory_attribute ra, $table f where ". "f.${table}_id=ra.${table}_id and f.feature_set_id=".$fset->dbID.$slice_join; $self->rollback_table( $sql, 'regulatory_attribute', undef, $db ); } elsif($fset->feature_class eq 'annotated'){ #Handle amfs $sql = 'DELETE amf from annotated_feature af, associated_motif_feature amf where '. 'af.feature_set_id='.$fset->dbID. ' AND af.annotated_feature_id = amf.annotated_feature_id'.$slice_join; $self->rollback_table( $sql, 'associated_motif_feature', undef, $db ); } ## end if ( $fset->feature_class...) #Remove object_xref records (but not xref which may be used by soemthing else) $sql = "DELETE ox from object_xref ox, $table f where ox.ensembl_object_type='" .ucfirst( $fset->feature_class )."Feature' and ox.ensembl_id=f.${table}_id and ". "f.feature_set_id=". $fset->dbID . $slice_join; $self->rollback_table( $sql, 'object_xref', 'object_xref_id', $db ); #Remove associated_feature_type records $sql = "DELETE aft from associated_feature_type aft, $table f where ". "f.feature_set_id=".$fset->dbID." and f.${table}_id=aft.table_id and ". "aft.table_name='".$fset->feature_class . "_feature'" . $slice_join; $self->rollback_table( $sql, 'associated_feature_type', undef, $db ); #Remove features $sql = "DELETE f from $table f where f.feature_set_id=" . $fset->dbID . $slice_join; $self->rollback_table( $sql, $table, "${table}_id", $db ); } } } if ($delete_mode) { #Must be full #Also delete feature/data/supporting_set records $self->log( "Deleting Feature/DataSet:\t" . $fset->name ); #Delete regbuild strings first if ( $fset->feature_class eq 'regulatory' ) { $sql = "DELETE from regbuild_string where feature_set_id=" . $fset->dbID; $self->rollback_table( $sql, 'regbuild_string', 'feature_set_id', $db ); $self->log( "Deleted regbuild_string entries for:\t" . $fset->name ); } $sql = "DELETE from feature_set where feature_set_id=" . $fset->dbID; $self->rollback_table( $sql, 'feature_set', 'feature_set_id', $db ); $self->log( "Deleted feature_set entry for:\t" . $fset->name ); $sql = 'DELETE ss, ds from data_set ds, supporting_set ss where '. 'ds.feature_set_id='.$fset->dbID.' AND ds.data_set_id=ss.data_set_id'; $self->rollback_table( $sql, 'data_set', 'data_set_id', $db ); $self->log("Deleted associated data/supporting_set entries for:\t" . $fset->name ); } return; } ## end sub rollback_FeatureSet =head2 rollback_ResultSet Arg[1] : Bio::EnsEMBL::Funcgen::ResultSet Arg[2] : Boolean - optional flag to roll back array results Example : $self->rollback_ResultSet($rset); Description: Deletes all status. chip_channel and result_set entries for this ResultSet. Will also rollback_results sets if rollback_results specified. This will also update or delete associated ResultSets where appropriate. Returntype : Arrayref containing the ResultSet and associated DataSet which have not been rolled back Exceptions : Throws if ResultSet not valid Throws is result_rollback flag specified but associated product FeatureSet found. Caller : General Status : At risk =cut #Need to change slice to slices ref here #Need to add full rollback, which will specify to remove all sets #as well as results and #These params need clarifying as their nature changes between input_set and array rsets #Don't we always want to rollback_results? #force should only really be used to rollback InputSet ResultFeature sets #i.e. Read collections which are not used as direct input for the linked product FeatureSet #This should fail with array data associated with a product feature set #Do we want to separate ResultFeature rollback from result rollback? #Currently the array based collection rollback is done by hand #Could be done via the ResultFeature Collector, but should probably use this method. #rollback_results is only used in the MAGE parser to identify sets which have an #associated product fset. #Can't really separate due to integrated functionality #todo update callers to remove force, rollback_result and slice args sub rollback_ResultSet { my ( $self, $rset, $delete_mode ) = @_; if ( ! ($rset && (ref($rset) eq 'Bio::EnsEMBL::Funcgen::ResultSet') && defined $rset->adaptor ) ){ throw('Must provide a valid stored Bio::EnsEMBL::Funcgen::ResultSet'); } if($delete_mode && (($delete_mode ne 'full') || ($delete_mode ne 'recover'))){ throw("Invalid delete mode defined:\t$delete_mode\n". 'Please omit of specify full or recover'); } my $db = $rset->adaptor->db; #Assumes db is accessible $db->is_stored_and_valid( 'Bio::EnsEMBL::Funcgen::ResultSet', $rset ); #Just omit experimental_chip/channel/result_feature support for now as we probably #won't ever use it again if($rset->table_name ne 'input_set'){ throw('rollback_ResutlSet not longer support non-InputSet rollbacks'); #This would need to check co-dependant ResultSets, see old version in cvs } $self->log( "Rolling back ResultSet:\t" . $rset->name ); ### Check if this ResultSet is part of a DataSet with a product feature set my @dsets = @{ $db->get_DataSetAdaptor->fetch_all_by_supporting_set($rset) }; if(@dsets) { if($delete_mode && ($delete_mode ne 'recover')){ throw('ResultSet '.$rset->name. " has associated DataSets, please specify -recovery or remove before rollback:\n\t". join(',', (map $_->name, @dsets))); } #This would never get executed #if($delete_mode && ($delete_mode eq 'full')){ # throw('Cannot perform full_delete on ResultSet '.$rset->name. # " with associated DataSets:\n\t".join(',', (map $_->name, @dsets))); #} #else assume diffs have already been caught by caller (e.g. _validate_rollback_Set in recovery mode) #Actually unsafe to force unless diffs have been checked } $db->get_ResultSetAdaptor->revoke_states($rset); #delete the dbfile_registry entry my $sql = 'DELETE from dbfile_registry where table_name="result_set" and table_id='.$rset->dbID; $self->rollback_table($sql, 'dbfile_registry', undef, $db); if($delete_mode && ($delete_mode eq 'full')){ #delete the result_set and result_set_input entries $self->log( "Deleting ResultSet:\t" . $rset->name ); $sql = 'DELETE rs, rsi from result_set_rs, result_set_input rsi WHERE '. 'rsi.result_set_id=rs.result_set_id AND rs.result_set_id='.$rset->dbID; $self->rollback_table($sql, 'result_set', 'result_set_id', $db); } return; } ## end sub rollback_ResultSet #todo add delete_DataSet (with flag to full_delete supporting_sets? #todo add delete_InputSet =head2 rollback_ArrayChips Arg[1] : ARRAYREF: Bio::EnsEMBL::Funcgen::ArrayChip objects Example : $self->rollback_ArrayChips([$achip1, $achip2]); Description: Deletes all Probes, ProbeSets, ProbeFeatures and states associated with this ArrayChip Returntype : None Exceptions : Throws if ArrayChip not valid and stored Throws if ArrayChips are not of same class Caller : General Status : At risk =cut #This should be tied to a CS id!!! #And analysis dependant? #We may not want to delete alignment by different analyses? #In practise the slice methods ignore analysis_id for this table #So we currently never use this! #So IMPORTED status should be tied to CS id and Analysis id? sub rollback_ArrayChips { my ( $self, $acs, $mode, $force, $keep_xrefs, $no_clean_up, $force_clean_up ) = @_; #no_clean_up and force_clean_up allow analyze/optimize to be skipped until the last rollback #We could get around this by specifying all ArrayChips for all formats at the same time? #Need to implement in RollbackArrays $mode ||= 'probe'; if ( $mode && ( $mode ne 'probe' && $mode ne 'probe_feature' && $mode ne 'ProbeAlign' && $mode ne 'ProbeTranscriptAlign' && $mode ne 'probe2transcript' ) ) { throw( "You have passed an invalid mode argument($mode), you must omit or specify either 'probe2transcript', 'probe', 'ProbeAlign, 'ProbeTranscriptAlign' or 'probe_feature' for all of the Align output" ); } if ( $force && ( $force ne 'force' ) ) { throw( "You have not specified a valid force argument($force), you must specify 'force' or omit" ); } if ( $keep_xrefs && ( $keep_xrefs ne 'keep_xrefs' ) ) { throw( "You have not specified a valid keep_xrefs argument($keep_xrefs), you must specify 'keep_xrefs' or omit" ); } if ($keep_xrefs) { if ( $mode eq 'probe' || $mode eq 'probe2transcript' ) { throw( "You cannot specify 'keep_xrefs' with mode $mode, you can only rollback features e.g. probe_feature, ProbeAlign or ProbeTranscriptAlign" ); } if ($force) { throw( "You cannot 'force' delete the probe2transcript xrefs and 'keep_xrefs' at the same time. Please specify just one." ); } } my ( $adaptor, $db, %classes ); foreach my $ac (@$acs) { $adaptor ||= $ac->adaptor || throw('ArrayChip must have an adaptor'); $db ||= $adaptor->db; $db->is_stored_and_valid( 'Bio::EnsEMBL::Funcgen::ArrayChip', $ac ); if ( !$ac->get_Array->class ) { throw( 'The ArrayChip you are trying to rollback does not have a class attribute' ); } $classes{ $ac->get_Array->class } = undef; #if($class && ($class ne $ac->get_Array->class)){ # throw('You can only rollback_ArrayChips for ArrayChips with the same class'); #} } #This is always the case as we register the association before we set the Import status #Hence the 2nd stage of the import fails as we have an associated ExperimentalChip #We need to make sure the ExperimentalChip and Channel have not been imported!!! warn "NOTE: rollback_ArrayChips. Need to implement ExperimentlChip check, is the problem that ExperimentalChips are registered before ArrayChips imported?"; #Check for dependent ExperimentalChips #if(my @echips = @{$db->get_ExperimentalChipAdaptor->fetch_all_by_ArrayChip($ac)}){ # my %exps; # my $txt = "Experiment\t\t\t\tExperimentalChip Unique IDs\n"; # foreach my $ec(@echips){ # $exps{$ec->get_Experiment->name} ||= ''; # $exps{$ec->get_Experiment->name} .= "\t".$ec->unique_id; # } # map {$txt.= "\t".$_.":".$exps{$_}."\n"} keys %exps; # throw("Cannot rollback ArrayChip:\t".$ac->name. # "\nFound Dependent Experimental Data:\n".$txt); # } my $ac_names = join( ', ', ( map { $_->name } @$acs ) ); my $ac_ids = join( ', ', ( map { $_->dbID } @$acs ) ); $self->log("Rolling back ArrayChips $mode entries:\t$ac_names"); my ( $row_cnt, $probe_join, $sql ); #$ac->adaptor->revoke_states($ac);#This need to be more specific to the type of rollback my $species = $db->species; if ( !$species ) { throw( 'Cannot rollback probe2transcript level xrefs without specifying a species for the DBAdaptor' ); } #Will from registry? this return Homo sapiens? #Or homo_sapiens ( $species = lc($species) ) =~ s/ /_/; my $transc_edb_name = "${species}_core_Transcript"; my $genome_edb_name = "${species}_core_Genome"; #Maybe we want to rollback ProbeAlign and ProbeTranscriptAlign output separately so we #can re-run just one part of the alignment step. #We want this Probe(Transcript)Align rollback available in the environment #So we can do it natively and before we get to the RunnableDB stage, #where we would be trying multiple rollbacks in parallel #Wrapper script? #Or do we keep it simple here and maintain probe_feature wide rollback #And just the ProbeAlign/ProbeTranscriptAlign roll back in the environment? #We can restrict the probe deletes using the ac_id #We should test for other ac_ids using the same probe_id #Then fail unless we have specified force delete #These should be deleted for all other modes but only if force is set? #This may delete xrefs for other ArrayChips #The issues is if we need to specify force for one delete but don't want to delete something else? #force should only be used to delete upto and including the mode specified #no mode equates to probe mode #if no force then we fail if previous levels/modes have xrefs etc... #Let's grab the edb ids first and use them directly, this will avoid table locks on edb #and should also speed query up? if ( $mode eq 'probe2transcript' || $force ) { #Delete ProbeFeature UnmappedObjects $self->log("Deleting probe2transcript ProbeFeature UnmappedObjects"); $sql = "DELETE uo FROM analysis a, unmapped_object uo, probe p, probe_feature pf, external_db e ". "WHERE a.logic_name ='probe2transcript' AND a.analysis_id=uo.analysis_id ". "AND p.probe_id=pf.probe_id and pf.probe_feature_id=uo.ensembl_id AND ". "uo.ensembl_object_type='ProbeFeature' and uo.external_db_id=e.external_db_id ". "AND e.db_name ='${transc_edb_name}' AND p.array_chip_id IN($ac_ids)"; $self->rollback_table( $sql, 'unmapped_object', 'unmapped_object_id', $db, $no_clean_up ); #Delete ProbeFeature Xrefs/DBEntries $self->log("Deleting probe2transcript ProbeFeature Xrefs"); $sql = "DELETE ox FROM xref x, object_xref ox, probe p, probe_feature pf, external_db e ". "WHERE x.external_db_id=e.external_db_id AND e.db_name ='${transc_edb_name}' ". "AND x.xref_id=ox.xref_id AND ox.ensembl_object_type='ProbeFeature' ". "AND ox.ensembl_id=pf.probe_feature_id AND pf.probe_id=p.probe_id AND ". "ox.linkage_annotation!='ProbeTranscriptAlign' AND p.array_chip_id IN($ac_ids)"; $self->rollback_table( $sql, 'object_xref', 'object_xref_id', $db, $no_clean_up ); #Probe/Set specific entries for my $xref_object ( 'Probe', 'ProbeSet' ) { $probe_join = ( $xref_object eq 'ProbeSet' ) ? 'p.probe_set_id' : 'p.probe_id'; #Delete Probe/Set UnmappedObjects $self->log("Deleting probe2transcript $xref_object UnmappedObjects"); $sql = "DELETE uo FROM analysis a, unmapped_object uo, probe p, external_db e ". "WHERE a.logic_name='probe2transcript' AND a.analysis_id=uo.analysis_id AND ". "uo.ensembl_object_type='${xref_object}' AND $probe_join=uo.ensembl_id AND ". "uo.external_db_id=e.external_db_id AND e.db_name='${transc_edb_name}' ". "AND p.array_chip_id IN($ac_ids)"; #.' and edb.db_release="'.$schema_build.'"'; $self->rollback_table( $sql, 'unmapped_object', 'unmapped_object_id', $db, $no_clean_up ); #Delete Probe/Set Xrefs/DBEntries $sql = "DELETE ox FROM xref x, object_xref ox, external_db e, probe p WHERE x.xref_id=ox.xref_id AND e.external_db_id=x.external_db_id AND e.db_name ='${transc_edb_name}' AND ox.ensembl_object_type='${xref_object}' AND ox.ensembl_id=${probe_join} AND p.array_chip_id IN($ac_ids)"; $self->log("Deleting probe2transcript $xref_object xref records"); $self->rollback_table( $sql, 'object_xref', 'object_xref_id', $db, $no_clean_up ); } } ## end if ( $mode eq 'probe2transcript'...) elsif ( !$keep_xrefs ) { #Need to check for existing xrefs if not force #we don't know whether this is on probe or probeset level #This is a little hacky as there's not way we can guarantee this xref will be from probe2transcript #until we get the analysis_id moved from identity_xref to xref #We are also using the Probe/Set Xrefs as a proxy for all other Xrefs and UnmappedObjects #Do we need to set a status here? Would have problem rolling back the states of associated ArrayChips for my $xref_object ( 'Probe', 'ProbeSet' ) { $probe_join = ( $xref_object eq 'ProbeSet' ) ? 'p.probe_set_id' : 'p.probe_id'; $row_cnt = $db->dbc->db_handle->selectrow_array( "SELECT COUNT(*) FROM xref x, object_xref ox, external_db e, probe p WHERE x.xref_id=ox.xref_id AND e.external_db_id=x.external_db_id AND e.db_name ='${transc_edb_name}' and ox.ensembl_object_type='${xref_object}' and ox.ensembl_id=${probe_join} AND p.array_chip_id IN($ac_ids)" ); if ($row_cnt) { throw( "Cannot rollback ArrayChips($ac_names), found $row_cnt $xref_object Xrefs. Pass 'force' argument or 'probe2transcript' mode to delete" ); } else { #$self->log("Found $row_cnt $xref_object Xrefs"); } } } ## end elsif ( !$keep_xrefs ) [ if ( $mode eq 'probe2transcript'...)] #ProbeFeatures inc ProbeTranscriptAlign xrefs if ( $mode ne 'probe2transcript' ) { if ( ( $mode eq 'probe' && $force ) || $mode eq 'probe_feature' || $mode eq 'ProbeAlign' || $mode eq 'ProbeTranscriptAlign' ) { #Should really revoke some state here but we only have IMPORTED #ProbeTranscriptAlign Xref/DBEntries #my (@anal_ids) = @{$db->get_AnalysisAdaptor->generic_fetch("a.module='ProbeAlign'")}; #Grrrr! AnalysisAdaptor is not a standard BaseAdaptor implementation #my @anal_ids = @{$db->dbc->db_handle->selectall_arrayref('select analysis_id from analysis where module like "%ProbeAlign"')}; #@anal_ids = map {$_= "@$_"} @anal_ids; if ( $mode ne 'ProbeAlign' ) { my $lnames = join( ', ', ( map { "'${_}_ProbeTranscriptAlign'" } keys(%classes) ) ); $sql = "DELETE ox from object_xref ox, xref x, probe p, probe_feature pf, external_db e WHERE ox.ensembl_object_type='ProbeFeature' AND ox.linkage_annotation='ProbeTranscriptAlign' AND ox.xref_id=x.xref_id AND e.external_db_id=x.external_db_id and e.db_name='${transc_edb_name}' AND ox.ensembl_id=pf.probe_feature_id AND pf.probe_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; $self->log( "Deleting ProbeFeature Xref/DBEntry records for:\t$lnames"); $self->rollback_table( $sql, 'object_xref', 'object_xref_id', $db, $no_clean_up ); #Can't include uo.type='ProbeTranscriptAlign' in these deletes yet as uo.type is enum'd to xref or probe2transcript #will have to join to analysis and do a like "%ProbeTranscriptAlign" on the the logic name? #or/and ur.summary_description='Promiscuous probe'? $sql = "DELETE uo from unmapped_object uo, probe p, external_db e, analysis a ". "WHERE uo.ensembl_object_type='Probe' AND uo.analysis_id=a.analysis_id ". "AND a.logic_name in (${lnames}) AND e.external_db_id=uo.external_db_id ". "AND e.db_name='${transc_edb_name}' AND uo.ensembl_id=p.probe_id ". "AND p.array_chip_id IN($ac_ids)"; $self->log("Deleting UnmappedObjects for:\t${lnames}"); $self->rollback_table( $sql, 'unmapped_object', 'unmapped_object_id', $db, $no_clean_up ); #Now the actual ProbeFeatures $sql = "DELETE pf from probe_feature pf, probe p, analysis a WHERE a.logic_name in(${lnames}) AND a.analysis_id=pf.analysis_id AND pf.probe_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; $self->log("Deleting ProbeFeatures for:\t${lnames}"); $self->rollback_table( $sql, 'probe_feature', 'probe_feature_id', $db, $no_clean_up ); } ## end if ( $mode ne 'ProbeAlign') if ( $mode ne 'ProbeTranscriptAlign' ) { my $lnames = join( ', ', ( map { "'${_}_ProbeAlign'" } keys(%classes) ) ); $sql = "DELETE uo from unmapped_object uo, probe p, external_db e, analysis a WHERE uo.ensembl_object_type='Probe' AND uo.analysis_id=a.analysis_id AND a.logic_name in (${lnames}) AND e.external_db_id=uo.external_db_id and e.db_name='${genome_edb_name}' AND uo.ensembl_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; $self->log("Deleting UnmappedObjects for:\t${lnames}"); $self->rollback_table( $sql, 'unmapped_object', 'unmapped_object_id', $db, $no_clean_up ); $sql = "DELETE pf from probe_feature pf, probe p, analysis a WHERE a.logic_name in(${lnames}) AND a.analysis_id=pf.analysis_id AND pf.probe_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; $self->log("Deleting ProbeFeatures for:\t${lnames}"); $self->rollback_table( $sql, 'probe_feature', 'probe_feature_id', $db, $no_clean_up ); } } ## end if ( ( $mode eq 'probe'...)) else { #Need to count to see if we can carry on with a unforced probe rollback? #Do we need this level of control here #Can't we assume that if you want probe you also want probe_feature? #Leave for safety, at least until we get the dependant ExperimetnalChip test sorted #What about if we only want to delete one array from an associated set? #This would delete all the features from the rest? $sql = "select count(*) from object_xref ox, xref x, probe p, external_db e WHERE ox.ensembl_object_type='ProbeFeature' AND ox.linkage_annotation='ProbeTranscriptAlign' AND ox.xref_id=x.xref_id AND e.external_db_id=x.external_db_id and e.db_name='${transc_edb_name}' AND ox.ensembl_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; $row_cnt = $db->dbc->db_handle->selectrow_array($sql); if ($row_cnt) { throw( "Cannot rollback ArrayChips($ac_names), found $row_cnt ProbeFeatures. Pass 'force' argument or 'probe_feature' mode to delete" ); } else { $self->log("Found $row_cnt ProbeFeatures"); } } if ( $mode eq 'probe' ) { #Don't need to rollback on a CS as we have no dependant EChips? #Is this true? Should we enforce a 3rd CoordSystem argument, 'all' string we delete all? foreach my $ac (@$acs) { $ac->adaptor->revoke_states($ac) ; #Do we need to change this to revoke specific states? #Current states are only IMPORTED, so not just yet, but we could change this for safety? } #ProbeSets $sql = "DELETE ps from probe p, probe_set ps where p.array_chip_id IN($ac_ids) and p.probe_set_id=ps.probe_set_id"; $self->rollback_table( $sql, 'probe_set', 'probe_set_id', $db, $no_clean_up ); #Probes $sql = "DELETE from probe where array_chip_id IN($ac_ids)"; $self->rollback_table( $sql, 'probe', 'probe_id', $db, $no_clean_up ); } } ## end if ( $mode ne 'probe2transcript') $self->log("Finished $mode roll back for ArrayChip:\t$ac_names"); return; } ## end sub rollback_ArrayChips #This will just fail silently if the reset value #Is less than the true autoinc value #i.e. if there are parallel inserts going on #So we can never assume that the $new_auto_inc will be used sub rollback_table { my ( $self, $sql, $table, $id_field, $db, $no_clean_up, $force_clean_up ) = @_; my $row_cnt; #warn $sql; eval { $row_cnt = $db->dbc->do($sql) }; if ($@) { throw("Failed to rollback table $table using sql:\t$sql\n$@"); } $row_cnt = 0 if $row_cnt eq '0E0'; $self->log("Deleted $row_cnt $table records"); if ( $force_clean_up || ( $row_cnt && !$no_clean_up ) ) { $self->refresh_table( $table, $id_field, $db ); } return; } #Now separated so that we can do this once at the end of a rollback of many Sets sub refresh_table { my ( $self, $table, $id_field, $db ) = @_; #This only works if the new calue is available #i.e. do not need lock for this to be safe $self->reset_table_autoinc( $table, $id_field, $db ) if $id_field; $self->log("Optimizing and Analyzing $table"); $db->dbc->do("optimize table $table") ; #defrag data, sorts indices, updates table stats $db->dbc->do("analyze table $table"); #analyses key distribution return; } sub reset_table_autoinc { my ( $self, $table_name, $autoinc_field, $db ) = @_; if ( !( $table_name && $autoinc_field && $db ) ) { throw( 'You must pass a table_name and an autoinc_field to reset the autoinc value' ); } if ( !( ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor') ) ) { throw('Must pass a valid Bio::EnsEMBL::DBSQL::DBAdaptor'); } #Unsafe to do this in two queries as parallel jobs may add in between select and alter #in fact this needs a table lock to be totally safe #although current ALTER will just fail silently if this happens my $sql = "select $autoinc_field from $table_name order by $autoinc_field desc limit 1"; my ($current_auto_inc) = $db->dbc->db_handle->selectrow_array($sql); my $new_autoinc = ($current_auto_inc) ? ( $current_auto_inc + 1 ) : 1; $sql = "ALTER TABLE $table_name AUTO_INCREMENT=$new_autoinc"; $db->dbc->do($sql); return; } ## end sub reset_table_autoinc =head2 get_core_display_name_by_stable_id Args [1] : Bio::EnsEMBL::DBSQL::DBAdaptor Args [2] : stable ID from core DB. Args [3] : stable feature type e.g. gene, transcript, translation Example : $self->validate_and_store_feature_types; Description: Builds a cache of stable ID to display names. Returntype : string - display name Exceptions : Throws is type is not valid. Caller : General Status : At risk =cut # -------------------------------------------------------------------------------- # Build a cache of ensembl stable ID -> display_name # Return hashref keyed on {$type}{$stable_id} #Need to update cache if we're doing more than one 'type' at a time # as it will never get loaded for the new type! sub get_core_display_name_by_stable_id { my ( $self, $cdb, $stable_id, $type ) = @_; $type = lc($type); if ( $type !~ /(gene|transcript|translation)/ ) { throw( "Cannot get display_name for stable_id $stable_id with type $type" ); } if ( !exists $self->{'display_name_cache'}->{$stable_id} ) { ( $self->{'display_name_cache'}->{$stable_id} ) = $cdb->dbc->db_handle->selectrow_array( "SELECT x.display_label FROM $type t, xref x where t.display_xref_id=x.xref_id and t.stable_id='${stable_id}'" ); } return $self->{'display_name_cache'}->{$stable_id}; } =head2 get_core_stable_id_by_display_name Args [1] : Bio::EnsEMBL::DBSQL::DBAdaptor Args [2] : display name (e.g. from core DB or GNC name) Example : Description: Builds a cache of stable ID to display names. Returntype : string - gene stable ID Exceptions : None Caller : General Status : At risk =cut # -------------------------------------------------------------------------------- # Build a cache of ensembl stable ID -> display_name # Return hashref keyed on {$type}{$stable_id} #Need to update cache if we're doing more than one 'type' at a time # as it will never get loaded for the new type! sub get_core_stable_id_by_display_name { my ( $self, $cdb, $display_name ) = @_; #if($type !~ /(gene|transcript|translation)/){ # throw("Cannot get display_name for stable_id $stable_id with type $type"); # } if ( !exists $self->{'stable_id_cache'}->{$display_name} ) { ( $self->{'stable_id_cache'}->{$display_name} ) = $cdb->dbc->db_handle->selectrow_array( "SELECT g.stable_id FROM gene g, xref x where g.display_xref_id=x.xref_id and and x.display_label='${display_name}'" ); } return $self->{'stable_id_cache'}->{$display_name}; } 1; #This could be a simple sub, if we passed $rollback_modes sub _compare_set_for_rollback { my ($self, $new_set, $stored_set, $set_type, $rollback_level, $slices) = @_; #do we catch undef $stored_set here and return without warning? #add flag for states comparison? my $diffs = $stored_set->compare_to($new_set, undef, undef, undef, 1); #by default this tests nested objects via is_stored and dbID match #1 is skip states flag for data set #delete get_states_diffs as these are never set and will likely #always be different and we handle IMPORTED below delete $diffs->{'get_all_states'} if exists $diffs->{'get_all_states'}; if(%$diffs){ if($set_type eq 'data_set'){ throw('Found differences in specified and stored DataSet '. "support or product FeatureSet, please rectify manually\n". Dumper($diffs)); #When called from define_DataSet should expect no errors as we have already #done the compare and rollback for Feature/ResultSet #We probably have some weird naming issue that has happened with previous #import and we need to handle this manually as it will be unsafe to rollback this data_set #What about addition/change if InputSet!!!!! #This should have been caught by rollback_FeatureSet #Can we be sure of this? #Should we just be calling _validate_rollback_Set here #for safety, no as this is unsafe #if there is a difference, then something has gone wrong and we need to rectify manually! #with the FeatureSet - DataSet association #If this is true the we will probably have failed at the rollback_ResultSet #level, as this may find an associated DataSet, as the rollback_FeatureSet #will likely not have rolled back the correct DataSet #todo Need to make sure the rollback methods are sensitive to this and throw #correctly } elsif($rollback_level < $rollback_modes{$set_type}){ throw("Found $set_type mismatch, please rectify manually or specify ". "-rollback $set_type\n".Dumper($diffs)); } elsif(@$slices){ #Should never have diffs and slices set, this indicates we are #redefining the inputs but only rerunning a subset of the data #There should be no diffs in parallel mode(single slice), as we should #have resolved this in the previous setup/submit analysis! throw("It is unsafe to rollback $set_type with a sub set of slices ". 'please do a full rollback i.e. omit -slices'); } } return; } ### DEPRECATED ### =head2 rollback_InputSet Arg[1] : Bio::EnsEMBL::Funcgen::InputSet Example : $self->rollback_InputSet($eset); Description: Deletes all status entries for this InputSet and it's Subsets Returntype : none Exceptions : Throws if any deletes fails or if db method unavailable Caller : Importers and Parsers Status : At risk =cut #Usage of this is now moot due to removal of IMPORTED style states form InputSet/Subset #todo implement delete_InputSet #revoke states can be called directly if required rather than rollback, as this is essentially #all a rollback method would be doing? #deprecate this with a throw as is not longer support sub rollback_InputSet { #deprecated in v72 my ( $self, $eset, $force_delete, $full_delete ) = @_; throw('rollback_InputSet is now deprecated, please update your code to use '. 'delete_InputSet or use revoke_states directly'); #Need to implement force_delete!!!!!!!!!!!!!!!!!!!!!! #Need to check this is not used in a DataSet/ResultSet my $adaptor = $eset->adaptor || throw('InputSet must have an adaptor'); my $db = $adaptor->db; $db->is_stored_and_valid( 'Bio::EnsEMBL::Funcgen::InputSet', $eset ); $self->log( "Rolling back InputSet:\t" . $eset->name ); #SubSets foreach my $esset ( @{ $eset->get_InputSubsets } ) { $esset->adaptor->revoke_states($esset); } #InputSet $eset->adaptor->revoke_states($eset); return; } =head2 define_and_validate_sets Arg [1] : hash - set constructor parameters: -dbadaptor Bio::EnsEMBL::Funcgen::DBAdaptor -name Data/FeatureSet/ResultSet name to create -feature_type Bio::EnsEMBL::Funcgen::FeatureType -cell_type Bio::EnsEMBL::Funcgen::CellType -analysis FeatureSet Bio::EnsEMBL::Analysis -feature_class e.g. annotated or regulatory -description FeatureSet description -recovery Allows definition of extant sets so long as they match -append Boolean - Forces import on top of previously imported data -rollback Rolls back product feature set. -supporting_sets Complete set of pre-stored supporting or input sets for this DataSet -slices ARRAYREF of Slices to rollback Example : my $dset = $self->define_and_validate_Set(%params); Description: Checks whether set is already in DB based on set name, rolls back features if roll back flag set. Or creates new DataSet and Feature|ResultSet if not present. Returntype : Bio::EnsEMBL::Funcgen::DataSet Exceptions : Throws if DBAdaptor param not valid Caller : Importers and Parsers Status : At risk =cut #This needs to account for >1 type, currently only create feature set or result set #this should actaully be define_and_validate_DataSet #then it can implicitly handle all other sets to # supporting sets are also used as the result_set_inputs! # these should always be the same as the data_set supporting set i.e. input_sets! sub define_and_validate_sets { my $self = shift; #change slice to slices to support multi slice import from InputSet::define_sets #Can't do full rollback in slice mode #This may not be safe in slice mode as we will then have mixed inputs/outputs throw ("define_and_validate_sets is deprecated, please use new define_Sets method"); my ( $name, $anal, $ftype, $ctype, $type, $append, $db, $ssets, $description, $rollback, $recovery, $slices, $display_label ) = rearrange( [ 'NAME', 'ANALYSIS', 'FEATURE_TYPE', 'CELL_TYPE', 'FEATURE_CLASS', 'APPEND', 'DBADAPTOR', 'SUPPORTING_SETS', 'DESCRIPTION', 'ROLLBACK', 'RECOVERY', 'SLICES', 'DISPLAY_LABEL' ], @_ ); #VALIDATE CONFIG HASH #$config_hash ||= {};#default so exists will work without testing #if(keys %{$config_hash}){ # #There is a module to handle config hashes somewhere! # throw('config_hash not yet implemented for define_and_validate_sets'); #my @known_config = ('full_delete');#We never want full delete here as this is a create method! #Can we set vars from has by refs like getopts? #map { # throw("Found unsupported config hash parameter:\t$_") if ! grep(/^${_}$/, @known_config); #} keys %{$config_hash}; # } #define rollback level #extract this to _set_rollback_level($rollback_mode, $feature_class) my $rollback_level = 0; #These should be globally defined so all rollback methods can use them my %valid_rollback_modes = ( product_features => 1, #Just product features and FeatureSet status, what about DataSet status? #full delete does nothing here? sets => 2, #Includes product_features and #deletes supporting_sets entries unless we specify append #revoke all states on Feature/Data/InputSets #Full delete removes Feature/Data/InputSet entries #Never includes ResultSets! supporting_features => 3, #Includes product_feature and sets #Removes all states and supporting features #inc. ResultSet results/ResultFeatures #Full_delete remove supporting set entries #Otherwise just rollback states for affected sets ); if ($rollback) { if ( !exists $valid_rollback_modes{$rollback} ) { #Default to some sensible values $rollback = 'product_features'; #default for FeatureSets #Always want overwrite supporting sets if there is a difference $rollback = 'sets' if ( $type eq 'regulatory' ); $rollback = 'supporting_sets' if ( $type eq 'result' ); warn( "You have not set a valid rollback mode(product_features|sets|supporting_features), defaulting to $rollback for feature class $type\n" ); } $rollback_level = $valid_rollback_modes{$rollback}; } if ( $slices && ( ref($slices) ne 'ARRAY' ) ) { throw( '-slices param must be an ARRAYREF of Bio::EnsEMBL::Slice objects' ); #Rest of validation done in other methods } #But how are we going to resolve the append behaviour when we also want to validate the ssets? #Can't, so append also functions to enable addition in the absence of some or all previous data/esets? #No this is not true, we want to be able to fetch an extant set for import, #we just need to be aware of sset IMPORTED status? #This should be a recovery thing, allow fetch, but validate sets? #Check mandatory params if ( !(ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor') ) ) { throw( 'Must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); } throw('Must provide a -name ') if ( !defined $name ); #Not necessarily, just do rollback then append? #But then we'd potentially have a supporting set associated which has had it's data removed from the feature set. #Generating sets for an ExpSet will always have append set #This could be valid for generically grabing/creating sets for adding new supporting sets e.g. reg build throw('-append and -rollback are mutually exclusive') if $rollback_level && $append; #This will never happen due to previous test? append will always fail? #warn('You are defining a pre-existing FeatureSet without rolling back'. # ' previous data, this could result in data duplication') if $append && ! $rollback_level; #Is this really possible, surely the supporting set will fail to store due to unique key? #Should we warn here about append && recovery? #Aren't these mutually exclusive? #Do we know if we have new data? append should override recovery, or just specifiy append #This will stop the import and highlight the issue to the user #We need to be able to run with both otherwise the import will not work throw( 'Must provide a -feature_class e.g. annotated, external, result or regulatory' ) if ( !defined $type ); #Check for annotated, external, regulatory etc here? #Should never be external as we don't have DataSets for external sets? $db->is_stored_and_valid( 'Bio::EnsEMBL::Funcgen::FeatureType', $ftype ); if ( defined $ctype ) { $db->is_stored_and_valid( 'Bio::EnsEMBL::Funcgen::CellType', $ctype ); } elsif ( $type ne 'regulatory' ) { throw( 'Only Data/FeatureSets with type \'regulatory\' can have an undefined CellType' ); #Coudl extend this to core set by name eq 'RegulatoryFeatures'? } $db->is_stored_and_valid( 'Bio::EnsEMBL::Analysis', $anal ); my $dset_adaptor = $db->get_DataSetAdaptor; my $fset_adaptor = $db->get_FeatureSetAdaptor; my $rset_adaptor = $db->get_ResultSetAdaptor; #DataSet centric definition to enable multiple DataSets #to be generated from the same supporting sets my $dset = $dset_adaptor->fetch_by_name($name); my ( $fset, $rset, @input_sets ); #Validate stored vs passed set data if ( defined $dset ) { $self->log( 'Found Stored DataSet ' . $dset->name ); if ( $type ne 'result' ) { #i.e. annotated #Does this account for regulatory? $fset = $dset->product_FeatureSet; #Here we have the possiblity that a feature_set with a different name may have #been associated with the DataSet if ( defined $fset ) { $self->log( "Found associated product FeatureSet:\t" . $fset->name ); #if(! $clobber && if ( $fset->name ne $name ) { throw( 'Invalid product FeatureSet name (' . $fset->name . ') for DataSet (' . $name . '). Rollback will overwrite the FeatureSet and mismatched name will be retained.' ); #Need to clobber both or give explicit name for datasets or rename dataset??? #Force this throw for now, make this fix manual as we may end up automatically overwriting data } } #This needs to be modified to support InputSets in ResultSets? #Would never have mixed Input/ResultSets so no need #Could potential need to do it for mixed Result/FeatureSets #if we ever use an analysis which uses both set types #check supporting_sets here if defined #We have the problem here of wanting to add ssets to a previously existing dset #we may not know the original sset, or which of the ssets are new #Hence there is a likelihood of a mismatch. #Much of this is replicated in store_udpated sets if ( defined $ssets ) { my @sorted_ssets = sort { $a->dbID <=> $b->dbID } @{$ssets}; my @stored_ssets = sort { $a->dbID <=> $b->dbID } @{ $dset->get_supporting_sets }; my $mismatch = 0; $mismatch = 1 if ( scalar(@sorted_ssets) != scalar(@stored_ssets) ); if ( !$mismatch ) { for my $i ( 0 .. $#stored_ssets ) { if ( $stored_ssets[$i]->dbID != $sorted_ssets[$i]->dbID ) { $mismatch = 1; last; } } } if ($mismatch) { #We're really print this names here which may hide the true cell/feature/anal type differences. my $mismatch = 'There is a (name/type/analysis) mismatch between the supplied supporting_sets and the' . ' supporting_sets in the DB for DataSet ' . $dset->name . "\n\nStored:\n" . join( ', ', ( map { $_->name } @stored_ssets ) ) . "\n\nSupplied supporting_sets:\n" . join( ', ', ( map { $_->name } @sorted_ssets ) ); if ($append) { warn( $mismatch . "\n\nAppending supporting set data to unvalidated supporting sets" ); } elsif ( $rollback_level > 1 ) { #supporting set rollback warn( $mismatch . "\n\nReplacing previously stored supporting sets with newly defined sets\n" ); if ($slices) { warn( "WARNING:\tPerforming supporting_set rollback in slice mode. This may corrupt the supporting_set definition for other slices in this DataSet if they are not re-generated using the same supporting_sets\n" ); } #Remove supporting_set entries #This should be in a rollback_DataSet method #This has moved to DataSetAdaptor::store_update_sets #Reset supporting sets $dset->{'supporting_sets'} = undef; $dset->add_supporting_sets( \@sorted_ssets ); #Move this to last block? #This will currently fail as it test for product_FeatureSet #How do we get around this? Remove IMPORTED status and only throw if fset has IMPORTED status? #warn "pre store sset ".@{$dset->get_supporting_sets}; #($dset) = @{$dset_adaptor->store_updated_sets([$dset], $rollback_level)}; #$dset->adaptor->store_regbuild_meta_strings($dset, $rollback_level) if $type eq 'regulatory'; } ## end elsif ( $rollback_level >... [ if ($append) ]) else { throw($mismatch); } } ## end if ($mismatch) } ## end if ( defined $ssets ) else { warn( "No supporting sets defined, skipping supporting set validation for definition of DataSet:\t" . $name ); } } ## end if ( $type ne 'result') else { #result_features from InputSet #Do we ever pass supporting sets here? #Do we need to test vs stored_sets? #There is the potential for more than one ResultSet to be associated with DataSet #But as we are using the same name, this restricts the number wrt the cardinality #of the name field. i.e. 1 name per analysis/cell_type/feature_type. #This now works slightly differently to the rest of this method as we #need to treat the ResultSet as we are currently treating the FeatureSet below. #However, the use case of this method is for one InputSet giving rise to one ResultSet #Hence just throw if we find more than one or have a name mismatch??? my @stored_sets = @{ $dset->get_supporting_sets }; #THis assumes we will always have supporting sets #and is failing as we have removed this test in DataSet::new #But where are we storing it without the supporting set? if ( scalar(@stored_sets) > 1 ) { throw( 'define_and_validate_sets does not yet support DataSets with multiple supporting ResultSets for result_features' ); } elsif ( !@stored_sets ) { throw( "DataSet($name) does not have any stored supporting sets. These should have been defined when storing the DataSet" ); #Or should we handle this? } $rset = $stored_sets[0]; if ( $rset->set_type ne 'result' ) { throw( "DataSet already contains a supporting set which is not a ResultSet:\t" . $rset->set_type . "\t" . $stored_sets[0]->name ); } elsif ($ssets) { #Do we ever pass supporting sets, test for completeness #Just test we have the same supplied ssets if it is defined if ( scalar(@$ssets) != 1 ) { throw( "ResultFeature data sets currently only support one supporting ResultSet.\nSupproting sets:\t" . join( ', ', ( map { $_->name . '(' . $_->set_type } @$ssets ) ) ); } elsif ( !( $rset->dbID == $ssets->[0]->dbID ) && ( $ssets->[0]->set_type eq 'result' ) ) { throw( 'Supplied supporting set(' . $ssets->[0]->name . ') does not match stored supporting set(' . $rset->name . ')' ); } } @input_sets = @{ $rset->get_InputSets }; } ## end else [ if ( $type ne 'result')] } ## end if ( defined $dset ) if ( $type eq 'result' ) { #Validate the defined InputSets if ( scalar(@$ssets) > 1 ) { throw( "define_and_validate_sets does not yet support multiple InputSets for defining a ResultSet:\t" . $name ); } if ( $ssets->[0]->set_type ne 'input' ) { throw( "To define a ResultSet($name) containing result_features, you must provide and InputSet as a supporting set\nArray based ResultSets(i.e. experimental_chip/channel) are not defined using this method, see specific Import Parsers." ); } #Try and grab the rset just in case it has been orphaned somehow if ( !defined $rset ) { $rset = $rset_adaptor->fetch_all_by_name( $name, $ftype, $ctype, $anal ) ->[0]; #Should only ever be one given all parts of unique key @input_sets = @{ $rset->get_InputSets } if $rset; } if ( defined $rset ) { #Validate stored InputSets if ( scalar(@input_sets) != scalar(@$ssets) ) { throw( 'Found mismatch between number of previously stored InputSets(' . scalar(@input_sets) . ') and defined InputSets(' . scalar(@$ssets) . '). You must provide a complete list of InputSets to define your ResultSet.' ); } if ( $input_sets[0]->dbID != $ssets->[0]->dbID ) { throw( 'Found dbID mismatch between previously stored InputSet(' . $input_sets[0]->name . ') and define InputSet(' . $ssets->[0]->name . ')' ); } #rollback ResultSet/InputSet here? if ( $rollback_level > 2 ) { warn "rollback not yet fully implemented for Result/InputSets"; #Does this need to be by slice? #What about states if we are running in parallel? if ($slices) { throw('rollback_ResultSet not longer support slices'); map { $self->rollback_ResultSet( $rset, $rollback, $_ ) } @$slices; } else { $self->rollback_ResultSet( $rset, $rollback ); } } } ## end if ( defined $rset ) else { #define ResultSet ($rset) = @{ $rset_adaptor->store( Bio::EnsEMBL::Funcgen::ResultSet->new( -name => $name, -feature_type => $ftype, -cell_type => $ctype, -table_name => 'input_set', -table_id => $ssets->[0]->dbID, -analysis => $anal ) ) }; } } ## end if ( $type eq 'result') else { #annotated/regulatory/external i.e. FeatureSet #Try and grab the fset just in case it has been orphaned somehow if ( !defined $fset ) { $fset = $fset_adaptor->fetch_by_name($name); if ( defined $fset ) { #Now we need to test whether it is attached to a dset #Will be incorrect dset if it is as we couldn't get it before #else we test the types and rollback $self->log( "Found stored orphan FeatureSet:\t" . $fset->name ); my $stored_dset = $dset_adaptor->fetch_by_product_FeatureSet($fset); if ( defined $stored_dset ) { throw( 'Found FeatureSet(' . $name . ') associated with incorrect DataSet(' . $stored_dset->name . ").\nTry using another -name in the set parameters hash" ); } } } #Rollback or create FeatureSet if ( defined $fset ) { if ($rollback_level) { #Don't check for IMPORTED here as we want to rollback anyway #Not forcing delete here as this may be used as a supporting set itself. $self->rollback_FeatureSet( $fset, undef, $slices ); } elsif ( $append || $recovery ) { #This is only true if we have an sset mismatch #Do we need to revoke IMPORTED here too? #This behaves differently dependant on the supporting set. #InputSet status refers to loading in FeatureSet, where as ResultSet status refers to loading into result table #So we really want to revoke it #But this leaves us vulnerable to losing data if the import crashes after this point #because we have no way of assesing which is complete data and which is incomplete data #within a feature set. #This means we need a status on supporting_set, not InputSet or ResultSet #as this has to be in the context of a dataset. #Grrr, this means we need a SupportingSet class which simply wraps the InputSet/ResultSet #We also need a single dbID for the supporting_set table #Which means we will have to do some wierdity with the normal dbID implementation #i.e. Have supporting_set_id, so we can still access all the normal dbID method for the given Set class #This will have to be hardcoded into the state methods #Also will need to specify when we want to store as supporting_status or normal set status. #This is an awful lot to protect against vulnerability #Also as there easy way to track what features came from which supporting set #There isn't currently a viable way to rollback, hence will have to redo the whole set. #Maybe we can enforce this by procedure? #By simply not associating the supporting set until it has been loaded into the feature set? #This may cause even more tracking problems #Right then, simply warn and do not revoke feature_set IMPORTED to protect old data? #Parsers should identify supporting_sets(InputSets) which exist but do not have IMPORTED #status and fail, specifying -recover which will rollback_FeatureSet which will revoke the IMPORTED status #This can mean a failed import can leave a partially imported feature set with the IMPORTED status!!! #We just need to handle InputSets and ResultSets differently. #In parsers or here? #Probably best in the parsers as this is where the states are set. #Should we throw here for ResultSet? #Force rollback of FeatureSet first or create new one? #And throw for InputSet? #This again comes back to whether we will ever have more than one file #for a give InputSet, currently not. $self->log( "WARNING\t::\tAdding data to a extant FeatureSet:\t" . $fset->name ); } ## end elsif ( $append || $recovery) [ if ($rollback_level) ] else { throw( 'Found extant FeatureSet ' . $fset->name . '. Maybe you want to specify the rollback, append or recovery parameter or roll back the FeatureSet separately?' ); } } ## end if ( defined $fset ) else { #create a new one $self->log( "Creating new FeatureSet:\t" . $name ); $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new( -name => $name, -feature_type => $ftype, -cell_type => $ctype, -analysis => $anal, -feature_class => $type, -description => $description, -display_label => $display_label, ); ($fset) = @{ $fset_adaptor->store($fset) }; } } ## end else [ if ( $type eq 'result')] #Create/Update the DataSet if ( defined $dset ) { #Could do these updates above? #But delayed to reduce redundancy if ( $type ne 'result' ) { if ( !defined $dset->product_FeatureSet ) { $self->log( "Updating DataSet with new product FeatureSet:\t" . $fset->name ); $dset->product_FeatureSet($fset); } $dset = $dset_adaptor->store_updated_sets( [$dset], $rollback_level ) ->[0]; #This cannot store the focus sets as we don't know which are which yet #Only the script knows this # $dset->adaptor->store_regbuild_meta_strings($dset, $rollback_level) if $type eq 'regulatory'; } else { #We may have the case where we have a DataSet(with a FeatureSet) but no ResultSet #i.e. Load result_features after peak calls #So update dset with ResultSet if ( !@{ $dset->get_supporting_sets } ) { $self->log( "Updating DataSet with new ResultSet:\t" . $rset->name ); $dset->add_supporting_sets( [$rset] ); $dset = $dset_adaptor->store_updated_sets( [$dset], $rollback_level ) ->[0]; } } } ## end if ( defined $dset ) else { $self->log( "Creating new ${type}_feature DataSet:\t" . $name ); if ( $type ne 'result' ) { ($dset) = @{ $dset_adaptor->store( Bio::EnsEMBL::Funcgen::DataSet->new( -name => $name, -feature_set => $fset, -supporting_sets => $ssets, ) ) }; #$dset->adaptor->store_regbuild_meta_strings($dset, $rollback_level) if $type eq 'regulatory'; } else { warn "creating dataset $name with supporting set $rset"; ($dset) = @{ $dset_adaptor->store( Bio::EnsEMBL::Funcgen::DataSet->new( -name => $name, -supporting_sets => [$rset], ) ) }; } } return $dset; } ## end sub define_and_validate_sets #### DEPRECATED #### sub debug_hash { throw('DEPREACATED: Please use debug instead'); }