=pod

=head1 NAME

Bio::EnsEMBL::Funcgen::Hive::DefineSets

=head1 DESCRIPTION

Defines or fetches the DataSet, FeatureSet and ResultSet for an Experiment and
InputSet/InputSubsets which are assumed to have been registered previously.

=cut

package Bio::EnsEMBL::Funcgen::Hive::DefineSets;

use base ('Bio::EnsEMBL::Funcgen::Hive::Base'); #? #Bio::EnsEMBL::Funcgen::Hive(::Config)
#We don't need to discriminate between Runnables and RunnableDBs any more,
#just name the modules accordingly!

use warnings;
use strict;

use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw( strip_param_args generate_slices_from_names
                                               strip_param_flags run_system_cmd );
use Bio::EnsEMBL::Utils::Exception         qw( throw warning stack_trace_dump );
use Bio::EnsEMBL::Funcgen::Importer;
#use Data::Dumper;

#Global values for the Helper... maybe pass as parameters?
$main::_debug_level = 0;
$main::_tee         = 0;
$main::_no_log      = 1;

#We assume here that the Experiment and InputSet/InputSubsets have all been registered
#previously, and now we simply want to define/fetch the data, feature and result sets
#based on these data.
#Do we want to add support here for adding a collection result set to an existing data
#set which has no result set, and vice versa wrt the peak set? There are naming issues
#here, as the peak set will always have the analysis in its name.
#Do we actually need to create a result_set-only data set?

#todo -slice_import_status?

sub fetch_input {   # fetch parameters...
  my $self = shift @_;

  #Why is this needed? Can we set this in the config as the src_root?
  if(! defined $ENV{EFG_SRC}){
    throw('The EFG_SRC environment variable must be defined');
  }

  $self->param('name',       $self->param('input_set'));
  $self->param('output_dir', $self->param('output_dir').'/'.$self->param('input_set'));
  $ENV{EFG_DATA} = $self->param('output_dir');
  #Either test $ENV{EFG_DATA} or use a changed version of Importer.pm
  #Are these ENV vars required by the collection code?

  $self->set_Importer_from_params;

  return;
}


sub run {   # Check parameters and do appropriate database/file operations...
  my $self = shift @_;
  my $Imp  = $self->param('importer');

  #This needs to call a generic method which will use optional command line filters
  #(ftype, ctype, project, name etc.) and poll the input_set_tracking table, based on
  #the expectations of this analysis and the status set for a given input_set.
  #We need to handle some form of rollback here, such that we can fetch input_ids and
  #roll back an IMPORTED analysis. Rollback will fail if there are dependencies which
  #already exist (IMPORTED or not).
  #This method can be re-used by other confs to define their inputs, if we don't
  #dataflow between conf-linking analyses.

  #Then, for each input_set_id, we fetch the input_set and define_DataSet based on it.
  #input_set_ids is assumed here to be flowed in as a job parameter.

  foreach my $input_set_id (@{$self->param('input_set_ids')}){
    #This does not allow us to create just the ResultSet; we should handle that here.
    #Do all of these come from the importer? Can we start to separate them out a little?

    #Skeleton of the intended call, listing the named parameters define_DataSet expects;
    #most of the values are still to be defined:
    #$Imp->define_DataSet
    #  (
    #   -NAME                 => $set_name,
    #   -DESCRIPTION          => ...,
    #   -FEATURE_CLASS        => ...,
    #   -DISPLAY_LABEL        => ...,
    #   -FEATURE_SET_ANALYSIS => ...,
    #   -RESULT_SET_ANALYSIS  => ...,
    #   -RESULT_SET_MODE      => ...,
    #   -DBADAPTOR            => ...,
    #   -ROLLBACK             => ...,
    #   -SLICES               => ...,
    #   -SUPPORTING_SETS      => ...,
    #   -FEATURE_TYPE         => ...,
    #   -CELL_TYPE            => ...,
    #  );

    #Dataflow here, as we now use data_set_ids rather than input_set_ids.
  }

  return 1;
}


sub write_output {  # Create the relevant jobs
  my $self = shift @_;

  #No explicit dataflow is needed here, as it will be done via branch 1.

  return;
}
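
#A minimal sketch (not in the original module) of the per-set dataflow mentioned in
#run() above, assuming define_DataSet returns stored DataSets and that downstream
#analyses read a 'dset_id' parameter. The helper name, the parameter name and the
#branch number (2, the conventional eHive fan-out branch) are assumptions, not
#confirmed by this module.

sub _dataflow_data_set_ids {
  my ($self, @data_sets) = @_;

  foreach my $dset (@data_sets){
    #dataflow_output_id is the standard Bio::EnsEMBL::Hive::Process method
    $self->dataflow_output_id({dset_id => $dset->dbID}, 2);
  }

  return;
}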

#Here we need to handle pre-existing data and rollback!
#Or do we just do this separately for now? Would we need to use compare methods here?

#We will have pre-defined InputSets here.
#What will the input parameters actually be? Built from dir names, manually specified,
#or picked up from statuses in the tracking table? We could then specify filters in
#conjunction with tracking states to run subsets. This would allow us to pre-configure
#the tracking DB, but would require the status of the input_set to be updated, else we
#would risk re-running a job that has already been run. Is this duplication of states
#between the hive and the tracking DB sensible?

#This should really run as one job, rather than a job for each set?
#It should take filter arguments to fetch the required input_sets based on status, or a
#list of input_set names. This in turn will create the rest of the ids to flow on to
#the other analyses.

#What about manual creation of these input_ids, if we want to re-run just one analysis
#(peaks/signal)? Simply re-run this analysis, in recovery mode, with the appropriate
#config, such that it dataflows correctly. We need to flow the IDs explicitly, as we
#don't have the correct ids at this point.

#This is not possible for all the other pipelines, as they won't share this analysis.
#But this is the utility of the tracking status: it will allow us to re-generate the
#input_ids, regardless of whether we are re-running/overwriting.

#Do we want to roll back separately, or allow the pipeline to do this?
#Probably allow the pipeline to do it, based on the rollback_level! The rollback level
#can be set by an analysis, as it will be defined by the context of that analysis.
#If we don't update the status of the input_set, then we can use the name to identify
#which sets have been IMPORTED. Is this easy? Will this give us tracking of all aspects
#of the pipeline? MotifFeature association on FeatureSets?

#Rollback:
#Should we allow the pipeline to perform rollback of an entire IMPORTED data set, or
#just the single analysis which we are re-running, i.e. features for a peak analysis?
#What about the case where the collections have fallen over? Currently we can't roll
#back a result_set which has a data_set. Is this where we want to recover, i.e. allow
#re-use of an existing result_set (with an associated data_set)? How does the rollback
#for this currently work?

#We need a -recover option, which will simply overwrite existing feature data, given
#that the set definition is unchanged. This needs to unset the IMPORTED status.
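
#A minimal sketch (not in the original module) of the status-based input selection
#discussed above. fetch_all() and has_status() are standard adaptor/Storable methods,
#but polling the input_set_tracking table directly would likely replace this naive
#fetch-all-and-filter approach; the helper name is hypothetical.

sub _fetch_input_sets_by_status {
  my ($self, $status) = @_;

  my $isa = $self->_efgdba->get_InputSetAdaptor;

  #Filter the InputSets on the given status, e.g. 'IMPORTED'
  return [ grep { $_->has_status($status) } @{$isa->fetch_all} ];
}
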
sub define_Sets {
  my ($self, $analysis, $input_subset, $fset_name) = @_;

  #Todo: make this more generic and accept multiple input_subsets.
  #Also maybe pass the parameters as a hash list...
  #Aren't the vars above set in params and available via *private* methods?
  #i.e. $self->_analysis(), $self->_input_file(), $self->_feature_set_name()

  #Global parameters set in Funcgen->fetch_input
  my $efgdba       = $self->_efgdba();
  my $set_name     = $self->_set_name();
  my $group        = $self->_group();
  my $cell_type    = $self->_cell_type();
  my $feature_type = $self->_feature_type();

  my $iset_name = $set_name;
  my $dset_name = $fset_name;

  #Set experiment: reuse if it already exists? (This comes from result sets)
  my $ea  = $efgdba->get_ExperimentAdaptor;
  my $exp = $ea->fetch_by_name($set_name);

  my @date = (localtime)[5, 4, 3];
  $date[0] += 1900;
  $date[1]++;

  if(! defined $exp){
    #Group needs to be set manually, like CellType and FeatureType.
    #Do not create the Group on the fly here, as it will cause concurrency issues...

    $exp = Bio::EnsEMBL::Funcgen::Experiment->new
      (
       -NAME                => $set_name,
       -EXPERIMENTAL_GROUP  => $group,
       -DATE                => join('-', @date),
       -PRIMARY_DESIGN_TYPE => 'binding_site_identification',
       -ADAPTOR             => $ea,
      );

    ($exp) = @{$ea->store($exp)};
  }

  throw("Can't create experiment $set_name") unless $exp;

  my $isa  = $efgdba->get_InputSetAdaptor();
  my $iset = $isa->fetch_by_name($iset_name);

  if(! defined $iset){

    $iset = Bio::EnsEMBL::Funcgen::InputSet->new
      (
       -name          => $iset_name,
       -experiment    => $exp,
       -feature_type  => $feature_type,
       -cell_type     => $cell_type,
       -vendor        => 'SOLEXA',
       -format        => 'SEQUENCING',
       -feature_class => 'result',
       #Analysis is not being used??
       #-analysis      => $self->feature_analysis,
      );

    warn "Storing new InputSet:\t$iset_name\n";
    ($iset) = @{$isa->store($iset)};
    $iset->add_new_subset($input_subset);
    $iset->adaptor->store_InputSubsets($iset->get_InputSubsets);
  }
  else{
    #We only expect one subset here (? why??)...
    #Shouldn't we also be adding the control file when one is used? But that is
    #SWEmbl-specific... And it should be the same file name...
    #Maybe do some file checking here???
    warn "InputSet already exists:\t$iset_name\n";

    my @issets = @{$iset->get_InputSubsets};

    #if(scalar(@issets) > 1){
    #  throw("InputSet $iset_name has more than one InputSubset:\t".join("\t", (map $_->name, @issets)));
    #}
    #elsif((scalar(@issets) == 1) && ($issets[0]->name ne $self->param('input_file'))){
    #  throw("InputSet $iset_name already has an InputSubset(".$issets[0]->name.") which does not match ".$self->param('input_file'));
    #}
    #elsif(scalar(@issets) == 0){ #we can just add this InputSubset
    #  $iset->add_new_subset($self->input_id);
    #  $iset->adaptor->store_InputSubsets($iset->get_InputSubsets);
    #}

    if(scalar(@issets) == 0){
      #We can just add this InputSubset. Add an extra 'input:' as a prefix?
      $iset->add_new_subset($input_subset);
      $iset->adaptor->store_InputSubsets($iset->get_InputSubsets);
    }
    else{
      #warn("Need to uncomment this section!! - it was commented just for testing purposes!!");
      #We just need to check whether our file(s) is(are) already here...

      if(! $iset->get_subset_by_name($input_subset)){
        #throw("InputSet $iset_name has InputSubsets(".join("\t", (map $_->name, @issets)).") which do not match ".$input_subset);
        #warn("InputSet $iset_name has InputSubsets(".join("\t", (map $_->name, @issets)).") which do not match ".$input_subset);
      }
    }
  }

  my $fsa  = $efgdba->get_FeatureSetAdaptor();
  my $fset = $fsa->fetch_by_name($fset_name);

  if(! defined $fset){

    $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new
      (
       -analysis      => $analysis,
       -feature_type  => $feature_type,
       -cell_type     => $cell_type,
       -name          => $fset_name,
       -feature_class => 'annotated',
       -experiment_id => $exp->dbID,
       #The adaptor is needed to store!
       -adaptor       => $fsa,
      );

    warn "Storing new FeatureSet:\t$fset_name\n";
    ($fset) = @{$fsa->store($fset)};
  }
  else{
    warn "FeatureSet already exists:\t$fset_name\n";

    if(@{$efgdba->get_AnnotatedFeatureAdaptor->fetch_all_by_FeatureSets([$fset])}){
      throw("FeatureSet $fset_name already contains data. Please rollback before rerunning");
    }
  }
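
  #A commented sketch (not in the original code) of the -recover path discussed in the
  #notes above, i.e. overwriting existing feature data when the set definition is
  #unchanged. The 'recover' param and the Helper rollback_FeatureSet call are
  #assumptions here, not confirmed API usage:
  #
  #if($self->param('recover')){
  #  require Bio::EnsEMBL::Funcgen::Utils::Helper;
  #  Bio::EnsEMBL::Funcgen::Utils::Helper->new->rollback_FeatureSet($fset);
  #  #This should also unset the IMPORTED status
  #}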
  my $dsa  = $efgdba->get_DataSetAdaptor;
  my $dset = $dsa->fetch_by_name($dset_name);

  if(! defined $dset){

    $dset = Bio::EnsEMBL::Funcgen::DataSet->new
      (
       -SUPPORTING_SETS     => [$iset],
       -FEATURE_SET         => $fset,
       -DISPLAYABLE         => 1,
       -NAME                => $dset_name,
       -SUPPORTING_SET_TYPE => 'input',
      );

    warn "Storing new DataSet:\t$dset_name\n";
    ($dset) = @{$dsa->store($dset)};
  }
  else{
    warn "DataSet already exists:\t$dset_name\n";

    #We need to check whether the InputSets and supporting_sets are the same,
    #and possibly add the InputSet to the supporting_sets.
    my $ssets = $dset->get_supporting_sets();

    my %ssets_dbIDs = ();
    map { $ssets_dbIDs{$_->dbID} = '' } @{$ssets};
    $dset->add_supporting_sets([$iset]) if ! exists $ssets_dbIDs{$iset->dbID};
  }

  return $dset;
}

1;
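
#A minimal sketch (not part of this module) of how this runnable might be wired into an
#eHive PipeConfig pipeline_analyses list; the logic_name, parameters and downstream
#analysis name are assumptions:
#
#  {
#   -logic_name => 'DefineSets',
#   -module     => 'Bio::EnsEMBL::Funcgen::Hive::DefineSets',
#   -parameters => { output_dir => $self->o('output_dir') },
#   -flow_into  => { 2 => ['RunPeaks'] },  #hypothetical downstream analysis
#  },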