=pod

=head1 NAME

    Bio::EnsEMBL::Funcgen::Hive::Config::Base

=head1 SYNOPSIS

    # NOTE: the examples and description below refer to the Alignment pipeline
    # config (Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf), whereas this
    # file declares package Bio::EnsEMBL::Funcgen::Hive::Config::Base.
    # TODO: confirm and update this documentation for the base config.

    # TODO: this could be easily merged with the Peaks pipeline...
    # TODO: allow subfolders which will represent replicates...
    # Allow semaphores so jobs can be run truly in parallel (see SemaStart and SemaLongMult_conf)

    # Example 1: specifying only the mandatory options (initial params are taken from defaults)
    init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf -password <mypass>

    # Example 2: specifying the mandatory options as well as setting initial params:
    init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf -password <mypass> -p1name p1value -p2name p2value

    # Example 3: do not re-create the database, just load more tasks into an existing one:
    init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf -job_topup -password <mypass> -p1name p1value -p2name p2value

=head1 DESCRIPTION

    This is the Config file for the Alignment Pipeline

    Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here.

    The Alignment pipeline consists of several "analyses":
        * SetupAlignmentPipeline verifies the existence of the files and creates alignment jobs ...
        * RunAlignment makes the alignment...
        * WrapUpAlignment merges the alignments, some QC and fills in the data tracking db

    Please see the implementation details in Runnable modules themselves.

=head1 CONTACT

    Please contact ensembl-dev@ebi.ac.uk mailing list with questions/suggestions.

=cut

package Bio::EnsEMBL::Funcgen::Hive::Config::Base;

use strict;
use warnings;
use Data::Dumper;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');
# All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly

=head2 default_options

    Description : Implements default_options() interface method of
                  Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
=pod

    Caller      : DependantOptions::process_options (init_pipeline.pl)

=cut

# Author's notes on how these options behave:
#
# These are used by init_pipeline.pl and, according to the docs, should
# include the pipeline_db spec.
# Anything set (e.g. test) in default_options will be accessible in other
# methods here (e.g. pipeline_wide_parameters, pipeline_create_commands) as
# $self->o('test'), unless specified on the cmdline, in which case that will
# be the value of $self->o('test').
# This is basically a mechanism to remove the need for doing something like
# this for defaults in pipeline_wide_parameters:
#     test => $self->o('test') || default_value
# or:
#     test => (defined $self->o('test')) ? $self->o('test').'/ing' : '/default/ing'
# $self->o also captures undefined cmdline params, if they have not been set
# in defaults.
# There seems to be no clear reason for not setting params in here, if we
# never want to set a hardcoded default for them which we might want to
# over-write with a specific cmdline opt in pipeline_wide_parameters.
# It looks like it is possible to refer to a previously defined option within
# the same hash.

sub default_options {
  my ($self) = @_;

  return {
    # %{$self->SUPER::default_options} is deliberately NOT merged in for Base,
    # as we don't want any generic defs in there, including the default
    # ensembl_cvs_root_dir.
    ensembl_cvs_root_dir => $ENV{'SRC'},
    bin_dir              => '/software/ensembl/funcgen',
    pipeline_name        => $self->o('pipeline_name'),    # Used for job names

    pipeline_db => {
      -host   => $self->o('host'),
      -port   => $self->o('port'),
      -user   => $self->o('user'),
      -pass   => $self->o('pass'),
      # todo deal with this in the env and don't corrupt pipeline_name;
      # add $ENV{USER}?
      -dbname => $self->o('pipeline_name'),
    },

    # This could be inferred from the db, but it's probably safer(?) to pass
    # as parameter...
    species => $self->o('species'),

    # May pass this to input_id... to allow for files of different assemblies
    # in the same pipeline run.
    assembly => $self->o('assembly'),

    # Add defaults for values for dir structure here
    data_dir => $self->o('data_dir'),    # '/lustre/scratch109/ensembl/funcgen',

    # The following should all be able to be redefined by the namesake
    # cmdline opts in pipeline_wide_parameters.
    # These can't be modified by each conf: as they will be in the same DB,
    # things will get messy.
    # todo These should be used as root dirs and specific confs should define
    # new keys i.e. 'confname_data_dir'

    # root_output_dir is needed for non-DB specific analyses e.g. alignments
    root_output_dir => $self->o('data_dir').'/output/',
    db_output_dir   => $self->o('data_dir').'/output/'.$self->o('dbname'),
    hive_output_dir => $self->o('data_dir').'/output/'.$self->o('pipeline_name').'/hive_debug',

    use_tracking_db => 1,
  };
}


=head2 resource_classes

    Description : Implements resource_classes() interface method of
                  Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the LSF resource classes available.
                  The '*_monitored' classes build their LSF -R string from the
                  LSF_RESOURCE_HOST environment variable. If that variable is
                  unset or empty, a warning is issued and those classes fall
                  back to the equivalent specification without the
                  LSF_RESOURCE_HOST clauses, rather than producing a malformed
                  -R string.
    Returntype  : Hashref of resource class name => { 'LSF' => option string }

=cut

sub resource_classes {
  my ($self) = @_;

  # Read the environment once; treat undef and the empty string alike, as
  # either would otherwise yield "select[<1000]" / "rusage[=10:...]".
  my $load_rsrc    = $ENV{LSF_RESOURCE_HOST};
  my $is_monitored = (defined $load_rsrc && $load_rsrc ne '') ? 1 : 0;

  my $high_mem_4g = '-M4000000 -R"select[mem>4000] rusage[mem=4000]"';

  my ($normal_monitored, $long_monitored, $long_monitored_high_mem);

  if ($is_monitored) {
    my $select_load = $load_rsrc.'<1000';
    my $rusage_load = $load_rsrc.'=10:duration=10:decay=1';

    # NB: the leading space on normal_monitored is retained from the
    # original definition.
    $normal_monitored        = ' -R"select['.$select_load.'] rusage['.$rusage_load.']"';
    $long_monitored          = '-q long -R"select['.$select_load.'] rusage['.$rusage_load.']"';
    $long_monitored_high_mem = '-q long -M4000000 -R"select['.$select_load.' && mem>4000]'.
                               ' rusage['.$rusage_load.',mem=4000]"';
  }
  else {
    warn "LSF_RESOURCE_HOST is not set in the environment: the '*_monitored' ".
         "resource classes will be defined without the LSF_RESOURCE_HOST ".
         "select/rusage constraint\n";

    $normal_monitored        = '';
    $long_monitored          = '-q long';
    $long_monitored_high_mem = '-q long '.$high_mem_4g;
  }

  return {
    # Use this section when running on Sanger Farm
    default                 => { 'LSF' => '' },
    urgent                  => { 'LSF' => '-q yesterday' },
    normal_monitored        => { 'LSF' => $normal_monitored },
    normal_high_mem         => { 'LSF' => ' -M5000000 -R"select[mem>5000] rusage[mem=5000]"' },
    long_monitored          => { 'LSF' => $long_monitored },
    long_high_mem           => { 'LSF' => '-q long '.$high_mem_4g },
    long_monitored_high_mem => { 'LSF' => $long_monitored_high_mem },

    # Use this section when running on EBI cluster???
    # 0 => { -desc => 'default',            'LSF' => '' },
    # 1 => { -desc => 'long_high_mem',      'LSF' => '-M5000 -R"select[mem>5000] rusage[mem=5000]"' },
    # 2 => { -desc => 'normal_high_memory', 'LSF' => '-M5000 -R"select[mem>5000] rusage[mem=5000]"' },
  };
}


=head2 pipeline_wide_parameters

    Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs.
                  The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically).
                  Please see existing PipeConfig modules for examples.
    Caller      : HiveGeneric_conf::run (init_pipeline.pl)

=cut

# $self->o here will take the cmdline option, else default to what was set in
# default_options.
# Will init_pipeline with analysis_topup use previously stored opts, so we
# don't have to define them again?

sub pipeline_wide_parameters {
  my ($self) = @_;

  return {
    %{$self->SUPER::pipeline_wide_parameters},    # inheriting database and hive tables creation

    # Allow over-writing defaults set above
    bin_dir         => $self->o('bin_dir'),
    root_output_dir => $self->o('root_output_dir'),
    db_output_dir   => $self->o('db_output_dir'),
    hive_output_dir => $self->o('hive_output_dir'),
    use_tracking_db => $self->o('use_tracking_db'),
  };
}


# Default pipeline_create_commands is defined in HiveGeneric_conf.pm.
# Specify more in sub-class conf if required, but always include
# @{$self->SUPER::pipeline_create_commands},

1;