=head1 LICENSE

 Copyright (c) 1999-2013 The European Bioinformatics Institute and
 Genome Research Limited.  All rights reserved.

 This software is distributed under a modified Apache license.
 For license details, please see

   http://www.ensembl.org/info/about/legal/code_licence.html

=head1 CONTACT

 Please email comments or questions to the public Ensembl
 developers list at <dev@ensembl.org>.

 Questions may also be sent to the Ensembl help desk at
 <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Variation::Pipeline::VariantQC::VariantQC_conf

=head1 DESCRIPTION

Configuration module for variant QC eHive process

=cut

package Bio::EnsEMBL::Variation::Pipeline::VariantQC::VariantQC_conf;

use strict;
use warnings;

use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');


# default_options
#
# Returns a hash reference of the default values for every option of the
# pipeline.  Options without a default here ('species', 'hive_db_password')
# are read with $self->o(...) and therefore have to be supplied by the caller
# (e.g. on the init_pipeline.pl command line).
sub default_options {
    my ($self) = @_;

    # the hash returned from this function is used to configure the pipeline, you can supply
    # any of these options on the command line to override these default values

    # you shouldn't need to edit anything in this file other than these values, if you
    # find you do need to then we should probably make it an option here, contact
    # the variation team to discuss this - patches are welcome!

    # Login name of the effective user, used to build per-user paths.
    # getpwuid($>) performs the lookup that `whoami` does, without forking a
    # shell; it is wrapped in eval because getpwuid is not implemented on
    # every platform.  Falls back to $USER and finally to an empty string so
    # that building the option hash never dies.
    my $login = eval { getpwuid($>) };
    $login = $ENV{'USER'} unless defined $login && length $login;
    $login = ''           unless defined $login;

    # Prefix of the hive database name: $USER when it is set, otherwise the
    # login resolved above (avoids an "uninitialized value" warning and a
    # database name that starts with '_').
    my $db_user = defined $ENV{'USER'} ? $ENV{'USER'} : $login;

    return {

        # general pipeline options that you should change to suit your environment
        hive_use_triggers       => 0,
        compile_module_once     => 1,

        ## check why jobs failed before re-submitting for analysis
        retry_throwing_job      => 0,

        # the location of your checkout of the ensembl API (the hive looks for SQL files here)
        ensembl_cvs_root_dir    => $ENV{'HOME'}.'/EBI/bin/HEAD',

        # a name for your pipeline (will also be used in the name of the hive database)
        pipeline_name           => 'variation_qc',

        # a directory to keep hive output files and your registry file, you should
        # create this if it doesn't exist
        pipeline_dir            => '/lustre/scratch110/ensembl/' . $login . '/'.$self->o('pipeline_name') . '/'. $self->o('species'),

        # a directory where hive workers will dump STDOUT and STDERR for their jobs
        # if you use lots of workers this directory can get quite big, so it's
        # a good idea to keep it on lustre, or some other place where you have a
        # healthy quota!
        output_dir              => $self->o('pipeline_dir').'/hive_output',

        # a standard ensembl registry file containing connection parameters
        # for your target database(s) (and also possibly aliases for your species
        # of interest that you can then supply to init_pipeline.pl with the -species
        # option)
        reg_file                => $self->o('pipeline_dir').'/ensembl.registry',

        ## number of *variants* handled per batch
        qc_batch_size           => 1000,
        unmapped_batch_size     => 100000,  ## quicker check can be binned in bigger chunks

        # Options to change for failure recovery

        ## only data with variation_id >= start_at_variation_id will be imported
        start_at_variation_id   => 1,       ## this can be changed for failure recovery

        ## working tables will not be created when create_working_tables is set to 0
        create_working_tables   => 1,

        # create tmp_map_weight table unless this set to 0
        create_map_table        => 1,

        # configuration for the various resource options used in the pipeline
        # EBI farm users should either change these here, or override them on the
        # command line to suit the EBI farm. The names of each option hopefully
        # reflect their usage, but you may want to change the details (memory
        # requirements, queue parameters etc.) to suit your own data

        # NOTE(review): the original trailing comment on default_lsf_options
        # read "upped to 4G from 2G for mouse", but the option string below
        # still uses 2000 - confirm which value is intended.
        default_lsf_options     => '-R"select[mem>2000] rusage[mem=2000]" -M2000000',
        urgent_lsf_options      => '-R"select[mem>2000] rusage[mem=2000]" -M2000000',
        highmem_lsf_options     => '-R"select[mem>15000] rusage[mem=15000]" -M15000000',
        long_lsf_options        => '-q long -R"select[mem>2000] rusage[mem=2000]" -M2000000',
        medium_lsf_options      => '-R"select[mem>4000] rusage[mem=4000]" -M4000000',

        # options controlling the number of workers used for the parallelisable analyses
        variant_qc_capacity     => 60,
        unmapped_var_capacity   => 10,

        # these flags control which parts of the pipeline are run
        run_check_dbSNP_import          => 1,
        run_create_seqdb                => 1,
        run_variant_qc                  => 1,
        run_unmapped_var                => 1,
        run_flip_population_genotype    => 1,
        run_update_population_genotype  => 1,
        run_PAR_check                   => 1,
        run_Pubmed_check                => 1,
        run_evidence_check              => 0,

        # put back support for re-runs on new format schema
        schema                  => 'old',

        # connection parameters for the hive database, you should supply the hive_db_password
        # option on the command line to init_pipeline.pl (parameters for the target database
        # should be set in the registry file defined above)

        # init_pipeline.pl will create the hive database on this machine, naming it
        # <username>_<pipeline_name>_<species>, and will drop any existing database
        # with this name
        hive_db_host    => 'ens-variation',
        hive_db_port    => 3306,
        hive_db_user    => 'ensadmin',

        pipeline_db => {
            -host   => $self->o('hive_db_host'),
            -port   => $self->o('hive_db_port'),
            -user   => $self->o('hive_db_user'),
            -pass   => $self->o('hive_db_password'),
            -dbname => $db_user.'_'.$self->o('pipeline_name') . '_' . $self->o('species'),
        },
    };
}


# pipeline_create_commands
#
# Returns an array reference of shell commands: a DROP DATABASE IF EXISTS for
# the hive database, then the commands inherited from the parent class, then
# an INSERT that stores the 'output_dir' option in the hive 'meta' table under
# the key 'hive_output_dir'.
sub pipeline_create_commands {
    my ($self) = @_;

    return [
        # the second argument of dbconn_2_mysql differs between the two calls
        # (0 here, 1 below) - presumably whether the database name is part of
        # the generated connection arguments; verify against the hive API
        'mysql '.$self->dbconn_2_mysql('pipeline_db', 0).q{-e 'DROP DATABASE IF EXISTS }.$self->o('pipeline_db', '-dbname').q{'},

        @{$self->SUPER::pipeline_create_commands},

        'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).q{-e 'INSERT INTO meta (meta_key, meta_value) VALUES ("hive_output_dir", "}.$self->o('output_dir').q{")'},
    ];
}


# resource_classes
#
# Returns a hash reference mapping each resource class name used by the
# analyses ('default', 'urgent', 'highmem', 'long', 'medium') to the LSF
# option string configured in default_options.
sub resource_classes {
    my ($self) = @_;

    return {
        'default' => { 'LSF' => $self->o('default_lsf_options') },
        'urgent'  => { 'LSF' => $self->o('urgent_lsf_options')  },
        'highmem' => { 'LSF' => $self->o('highmem_lsf_options') },
        'long'    => { 'LSF' => $self->o('long_lsf_options')    },
        'medium'  => { 'LSF' => $self->o('medium_lsf_options')  },
    };
}


# pipeline_analyses
#
# Returns an array reference of analysis definitions.  'init_run_variant_qc'
# is the only analysis seeded with an input id; all the others start with an
# empty -input_ids list and are named in its -flow_into map (branches 2 to 9).
# Ordering between the downstream analyses is expressed with -wait_for.
sub pipeline_analyses {
    my ($self) = @_;

    # parameters passed to every analysis
    my @common_params = (
        ensembl_registry => $self->o('reg_file'),
        species          => $self->o('species'),
        pipeline_dir     => $self->o('pipeline_dir'),
    );

    my @analyses;

    push @analyses, (

        {   -logic_name => 'init_run_variant_qc',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::InitVariantQC',
            -parameters => {
                qc_batch_size                  => $self->o('qc_batch_size'),
                unmapped_batch_size            => $self->o('unmapped_batch_size'),
                run_create_seqdb               => $self->o('run_create_seqdb'),
                run_check_dbSNP_import         => $self->o('run_check_dbSNP_import'),
                run_variant_qc                 => $self->o('run_variant_qc'),
                run_unmapped_var               => $self->o('run_unmapped_var'),
                run_flip_population_genotype   => $self->o('run_flip_population_genotype'),
                run_update_population_genotype => $self->o('run_update_population_genotype'),
                start_at_variation_id          => $self->o('start_at_variation_id'),
                create_working_tables          => $self->o('create_working_tables'),
                create_map_table               => $self->o('create_map_table'),
                @common_params,
            },
            -input_ids     => [{}],
            -hive_capacity => -1,
            -rc_name       => 'default',
            -flow_into     => {
                2 => [ 'check_dbSNP_import' ],
                3 => [ 'create_seqdb' ],
                4 => [ 'variant_qc' ],
                5 => [ 'unmapped_var' ],
                6 => [ 'flip_population_genotype' ],
                7 => [ 'update_population_genotype' ],
                8 => [ 'special_cases' ],
                9 => [ 'finish_variation_qc' ],
            },
        },

        {   -logic_name => 'check_dbSNP_import',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::CheckdbSNPImport',
            -parameters => {
                @common_params,
            },
            -input_ids     => [],
            -hive_capacity => -1,
            -rc_name       => 'default',
        },

        {   -logic_name => 'create_seqdb',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::CreateSeqDB',
            -parameters => {
                @common_params,
            },
            -input_ids     => [],
            -hive_capacity => 1,
            -rc_name       => 'medium',
        },

        {   -logic_name => 'unmapped_var',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::UnmappedVariant',
            -parameters => {
                batch_size => $self->o('unmapped_batch_size'),
                @common_params,
            },
            -input_ids       => [],
            -hive_capacity   => $self->o('unmapped_var_capacity'),
            # -analysis_capacity=>$self->o('unmapped_var_capacity'),
            -max_retry_count => 0,
            -rc_name         => 'default',
            -wait_for        => [ 'check_dbSNP_import' ],
            -flow_into       => {},
        },

        {   -logic_name => 'variant_qc',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::VariantQC',
            -parameters => {
                schema         => $self->o('schema'),
                batch_size     => $self->o('qc_batch_size'),
                use_seqdb      => $self->o('run_create_seqdb'),
                evidence_check => $self->o('run_evidence_check'),
                @common_params,
            },
            -input_ids       => [],
            -hive_capacity   => $self->o('variant_qc_capacity') , ## switch this off on hive upgrade
            # -analysis_capacity=> $self->o('variant_qc_capacity') ,
            -max_retry_count => 0,
            -rc_name         => 'default',
            -wait_for        => [ 'check_dbSNP_import', 'create_seqdb'],
            -flow_into       => {},
        },

        {   -logic_name => 'flip_population_genotype',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::FlipPopulationGenotype',
            -parameters => {
                @common_params,
            },
            -input_ids     => [],
            -hive_capacity => -1,
            -rc_name       => 'long',
            -wait_for      => [ 'variant_qc' ],
            -flow_into     => {},
        },

        {   -logic_name => 'update_population_genotype',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::UpdatePopulationGenotype',
            -parameters => {
                @common_params,
            },
            -input_ids     => [],
            -hive_capacity => -1,
            -rc_name       => 'default',
            -wait_for      => [ 'variant_qc', 'flip_population_genotype' ],
            -flow_into     => {},
        },

        {   -logic_name => 'special_cases',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::SpecialCase',
            -parameters => {
                run_PAR_check    => $self->o('run_PAR_check'),
                run_Pubmed_check => $self->o('run_Pubmed_check'),
                @common_params,
            },
            -input_ids     => [],
            -hive_capacity => -1,
            -rc_name       => 'default',
            -wait_for      => [ 'variant_qc'],
            -flow_into     => {},
        },

        {   -logic_name => 'finish_variation_qc',
            -module     => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::FinishVariantQC',
            -parameters => {
                @common_params,
            },
            -input_ids     => [],
            -hive_capacity => -1,
            -rc_name       => 'default',
            -wait_for      => [ 'variant_qc','unmapped_var','update_population_genotype','special_cases' ],
            -flow_into     => {},
        },

    );

    return \@analyses;
}

1;