#
# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::InputSet
#

=head1 LICENSE

  Copyright (c) 1999-2013 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Funcgen::Parsers::InputSet

=head1 SYNOPSIS

  use vars qw(@ISA);
  @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);

=head1 DESCRIPTION

This is a base class to support simple file format parsers. For simple
imports the vendor is set to the parser type, i.e. the file format. The
generic read_and_import_simple_data method assumes a one-line-per-feature
format; other formats need their own read_and_import_format_data method,
which will need defining in the result_data config element. Features are
stored either as ResultFeature collections or AnnotatedFeatures, depending
on the input feature class.

=cut
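
=head1 EXAMPLE

The following is an illustrative sketch only, not a shipped parser. A minimal
one-line-per-feature format parser would subclass this module and implement a
parse_line method, which read_and_import_data calls for each input line. The
field layout, the feature parameters and the cache_slice helper used here are
assumptions; see the format parsers (e.g. Bed) for working implementations.

  package Bio::EnsEMBL::Funcgen::Parsers::MyFormat;
  use base qw( Bio::EnsEMBL::Funcgen::Parsers::InputSet );

  sub parse_line{
    my ($self, $line, $prepare) = @_;

    #Hypothetical tab delimited format: chr start end strand
    my ($chr, $start, $end, $strand) = split /\t/, $line;
    return 1 if $prepare; #prepare mode only counts/sorts lines

    #Populate the feature params cache consumed by load_feature_and_xrefs.
    #Other mandatory constructor params (e.g. -FEATURE_SET, -FEATURE_TYPE)
    #are omitted here for brevity.
    $self->{'_feature_params'} =
      {
       -SLICE  => $self->cache_slice($chr),
       -START  => $start,
       -END    => $end,
       -STRAND => $self->set_strand($strand),
      };

    $self->load_feature_and_xrefs;
    return 1; #line was parsed successfully
  }

=cut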
# To do
# Add Parsers for BAM/SAM
# Rename to InputSet
# Handle mysqlimport for large data sets e.g. reads
# Incorporate collection code
# Implement matrix storage

package Bio::EnsEMBL::Funcgen::Parsers::InputSet;

use strict;

use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
use Bio::EnsEMBL::Funcgen::SegmentationFeature;
use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw( species_chr_num open_file is_gzipped );
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
#use Bio::EnsEMBL::Funcgen::Utils::Helper;

#config stuff, move to BaseImporter?
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Funcgen::FeatureType;

use base qw( Bio::EnsEMBL::Funcgen::Parsers::BaseImporter ); #@ISA change to parent with perl 5.10

my %valid_types = (
                   result       => undef,
                   annotated    => undef,
                   segmentation => undef,
                  );

#todo change input_set_name to name

=head2 new

  Example    : my $self = $class->SUPER::new(@_);
  Description: Constructor method for the InputSet parser class
  Returntype : Bio::EnsEMBL::Funcgen::Parsers::InputSet
  Exceptions : Throws if caller is not an Importer
  Caller     : Bio::EnsEMBL::Funcgen::Parsers::Simple
  Status     : At risk

=cut

sub new{
  my $caller = shift;

  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new(@_);

  ($self->{'input_feature_class'},
   $self->{total_features},
   $self->{force}, #Is this generic enough to go in Importer? Used by store_window_bins_by_Slice_Parser
   $self->{dbfile_data_root}, #only appropriate for result input_feature_class
   $self->{merged_replicates},
   $self->{output_set}, #Feature/ResultSet
  ) = rearrange(['input_feature_class', 'total_features', 'force',
                 'dbfile_data_root', 'merged_replicates', 'output_set'], @_);

  #Could potentially take fields params directly to define a custom format.
  #Take direct field mappings, plus special fields which need parsing differently,
  #i.e. default is tab delimited, and GFF would define the Attrs field as a compound
  #field and provide special parsing and field mapping.

  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly")
    if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));

  $self->{'config'} =
    {(
      #can we omit these?
      array_data     => [], #['experiment']
      probe_data     => [], #["probe"]
      norm_method    => undef,
      #protocols     => {()},
      'results_data' => ["and_import"],
     )};

  #set up feature params
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  return $self;
}

sub output_file{
  my ($self, $output_file) = @_;

  $self->{output_file} = $output_file if $output_file;
  return $self->{output_file};
}

sub output_set{ return $_[0]->{output_set}; }

sub input_file{
  my ($self, $input_file) = @_;

  $self->{'input_file'} = $input_file if $input_file;
  return $self->{'input_file'};
}

=head2 set_config

  Example    : $self->set_config;
  Description: Sets attribute dependent config
  Returntype : None
  Exceptions : None
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : At risk

=cut

#Set config assumes we are importing an InputSet, rather than providing
#an existing ResultSet!
#We shouldn't be dependent on params for config until after we have called new?
#This is not even an InputSet import really. It's a Collection/ResultSet import,
#depending on how you look at it.
#todo refine/simplify this once we separate the Imports/parsers
#This needs to move to new anyway
#input_feature_class refers to output not input!

sub set_config{
  my $self = shift;

  #Move all this to new when we fix the inheritance in Importer

  #We could set input_set_name to experiment name,
  #but we would have to make the warning in define_and_validate_sets mention -input_set_name

  if(! defined $self->{output_set}){

    throw('Must provide an -name for a '.uc($self->vendor).' import')
      if ! defined $self->name();

    #Mandatory checks
    if(! defined $self->feature_analysis){
      throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports');
    }
  }

  #some convenience methods
  $self->{dbentry_adaptor}   = $self->db->get_DBEntryAdaptor;
  $self->{input_set_adaptor} = $self->db->get_InputSetAdaptor;

  my $fclass_name = $self->{input_set_adaptor}->build_feature_class_name($self->input_feature_class);
  $self->{'feature_class'} = 'Bio::EnsEMBL::Funcgen::'.$fclass_name;

  #We need to undef norm method as it has been set to the env var
  $self->{config}{norm_method} = undef;

  #dirs are not set in config to enable generic get_dir method access
  $self->{dbfile_data_root} ||= $self->get_dir('output'); #Required for Collector

  my $adaptor_method = 'get_'.$fclass_name.'Adaptor';

  #cannot validate here using 'can' as it doesn't appear to work for autoloaded methods :(
  eval { $self->{feature_adaptor} = $self->db->$adaptor_method; };

  if($@){
    throw("Failed to $adaptor_method, ".$self->input_feature_class.' is not a valid feature class');
  }

  #Validate slices
  $self->slices($self->{'slices'}) if defined $self->{'slices'};

  #Move to new when we sort out inheritance
  $self->validate_and_store_config([$self->name]);
  #Could use input_set_name here?
  #This was to support >1 input set per experiment (name)

  #This currently breaks for no config imports,
  #i.e. standard Bed import e.g. result_feature collections.
  #Segmentation imports use Bed and config.
  #Allow no config imports in BaseImporter?
  #Or ultimately set the params as part of the user_config?

  return;
}
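#For illustration only (assuming build_feature_class_name simply expands the
#feature class to its CamelCase form): an -input_feature_class of 'annotated'
#should yield
#  $self->{'feature_class'} eq 'Bio::EnsEMBL::Funcgen::AnnotatedFeature'
#and an $adaptor_method of 'get_AnnotatedFeatureAdaptor', fetched via the
#autoloaded DBAdaptor methods above.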
#Some of this can probably move to define_and_validate_sets?

sub define_sets{
  my ($self, $dset) = @_;

  my ($output_set, $eset);

  warn("define_sets needs updating to use the new set definition methods in Helper.\n".
       "i.e. needs to mirror DefineOutputSet::run, then point that method here\n");

  $eset = $self->db->get_InputSetAdaptor->fetch_by_name($self->name);

  if(! defined $eset){
    #temporary fix until we implement input_subset_file replicate/control config
    #and merge validate_files in here
    #do not currently have non merged replicate import
    #need to define this param in all the run scripts
    my $replicate;
    $replicate = 0 if $self->merged_replicates;

    $eset = Bio::EnsEMBL::Funcgen::InputSet->new
      (
       -name          => $self->name(),
       -experiment    => $self->experiment(),
       -feature_type  => $self->feature_type(),
       -cell_type     => $self->cell_type(),
       -vendor        => $self->vendor(),
       -format        => $self->format(),
       -analysis      => $self->feature_analysis,
       -feature_class => $self->input_feature_class,
       -replicate     => $replicate,
      );

    ($eset) = @{$self->db->get_InputSetAdaptor->store($eset)};
  }

  $dset = $self->define_and_validate_sets
    (
     -dbadaptor     => $self->db,
     -name          => $self->name, #or input_set_name?
     -feature_type  => $self->feature_type,
     -cell_type     => $self->cell_type,
     -analysis      => $self->feature_analysis,
     -feature_class => $self->input_feature_class,
     -description   => $self->feature_set_description,
     #-append       => 1, #Omit append to ensure we only have this eset
     -recovery      => $self->recovery,
     -supporting_sets => [$eset],
     -slices        => $self->slices,
     #Can't set rollback here, as we don't know until we've validated the files.
     #Can't validate the files until we have the sets.
     #So we're doing this manually in validate_files.
    );

  #We are now using IMPORTED to define whether a FeatureSet has been imported successfully.
  #However we already have IMPORTED on the InputSubset.
  #We should add it to FeatureSet to remain consistent.
  #See Helper::define_and_validate_sets for more notes on
  #potential problems with the FeatureSet IMPORTED status.

  $self->{'_data_set'} = $dset;

  if ($self->input_feature_class eq 'result' ||
      $self->input_feature_class eq 'dna_methylation') {
    $output_set = $dset->get_supporting_sets->[0];
    $eset       = $output_set->get_InputSets->[0];
    $self->result_set($output_set); #required for ResultFeature Collector and Bed Parser
  }
  else { #annotated/segmentation
    $output_set = $dset->product_FeatureSet;
    $eset       = $dset->get_supporting_sets->[0];
  }

  return ($dset, $output_set, $eset);
}
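#Illustrative usage, as called from read_and_import_data below:
#  my ($dset, $output_set, $eset) = $self->define_sets();
#For 'result'/'dna_methylation' imports $output_set is the supporting ResultSet;
#otherwise it is the product FeatureSet. $eset is the supporting InputSet.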
#We have rollback functionality incorporated here.
#Why is this not integrated into define_sets?
#They are inherently linked, and for an InputSet import
#we always want to validate files.
#No way of defining replicate/control config just yet.
#This should be changed to support both tracking DB and direct import.

#To correctly define the sets, we need access to the files and
#the rep/control config.
#We can do this implicitly by simply letting the order define the rep number
#  -result_files  rep1 rep2 rep2
#  -control_files rep1 rep2
#-merged_replicates would over-ride the above and trigger a check for >1 file.
#Let's implement -merged_replicates for now and come back to this order based config.

#rename -result_files to -input_subset_files?
#Will this support the collection import?
#Need to be mindful about the true input files and what the pipeline will actually use,
#as it may be some merged intermediate, i.e. a sam/bed file rather than a fastq.

#should we also store the dbfile_registry data here?
#this may be done before generating the actual files

#This overlaps with the new Set definition methods.
#Need to use the new methods which handle rollback.

sub validate_files{
  my ($self, $prepare, $eset, $output_set) = @_;

  #Get files from input directory
  if (! $self->input_files) {
    #todo genericise this input directory? i.e. set a defined dir, and do not concat name here
    my $list   = "ls ".$self->get_dir('input').'/'.$self->name().'*.';
    my @rfiles = `$list`;
    $self->input_files(\@rfiles);
  }

  #Here we are tracking the import of individual files by adding them as InputSubsets.
  #Recovery would never know what to delete,
  #so we would need to delete all. Hence no point in setting status?
  #We do not rollback IMPORTED data here. This is done via separate scripts,
  #to reduce the risk of accidentally deleting/overwriting data by leaving a stray
  #-rollback flag in the run script.

  ### VALIDATE FILES ###
  #We need to validate all the files first, so the import doesn't fall over halfway through,
  #or if we come across a rollback halfway through.

  my (%new_data);

  if ((scalar(@{$self->slices}) > 1) && ! $prepare) {
    throw('Validate files does not yet support multiple Slice rollback');
  }

  #IMPORTED status here may prevent
  #further slice based imports,
  #so we have to wait to set this until we know all the slices
  #are loaded, unless we store slice based IMPORTED states.
  #We currently get around this by never setting IMPORTED for slice based jobs
  #and always rolling back by slice before import.

  #This loop supports multiple files
  my (@rollback_sets, %file_paths);
  my $auto_rollback = ($self->rollback) ? 0 : 1;

  #set the max seen replicate value
  my $replicate = 0;

  foreach my $iss (@{$eset->get_InputSubsets}) {

    if ($iss->replicate > $replicate) {
      $replicate = $iss->replicate;
    }
  }

  foreach my $filepath (@{$self->input_files}) {
    my ($filename, $sub_set);
    chomp $filepath;
    ($filename = $filepath) =~ s/.*\///;
    $file_paths{$filename} = $filepath;
    #reset filename to that originally used to create the InputSubsets
    $filename =~ s/^prepared\.// if $self->prepared;

    $self->log('Validating '.$self->vendor." file:\t$filename");
    throw("Cannot find ".$self->vendor." file:\t$filepath") if(! -e $filepath); #Can deal with links

    if ($sub_set = $eset->get_subset_by_name($filename)) {
      #IMPORTED status here is just for the file.
      #Any changes to analysis or coord_system should result in a different InputSubset(file).
      #Will only ever be imported into one Feature|ResultSet.

      #Currently conflating recover_unimported and rollback,
      #as they serve the same purpose until we implement InputSubset level recovery.

      if ($sub_set->has_status('IMPORTED')) {
        $new_data{$filepath} = 0;
        $self->log("Found previously IMPORTED InputSubset:\t$filename");
      }
      else {
        $self->log("Found existing InputSubset without IMPORTED status:\t$filename");
        push @rollback_sets, $sub_set;
      }
    }
    else {
      $replicate++;
      $self->log("Found new InputSubset:\t${filename}");
      throw("Should not have found new 'prepared' file:\t$filename") if $self->prepared;
      $new_data{$filepath} = 1;
      $replicate = 0 if $self->merged_replicates;

      #need to add is_control support here too.
      #tidy this up, force full creation here and remove convenience code from InputSet?
      #where else is it used?

      $sub_set = Bio::EnsEMBL::Funcgen::InputSubset->new
        (
         -name       => $filename,
         -replicate  => $replicate,
         -input_set  => $eset,
         #should also handle these here
         -is_control => 0, #for result files; need to handle control files separately
         #to be defined
         #-display_url => $display_url,
         #-archive_id  => $archive_id,
        );

      $self->input_set_adaptor->store_InputSubsets([$sub_set]);
    }
  }
"You must specify -recover or -rollback to perform a full rollback"); if ($self->recovery) { #Change these to logger->warn $self->log("WARNING::\tCannot yet rollback for just an InputSubset, rolling back entire set? Unless slices defined"); $self->log("WARNING::\tThis may be deleting previously imported data which you are not re-importing..list?!!!\n"); } } if ($self->rollback) { #Check we have all existing InputSubsets files before we do full rollback #Can probably remove this if we support InputSubset(file/slice) level rollback $self->log('Rolling back all InputSubsets'); @rollback_sets = @{$eset->get_InputSubsets}; foreach my $isset (@rollback_sets) { if (! exists $file_paths{$isset->name}) { throw("You are attempting a multiple InputSubset rollback without specifying an existing InputSubset:\t".$isset->name. "\nAborting rollback as data will be lost. Please specifying all existing InputSubset file names"); } } } #This is now all handled in the define sets methods #todo remove all this in favour of using new define sets methods in helper #is there anyway we can bypass this for now if we have passed the outputset? #can we sub out part of this to identify those which are new and those which need rolling back? foreach my $esset (@rollback_sets) { #This needs to be mapped to the specified filepaths my $fp_key = $esset->name; $fp_key = 'prepared.'.$fp_key if $self->prepared; $new_data{$file_paths{$fp_key}} = 1; $self->log("Revoking states for InputSubset:\t\t\t".$esset->name); $eset->adaptor->revoke_states($esset); if (! $prepare) { #This was to avoid redundant rollback in prepare step $self->log("Rolling back InputSubset:\t\t\t\t".$esset->name); if ($self->input_feature_class eq 'result' || $self->input_feature_class eq 'dna_methylation') { #Can we do this by slice for parallelisation? #This will only ever be a single ResultSet due to Helper::define_and_validate_sets $self->rollback_ResultSet($output_set, undef, $self->slices->[0]); #Do no have rollback_InputSet here as we may have parallel Slice based imports running } else { #annotated/segmentation $self->rollback_FeatureSet($output_set, undef, $self->slices->[0]); $self->rollback_InputSet($eset); last; } } } return \%new_data; } sub set_feature_separator{ my ($self, $separator) = @_; #How do we test if something undefined was passed? #Rather than nothing passed at all? #Can't do this as this is the accessor #Need to split method throw('Must provide a valid feature separator') if ( (! 
sub set_feature_separator{
  my ($self, $separator) = @_;

  #How do we test if something undefined was passed,
  #rather than nothing passed at all?
  #Can't do this as this is the accessor.
  #Need to split the method.

  throw('Must provide a valid feature separator')
    if ((! defined $separator) || ($separator eq ''));

  $self->{'_feature_separator'} = $separator;
}

### SIMPLE ACCESSORS ###
# Some of these can be called for each record.
# Trim the access time as much as possible.

#This is to support input_set/input_subset.replicate.
#Not appropriate for array experiment imports.
sub merged_replicates{   return $_[0]->{merged_replicates};   }

sub input_feature_class{ return $_[0]->{input_feature_class}; }

#sub input_set_name{    return $_[0]->{input_set_name};      } #Set in new

sub feature_adaptor{     return $_[0]->{feature_adaptor};     }
sub dbentry_adaptor{     return $_[0]->{dbentry_adaptor};     }
sub input_set_adaptor{   return $_[0]->{input_set_adaptor};   }
sub set{                 return $_[0]->{set};                 } #Feature or Result, set in define_sets
sub data_set{            return $_[0]->{_data_set};           }

sub feature_separator{   return $_[0]->{_feature_separator};  }
sub feature_params{      return $_[0]->{_feature_params};     }
sub dbentry_params{      return $_[0]->{_dbentry_params};     }
sub input_gzipped{       return $_[0]->{input_gzipped};       }

sub input_file_operator{
  my ($self, $op) = @_;

  #Should be set in the format parser
  $self->{'input_file_operator'} = $op if defined $op;
  return $self->{'input_file_operator'};
}

# prepare boolean, simply stores the sets and preprocesses the file
#Still need to implement prepare in the other Parsers!!
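#Illustrative call sequence only (in practice the runner scripts and batch
#jobs drive this, and prepare mode is currently Bed only, see below):
#  $parser->read_and_import_data(1); #prepare: define/validate sets, preprocess and sort the file
#  $parser->read_and_import_data();  #import: parse each line and store the features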
sub read_and_import_data{
  my ($self, $prepare) = @_;

  my $action = ($prepare) ? 'preparing' : 'importing';

  $self->log("Reading and $action ".$self->vendor()." data");

  my ($filename, $fh, $f_out, %feature_params, @lines);

  if($prepare && ! $self->isa('Bio::EnsEMBL::Funcgen::Parsers::Bed')){
    throw('prepare mode is only currently implemented for the Bed parser');
  }

  #Test for conflicting run modes
  if($prepare && ($self->batch_job || $self->prepared)){
    #prepare should be called once by the runner, not in each batch_job
    #don't prepare if already prepared
    throw('You cannot run read_and_import_data in prepare mode with a -batch_job or -prepared job');
  }

  my ($eset);
  my $output_set = $self->output_set;

  #todo integrate all this into define_sets?

  if(! defined $output_set){
    (undef, $output_set, $eset) = $self->define_sets();
  }
  elsif($output_set->isa('Bio::EnsEMBL::Funcgen::ResultSet')){
    $eset = $output_set->get_support->[0];
    $self->result_set($output_set); #required for ResultFeature Collector and Bed Parser
  }
  elsif($output_set->isa('Bio::EnsEMBL::Funcgen::FeatureSet')){
    $eset = $output_set->get_DataSet->get_supporting_sets->[0];
  }
  else{
    throw("Unsupported output set class specified:\t".$output_set);
  }

  #We also need to account for bsub'd slice based import.
  #If we can do these the other way around we can get define_sets to rollback the FeatureSet.
  #Cyclical dependency for the sets :|

  #validate_files requires that the data_set is defined
  my $new_data      = $self->validate_files($prepare, $eset, $output_set);
  my $seen_new_data = 0;

  ### READ AND IMPORT FILES ###

  foreach my $filepath (@{$self->input_files()}) {
    chomp $filepath;
    ($filename = $filepath) =~ s/.*\///;

    $self->input_file($filepath);
    #This is only used by the Collector::ResultFeature::reinitialise_input method

    if($new_data->{$filepath}){ #This will currently autovivify!
      $seen_new_data = 1;
      $self->{'input_gzipped'} = &is_gzipped($filepath);

      $filepath = $self->pre_process_file($filepath, $prepare) if $self->can('pre_process_file');

      $self->log_header(ucfirst($action).' '.$self->vendor." file:\t".$filepath);

      #We need to be able to optionally open a pipe to gzip | sort here,
      #i.e. define the open command
      $fh = open_file($filepath, $self->input_file_operator);

      #This may become way too large for some reads files.
      #Currently no problems.
      #This is not working as we are sorting the file!
      #$self->parse_header($fh) if $self->can('parse_header');

      #For result features sometimes we want to run
      #locally and just sort without dumping,
      #i.e. if we are not a batch job,
      #as there is no need to dump if it is a single process

      if((($self->input_feature_class eq 'result' ||
           $self->input_feature_class eq 'dna_methylation') && ! $prepare)){
        #Use the ResultFeature Collector here
        #Omitting the 0 wsize
        #How are we going to omit 0 wsize when doing the fetch?
        #simply check table name in ResultSet?

        #Should we do this for multiple chrs?
        #or fail here
        #we need to pass self
        #for access to get_Features_by_Slice,
        #which should be in the specific parser e.g. Bed

        #Will this not clash with standard ResultFeature::get_ResultFeatures_by_Slice?
        #Could really do with separating the pure file parsers from the importer code,
        #so these can be reused by other code. Then simply use the Bed import parser for
        #specific import functions and as a wrapper to the Bed file parser.
        #So should really have
        #Parsers::File::Bed
        #and
        #Parsers::Import::Bed
        #This can probably wait until we update BioPerl and just grab the Bed parser from there?

        my $slices = $self->slices;

        #Should this be caught in new?
        if(! @$slices){
          throw("You must define a slice to generate ResultFeature Collections from InputSet:\t".$eset->name);
        }

        if(scalar(@$slices) > 1){
          throw("InputSet parser does not yet support multi-Slice import for ResultFeature collections\n".
                "Please submit these to the farm as single slice jobs");
        }

        #restrict to just 1 slice as we don't yet support disk seeking
        #if the slices are not in the same order as they appear in the file.
        #Also we want to parallelise this.

        #Set as attr for parse_Features_by_Slice in format specific Parsers
        $self->file_handle(open_file($filepath, $self->input_file_operator));

        foreach my $slice (@$slices){
          $self->feature_adaptor->store_window_bins_by_Slice_Parser
            ($slice, $self,
             (
              #Force needs reimplementing here?
              -force            => $self->{force},
              -dbfile_data_root => $self->{dbfile_data_root},
             ));
        }

        warn "Need to update InputSubset status to IMPORTED after all slices have been loaded";
        #Do we even need to set RESULT_FEATURE_SET for input_set ResultFeatures?

        warn "Closing $filename\nDisregard the following 'Broken pipe' warning";

        #Closing the read end of a pipe before the process writing to it at the other end
        #is done writing results in the writer receiving a SIGPIPE. If the other end can't
        #handle that, be sure to read all the data before closing the pipe.
        #This suggests the gzip pipe has not finished reading, but it *should* be at the end of the file?

        #$SIG{PIPE} = 'IGNORE'; #Catch with subref and warn instead?
        #Or maybe an eval will catch the STDERR better?
        #sub catch_SIGPIPE{
        #  my $sig = shift @_;
        #  print " Caught SIGPIPE: $sig $1 \n";
        #  return;
        #}
        #$SIG{PIPE} = \&catch_SIGPIPE;
        #my $retval = eval { no warnings 'all'; $fh->close };
        #if($@){
        #  warn "after eval with error $@\nretval is $retval";
        #}
        #Neither of these catch gzip: stdout: Broken pipe
        #IO::UnCompress::Gunzip?

        eval { close($fh); };
        if($@){
          die("Found errors when closing file\n$@");
        }
      }
      else{
        #Revoke FeatureSet IMPORTED state here in case we fail halfway through
        $output_set->adaptor->revoke_status('IMPORTED', $output_set)
          if ($output_set->has_status('IMPORTED') && (! $prepare));

        #What about IMPORTED_"CSVERSION"?
        #This may leave us with an incomplete import which still has
        #an IMPORTED_CSVERSION state.
        #We need to depend on IMPORTED for completeness of set.
        #DAS currently only uses IMPORTED_CSVERSION.
        #This is okay-ish, but we also need to write HCs for any sets
        #which do not have the IMPORTED state!

        my ($line, @outlines, $out_fh);

        if($prepare && ! $self->batch_job){
          #Assume we want gzipped output.
          #filename is actually based on input, so may not have gz in the file name
          $out_fh = open_file($self->output_file, "| gzip -c > %s");
        }

        while(defined ($line = <$fh>)){
          #Generic line processing
          #Move these to parse_line?
          $line =~ s/\r*\n//o;
          next if $line =~ /^\#/;
          next if $line =~ /^$/;

          #This has now been simplified to the process_line method
          #my @fields = split/\t/o, $line;
          #start building parameters hash
          #foreach my $field_index(@{$self->get_field_indices}){
          #  my $field = $self->get_field_by_index($field_index);
          #  $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]);
          #}

          #We also need to enable different parse_line methods if we have different file
          #formats e.g. cisRED
          #Code refs?

          if($self->parse_line($line, $prepare)){
            $self->count('total parsed lines');

            #Cache or print to sorted file
            if($prepare && ! $self->batch_job){

              if(scalar(@outlines) > 1000){
                print $out_fh join("\n", @outlines)."\n";
                @outlines = ();
              }
              else{
                push @outlines, $line;
              }
            }
          }
        }

        eval { close($fh); };
        if($@){
          die("Found errors when closing file\n$@");
        }

        #Print last of sorted file
        if($prepare && ! $self->batch_job){
          print $out_fh join("\n", @outlines)."\n";

          eval { close($out_fh); };
          if($@){
            die("Found errors when closing file\n$@");
          }

          @outlines = ();
        }

        if(! $prepare){
          #Now we need to deal with anything left in the read cache
          $self->process_params if $self->can('process_params');

          #To speed things up we may need to also do file based import here with WRITE lock?
          #mysqlimport will write lock the table by default?

          #reset filename to that originally used to create the InputSubsets
          $filename =~ s/^prepared\.// if $self->prepared;

          my $sub_set = $eset->get_subset_by_name($filename);
          $sub_set->adaptor->store_status('IMPORTED', $sub_set) if ! $self->batch_job;
        }
      }

      if($prepare){
        $self->log("Finished preparing import from:\t$filepath");
      }
      else{
        #Need to tweak this for slice based import
        $self->log('Finished importing '.$self->counts('features').' '.
                   $output_set->name." features from:\t$filepath");
      }

      #This currently fails here if the uncaught file sort was not successful
      foreach my $key (keys %{$self->counts}){
        $self->log("Count $key:\t".$self->counts($key));
      }
    }
  }

  #Here we should set IMPORTED on the FeatureSet.
  #We could also log the first dbID of each feature in a subset to facilitate subset rollback
  #in the feature table. This would be sketchy at best:
  #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id

  #This may already have IMPORTED status, as we don't revoke the status whilst
  #updating to protect the feature set due to the lack of supporting set tracking.
  #See Helper::define_and_validate_sets for more notes.
  #Is there any point in setting it if we don't revoke it?
  #To allow consistent status handling across sets. Just need to be aware of the fset status caveat.
  #This also currently happens with ResultFeatures loaded by slice jobs, as the status may
  #already have been set by a parallel job.

  if(! $prepare){
    $output_set->adaptor->set_imported_states_by_Set($output_set)
      if $seen_new_data && ! $self->batch_job;

    $self->log("No new data, skipping result parse")
      if ! grep /^1$/o, values %{$new_data};

    $self->log("Finished parsing and importing results");
  }

  return;
}

#Should be called from a format parser e.g. BED, GFF, eQTL etc.
#Why don't we pass feature_params and dbentry_params directly?
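#Illustrative caller-side setup only; the field values here are hypothetical
#and the authoritative parameter lists live in the feature class and
#Bio::EnsEMBL::DBEntry constructors:
#  $self->{'_feature_params'} =
#    {-SLICE        => $slice, -START       => $start,
#     -END          => $end,   -STRAND      => $strand,
#     -FEATURE_TYPE => $ftype, -FEATURE_SET => $fset};
#  push @{$self->{'_dbentry_params'}},
#    {-PRIMARY_ID  => $xref_id, -DBNAME => $external_db,
#     feature_type => 'AnnotatedFeature'}; #ensembl object type passed to DBEntryAdaptor::store
#  my $feature = $self->load_feature_and_xrefs;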
sub load_feature_and_xrefs{
  my $self = shift;

  #warn "Loading ".($self->{_counts}{features}+1).' '.$self->feature_params->{-FEATURE_TYPE}->name."\n";
  #This now only fails once on the first run and then
  #Need to count based on feature_type?

  #new rather than new_fast here as we want to validate the import
  my $feature = $self->{feature_class}->new(%{$self->feature_params});
  ($feature)  = @{$self->feature_adaptor->store($feature)};
  $self->count('features');
  #Add count based on FeatureType? Should be ftype name and analysis to reflect the unique ftype key?

  ##This needs to be handled in the caller as we are validating loci?
  #if($self->ucsc_coords){
  #  $start += 1;
  #  $end   += 1;
  #}

  #This needs to be put in a separate sub and called by the caller
  #if(! $self->cache_slice($chr)){
  #  warn "Skipping AnnotatedFeature import, found non-standard chromosome: $chr";
  #}
  #else{
  #
  #  #grab seq if dump fasta and available
  #  my $seq;
  #
  #  if(exists $self->feature_params->{'sequence'}){
  #    $seq = $self->feature_params->{'sequence'};
  #    delete $self->feature_params->{'sequence'};
  #  }
  #  else{
  #    $self->log('No fasta sequence available for '.$self->feature_params->display_label);
  #  }
  #}

  #dump fasta here
  #if($self->dump_fasta){
  #  $self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n";
  #}

  #Store the xrefs
  foreach my $dbentry_hash (@{$self->{'_dbentry_params'}}){
    my $ftype = $dbentry_hash->{feature_type};
    delete $dbentry_hash->{feature_type};

    my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash});
    $self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1); #1 is the ignore release flag
    #count here? no, count in caller
  }

  #Clean data cache
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  return $feature;
}

#This should really be handled in Bio::EnsEMBL::Feature?
#Move to Helper?

sub set_strand{
  my ($self, $strand) = @_;

  my $ens_strand = 0;

  my %strand_vals = (
                     '1'  => 1,
                     '0'  => 0,
                     '-1' => -1,
                     '+'  => 1,
                     '-'  => -1,
                     '.'  => 0,
                    );

  if($strand){

    if(exists $strand_vals{$strand}){
      $ens_strand = $strand_vals{$strand};
    }
    else{
      throw("Could not identify strand value for $strand");
    }
  }

  return $ens_strand;
}

sub total_features{
  my ($self, $total) = @_;

  $self->{'total_features'} = $total if defined $total;
  return $self->{'total_features'};
}

#Currently only required for Bed::parse_Features_by_Slice

#filehandle
sub file_handle{
  my ($self, $fh) = @_;

  $self->{'file_handle'} = $fh if defined $fh;
  return $self->{'file_handle'};
}

sub result_set{
  my ($self, $rset) = @_;

  #already tested/created by self
  $self->{'result_set'} = $rset if $rset;
  return $self->{'result_set'};
}

1;