package Bio::DB::GFF::Adaptor::biofetch; #$Id: biofetch.pm 10525 2006-09-26 22:03:22Z sendu $ =head1 NAME Bio::DB::GFF::Adaptor::biofetch -- Cache BioFetch objects in a Bio::DB::GFF database =head1 SYNOPSIS Proof of principle. Not for production use. =head1 DESCRIPTION This adaptor is a proof-of-principle. It is used to fetch BioFetch sequences into a Bio::DB::GFF database (currently uses a hard-coded EMBL database) as needed. This allows the Generic Genome Browser to be used as a Genbank/EMBL browser. =head1 AUTHOR Lincoln Stein Elstein@cshl.orgE. Copyright 2002 Cold Spring Harbor Laboratory. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut use strict; use Bio::DB::GFF::Util::Rearrange; # for rearrange() use Bio::DB::BioFetch; use Bio::SeqIO; use vars qw(%preferred_tags); # THIS IS WRONG: biofetch should delegate to an underlying # database adaptor, and not inherit from one. use base qw(Bio::DB::GFF::Adaptor::dbi::mysql); # priority for choosing names of CDS tags, higher is higher priority %preferred_tags = ( strain => 10, organism => 20, protein_id => 40, locus_tag => 50, locus => 60, gene => 70, standard_name => 80, ); =head2 new Title : new Usage : $db = Bio::DB::GFF->new(-adaptor=>'biofetch',@args) Function: create a new adaptor Returns : a Bio::DB::GFF object Args : -adaptor : required. Which adaptor to use; biofetch for mysql, biofetch_oracle for Oracle -preferred_tags : optional. A hash of {classname => weight,...} used to determine the class and name of the feature when a choice of possible feature classes is available (e.g. a feature has both a 'gene' and a 'locus' tag). Common defaults are provided that work well for eukaryotic features (but not well for viral/prokaryotic) see below for additional arguments. Status : Public This is the constructor for the adaptor. It is called automatically by Bio::DB::GFF-Enew. In addition to arguments that are common among all adaptors, the following class-specific arguments are recgonized: Argument Description -------- ----------- -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' -user username for authentication -pass the password for authentication -proxy [['http','ftp'],'http://proxy:8080'] -source source to use for loaded features ('EMBL') -dsn,-user and -pass indicate the local database to cache results in, and as are per Bio::DB::GFF::Adaptor::dbi. The -proxy argument allows you to set the biofetch web proxy, and uses the same syntax described for the proxy() method of L, except that the argument must be passed as an array reference. =cut sub new { my $class = shift; my $self = $class->SUPER::new(@_); my ($preferred,$proxy,$source) = rearrange(['PREFERRED_TAGS','PROXY','SOURCE'],@_); # if the caller sent their own preferences, then use these, otherwise use defaults. $self->_preferred_tags($preferred ? $preferred : \%preferred_tags); $self->_source($source || 'EMBL'); if ($proxy) { my @args = ref($proxy) ? @$proxy : eval $proxy; $self->{_proxy} = \@args if @args; } $self; } sub segment { my $self = shift; my @segments = $self->SUPER::segment(@_); if (!@segments) { my $refclass = $self->refclass; my %args = $self->setup_segment_args(@_); if ($args{-class} && $args{-class} =~ /$refclass/oi) { return unless $self->load_from_embl('embl'=>$args{-name}); @segments = $self->SUPER::segment(@_); } elsif ($args{-class} && $args{-class} =~ /refseq|swall|embl/i) { #hack to get refseq names return unless $self->load_from_embl(lc($args{-class})=>$args{-name}); $args{-class} = $self->refclass; @segments = $self->SUPER::segment(%args); } } $self->_multiple_return_args(@segments); } # default is to return 'Sequence' as the class of all references sub refclass { my $self = shift; my $refname = shift; 'Sequence'; } sub load_from_embl { my $self = shift; my $db = shift; my $acc = shift or $self->throw('Must provide an accession ID'); my $biofetch; if ($self->{_biofetch}{$db}) { $biofetch = $self->{_biofetch}{$db}; } else { $biofetch = $self->{_biofetch}{$db} = Bio::DB::BioFetch->new(-db=>$db); $biofetch->retrieval_type('tempfile'); $biofetch->proxy(@{$self->{_proxy}}) if $self->{_proxy}; } my $seq = eval {$biofetch->get_Seq_by_id($acc)} or return; $self->_load_embl($acc,$seq); 1; } sub load_from_file { my $self = shift; my $file = shift; my $format = $file =~ /\.(gb|genbank|gbk)$/i ? 'genbank' : 'embl'; my $seqio = Bio::SeqIO->new( '-format' => $format, -file => $file); my $seq = $seqio->next_seq; $self->_load_embl($seq->accession,$seq); 1; } sub _load_embl { my $self = shift; my $acc = shift; my $seq = shift; my $refclass = $self->refclass; my $locus = $seq->id; my $source = $self->_source; # begin loading $self->setup_load(); # first synthesize the entry for the top-level feature my @aliases; foreach ($seq->accession,$seq->get_secondary_accessions) { next if lc($_) eq lc($acc); push @aliases,[Alias => $_]; } $self->load_gff_line( { ref => $acc, class => $refclass, source => $source, # method => 'origin', method => 'region', start => 1, stop => $seq->length, score => undef, strand => '.', phase => '.', gclass => $self->refclass, gname => $acc, tstart => undef, tstop => undef, attributes => [[Note => $seq->desc],@aliases], } ); # now load each feature in turn my ($transcript_version,$mRNA_version) = (0,0); for my $feat ($seq->all_SeqFeatures) { my $attributes = $self->get_attributes($feat); my $name = $self->guess_name($attributes); my $location = $feat->location; my @segments = map {[$_->start,$_->end,$_->seq_id]} $location->can('sub_Location') ? $location->sub_Location : $location; # this changed CDS to coding, but that is the wrong thing to do, since # CDS is in SOFA and coding is not # my $type = $feat->primary_tag eq 'CDS' ? 'coding' # : $feat->primary_tag; my $type= $feat->primary_tag; next if (lc($type) eq 'contig'); # next if (lc($type) eq 'variation'); if (lc($type) eq 'variation' and $feat->length == 1) { $type = 'SNP'; } elsif (lc($type) eq 'variation' ) { $type = 'chromosome_variation'; } if ($type eq 'source') { $type = 'region'; } if ($type =~ /misc.*RNA/i) { $type = 'RNA'; } if ($type eq 'misc_feature' and $name->[1] =~ /similar/i) { $type = 'computed_feature_by_similarity'; } elsif ($type eq 'misc_feature') { warn "skipping a misc_feature\n"; next; } my $parttype = $feat->primary_tag eq 'mRNA' ? 'exon' : $feat->primary_tag; if ($type eq 'gene') { $transcript_version = 0; $mRNA_version = 0; } elsif ($type eq 'mRNA') { $name->[1] = sprintf("%s.t%02d",$name->[1],++$transcript_version); } elsif ($type eq 'CDS') { $name->[0] = 'mRNA'; $name->[1] = sprintf("%s.t%02d",$name->[1],$transcript_version); } my $strand = $feat->strand; my $str = defined $strand ? ($strand > 0 ? '+' : '-') : '.'; $self->load_gff_line( { ref => $acc, class => $refclass, source => $source, method => $type, start => $location->start, stop => $location->end, score => $feat->score || undef, strand => $str, phase => $feat->frame || '.', gclass => $name->[0], gname => $name->[1], tstart => undef, tstop => undef, attributes => $attributes, } ) if ($type && ($type ne 'CDS'||($type eq 'CDS'&&@segments==1) ) ); @$attributes = (); next if @segments == 1; for my $segment (@segments) { my $strand = $feat->strand; my $str = defined $strand ? ($strand > 0 ? '+' : '-') : '.'; $self->load_gff_line( { ref => $segment->[2] eq $locus ? $acc : $segment->[2], class => $refclass, source => $source, method => $parttype, start => $segment->[0], stop => $segment->[1], score => $feat->score || undef, strand => $str, phase => $feat->frame || '.', gclass => $name->[0], gname => $name->[1], tstart => undef, tstop => undef, attributes => $attributes, } ); } } # finish loading $self->finish_load(); # now load the DNA $self->load_sequence_string($acc,$seq->seq); 1; } sub get_attributes { my $self = shift; my $seq = shift; my @tags = $seq->all_tags or return; my @result; foreach my $tag (@tags) { foreach my $value ($seq->each_tag_value($tag)) { push @result,[$tag=>$value]; } } \@result; } sub guess_name { my $self = shift; my $attributes = shift; # remove this fix when Lincoln fixes it properly return ["Misc" => "Misc"] unless ($attributes); # these are arbitrary, and possibly destructive defaults my @ordered_attributes = sort {($self->_preferred_tags->{$a->[0]} || 0) <=> ($self->_preferred_tags->{$b->[0]} || 0)} @$attributes; my $best = pop @ordered_attributes; @$attributes = @ordered_attributes; return $best; } sub _preferred_tags { my $self = shift; $self->{preferred_tags} = shift if @_; return $self->{preferred_tags}; } sub _source { my $self = shift; $self->{source} = shift if @_; $self->{source}; } 1;