# $Id: HIV.pm 232 2008-12-11 14:51:51Z maj $
#
# BioPerl module for Bio::DB::HIV
#
# Please direct questions and support issues to <bioperl-l@bioperl.org>
#
# Cared for by Mark A. Jensen <maj@fortinbras.us>
#
# Copyright Mark A. Jensen
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::DB::HIV - Database object interface to the Los Alamos HIV Sequence Database

=head1 SYNOPSIS

    $db = new Bio::DB::HIV;

    $seq = $db->get_Seq_by_id('94284');      # LANL sequence id
    $seq = $db->get_Seq_by_acc('EF432710');  # GenBank accession

    $q = new Bio::DB::Query::HIVQuery(
        " (C D)[subtype] SI[phenotype] (symptomatic AIDS)[patient_health] " );

    $seqio = $db->get_Stream_by_query($q);
    $seq = $seqio->next_seq();
    ($seq->annotation->get_Annotations('Virus'))[0]->{subtype}
    # returns 'D'
    ($seq->annotation->get_Annotations('Patient'))[0]->{patient_health}
    # returns 'AIDS'
    ($seq->annotation->get_Annotations('accession'))[0]->{value}
    # returns 'K03454'

=head1 DESCRIPTION

Bio::DB::HIV, along with L<Bio::DB::Query::HIVQuery>, provides an
interface for obtaining annotated HIV and SIV sequences from the Los
Alamos National Laboratory (LANL) HIV Sequence Database
( L<http://www.hiv.lanl.gov/> ). Unannotated sequences can be retrieved
directly from the database object, using either LANL ids or GenBank
accessions. Annotations are obtained via a query object, and are
attached to the correct C<Bio::Seq> objects when the query is handled
by C<Bio::DB::HIV::get_Seq_by_query> or
C<Bio::DB::HIV::get_Stream_by_query>.

=head1 FEEDBACK

=head2 Mailing Lists

User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to
the Bioperl mailing list.  Your participation is much appreciated.

  bioperl-l@bioperl.org                  - General discussion
  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists

=head2 Support

Please direct usage questions or support issues to the mailing list:

I<bioperl-l@bioperl.org>

rather than to the module maintainer directly. Many experienced and
responsive experts will be able to look at the problem and quickly
address it. Please include a thorough description of the problem
with code and data examples if at all possible.

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution. Bug reports can be submitted via
the web:

  http://bugzilla.open-bio.org/

=head1 AUTHOR - Mark A. Jensen

Email maj@fortinbras.us

=head1 CONTRIBUTORS

Mark A. Jensen

=head1 APPENDIX

The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _

=cut

# Let the code begin...

package Bio::DB::HIV;
use strict;
use warnings;

# Package-level defaults for the LANL endpoints; overridable per object
# through the constructor arguments of the same (upper-case) names.
our ($LANL_BASE, $LANL_MAP_DB, $LANL_MAKE_SEARCH_IF, $LANL_SEARCH);

# Object preamble - inherits from Bio::DB::WebDBSeqI

use Bio::Root::Root;
use HTTP::Request::Common;
use Bio::DB::HIV::HIVAnnotProcessor;

use base qw(Bio::DB::WebDBSeqI);

BEGIN {
    # base change of 01/14/09
    $LANL_BASE           = "http://www.hiv.lanl.gov/components/sequence/HIV/asearch";
    $LANL_MAP_DB         = "map_db.comp";
    $LANL_MAKE_SEARCH_IF = "make_search_if.comp";
    $LANL_SEARCH         = "search.comp";

    # exception classes thrown by this module
    @Bio::ResponseProblem::Exception::ISA = qw( Bio::Root::Exception );
    @Bio::HIVSorry::Exception::ISA        = qw( Bio::Root::Exception );
    @Bio::WebError::Exception::ISA        = qw( Bio::Root::Exception );
}

=head1 Constructor

=head2 new

 Title   : new
 Usage   : my $obj = new Bio::DB::HIV();
 Function: Builds a new Bio::DB::HIV object
 Returns : an instance of Bio::DB::HIV
 Args    : optional named arguments -LANL_BASE, -LANL_MAP_DB,
           -LANL_MAKE_SEARCH_IF, -LANL_SEARCH overriding the
           package defaults; all arguments are also passed to
           the superclass constructor

=cut

sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    my ($lanl_base, $lanl_map_db, $lanl_make_search_if, $lanl_search) =
        $self->_rearrange(
            [qw( LANL_BASE LANL_MAP_DB LANL_MAKE_SEARCH_IF LANL_SEARCH )],
            @args
        );

    # caller-supplied overrides
    $self->lanl_base($lanl_base)                if $lanl_base;
    $self->map_db($lanl_map_db)                 if $lanl_map_db;
    $self->make_search_if($lanl_make_search_if) if $lanl_make_search_if;
    $self->search_($lanl_search)                if $lanl_search;

    # defaults
    $self->lanl_base($LANL_BASE)                unless $self->lanl_base;
    $self->map_db($LANL_MAP_DB)                 unless $self->map_db;
    $self->make_search_if($LANL_MAKE_SEARCH_IF) unless $self->make_search_if;
    $self->search_($LANL_SEARCH)                unless $self->search_;
    $self->url_base_address($self->lanl_base)   unless $self->url_base_address;

    # postprocess_data() always converts the LANL response to FASTA
    $self->request_format("fasta");
    return $self;
}

=head1 WebDBSeqI compliance

=head2 get_request

 Title   : get_request
 Usage   : my $url = $self->get_request
 Function: returns a HTTP::Request object
 Returns : a POST request (HTTP::Request) that downloads the sequences
           selected by the "pre-search" performed in this method
 Args    : %qualifiers = a hash of qualifiers with keys in
           (-uids, -format, -mode, -query)
 Throws  : Bio::Root::BadParameter for malformed qualifiers,
           Bio::HIVSorry::Exception for unsupported modes (gi, version),
           Bio::WebError::Exception for any failure while talking to
           the LANL server
 Note    : Several layers of requests are performed to get to the sequence;
           see Bio::DB::Query::HIVQuery.

=cut

sub get_request {
    my $self  = shift;
    my %quals = @_;
    my ($resp);
    my (@ids, $mode, @interface, @query_parms, $query);

    # html parsing regexps
    my $tags_re = qr{(?:\s*<[^>]+>\s*)};
    # NOTE(review): the next two patterns were reconstructed (the copy of
    # this file under review had lost everything between angle brackets);
    # the session id is expected in a hidden form field named "id" --
    # confirm against the current LANL page markup.
    my $session_id_re  = qr{<input.*name="id".*value="([0-9a-f]+)"}m;
    my $search_form_re = qr{<form[^>]*action=".*/search.comp"};
    my $seqs_found_re  = qr{Displaying$tags_re*(?:\s*[0-9-]*\s*)*$tags_re*of$tags_re*\s*([0-9]+)$tags_re*sequences found};
    my $no_seqs_found_re = qr{Sorry.*no sequences found};
    my $too_many_re      = qr{too many records: $tags_re*([0-9]+)};
    # find something like "tables without join: SequenceAccessions"
    # (embedded in HTML markup)
    my $tbl_no_join_re = qr{tables without join}i;

    # handle "qualifiers"; keys are matched loosely so that both
    # '-mode' and 'mode' (etc.) are accepted
    foreach my $qual (keys %quals) {
        my $val = $quals{$qual};
        if ($qual =~ m/mode/) {
            $mode = $val;
        }
        elsif ($qual =~ m/uids/) {
            $self->throw(-class => "Bio::Root::BadParameter",
                         -text  => "Arrayref required for qualifier \"$qual\"",
                         -value => $val)
                unless ref($val) eq 'ARRAY';
            @ids = @{$val};
        }
        elsif ($qual =~ m/query/) {
            # guard the isa() call: a plain scalar or unblessed ref must
            # produce a BadParameter exception, not a method-call crash
            $self->throw(-class => "Bio::Root::BadParameter",
                         -text  => "Bio::DB::Query::HIVQuery required for qualifier \"$qual\"",
                         -value => $val)
                unless ref($val)
                    && eval { $val->isa("Bio::DB::Query::HIVQuery") };
            $query = $val;
        }
        # anything else (e.g. -format) is ignored here
    }

    # avoid "uninitialized" warnings below when no mode was supplied
    $mode = '' unless defined $mode;

    # what kind of request?
    # (@interface and @query_parms are key/value LISTS for a form post,
    #  so repeated keys are intentional)
    if ($mode =~ m/single/) {
        @interface = (
            'sequenceentry' => 'se_sequence',
            'sequenceentry' => 'se_id',
            'action'        => 'Search Interface'
        );
        @query_parms = map { ('sequenceentry.se_id' => $_) } @ids;
        push @query_parms, (
            'sequenceentry.se_sequence' => 'Any',
            'order'                     => 'sequenceentry.se_id',
            'sort_dir'                  => 'ASC',
            'action'                    => 'Search'
        );
    }
    elsif ($mode =~ m/acc/) {
        @interface = (
            'sequenceentry'      => 'se_sequence',
            'sequenceentry'      => 'se_id',
            'sequenceaccessions' => 'sa_genbankaccession',
            'sequenceaccessions' => 'sa_se_id',
            'action'             => 'Search Interface'
        );
        @query_parms =
            map { ('sequenceaccessions.sa_genbankaccession' => $_) } @ids;
        push @query_parms, (
            'sequenceentry.se_sequence' => 'Any',
            'order'    => 'sequenceaccessions.sa_genbankaccession',
            'sort_dir' => 'ASC',
            'action'   => 'Search'
        );
    }
    elsif ($mode =~ m/gi/) {
        $self->_sorry("-mode=>gi");
    }
    elsif ($mode =~ m/version/) {
        $self->_sorry("-mode=>version");
    }
    elsif ($mode =~ m/query/) {
        my $run_level = $query ? $query->{'_RUN_LEVEL'} : undef;
        $self->throw(-class => "Bio::Root::BadParameter",
                     -text  => "Query ".($run_level
                                   ? "has been run only at run level ".$run_level
                                   : "has not been run").", run at level 2 with _do_query(2)",
                     -value => $run_level)
            unless $run_level && $run_level == 2;
        @interface = (
            'sequenceentry' => 'se_sequence',
            'sequenceentry' => 'se_id',
            'action'        => 'Search Interface'
        );
        # all ids go into a single quoted, tab-separated value
        @query_parms = ("sequenceentry.se_id" =>
                        sprintf("'%s'", join("\t", $query->ids)));
        push @query_parms, (
            'sequenceentry.se_sequence' => 'Any',
            'order'                     => 'sequenceentry.se_id',
            'sort_dir'                  => 'ASC',
            'action'                    => 'Search'
        );
    }
    # any other mode falls through with an empty interface/query

    # web work
    eval { # capture web errors; throw below...
        # negotiate a session with lanl db
        if (!$self->_session_id) {
            $resp = $self->ua->get($self->_map_db_uri);
            die "Connect failed" unless $resp->is_success;
            # get the session id
            my ($session_id) = ($resp->content =~ /$session_id_re/);
            $self->_session_id($session_id);
            die "Session not established" unless $self->_session_id;
        }

        # establish correct "interface" for this session id
        $resp = $self->ua->post($self->_make_search_if_uri,
                                [@interface, id => $self->_session_id]);
        die "Interface request failed (1)" unless $resp->is_success;
        $self->_response($resp);
        die "Interface request failed (2)"
            unless $resp->content =~ /$search_form_re/;

        # interface successful, do the "pre-search"
        $resp = $self->ua->post($self->_search_uri,
                                [@query_parms, 'id' => $self->_session_id]);
        die "Search post failed" unless $resp->is_success;
        $self->_response($resp);

        # check for error conditions
        my $content = $resp->content;
        if ($content =~ /$no_seqs_found_re/) {
            die "No sequences found";
        }
        elsif ($content =~ /$too_many_re/) {
            die "Too many records ($1): must be <10000";
        }
        elsif ($content =~ /$tbl_no_join_re/) {
            die "Some required tables went unjoined to query";
        }
        elsif ($content !~ /$seqs_found_re/) {
            die "Unparsed failure";
        }
    };
    if (my $err = $@) {
        # $resp is undefined if the very first request died before
        # returning, so do not call a method on it blindly
        $self->throw(-class => 'Bio::WebError::Exception',
                     -text  => $err,
                     -value => (defined $resp ? $resp->content : undef));
    }

    # "pre-search" successful, return request
    ### check this post update
    return POST $self->_search_uri,
        [ 'action Download.x' => 1,
          'action Download.y' => 1,
          'id'                => $self->_session_id ];
}

=head2 postprocess_data

 Title   : postprocess_data
 Usage   : $self->postprocess_data ( 'type' => 'string',
                                     'location' => \$datastr);
 Function: process downloaded data before loading into a Bio::SeqIO;
           converts the tab-separated LANL response into FASTA, in place
 Returns : void
 Args    : hash with two keys -
           'type' can be 'string' or 'file'
           'location' either file location or string
                      reference containing data
 Throws  : Bio::Root::BadParameter if an argument is missing or no data
           is present, Bio::Root::FileOpenException if the file cannot
           be opened, Bio::ResponseProblem::Exception if the response
           lacks the expected header lines

=cut

sub postprocess_data {
    # parse tab-separated value content from LANL db
    my ($self, %args) = @_;
    my ($type, $loc) = ($args{type}, $args{location});
    my (@data, @cols, %rec, $idkey, @flines);
    $self->throw(-class => 'Bio::Root::BadParameter',
                 -text  => "Argument hash requires values for keys \"type\" and \"location\"",
                 -value => \%args)
        unless ($type && $loc);

    # load the raw response as a list of lines
    if ($type =~ m/string/) {
        @data = split(/\n|\r/, ${$loc});
    }
    elsif ($type =~ m/file/) {
        open(my $in, "<", $loc)
            or $self->throw(-class => 'Bio::Root::FileOpenException',
                            -text  => "Error opening tempfile \"$loc\" for reading",
                            -value => $loc);
        my $content = do { local $/; readline($in) };   # slurp
        close($in);
        @data = split(/\n|\r/, $content) if defined $content;
    }

    $self->throw(-class => 'Bio::Root::BadParameter',
                 -text  => 'No data found in repsonse',
                 -value => \%args)
        unless (@data);

    # skip everything up to and including the number-returned line;
    # stop at end of data rather than looping forever if it is absent
    my $found_count_line = 0;
    while (@data) {
        my $line = shift @data;
        if ($line =~ /Number/) {
            $found_count_line = 1;
            last;
        }
    }
    $self->throw(-class => "Bio::ResponseProblem::Exception",
                 -text  => "Number-returned line not found in LANL response",
                 -value => ${$loc})
        unless $found_count_line;
    $self->throw(-class => "Bio::ResponseProblem::Exception",
                 -text  => "Trouble with column headers in LANL response",
                 -value => '')
        unless @data;

    # next line holds the column headers
    @cols = split(/\t/, shift @data);

    # if Accession column is present, get_Stream_by_acc was called
    # otherwise, return lanl ids
    ($idkey) = grep { /Accession/i } @cols;
    ($idkey) = grep { /SE.id/i } @cols unless defined $idkey;
    $self->throw(-class => "Bio::ResponseProblem::Exception",
                 -text  => "Trouble with column headers in LANL response",
                 -value => join(' ', @cols))
        unless $idkey;

    foreach my $record (@data) {
        # blank entries (e.g. left by CRLF line ends) are not records
        next unless $record =~ /\S/;
        # NOTE(review): line terminators were already removed by the
        # split above, so this drops the last character of each record;
        # presumably the LANL rows carry a trailing delimiter -- confirm.
        chop $record;
        @rec{@cols} = split /\t/, $record;
        push @flines, ">$rec{$idkey}\n".$rec{'Sequence'}."\n";
    }

    # write the FASTA text back to where the data came from
    if ($type =~ m/string/) {
        ${$loc} = join("", @flines);
    }
    elsif ($type =~ m/file/) {
        open(my $out, ">", $loc)
            or $self->throw(-class => 'Bio::Root::FileOpenException',
                            -text  => "Error opening tempfile \"$loc\" for writing",
                            -value => $loc);
        print $out join("", @flines);
        close($out);
    }

    return;
}

=head1 WebDBSeqI overrides

=head2 get_seq_stream

 Title   : get_seq_stream
 Usage   : my $seqio = $self->get_seq_stream(%qualifiers)
 Function: builds a url and queries a web db
 Returns : a Bio::SeqIO stream capable of producing sequence
 Args    : %qualifiers = a hash qualifiers that the implementing class
           will process to make a url suitable for web querying
 Note    : Some tightening up of the baseclass version

=cut

sub get_seq_stream {
    my ($self, %qualifiers) = @_;
    my ($rformat, $ioformat) = $self->request_format();

    my ($key) = grep { /format$/ } keys %qualifiers;
    $qualifiers{'-format'} = ($key ? $qualifiers{$key} : $rformat);
    # NOTE(review): this reads key 'format' although '-format' was set
    # just above, so a caller-supplied format normally has no effect
    # here. Left as-is on purpose: postprocess_data() always emits
    # FASTA, so honouring another format would break parsing -- confirm.
    ($rformat, $ioformat) = $self->request_format($qualifiers{'format'});

    # web work is here/maj
    my $request = $self->get_request(%qualifiers);

    # authorization is here/maj
    $request->proxy_authorization_basic($self->authentication)
        if ($self->authentication);
    $self->debug("request is ". $request->as_string(). "\n");

    # workaround for MSWin systems (no forking available/maj)
    $self->retrieval_type('io_string')
        if $self->retrieval_type =~ /pipeline/ && $^O =~ /^MSWin/;

    if ($self->retrieval_type =~ /pipeline/) {
        # Try to create a stream using POSIX fork-and-pipe facility.
        # this is a *big* win when fetching thousands of sequences from
        # a web database because we can return the first entry while
        # transmission is still in progress.
        # Also, no need to keep sequence in memory or in a temporary file.
        # If this fails (Windows, MacOS 9), we fall back to non-pipelined
        # access.

        # fork and pipe: _stream_request()=>
        my ($result, $stream) = $self->_open_pipe();

        if (defined $result) {
            $DB::fork_TTY = File::Spec->devnull; # prevents complaints from debugger
            if (!$result) { # in child process
                $self->_stream_request($request, $stream);
                POSIX::_exit(0); #prevent END blocks from executing in this forked child
            }
            else {
                return Bio::SeqIO->new('-verbose' => $self->verbose,
                                       '-format'  => $ioformat,
                                       '-fh'      => $stream);
            }
        }
        else {
            $self->retrieval_type('io_string');
        }
    }

    if ($self->retrieval_type =~ /temp/i) {
        my $dir = $self->io->tempdir( CLEANUP => 1 );
        my ($fh, $tmpfile) = $self->io()->tempfile( DIR => $dir );
        close $fh;
        my $resp = $self->_request($request, $tmpfile);
        if ( !-e $tmpfile || -z $tmpfile || !$resp->is_success() ) {
            $self->throw("WebDBSeqI Error - check query sequences!\n");
        }
        $self->postprocess_data('type' => 'file', 'location' => $tmpfile);
        # this may get reset when requesting batch mode
        ($rformat, $ioformat) = $self->request_format();
        if ($self->verbose > 0) {
            # best-effort debug dump; a failed open is not fatal here
            if (open(my $dump_fh, "<", $tmpfile)) {
                while (my $line = readline($dump_fh)) {
                    $self->debug($line);
                }
                close($dump_fh);
            }
        }

        return Bio::SeqIO->new('-verbose' => $self->verbose,
                               '-format'  => $ioformat,
                               '-file'    => $tmpfile);
    }

    if ($self->retrieval_type =~ /io_string/i) {
        my $resp    = $self->_request($request);
        my $content = $resp->content_ref;
        $self->debug("content is $$content\n");
        if (!$resp->is_success() || length($$content) == 0) {
            $self->throw("WebDBSeqI Error - check query sequences!\n");
        }
        ($rformat, $ioformat) = $self->request_format();
        $self->postprocess_data('type' => 'string', 'location' => $content);
        $self->debug("str is $$content\n");
        return Bio::SeqIO->new('-verbose' => $self->verbose,
                               '-format'  => $ioformat,
                               '-fh'      => IO::String->new($$content));
    }

    # if we got here, we don't know how to handle the retrieval type
    $self->throw("retrieval type " .
                 $self->retrieval_type .
                 " unsupported\n");
}

=head2 get_Stream_by_acc

  Title   : get_Stream_by_acc
  Usage   : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
  Function: Gets a series of Seq objects by GenBank accession numbers
  Returns : a Bio::SeqIO stream object
  Args    : an arrayref of accession numbers for
            the desired sequence entries (a single accession
            given as a plain scalar is also accepted)
  Note    : For LANL DB, alternative to LANL seqids

=cut

sub get_Stream_by_acc {
    my ($self, $ids) = @_;
    # get_request() requires an arrayref for -uids; wrap a lone scalar
    # but pass a caller-supplied arrayref through untouched (wrapping
    # it again would submit a stringified reference as the accession)
    my $uids = (ref($ids) eq 'ARRAY') ? $ids : [$ids];
    return $self->get_seq_stream('-uids' => $uids, '-mode' => 'acc');
}

=head2 get_Stream_by_query

  Title   : get_Stream_by_query
  Usage   : $stream = $db->get_Stream_by_query($query);
  Function: Gets a series of Seq objects by way of a query string or object
  Returns : a Bio::SeqIO stream object
  Args    : $query :   Currently, only a Bio::DB::Query::HIVQuery object.
                       It's a good idea to create the query object first
                       and interrogate it for the entry count before you
                       fetch a potentially large stream.

=cut

sub get_Stream_by_query {
    my ($self, $query) = @_;
    my $stream = $self->get_seq_stream('-query' => $query, '-mode' => 'query');
    # wrap the raw sequence stream so annotations from the query are
    # attached to the sequences it yields
    return Bio::DB::HIV::HIVAnnotProcessor->new(
        -hiv_query     => $query,
        -source_stream => $stream
    );
}

# _request
#
# Sends $request through the user agent, saving the response body to
# $tmpfile when one is given. Throws if the response is an error;
# otherwise returns the response object.
sub _request {
    my ($self, $request, $tmpfile) = @_;
    my ($resp);

    if (defined $tmpfile && $tmpfile ne '') {
        $resp = $self->ua->request($request, $tmpfile);
    }
    else {
        $resp = $self->ua->request($request);
    }

    if ($resp->is_error) {
        $self->throw("WebDBSeqI Request Error:\n".$resp->as_string);
    }

    return $resp;
}

=head1 Internals

=head2 lanl_base

 Title   : lanl_base
 Usage   : $obj->lanl_base($newval)
 Function: get/set the base url of the LANL HIV database
 Example :
 Returns : value of lanl_base (a scalar)
 Args    : on set, new value (a scalar or undef, optional)

=cut

sub lanl_base {
    my $self = shift;

    return $self->{'lanl_base'} = shift if @_;
    return $self->{'lanl_base'};
}

=head2 map_db

 Title   : map_db
 Usage   : $obj->map_db($newval)
 Function: get/set the cgi filename for map_db ("Database Map")
 Example :
 Returns : value of map_db (a scalar)
 Args    : on set, new value (a scalar or undef, optional)

=cut

sub map_db {
    my $self = shift;

    return $self->{'map_db'} = shift if @_;
    return $self->{'map_db'};
}

=head2 make_search_if

 Title   : make_search_if
 Usage   : $obj->make_search_if($newval)
 Function: get/set the cgi filename for make_search_if
           ("Make Search Interface")
 Example :
 Returns : value of make_search_if (a scalar)
 Args    : on set, new value (a scalar or undef, optional)

=cut

sub make_search_if {
    my $self = shift;

    return $self->{'make_search_if'} = shift if @_;
    return $self->{'make_search_if'};
}

=head2 search_

 Title   : search_
 Usage   : $obj->search_($newval)
 Function: get/set the cgi filename for the search query page
           ("Search Database")
 Example :
 Returns : value of search_ (a scalar)
 Args    : on set, new value (a scalar or undef, optional)

=cut

sub search_ {
    my $self = shift;

    return $self->{'search_'} = shift if @_;
    return $self->{'search_'};
}

=head2 _map_db_uri

 Title   : _map_db_uri
 Usage   :
 Function: return the full map_db uri ("Database Map")
 Example :
 Returns : scalar string
 Args    : none

=cut

sub _map_db_uri {
    my $self = shift;
    return $self->url_base_address."/".$self->map_db;
}

=head2 _make_search_if_uri

 Title   : _make_search_if_uri
 Usage   :
 Function: return the full make_search_if uri ("Make Search Interface")
 Example :
 Returns : scalar string
 Args    : none

=cut

sub _make_search_if_uri {
    my $self = shift;
    return $self->url_base_address."/".$self->make_search_if;
}

=head2 _search_uri

 Title   : _search_uri
 Usage   :
 Function: return the full search cgi uri ("Search Database")
 Example :
 Returns : scalar string
 Args    : none

=cut

sub _search_uri {
    my $self = shift;
    return $self->url_base_address."/".$self->search_;
}

=head2 _session_id

 Title   : _session_id
 Usage   : $obj->_session_id($newval)
 Function: Contains HIV db session id (initialized in get_request)
 Example :
 Returns : value of _session_id (a scalar)
 Args    : on set, new value (a scalar or undef, optional)

=cut

sub _session_id {
    my $self = shift;

    return $self->{'_session_id'} = shift if @_;
    return $self->{'_session_id'};
}

=head2 _response

 Title   : _response
 Usage   : $obj->_response($newval)
 Function: hold the response to search post
 Example :
 Returns : value of _response (a scalar)
 Args    : on set, new value (a scalar or undef, optional)

=cut

sub _response {
    my $self = shift;

    return $self->{'_response'} = shift if @_;
    return $self->{'_response'};
}

=head2 _sorry

 Title   : _sorry
 Usage   : $hiv->_sorry
 Function: Throws an exception for unsupported option or parameter
 Example :
 Returns :
 Args    : scalar string

=cut

sub _sorry {
    my $self = shift;
    my $parm = shift;
    $self->throw(-class => "Bio::HIVSorry::Exception",
                 -text  => "Sorry, option/parameter \"$parm\" not (yet) supported. See manpage to complain.",
                 -value => $parm);
    return;
}

1;