#!/usr/local/bin/perl -w

use strict;
use warnings;

use File::Basename;
use File::Find;
use Net::FTP;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;

my $server = 'http://rsat.ulb.ac.be/rsat/web_services';
my $rsat_lib_url = 'http://rsat.ulb.ac.be/rsat/web_services/RSATWS.wsdl';

################################################################################
# Subroutines
################################################################################

# Print a usage message.
sub print_usage {
  my($usage, $status) = @_;

  if (-c STDOUT) {              # standard output is a terminal
    open(C, "| more");
    print C $usage;
    close C;
  } else {                      # standard output not a terminal
    print STDERR $usage;
  }

  exit $status;
}

# This subroutine fetches the list of desired databases from the web server URL
sub fetch_db_list {
  my ($csv_file) = @_;
  my $url = "http://meme.nbcr.net/meme/cgi-bin/get_db_list.cgi?db_names=$csv_file";
  my $ua = LWP::UserAgent->new();
  my $req = HTTP::Request->new(GET => $url);
  my $response = $ua->request($req);
  if ($response->is_error) {
    printf "Couldn't get URL $url -- %s\n", $response->status_line;
  } else {
    return($response->content());
  }
} # fetch_db_list

# This subroutine downloads a BlastDb database from NCBI using
# update_blastdb.pl, converts it to a FASTA file using fastacmd,
# and installs it in the db directory.
sub download_blastdb ($$) {
  my $db_root = shift @_;       # Root of db name
  my $db_suffix = shift @_;     # Suffix of db name:
                                #   aa = amino acid
                                #   na = nucleotide.

  my $update_blastdb_path = file_in_path("update_blastdb.pl");
  if ($update_blastdb_path eq "") {
    die "FAILED to download $db_root, update_blastdb.pl could not be found\n";
  }
  my $fastacmd_path = file_in_path("fastacmd");
  if ($fastacmd_path eq "") {
    die "FAILED to convert $db_root to FASTA, fastacmd could not be found\n";
  }
  system "$update_blastdb_path $db_root";

  # Expand tar balls
  my @tarballs = glob "$db_root.*tar.gz";
  foreach my $tarball (@tarballs) {
    system "tar zxf $tarball";
  }

  # Create FASTA file from BlastDb format files.
  system "$fastacmd_path -D 1 -d $db_root -o $db_root.$db_suffix";
}

# This subroutine searches for the passed file name in the execution path.
# Returns the first full path to the file, or an empty string if not found.
sub file_in_path($) {
  my $file = shift @_;
  my @dirList = split /:/, $ENV{PATH};
  my $found_path = "";
  foreach my $dir (@dirList) {
    if (-e "$dir/$file") {
      $found_path = "$dir/$file";
      last;
    }
  }
  return $found_path;
}

# This subroutine uses regular expressions to break a string in the form of
# host/path/base_name.suffix into its component parts.
# It takes a single argument which is the input string,
# and returns an array containing (host, path, base_name, suffix).
sub parse_source($) {
  my $source = shift @_;
  (my $base_name, my $path, my $suffix) = fileparse($source, ("\.gz", "\.Z"));
  # Split host and directory from full path.
  my $host = $path;
  $host =~ s?(^[^\/]*)(\/.*)?$1?;
  $path =~ s?(^[^\/]*)(\/.*)?$2?;
  return ($host, $path, $base_name, $suffix);
}

# Check that files exist and are non-empty: file, file.bfile & file.nseqs
sub check_file ($) {
  my($dest) = @_;
  my $bfile = $dest . ".bfile";
  my $nseqs = $dest . ".nseqs";
  my $file;
  foreach $file ($dest, $bfile, $nseqs) {
    if (-z $file) {
      print("File $file is empty.\n");
    } elsif (! -e $file) {
      print("File $file does not exist.\n");
    }
  }
} # check_file

# Create the bfile and nseqs file for a given fasta file
sub create_bfile_and_nseqs ($$$) {
  my ($dest, $type, $check) = @_;
  my $file;

  $file = $dest . ".nseqs";
  if (!$check || (-z $file || (!
      -e $file))) {
    print "Creating file $file.\n" if ($check);
    `grep -c '^>' $dest > $file`;
  }

  $file = $dest . ".bfile";
  if (!$check || (-z $file || (! -e $file))) {
    print "Creating file $file.\n" if ($check);
    if ($type eq "p") {
      `/home/rmott/meme/bin/getsize -prot $dest > $file 2> /dev/null`;
    } else {
      `/home/rmott/meme/bin/getsize -dna $dest > $file 2> /dev/null`;
    }
  }
} # create_bfile_and_nseqs

# This subroutine updates a local file with a remote file from an FTP server.
# If the local file already exists, and is writeable, it compares the
# modification timestamp of the local file with that of the remote file,
# and only performs the update if the two timestamps are unequal.
# If the remote file is compressed it will be expanded when updating
# the local file. If updated, the modification timestamp will be set
# equal to that of the remote file.
# The subroutine takes four arguments: the name of the remote source file,
# the name of the local destination file, the type of file ("p" for protein
# or "n" for nucleotide), and a flag requesting a check-only run.
# The name of the remote source file is expected to be in the form:
# host/path/base_name.suffix, for example,
# ftp.ncbi.nih.gov/genbank/rel153.fsa_aa.gz. It returns an array of two
# values, ($error_status, $message). $error_status should be 1 if an error
# occurred and 0 otherwise. $message will be a string which may contain an
# explanatory message. If the remote file was not downloaded because its
# modification timestamp matches that of the local destination file,
# $error_status will be 0.
sub update_file($$$$) {
  my $source = shift @_;
  my $dest = shift @_;
  my $type = shift @_;
  my $check_only = shift @_;

  my $result;
  my $message = "";
  my $error_status = 0;
  my $downloaded_update = 0;
  my $mtime_for_remote_file = 0;
  my $tmp_file_name = "update_db.$$.tmp";

  # just check that the file exists and isn't empty
  if ($check_only) {
    check_file($dest);
    #create_bfile_and_nseqs($dest, $type, 1);
    return($error_status, $message);
  }

  # Break up the source string into its component pieces.
  (my $host, my $path, my $base_name, my $suffix) = parse_source($source);

  # Check if local file already exists.
  # If so, we require that it must be an ordinary file
  # and writable in case we need to update it.
  # We also get its last modified time, so we can compare it
  # to that for the source file.
  my $local_filename_exists = "";
  my $mtime_for_local_file = 0;         # will be zero unless non-empty
  my $mtime;
  if (-e $dest) {
    $local_filename_exists = "yes";
    if (! -W $dest || ! -f $dest) {
      $error_status = 1;
      $message = "Unable to write to $dest.\n";
    } elsif (-s $dest) {                # non-zero size
      # Get time stamp of local file
      my @stat_params = stat $dest;
      $mtime_for_local_file = $stat_params[9];
    }
  }

  if (! $error_status) {
    # Connect to source FTP server
    my $ftp = Net::FTP->new($host, Timeout=>6000, Debug=>0, Passive=>1);
    if (! $ftp) {                       # connect failed
      $error_status = 1;
      $message = "Unable to connect to $host.\n";
    } else {                            # connect succeeded
      # Login to FTP server and set binary transfer mode.
      $result = $ftp->login('anonymous','cegrant@u.washington.edu');
      if (! $result) {
        $error_status = 1;
        $message = "Unable to login as anonymous on $host.\n";
      }
      if (! $error_status) {
        $result = $ftp->binary();
        if (! $result) {
          $error_status = 1;
          $message = "Unable to set binary transfer mode on $host.\n";
        }
      }
      # Move to the appropriate directory.
      if (! $error_status) {
        $result = $ftp->cwd($path);
        if (!
            $result) {
          $error_status = 1;
          $message = "Unable to cd to $path on $host.\n";
        }
      }

      # See if we are getting a single file or a bunch of files
      # by checking if the base_name has wildcards * or ?.
      # If a bunch, files will be concatenated together.
      my $multi_file = ($base_name =~ /[\*\?]/) ? 1 : 0;
      my (@file_list, $file);

      # If the local destination file doesn't exist, or if it does exist
      # but is older than the server source file, download the source file
      # from the server.
      if (! $error_status) {
        # get the list of files to be concatenated together
        @file_list = $multi_file ? $ftp->ls("$base_name$suffix") : "$base_name$suffix";

        # get the latest time stamp for the remote files
        $mtime_for_remote_file = 0;
        foreach $file (@file_list) {
          $mtime = $ftp->mdtm("$file");
          $mtime_for_remote_file = $mtime if ($mtime > $mtime_for_remote_file);
          my $size = $ftp->size("$file");
          print "$file mtime $mtime remote_size $size\n";
        }
        #print "local: $mtime_for_local_file remote: time $mtime_for_remote_file\n";

        # download the file (or files) to a temporary file
        if (! $local_filename_exists or ($mtime_for_remote_file != $mtime_for_local_file)) {
          my $nfiles = @file_list;
          foreach $file (@file_list) {
            my $app_file_name = $tmp_file_name . ".app";
            $result = $ftp->get($file, "$app_file_name");
            if ($result) {
              if ($suffix eq '.Z' or $suffix eq ".gz") {        # compressed
                $result = system("zcat < $app_file_name >> $tmp_file_name");
                if ($result) {
                  $error_status = 1;
                  $message = "Unable to expand compressed file $app_file_name into $tmp_file_name.\n";
                  unlink "$app_file_name";      # remove partially uncompressed file
                }
              } else {                          # not compressed
                if ($nfiles == 1) {             # only 1 file, link to save time
                  link $app_file_name, $tmp_file_name;
                } else {                        # concatenate multiple files
                  $result = system("cat $app_file_name >> $tmp_file_name");
                  if ($result) {
                    $error_status = 1;
                    $message = "Unable to concatenate file $app_file_name onto $tmp_file_name.\n";
                    unlink "$app_file_name";    # remove app file
                  }
                }
              }
              unlink $app_file_name;            # done with this temp file
            } else {
              $error_status = 1;
              $message = "Unable to get $file from $path on $host.\n";
              last;
            } # ftp->get
          }
          # succeeded with download if no errors
          $downloaded_update = 1 unless $error_status;
        } else {
          # already up-to-date
          $error_status = 0;
          $message = "Skipped: Local file $dest is up-to-date.\n\n";
        } # compare dates
      } # error_status ?
      $ftp->quit();
    } # ftp connection?
  } # error_status ?

  # If we downloaded an update we may need to do some post processing.
  if ($downloaded_update) {
    # rename the temporary file to destination file
    if ($error_status == 0) {
      $result = rename "$tmp_file_name", $dest;
      if ($result == 0) {
        $error_status = 1;
        $message = "Unable to rename temporary file: $!\n\n";
      } else {
        $error_status = 0;
        $message = "Updated $dest from $source.\n";
      }
    }

    # Mark the local file with the time stamp of the remote file;
    # make it readable by all;
    # create a background file;
    # create an "nseqs" file.
    if ($error_status == 0) {
      utime time, $mtime_for_remote_file, $dest;
      chmod 0644, $dest;
      create_bfile_and_nseqs($dest, $type, 0);
    } else {
      # remove the temporary file if we failed
      unlink "$tmp_file_name";
    }

    # create the auxiliary files if they are missing/empty
    create_bfile_and_nseqs($dest, $type, 1);
  }

  return ($error_status, $message);
}

#
# update_file_rsat
#
# Download an upstream database from RSAT.
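# Arguments: the organism name, the 'from' and 'to' limits passed to the RSAT
# retrieve-seq service, the local output file name, a check-only flag, and an
# aux-only flag (rebuild only the .bfile/.nseqs auxiliary files).
# Returns ($error_status, $message), as update_file() does.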
#
sub update_file_rsat ($$$$$$) {
  my ($organism, $from, $to, $outfile, $check_only, $aux_only) = @_;

  my $error_status = 0;
  my $message = "";

  # just check that the file exists and isn't empty
  if ($check_only) {
    check_file($outfile);
    return($error_status, $message);
  }

  # just create the auxiliary files if they are missing/empty
  if ($aux_only) {
    create_bfile_and_nseqs($outfile, "n", 1);
    return($error_status, $message);
  }

  # Service call
  require MyInterfaces::RSATWebServices::RSATWSPortType;
  my $soap = MyInterfaces::RSATWebServices::RSATWSPortType->new();
  my %args = (
    'output' => 'client',
    'organism' => $organism,
    'all' => 1,
    'noorf' => 1,
    'from' => $from,
    'to' => $to,
    'feattype' => '',
    'type' => 'upstream',
    'format' => 'fasta',
    'lw' => 50,
    'label' => 'id,name',
    'label_sep' => '',
    'nocom' => 0,
    'repeat' => 0,
    'imp_pos' => 0
  );
  my $som = $soap->retrieve_seq({'request' => \%args});

  unless ($som) {
    $error_status = 1;
    $message = "A fault (" . $som->get_faultcode() . ") occurred: " .
      $som->get_faultstring() . "\n";
  } else {
    my $results = $som->get_response();
    my $command = $results->get_command();
    #print STDERR "Command used on the server: ".$command, "\n";
    open OUT, ">$outfile";
    print OUT $results->get_client();
    close OUT;
    # make the file readable by all
    chmod 0644, $outfile;
    create_bfile_and_nseqs($outfile, "n", 0);
  }
  undef $soap;

  return($error_status, $message);
} # update_file_rsat

################################################################################
# Main program
################################################################################

# defaults
my $PGM = $0;                   # name of program
$PGM =~ s#.*/##;                # remove part up to last slash
my $upstream = 0;
my $noclobber = 0;
my $check_only = 0;
my $csv_file = 'fasta_db.csv';
#my $csv_file = 'mast_db_names.csv';

my $usage = <<USAGE;
USAGE: $PGM [-h] [-upstream] [-noclobber] [-check-only]

  -h            print this usage message and exit
  -upstream     also download upstream sequence databases from RSAT
  -noclobber    do not overwrite existing, non-empty upstream sequence files
  -check-only   only check that the sequence files and their .bfile/.nseqs
                auxiliary files exist and are non-empty

USAGE

# get the command line arguments
while ($#ARGV >= 0) {
  $_ = shift;
  if ($_ eq "-h") {             # help
    print_usage("$usage", 0);
  } elsif ($_ eq "-upstream") {
    $upstream = 1;
    my $rsat_lib = "/home/rmott/meme/lib/perl/RSATWS";
    die("You must link '$rsat_lib' to the RSATWS perl library to use -upstream. " .
      "(Available at $rsat_lib_url).\n") unless (-d $rsat_lib);
    eval "use lib '$rsat_lib'";
  } elsif ($_ eq "-noclobber") {
    $noclobber = 1;
  } elsif ($_ eq "-check-only") {
    $check_only = 1;
  } else {
    print_usage("$usage", 1);
  }
}

# Move to db directory; create it if it doesn't exist
my $db_dir = "/home/rmott/meme/db/fasta_databases";
my $result = 1;
$result = mkdir($db_dir, 0755) unless (-d $db_dir);
if (! $result) {
  die "Couldn't create directory $db_dir.\n";
}
$result = chdir $db_dir;
if (! $result) {
  die "Couldn't change directory to $db_dir.\n";
}

# Read each line from the fasta_db file.
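#
# Each non-comment line of the downloaded list is expected to hold 7 or 8
# comma-separated fields, which the loop below unpacks in this order:
#   db_root, is_protein, is_nucleotide, has_short_seqs,
#   db_menu_name, db_long_name, protein_url [, nucleotide_url]
# A purely illustrative (hypothetical) entry might look like:
#   genbank_release,1,1,0,GenBank,GenBank Release,ftp.ncbi.nih.gov/genbank/gb*.fsa_aa.gz,ftp.ncbi.nih.gov/genbank/gb*.fsa_nt.gz
#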
# Name of file listing files to download
my @lines = split /^/m, fetch_db_list($csv_file);

my $line_number = 0;
foreach $_ (@lines) {
  $line_number++;
  if (/^#/ or /^\s/ or /^-/) {
    next;                       # skip comments, lines with initial blanks
  }

  # Parse the line
  chomp;
  my @words = split /,/;
  my $len = @words;
  next if ($words[0] eq "");    # skip header lines

  # check that number of fields is right; allow for last field to be blank or missing
  if ($len > 8 || $len < 7) {
    if ($len < 7) {
      print STDERR "\nMissing fields in line $line_number of db names file ($len of 8 found)\n";
    } else {
      print STDERR "\nExtra fields in line $line_number of db names file ($len of 8 found)\n";
    }
    print STDERR "Line is: $_\n";
    print STDERR "No action taken.\n";
    next;
  }

  my $db_root = $words[0];
  my $is_protein = $words[1];
  my $is_nucleotide = $words[2];
  my $has_short_seqs = $words[3];
  my $db_menu_name = $words[4];
  my $db_long_name = $words[5];
  my $protein_url = $words[6];
  my $nucleotide_url = $len == 8 ? $words[7] : "";

  print STDERR "\nDB: $db_root\n" unless ($check_only);

  # Download the file(s)
  my $error_status;
  my $message;

  # protein
  if (uc $is_protein eq "1") {
    my $outfile = "$db_root.aa";
    ($error_status, $message) = update_file($protein_url, $outfile, "p", $check_only);
    if ($error_status) {
      print STDERR "FAILED to update $outfile. $message";
    } else {
      print STDERR "$message";
    }
  }

  # nucleotide
  if (uc $is_nucleotide eq "1") {
    my $outfile = "$db_root.na";
    # upstream sequence database?
    if ($db_root =~ /^\s*(\S+)_upstream\s*$/) {
      my $organism = $1;
      # downloading upstream files as well?
      if ($upstream) {
        my $from = $protein_url;
        my $to = $nucleotide_url;
        # check and update only auxiliary files if sequence file exists and -noclobber
        if ($noclobber && -e $outfile && -s $outfile) {
          ($error_status, $message) = update_file_rsat($organism, $from, $to, $outfile, $check_only, 1);
          $message = "Skipped: -noclobber given and non-empty '$outfile' exists.\n";
        } else {
          ($error_status, $message) = update_file_rsat($organism, $from, $to, $outfile, $check_only, 0);
        }
      } else {
        $message = "Skipped: -upstream not given.\n";
      }
    } else {
      # normal database
      ($error_status, $message) = update_file($nucleotide_url, "$outfile", "n", $check_only);
    }
    if ($error_status) {
      print STDERR "FAILED to update $outfile. $message";
    } else {
      print STDERR "$message\n";
    }
  }
}
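#
# Note on outputs: for every database updated, the script leaves behind the
# FASTA file itself plus the two auxiliary files written by
# create_bfile_and_nseqs().  For a hypothetical protein database "mydb" the
# db directory would end up containing something like:
#   mydb.aa          the FASTA sequences
#   mydb.aa.bfile    background file produced by getsize
#   mydb.aa.nseqs    sequence count produced by grep -c '^>'
#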