#!/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/blastz-run-ucsc instead. # Based on Scott Schwartz's bash script blastz-run; # rewritten in perl and extended to handle .2bit inputs and # multi-record fasta for target as well as query by Angie Hinrichs. # $Id: blastz-run-ucsc,v 1.11 2009/08/13 23:15:57 hiram Exp $ use Getopt::Long; use strict; use warnings; use vars qw/ $opt_outFormat $opt_gz $opt_dropSelf $opt_help /; sub usage { my ($status) = @_; my $base = $0; $base =~ s/^(.*\/)?//; print STDERR " usage: $base target query DEF out options: -outFormat (psl|axt) Convert blastz output from lav to psl or axt. -gz Compress output with gzip. -dropSelf When converting to psl or axt, drop alignment blocks that cross the diagonal (trivial self- alignments). Unpacks sequence (if necessary), runs blastz and lifts/\"normalizes\" output (if necessary). Target and query can be list files containing one sequence specifier per line, or sequence specifiers. A sequence specifier is a .2bit, .nib or .fa(.gz) files, possibly with a sequence and range specifier appended. DEF is a Scott Schwartz-style bash script containing blastz parameters. out is the (lifted/\"normalized\") output of blastz (optionally converted to axt or psl, and/or gzipped). This script does not pay attention to out's file suffixes -- it is up to you to make sure that they are consistent with output format and compression options. \n"; exit $status; } sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions("outFormat=s", "gz", "dropSelf", "help"); &usage(1) if (!$ok); &usage(0) if ($opt_help); if ($opt_outFormat) { $opt_outFormat =~ tr/A-Z/a-z/; if ($opt_outFormat eq 'lav') { $opt_outFormat = undef; } elsif ($opt_outFormat !~ /^(psl|axt)$/) { print STDERR "\nUnsupported -outFormat $opt_outFormat.\n"; &usage(1); } } if ($opt_dropSelf && !($opt_outFormat && $opt_outFormat =~ /^(psl|axt)$/)) { print STDERR "\n-dropSelf is only supported for -outFormat axt or psl.\n"; &usage(1); } } # my $machtype = $ENV{'MACHTYPE'}; my $machtype = "x86_64"; # This hash will contain the params defined in the Scott-style DEF script: my %defVars = (); # Temporary local directory for intermediate files: my $TMP; # For lifting, we'll need to know the sequence sizes: my %tSizes = (); my %qSizes = (); my $machType = `uname -m`; chomp $machType; $machType = "i386" if ($machType eq "i686"); my $defaultPath = ("/cluster/bin/penn/$machType:/cluster/bin/penn:" . "/cluster/bin/scripts:/cluster/bin/$machType:" . '/bin:/usr/bin'); sub cleanDie { # Clean up $TMP (if it has been created) and then die. my ($msg) = @_; if ($TMP && -d $TMP) { system("/bin/sh", "-c", "/bin/rm -rf $TMP"); } die $msg; } sub run { # Run a command in sh, with PATH specified in %defVars. my ($cmd) = @_; my $setPath = "export PATH="; $setPath .= "$defVars{PATH}:" if (defined $defVars{'PATH'}); $setPath .= "$defaultPath; "; system("/bin/sh", "-c", $setPath . $cmd) == 0 || &cleanDie("Command failed:\n$setPath\n$cmd\n"); } sub nfsNoodge { # Try to access the given file/dir; if it fails, don't die, just # wait a second, try again. # Scott's blastz-run included a similar trick. my ($file) = @_; my $ret = system("/bin/sh", "-c", "/bin/ls -d $file > /dev/null"); if ($ret != 0) { sleep 1; $ret = system("/bin/sh", "-c", "/bin/ls -d $file > /dev/null"); sleep 1 if ($ret != 0); } } sub loadDef { # Read parameters from a bash script with Scott's param variable names: my ($def) = @_; &nfsNoodge($def); open(DEF, "$def") || &cleanDie("Can't open def file $def: $!\n"); while () { s/^\s*export\s+//; next if (/^\s*#/ || /^\s*$/); if (/(\w+)\s*=\s*(.*)/) { my ($var, $val) = ($1, $2); while ($val =~ /\$(\w+)/) { my $subst = $defVars{$1}; $val =~ s/\$$1/$subst/ if (defined $subst); } $defVars{$var} = $val; } } close(DEF); } sub loadSeqSizes { # Load up sequence -> size mapping from $sizeFile into $hashRef. my ($sizeFile, $hashRef) = @_; &nfsNoodge($sizeFile); open(SIZE, "$sizeFile") || &cleanDie("Can't open size file $sizeFile: $!\n"); while () { chomp; my ($seq, $size) = split; $hashRef->{$seq} = $size; } close(SIZE); } sub fileBase { # Get rid of leading path and .suffix (and .gz too if it's there). my ($path) = @_; $path =~ s/^(.*\/)?(\S+)\.\w+(\.gz)?/$2/; return $path; } sub parseFileSpec { # Take a file spec with possible seq name, start and end specifiers and # split into components. my ($info, $dir) = @_; my ($fName, $seq, $start, $end); if ($info =~ /^(\S+):(\S+):(\d+)-(\d+)$/) { ($fName, $seq, $start, $end) = ($1, $2, $3, $4); } elsif ($info =~ /^(\S+):(\d+)-(\d+)$/) { ($fName, $seq, $start, $end) = ($1, &fileBase($info), $2, $3); } else { ($fName, $seq, $start, $end) = ($info, &fileBase($info), 0, 0); } $fName = "$dir/$fName" if ($fName !~ /^[\/.]/); return ($fName, $seq, $start, $end); } sub enumerateFileSpecs { # Given a sequence file name, sequence file name + offset spec, or # a name of a file containing a list of such file specs, return a # list of file specs. my ($listOrSpec, $seqDir) = @_; my @specs = (); my ($fName, $seq, $start, $end) = &parseFileSpec($listOrSpec, $seqDir); if ($fName =~ /\.(fa(\.gz)?|nib|2bit)$/) { # It appears to be a single spec not a list. push @specs, $listOrSpec; } else { # It doesn't have a recognized file ending -- it had better be a list. # Assume each line contains a single spec. &nfsNoodge($listOrSpec); open(LST, "$listOrSpec") || &cleanDie("Couldn't open list file $listOrSpec: $!\n"); while () { chomp; push @specs, $_; } close(LST); } return @specs; } sub collapseFileSpecs { # If we get a big old list of query sequences that are not subranged, # then cat them all into one big .fa file so we can invoke blastz # only once, and don't have to lift afterwards. my @fileSpecs = @_; my $collapsed = 0; if (scalar(@fileSpecs) == 1) { # Only one spec in the list ==> no collapsing to do. return ($collapsed, @fileSpecs); } # Now see if there are any specs that look like subranges: foreach my $spec (@fileSpecs) { my ($fname, $seq, $start, $end) = &parseFileSpec($spec, "."); if ($start > 0) { # Found a subrange; no collapsing. return ($collapsed, @fileSpecs); } } # OK, we're in the clear to concatenate all of these complete sequences # onto one local file. my $big = "$TMP/bigCollapse.fa"; # avoiding exec of shell here instead of going through &run # and making an empty new file $big &run("/bin/cp /dev/null $big"); foreach my $spec (@fileSpecs) { my ($seq, $local) = unpackForBlastz($spec, '.', 'tmp'); $local =~ s/\[\S+\]$//; if ($local =~ /\.nib$/) { my $nib = $local; $local =~ s/\.nib/.fa/; $local =~ s/(.*\/)/tmp./; my $size = `nibSize $nib | awk '{print \$3;}'`; chomp $size; &run("nibFrag -masked $nib 0 $size + $local"); } &run("/bin/cat $local >> $big"); &run("/bin/rm -f $local") if ($local =~ /^$TMP/); } $collapsed = 1; @fileSpecs = ($big); return ($collapsed, @fileSpecs); } sub getBlastzParams { # Look for BLASTZ_? param specs in %defVars; turn into blastz command line # option string. (blastz command line option names are all 1-character, # and all 1-char BLASTZ_? DEF params are blastz command line options.) my $params = " "; foreach my $p (keys %defVars) { if ($p =~ /^BLASTZ_(\w)$/) { $params .= "$1=$defVars{$p} "; } } return $params; } sub blastzRangeSpec { # Return a blastz-style range spec unless a null range is passed in. my ($start, $end) = @_; if ($end != 0) { $start += 1; return "[$start,$end]"; } else { return ""; } } sub unpackForBlastz { # If the sequence file is .fa.gz or .2bit, uncompress/extract fa into # a local file; if it's something that blastz already supports like # .fa or .nib, translate the offset spec (if any) into blastz's format. my ($spec, $seqDir, $prefix) = @_; my ($fname, $seq, $start, $end) = &parseFileSpec($spec, $seqDir); my $local = "$TMP/$prefix.$seq.$start.$end.fa"; if ($fname =~ /\.2bit$/) { # Don't use twoBitToFa's range extraction capabilities -- that # confuses blastz/normalizeLav. Just extract the whole thing as # fa and tell blastz what range we're using: &run("/cluster/bin/$machtype/twoBitToFa $fname:$seq $local"); $local .= &blastzRangeSpec($start, $end); } elsif ($fname =~ /\.fa(\.gz)?$/) { if ($1) { &run("/bin/gunzip -c $fname > $local"); } else { $local = $fname; } $local .= &blastzRangeSpec($start, $end); } elsif ($fname =~ /\.nib$/) { $local = $fname; $local .= &blastzRangeSpec($start, $end); } else { &cleanDie("Don't know what type of sequence file this is: $fname"); } return ($seq, $local); } sub parseBlastzSpec { # Extract info back out of blastz spec. Leave start 1-based. my ($spec) = @_; my ($fname, $seq, $start, $end); if ($spec =~ /^(\S*\/)?(\S+)\.(fa|nib)(\[(\d+),(\d+)\])?$/) { $fname = "$1$2.$3"; $seq = $2; if ($4) { $start = $5; $end = $6; } else { $start = 0; $end = 0; } } else { &cleanDie("Can't parse this blastz spec: $spec"); } return ($fname, $seq, $start, $end); } sub emptyLav { # satisify later processing steps with an empty lav result my ($raw) = @_; open (FH,">$raw") || &cleanDie("emptyLav: can not write to $raw $!\n"); print FH <<_EOF_ #:lav d { "blastz target query H=2000 A C G T 91 -114 -31 -123 -114 100 -125 -31 -31 -125 100 -114 -123 -31 -114 91 O = 400, E = 30" } m { n 0 } #:eof _EOF_ ; close (FH); } sub checkSubRange { # given an fa or nib file name: file.fa|nib[one_based_start,end] # verify that bit of sequence has usable sequence for blastz my ($faNibFile) = @_; my ($file, $seq, $one_start, $end) = &parseBlastzSpec($faNibFile); my $start = $one_start - 1; # 1 based becomes 0 based coord my $answer = 0; # default answer is usable sequence exists return($answer) if ((0 == $end) || (($end - $start) > 10000)); if ($file =~ m/.nib$/) { $answer = `nibFrag -masked $file $start $end "+" stdout | faSize stdin | grep " 0 upper " | wc -l`; } else { $answer = `faFrag -mixed $file $start $end stdout | faSize stdin | grep " 0 upper " | wc -l`; } return($answer); } sub plainBlastz { # just run blastz -- target and query can have "[$start,$end]" specifiers # where $start is 1-based. Put output in file name $raw. my ($target, $query, $options, $raw) = @_; my $answer = 1; # default answer is that blastz was run OK my $blastz = $defVars{'BLASTZ'} || "lastz"; # these variables will be 1 if there is zero usable sequence # this could be a more extensive check than just for zero. my $targetNoSequence = 0; if ($target =~ m/\[/) { $targetNoSequence = &checkSubRange($target); } else { $targetNoSequence = `cat $target | faSize stdin | grep " 0 upper " | wc -l`; } chomp $targetNoSequence; my $queryNoSequence = 0; if ($query =~ m/\[/) { $queryNoSequence = &checkSubRange($query); } else { $queryNoSequence = `cat $query | faSize stdin | grep " 0 upper " | wc -l`; } chomp $queryNoSequence; if ($targetNoSequence || $queryNoSequence) { &emptyLav($raw); $answer = 0; # not really running blastz } else { &run("$blastz $target $query $options > $raw"); } return($answer); } sub selectRpts { # This used to be an awk, piped to select_rpts, piped to sort -n. # select_rpts is a little perl script, and getting the DEF PATH into the # pipe command seems more trouble than it's worth -- so inline not only # the awk but also the select_rpts here. Still pipe to sort -n just in case. my ($allRepFile, $start, $end, $selRepFile) = @_; &nfsNoodge($allRepFile); open(ALL, "<$allRepFile") || &cleanDie("Can't open $allRepFile: $!\n"); open(SEL, "| sort -n > $selRepFile") || &cleanDie("Can't open pipe to sort -n > $selRepFile: $!\n"); print SEL "\%:repeats\n"; while () { if (/^\s*\d+/) { my @words = split; my ($repStart, $repEnd) = ($words[5], $words[6]); next if ((! defined $repStart) || (! defined $repEnd)); $repStart += 1; next if ($repStart > $repEnd); next if (($repEnd < $start) || ($repStart > $end)); $repStart = $start if ($repStart < $start); $repEnd = $end if ($repEnd > $end); $repStart -= ($start - 1); $repEnd -= ($start - 1); print SEL "$repStart $repEnd rpt\n"; } } close(SEL); close(ALL); } sub abridgeLinSpecReps { # Before running blastz, snip out lineage-specific repeats from each # sequence. Then run blastz only on that strand. After running blastz, # fix up the alignment coords to reflect the original sequence including # the lineage-specific repeats. # Currently this only works on per-chromosome sequence and repeat files. my ($tSpec, $qSpec, $blastzOptions, $raw) = @_; my ($tFname, $tSeq, $tStart, $tEnd) = &parseBlastzSpec($tSpec); my ($qFname, $qSeq, $qStart, $qEnd) = &parseBlastzSpec($qSpec); if ($tEnd == 0 || $qEnd == 0) { &cleanDie("When abridging repeats, sequence filenames must be followed " . "by range specs even if the whole sequence is aligned.\n"); } my $S1X = "$defVars{SEQ1_SMSK}/$tSeq.out.spec"; my $S2X = "$defVars{SEQ2_SMSK}/$qSeq.out.spec"; &cleanDie("$S1X not found") if (! -f $S1X); &cleanDie("$S2X not found") if (! -f $S2X); my $TS1X = "$TMP/s1.rpts"; my $TS2X = "$TMP/s2.rpts"; my $TS1 = "$TMP/s1.fa"; my $TS2 = "$TMP/s2.fa"; my $TS1S = "$TMP/s1.strip.fa"; my $TS2S = "$TMP/s2.strip.fa"; my $TS2R = "$TMP/s2.rev.fa"; my $TS2SR = "$TMP/s2.strip.rev.fa"; my $TZF = "$TMP/fwd.lav"; my $TZR = "$TMP/rev.lav"; my $TZS = "$TMP/std.lav"; # Get repeats in range: &selectRpts($S1X, $tStart, $tEnd, $TS1X); &selectRpts($S2X, $qStart, $qEnd, $TS2X); # Get sequence in range: &run("fasta-subseq $tFname $tStart $tEnd > $TS1"); &run("fasta-subseq $qFname $qStart $qEnd > $TS2"); # Snip out the repeats: &run("strip_rpts $TS1 $TS1X > $TS1S"); &run("strip_rpts $TS2 $TS2X > $TS2S"); # Run blastz on query forward strand and restore repeats: my $blastzOK = &plainBlastz($TS1S, $TS2S, "B=0" . $blastzOptions, $TZF); if ($blastzOK ) { &run("restore_rpts $TZF " . "$TS1 \"\\\"$tFname\\\" $tStart $tEnd\" " . "$TS2 \"\\\"$qFname\\\" $qStart $qEnd\" " . "$TS1X $TS2X > $raw"); } else { &run("/bin/cp -p $TZF $raw"); } # Run blastz on query reverse strand and restore repeats: &run("revcomp $TS2S > $TS2SR"); $blastzOK = &plainBlastz($TS1S, $TS2SR, "B=0" . $blastzOptions, $TZR); if ($blastzOK ) { &run("restore_rpts $TZR " . "$TS1 \"\\\"$tFname\\\" $tStart $tEnd\" " . "$TS2 \"\\\"$qFname-\\\" $qStart $qEnd\" " . "$TS1X $TS2X reverse >> $raw"); } else { &run("/bin/cat $TZF >> $raw"); } } sub liftLav { # Run blastz-normalizeLav to lift up chunk coords to sequence level. my ($raw, $out, $tSeq, $qSeq) = @_; my $tLen = $tSizes{$tSeq}; my $qLen = $qSeq ? $qSizes{$qSeq} : "0"; &run("blastz-normalizeLav $tLen $qLen < $raw > $out"); } sub convertOutput { # Convert lav file to psl or axt file, optionally dropping trivial self al's. my ($lav, $out) = @_; my $tSeq = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'}; my $qSeq = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'}; my $tSizes = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'}; my $qSizes = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'}; if ($opt_outFormat eq 'axt') { my $dropSelf = ($opt_dropSelf ? "-dropSelf " : ""); &run("lavToAxt $dropSelf $lav $tSeq $qSeq $out"); } elsif ($opt_dropSelf) { # lavToPsl doesn't have lavToAxt's -dropSelf functionality, so pipe # lavToAxt -dropSelf to axtToPsl. &run("lavToAxt -dropSelf $lav $tSeq $qSeq stdout " . "| axtToPsl stdin $tSizes $qSizes $out"); } else { &run("lavToPsl $lav $out"); } } ######################################################################### # # -- main -- &checkOptions(); &usage(1) if (scalar(@ARGV) != 4); my ($target, $query, $DEF, $out) = @ARGV; # It is OK to have a previous result existing if ( -f "$out") { exit 0; } &loadDef($DEF); my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'}; my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'}; my $seq1Len = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'}; my $seq2Len = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'}; &loadSeqSizes($seq1Len, \%tSizes); &loadSeqSizes($seq2Len, \%qSizes); # Make a temporary directory for intermediate files: if (exists($defVars{'TMPDIR'})) { $TMP = `mktemp -d $defVars{'TMPDIR'}/blastz.XXXXXX` || &cleanDie("Can't mktemp"); } else { $TMP = `mktemp -d /tmp/blastz.XXXXXX` || &cleanDie("Can't mktemp"); } chomp $TMP; print STDERR "temp files in $TMP -- will attempt cleanup before die-ing.\n"; # Some internal defaults for blastz params, carried over from # Scott's blastz-run: $defVars{'BLASTZ_H'} = 2000 if (! defined $defVars{'BLASTZ_H'}); my $blastzOptions = &getBlastzParams(%defVars); my @tFileSpecs = &enumerateFileSpecs($target, $seq1Dir); my @qFileSpecs = &enumerateFileSpecs($query, $seq2Dir); if ($defVars{'BLASTZ_ABRIDGE_REPEATS'} && scalar(@qFileSpecs) > 1) { &cleanDie("Sorry, DEF can't have BLASTZ_ABRIDGE_REPEATS set if a list of " . "query sequences is passed in -- not supported.\n"); } my $collapsed; ($collapsed, @qFileSpecs) = &collapseFileSpecs(@qFileSpecs); # Local file to hold concatenated results of all {target, query} runs: my $localOut = "$TMP/local.lav"; &run("/bin/cp /dev/null $localOut"); # Local temporary files that are overwritten by each {target, query} run: my $littleRaw = "$TMP/little.raw"; my $littleOut = "$TMP/little.lav"; my $littleConv = "$TMP/little.conv"; # Run blastz on each {target, query} pair; lift and collect results. foreach my $tSpec (@tFileSpecs) { my ($tSeq, $tLocal) = &unpackForBlastz($tSpec, $seq1Dir, 't'); foreach my $qSpec (@qFileSpecs) { my ($qSeq, $qLocal) = &unpackForBlastz($qSpec, $seq2Dir, 'q'); if ($defVars{'BLASTZ_ABRIDGE_REPEATS'}) { &abridgeLinSpecReps($tLocal, $qLocal, $blastzOptions, $littleRaw); } else { &plainBlastz($tLocal, $qLocal, $blastzOptions, $littleRaw); } if ($collapsed) { # Lift target side only: &liftLav($littleRaw, $littleOut, $tSeq, undef); } else { &liftLav($littleRaw, $littleOut, $tSeq, $qSeq); if ($qLocal =~ /^$TMP/) { $qLocal =~ s/\[.*\]$//; &run("/bin/rm $qLocal"); } } if ($opt_outFormat) { &convertOutput($littleOut, $littleConv); &run("/bin/cat $littleConv >> $localOut"); } else { &run("/bin/cat $littleOut >> $localOut"); } } if ($tLocal =~ /^$TMP/) { $tLocal =~ s/\[.*\]$//; &run("/bin/rm $tLocal"); } } if ($opt_gz) { &run("/bin/gzip $localOut"); $localOut .= ".gz"; } # Move collected results to final location. &run("/bin/mv $localOut $out"); # Clean up temporary directory: &run("/bin/rm -rf $TMP");