#!/usr/bin/perl -wT $ENV{'PATH'} = '/usr/local/bin:/bin'; delete @ENV{qw(IFS CDPATH ENV BASH_ENV)}; # Make %ENV safer ###################################################################### # written by: Belinda Giardine ###################################################################### use strict; #this converts from 1 set of coordinates to another using a dump from a psl table #it expects both coordinates to be nts my $psl = shift @ARGV; my $seq = shift @ARGV; my $input = shift @ARGV; my $pslLine = shift @ARGV if @ARGV; if (!$psl or !$input or !$seq) { print "USAGE: convert_prot_coors3 file.psl seqName_qName_in_file 999 ['pslLine']\n"; print "Where the 9's are replaced by the codon position\n", "pslLine is optional for programs to send tab separated line from ", "a psl file\n"; exit; } if ($input =~ /![0-9]/) { print "ERROR needs a integer\n"; } my $fh; my @f; #fields in psl file that correspond to origin seq if (!$pslLine) { open($fh, '<', $psl) or die "ERROR couldn't open psl file, $psl, $!\n"; while (<$fh>) { #format: bin,matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert #tBaseInsert,strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount #blockSizes qStarts tStarts #for blat runs qStarts are codons, for kgProtMap they are nts! chomp; if (!defined or $_ eq '') { next; } if (/\s*#/) { next; } #comment @f = split(/\s+/); if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; } elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads elsif (scalar @f > 22) { die "ERROR bad psl file format $_\n"; } if ($f[9] eq $seq) { last; } } close $fh or die "ERROR reading psl file, $psl $!\n"; }else { @f = split(/\s+/, $pslLine); } if ($f[9] ne $seq) { print "ERROR couldn't find $seq in $psl have $f[9]\n"; exit; } my @gbStarts = split(/,/, $f[19]); #these are counting from end of - strands my @gbLens = split(/,/, $f[18]); my @chrStarts = split(/,/, $f[20]); my $totBlocks = $f[17]; my $gbSt = $f[11]; #start of alignment in query sequence my $chromPos; my $chr = $f[13]; my $ntPos = ($input * 3) - 2; if ($f[8] =~ /^\+/) { #plus strand my $b; for($b = 0; $b < $totBlocks; $b++) { if ($ntPos <= $gbStarts[$b] + $gbLens[$b]) { if ($ntPos < ($gbStarts[$b] + 1)) { #1 based comparison print "ERROR: Couldn't map $seq $input, it is in a gap\n"; exit; } my $diff = $ntPos - ($gbStarts[$b] + 1); #nt based difference $chromPos = $chrStarts[$b] + $diff; if ($ntPos > ($gbStarts[$b] + $gbLens[$b] - 2)) { my $e = $chrStarts[$b] + $gbLens[$b]; my $len = $e - $chromPos; if ($len > 2) { print "ERROR illegal length $len\n"; exit; } my $c2 = $chrStarts[$b+1]; my $c3 = $c2 + (3 - $len); #UCSC numbering for second set $chromPos++; print "ERROR $input maps to last codon of exon or near gap ", "$chr\t$chromPos\t$e\t$c2\t$c3\n"; exit; } last; } } if ($b == $totBlocks && $ntPos > $gbStarts[$b-1] + $gbLens[$b-1]) { my $pos = $gbStarts[$b-1] + $gbLens[$b-1]; print "ERROR $input not in gene, end of gene at $pos\n"; exit; } }else { #minus strand #chrom always +?, query can be - my $pos = $gbSt; #check for gap at beginning my $b; $ntPos += 2; #find lowest nt position (chrom coors) in codon for($b = $totBlocks - 1; $b >= 0; $b--) { $pos += $gbLens[$b]; #check for gap my $gap = 0; if ($b != $totBlocks -1 && $gbStarts[$b+1] != $gbStarts[$b] + $gbLens[$b]) { $gap = $gbStarts[$b+1] - ($gbStarts[$b] + $gbLens[$b]); $pos += $gap; } if ($ntPos <= $pos) { if ($ntPos <= $pos - $gbLens[$b]) { print "ERROR: Couldn't map $seq $input, it is in a gap\n"; exit; } my $diff = $pos - $ntPos; #nt difference $chromPos = $chrStarts[$b] + $diff; if ($ntPos < $pos - ($gbLens[$b] - 1) + 2) { my $e = $chrStarts[$b] + $gbLens[$b]; my $len = $e - $chromPos; if ($len > 2) { print "ERROR $input maps illegally len=$len\n"; exit; } my $c2 = $chrStarts[$b+1]; my $c3 = $c2 + (3 - $len); $chromPos++; print "ERROR $input maps to first codon (split) of exon ", "$chr\t$chromPos\t$e\t$c2\t$c3\n"; exit; } #zero based start as chrStart is zero based last; } } if ($b < 0 && $ntPos > $pos) { print "ERROR $input not in gene, end of gene at $pos\n"; exit; } } if (!$chromPos) { print "ERROR couldn't find chr position for $input in $seq\n"; exit; } $chromPos++; #switch to 1 based number my $end = $chromPos + 2; print "$chr\t$chromPos\t$end\n"; exit;