#!/usr/bin/perl -wT $ENV{'PATH'} = '/usr/local/bin:/bin'; delete @ENV{qw(IFS CDPATH ENV BASH_ENV)}; # Make %ENV safer ###################################################################### # written by: Belinda Giardine ###################################################################### use strict; #this converts from 1 set of coordinates to another using a PSL file my $psl = shift @ARGV; my $seq = shift @ARGV; my $input = shift @ARGV; my $pslLine = shift @ARGV if @ARGV; if (!$psl or !$input or !$seq) { print "USAGE: convert_coors file.psl seqName_qName_in_file 999\n"; print "Where the 9's are replaced by the codon position\n"; exit; } if ($input =~ /![0-9]/) { print "ERROR needs a integer\n"; } my $fh; my @f; #fields in psl file that correspond to origin seq if (!$pslLine) { open($fh, '<', $psl) or die "ERROR couldn't open psl file, $psl, $!\n"; while (<$fh>) { #format: bin,matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert #tBaseInsert,strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount #blockSizes qStarts tStarts #for blat runs qStarts are codons, for kgProtMap they are nts! chomp; if (!defined or $_ eq '') { next; } if (/\s*#/) { next; } #comment @f = split(/\s+/); if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; } elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads elsif (scalar @f > 22) { die "ERROR bad psl file format $_\n"; } if ($f[9] eq $seq) { last; } } close $fh or die "ERROR reading psl file, $psl $!\n"; }else { @f = split(/\s+/, $pslLine); } if ($f[9] ne $seq) { print "ERROR couldn't find $seq in $psl have $f[9]\n"; exit; } my @gbStarts = split(/,/, $f[19]); #these are counting from end of - strands my @gbLens = split(/,/, $f[18]); my @chrStarts = split(/,/, $f[20]); my $totBlocks = $f[17]; my $gbSt = $f[11]; #start of alignment in query sequence my $chromPos; my $chr = $f[13]; if ($f[8] =~ /^\+/) { #plus strand my $b; for($b = 0; $b < $totBlocks; $b++) { #protein matches are not accurate for first and last codon if ($input == $gbStarts[$b] + 1 or $input == $gbStarts[$b] + $gbLens[$b]) { print "ERROR $input maps to first or last codon of exon\n"; exit; } if ($input <= $gbStarts[$b] + $gbLens[$b]) { if ($input < ($gbStarts[$b] + 1)) { #1 based comparison print "ERROR: Couldn't map $seq $input, it is in a gap\n"; exit; } my $diff = $input - ($gbStarts[$b] + 1); #codon based difference $diff = $diff * 3; $chromPos = $chrStarts[$b] + $diff; #chrStarts 0 or 1 based? last; } } if ($b == $totBlocks && $input > $gbStarts[$b-1] + $gbLens[$b-1]) { my $pos = $gbStarts[$b-1] + $gbLens[$b-1]; print "ERROR $input not in gene, end of gene at $pos\n"; exit; } }else { #minus strand #chrom always +?, query can be - my $pos = $gbSt; #check for gap at beginning my $b; for($b = $totBlocks - 1; $b >= 0; $b--) { $pos += $gbLens[$b]; #protein matches are not accurate for first and last codon if ($input == $pos - $gbLens[$b] -1 or $input == $pos) { print "ERROR $input maps to first or last codon of exon\n"; exit; } #check for gap my $gap = 0; if ($b != $totBlocks -1 && $gbStarts[$b+1] != $gbStarts[$b] + $gbLens[$b]) { $gap = $gbStarts[$b+1] - ($gbStarts[$b] + $gbLens[$b]); $pos += $gap; } if ($input <= $pos) { if ($input <= $pos - $gbLens[$b]) { print "ERROR: Couldn't map $seq $input, it is in a gap\n"; exit; } my $diff = $pos - $input; #codon difference $diff = 3 * $diff; $chromPos = $chrStarts[$b] + $diff; #zero based start as chrStart is zero based last; } } if ($b < 0 && $input > $pos) { print "ERROR $input not in gene, end of gene at $pos\n"; exit; } } if (!$chromPos) { print "ERROR couldn't find chr position for $input in $seq\n"; exit; } $chromPos++; #switch to 1 based number my $end = $chromPos + 2; print "$chr\t$chromPos\t$end\n"; exit;