#!/usr/bin/perl -wT $ENV{'PATH'} = '/usr/local/bin:/bin'; delete @ENV{qw(IFS CDPATH ENV BASH_ENV)}; # Make %ENV safer ###################################################################### # written by: Belinda Giardine ###################################################################### use strict; #this converts from 1 set of coordinates to another using a PSL file my $psl = shift @ARGV; my $seq = shift @ARGV; my $input = shift @ARGV; my $pslLine; if (@ARGV) { $pslLine = shift @ARGV; } if (!$psl or !$input or !$seq) { print "USAGE: convert_coors file.psl seqName_qName_in_file 999[-99,+99]\n"; print "Where the 9's are replaced by the coding sequence position or genomic position\n"; exit; } if ($input =~ /![0-9+-]/) { print "ERROR needs a integer with possibly a + or - into the intron to convert\n"; } my $offset = ''; if ($input =~ /\d+([+*-]\d+)/) { $offset = $1; $input =~ s/[+*-]\d+//; if ($offset =~ /\*/) { $offset =~ s/\*/+/; } } if ($input < 0 && $offset ne '') { print "ERROR can't have offset outside of exon pos=$input offset=$offset\n"; } my $fh; my @f; #fields in psl file that correspond to origin seq if (!$pslLine) { open($fh, '<', $psl) or die "ERROR couldn't open gene decription file, $psl, $!\n"; while (<$fh>) { #format: bin,matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert #tBaseInsert,strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount #blockSizes qStarts tStarts chomp; if (!defined or $_ eq '') { next; } if (/\s*#/) { next; } #comment @f = split(/\s+/); if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; } elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads if ($f[9] eq $seq) { last; } } close $fh or die "ERROR reading psl file, $psl $!\n"; }else { @f = split(/\s+/, $pslLine); if (scalar @f < 21) { die "ERROR bad pslLine format $pslLine\n"; } elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads } if ($f[9] ne $seq) { print "ERROR couldn't find $seq in $psl have $f[9]\n"; exit; } my @gbStarts = split(/,/, $f[19]); #these are counting from end of - strands my @gbLens = split(/,/, $f[18]); my @chrStarts = split(/,/, $f[20]); my $totBlocks = $f[17]; my $gbSt = $f[11]; #start of alignment in query sequence my $chromPos; my $chr = $f[13]; if ($f[8] =~ /^\+/) { #plus strand my $b; if ($input < 0) { $chromPos = $chrStarts[0] + $input; }else { for($b = 0; $b < $totBlocks; $b++) { if ($input <= $gbStarts[$b] + $gbLens[$b]) { if ($input < ($gbStarts[$b] + 1)) { #1 based comparison print "ERROR: Couldn't map $seq $input, it is in a gap\n"; exit; } my $diff = $input - ($gbStarts[$b] + 1); #1 based difference $chromPos = $chrStarts[$b] + $diff; last; } } if ($b == $totBlocks && $input > $gbStarts[$b-1] + $gbLens[$b-1]) { my $pos = $gbStarts[$b-1] + $gbLens[$b-1]; print "ERROR $input not in gene, end of gene at $pos\n"; exit; } } }else { #minus strand #chrom always +?, query can be - if ($input < 0) { $chromPos = $chrStarts[$totBlocks-1] + $gbLens[$totBlocks-1] - 1 + abs($input); }else { my $pos = $gbSt; #check for gap at beginning my $b; for($b = $totBlocks - 1; $b >= 0; $b--) { $pos += $gbLens[$b]; #check for gap my $gap = 0; if ($b != $totBlocks -1 && $gbStarts[$b+1] != $gbStarts[$b] + $gbLens[$b]) { $gap = $gbStarts[$b+1] - ($gbStarts[$b] + $gbLens[$b]); $pos += $gap; } if ($input <= $pos) { if ($input <= $pos - $gbLens[$b]) { print "ERROR: Couldn't map $seq $input, it is in a gap\n"; exit; } my $diff = $pos - $input; $chromPos = $chrStarts[$b] + $diff; #zero based start as chrStart is zero based last; } } if ($b < 0 && $input > $pos) { print "ERROR $input not in gene, end of gene at $pos\n"; exit; } } } if (!defined $chromPos) { print "ERROR couldn't find chr position for $input $offset in $seq\n"; exit; } $chromPos++; #switch to 1 based number if ($offset) { if ($f[8] =~ /^\+/) { #plus strand $chromPos += $offset; }else { $chromPos -= $offset; } } print "$input $offset is mapped to $chr $chromPos $f[8]\n"; exit;