#!/usr/bin/perl -w $ENV{'PATH'} = '/usr/bin:/cluster/home/giardine/bin/x86_64/'; use strict; #this reads the inputs from files, generates the HGVS name for the variant #and computes coordinates using blat and perl script convert_prot_coors2 #requires uniProt database tables: protein, comment, commentType #requires protDb table: spXref2 #set variables build, protDb, disFile, spFile, kgProtMap below #Notes on how name is derived from uniProt features #for len=1 #HGVS name: spId:p.[orig->3letter][start][variant->3letter] #for len>1 -> indel #HGVS name: spId:p.[first aa in orig->3letter][start]_ #[last aa in orig->3letter][start+len-1]delins[variant->3letter] #several variable changes to switch usage (kgProtMap,psl,converter) my $build = 'hg18'; my $protDb = 'proteins060115'; my $kgProtMap = 'kgProtMapUniq.txt'; #must set, blat too slow my $customTrack = 1; #flag whether want custom track with names my $psl = "kgProtMapUniq.$build.txt"; #'tmp.psl'; my $psl2; #my $psl2 = 'protBlatUniq.txt'; #the one below needs the feature ID as part of spVars my $fixFeatures; # = 'featureType2.txt'; #features with fix for run togethers #had to adust IDs, runs didn't match exactly my %feature; #skip for now too many errors, better way of chosing alignment? my $spSeq = 'tmp.fa'; #build controlled by path of nibs #not currently used my $nibList = 'nibsFullPath.list'; #version 2 for protBlat, version 3 for kgProtMap my $converter = './convert_prot_coors3'; #lengths in nts my $converter2 = './convert_prot_coors2'; #lengths in codons my %varIdToSpId; my %aa1To3; my $aaFile = 'aminoInfoDump'; #format:3letterAbrev 1letterAbrev LongName my $spFile = 'spVars061002.txt'; #'arVars.txt'; 'spVars.txt'; #disease associations my $disFile = 'SwissProtDiseaseAssoc.txt'; my %disAssoc; my %gene; my %omim; my %omimId; my %varIdToMutName; my %codon; my %uniProt; #for uniProt disease or polymorphism associations my $pslLine; my $coordinateAccuracy = 1; my $strand; #filled in when get PSL my $outGV = "gvPosSP.$build.txt"; my $outGV2 = 'gvLinkSP.txt'; my $outGV3 = 'gvAttrSP.txt'; my $outGV4 = 'gvSP.txt'; my $outGV5 = 'sp.ct'; my $outGV6 = 'gvAttrLongSP.txt'; #load hashes used in conversion loadCodons(); my $fh; open($fh, '<', $aaFile) or die "Couldn't open $aaFile, $!\n"; while (<$fh>) { chomp; my @v = split(/\s+/); $aa1To3{$v[1]} = $v[0]; } close $fh or die "Couldn't close $aaFile, $!\n"; #load disease associations loadDisease(); loadUniprotDisease(); loadOmimTitles(); if ($fixFeatures) { loadFeatureFix(); } #generate coordinates and print here #read the spFile and generate the mutation name open($fh, '<', $spFile) or die "Couldn't open $spFile, $!\n"; my $outFh; open($outFh, '>', $outGV) or die "Couldn't open $outGV, $!\n"; my $outFh4; open($outFh4, '>', $outGV4) or die "Couldn't open $outGV4, $!\n"; my $outFh2; my $prevSpId = 0; open($outFh2, '>', $outGV2) or die "Couldn't open $outGV2, $!\n"; my $outFh3; open($outFh3, '>', $outGV3) or die "Couldn't open $outGV3, $!\n"; my $outFh5; open($outFh5, '>', $outGV5) or die "Couldn't open $outGV5, $!\n"; my $outFh6; open($outFh6, '>', $outGV6) or die "Couldn't open $outGV6, $!\n"; my $cnt = 0; while (<$fh>) { chomp; $cnt++; my @v = split(/\t/); #format:spId startCodon len variant varId [featureId] if ($fixFeatures && scalar @v != 6) { die "Incompatible run, need IDs to run with feature fix\n"; } if ($fixFeatures && exists $feature{$v[5]}) { $v[3] = $feature{$v[5]}; } my $status; my $phenotype; if (exists $disAssoc{$v[4]}) { if ($disAssoc{$v[4]} ne 'Polymorphism') { $status = 'known to be disease-associated'; $phenotype = $disAssoc{$v[4]}; }else { $status = 'not disease-associated'; } } if (!$status) { $status = 'likely to be disease-associated'; if ($v[3] =~ /polymorphism/ && $v[3] =~ /cancer|disease/i) { undef $status; } elsif ($v[3] =~ /polymorphism/) { undef $status; } elsif ($v[3] =~ /unknown\s*pathological\s*significance/) { undef $status; } elsif ($v[3] =~ /pathological\s*significance\s*unknown/) { undef $status; } elsif ($v[3] =~ /effect not known/) { undef $status; } } my $n; my $oldAA; if ($v[2] == 1) { if (!$v[4]) { die "undefined varId\n"; } $n = "$v[0]:p."; if ($v[3] =~ /(\w) -> (\w)/) { my $aaf = $1; $oldAA = $aaf; my $aat = $2; if ($aaf eq $aat) { undef $status; } $n .= $aa1To3{$aaf}; $n .= ($v[1] + 1); #+1 for UCSC numbers $n .= $aa1To3{$aat}; }elsif ($v[3] =~ /Missing/) { $n .= ($v[1] + 1) . "del"; }else { print "Couldn't parse '$v[3]'\n"; next; } }elsif ($v[2] > 1) { $n = "$v[0]:p."; if ($v[3] =~ /(\w+) -> (\w+)/) { my $aaf = $1; $oldAA = $aaf; my $aat = $2; $aaf =~ /^(\w)/; my $firstAA = $1; $n .= $aa1To3{$firstAA}; $n .= ($v[1] + 1); $n .= '_'; $aaf =~ /(\w)$/; my $lastAA = $1; $n .= $aa1To3{$lastAA}; $n .= ($v[1] + $v[2]); $n .= 'delins'; my @a = split(/ */, $aat); foreach (@a) { if (!/[A-Z]/) { last; } #bad aa if (!$aa1To3{$_}) { #die "Died unknown aa is $_\n"; } $n =~ s/delins.*/delins/; #Bad amino acid don't list print "Bad amino acid for $v[0] $v[3] using name=$n\n"; last; } $n .= $aa1To3{$_}; } }elsif ($v[3] =~ /Missing/) { $n .= ($v[1] + 1) . "_" . ($v[1] + $v[2]) . "del"; }else { print "Couldn't parse '$v[3]'\n"; next; } }else { die "ERROR invalid len $v[2]\n"; } #make sure name isn't too long #long names are placed in separate attribute table with larger field if (length $n > 255) { my $full = $n; $n =~ s/delins.*/delins/; print $outFh6 "$v[4]\tlongName\t$full\n"; } $varIdToMutName{$v[4]} = $n; if (!$kgProtMap && $prevSpId ne $v[0]) { print "creating psl file for $v[0]\n"; my $succ = generatePsl($v[0]); $prevSpId = $v[0]; #reset so matches psl file if (!$succ) { next; } #can't do this one }elsif ($kgProtMap && $prevSpId ne $v[0]) { $prevSpId = $v[0]; #reset so matches psl file #print "TESTING starting new protein $v[0]\n"; $pslLine = getPslLine($v[0]); if (!$pslLine) { print "no alignment for $n\n"; next; } }elsif ($kgProtMap && !$pslLine) { #print "no alignment still for $n\n"; next; } my @chr = getCoordinates($v[1] + 1, $v[0], $pslLine); my @chr2; if (@chr && $coordinateAccuracy && $chr[0] =~ /maps to first|maps to last/) { $chr[0] =~ /(chr[XYM0-9]+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/; $chr[0] = $1; $chr[1] = $2; $chr[2] = $3; $chr2[1] = $4; $chr2[2] = $5; } if (!@chr or !$chr[0] or $chr[0] =~ /ERROR/) { print "no coordinates for $n\n"; if (@chr && $chr[0]) { print $chr[0], "\n"; } next; } my $type = 'substitution'; if ($n =~ /(\d+)delins/) { my $e = $1; my @t = getCoordinates($e, $v[0], $pslLine); if (@chr2 or !@t or $t[0] =~ /ERROR/) { print "no end coordinates for $n\n"; if (@t) { print $t[0], "\n"; } next; } #new start or end depending on strand if ($t[1] < $chr[1]) { $chr[1] = $t[1]; }else { $chr[2] = $t[2]; } $type = 'complex'; }elsif ($n =~ /del/) { if ($n =~ /_(\d+)del/) { my $e = $1; my @t = getCoordinates($e, $v[0], $pslLine); if (@chr2 or !@t or $t[0] =~ /ERROR/) { print "no end coordinates for $n\n"; if (@t) { print $t[0], "\n"; } next; } if ($t[1] < $chr[1]) { $chr[1] = $t[1]; }else { $chr[2] = $t[2]; } } $type = 'deletion'; } $chr[1]--; #UCSC coordinates if (!$chr[0]) { die "ERROR Bad coordinates for $v[4] $n $v[3]\n"; } if (!@chr or !$chr[0] or $chr[0] =~ /ERROR/) { print "no coordinates for $n\n"; if (@chr && $chr[0]) { print $chr[0], "\n"; } next; } if ($oldAA && !@chr2 && !checkSequence($oldAA, $strand, $chr[0], $chr[1], $chr[2])) { #print "sequence mismatch for $n\n"; next; }elsif ($oldAA && !checkSequence($oldAA, $strand, $chr[0], $chr[1], $chr[2], $chr2[1], $chr2[2])) { #print "sequence mismatch for $n\n"; next; } my $short = $n; if (length $n > 64) { $short =~ s/delins.*/delins/; if (length $short > 64) { die "Couldn't shorten name $n.\n"; } } print $outFh "$chr[0]\t$chr[1]\t$chr[2]\t$v[4]\t$strand\t$short\n"; if (@chr2) { print $outFh "$chr[0]\t$chr2[1]\t$chr2[2]\t$v[4]\t$strand\t$short\n"; if ($customTrack) { print $outFh5 "$chr[0]\t$chr2[1]\t$chr2[2]\t$short\n"; } } print $outFh4 "$v[4]\t$n\tSP\t$type\texon\t$coordinateAccuracy\n"; if ($status) { print $outFh3 "$v[4]\tdisease\t$status\n"; } if ($customTrack) { print $outFh5 "$chr[0]\t$chr[1]\t$chr[2]\t$short\n"; } #gvAttr:mutId attrKey attrValue [outFh3] #gvLink: id attrType raKey acc displayValue [outFh2] #print SwissProt links if ($type eq 'complex') { #SwissProt2 search 4 print $outFh2 "$v[4]\tlinks\tUniProtSearch\t$v[4]\t\n"; }else { #SP variant 3 print $outFh2 "$v[4]\tlinks\tUniProtVar\t$v[4]\t\n"; } if ($v[3] =~ /in dbSNP:(\d+)/) { my $a = $1; print $outFh2 "$v[4]\tlinks\tdbSNP\t$a\t\n"; }elsif ($v[3] !~ /in allele/ && $v[3] =~ /\((.*)\)/) { print $outFh3 "$v[4]\tcomment\t$1\n"; } if ($phenotype) { print $outFh3 "$v[4]\tphenoCommon\t$phenotype\n"; } if (exists $uniProt{$v[0]}) { #uniProt disease/polymorphism links foreach my $k (@{$uniProt{$v[0]}}) { print $outFh2 "$v[4]\tgenesDisVars\t$k\t$v[0]\t\n"; } } if (exists $omimId{$v[0]}) { #omim title links foreach my $o (@{$omimId{$v[0]}}) { print $outFh2 "$v[4]\tgenesDisVars\tomimTitle2\t$o\t\n"; } } } close $fh or die "Couldn't close $spFile, $!\n"; close $outFh or die "couldn't close outFh, $!\n"; close $outFh3 or die "couldn't close outFh3, $!\n"; close $outFh2 or die "couldn't close outFh2, $!\n"; close $outFh4 or die "couldn't close outFh4, $!\n"; close $outFh5 or die "couldn't close outFh5, $!\n"; close $outFh6 or die "couldn't close outFh6, $!\n"; #print "TESTING read $cnt variants\n"; exit 0; ############################################################################ #get coordinates based on mRNA and protein position sub getCoordinates { my $pos = shift; my $acc = shift; my $pslLine = shift; my @chr; my $fh; my $command = "$converter $psl $acc $pos '$pslLine' 2>&1 |"; if (!$coordinateAccuracy) { $command = "$converter2 $psl $acc $pos '$pslLine' 2>&1 |"; } open ($fh, $command) or die "ERROR Couldn't run convert_prot_coors[2|3], $!\n"; while (<$fh>) { chomp; if (/ERROR/) { $chr[0] = $_; last; } @chr = split(/\t/); } close $fh; return @chr; } ####End of subroutine getCoordinates #this runs blat on the SP transcript to get the psl sub generatePsl { my $spId = shift @_; if (system("hgsql -N uniProt > $spSeq <$spId'; select val from protein where acc = '$spId';\nend") == -1) { die "Couldn't fetch sequence for $spId\n"; #or return 0; } my $pfh; #this takes about 10 mins to run open($pfh, "blat $nibList $spSeq -noHead -q=prot -t=dnax stdout 2>&1 |") or die "ERROR couldn't run blat on protein sequence for $spId, $!\n"; #catch output so can just print first/best match my $line; while(<$pfh>) { if (!$line && /^\d+/) { $line = $_; } } close $pfh or die "Couldn't finish blat run for $spId, $!\n"; open($pfh, ">", $psl) or die "Couldn't open psl file for writing\n, $!\n"; if ($line) { print $pfh $line; } close $pfh or die "Couldn't close psl file, $!\n"; if ($line) { return 1; } else { return 0; } } ####End of subroutine generatePsl #this gets the fields from the psl file sub getPslLine { my $seq = shift @_; my $pfh; open($pfh, '<', $psl) or die "ERROR couldn't open psl file, $psl, $!\n"; my @f; my $rv; while (<$pfh>) { chomp; if (!defined or $_ eq '') { next; } if (/\s*#/) { next; } #comment @f = split(/\s+/); if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; } elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads elsif (scalar @f > 22) { die "ERROR bad psl file format $_\n"; } if ($f[9] eq $seq) { $rv = join("\t", @f); last; } } $strand = $f[8]; close $pfh or die "ERROR reading psl file, $psl $!\n"; if ($rv) { $coordinateAccuracy = 1; }elsif ($psl2) { $coordinateAccuracy = 0; my $matches = 0; open($pfh, '<', $psl2) or die "ERROR couldn't open psl file, $psl2, $!\n"; while (<$pfh>) { chomp; if (!defined or $_ eq '') { next; } if (/\s*#/) { next; } #comment @f = split(/\s+/); if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; } elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads elsif (scalar @f > 22) { die "ERROR bad psl file format $_\n"; } #keep alignment with highest matches if ($f[9] eq $seq && $f[0] > $matches) { $rv = join("\t", @f); $matches = $f[0]; } } close $pfh or die "ERROR reading psl file, $psl2 $!\n"; } return $rv; } ####End sub loadCodons { open (COD, "codon_amino_info") or die "Couldn't open codon_amino_info, $!\n"; while () { chomp; my @t = split(/\t/); $t[1] =~ s/\s+//g; $t[0] =~ s/\s+//g; $codon{$t[0]} = $t[1]; } close (COD); } ####End sub loadDisease { open (DIS, $disFile) or die "Couldn't open $disFile, $!\n"; my $c = 0; while () { chomp; my @t = split(/\s+/, $_, 10); #gene gene? SPGeneId VarID position oldAA -> newAA diseaseAssoc disease if ($t[8] eq 'Polymorphism') { $disAssoc{$t[3]} = $t[8]; }elsif ($t[8] eq 'Disease') { $disAssoc{$t[3]} = $t[9]; if ($disAssoc{$t[3]} =~ /\[MIM:(\d+)\]/) { $omim{$t[3]} = $1; } } #$gene{$t[3]} = "$t[0]($t[2])"; $c++; } close (DIS) or die "Couldn't finish loading $disFile, $!\n"; print STDERR "Found $c variants with disease-association data\n"; } ####End sub checkSequence { my $aa = shift; my $strand = shift; my $chr = shift; my $st = shift; my $end = shift; my $st2 = shift; my $end2 = shift; my @a = split(/ */, $aa); my $checked = 0; if ($strand eq '-') { $strand = 'm'; } #switch to nibFrag format open (NIB, "nibFrag -upper /cluster/data/$build/nib/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; my $seq = ''; while () { chomp; if (/^>/) { next; } #header $seq .= $_; } close NIB or die "Couldn't finish nibFrag, $!\n"; if ($st2) { open (NIB, "nibFrag -upper /cluster/data/$build/nib/${chr}.nib $st2 $end2 $strand stdout |") or die "Couldn't run nibFrag, $!\n"; while () { chomp; if (/^>/) { next; } #header $seq .= $_; } close NIB or die "Couldn't finish nibFrag, $!\n"; } if ($seq eq '') { return 0; } my @s = split(/ */, $seq); foreach $aa (@a) { my $codon = join("", splice(@s, 0, 3)); if (!$codon or !exists $codon{$codon} or $codon{$codon} ne $aa1To3{$aa}) { print "ERROR mismatch $codon != $aa\n"; return 0; } $checked = 1; } return $checked; } ####End sub loadFeatureFix { open(FIX, $fixFeatures) or die "Couldn't open $fixFeatures, $!\n"; while () { chomp; my @t = split(/\t/); $feature{$t[0]} = $t[1]; } close (FIX) or die "Couldn't finish $fixFeatures, $!\n"; } ####End sub loadOmimTitles { my $sql = 'select accession, extAC from spXref2 where division = "9606" and extDB = "MIM"'; open(FH, "-|", "hgsql -NB $protDb -e '$sql'") or die "Couldn't fetch omim IDs from db, $!\n"; while () { chomp; my @t = split(/\s+/); push(@{$omimId{$t[0]}}, $t[1]); } close(FH) or die "Couldn't finish omim IDs, $!\n"; } ####End sub loadUniprotDisease { #get polymorphism and disease IDs my $sql = "select id, val from commentType where val in ('DISEASE', 'POLYMORPHISM')"; my %type; my $dis; my $poly; open(FH, "-|", "hgsql -NB uniProt -e \"$sql\"") or die "Couldn't fetch uniProt type IDs from db, $!\n"; while () { chomp; my @t = split(/\s+/); if ($t[1] eq 'DISEASE') { $dis = $t[0]; $type{$t[0]} = 'uniProtDis'; }else { $poly = $t[0]; $type{$t[0]} = 'uniProtPoly'; } } close(FH) or die "Couldn't finish uniProt type IDs, $!\n"; #now get which IDs go with with which spIDs $sql = "select acc, commentType from comment where commentType in ($dis, $poly)"; open(FH, "-|", "hgsql -NB uniProt -e \"$sql\"") or die "Couldn't fetch uniProt comment types from db, $!\n"; while () { chomp; my @t = split(/\s+/); push(@{$uniProt{$t[0]}}, $type{$t[1]}); } close(FH) or die "Couldn't finish uniProt comment types, $!\n"; } ####End