# Make directory # :vim nowrap ssh hgwdev mkdir /cluster/data/hg18/bed/jkg cd /cluster/data/hg18/bed/jkg # Get refseq and genbank .ra and fa files somehow - either # from Mark, or by running something like: # /cluster/data/genbank/bin/x86_64/gbGetSeqs -get=ra -gbRoot=/cluster/data/genbank \ # -native -db=hg18 refseq mrna refSeq.ra # /cluster/data/genbank/bin/x86_64/gbGetSeqs -get=ra -gbRoot=/cluster/data/genbank \ # -native -db=hg18 genbank mrna mrna.ra # Then convert these into some tab separated files, which takes 4 seconds: txReadRa mrna.ra refSeq.ra . # Get sequence files. Takes 70 seconds /cluster/data/genbank/bin/x86_64/gbGetSeqs -gbRoot=/cluster/data/genbank -native -db=hg18 refseq pep refPep.fa /cluster/data/genbank/bin/x86_64/gbGetSeqs -gbRoot=/cluster/data/genbank -native -db=hg18 refseq mrna refSeq.fa /cluster/data/genbank/bin/x86_64/gbGetSeqs -gbRoot=/cluster/data/genbank -native -db=hg18 genbank mrna mrna.fa # Get some other info from the database. Best to get it about # the same time so it is synced with other data. Takes 4 seconds. echo 'select acc,status from mgcFullStatus' | hgsql -N hg18 > mgcStatus.tab echo 'select distinct name,sizePolyA from mrnaOrientInfo' | hgsql -N hg18 > sizePolyA.tab # Get refSeq, mrna and intronEst tables as psl. Ignore the # stdin is empty messages, these are just from _random chroms # with no annotations. Takes 7 seconds. #mkdir refSeq mrna est #foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) # hgGetAnn -noMatchOk hg18 refSeqAli $c refSeq/$c.psl # hgGetAnn -noMatchOk hg18 mrna $c mrna/$c.psl # hgGetAnn -noMatchOk hg18 intronEst $c est/$c.psl #end # Note this time, since there are blat changes not yet in the main pipeline # do the following, which takes 41 seconds mkdir est pslSplitOnTarget /cluster/store10/markd/genbank/ncalign/results/hg18/refseq.psl refSeq pslSplitOnTarget /cluster/store10/markd/genbank/ncalign/results/hg18/genbank.psl mrna foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) if (! -e refSeq/$c.psl) then echo creating empty refSeq/$c.psl echo -n "" >refSeq/$c.psl endif if (! -e mrna/$c.psl) then echo creating empty mrna/$c.psl echo -n "" >mrna/$c.psl endif hgGetAnn -noMatchOk hg18 intronEst $c est/$c.psl if (! -e est/$c.psl) then echo creating empty est/$c.psl echo -n "" >est/$c.psl endif end # Get list of accessions that are associated with antibodies from database. # This will be a good list but not 100% complete. Cluster these to get # four or five antibody heavy regions. Later we'll weed out input that # falls primarily in these regions, and, include the regions themselves # as special genes. Takes 40 seconds txAbFragFind hg18 antibodyAccs pslCat mrna/*.psl -nohead | weedLines -invert antibodyAccs stdin antibody.psl clusterPsl -prefix=antibody.abV antibody.psl stdout | awk '$10 > 20' | cut -f 1-12 > antibody.bed # Convert psls to bed, saving mapping info and weeding antibodies. Takes 2.5 min foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) txPslToBed refSeq/$c.psl -noFixStrand -cds=cds.tab /cluster/data/hg18/hg18.2bit refSeq/$c.bed -unusual=refSeq/$c.unusual txPslToBed mrna/$c.psl -cds=cds.tab /cluster/data/hg18/hg18.2bit stdout -unusual=mrna/$c.unusual \ | bedWeedOverlapping antibody.bed maxOverlap=0.5 stdin mrna/$c.bed txPslToBed est/$c.psl /cluster/data/hg18/hg18.2bit stdout \ | bedWeedOverlapping antibody.bed maxOverlap=0.3 stdin est/$c.bed end # Create mrna splicing graphs. Takes 10 seconds. mkdir bedToGraph foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) txBedToGraph -prefix=$c. refSeq/$c.bed refSeq mrna/$c.bed mrna bedToGraph/$c.txg end # Create est splicing graphs. Takes 6 minutes. foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) txBedToGraph -prefix=e$c. est/$c.bed est est/$c.txg end # Create an evidence weight file cat > trim.weights <> mouse.txg end # Clean up all but final mouse.txg rm -r est mrna refSeq # Unpack chains and nets, apply synteny filter and split by chromosome # Takes 5 minutes. Make up phony empty nets for ones that are empty after # synteny filter. zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.all.chain.gz | chainSplit chains stdin zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.net.gz | netFilter -syn stdin | netSplit stdin nets cd nets foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) if (! -e $c.net) then echo making phony $c.net echo -n > $c.net endif end cd ../.. # Make txOrtho directory and a para spec file mkdir txOrtho cd txOrtho mkdir edges cd ../bedToGraph echo "#\!/bin/tcsh -ef" > ../txOrtho/spec foreach f (*.txg) set c=$f:r echo txOrtho ../bedToGraph/$f ../mm8/chains/$c.chain ../mm8/nets/$c.net ../mm8/mouse.txg edges/$c.edges >> ../txOrtho/spec end cd .. # Do txOrtho parasol run on iServer (high RAM) cluster ssh kki "cd /cluster/data/hg18/bed/jkg/txOrtho; para make spec; para time" #Completed: 49 of 49 jobs #CPU time in finished jobs: 1916s 31.93m 0.53h 0.02d 0.000 y #IO & Wait Time: 540s 9.00m 0.15h 0.01d 0.000 y #Average job time: 50s 0.84m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 235s 3.92m 0.07h 0.00d #Submission to last job: 235s 3.92m 0.07h 0.00d # Filter out some duplicate edges. These are legitimate from txOrtho's point # of view, since they represent two different mouse edges both supporting # a human edge. However, from the human point of view we want only one # support from mouse orthology. Just takes a second. cd txOrtho mkdir uniqEdges foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) cut -f 1-9 edges/$c.edges | sort | uniq > uniqEdges/$c.edges end cd .. # Clean up chains and nets since they are big cd /cluster/data/hg18/bed/jkg rm -r mm8/chains mm8/nets # Get exonophy. Takes about 4 seconds. echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" \ | hgsql -N hg18 > exoniphy.bed bedToTxEdges exoniphy.bed exoniphy.edges # Add evidence from ests, orthologous mouse transcripts, and exoniphy # Takes 36 seconds. mkdir graphWithEvidence foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) echo adding evidence for $c txgAddEvidence -chrom=$c bedToGraph/$c.txg exoniphy.edges exoniphy stdout \ | txgAddEvidence stdin txOrtho/uniqEdges/$c.edges txOrtho stdout \ | txgAddEvidence stdin est/$c.edges est graphWithEvidence/$c.txg end # Do txWalk - takes 32 seconds (mostly loading the mrnaSize.tab again and # again...) mkdir txWalk foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) txWalk graphWithEvidence/$c.txg trim.weights 3 txWalk/$c.bed -evidence=txWalk/$c.ev -sizes=mrnaSize.tab -defrag=0.25 end # Make a file that lists the various categories of alt-splicing we see. # Do this by making and analysing splicing graphs of just the transcripts # that have passed our process so far. The txgAnalyze program occassionally # will make a duplicate, which is the reason for the sort/uniq run. # Takes 7 seconds. cat txWalk/*.bed | txBedToGraph stdin txWalk txWalk.txg txgAnalyze txWalk.txg /cluster/data/hg18/hg18.2bit stdout | sort | uniq > altSplice.bed # Get txWalk transcript sequences. This'll take about 2 minutes rm -f txWalk.fa foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) sequenceForBed -db=hg18 -bedIn=txWalk/$c.bed -fastaOut=stdout -upCase -keepName >> txWalk.fa end rm -rf txFaSplit mkdir txFaSplit faSplit sequence txWalk.fa 200 txFaSplit/ # Get parts of multiple alignments corresponding to transcripts. # Takes 43 minutes. echo hg18 panTro2 rheMac2 otoGar1 tupBel1 mm8 rn4 \ cavPor2 oryCun1 canFam2 felCat1 equCab1 bosTau3 \ dasNov1 loxAfr1 echTel1 monDom4 ornAna1 galGal3 \ anoCar1 xenTro2 fr2 tetNig1 gasAcu1 oryLat1 danRer4 > allOrgs.txt echo hg18 panTro2 rheMac2 mm8 canFam2 > ourOrgs.txt foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`) mafFrags hg18 multiz25way txWalk/$c.bed stdout -bed12 -orgs=allOrgs.txt \ | mafSpeciesSubset stdin ourOrgs.txt txWalk/$c.maf -keepFirst end # Fold in antibody stuff at a stage where it *won't* be taken up by alignments sequenceForBed -db=hg18 -bedIn=antibody.bed -fastaOut=stdout -upCase -keepName >> txWalk.fa mv txWalk.fa abWalk.fa # Set up cluster run Victor Solovyev's Best Orf program. rm -rf bestorf mkdir bestorf mkdir bestorf/fa bestorf/tab cd bestorf ls -1 ../txFaSplit/*.fa > in.lst cat << '_EOF_' > gsub #LOOP borfBig -exe=./borf $(path1) tab/$(root1).borf #ENDLOOP '_EOF_' gensub2 in.lst single gsub spec cat << '_EOF_' > borf #!/bin/tcsh -ef /cluster/bin/x86_64/bestorf /cluster/bin/x86_64/bestorf_hume.dat $1 '_EOF_' chmod a+x borf cd .. # Do borf cluster run ssh pk "cd /cluster/data/hg18/bed/jkg/bestorf; para make spec; para time" #Completed: 195 of 195 jobs #CPU time in finished jobs: 1354s 22.56m 0.38h 0.02d 0.000 y #IO & Wait Time: 1129s 18.82m 0.31h 0.01d 0.000 y #Average job time: 13s 0.21m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 22s 0.37m 0.01h 0.00d #Submission to last job: 22s 0.37m 0.01h 0.00d # Fetch human protein set and table that describes if curated or not. # Takes about a minute hgsql -N sp070202 -e \ 'select p.acc, p.val from protein p, accToTaxon x where x.taxon=9606 and p.acc=x.acc'\ |awk '{print ">" $1;print $2}' >uniProt.fa hgsql -N sp070202 -e 'select i.acc,i.isCurated from info i,accToTaxon x where x.taxon=9606 and i.acc=x.acc' > uniCurated.tab # Create blat dir - we'll run a couple of types of alignments here. rm -rf blat mkdir blat # Set up blat jobs for proteins vs. translated txWalk transcripts mkdir blat/protein mkdir blat/protein/raw cd txFaSplit echo #\!/bin/tcsh -ef > ../blat/protein/spec foreach f (*.fa) set c=$f:r echo blat -t=dnax -q=prot -minIdentity=90 ../../txFaSplit/$f ../../uniProt.fa raw/uni_$c.psl >> ../blat/protein/spec echo blat -t=dnax -q=prot -minIdentity=90 ../../txFaSplit/$f ../../refPep.fa raw/ref_$c.psl >> ../blat/protein/spec end cd .. # Run protein/transcript blat job on cluster ssh pk "cd /cluster/data/hg18/bed/jkg/blat/protein; para make spec; para time" #CPU time in finished jobs: 13571s 226.18m 3.77h 0.16d 0.000 y #IO & Wait Time: 5645s 94.08m 1.57h 0.07d 0.000 y #Average job time: 49s 0.82m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 137s 2.28m 0.04h 0.00d #Submission to last job: 137s 2.28m 0.04h 0.00d # Set up blat jobs for mrna vs. txWalk transcripts mkdir blat/rna mkdir blat/rna/raw cd txFaSplit echo #\!/bin/tcsh -ef > ../blat/rna/spec foreach f (*.fa) set c=$f:r echo blat -ooc=/cluster/data/hg18/11.ooc -minIdentity=95 ../../txFaSplit/$f ../../mrna.fa raw/mrna_$c.psl >> ../blat/rna/spec echo blat -ooc=/cluster/data/hg18/11.ooc -minIdentity=97 ../../txFaSplit/$f ../../refSeq.fa raw/ref_$c.psl >> ../blat/rna/spec end cd .. # Run rna/transcript blat on cluster. This is a little i/o heavy, so use # maxNode=50, or optimize i/o somehow. ssh pk "cd /cluster/data/hg18/bed/jkg/blat/rna; para make -maxNode=50 spec; para time" #Completed: 390 of 390 jobs #CPU time in finished jobs: 19127s 318.78m 5.31h 0.22d 0.001 y #IO & Wait Time: 14795s 246.58m 4.11h 0.17d 0.000 y #Average job time: 87s 1.45m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 581s 9.68m 0.16h 0.01d #Submission to last job: 843s 14.05m 0.23h 0.01d # Sort and select best alignments. Remove raw files for space. Takes 22 # seconds. Use pslReps not pslCdnaFilter because need -noIntrons flag, # and also working on protein as well as rna alignments. The thresholds # for the proteins in particular are quite loose, which is ok because # they will be weighted against each other. We lose some of the refSeq # mappings at tighter thresholds. cd /cluster/data/hg18/bed/jkg/blat pslCat -nohead rna/raw/ref*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/refSeq.psl /dev/null pslCat -nohead rna/raw/mrna*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/mrna.psl /dev/null pslCat -nohead protein/raw/ref*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -nearTop=0.02 -ignoreSize -minAli=0.85 stdin protein/refSeq.psl /dev/null pslCat -nohead protein/raw/uni*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -nearTop=0.02 -minAli=0.85 stdin protein/uniProt.psl /dev/null rm -r rna/raw protein/raw cd .. # Create evidence derived from mRNA CDS and mRNA/transcript alignments # Takes 28 seconds. rm -rf cdsEvidence mkdir cdsEvidence txCdsEvFromRna refSeq.fa cds.tab blat/rna/refSeq.psl abWalk.fa \ cdsEvidence/refSeqTx.tce -refStatus=refSeqStatus.tab \ -unmapped=cdsEvidence/refSeqTx.unmapped -exceptions=exceptions.tab txCdsEvFromRna mrna.fa cds.tab blat/rna/mrna.psl abWalk.fa \ cdsEvidence/mrnaTx.tce -mgcStatus=mgcStatus.tab \ -unmapped=cdsEvidence/mrna.unmapped txCdsEvFromProtein refPep.fa blat/protein/refSeq.psl abWalk.fa \ cdsEvidence/refSeqProt.tce -refStatus=refPepStatus.tab \ -unmapped=cdsEvidence/refSeqProt.unmapped \ -exceptions=exceptions.tab -refToPep=refToPep.tab \ -dodgeStop=3 -minCoverage=0.3 txCdsEvFromProtein uniProt.fa blat/protein/uniProt.psl abWalk.fa \ cdsEvidence/uniProt.tce -uniStatus=uniCurated.tab \ -unmapped=cdsEvidence/uniProt.unmapped -source=tremble cat bestorf/tab/*.borf > all.borf txCdsEvFromBorf all.borf abWalk.fa cdsEvidence/bestorf.tce # Consolidate CDS and look for longest ORFs in other species. Takes 40 seconds cat cdsEvidence/*.tce | sort > unweighted.tce cut -f 1-3 unweighted.tce | uniq > cdsIntervals.tab cat txWalk/*.maf | txCdsOrtho cdsIntervals.tab stdin cdsOrtho.tab # Pick best CDS. Take 4 seconds cp ~/kent/src/hg/txCds/txCdsPick/cds.weights . cat txWalk/*.bed antibody.bed | \ txCdsPick stdin unweighted.tce cds.weights pick.tce pick.picks \ -weightedTce=weighted.tce -refToPep=refToPep.tab \ -exceptionsIn=exceptions.tab \ -exceptionsOut=txWalk.exceptions # Create gene prediction (GTF) and peptide fasta file. # Takes 7 seconds. cat txWalk/*.bed antibody.bed | txCdsToGene stdin abWalk.fa pick.tce pick.gtf pick.fa \ -bedOut=pick.bed -exceptions=txWalk.exceptions # Create gene info table. Takes 8 seconds cat mrna/*.unusual refSeq/*.unusual | awk '$5=="flip" {print $6;}' > all.flip cat mrna/*.psl refSeq/*.psl | txInfoAssemble pick.bed pick.tce all.borf \ altSplice.bed txWalk.exceptions sizePolyA.tab stdin all.flip prelim.info # Cluster purely based on CDS (in same frame). Takes 1 second txCdsCluster pick.bed pick.cluster # Flag suspicious CDS regions, and add this to info file. Weed out bad CDS. # Map CDS to gene set. Takes 10 seconds txCdsSuspect pick.bed txWalk.txg pick.cluster prelim.info pick.suspect pick.info -niceProt=pick.nice txCdsWeed pick.tce pick.info cdsOrtho.tab weededCds.tce weededCds.info cat txWalk/*.bed antibody.bed | txCdsToGene stdin abWalk.fa weededCds.tce weededCds.gtf weededCds.faa \ -bedOut=weededCds.bed -exceptions=txWalk.exceptions # Separate out transcripts into coding and 4 noncoding categories. # Generate new gene set that weeds out the junkiest. Takes 9 seconds. txGeneSeparateNoncoding weededCds.bed weededCds.info \ coding.bed nearCoding.bed nearCodingJunk.bed antisense.bed noncoding.bed separated.info awk '$2 != "nearCodingJunk"' separated.info > weeded.info awk '$2 == "nearCodingJunk" {print $1}' separated.info > weeds.lst cat coding.bed nearCoding.bed antisense.bed noncoding.bed > weeded.bed txGeneFromBed weeded.bed pick.picks weeded.gp # Generate data for SVM. cat txWalk/*.bed | txCdsSvmInput stdin unweighted.tce cdsOrtho.tab separated.info svm.list svm.vector # Assign permanent accessions to each transcript, and make up a number # of our files with this accession in place of the temporary IDs we've been # using. Takes 4 seconds txGeneAccession ../jkg.7/ucscGenes.bed ~kent/src/hg/txGene/txGeneAccession/txLastId \ weeded.bed txToAcc.tab oldToNew.tab subColumn 4 weeded.bed txToAcc.tab ucscGenes.bed subColumn 1 weeded.info txToAcc.tab ucscGenes.info weedLines weeds.lst pick.picks stdout | subColumn 1 stdin txToAcc.tab ucscGenes.picks weedLines weeds.lst pick.nice stdout | subColumn 2 stdin txToAcc.tab ucscGenes.nice subColumn 4 coding.bed txToAcc.tab ucscCoding.bed subColumn 4 nearCoding.bed txToAcc.tab ucscNearCoding.bed subColumn 4 antisense.bed txToAcc.tab ucscAntisense.bed subColumn 4 noncoding.bed txToAcc.tab ucscNoncoding.bed cat txWalk/*.ev | weedLines weeds.lst stdin stdout | subColumn 1 stdin txToAcc.tab ucscGenes.ev # Cluster the coding and the noncoding sets, and make up canonical and # isoforms tables. Takes 3 seconds. txCdsCluster ucscCoding.bed coding.cluster txBedToGraph ucscNoncoding.bed noncoding noncoding.txg -prefix=non txBedToGraph ucscAntisense.bed antisense antisense.txg -prefix=anti cat noncoding.txg antisense.txg > senseAnti.txg txGeneCanonical coding.cluster ucscGenes.info senseAnti.txg ucscGenes.bed ucscNearCoding.bed \ canonical.tab isoforms.tab txCluster.tab ##################################################################################### # Start loading up the database! Here we load into hg18a, which is a slimmed # down copy of hg18. Takes 2 seconds hgLoadSqlTab hg18a knownCanonical ~/kent/src/hg/lib/knownCanonical.sql canonical.tab hgLoadSqlTab hg18a knownIsoforms ~/kent/src/hg/lib/knownIsoforms.sql isoforms.tab # Make files with protein and mrna accessions. These will be taken from # RefSeq for the RefSeq ones, and derived from our transcripts for the rest. # Load these sequences into database. Takes 17 seconds. txGeneProtAndRna weeded.bed separated.info abWalk.fa weeded.faa refSeq.fa \ refToPep.tab refPep.fa txToAcc.tab ucscGenes.fa ucscGenes.faa hgPepPred hg18a generic knownGenePep ucscGenes.faa hgPepPred hg18a generic knownGeneMrna ucscGenes.fa # Make up knownGenes table, adding uniProt ID. Load into database. Takes 3 # seconds. txGeneFromBed ucscGenes.bed ucscGenes.picks ucscGenes.gp hgLoadSqlTab hg18a knownGene ~/kent/src/hg/lib/knownGene.sql ucscGenes.gp # Make up kgXref table. Takes about 3 minutes. txGeneXref hg18 sp070202 ucscGenes.info ucscGenes.picks ucscGenes.ev ucscGenes.xref hgLoadSqlTab hg18a kgXref ~/kent/src/hg/lib/kgXref.sql ucscGenes.xref # Make up and load kgColor table. Takes about a minute. txGeneColor sp070202 ucscGenes.info ucscGenes.picks ucscGenes.color hgLoadSqlTab hg18a kgColor ~/kent/src/hg/lib/kgColor.sql ucscGenes.color # Load up kgTxInfo table. Takes 0.3 second hgLoadSqlTab hg18a kgTxInfo ~/kent/src/hg/lib/txInfo.sql ucscGenes.info # Make up alias tables and load them. Takes a minute or so. txGeneAlias hg18a sp070202 ucscGenes.xref ucscGenes.ev foo.alias foo.protAlias sort foo.alias | uniq > ucscGenes.alias sort foo.protAlias | uniq > ucscGenes.protAlias rm foo.alias foo.protAlias hgLoadSqlTab hg18a kgAlias ~/kent/src/hg/lib/kgAlias.sql ucscGenes.alias hgLoadSqlTab hg18a kgProtAlias ~/kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias # Make full text index. Takes a minute or so. After this the genome browser # tracks display will work including the position search. The genes details # page, gene sorter, and proteome browser still need more tables. mkdir index cd index hgKgGetText hg18a knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/data/hg18/bed/jkg/index/knownGene.ix /gbdb/hg18a/knownGene.ix ln -s /cluster/data/hg18/bed/jkg/index/knownGene.ixx /gbdb/hg18a/knownGene.ixx # Create a bunch of knownToXxx tables. Takes about 3 minutes: hgMapToGene hg18a allenBrainAli -type=psl knownGene knownToAllenBrain hgMapToGene hg18a ensGene knownGene knownToEnsembl hgMapToGene hg18a gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' hgMapToGene hg18a affyGnf1h knownGene knownToGnf1h hgMapToGene hg18a HInvGeneMrna knownGene knownToHInv hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg18a > refToLl.txt hgMapToGene hg18a refGene knownGene knownToLocusLink -lookup=refToLl.txt hgMapViaSwissProt hg18a knownGene name proteinID Pfam knownToPfam hgMapToGene hg18a refGene knownGene knownToRefSeq hgMapToGene "-type=bed 12" hg18a affyUclaNorm knownGene knownToU133 hgMapToGene hg18a affyU133Plus2 knownGene knownToU133Plus2 hgMapToGene hg18a affyU95 knownGene knownToU95 knownToVisiGene hg18a -fromProbePsl=vgAllProbes # Create expression distance table - takes about an hour hgExpDistance hg18a hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 # Create another expression distance table, this one is quicker - 6 min.. hgExpDistance hg18a hgFixed.gnfHumanU95MedianRatio \ hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 # This one takes a whle too, 1 hour 22 minutes. cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight . time hgExpDistance hg18a affyUclaNorm affyUclaExp knownExpDistance \ -weights=affyUcla.weight -lookup=knownToU133 # Run nice Perl script to make all protein blast runs for # Gene Sorter and Known Genes details page. Takes about # 45 minutes to run. mkdir hgNearBlastp4 cd hgNearBlastp4 cat << _EOF_ > config.ra # Latest human vs. other Gene Sorter orgs: # mouse, rat, zebrafish, worm, yeast, fly targetGenesetPrefix known targetDb jk18 queryDbs mm8 rn4 danRer4 ce3 sacCer1 dm2 mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa jk18Fa /cluster/data/hg18/bed/jkg/ucscGenes.faa rn4Fa /cluster/data/rn4/bed/blastp/known.faa danRer4Fa /cluster/data/danRer4/bed/blastp/ensembl.faa ce3Fa /cluster/data/ce3/bed/blastp/wormPep140.faa sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa buildDir /cluster/data/hg18/bed/jkg/hgNearBlastp4 scratchDir /san/sanvol1/scratch/jkgHgNearBlastp4 _EOF_ doHgNearBlastp.pl config.ra |& tee do.log # Remove non-syntenic hits for mouse and rat # Takes a few minutes mkdir /gbdb/hg18a/liftOver ln -s /cluster/data/hg18/bed/liftOver/hg18ToRn4.over.chain.gz \ /gbdb/hg18a/liftOver/hg18aToRn4.over.chain.gz ln -s /cluster/data/hg18/bed/liftOver/hg18ToMm8.over.chain.gz \ /gbdb/hg18a/liftOver/hg18aToMm8.over.chain.gz synBlastp.csh hg18a rn4 synBlastp.csh hg18a mm8 # MAKE FOLDUTR TABLES # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/jkg mkdir -p rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa hg18a knownGene utr3 utr3/utr.fa utrFa hg18a knownGene utr5 utr5/utr.fa # Split up files and make files that define job. faSplit sequence utr3/utr.fa 10000 utr3/split/s faSplit sequence utr5/utr.fa 10000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub <> train.vector svm_learn train.vector train.model ####################### Issues for take 10 chr10:70,562,806-70,562,873 - Here a refSeq peptide gets mapped, somehow forces a geomic frame shift up throughthe remapping. This gets lost in the final gene set though.... This is back more or less. The shifted codon in the mRNA gets cut out. This is reasonable now. chr14:104,277,141-104,277,163 - Another case where boundaries wiggle in ucscGenes vs cds Mappings.... In this case no "genomic frame shift" Still does involve using a refPepValidated that's in two frames though. chr21.490.2.AK128598 chr21:44,050,073-44,056,876 - A case with CDS in long exon bleed. Should be flagged for NMD but isn't. NM_014638 Another case where boundaries shift in ucscGenes vs. cds mappings. fixed. ###################### Notes for take 11: NM_004390, which overlaps another refseq, seems to have fallen out somewhere between the graph and walking stage. X17115 - use as IgM heavy chain? Please take out /cluster/data/genbank/data/exceptions/invitrogenFullLength.acc Increase txCdsThreshold to 871.42.