# :vim nowrap # for emacs: -*- mode: sh; -*- # This describes how to make the UCSC genes on hg18, though # hopefully by editing the variables that follow immediately # this will work on other databases too. # Directories set dir = /cluster/data/hg18/bed/ucsc.10 set gbDir = $dir/gb set scratchDir = /san/sanvol1/scratch # Databases set db = hg18 set tempDb = tmpFoo2 set xdb = mm8 set Xdb = Mm8 set ydb = canFam2 set zdb = rheMac2 set spDb = sp070202 set pbDb = proteins070202 set ratDb = rn4 set RatDb = Rn4 set fishDb = danRer4 set flyDb = dm2 set wormDb = ce2 set yeastDb = sacCer1 # Database to rebuild visiGene text from set vgTextDbs = (mm7 mm8 hg17 hg18) # Proteins in various species set tempFa = $dir/ucscGenes.faa set xdbFa = /cluster/data/mm8/bed/geneSorter/blastp/known.faa set ratFa = /cluster/data/rn4/bed/blastp/known.faa set fishFa = /cluster/data/danRer4/bed/blastp/ensembl.faa set flyFa = /cluster/data/dm2/bed/flybase4.2/flybasePep.fa set wormFa = /cluster/data/ce3/bed/blastp/wormPep140.faa set yeastFa = /cluster/data/sacCer1/bed/blastp/sgdPep.faa # Tracks set multiz = multiz17way # NCBI Taxon set taxon = 9606 # Previous gene set set oldGeneBed = /dev/null # Machines set dbHost = hgwdev set ramFarm = kki set cpuFarm = pk # Create initial dir mkdir -p $dir cd $dir # Get Genbank info mkdir -p $gbDir cd $gbDir txGenbankData $db echo 'select acc,status from mgcFullStatus' | hgsql -N $db > mgcStatus.tab # Unpack and process RA Files cd $dir zcat $gbDir/genbank.ra.gz >mrna.ra zcat $gbDir/refseq.ra.gz > refSeq.ra txReadRa mrna.ra refSeq.ra . # Unpack sequence and psl files zcat $gbDir/refseq.pep.fa.gz > refPep.fa zcat $gbDir/genbank.fa.gz > mrna.fa zcat $gbDir/refseq.fa.gz > refSeq.fa zcat $gbDir/genbank.psl.gz > mrna.psl zcat $gbDir/refseq.psl.gz > refSeq.psl # Get some other info from the database. Best to get it about # the same time so it is synced with other data. Takes 4 seconds. echo 'select distinct name,sizePolyA from mrnaOrientInfo' | hgsql -N $db | \ subColumn -miss=sizePolyA.miss 1 stdin accVer.tab sizePolyA.tab # Create directories full of alignments split by chromosome. mkdir -p est refSeq mrna pslSplitOnTarget refSeq.psl refSeq pslSplitOnTarget mrna.psl mrna foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) if (! -e refSeq/$c.psl) then echo creating empty refSeq/$c.psl echo -n "" >refSeq/$c.psl endif if (! -e mrna/$c.psl) then echo creating empty mrna/$c.psl echo -n "" >mrna/$c.psl endif hgGetAnn -noMatchOk $db intronEst $c est/$c.psl if (! -e est/$c.psl) then echo creating empty est/$c.psl echo -n "" >est/$c.psl endif end # Get list of accessions that are associated with antibodies from database. # This will be a good list but not 100% complete. Cluster these to get # four or five antibody heavy regions. Later we'll weed out input that # falls primarily in these regions, and, include the regions themselves # as special genes. Takes 40 seconds txAbFragFind $db antibodyAccs pslCat mrna/*.psl -nohead | fgrep -w -f antibodyAccs > antibody.psl clusterPsl -prefix=ab.ab.antibody. antibody.psl antibody.cluster if ($db =~ mm*) then echo chr12 > abChrom.lst echo chr16 >> abChrom.lst echo chr6 >> abChrom.lst fgrep -w -f abChrom.lst antibody.cluster | cut -f 1-12 | bedOrBlocks stdin antibody.bed else awk '$13 > 100' antibody.cluster | cut -f 1-12 > antibody.bed endif # Convert psls to bed, saving mapping info and weeding antibodies. Takes 2.5 min foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) txPslToBed refSeq/$c.psl -noFixStrand -cds=cds.tab /cluster/data/$db/$db.2bit refSeq/$c.bed -unusual=refSeq/$c.unusual txPslToBed mrna/$c.psl -cds=cds.tab /cluster/data/$db/$db.2bit stdout -unusual=mrna/$c.unusual \ | fgrep -v -w -f /cluster/data/genbank/data/exceptions/invitrogenFullLength.acc \ | bedWeedOverlapping antibody.bed maxOverlap=0.5 stdin mrna/$c.bed txPslToBed est/$c.psl /cluster/data/$db/$db.2bit stdout \ | bedWeedOverlapping antibody.bed maxOverlap=0.3 stdin est/$c.bed end # Create mrna splicing graphs. Takes 10 seconds. mkdir -p bedToGraph foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) txBedToGraph -prefix=$c. refSeq/$c.bed refSeq mrna/$c.bed mrna bedToGraph/$c.txg end # Create est splicing graphs. Takes 6 minutes. foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) txBedToGraph -prefix=e$c. est/$c.bed est est/$c.txg end # Create an evidence weight file cat > trim.weights <> other.txg end # Clean up all but final other.txg rm -r est mrna refSeq # Unpack chains and nets, apply synteny filter and split by chromosome # Takes 5 minutes. Make up phony empty nets for ones that are empty after # synteny filter. cd $dir/$xdb zcat /cluster/data/$db/bed/blastz.$xdb/axtChain/$db.$xdb.all.chain.gz | chainSplit chains stdin zcat /cluster/data/$db/bed/blastz.$xdb/axtChain/$db.$xdb.net.gz | netFilter -syn stdin | netSplit stdin nets cd nets foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) if (! -e $c.net) then echo -n > $c.net endif end # Make txOrtho directory and a para spec file cd $dir mkdir -p txOrtho/edges cd txOrtho cd $dir/bedToGraph echo "#\!/bin/tcsh -ef" > ../txOrtho/spec foreach f (*.txg) set c=$f:r echo txOrtho ../bedToGraph/$f ../$xdb/chains/$c.chain ../$xdb/nets/$c.net ../$xdb/other.txg edges/$c.edges >> ../txOrtho/spec end cd .. # Do txOrtho parasol run on iServer (high RAM) cluster ssh $ramFarm "cd $dir/txOrtho; para make spec" #Completed: 49 of 49 jobs #CPU time in finished jobs: 1916s 31.93m 0.53h 0.02d 0.000 y #IO & Wait Time: 540s 9.00m 0.15h 0.01d 0.000 y #Average job time: 50s 0.84m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 235s 3.92m 0.07h 0.00d #Submission to last job: 235s 3.92m 0.07h 0.00d # Filter out some duplicate edges. These are legitimate from txOrtho's point # of view, since they represent two different mouse edges both supporting # a human edge. However, from the human point of view we want only one # support from mouse orthology. Just takes a second. cd txOrtho mkdir -p uniqEdges foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) cut -f 1-9 edges/$c.edges | sort | uniq > uniqEdges/$c.edges end cd .. # Clean up chains and nets since they are big cd $dir rm -r $xdb/chains $xdb/nets # Get exonophy. Takes about 4 seconds. echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" \ | hgsql -N $db > exoniphy.bed bedToTxEdges exoniphy.bed exoniphy.edges # Add evidence from ests, orthologous other species transcripts, and exoniphy # Takes 36 seconds. mkdir -p graphWithEvidence foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) echo adding evidence for $c txgAddEvidence -chrom=$c bedToGraph/$c.txg exoniphy.edges exoniphy stdout \ | txgAddEvidence stdin txOrtho/uniqEdges/$c.edges txOrtho stdout \ | txgAddEvidence stdin est/$c.edges est graphWithEvidence/$c.txg end # Do txWalk - takes 32 seconds (mostly loading the mrnaSize.tab again and # again...) mkdir -p txWalk if ($db =~ mm*) then set sef = 0.6 else set sef = 0.75 endif foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) txWalk graphWithEvidence/$c.txg trim.weights 3 txWalk/$c.bed \ -evidence=txWalk/$c.ev -sizes=mrnaSize.tab -defrag=0.25 -singleExonFactor=$sef end # Make a file that lists the various categories of alt-splicing we see. # Do this by making and analysing splicing graphs of just the transcripts # that have passed our process so far. The txgAnalyze program occassionally # will make a duplicate, which is the reason for the sort/uniq run. # Takes 7 seconds. cat txWalk/*.bed > txWalk.bed txBedToGraph txWalk.bed txWalk txWalk.txg txgAnalyze txWalk.txg /cluster/data/$db/$db.2bit stdout | sort | uniq > altSplice.bed # Get txWalk transcript sequences. This'll take about 2 minutes rm -f txWalk.fa foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) sequenceForBed -db=$db -bedIn=txWalk/$c.bed -fastaOut=stdout -upCase -keepName >> txWalk.fa end rm -rf txFaSplit mkdir -p txFaSplit faSplit sequence txWalk.fa 200 txFaSplit/ # Fetch human protein set and table that describes if curated or not. # Takes about a minute hgsql -N $spDb -e \ "select p.acc, p.val from protein p, accToTaxon x where x.taxon=$taxon and p.acc=x.acc" \ | awk '{print ">" $1;print $2}' >uniProt.fa hgsql -N $spDb -e "select i.acc,i.isCurated from info i,accToTaxon x where x.taxon=$taxon and i.acc=x.acc" > uniCurated.tab # Set up blat jobs for mrna vs. txWalk transcripts mkdir -p blat/rna/raw cd txFaSplit echo #\!/bin/tcsh -ef > ../blat/rna/spec foreach f (*.fa) set c=$f:r echo blat -ooc=/cluster/data/$db/11.ooc -minIdentity=95 ../../txFaSplit/$f ../../mrna.fa raw/mrna_$c.psl >> ../blat/rna/spec echo blat -ooc=/cluster/data/$db/11.ooc -minIdentity=97 ../../txFaSplit/$f ../../refSeq.fa raw/ref_$c.psl >> ../blat/rna/spec end cd .. # Run rna/transcript blat on cluster. This is a little i/o heavy, so use # maxNode=50, or optimize i/o somehow. ssh $cpuFarm "cd $dir/blat/rna; para make -maxNode=50 spec" #Completed: 390 of 390 jobs #CPU time in finished jobs: 19127s 318.78m 5.31h 0.22d 0.001 y #IO & Wait Time: 14795s 246.58m 4.11h 0.17d 0.000 y #Average job time: 87s 1.45m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 581s 9.68m 0.16h 0.01d #Submission to last job: 843s 14.05m 0.23h 0.01d # Set up blat jobs for proteins vs. translated txWalk transcripts mkdir -p blat/protein/raw cd txFaSplit echo #\!/bin/tcsh -ef > ../blat/protein/spec foreach f (*.fa) set c=$f:r echo blat -t=dnax -q=prot -minIdentity=90 ../../txFaSplit/$f ../../uniProt.fa raw/uni_$c.psl >> ../blat/protein/spec echo blat -t=dnax -q=prot -minIdentity=90 ../../txFaSplit/$f ../../refPep.fa raw/ref_$c.psl >> ../blat/protein/spec end cd .. # Run protein/transcript blat job on cluster ssh $cpuFarm "cd $dir/blat/protein; para make spec;" #CPU time in finished jobs: 13571s 226.18m 3.77h 0.16d 0.000 y #IO & Wait Time: 5645s 94.08m 1.57h 0.07d 0.000 y #Average job time: 49s 0.82m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 137s 2.28m 0.04h 0.00d #Submission to last job: 137s 2.28m 0.04h 0.00d # Sort and select best alignments. Remove raw files for space. Takes 22 # seconds. Use pslReps not pslCdnaFilter because need -noIntrons flag, # and also working on protein as well as rna alignments. The thresholds # for the proteins in particular are quite loose, which is ok because # they will be weighted against each other. We lose some of the refSeq # mappings at tighter thresholds. cd $dir/blat pslCat -nohead rna/raw/ref*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/refSeq.psl /dev/null pslCat -nohead rna/raw/mrna*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/mrna.psl /dev/null pslCat -nohead protein/raw/ref*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -nearTop=0.02 -ignoreSize -minAli=0.85 stdin protein/refSeq.psl /dev/null pslCat -nohead protein/raw/uni*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -nearTop=0.02 -minAli=0.85 stdin protein/uniProt.psl /dev/null rm -r protein/raw cd .. # Get parts of multiple alignments corresponding to transcripts. # Takes 1 hour. echo $db $xdb $ydb $zdb > ourOrgs.txt foreach c (`echo 'select chrom from chromInfo' | hgsql -N $db`) mafFrags $db $multiz txWalk/$c.bed stdout -bed12 -meFirst \ | mafSpeciesSubset stdin ourOrgs.txt txWalk/$c.maf -keepFirst end # Create and populate directory with various CDS evidence mkdir -p cdsEvidence cat txWalk/*.maf | txCdsPredict txWalk.fa -nmd=txWalk.bed -maf=stdin cdsEvidence/txCdsPredict.tce txCdsEvFromRna refSeq.fa cds.tab blat/rna/refSeq.psl txWalk.fa \ cdsEvidence/refSeqTx.tce -refStatus=refSeqStatus.tab \ -unmapped=cdsEvidence/refSeqTx.unmapped -exceptions=exceptions.tab txCdsEvFromRna mrna.fa cds.tab blat/rna/mrna.psl txWalk.fa \ cdsEvidence/mrnaTx.tce -mgcStatus=mgcStatus.tab \ -unmapped=cdsEvidence/mrna.unmapped txCdsEvFromProtein refPep.fa blat/protein/refSeq.psl txWalk.fa \ cdsEvidence/refSeqProt.tce -refStatus=refPepStatus.tab \ -unmapped=cdsEvidence/refSeqProt.unmapped \ -exceptions=exceptions.tab -refToPep=refToPep.tab \ -dodgeStop=3 -minCoverage=0.3 txCdsEvFromProtein uniProt.fa blat/protein/uniProt.psl txWalk.fa \ cdsEvidence/uniProt.tce -uniStatus=uniCurated.tab \ -unmapped=cdsEvidence/uniProt.unmapped -source=trembl cat cdsEvidence/*.tce | sort > unweighted.tce # Merge back in antibodies cat txWalk.bed antibody.bed > abWalk.bed sequenceForBed -db=$db -bedIn=antibody.bed -fastaOut=stdout -upCase -keepName > antibody.fa cat txWalk.fa antibody.fa > abWalk.fa # Pick ORFs, make genes txCdsPick abWalk.bed unweighted.tce refToPep.tab pick.tce pick.picks \ -exceptionsIn=exceptions.tab \ -exceptionsOut=abWalk.exceptions txCdsToGene abWalk.bed abWalk.fa pick.tce pick.gtf pick.fa \ -bedOut=pick.bed -exceptions=abWalk.exceptions # Create gene info table. Takes 8 seconds cat mrna/*.unusual refSeq/*.unusual | awk '$5=="flip" {print $6;}' > all.flip cat mrna/*.psl refSeq/*.psl | txInfoAssemble pick.bed pick.tce cdsEvidence/txCdsPredict.tce \ altSplice.bed abWalk.exceptions sizePolyA.tab stdin all.flip prelim.info # Cluster purely based on CDS (in same frame). Takes 1 second txCdsCluster pick.bed pick.cluster # Flag suspicious CDS regions, and add this to info file. Weed out bad CDS. # Map CDS to gene set. Takes 10 seconds. Might want to reconsider using # txCdsWeed here. txCdsSuspect pick.bed txWalk.txg pick.cluster prelim.info pick.suspect pick.info -niceProt=pick.nice txCdsWeed pick.tce pick.info weededCds.tce weededCds.info txCdsToGene abWalk.bed abWalk.fa weededCds.tce weededCds.gtf weededCds.faa \ -bedOut=weededCds.bed -exceptions=abWalk.exceptions \ -tweaked=weededCds.tweaked # Separate out transcripts into coding and 4 uncoding categories. # Generate new gene set that weeds out the junkiest. Takes 9 seconds. txGeneSeparateNoncoding weededCds.bed weededCds.info \ coding.bed nearCoding.bed nearCodingJunk.bed antisense.bed uncoding.bed separated.info awk '$2 != "nearCodingJunk"' separated.info > weeded.info awk '$2 == "nearCodingJunk" {print $1}' separated.info > weeds.lst cat coding.bed nearCoding.bed antisense.bed uncoding.bed | sort -k1,1 -k2,3n >weeded.bed # Make up a little alignment file for the ones that got tweaked. sed -r 's/.*NM_//' weededCds.tweaked | awk '{printf("NM_%s\n", $1);}' > tweakedNm.lst fgrep -f tweakedNm.lst refToPep.tab | cut -f 2 > tweakedNp.lst fgrep -f weededCds.tweaked weededCds.bed > tweakedNm.bed sequenceForBed -db=$db -bedIn=tweakedNm.bed -fastaOut=tweakedNm.fa -upCase -keepName faSomeRecords refPep.fa tweakedNp.lst tweakedNp.fa blat -q=prot -t=dnax -noHead tweakedNm.fa tweakedNp.fa stdout | sort -k 10 > tweaked.psl # Make an alignment file for refSeqs that swaps in the tweaked ones weedLines weededCds.tweaked blat/protein/refSeq.psl refTweaked.psl cat tweaked.psl >> refTweaked.psl # Make precursor to kgProtMap table. This is a psl file that maps just the CDS of the gene # to the genome. txGeneCdsMap weeded.bed weeded.info pick.picks refTweaked.psl \ refToPep.tab /cluster/data/$db/chrom.sizes cdsToRna.psl \ rnaToGenome.psl pslMap cdsToRna.psl rnaToGenome.psl cdsToGenome.psl # Assign permanent accessions to each transcript, and make up a number # of our files with this accession in place of the temporary IDs we've been # using. Takes 4 seconds txGeneAccession $oldGeneBed ~kent/src/hg/txGene/txGeneAccession/txLastId \ weeded.bed txToAcc.tab oldToNew.tab subColumn 4 weeded.bed txToAcc.tab ucscGenes.bed subColumn 1 weeded.info txToAcc.tab ucscGenes.info weedLines weeds.lst pick.picks stdout | subColumn 1 stdin txToAcc.tab ucscGenes.picks weedLines weeds.lst pick.nice stdout | subColumn 2 stdin txToAcc.tab ucscGenes.nice subColumn 4 coding.bed txToAcc.tab ucscCoding.bed subColumn 4 nearCoding.bed txToAcc.tab ucscNearCoding.bed subColumn 4 antisense.bed txToAcc.tab ucscAntisense.bed subColumn 4 uncoding.bed txToAcc.tab ucscUncoding.bed subColumn 10 cdsToGenome.psl txToAcc.tab ucscProtMap.psl cat txWalk/*.ev | weedLines weeds.lst stdin stdout | subColumn 1 stdin txToAcc.tab ucscGenes.ev # Make files with protein and mrna accessions. These will be taken from # RefSeq for the RefSeq ones, and derived from our transcripts for the rest. # Load these sequences into database. Takes 17 seconds. txGeneProtAndRna weeded.bed weeded.info abWalk.fa weededCds.faa refSeq.fa \ refToPep.tab refPep.fa txToAcc.tab ucscGenes.fa ucscGenes.faa # Cluster the coding and the uncoding sets, and make up canonical and # isoforms tables. Takes 3 seconds. txCdsCluster ucscCoding.bed coding.cluster txBedToGraph ucscUncoding.bed uncoding uncoding.txg -prefix=non txBedToGraph ucscAntisense.bed antisense antisense.txg -prefix=anti cat uncoding.txg antisense.txg > senseAnti.txg txGeneCanonical coding.cluster ucscGenes.info senseAnti.txg ucscGenes.bed ucscNearCoding.bed \ canonical.tab isoforms.tab txCluster.tab # Make up final splicing graph just containing our genes, and final alt splice # table. txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg txgAnalyze ucscGenes.txg /cluster/data/$db/$db.2bit stdout | sort | uniq > ucscSplice.bed ##################################################################################### # Now the gene set is built. Time to start loading it into the database, # and generating all the many tables that go on top of known Genes. # We do this initially in a temporary database. # Create temporary database with a few small tables from main database hgsqladmin create $tempDb hgsqldump $db chromInfo | hgsql $tempDb hgsqldump $db trackDb_$user | hgsql $tempDb # Load in isoforms, canonical, and gene sequence tables hgLoadSqlTab $tempDb knownIsoforms ~/kent/src/hg/lib/knownIsoforms.sql isoforms.tab hgLoadSqlTab $tempDb knownCanonical ~/kent/src/hg/lib/knownCanonical.sql canonical.tab hgPepPred $tempDb generic knownGenePep ucscGenes.faa hgPepPred $tempDb generic knownGeneMrna ucscGenes.fa # Make up knownGenes table, adding uniProt ID. Load into database. Takes 3 # seconds. txGeneFromBed ucscGenes.bed ucscGenes.picks ucscGenes.faa uniProt.fa refPep.fa ucscGenes.gp hgLoadSqlTab $tempDb knownGene ~/kent/src/hg/lib/knownGene.sql ucscGenes.gp hgLoadBed $tempDb knownAlt ucscSplice.bed # Make up kgXref table. Takes about 3 minutes. txGeneXref $db $spDb ucscGenes.gp ucscGenes.info ucscGenes.picks ucscGenes.ev ucscGenes.xref hgLoadSqlTab $tempDb kgXref ~/kent/src/hg/lib/kgXref.sql ucscGenes.xref # Make up and load kgColor table. Takes about a minute. txGeneColor $spDb ucscGenes.info ucscGenes.picks ucscGenes.color hgLoadSqlTab $tempDb kgColor ~/kent/src/hg/lib/kgColor.sql ucscGenes.color # Load up kgTxInfo table. Takes 0.3 second hgLoadSqlTab $tempDb kgTxInfo ~/kent/src/hg/lib/txInfo.sql ucscGenes.info # Make up alias tables and load them. Takes a minute or so. txGeneAlias $db $spDb ucscGenes.xref ucscGenes.ev foo.alias foo.protAlias sort foo.alias | uniq > ucscGenes.alias sort foo.protAlias | uniq > ucscGenes.protAlias rm foo.alias foo.protAlias hgLoadSqlTab $tempDb kgAlias ~/kent/src/hg/lib/kgAlias.sql ucscGenes.alias hgLoadSqlTab $tempDb kgProtAlias ~/kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias # Load up kgProtMap2 table that says where exons are in terms of CDS hgLoadPsl $tempDb ucscProtMap.psl -table=kgProtMap2 # Make full text index. Takes a minute or so. After this the genome browser # tracks display will work including the position search. The genes details # page, gene sorter, and proteome browser still need more tables. mkdir index mkdir /gbdb/$tempDb cd index hgKgGetText $tempDb knownGene.text -summaryTable=$db.refSeqSummary ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s $dir/index/knownGene.ix /gbdb/$tempDb/knownGene.ix ln -s $dir/index/knownGene.ixx /gbdb/$tempDb/knownGene.ixx # Create a bunch of knownToXxx tables. Takes about 3 minutes: hgMapToGene $db -tempDb=$tempDb allenBrainAli -type=psl knownGene knownToAllenBrain hgMapToGene $db -tempDb=$tempDb ensGene knownGene knownToEnsembl hgMapToGene $db -tempDb=$tempDb gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" $db > refToLl.txt hgMapToGene $db -tempDb=$tempDb refGene knownGene knownToLocusLink -lookup=refToLl.txt hgMapToGene $db -tempDb=$tempDb refGene knownGene knownToRefSeq if ($db =~ hg*) then hgMapToGene $db -tempDb=$tempDb affyGnf1h knownGene knownToGnf1h hgMapToGene $db -tempDb=$tempDb HInvGeneMrna knownGene knownToHInv hgMapToGene $db -tempDb=$tempDb affyU133Plus2 knownGene knownToU133Plus2 hgMapToGene $db -tempDb=$tempDb affyU95 knownGene knownToU95 knownToHprd $tempDb /cluster/data/$db/p2p/hprd/FLAT_FILES/HPRD_ID_MAPPINGS.txt time hgExpDistance $tempDb hgFixed.gnfHumanU95MedianRatio \ hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 time hgExpDistance $tempDb hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 endif if ($db =~ mm*) then hgMapToGene $db -tempDb=$tempDb affyGnf1m knownGene knownToGnf1m hgMapToGene $db -tempDb=$tempDb gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' hgMapToGene $db -tempDb=$tempDb affyU74 knownGene knownToU74 hgMapToGene $db -tempDb=$tempDb affyMOE430 knownGene knownToMOE430 hgMapToGene $db -tempDb=$tempDb affyMOE430 -prefix=A: knownGene knownToMOE430A hgExpDistance $tempDb $db.affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 hgExpDistance $tempDb $db.affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 hgExpDistance $tempDb $db.affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 hgExpDistance $tempDb hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m endif # next command needs $db.vgProbes and $db.vgAllProbes to be copied to $tempDb knownToVisiGene $tempDb vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text $vgTextDbs # Create Human P2P protein-interaction Gene Sorter columns if ($db =~ hg*) then hgLoadNetDist /cluster/data/$db/p2p/hprd/hprd.pathLengths $db humanHprdP2P \ -sqlRemap="select distinct value, name from knownToHprd" hgLoadNetDist /cluster/data/$db/p2p/vidal/humanVidal.pathLengths $db humanVidalP2P \ -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA" hgLoadNetDist /cluster/data/$db/p2p/wanker/humanWanker.pathLengths $db humanWankerP2P \ -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA" endif # Run nice Perl script to make all protein blast runs for # Gene Sorter and Known Genes details page. Takes about # 45 minutes to run. mkdir hgNearBlastp cd hgNearBlastp cat << _EOF_ > config.ra # Latest human vs. other Gene Sorter orgs: # mouse, rat, zebrafish, worm, yeast, fly targetGenesetPrefix known targetDb $tempDb queryDbs $xdb $ratDb $fishDb $flyDb $wormDb $yeastDb tmpFoo2Fa $tempFa ${xdb}Fa $xdbFa ${ratDb}Fa $ratFa ${fishDb}Fa $fishFa ${flyDb}Fa $flyFa ${wormDb}Fa $wormFa ${yeastDb}Fa $yeastFa buildDir $dir/hgNearBlastp scratchDir $scratchDir/jkgHgNearBlastp _EOF_ doHgNearBlastp.pl config.ra |& tee do.log # Remove non-syntenic hits for mouse and rat # Takes a few minutes mkdir /gbdb/$tempDb/liftOver ln -s /cluster/data/$db/bed/liftOver/${db}To$RatDb.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz ln -s /cluster/data/$db/bed/liftOver/${db}To${Xdb}.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz synBlastp.csh $tempDb $ratDb synBlastp.csh $tempDb $xdb # Make reciprocal best subset for the blastp pairs that are too # Far for synteny to help # Us vs. fish set aToB = run.$tempDb.$fishDb set bToA = run.$fishDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb drBlastTab $aToB/recipBest.tab hgLoadBlastTab $fishDb tfBlastTab $bToA/recipBest.tab # Us vs. fly set aToB = run.$tempDb.$flyDb set bToA = run.$flyDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb dmBlastTab $aToB/recipBest.tab hgLoadBlastTab $flyDb tfBlastTab $bToA/recipBest.tab # Us vs. worm set aToB = run.$tempDb.$wormDb set bToA = run.$wormDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb ceBlastTab $aToB/recipBest.tab hgLoadBlastTab $wormDb tfBlastTab $bToA/recipBest.tab # Us vs. yeast set aToB = run.$tempDb.$yeastDb set bToA = run.$yeastDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb scBlastTab $aToB/recipBest.tab hgLoadBlastTab $yeastDb tfBlastTab $bToA/recipBest.tab # Clean up cat run.$tempDb.$tempDb/out/*.tab | gzip -c > run.$tempDb.$tempDb/all.tab.gz rm -r run.*/out gzip run.*/all.tab # MAKE FOLDUTR TABLES # First set up directory structure and extract UTR sequence on hgwdev cd $dir mkdir -p rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa $tempDb knownGene utr3 utr3/utr.fa utrFa $tempDb knownGene utr5 utr5/utr.fa # Split up files and make files that define job. faSplit sequence utr3/utr.fa 10000 utr3/split/s faSplit sequence utr5/utr.fa 10000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < in.lst cat << _EOF_ > doPfam #!/bin/tcsh -ef /san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/pfam/Pfam_fs \$1 > \$2 _EOF_ chmod a+x doPfam cat << _EOF_ > gsub #LOOP doPfam ../splitProt/\$(path1) out/\$(root1).pf #ENDLOOP _EOF_ gensub2 in.lst single gsub spec ssh pk "cd /san/sanvol1/scratch/$db/ucscGenes/pfam; para make" # Make up pfamDesc.tab by converting pfam to a ra file first cat << _EOF_ > makePfamRa.awk /^NAME/ {print} /^ACC/ {print} /^DESC/ {print; printf("\n");} _EOF_ awk -f makePfamRa.awk /cluster/store12/pfam/Pfam_fs > pfamDesc.ra raToTab -cols=ACC,NAME,DESC pfamDesc.ra stdout | \ awk -F '\t' '{printf("%s\t%s\t%s\n", gensub(/\.[0-9]+/, "", "g", $1), $2, $3);}' > pfamDesc.tab # Convert output to tab-separated file. cd $dir catDir /san/sanvol1/scratch/$db/ucscGenes/pfam/out | hmmPfamToTab -eValCol stdin ucscPfam.tab # Convert output to knownToPfam table awk '{printf("%s\t%s\n", $2, gensub(/\.[0-9]+/, "", "g", $1));}' pfamDesc.tab > sub.foo cut -f 1,4 ucscPfam.tab | subColumn 2 stdin sub.foo knownToPfam.tab hgLoadSqlTab $tempDb knownToPfam ~/kent/src/hg/lib/knownTo.sql knownToPfam.tab # Do scop run. Takes about 6 hours # First get pfam global HMMs into /san/sanvol1/scop somehow. cd /san/sanvol1/scratch/$db/ucscGenes mkdir scop cd scop mkdir out ls -1 ../splitProt > in.lst cat << _EOF_ > doScop #!/bin/tcsh -ef /san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/scop/scop.hmm \$1 > \$2 _EOF_ chmod a+x doScop cat << _EOF_ > gsub #LOOP doScop ../splitProt/\$(path1) out/\$(root1).pf #ENDLOOP _EOF_ gensub2 in.lst single gsub spec ssh pk "cd /san/sanvol1/scratch/$db/ucscGenes/scop; para make" # Convert scop output to tab-separated files catDir /san/sanvol1/scratch/$db/ucscGenes/scop/out | \ hmmPfamToTab -eValCol -scoreCol stdin scopPlusScore.tab scopCollapse scopPlusScore.tab /cluster/store12/scop/model.tab ucscScop.tab \ scopDesc.tab knownToSuper.tab hgLoadSqlTab $tempDb knownToSuper ~/kent/src/hg/lib/knownToSuper.sql knownToSuper.tab # Regenerate ccdsKgMap table #/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=$db -loadDb ccdsGene knownGene ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=$tempDb -loadDb $db.ccdsGene knownGene ccdsKgMap # Map old to new mapping hgsql $db -N -e 'select * from knownGene' > knownGene_1.gp genePredToBed knownGene_1.gp >knownGene_1.bed cat refSeq/*.bed mrna/*.bed | txGeneExplainUpdate1 knownGene_1.bed ucscGenes.bed stdin abWalk.bed kg2ToKg3.bed hgLoadSqlTab $tempDb kg1ToKg2 ~/kent/src/hg/lib/kg2ToKg3.sql kg2ToKg3.bed # Build kgSpAlias table, which combines content of both kgAlias and kgProtAlias tables. hgsql $tempDb -N -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql $tempDb -N -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u >kgSpAlias.tab rm j.tmp hgLoadSqlTab $tempDb kgSpAlias ~/kent/src/hg/lib/kgSpAlias.sql ./kgSpAlias.tab exit # RE-BUILD HG18 PROTEOME BROWSER TABLES (DONE, Fan, 4/2/07). # These are instructions for building tables # needed for the Proteome Browser. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 070202. # Create the working directory ssh hgwdev cd /cluster/data/$db/bed/ucsc.10 mkdir pb cd pb # Build the pepMwAa table hgsql $pbDb -N -e \ "select info.acc, molWeight, aaSize from $spDb.info, $spDb.accToTaxon where accToTaxon.taxon=$taxon and accToTaxon.acc = info.acc" > pepMwAa.tab hgLoadSqlTab $tempDb pepMwAa ~/kent/src/hg/lib/pepMwAa.sql ./pepMwAa.tab o Build the pepPi table hgsql $pbDb -e \ "select info.acc from $spDb.info, $spDb.accToTaxon where accToTaxon.taxon=$taxon and accToTaxon.acc = info.acc" > protAcc.lis hgsql $tempDb -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis pbCalPi protAcc.lis $spDb pepPi.tab hgLoadSqlTab $tempDb pepPi ~/kent/src/hg/lib/pepPi.sql ./pepPi.tab # Calculate and load pep distributions pbCalDist $spDb $pbDb $taxon $tempDb hgLoadSqlTab $tempDb pepExonCntDist ~/kent/src/hg/lib/pepExonCntDist.sql ./pepExonCntDist.tab hgLoadSqlTab $tempDb pepCCntDist ~/kent/src/hg/lib/pepCCntDist.sql ./pepCCntDist.tab hgLoadSqlTab $tempDb pepHydroDist ~/kent/src/hg/lib/pepHydroDist.sql ./pepHydroDist.tab hgLoadSqlTab $tempDb pepMolWtDist ~/kent/src/hg/lib/pepMolWtDist.sql ./pepMolWtDist.tab hgLoadSqlTab $tempDb pepResDist ~/kent/src/hg/lib/pepResDist.sql ./pepResDist.tab hgLoadSqlTab $tempDb pepIPCntDist ~/kent/src/hg/lib/pepIPCntDist.sql ./pepIPCntDist.tab hgLoadSqlTab $tempDb pepPiDist ~/kent/src/hg/lib/pepPiDist.sql ./pepPiDist.tab # Calculate frequency distributions pbCalResStd $spDb $taxon $tempDb # Create pbAnomLimit and pbResAvgStd tables hgLoadSqlTab $tempDb pbAnomLimit ~/kent/src/hg/lib/pbAnomLimit.sql ./pbAnomLimit.tab hgLoadSqlTab $tempDb pbResAvgStd ~/kent/src/hg/lib/pbResAvgStd.sql ./pbResAvgStd.tab # The old pbStamp table seems OK, so no adjustment needed.