# for emacs: -*- mode: sh; -*- # This file describes browser build for the Marmoset # genome, July 2007 # # "$Id: calJac1.txt,v 1.22 2010/02/12 23:42:33 hiram Exp $" # ###################################################################### ## DOWNLOAD SEQUENCE (DONE - 2007-08-21 - Hiram) ssh kkstore06 mkdir /cluster/store4/calJac1 ln -s /cluster/store4/calJac1 /cluster/data/calJac1 mkdir /cluster/data/calJac1/wustl cd /cluster/data/calJac1/wustl for F in supercontigs.agp.gz supercontigs.fa.gz contigs.fa.gz contigs.fa.qual.gz do wget --timestamping \ /pub/organism/Primates/Callithrix_jacchus/assembly/Callithrix_jacchus-2.0.2/output/${F} \ -O ${F} done # real 50m13.535s ls -ogrt # -rw-rw-r-- 1 6656649 Jun 19 17:03 supercontigs.agp.gz # -rw-rw-r-- 1 521109271 Jun 19 17:03 contigs.fa.qual.gz # -rw-rw-r-- 1 781437003 Jun 19 18:30 contigs.fa.gz # -rw-rw-r-- 1 851641082 Aug 21 13:29 supercontigs.fa.gz ########################################################################## # fetch photograph (DONE - 2007-08-21 - Hiram) mkdir /cluster/data/calJac1/photo cd /cluster/data/calJac1/photo wget --timestamping \ http://www.genome.gov/Images/press_photos/highres/82-300.jpg \ -O nhgri.original.82-300.jpg convert -geometry 300x200 -quality 80 nhgri.original.82-300.jpg \ Callithrix_jacchus.jpg # check this .jpg image into the source tree browser/images/ directory ####################################################################### ## create config.ra and run makeGenomeDb.pl ssh kkstore06 cd /cluster/data/calJac1 cat << '_EOF_' > calJac1.config.ra # Config parameters for makeGenomeDb.pl: db calJac1 scientificName Callithrix jacchus commonName Marmoset assemblyDate Jun. 2007 assemblyLabel WUSTL 2.0.2 orderKey 40 clade mammal genomeCladePriority 16 mitoAcc none fastaFiles /cluster/data/calJac1/wustl/supercontigs.fa.gz agpFiles /cluster/data/calJac1/wustl/supercontigs.agp.gz # qualFiles /dev/null dbDbSpeciesDir marmoset '_EOF_' # << happy emacs time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \ -stop=agp calJac1.config.ra > makeGenomeDb.out 2>&1 & # real 24m24.468s time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \ -continue=db calJac1.config.ra > db.continue.out 2>&1 & # add the trackDb files to the source tree and to the trackDb/makefile ########################################################################## ## Repeat masker (DONE - 2007-08-21 - Hiram) ssh kkstore06 ## use screen for this mkdir /cluster/data/calJac1/bed/RepeatMasker cd /cluster/data/calJac1/bed/RepeatMasker time nice -n +19 ~/kent/src/hg/utils/automation/doRepeatMasker.pl \ -bigClusterHub=kk \ -buildDir=/cluster/data/calJac1/bed/RepeatMasker calJac1 > do.out 2>&1 & ############################################################################## ## simpleRepeat masking (DONE - 2007-09-05 - Hiram) ## create a kki kluster run ssh kkr1u00 mkdir /iscratch/i/calJac1 cd /iscratch/i/calJac1 cp -p /cluster/data/calJac1/calJac1.unmasked.2bit . cp -p /cluster/data/calJac1/chrom.sizes . twoBitToFa calJac1.unmasked.2bit calJac1.unmasked.fa mkdir split # split sequence into about 1000 files, each about 3,000,000 bases time nice -n +19 faSplit about calJac1.unmasked.fa 3000000 split/cj1_ for R in 2 3 4 5 6 7 8 do rsync -a --progress /iscratch/i/calJac1/ kkr${R}u00:/iscratch/i/calJac1/ done ssh kki mkdir -p /cluster/data/calJac1/bed/simpleRepeat/trf cd /cluster/data/calJac1/bed/simpleRepeat/trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set C = $1:r set SRC = /iscratch/i/calJac1/split/$C.fa mkdir -p /scratch/tmp/$C cp -p $SRC /scratch/tmp/$C/$C.fa pushd /scratch/tmp/$C /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $C.fa \ /dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C popd rm -f $C.bed cp -p /scratch/tmp/$C/$C.bed . rm -fr /scratch/tmp/$C '_EOF_' # << happy emacs chmod +x runTrf cat << '_EOF_' > template #LOOP ./runTrf $(path1) {check out line $(root1).bed} #ENDLOOP '_EOF_' # << happy emacs ls /iscratch/i/calJac1/split > part.list gensub2 part.list single template jobList para create jobList para try ... check ... push ... etc ... # Completed: 947 of 947 jobs # CPU time in finished jobs: 37242s 620.70m 10.35h 0.43d 0.001 y # IO & Wait Time: 2842s 47.36m 0.79h 0.03d 0.000 y # Average job time: 42s 0.71m 0.01h 0.00d # Longest finished job: 1318s 21.97m 0.37h 0.02d # Submission to last job: 3572s 59.53m 0.99h 0.04d cat *.bed > ../simpleRepeat.bed cd .. awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ssh hgwdev cd /cluster/data/calJac1/bed/simpleRepeat time nice -n +19 hgLoadBed calJac1 simpleRepeat \ simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 846105 elements of size 16 # real 0m24.710s nice -n +19 featureBits calJac1 simpleRepeat \ > fb.simpleRepeat.calJac1.txt 2>&1 cat fb.simpleRepeat.calJac1.txt # 100489601 bases of 2929139385 (3.431%) in intersection # add the trfMask to the rmsk masked sequence to get our final # masked sequence ssh kkstore06 cd /cluster/data/calJac1 time nice -n +19 cat bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed calJac1.rmsk.2bit stdin calJac1.2bit # measure it time nice -n +19 twoBitToFa calJac1.2bit stdout \ | faSize stdin > faSize.calJac1.2bit.txt 2>&1 grep masked faSize.calJac1.2bit.txt # %45.93 masked total, %47.50 masked real ## clean up the /iscratch/i/calJac1/ directory ssh kkr1u00 cd /iscratch/i/calJac1 rm -fr * for R in 2 3 4 5 6 7 8 do rsync -a --progress --delete --stats /iscratch/i/calJac1/ kkr${R}u00:/iscratch/i/calJac1/ done cd .. rmdir calJac1 for R in 2 3 4 5 6 7 8 do ssh kkr${R}u00 rmdir /iscratch/i/calJac1 done ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-09-06 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("calJac1", "blat13", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("calJac1", "blat13", "17787", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## BLASTZ swap from hg18 alignments (2007-11-11 - markd) ssh hgwdev mkdir /cluster/data/calJac1/bed/blastz.hg18.swap cd /cluster/data/calJac1/bed/blastz.hg18.swap ln -s blastz.hg18.swap ../blastz.hg18 /cluster/bin/scripts/doBlastzChainNet.pl \ -swap /cluster/data/hg18/bed/blastz.calJac1.2007-10-07/DEF >& swap.out& # fb.calJac1.chainHg18Link.txt: # 2426684781 bases of 2929139385 (82.846%) in intersection # running syntenic net (DONE - 2007-12-14 - Hiram) time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/hg18/bed/blastz.calJac1.2007-10-07/DEF \ -bigClusterHub=pk -continue=syntenicNet -syntenicNet \ -swap -chainMinScore=3000 -chainLinearGap=medium > syntenicNet.log 2>&1 & # real 8m24.277s # failed during a chainSplit: # Can't open chain/Contig836.chain to append: Too many open files # create reciprocal best chains/nets for 9-way maf alignments ssh hgwdev cd /cluster/data/calJac1/bed/blastz.hg18.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 hg18 \ > rbest.log 2>&1 & ######################################################################### ## genscan run (DONE - 2007-11-08 - Hiram) ## create hard masked sequence ssh kkstore06 cd /cluster/data/calJac1 twoBitToFa calJac1.2bit stdout \ | maskOutFa stdin hard stdout | faToTwoBit stdin calJac1.hard.2bit # And, make sure there aren't any sequences in this lot that have # become all N's with no sequence left in them. This drives genscan nuts twoBitToFa calJac1.hard.2bit stdout \ | faCount stdin > faCount.hard.txt # the lowest three are: egrep -v "^#|^total" faCount.hard.txt \ | awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3 # there are a lot of these that turned into zero sequence left # will sort this out when making the sequence to use on the Iservers ssh kkr1u00 mkdir /iscratch/i/calJac1/hardMasked cd /iscratch/i/calJac1/hardMasked twoBitToFa /cluster/data/calJac1/calJac1.hard.2bit stdout \ | faSplit byname stdin ./ # more than 128 bases of sequence results in the removal of 1,208 # sequences that are too short, leaving 48,516 sequences egrep -v "^#|^total" /cluster/data/calJac1/faCount.hard.txt \ | awk '{size=$2-$7; if (size < 128) {print $1}}' | while read F do rm -f "${F}.fa" echo "${F}.fa" done mkdir ../hardChunks cd ../hardChunks # chunk them up into 4,000,000 base packages, no sequence is broken catDir ../hardMasked \ | faSplit about stdin 4000000 c_ rm -fr ../hardMasked for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/calJac1/hardChunks/ done ssh hgwdev mkdir /cluster/data/calJac1/bed/genscan cd /cluster/data/calJac1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/calJac1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) # Since we split on gaps, we have no chunks like that. You can # verify with faCount on the chunks. ls -1Sr /iscratch/i/calJac1/hardChunks/c_*.fa > genome.list # Create script to run gsBig cat << '_EOF_' > runGsBig #!/bin/csh -fe set runDir = `pwd` set srcDir = $1 set inFile = $2 set fileRoot = $inFile:r mkdir /scratch/tmp/$fileRoot cp -p $srcDir/$inFile /scratch/tmp/$fileRoot pushd /scratch/tmp/$fileRoot /cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000 popd cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt rm -fr /scratch/tmp/$fileRoot '_EOF_' # << happy emacs chmod +x runGsBig # template file for gensub2 cat << '_EOF_' > template #LOOP runGsBig /iscratch/i/calJac1/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.list single template jobList para create jobList para try, check, push, check, ... # Completed: 720 of 720 jobs # CPU time in finished jobs: 55325s 922.09m 15.37h 0.64d 0.002 y # IO & Wait Time: 2063s 34.38m 0.57h 0.02d 0.000 y # Average job time: 80s 1.33m 0.02h 0.00d # Longest finished job: 132s 2.20m 0.04h 0.00d # Submission to last job: 65396s 1089.93m 18.17h 0.76d # cat and lift the results into single files ssh kkstore06 cd /cluster/data/calJac1/bed/genscan cat gtf/c_*.gtf > genscan.gtf cat subopt/c_*.bed > genscanSubopt.bed cat pep/c_*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/calJac1/bed/genscan ldHgGene calJac1 -gtf genscan genscan.gtf # Read 64005 transcripts in 344791 lines in 1 files # 64005 groups 23602 seqs 1 sources 1 feature types # 64005 gene predictions hgPepPred calJac1 generic genscanPep genscan.pep hgLoadBed calJac1 genscanSubopt genscanSubopt.bed # Loaded 576960 elements of size 6 # check the numbers time nice -n +19 featureBits calJac1 genscan # 59205113 bases of 2929139385 (2.021%) in intersection # the next closest genome with a genscan track time nice -n +19 featureBits panTro2 genscan # 53758386 bases of 2909485072 (1.848%) in intersection time nice -n +19 featureBits mm9 genscan # 55293837 bases of 2620346127 (2.110%) in intersection ############################################################################ # GENBANK AUTO UPDATE (DONE - 2007-11-21 - Hiram) # Create a lift file as per the procedures for Chimp from the AGP: ssh kolossus cd /cluster/data/calJac1 # MAKE 11.OOC FILE FOR BLAT blat calJac1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024 # Wrote 34303 overused 11-mers to 11.ooc # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add calJac1 just after panTro2 # calJac1 # Marmoset calJac1.serverGenome = /cluster/data/calJac1/calJac1.2bit calJac1.clusterGenome = /scratch/data/calJac1/calJac1.2bit calJac1.ooc = /cluster/data/calJac1/calJac1/11.ooc calJac1.lift = no calJac1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} calJac1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} calJac1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} calJac1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} calJac1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} calJac1.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} calJac1.downloadDir = calJac1 calJac1.genbank.est.xeno.load = no calJac1.refseq.mrna.native.load = yes calJac1.refseq.mrna.xeno.load = yes calJac1.refseq.mrna.xeno.loadDesc = yes cvs ci -m "Added calJac1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. With these two lines: # static char *calJacNames[] = {"Callithrix jacchus", NULL}; # {"calJac", calJacNames}, cvs ci -m "Added Callithrix jacchus (Marmoset)." src/lib/gbGenome.c make install-server ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank # This is a call to a script that will push our jobs out to the cluster # since it's a big job. time nice -n +19 bin/gbAlignStep -initial calJac1 & # logFile: var/build/logs/2007.11.20-11:31:54.calJac1.initalign.log # real 607m38.957s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad calJac1 # logFile: var/dbload/hgwdev/logs/2007.11.25-10:26:23.dbload.log # real 26m30.926s # enable daily alignment and update of hgwdev (DONE - 2007-11-21 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add calJac1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added calJac1." etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # Blastz Platypus ornAna1 (DONE - 2007-11-14 - Hiram) # this was done a second time, see ornAna1.txt for the second run # since this run produced a null result for some unknown reason. ssh kkstore06 screen # use screen to control this job mkdir /cluster/data/calJac1/bed/blastzOrnAna1.2007-11-14 cd /cluster/data/calJac1/bed/blastzOrnAna1.2007-11-14 cat << '_EOF_' > DEF # Orangutan vs. platypus BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # QUERY: Marmoset calJac1 SEQ1_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit SEQ1_LEN=/cluster/data/calJac1/chrom.sizes SEQ1_CHUNK=20000000 SEQ2_LIMIT=400 SEQ1_LAP=0 # QUERY: Platypus ornAna1 SEQ2_DIR=/cluster/bluearc/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/calJac1/bed/blastzOrnAna1.2007-11-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 & # real 1927m20.962s - to the first pk crash # this was a tough job to get finished. Several pk crashes, # problems with garbage in the para.results file, and so forth. # But, it did finish as of Monday afternoon 2007-11-19 # Completed: 899536 of 900180 jobs # Crashed: 644 jobs # CPU time in finished jobs: 131663141s 2194385.68m 36573.09h 1523.88d 4.175 y # IO & Wait Time: 12592457s 209874.29m 3497.90h 145.75d 0.399 y # Average job time: 160s 2.67m 0.04h 0.00d # Longest finished job: 1795s 29.92m 0.50h 0.02d # Submission to last job: 440290s 7338.17m 122.30h 5.10d # despite the '644 jobs' crashed, they are actually done and all results # are complete # continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat -bigClusterHub=pk > cat.log 2>&1 & # real 31m45.069s cat fb.calJac1.chainOrnAna1Link.txt # 0 bases of 2929139385 (0.000%) in intersection # This error was fixed in the script. It failed on a command in one of # the ssh scripts that happened to run under the bash shell which did # not detect the error in a set of piped commands. It was a shell # wild-card expansion problem, changed to a 'find' to avoid that. ########################################################################### ## BLASTZ Mouse Mm9 swap (DONE - 2007-09-07 - Hiram ssh kkstore06 # use a screen to control this job screen # the original alignment cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06 cat fb.mm9.chainCalJac1Link.txt # 863961573 bases of 2620346127 (32.971%) in intersection # the swap mkdir /cluster/data/calJac1/bed/blastz.mm9.swap cd /cluster/data/calJac1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \ -stop=load -chainMinScore=3000 \ -swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 & # real 217m10.835s cat fb.calJac1.chainMm9Link.txt # 887586922 bases of 2929139385 (30.302%) in intersection time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \ -continue=download -chainMinScore=3000 \ -swap -chainLinearGap=medium -bigClusterHub=pk > download.log 2>&1 & # real 1m9.876s # run the syntenic net for multiple alignment (DONE - 2007-12-14 - Hiram) time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \ -syntenicNet -continue=syntenicNet -chainMinScore=3000 \ -swap -chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 & # real 7m23.683s # failed during a chainSplit: # Can't open chain/Contig1203.chain to append: Too many open files # create reciprocal best chains/nets for 9-way multiple alignment ssh hgwdev cd /cluster/data/calJac1/bed/blastz.mm9.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 mm9 \ > rbest.log 2>&1 & time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 mm9 \ -continue=download > rbest.download.log 2>&1 & ########################################################################### # Blastz swap Chimp panTro2 (DONE - 2007-11-14 - Hiram) ssh kkstore06 screen # use screen to manage this job cd /cluster/data/panTro2/bed/blastzCalJac1.2007-11-13 cat fb.panTro2.chainCalJac1Link.txt # 2220169777 bases of 2909485072 (76.308%) in intersection mkdir /cluster/data/calJac1/bed/blastz.panTro2.swap cd /cluster/data/calJac1/bed/blastz.panTro2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/panTro2/bed/blastzCalJac1.2007-11-13/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 320m14.293s cat fb.calJac1.chainPanTro2Link.txt # 2264115411 bases of 2929139385 (77.296%) in intersection # create reciprocal best chains/nets for 9-way maf alignments ssh hgwdev cd /cluster/data/calJac1/bed/blastz.panTro2.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 panTro2 \ > rbest.log 2>&1 & ########################################################################### # SWAP BLASTZ Orangutan ponAbe2 (DONE - 2007-11-29 - Hiram) # primary blastz result cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18 cat fb.ponAbe2.chainCalJac1Link.txt # 2310720863 bases of 3093572278 (74.694%) in intersection # and for the swap ssh kkstore02 mkdir /cluster/data/calJac1/bed/blastz.ponAbe2.swap cd /cluster/data/calJac1/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=kk > swap.log 2>&1 & # real 341m54.548s cat fb.calJac1.chainPonAbe2Link.txt # 2253236255 bases of 2929139385 (76.925%) in intersection # reciprocal best for 9-way maf alignments ssh hgwdev cd /cluster/data/calJac1/bed/blastz.ponAbe2.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 ponAbe2 \ > rbest.log 2>&1 & # real 96m17.285s ########################################################################### # SWAP BLASTZ Dog canFam2 (DONE - 2007-11-30 - Hiram) # primary blastz result cd /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28 cat fb.canFam2.chainCalJac1Link.txt # 1369690756 bases of 2384996543 (57.429%) in intersection mkdir /cluster/data/calJac1/bed/blastz.canFam2.swap cd /cluster/data/calJac1/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=kk > swap.log 2>&1 & # encountered difficulties with /scratch/data/ on kolossus # had to finish the netChains.csh script manually, then continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \ -continue=load -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=kk > load.log 2>&1 & # real 56m44.375s cat fb.calJac1.chainCanFam2Link.txt # 1451345669 bases of 2929139385 (49.549%) in intersection # reciprocal best for 9-way maf alignments ssh hgwdev # expects blastz.canFam2 to exist cd /cluster/data/calJac1/bed ln -s blastz.canFam2.swap blastz.canFam2 cd /cluster/data/calJac1/bed/blastz.canFam2 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 canFam2 \ > rbest.log 2>&1 & # real 70m45.324s ########################################################################### # SWAP BLASTZ Chimp rheMac2 (DONE - 2007-11-18 - Hiram) # primary blastz result cd /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16 cat fb.rheMac2.chainCalJac1Link.txt # 2055107003 bases of 2646704109 (77.648%) in intersection # and the download mkdir /cluster/data/calJac1/bed/blastz.rheMac2.swap cd /cluster/data/calJac1/bed/blastz.rheMac2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > swap.log 2>&1 & # real 349m36.073s cat fb.calJac1.chainRheMac2Link.txt # 2191300051 bases of 2929139385 (74.810%) in intersection # reciprocal best for 9-way maf alignments ssh hgwdev # expects blastz.rheMac2 to exist cd /cluster/data/calJac1/bed ln -s blastz.rheMac2.swap blastz.rheMac2 cd /cluster/data/calJac1/bed/blastz.rheMac2 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 rheMac2 \ > rbest.log 2>&1 & # real 87m53.651s ######################################################################### ## 9-Way Multiz (DONE - 2007-12-21 - Hiram) ## ssh hgwdev mkdir /cluster/data/calJac1/bed/multiz9way cd /cluster/data/calJac1/bed/multiz9way # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get calJac1 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram # select the 9 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Marmoset_calJac1,Rhesus_rheMac2,Orangutan_ponAbe2,Dog_canFam2,Platypus_ornAna1,Opossum_monDom4 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ > 9-way.fullNames.nh # looks something like this: ((((Mouse_mm9:0.325818, ((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037, Orangutan_ponAbe2:0.020000):0.013037, Rhesus_rheMac2:0.031973):0.036500, Marmoset_calJac1:0.070000):0.058454):0.019763, Dog_canFam2:0.187963):0.243550, Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110); ((( (Mouse_mm9:0.325818, (Marmoset_calJac1:0.070000, (((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037, Orangutan_ponAbe2:0.020000):0.013037, Rhesus_rheMac2:0.031973):0.036500):0.058454):0.019763, Dog_canFam2:0.187963):0.243550, Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110); # rearrange to get Marmoset at the top: # this leaves us with: cat << '_EOF_' > calJac1.9-way.nh (((((Marmoset_calJac1:0.070000, (((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037, Orangutan_ponAbe2:0.020000):0.013037, Rhesus_rheMac2:0.031973):0.036500):0.058454, Mouse_mm9:0.325818):0.019763, Dog_canFam2:0.187963):0.243550, Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110); '_EOF_' # << happy emacs # create a species list from that file: sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' calJac1.9-way.nh \ | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # verify that has 9 db names in it # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' calJac1.9-way.nh \ | sed -e "s/ / /g"` > tree.9.nh # that looks like, as a single line: # (((((calJac1(((hg18 panTro2)ponAbe2)rheMac2)) mm9) canFam2) monDom4) ornAna1) # verify all blastz's exists cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/calJac1/bed/multiz9way foreach db (`grep -v calJac1 species.list`) set bdir = /cluster/data/calJac1/bed/blastz.$db if (-e $bdir/mafRBestNet/calJac1.$db.rbest.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/calJac1.$db.net.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/calJac1.$db.net.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh # see what it says, the "mafs not found" should only show up on calJac1 ./listMafs.csh # canFam2 mafRBestNet # hg18 mafRBestNet # mm9 mafRBestNet # monDom4 mafNet # ornAna1 mafNet # panTro2 mafRBestNet # ponAbe2 mafRBestNet # rheMac2 mafRBestNet /cluster/bin/phast/all_dists calJac1.9-way.nh > 9way.distances.txt grep -i caljac 9way.distances.txt | sort -k3,3n Marmoset_calJac1 Human_hg18 0.138447 Marmoset_calJac1 Rhesus_rheMac2 0.138473 Marmoset_calJac1 Orangutan_ponAbe2 0.139537 Marmoset_calJac1 Chimp_panTro2 0.140242 Marmoset_calJac1 Dog_canFam2 0.336180 Marmoset_calJac1 Mouse_mm9 0.454272 Marmoset_calJac1 Opossum_monDom4 0.712488 Marmoset_calJac1 Platypus_ornAna1 0.968524 # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. Zebrafish ends up before # tetraodon and fugu on the browser despite its distance. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainCalJac1Link chain linearGap # distance on CalJac1 on other minScore # 1 0.138447 Human_hg18 (% 82.846) (% 78.351) 3000 medium # 2 0.138473 Rhesus_rheMac2 (% 74.810) (% 77.648) 3000 medium # 3 0.139537 Orangutan_ponAbe2 (% 76.925) (% 74.694) 3000 medium # 4 0.140242 Chimp_panTro2 (% 77.296) (% 76.308) 3000 medium # 5 0.336180 Dog_canFam2 (% 57.429) (% 49.549) 3000 medium # 6 0.454272 Mouse_mm9 (% 30.302) (% 32.971) 3000 medium # 6 0.712488 Opossum_monDom4 (% 13.357) (% 11.050) 5000 loose # 7 0.968524 Platypus_ornAna1 (% 7.221) (% 10.619) 5000 loose # copy net mafs to cluster-friendly storage, splitting chroms mkdir mafLinks cd mafLinks # hint: obtained these links by altering listMafs.csh above # add an echo statement to output these commands ln -s ../../blastz.canFam2/mafRBestNet/calJac1.canFam2.rbest.maf.gz \ mafLinks/canFam2.maf.gz ln -s ../../blastz.hg18/mafRBestNet/calJac1.hg18.rbest.maf.gz \ mafLinks/hg18.maf.gz ln -s ../../blastz.mm9/mafRBestNet/calJac1.mm9.rbest.maf.gz \ mafLinks/mm9.maf.gz ln -s ../../blastz.monDom4/mafNet/calJac1.monDom4.net.maf.gz \ mafLinks/monDom4.maf.gz ln -s ../../blastz.ornAna1/mafNet/calJac1.ornAna1.net.maf.gz \ mafLinks/ornAna1.maf.gz ln -s ../../blastz.panTro2/mafRBestNet/calJac1.panTro2.rbest.maf.gz \ mafLinks/panTro2.maf.gz ln -s ../../blastz.ponAbe2/mafRBestNet/calJac1.ponAbe2.rbest.maf.gz \ mafLinks/ponAbe2.maf.gz ln -s ../../blastz.rheMac2/mafRBestNet/calJac1.rheMac2.rbest.maf.gz \ mafLinks/rheMac2.maf.gz # need to split these things up by Contig number for efficient kluster run ssh kkstore06 cd /cluster/data/calJac1/bed/multiz9way/mafLinks mkdir -p /san/sanvol1/scratch/calJac1/multiz9way/contigMaf cd /scratch/tmp for D in `grep -v calJac1 /cluster/data/calJac1/bed/multiz9way/species.list` do mkdir /scratch/tmp/${D} cd /scratch/tmp/${D} mafSplit -verbose=2 /dev/null -byTarget -useSequenceName Contig \ /cluster/data/calJac1/bed/multiz9way/mafLinks/${D}.maf.gz -outDirDepth=2 rsync -a --progress ./ \ /san/sanvol1/scratch/calJac1/multiz9way/contigMaf/${D} cd /scratch/tmp rm -fr ${D} done # create a run-time list of contigs to operate on, not all contigs # exist in all alignments, but we want all contig names used in any # alignment: cd /san/sanvol1/scratch/calJac1/multiz9way/contigMaf for D in * do cd "${D}" find . -type f cd .. done | sort -u > /tmp/9-way.contig.list wc -l /tmp/9-way.contig.list # 36707 /tmp/9-way.contig.list # ready for the multiz run ssh pk mkdir /cluster/data/calJac1/bed/multiz9way/splitRun cd /cluster/data/calJac1/bed/multiz9way/splitRun scp -p kkstore06:/tmp/9-way.contig.list . mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = calJac1 set subdir = $1 set c = $2 set result = $3 set resultDir = $result:h set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz9way/contigMaf rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp ../../tree.9.nh ../../species.list $tmp pushd $tmp foreach s (`grep -v $db species.list`) set in = $pairs/$s/$subdir/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.9.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/calJac1/bed/multiz9way/splitRun/maf/$(dir1)/$(root1).maf} #ENDLOOP '_EOF_' # << emacs # it is a single job since everything is in the same maf file time nice -n +19 ./autoMultiz.csh calJac1 XXX - running 2007-12-21 16:32 on mkr0u3 sed -e "s/^\.\///" ../9-way.contig.list \ | gensub2 stdin single template jobList para create jobList para try ... check ... push ... etc # Completed: 36707 of 36707 jobs # CPU time in finished jobs: 244659s 4077.65m 67.96h 2.83d 0.008 y # IO & Wait Time: 115457s 1924.29m 32.07h 1.34d 0.004 y # Average job time: 10s 0.16m 0.00h 0.00d # Longest finished job: 249s 4.15m 0.07h 0.00d # Submission to last job: 2454s 40.90m 0.68h 0.03d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh kkstore06 cd /cluster/data/calJac1/bed/multiz9way mkdir togetherMaf grep "^##maf version" splitRun/maf/0/0/Contig00000.maf \ | sort -u > togetherMaf/calJac1.9way.maf for F in `find ./splitRun/maf -type f -depth` do grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \ | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" done | sort -u >> togetherMaf/calJac1.9way.maf for F in `find ./splitRun/maf -type f -depth` do grep -v -h "^#" "${F}" done >> togetherMaf/calJac1.9way.maf grep "^##eof maf" splitRun/maf/0/0/Contig00000.maf \ | sort -u >> togetherMaf/calJac1.9way.maf # load tables for a look ssh hgwdev mkdir -p /gbdb/calJac1/multiz9way/maf ln -s /cluster/data/calJac1/bed/multiz9way/togetherMaf/*.maf \ /gbdb/calJac1/multiz9way/maf/multiz9way.maf # this generates an immense multiz9way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/calJac1/multiz9way/maf calJac1 multiz9way # real 5m6.330s # Loaded 8484286 mafs in 1 files from /gbdb/calJac1/multiz9way/maf # load summary table time nice -n +19 cat /gbdb/calJac1/multiz9way/maf/*.maf \ | hgLoadMafSummary calJac1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz9waySummary stdin # real 5m58.150s # Created 121083 summary blocks from 3410157 components # and 693943 mafs from stdin # Gap Annotation # prepare bed files with gap info ssh kkstore02 mkdir /cluster/data/calJac1/bed/multiz9way/anno cd /cluster/data/calJac1/bed/multiz9way/anno mkdir maf run # these actually already all exist from previous multiple alignments for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `grep -v calJac1 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh memk # temporarily copy the calJac1.9way.maf file onto the memk # nodes /scratch/data/calJac1/maf/ directory for R in 0 1 2 3 4 5 6 7 do ssh mkr0u${R} rsync -a --progress \ /cluster/data/calJac1/bed/multiz9way/togetherMaf/calJac1.9way.maf.gz \ /scratch/data/calJac1/maf/ done mkdir /cluster/data/calJac1/bed/multiz9way/anno/splitMaf # need to split up the single maf file into individual # per-scaffold maf files to run annotation on cd /cluster/data/calJac1/bed/multiz9way/anno/splitMaf # create bed files to list approximately 1553 scaffolds in # a single list, approximately 33 lists cat << '_EOF_' > mkBedLists.pl #!/usr/bin/env perl use strict; use warnings; my $bedCount = 0; my $i = 0; my $bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; open (FH,") { chomp $line; if ( (($i + 1) % 1553) == 0 ) { printf "%s\n", $line; close (BF); ++$bedCount; $bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; } ++$i; my ($chr, $size) = split('\s+',$line); printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr; } close (FH); close (BH); '_EOF_' # << happy emacs chmod +x mkBedLists.pl ./mkBedLists.pl # now, run a mafsInRegion on each one of those lists cat << '_EOF_' > runOne #!/bin/csh -fe set runDir = "/cluster/data/calJac1/bed/multiz9way/anno/splitMaf" set resultDir = $1 set bedFile = $resultDir.bed mkdir -p $resultDir mkdir -p /scratch/tmp/calJac1/$resultDir pushd /scratch/tmp/calJac1/$resultDir mafsInRegion $runDir/$bedFile -outDir . \ /scratch/data/calJac1/maf/calJac1.9way.maf popd rsync -q -a /scratch/tmp/calJac1/$resultDir/ ./$resultDir/ rm -fr /scratch/tmp/calJac1/$resultDir rmdir --ignore-fail-on-non-empty /scratch/tmp/calJac1 '_EOF_' # << happy emacs chmod +x runOne cat << '_EOF_' > template #LOOP ./runOne $(root1) #ENDLOOP '_EOF_' # << happy emacs ls file*.bed > runList gensub2 runList single template jobList para create jobList para try ... check ... push ... etc # Completed: 33 of 33 jobs # CPU time in finished jobs: 11075s 184.58m 3.08h 0.13d 0.000 y # IO & Wait Time: 22992s 383.20m 6.39h 0.27d 0.001 y # Average job time: 1032s 17.21m 0.29h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2019s 33.65m 0.56h 0.02d # Submission to last job: 22051s 367.52m 6.13h 0.26d cd /cluster/data/calJac1/bed/multiz9way/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set outDir = ../maf/$2 set result = $3 set input = $1 mkdir -p $outDir cat $input | \ nice mafAddIRows -nBeds=nBeds stdin /scratch/data/calJac1/calJac1.2bit $result '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs find ../splitMaf -type f -name "*.maf > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 36707 of 36707 jobs # CPU time in finished jobs: 94093s 1568.22m 26.14h 1.09d 0.003 y # IO & Wait Time: 94674s 1577.90m 26.30h 1.10d 0.003 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest finished job: 12s 0.20m 0.00h 0.00d # Submission to last job: 6129s 102.15m 1.70h 0.07d ssh kkstore06 cd /cluster/data/calJac1/bed/multiz9way/anno grep "^##maf version" maf/file_0/Contig0.maf \ | sort -u > calJac1.anno.9way.maf find ./maf -type f -depth -name "*.maf" | while read F do grep -v -h "^#" "${F}" done >> calJac1.anno.9way.maf echo "##eof maf" >> calJac1.anno.9way.maf ssh hgwdev cd /cluster/data/calJac1/bed/multiz9way/anno mkdir -p /gbdb/calJac1/multiz9way/anno ln -s `pwd`/calJac1.anno.9way.maf \ /gbdb/calJac1/multiz9way/anno/multiz9way.maf # by loading this into the table multiz9way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/calJac1/multiz9way/anno \ calJac1 multiz9way # Loaded 9243378 mafs in 1 files from /gbdb/calJac1/multiz9way/anno # real 5m39.367s # normally filter this for chrom size > 1,000,000 and only load # those chroms. But this is a scaffold assembly, load everything: hgLoadMafSummary calJac1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz9waySummary \ /gbdb/calJac1/multiz9way/anno/multiz9way.maf # Created 121083 summary blocks from 3410157 components and 749940 mafs # from /gbdb/calJac1/multiz9way/anno/multiz9way.maf # by loading this into the table multiz9waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz9way*.tab files in this /scratch/tmp directory rm multiz9way*.tab # And, you can remove the previously loaded non-annotated maf file link: rm /gbdb/calJac1/multiz9way/maf/multiz9way.maf rmdir /gbdb/calJac1/multiz9way/maf ########################################################################### ## Annotate 9-way multiple alignment with gene annotations ## (DONE - 2008-01-08 - Hiram) # Gene frames ## given previous survey done for 8-way alignment on Orangutan, ## try using the following tables for this gene annotation # use knownGene for hg18, mm9 # use ensGene for monDom4, ornAna1, panTro2, rheMac2 # new try with xenoMrna for ponAbe2, canFam2 and calJac1 ssh hgwdev mkdir /cluster/data/calJac1/bed/multiz9way/frames cd /cluster/data/calJac1/bed/multiz9way/frames mkdir genes # knownGene for DB in hg18 mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in monDom4 ornAna1 panTro2 rheMac2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # use xenoMrna for ponAbe2, canFam2, calJac1 # loxAfr1 oryCun1 ponAbe2 for DB in ponAbe2 canFam2 calJac1 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${DB}.mrna-cds.${tmpExt} tmpMrna=${DB}.mrna.${tmpExt} tmpCds=${DB}.cds.${tmpExt} hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \ from xenoMrna,gbCdnaInfo,cds \ where (xenoMrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $DB > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz rm -f $tmpExt echo "${DB} done" done ls -og genes # -rw-rw-r-- 1 2697811 Jan 8 15:43 calJac1.gp.gz # -rw-rw-r-- 1 2551552 Jan 8 15:41 canFam2.gp.gz # -rw-rw-r-- 1 2008806 Jan 8 15:33 hg18.gp.gz # -rw-rw-r-- 1 1965274 Jan 8 15:33 mm9.gp.gz # -rw-rw-r-- 1 1751726 Jan 8 15:33 monDom4.gp.gz # -rw-rw-r-- 1 1232719 Jan 8 15:33 ornAna1.gp.gz # -rw-rw-r-- 1 1980696 Jan 8 15:33 panTro2.gp.gz # -rw-rw-r-- 1 2703247 Jan 8 15:39 ponAbe2.gp.gz # -rw-rw-r-- 1 1935916 Jan 8 15:33 rheMac2.gp.gz ssh kkstore06 cd /cluster/data/calJac1/bed/multiz9way/frames # anything to annotate is in a pair, e.g.: calJac1 genes/calJac1.gp.gz time (cat ../anno/calJac1.anno.9way.maf | nice -n +19 genePredToMafFrames calJac1 stdin stdout calJac1 genes/calJac1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz canFam2 genes/canFam2.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz9way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz9way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 206370 hg18 # 208834 panTro2 # 211731 rheMac2 # 224988 calJac1 # 225518 canFam2 # 225632 mm9 # 261163 ponAbe2 # 417544 ornAna1 # 462890 monDom4 # load the resulting file ssh hgwdev cd /cluster/data/calJac1/bed/multiz9way/frames time nice -n +19 hgLoadMafFrames calJac1 multiz9wayFrames \ multiz9way.mafFrames.gz # real 0m38.282s # enable the trackDb entries: # frames multiz9wayFrames # irows on ############################################################################# # phastCons 9-way (DONE - 2007-10-16 - Hiram) # split 9way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir /cluster/data/calJac1/bed/multiz9way/msa.split cd /cluster/data/calJac1/bed/multiz9way/msa.split mkdir -p /san/sanvol1/scratch/calJac1/multiz9way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/calJac1/bed/multiz9way/anno/maf set WINDOWS = /san/sanvol1/scratch/calJac1/multiz9way/cons/ss pushd $WINDOWS set resultDir = $1 set c = $2 rm -fr $resultDir/$c mkdir -p $resultDir twoBitToFa -seq=$c /scratch/data/calJac1/calJac1.2bit /scratch/tmp/calJac1.$c.fa /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$resultDir/$c.maf -i MAF \ -M /scratch/tmp/calJac1.$c.fa \ -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000 rm -f /scratch/tmp/calJac1.$c.fa popd mkdir -p $resultDir date > $resultDir/$c.out '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out} #ENDLOOP '_EOF_' # << happy emacs # create list of maf files: (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # CPU time in finished jobs: 5250s 87.50m 1.46h 0.06d 0.000 y # IO & Wait Time: 94631s 1577.18m 26.29h 1.10d 0.003 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 9s 0.15m 0.00h 0.00d # Submission to last job: 3697s 61.62m 1.03h 0.04d # take the cons and noncons trees from the mouse 30-way # Estimates are not easy to make, probably more correctly, # take the 30-way .mod file, and re-use it here. ssh hgwdev cd /cluster/data/calJac1/bed/multiz9way cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh memk mkdir -p /cluster/data/calJac1/bed/multiz9way/cons/run.cons cd /cluster/data/calJac1/bed/multiz9way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all gliers placentals # Well, that's what it was when used in the Mm9 30-way, # in this instance, there is only the directory "all" cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.2007-05-04 set subDir = $1 set f = $2 set c = $2:r set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/calJac1/bed/multiz9way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/calJac1/multiz9way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir sleep 4 touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir rm -f $san/$grp/pp/$subDir/$f.pp rm -f $san/$grp/bed/$subDir/$f.bed mv $tmp/$f.pp $san/$grp/pp/$subDir mv $tmp/$f.bed $san/$grp/bed/$subDir rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/calJac1/multiz9way/cons/all/pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/calJac1/multiz9way/cons find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \ > /cluster/data/calJac1/bed/multiz9way/cons/ss.list # run for all species cd .. mkdir -p all run.cons/all cd all /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \ --prune-all-but=calJac1,hg18,panTro2,rheMac2,ponAbe2,mm9,canFam2,monDom4,ornAna1 \ > all.mod cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/calJac1/multiz9way/cons/all/pp/$(lastDir1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 28485 of 28485 jobs # CPU time in finished jobs: 14082s 234.70m 3.91h 0.16d 0.000 y # IO & Wait Time: 188534s 3142.23m 52.37h 2.18d 0.006 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 17s 0.28m 0.00h 0.00d # Submission to last job: 72420s 1207.00m 20.12h 0.84d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/calJac1/multiz9way/cons/all find ./bed -type f -name "Contig*.bed" | xargs cat \ | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 3 minutes cp -p mostConserved.bed /cluster/data/calJac1/bed/multiz9way/cons/all # load into database ssh hgwdev cd /cluster/data/calJac1/bed/multiz9way/cons/all time nice -n +19 hgLoadBed calJac1 phastConsElements9way mostConserved.bed # Loaded 1297014 elements of size 5 # Try for 5% overall cov, and 70% CDS cov # We don't have any gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits calJac1 phastConsElements9way # 141561229 bases of 2929139385 (4.833%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/calJac1/multiz9way/cons/all mkdir -p phastCons9wayScores for D in `ls -1d pp/file* | sort -t_ -k2n` do F=${D/pp\/} out=phastCons9wayScores/${F}.data.gz echo "${D} > ${F}.data.gz" ls -S ${D}/*.pp | xargs cat | gzip > ${out} done # real 38m22.760s # copy the phastCons9wayScores to: # /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/phastConsScores # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/calJac1/multiz9way/cons/all ls -1 phastCons9wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \ | wigEncode -noOverlap stdin phastCons9way.wig phastCons9way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 time nice -n +19 cp -p *.wi? /cluster/data/calJac1/bed/multiz9way/cons/all # real 1m4.483s # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/calJac1/bed/multiz9way/cons/all ln -s `pwd`/phastCons9way.wib /gbdb/calJac1/multiz9way/phastCons9way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/calJac1/multiz9way calJac1 \ phastCons9way phastCons9way.wig # real 0m56.271s # remove garbage rm wiggle.tab # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/calJac1/bed/multiz9way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=calJac1 phastCons9way > histogram.data 2>&1 # real 5m0.608s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Orangutan PonAbe2 Histogram phastCons9way track" set xlabel " phastCons9way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # These trackDb entries turn on the wiggle phastCons data track: # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons9way # spanList 1 # autoScale Off # windowingFunction mean # pairwiseHeight 12 # yLineOnOff Off ############################################################################# # Downloads (DONE - 2008-01-11 - Hiram) # Let's see if the downloads will work ssh hgwdev /cluster/data/calJac1 # expecting to find repeat masker .out file here: ln -s bed/RepeatMasker/calJac1.fa.out . time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \ -workhorse=hgwdev calJac1 > jkStuff/downloads.log 2>&1 # real 24m3.210s # failed making upstream sequences: # featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout # setpriority: Permission denied. # the 'nice' from my bash shell causes trouble inside the csh # script which uses nice. Finish off the install step manually # with the mgcGenes upstreams ... ############################################################################# # PushQ entries (DONE - 2008-01-11 - Hiram) ssh hgwdev /cluster/data/calJac1 /cluster/bin/scripts/makePushQSql.pl calJac1 > jkStuff/pushQ.sql # output warnings: # calJac1 does not have seq # calJac1 does not have gbMiscDiff # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # genscanPep ############################################################################# # Create 9-way downloads (DONE - 2008-03-28 - Hiram) ssh hgwdev mkdir -p /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way cd /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way cp -p \ /san/sanvol1/scratch/calJac1/multiz9way/cons/all/phastCons9wayScores/* . ln -s ../../cons/all/all.mod ./9way.mod cp /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/README.txt . # edit that README.txt to be correct for this 9-way alignment cd .. mkdir multiz9way cd multiz9way cp -p /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/README.txt . # edit that README.txt to be correct for this 9-way alignment ssh kkstore06 mkdir -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way cd /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way ln -s ../../calJac1.9-way.nh ./9way.nh time nice -n +19 gzip -c ../../../anno/calJac1.anno.9way.maf \ > calJac1.9way.maf.gz # real 310m12.800s # unusual long time due to nice +19 and conflice with other long-running # jobs on kkstore06 ssh hgwdev cd /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way # creating upstream files from xenoRefGene, bash script: cat << '_EOF_' > mkUpstream.sh #!/bin/bash DB=calJac1 GENE=xenoRefGene NWAY=multiz9way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done '_EOF_' # << happy emacs chmod +x ./mkUpstream.sh time nice -n +19 ./mkUpstream.sh # real 119m5.562s # -rw-rw-r-- 1 42975041 Mar 28 14:27 upstream1000.maf.gz # -rw-rw-r-- 1 76363192 Mar 28 15:03 upstream2000.maf.gz # -rw-rw-r-- 1 303870318 Mar 28 15:42 upstream5000.maf.gz # check the names in these upstream files to ensure sanity: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | less # should be a list of the other 4 species with a high count, # then xenoRefGene names, e.g.: # 51588 rheMac2 # 51588 ponAbe2 # 51588 panTro2 # 51588 ornAna1 # 51588 monDom4 # 51588 mm9 # 51588 hg18 # 51588 canFam2 # 18 NM_001033610 # 17 NM_016957 # 17 NM_000992 # 16 NM_181722 ssh kkstore06 cd /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way md5sum *.maf.gz > md5sum.txt cd ../phastCons9way md5sum *.data.gz *.mod > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/calJac1/multiz9way mkdir /usr/local/apache/htdocs/goldenPath/calJac1/phastCons9way cd /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/calJac1/multiz9way cd ../phastCons9way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/calJac1/phastCons9way # if your ln -s `pwd`/* made extra links to files you don't want there, # check the goldenPath locations and remove those extra links ############################################################################# # N-SCAN gene predictions (nscanGene) - (2008-04-03 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/calJac1/bed/nscan/ wget http://mblab.wustl.edu/predictions/marmoset/calJac1/calJac1.gtf wget http://mblab.wustl.edu/predictions/marmoset/calJac1/calJac1.prot.fa wget http://mblab.wustl.edu/predictions/marmoset/calJac1/readme.html bzip2 calJac1.* chmod a-w * # load track gtfToGenePred -genePredExt calJac1.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt calJac1 nscanGene stdin hgPepPred calJac1 generic nscanPep calJac1.prot.fa.bz2 rm *.tab # update trackDb; need a calJac1-specific page to describe informants marmoset/calJac1/nscanGene.html (copy from readme.html) marmoset/calJac1/trackDb.ra # set search regex to termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9] ############################################################################# ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: calJac1.upstreamGeneTbl = xenoRefGene calJac1.upstreamMaf = multiz9way /hive/data/genomes/calJac1/bed/multiz9way/species.list ############################################################################ # QUALITY TRACK (DONE - 2008-11-25 - Hiram) mkdir /hive/data/genomes/calJac1/bed/qual cd /hive/data/genomes/calJac1/bed/qual # the qac file was created by Rico during 28-way annotations qacToWig -fixed ../quality/calJac1.qac stdout \ | wigEncode stdin qual.wig qual.wib ln -s `pwd`/qual.wib /gbdb/calJac1/wib hgLoadWiggle -pathPrefix=/gbdb/calJac1/wib calJac1 quality qual.wig ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # LIFTOVER TO calJac3 (DONE - 2010-02-11 - Hiram ) mkdir /hive/data/genomes/calJac1/bed/blat.calJac3.2010-02-11 cd /hive/data/genomes/calJac1/bed/blat.calJac3.2010-02-11 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug calJac1 calJac3 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ calJac1 calJac3 > do.log 2>&1 # real 36m16.693s #############################################################################