# for emacs: -*- mode: sh; -*- # This file describes browser build for the Orangutan # genome, July 31 2007 # # "$Id: ponAbe2.txt,v 1.23 2010/02/12 23:51:36 hiram Exp $" # # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABGA00 # ###################################################################### ## DOWNLOAD SEQUENCE (DONE - 2007-09-06 - Hiram) ssh kkstore02 mkdir /cluster/store5/ponAbe2 ln -s /cluster/store5/ponAbe2 /cluster/data/ponAbe2 mkdir /cluster/data/ponAbe2/wustl cd /cluster/data/ponAbe2/wustl # LaDeana Hillier provided us with a private copy before it got to the # public site wget --timestamping \ "http://genome.wustl.edu/pub/user/lhillier/private/P_abelii/orang_070731.tar.gz" tar xvzf orang_070731.tar.gz # creates a directory FINAL_070731 ls -ogrt # drwxrwsr-x 2 4096 Sep 6 03:29 FINAL_070731 # -rw-rw-r-- 1 906641840 Sep 6 05:01 orang_070731.tar.gz # chrX.agp and chr20.agp had blank lines at the end, remove those manually ####################################################################### ## create config.ra and run makeGenomeDb.pl (DONE - 2007-09-12 - Hiram) ssh kkstore02 cd /cluster/data/ponAbe2 cat << '_EOF_' > ponAbe2.config.ra # Config parameters for makeGenomeDb.pl: db ponAbe2 scientificName Pongo pygmaeus abelii commonName Orangutan assemblyDate Jul. 2007 assemblyLabel WUSTL 3.0 orderKey 31 # clade already on in ponAbe1 # clade mammal # genomeCladePriority 11 # mitoAcc gi:1743294 mitoAcc X97707.1 fastaFiles /cluster/data/ponAbe2/wustl/FINAL_070731/chr*.fa agpFiles /cluster/data/ponAbe2/wustl/FINAL_070731/chr*.agp # qualFiles /dev/null dbDbSpeciesDir orangutan '_EOF_' # << happy emacs time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \ ponAbe2.config.ra > makeGenomeDb.out 2>&1 & # real 19m49.887s # failed in the makeDb due to chrM vs. AGP confusion. Fixed the code, # and ran the jkStuff/makeDb.csh manually. Then continuing: time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \ -continue=dbDb -stop=dbDb ponAbe2.config.ra > dbDb.out 2>&1 & # no need to run the trackDb business since we can simply # copy the ponAbe1 directory in the source tree ########################################################################## ## Repeat masker (DONE - 2007-09-15 - Hiram) ssh kkstore02 ## use screen for this mkdir /cluster/data/ponAbe2/bed/RepeatMasker cd /cluster/data/ponAbe2/bed/RepeatMasker time nice -n +19 ~/kent/src/hg/utils/automation/doRepeatMasker.pl \ -bigClusterHub=pk \ -buildDir=/cluster/data/ponAbe2/bed/RepeatMasker ponAbe2 > do.out 2>&1 & # real 1831m46.301s ############################################################################## ## simpleRepeat masking (DONE - 2007-09-19 - Hiram) ## create a kki kluster run ssh kkr1u00 mkdir /iscratch/i/ponAbe2 cd /iscratch/i/ponAbe2 cp -p /cluster/data/ponAbe2/ponAbe2.unmasked.2bit . cp -p /cluster/data/ponAbe2/chrom.sizes . twoBitToFa ponAbe2.unmasked.2bit ponAbe2.unmasked.fa mkdir split # split sequence into individual chrom sequences faSplit sequence ponAbe2.unmasked.fa 100 split/s_ rm ponAbe2.unmasked.fa ponAbe2.unmasked.2bit for R in 2 3 4 5 6 7 8 do rsync -a --progress /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/ done ssh kki mkdir -p /cluster/data/ponAbe2/bed/simpleRepeat/trf cd /cluster/data/ponAbe2/bed/simpleRepeat/trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set C = $1:r set SRC = /iscratch/i/ponAbe2/split/$C.fa mkdir -p /scratch/tmp/$C cp -p $SRC /scratch/tmp/$C/$C.fa pushd /scratch/tmp/$C /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf $C.fa \ /dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C popd rm -f $C.bed cp -p /scratch/tmp/$C/$C.bed . rm -fr /scratch/tmp/$C '_EOF_' # << happy emacs chmod +x runTrf cat << '_EOF_' > template #LOOP ./runTrf $(path1) {check out line $(root1).bed} #ENDLOOP '_EOF_' # << happy emacs ls /iscratch/i/ponAbe2/split > part.list gensub2 part.list single template jobList para create jobList para try ... check ... push ... etc ... # Completed: 55 of 55 jobs # CPU time in finished jobs: 57854s 964.23m 16.07h 0.67d 0.002 y # IO & Wait Time: 473s 7.88m 0.13h 0.01d 0.000 y # Average job time: 1060s 17.67m 0.29h 0.01d # Longest finished job: 32538s 542.30m 9.04h 0.38d # Submission to last job: 32538s 542.30m 9.04h 0.38d ssh kkstore02 cd /cluster/data/ponAbe2/bed/simpleRepeat/trf cat s_*.bed > ../simpleRepeat.bed cd .. awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ssh hgwdev cd /cluster/data/ponAbe2/bed/simpleRepeat time nice -n +19 hgLoadBed ponAbe2 simpleRepeat \ simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 1036457 elements of size 16 nice -n +19 featureBits ponAbe2 simpleRepeat \ > fb.simpleRepeat.ponAbe2.txt 2>&1 & cat fb.simpleRepeat.ponAbe2.txt # 120386097 bases of 3093572278 (3.891%) in intersection # add the trfMask to the rmsk masked sequence to get our final # masked sequence ssh kkstore02 cd /cluster/data/ponAbe2 time nice -n +19 cat bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed ponAbe2.rmsk.2bit stdin ponAbe2.2bit # can safely ignore the warning about >= 13 fields # measure it time nice -n +19 twoBitToFa ponAbe2.2bit stdout \ | faSize stdin > faSize.ponAbe2.2bit.txt 2>&1 grep masked faSize.ponAbe2.2bit.txt # %45.68 masked total, %50.89 masked real ## clean up the /iscratch/i/ponAbe2/ directory ssh kkr1u00 cd /iscratch/i/ponAbe2 rm -fr * for R in 2 3 4 5 6 7 8 do rsync -a --progress --delete --stats /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/ ssh kkr${R}u00 rmdir /iscratch/i/ponAbe2 done cd .. rmdir ponAbe2 ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-09-19 - Hiram) # Replaced the ponAbe1 blat server with this sequence ssh hgwdev hgsql -e 'update blatServers set db="ponAbe2" where db="ponAbe1"' \ hgcentraltest # Do not need to do this insert, just the update above hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ponAbe2", "blat13", "17784", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ponAbe2", "blat13", "17785", "0", "1");' \ hgcentraltest # test it with some sequence, i.e. the GABRA3 protein from Hg18 # to establish the defaultPos next: ############################################################################ # Reset default position to GABRA3 gene location (like Hg18) hgsql -e \ 'update dbDb set defaultPos="chrX:152299762-152501081" where name="ponAbe2";' \ hgcentraltest ############################################################################ # SWAP BLASTZ Mouse Mm9 (DONE - 2007-09-20 - Hiram) # the original ssh kkstore02 screen # control with a screen cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19 cat fb.mm9.chainPonAbe2Link.txt # 914561309 bases of 2620346127 (34.902%) in intersection # And, for the swap mkdir /cluster/data/ponAbe2/bed/blastz.mm9.swap cd /cluster/data/ponAbe2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \ -chainMinScore=3000 -swap -chainLinearGap=medium \ -bigClusterHub=pk > swap.log 2>&1 & # real 102m23.209s cat fb.ponAbe2.chainMm9Link.txt # 948458190 bases of 3093572278 (30.659%) in intersection # create syntenicNets for 7-way conservation maf use time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \ -continue=syntenicNet -syntenicNet \ -chainMinScore=3000 -swap -chainLinearGap=medium \ -bigClusterHub=pk > synNet.log 2>&1 & # real 80m44.564s ############################################################################ # GENBANK AUTO UPDATE (DONE - 2007-02-20 - Hiram) # Create a lift file as per the procedures for Chimp from the AGP: ssh kolossus cd /cluster/data/ponAbe2 cat ponAbe2.agp | /cluster/bin/scripts/agpToLift -revStrand \ > jkStuff/genbank.lft # MAKE 11.OOC FILE FOR BLAT blat ponAbe2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024 # Wrote 36396 overused 11-mers to 11.ooc # copy that over to /iscratch/i/ponAbe2/ and get that directory copied # to all the Iservers # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add ponAbe2 just after panTro2 # ponAbe2 # Orangutan ponAbe2.serverGenome = /cluster/data/ponAbe2/ponAbe2.2bit ponAbe2.clusterGenome = /iscratch/i/ponAbe2/ponAbe2.2bit ponAbe2.ooc = /iscratch/i/ponAbe2/11.ooc ponAbe2.align.unplacedChroms = chrUn,chr*_random ponAbe2.lift = /cluster/data/ponAbe2/jkStuff/genbank.lft ponAbe2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} ponAbe2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} ponAbe2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} ponAbe2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} ponAbe2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} ponAbe2.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} ponAbe2.downloadDir = ponAbe2 ponAbe2.genbank.est.xeno.load = no ponAbe2.refseq.mrna.native.load = yes ponAbe2.refseq.mrna.xeno.load = yes ponAbe2.refseq.mrna.xeno.loadDesc = yes cvs ci -m "Added ponAbe2." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. With these two lines: # static char *ponAbeNames[] = {"Pongo abelii", "Pongo pygmaeus", # "Pongo pygmaeus pygmaeus", "Pongo pygmaeus abelii", NULL}; cvs ci -m "Added Pongo pygmaeus abelii (Orangutan)." src/lib/gbGenome.c make install-server ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank # 2008-02-14 12:28 add other Pongo names to gbGeneom.c, # ssh to genbank machine, remove data/aligned/*/ponAbe2 # and start this build again. # This is a call to a script that will push our jobs out to the cluster # since it's a big job. time nice -n +19 bin/gbAlignStep -initial ponAbe2 & # logFile: var/build/logs/2008.02.14-12:28:36.ponAbe2.initalign.log # real 702m19.137s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ponAbe2 # logFile: var/dbload/hgwdev/logs/2008.02.22-17:08:01.dbload.log # real 21m12.354s # enable daily alignment and update of hgwdev (DONE - 2007-11-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add ponAbe2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added ponAbe2." etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### ######################################################################### # MAKE DOWNLOADABLE / GOLDENPATH FILES (SCRIPT DONE 10/1/07 angie -- README EDITS TODO) cd /cluster/data/ponAbe2 # Thought this was done in most chrom-based assemblies: ssh -x kkstore02 splitFileByColumn \ `pwd`/bed/simpleRepeat/trfMask.bed `pwd`/bed/simpleRepeat/trfMaskChrom # And these aren't quite where expected, so make some symbolic links: foreach c (5_h2_hap1 \ 6_cox_hap1 6_qbl_hap2 6_cox_hap1_random 6_qbl_hap2_random) set cOldBase = `echo $c | sed -e 's/_random$//;'` set cNewBase = `echo $c | sed -e 's/_.*//;'` ln -s `pwd`/$cOldBase/chr$c.agp $cNewBase/chr$c.agp ln -s `pwd`/$cOldBase/chr$c.fa.out $cNewBase/chr$c.fa.out end ln -s bed/RepeatMasker/ponAbe2.fa.out . makeDownloads.pl ponAbe2 -verbose=2 \ >& jkStuff/downloads.log & tail -f jkStuff/downloads.log #TODO: # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/ponAbe2/goldenPath/database/README.txt # /cluster/data/ponAbe2/goldenPath/bigZips/README.txt # /cluster/data/ponAbe2/goldenPath/chromosomes/README.txt # (The htdocs/goldenPath/ponAbe2/*/README.txt "files" are just links to those.) # *** If you have to make any edits that would always apply to future # assemblies from the same sequencing center, please edit them into # ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help). ######################################################################### ## genscan run (DONE - 2007-11-08 - Hiram) ## create hard masked sequence ssh kkstore02 cd /cluster/data/ponAbe2 for C in `cut -f1 chrom.sizes` do CN=${C/_*} CN=${CN/chr} echo "${CN}/${C}.hard.fa" twoBitToFa -seq=${C} ponAbe2.2bit stdout \ | maskOutFa stdin hard ${CN}/${C}.hard.fa ls -ld "${CN}/${C}.hard.fa" done # And, make sure there aren't any sequences in this lot that have # become all N's with no sequence left in them. This drives genscan nuts faCount ?/*hard.fa ??/*hard.fa > faCount.hard.txt # the lowest three are: egrep -v "^#|^total" faCount.hard.txt \ | awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3 # chr6_qbl_hap2_random 44531 # chr5_h2_hap1 25615 # chrM 16079 # this is OK, the numbers mean there is plenty of sequence in those bits ssh kkr1u00 mkdir /iscratch/i/ponAbe2/hardChunks cd /iscratch/i/ponAbe2/hardChunks # creating 4,000,000 sized chunks, the chroms stay together as # single pieces. The contigs get grouped together into 4,000,000 # sized fasta files. You don't want to break these things up # because genscan will be doing its own internal 2.4 million # window on these pieces, and the gene names are going to be # constructed from the sequence name in these fasta files. cat /cluster/data/ponAbe2/?/*.hard.fa /cluster/data/ponAbe2/??/*.hard.fa \ | faSplit about stdin 4000000 c_ for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/ponAbe2/hardChunks/ done ssh hgwdev mkdir /cluster/data/ponAbe2/bed/genscan cd /cluster/data/ponAbe2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/ponAbe2/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) # Since we split on gaps, we have no chunks like that. You can # verify with faCount on the chunks. ls -1Sr /iscratch/i/ponAbe2/hardChunks/c_*.fa > genome.list # Create run-time script to operate gsBig in a cluster safe manner cat << '_EOF_' > runGsBig #!/bin/csh -fe set runDir = `pwd` set srcDir = $1 set inFile = $2 set fileRoot = $inFile:r mkdir /scratch/tmp/$fileRoot cp -p $srcDir/$inFile /scratch/tmp/$fileRoot pushd /scratch/tmp/$fileRoot /cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000 popd cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt rm -fr /scratch/tmp/$fileRoot '_EOF_' # << happy emacs chmod +x runGsBig cat << '_EOF_' > template #LOOP runGsBig /iscratch/i/ponAbe2/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.list single template jobList para create jobList para try, check, push, check, ... # Completed: 44 of 48 jobs # Crashed: 4 jobs # CPU time in finished jobs: 327299s 5454.99m 90.92h 3.79d 0.010 y # IO & Wait Time: 1091s 18.18m 0.30h 0.01d 0.000 y # Average job time: 7463s 124.39m 2.07h 0.09d # Longest finished job: 93797s 1563.28m 26.05h 1.09d # Submission to last job: 104573s 1742.88m 29.05h 1.21d # four jobs will not finish. They don't finish on kolossus either. # The sequence involved is: # c_12.fa -> chr8 # c_08.fa -> chr6 # c_06.fa -> chr5 # c_04.fa -> chr4 # So, split up these chroms on a couple of the non-bridged gaps ssh kkr1u00 cd /iscratch/i/ponAbe2/hardChunks mkdir ../genscanSplit # examine the gap sizes to see if there is a good size range to # break them up on, c_12 turns out to be about 30,000 faGapSizes c_12.fa # this makes 6 or 7 pieces for each of these faSplit -minGapSize=29999 -lift=../genscanSplit/c_12.lift gap c_12.fa \ 40000000 ../genscanSplit/c_12_ faSplit -minGapSize=29999 -lift=../genscanSplit/c_08.lift gap c_08.fa \ 40000000 ../genscanSplit/c_08_ faSplit -minGapSize=29999 -lift=../genscanSplit/c_06.lift gap c_06.fa \ 40000000 ../genscanSplit/c_06_ faSplit -minGapSize=29999 -lift=../genscanSplit/c_04.lift gap c_04.fa \ 40000000 ../genscanSplit/c_04_ # copy the genscanSplit directory to the other Iservers cd ../genscanSplit for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/ponAbe2/genscanSplit/ done # and run these four items in a small kluster job # can re-use the runGsBig script from before, just a different # source directory location ssh kki mkdir /cluster/data/ponAbe2/bed/genscan/split cd /cluster/data/ponAbe2/bed/genscan/split # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt cat << '_EOF_' > template #LOOP ../runGsBig /iscratch/i/ponAbe2/genscanSplit $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs ls -1Sr /iscratch/i/ponAbe2/genscanSplit/c_*.fa > genome.list gensub2 genome.list single template jobList para create jobList # even after this split, three of these jobs would not complete # Completed: 22 of 25 jobs # Crashed: 3 jobs # CPU time in finished jobs: 17154s 285.89m 4.76h 0.20d 0.001 y # IO & Wait Time: 77s 1.29m 0.02h 0.00d 0.000 y # Average job time: 783s 13.05m 0.22h 0.01d # Longest finished job: 1135s 18.92m 0.32h 0.01d # Submission to last job: 22398s 373.30m 6.22h 0.26d # so, take those three jobs and split them up and run them like this # after all that is done, lifting all results back to their chroms for F in c_04 c_08 c_12 do sort -k1,1 -k4,4n gtf/${F}_*.gtf ../secondSplit/${F}_?.gtf | liftUp -type=.gtf stdout split.lift error stdin \ | sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \ | sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \ > ${F}.gtf sort -k1,1 -k2,2n subopt/${F}_*.bed ../secondSplit/${F}_?.bed | liftUp -type=.bed stdout split.lift error stdin \ | sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \ | sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \ > ${F}.bed cat pep/${F}_*.pep ../secondSplit/${F}_?.pep \ | sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \ | sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \ > ${F}.pep echo $F done # that includes the secondary splits for three jobs too. # the result of all this is copied to ../gtf ../pep and ../subopt # to fully populate the original result, now continuing with the # processing of all results # cat and lift the results into single files ssh kkstore02 cd /cluster/data/ponAbe2/bed/genscan sort -k1,1 -k4.4n gtf/c_*.gtf > genscan.gtf sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed cat pep/c_*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/ponAbe2/bed/genscan ldHgGene ponAbe2 -gtf genscan genscan.gtf # Read 42190 transcripts in 314789 lines in 1 files # 42190 groups 54 seqs 1 sources 1 feature types # 42190 gene predictions hgPepPred ponAbe2 generic genscanPep genscan.pep hgLoadBed ponAbe2 genscanSubopt genscanSubopt.bed # Loaded 517874 elements of size 6 # check the numbers time nice -n +19 featureBits ponAbe2 genscan # 52587049 bases of 3093572278 (1.700%) in intersection ########################################################################### # Blastz Platypus ornAna1 (DONE - 2007-11-20 - Hiram) ssh kkstore05 screen # use screen to control this job mkdir /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13 cd /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13 cat << '_EOF_' > DEF # Orangutan vs. platypus BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Orangutan PonAbe2 SEQ1_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Platypus ornAna1 SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 2585m50.826s # some of the jobs got confused during kk parasol hub reboots, where they # think they aren't done, but they actually are. A 'para recover' finds # nothing to re-run, all the result files are present. # Completed: 263242 of 263488 jobs # Crashed: 246 jobs # CPU time in finished jobs: 48351127s 805852.12m 13430.87h 559.62d 1.533 y # IO & Wait Time: 7609407s 126823.45m 2113.72h 88.07d 0.241 y # Average job time: 213s 3.54m 0.06h 0.00d # Longest finished job: 2146s 35.77m 0.60h 0.02d # Submission to last job: 259048s 4317.47m 71.96h 3.00d # after verifying all jobs are actually finished, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat -bigClusterHub=kk > cat.log 2>&1 & # real 53m18.757s cat fb.ponAbe2.chainOrnAna1Link.txt # 214557704 bases of 3093572278 (6.936%) in intersection mkdir /cluster/data/ornAna1/bed/blastz.ponAbe2.swap cd /cluster/data/ornAna1/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -swap -bigClusterHub=kk > swap.log 2>&1 & # real 120m20.860s cat fb.ornAna1.chainPonAbe2Link.txt # 201368027 bases of 1842236818 (10.931%) in intersection ########################################################################### # Blastz Marmoset calJac1 (DONE - 2007-11-19 - Hiram) ssh kkstore02 screen # use screen to control this job mkdir /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18 cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18 cat << '_EOF_' > DEF # Orangutan vs. Marmoset BLASTZ_M=50 # TARGET: Orangutan PonAbe2 SEQ1_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Marmoset calJac1 SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit SEQ2_LEN=/cluster/data/calJac1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > do.log 2>&1 & # real 1441m # Completed: 97520 of 97520 jobs # CPU time in finished jobs: 21651347s 360855.79m 6014.26h 250.59d 0.687 y # IO & Wait Time: 10087025s 168117.08m 2801.95h 116.75d 0.320 y # Average job time: 325s 5.42m 0.09h 0.00d # Longest finished job: 5645s 94.08m 1.57h 0.07d # Submission to last job: 67173s 1119.55m 18.66h 0.78d cat fb.ponAbe2.chainCalJac1Link.txt # 2310720863 bases of 3093572278 (74.694%) in intersection # reciprocal best net mafs for 7-way multiz time nice -n +19 doRecipBest.pl -workhorse=hgwdev ponAbe2 calJac1 \ > rbest.log 2>&1 # this failed at the download step, expects to run on hgwdev # So, let's see what it makes on hgwdev ssh hgwdev cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18 time nice -n +19 doRecipBest.pl -continue=download -workhorse=hgwdev \ ponAbe2 calJac1 > rbest.download.log 2>&1 & # it populates the directory: # /usr/local/apache/htdocs/goldenPath/ponAbe2/vsCalJac1/reciprocalBest ssh kkstore02 mkdir /cluster/data/calJac1/bed/blastz.ponAbe2.swap cd /cluster/data/calJac1/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=kk > swap.log 2>&1 & # real 341m54.548s cat fb.calJac1.chainPonAbe2Link.txt # 2253236255 bases of 2929139385 (76.925%) in intersection ######################################################################### ## 8-Way Multiz (DONE - 2007-11-27 - Hiram) ## ssh hgwdev mkdir /cluster/data/ponAbe2/bed/multiz8way cd /cluster/data/ponAbe2/bed/multiz8way # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get ponAbe2 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram # select the 8 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Orangutan_ponAbe2,Rhesus_rheMac2,Marmoset_calJac1,Platypus_ornAna1,Opossum_monDom4 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ > 8-way.fullNames.nh # looks something like this: (((Mouse_mm9:0.325818, ((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037, Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500, Marmoset_calJac1:0.070000):0.058454):0.263313, Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110); # rearrange to get Orangutan at the top: # this leaves us with: cat << '_EOF_' > ponAbe2.8-way.nh ((((((Orangutan_ponAbe2:0.020000, (Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037):0.013037, Rhesus_rheMac2:0.031973):0.036500, Marmoset_calJac1:0.070000):0.058454, Mouse_mm9:0.325818):0.263313, Opossum_monDom4:0.320721):0.088647, Platypus_ornAna1:0.488110); '_EOF_' # << happy emacs # create a species list from that file: sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' ponAbe2.8-way.nh \ | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # verify that has 8 db names in it # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' ponAbe2.8-way.nh \ | sed -e "s/ / /g"` > tree.8.nh # that looks like, as a single line: # ((((((ponAbe2 (hg18 panTro2)) rheMac2) calJac1) mm9) monDom4) ornAna1) # verify all blastz's exists cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/ponAbe2/bed/multiz8way foreach db (`cat species.list`) set bdir = /cluster/data/ponAbe2/bed/blastz.$db if (-e $bdir/mafRBestNet/chr1.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/chr1.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/chr1.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh # see what it says, the "mafs not found" should only show up on ponAbe2 ./listMafs.csh # calJac1 mafRBestNet # hg18 mafSynNet # mm9 mafSynNet # monDom4 mafNet # ornAna1 mafNet # panTro2 mafSynNet # ponAbe2 mafs not found # rheMac2 mafSynNet /cluster/bin/phast/all_dists ponAbe2.8-way.nh > 8way.distances.txt grep -i ponabe 8way.distances.txt | sort -k3,3n # Orangutan_ponAbe2 Human_hg18 0.038910 # Orangutan_ponAbe2 Chimp_panTro2 0.040705 # Orangutan_ponAbe2 Rhesus_rheMac2 0.065010 # Orangutan_ponAbe2 Marmoset_calJac1 0.139537 # Orangutan_ponAbe2 Mouse_mm9 0.453809 # Orangutan_ponAbe2 Opossum_monDom4 0.712025 # Orangutan_ponAbe2 Platypus_ornAna1 0.968061 # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. Zebrafish ends up before # tetraodon and fugu on the browser despite its distance. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainPonAbe2Link chain linearGap # distance on PonAbe2 on other minScore # 1 0.038910 Human_hg18 (% 91.302) (% 92.892) 3000 medium # 2 0.040705 Chimp_panTro2 (% 89.431) (% 91.033) 3000 medium # 3 0.065010 Rhesus_rheMac2 (% 82.623) (% 88.157) 3000 medium # 4 0.139537 Marmoset_calJac1 (% 74.694) (% 76.925) 3000 medium # 5 0.453809 Mouse_mm9 (% 30.659) (% 34.902) 3000 medium # 6 0.712025 Opossum_monDom4 (% 11.306) (% 13.137) 5000 loose # 7 0.968061 Platypus_ornAna1 (% 6.936) (% 10.931) 5000 loose # copy net mafs to cluster-friendly storage, splitting chroms # into 50MB chunks to improve run-time # NOTE: splitting will be different for scaffold-based reference asemblies ssh hgwdev mkdir /cluster/data/ponAbe2/bed/multiz8way/run.split cd /cluster/data/ponAbe2/bed/multiz8way/run.split # this works by examining the rmsk table for likely repeat areas # that won't be used in blastz mafSplitPos ponAbe2 50 mafSplit.bed ssh kki cd /cluster/data/ponAbe2/bed/multiz8way/run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set targDb = "ponAbe2" set db = $1 set sdir = /san/sanvol1/scratch/$targDb/splitStrictMafNet mkdir -p $sdir if (-e $sdir/$db) then echo "directory $sdir/$db already exists -- remove and retry" exit 1 endif set bdir = /cluster/data/$targDb/bed/blastz.$db if (! -e $bdir) then echo "directory $bdir not found" exit 1 endif mkdir -p $sdir/$db if (-e $bdir/mafRBestNet) then set mdir = $bdir/mafRBestNet else if (-e $bdir/mafSynNet) then set mdir = $bdir/mafSynNet else if (-e $bdir/mafNet) then set mdir = $bdir/mafNet else echo "$bdir maf dir not found" exit 1 endif echo $mdir foreach f ($mdir/*) set c = $f:t:r:r echo " $c" nice mafSplit mafSplit.bed $sdir/$db/ $f end echo "gzipping $sdir/$db mafs" nice gzip $sdir/$db/* endif echo $mdir > $db.done '_EOF_' # << happy emacs chmod +x doSplit.csh grep -v ponAbe2 ../species.list > split.list cat << '_EOF_' > template #LOOP doSplit.csh $(path1) {check out line+ $(path1).done} #ENDLOOP '_EOF_' gensub2 split.list single template jobList para create jobList # 7 jobs # start these gently, this is a good load on the san filesystem para -maxPush=3 push # wait a while, verify these are running OK para push # let that run to a couple completions, a few minutes, then again: para try # etc ... # Completed: 7 of 7 jobs # CPU time in finished jobs: 6048s 100.81m 1.68h 0.07d 0.000 y # IO & Wait Time: 593s 9.88m 0.16h 0.01d 0.000 y # Average job time: 949s 15.81m 0.26h 0.01d # Longest finished job: 1451s 24.18m 0.40h 0.02d # Submission to last job: 1471s 24.52m 0.41h 0.02d # ready for the multiz run ssh pk cd /cluster/data/ponAbe2/bed/multiz8way # actually, the result directory here should be maf.split instead of maf mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # list chrom chunks, any db dir will do; better would be for the # splitter to generate this file # We temporarily use __ instead of . to delimit chunk in filename # so we can use $(root) to get basename find /san/sanvol1/scratch/ponAbe2/splitStrictMafNet -type f \ | while read F; do basename $F; done \ | sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list wc -l chromChunks.list # 104 cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = ponAbe2 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet rm -fr $tmp mkdir -p $tmp cp ../tree.8.nh ../species.list $tmp pushd $tmp foreach s (`cat species.list`) set c2 = `echo $c | sed 's/__/./'` set in = $pairs/$s/$c2.maf set out = $db.$s.sing.maf if ($s == ponAbe2) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.8.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/ponAbe2/bed/multiz8way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs gensub2 chromChunks.list single template jobList para create jobList # the 104 jobs run time: # Completed: 104 of 104 jobs # CPU time in finished jobs: 174748s 2912.47m 48.54h 2.02d 0.006 y # IO & Wait Time: 2952s 49.20m 0.82h 0.03d 0.000 y # Average job time: 1709s 28.48m 0.47h 0.02d # Longest finished job: 3340s 55.67m 0.93h 0.04d # Submission to last job: 4413s 73.55m 1.23h 0.05d # put the split maf results back together into single chroms ssh kkstore02 cd /cluster/data/ponAbe2/bed/multiz8way # here is where the result directory maf should have already been maf.split mv maf maf.split mkdir maf # going to sort out the redundant header garbage to leave a cleaner maf for C in `ls maf.split | sed -e "s#__.*##" | sort -u` do echo ${C} head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \ sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf done # load tables for a look ssh hgwdev mkdir -p /gbdb/ponAbe2/multiz8way/maf ln -s /cluster/data/ponAbe2/bed/multiz8way/maf/*.maf \ /gbdb/ponAbe2/multiz8way/maf # this generates a large 1 Gb multiz8way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/ponAbe2/multiz8way/maf ponAbe2 multiz8way # real 6m37.534s # Loaded 6981959 mafs in 55 files from /gbdb/ponAbe2/multiz8way/maf # load summary table time nice -n +19 cat /gbdb/ponAbe2/multiz8way/maf/*.maf \ | hgLoadMafSummary ponAbe2 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz8waySummary stdin # Created 1417364 summary blocks from 29928557 components # and 6981421 mafs from stdin # real 21m35.057s # Gap Annotation # prepare bed files with gap info ssh kkstore02 mkdir /cluster/data/ponAbe2/bed/multiz8way/anno cd /cluster/data/ponAbe2/bed/multiz8way/anno mkdir maf run # these actually already all exist from previous multiple alignments for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `grep -v ponAbe2 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh kki cd /cluster/data/ponAbe2/bed/multiz8way/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set dir = /cluster/data/ponAbe2/bed/multiz8way set c = $1 cat $dir/maf/${c}.maf | \ nice mafAddIRows -nBeds=nBeds stdin /cluster/data/ponAbe2/ponAbe2.2bit $2 '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(root1) {check out line+ /cluster/data/ponAbe2/bed/multiz8way/anno/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/ponAbe2/chrom.sizes > chrom.list gensub2 chrom.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 55 of 55 jobs # CPU time in finished jobs: 2278s 37.97m 0.63h 0.03d 0.000 y # IO & Wait Time: 9909s 165.15m 2.75h 0.11d 0.000 y # Average job time: 222s 3.69m 0.06h 0.00d # Longest finished job: 3340s 55.67m 0.93h 0.04d # Submission to last job: 3897s 64.95m 1.08h 0.05d ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way/anno mkdir -p /gbdb/ponAbe2/multiz8way/anno/maf ln -s /cluster/data/ponAbe2/bed/multiz8way/anno/maf/*.maf \ /gbdb/ponAbe2/multiz8way/anno/maf # by loading this into the table multiz8way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/ponAbe2/multiz8way/anno/maf \ ponAbe2 multiz8way # Loaded 7331265 mafs in 55 files from /gbdb/ponAbe2/multiz8way/anno/maf # real 8m31.092s cat /cluster/data/ponAbe2/chrom.sizes | \ awk '{if ($2 > 1000000) { print $1 }}' | while read C do echo /gbdb/ponAbe2/multiz8way/anno/maf/$C.maf done | xargs cat | \ hgLoadMafSummary ponAbe2 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz8waySummary stdin # Created 1417364 summary blocks from 29928557 components and 7330669 # mafs from stdin # by loading this into the table multiz8waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz8way*.tab files in this /scratch/tmp directory rm multiz8way* ############################################################################# ## Annotate 8-way multiple alignment with gene annotations ## (DONE - 2007-11-27 - Hiram) # Gene frames ## survey all genomes to see what type of gene track to use ssh hgwdev mkdir /cluster/data/ponAbe2/bed/multiz8way/frames cd /cluster/data/ponAbe2/bed/multiz8way/frames # dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " echo -n "Tables: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql ponAbe2 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql ponAbe2 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh # given this output, manually sorted for this display: # hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 29037, # refGene: 25902, Mrnas: 209126 # mm9: Tables: knownGene: 49409, mgcGenes: 22950, refGene: 21146, Mrnas: 242485 # monDom4: Tables: ensGene: 33878, refGene: 163, Mrnas: 398 # ornAna1: Tables: ensGene: 25981, refGene: 3, Mrnas: 141 # panTro2: Tables: ensGene: 32852, refGene: 26179, Mrnas: 1297 # rheMac2: Tables: ensGene: 38561, refGene: 426, Mrnas: 3202 # calJac1: Tables: Mrnas: 952 # ponAbe2: Tables: Mrnas: 2 # from this information, conclusions: # use knownGene for hg18, mm9 # use ensGene for monDom4, ornAna1, panTro2, rheMac2 # no annotations for calJac1, ponAbe2 mkdir genes # knownGene for DB in hg18 mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in monDom4 ornAna1 panTro2 rheMac2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done ls -og genes # -rw-rw-r-- 1 2008806 Nov 27 14:37 hg18.gp.gz # -rw-rw-r-- 1 1965274 Nov 27 14:37 mm9.gp.gz # -rw-rw-r-- 1 1751726 Nov 27 14:37 monDom4.gp.gz # -rw-rw-r-- 1 1232719 Nov 27 14:37 ornAna1.gp.gz # -rw-rw-r-- 1 1980696 Nov 27 14:37 panTro2.gp.gz # -rw-rw-r-- 1 1935916 Nov 27 14:37 rheMac2.gp.gz ################################################## # redmine GB - Feature #480 - missing self frames on multiz # NwayFrames tables (DONE 2010-07-29) # re-run the genePredToMafFrames with ponAbe2 genes/ponAbe2 # # fixed up steps: # mv frames.log frames.2007-11-27.log # mv multiz8way.mafFrames.gz multiz8way.mafFrames.2007-11-27.gz # Since ponAbe2 has ensGene: 41488, refGene: 1337, Mrnas: 2153 # use ensGene for ponAbe2: for DB in ponAbe2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done ssh hgwdv sizeG=188743680 export sizeG ulimit -d $sizeG ulimit -v $sizeG cd /cluster/data/ponAbe2/bed/multiz8way/frames # leaving out calJac1 ponAbe2 time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames ponAbe2 stdin stdout ponAbe2 genes/ponAbe2.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz rheMac2 genes/rheMac2.gp.gz panTro2 genes/panTro2.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz8way.mafFrames.gz) > frames.log 2>&1 # real 6m52.150s # see what it looks like in terms of number of annotations per DB: zcat multiz8way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 192293 ponAbe2 # 219627 hg18 # 221459 panTro2 # 230108 rheMac2 # 238734 mm9 # 412384 ornAna1 # 459521 monDom4 ## output from multiz8way.mafFrames.2007-11-27.gz matched # 219627 hg18 # 221459 panTro2 # 230108 rheMac2 # 238734 mm9 # 412384 ornAna1 # 459521 monDom4 # load the resulting file ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way/frames time nice -n +19 hgLoadMafFrames ponAbe2 multiz8wayFrames \ multiz8way.mafFrames.gz # real 0m18.905s # enable the trackDb entries: # frames multiz8wayFrames # irows on ############################################################################# # phastCons 8-way (DONE - 2007-10-16 - Hiram) # split 8way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh kki mkdir /cluster/data/ponAbe2/bed/multiz8way/msa.split cd /cluster/data/ponAbe2/bed/multiz8way/msa.split mkdir -p /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/ponAbe2/bed/multiz8way/maf set WINDOWS = /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss pushd $WINDOWS set c = $1 rm -fr $c mkdir $c twoBitToFa -seq=$c /scratch/data/ponAbe2/ponAbe2.2bit /scratch/tmp/ponAbe2.$c.fa # need to truncate odd-ball scaffold/chrom names that include dots # as phastCons utils can't handle them set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$ perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF /cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \ -M /scratch/tmp/ponAbe2.$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 rm -f $CLEAN_MAF /scratch/tmp/ponAbe2.$c.fa popd date >> $c.done '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../maf | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # push these with a -maxPush number that would put a single job on # each iserver, you don't want two jobs at a time on each iserver, # they consume too much memory and cause swapping to the point where # they slow to a crawl and take hours to finish. # Completed: 55 of 55 jobs # CPU time in finished jobs: 2679s 44.65m 0.74h 0.03d 0.000 y # IO & Wait Time: 2980s 49.67m 0.83h 0.03d 0.000 y # Average job time: 103s 1.71m 0.03h 0.00d # Longest finished job: 716s 11.93m 0.20h 0.01d # Submission to last job: 10391s 173.18m 2.89h 0.12d # XXXX Estimates were attempted, not really very useful, instead, as seen # below, merely take the cons and noncons trees from the mouse 30-way # Estimate phastCons parameters # see also: # http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html # Create a list of .ss files over 3,000,000 in length # this is almost everything cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss ls -1l chr*/chr*.ss | egrep -v "_hap|chrUn|random" | \ awk '$5 > 3000000 {print $9;}' > ../tuningRun.list # Set up parasol directory to calculate trees on these 50 regions ssh kk mkdir /cluster/data/ponAbe2/bed/multiz8way/treeRun2 cd /cluster/data/ponAbe2/bed/multiz8way/treeRun2 mkdir tree log most # Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set SAN="/san/sanvol1/scratch/ponAbe2/multiz8way/cons" set SS=$1 set C=$1:h set F=$1:t set tmpDir="/scratch/tmp/pA2_$2" rm -fr $tmpDir mkdir $tmpDir mkdir -p log/${C} tree/${C} most/${C} scp -p hiram@pk:$SAN/ss/$1 $tmpDir/$F scp -p hiram@pk:$SAN/estimate/starting-tree.mod $tmpDir pushd $tmpDir /cluster/bin/phast/$MACHTYPE/phastCons $F starting-tree.mod \ --gc 0.355 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 45 --target-coverage 0.3 --most-conserved $F.most \ --quiet --log $F.log --estimate-trees $F.tree popd cp -p $tmpDir/$F.log log/$C cp -p $tmpDir/$F.most most/$C cp -p $tmpDir/$F.tree.*cons.mod tree/$C rm -fr $tmpDir '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) $(num1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it scp -p hiram@pk:/san/sanvol1/scratch/ponAbe2/multiz8way/cons/tuningRun.list . gensub2 tuningRun.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 51439s 857.32m 14.29h 0.60d 0.002 y # IO & Wait Time: 948s 15.79m 0.26h 0.01d 0.000 y # Average job time: 1048s 17.46m 0.29h 0.01d # Longest finished job: 1388s 23.13m 0.39h 0.02d # Submission to last job: 4376s 72.93m 1.22h 0.05d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt sort -k1,1 -k2,2n most/chr*/*.most > mostConserved.bed wc -l mostConserved.bed # at --target_coverage .3 and --expected_length 45 # 1159312 most conserved elements # primates only at --target_coverage .75 and --expected_length 1000 # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy .3 45 --NH 9.78 \ ave.cons.mod ave.noncons.mod # ( Solving for new omega: 45.000000 21.095519 13.646969 11.146376 # 10.499722 10.437794 10.437185 ) # Transition parameters: gamma=0.300000, omega=45.000000, # mu=0.022222, nu=0.009524 # Relative entropy: H=0.415553 bits/site # Expected min. length: L_min=30.634230 sites # Expected max. length: L_max=24.942235 sites # Phylogenetic information threshold: PIT=L_min*H=12.730148 bits # Recommended expected length: omega=10.437185 sites (for L_min*H=9.780000) # and checking the most conserved areas, on hgwdev: # at --target_coverage .3 and --expected_length 45 featureBits -noRandom -noHap ponAbe2 mostConserved.bed # 221759187 bases of 2723012131 (8.144%) in intersection featureBits -noRandom -noHap -enrichment ponAbe2 genscan:cds \ mostConserved.bed # genscan:cds 1.688%, mostConserved.bed 8.144%, both 0.146%, # cover 8.63%, enrich 1.06x # primates only at --target_coverage .75 and --expected_length 1000 featureBits -noRandom -noHap ponAbe2 mostConserved.bed # 524207695 bases of 2723012131 (19.251%) in intersection featureBits -enrichment -noRandom -noHap ponAbe2 genscan:cds \ mostConserved.bed # genscan:cds 1.688%, mostConserved.bed 19.251%, both 0.314%, # cover 18.61%, enrich 0.97x # Estimates could be made, but more correctly, take the 30-way # .mod file, and re-use it here. ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . ssh hgwdev screen # use screen to manage this long running job cd /cluster/data/ponAbe2/bed/multiz8way/msa.split # using one of the sections of chr1 as an example time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss/chr1/chr1.50000209-60000000.ss \ --tree \ "((((((ponAbe2,(hg18,panTro2)),rheMac2),calJac1),mm9),monDom4),ornAna1)" \ --out-root starting-tree # real 1m35.979s # create a single ss file from a whole chrom maf file /cluster/bin/phast/$MACHTYPE/msa_split chr1.maf -i MAF -M chr1.fa \ -o SS -r chr1 -w 300000000,0 -I 1000 -B 5000 # run a phyloFit on that large .ss file to find a starting-tree time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \ chr1.1-229942017.ss --tree \ "((((((ponAbe2,(hg18,panTro2)),rheMac2),calJac1),mm9),monDom4),ornAna1)" \ --out-root starting-tree # Reading alignment from chr1.1-229942017.ss ... # Compacting sufficient statistics ... # Fitting tree model to chr1.1-229942017.ss using REV ... # Writing model to starting-tree.mod ... # Done. # real 5m30.470s # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.355 # This 0.355 is used in the --gc argument below # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh pk mkdir -p /cluster/data/ponAbe2/bed/multiz8way/cons/run.cons cd /cluster/data/ponAbe2/bed/multiz8way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all gliers placentals cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.2007-05-04 set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/ponAbe2/bed/multiz8way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/ponAbe2/multiz8way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf . cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod . cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c sleep 4 touch $san/$grp/pp/$c $san/$grp/bed/$c rm -f $san/$grp/pp/$c/$f.pp rm -f $san/$grp/bed/$c/$f.bed mv $tmp/$f.pp $san/$grp/pp/$c mv $tmp/$f.bed $san/$grp/bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # it may be better to have the result check file be the .pp file, since # this .bed file is sometimes empty even though there are .pp results cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/ponAbe2/multiz8way/cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/ponAbe2/bed/multiz8way/cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \ --prune-all-but=ponAbe2,hg18,panTro2,rheMac2,calJac1,mm9,monDom4,ornAna1 \ > all.mod cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # it may be better to have the result check file be the .pp file, since # this .bed file is sometimes empty even though there are .pp results # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 368 of 368 jobs # CPU time in finished jobs: 11954s 199.23m 3.32h 0.14d 0.000 y # IO & Wait Time: 3688s 61.47m 1.02h 0.04d 0.000 y # Average job time: 43s 0.71m 0.01h 0.00d # Longest finished job: 52s 0.87m 0.01h 0.00d # Submission to last job: 577s 9.62m 0.16h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/ponAbe2/bed/multiz8way/cons/all # load into database ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way/cons/all time nice -n +19 hgLoadBed ponAbe2 phastConsElements8way mostConserved.bed # Loaded 1096981 elements of size 5 # Try for 5% overall cov, and 70% CDS cov # We don't have and gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits ponAbe2 phastConsElements8way # 134611760 bases of 3093572278 (4.351%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons8wayScores for D in pp/chr* do C=${D/pp\/} out=phastCons8wayScores/${C}.data.gz echo "${D} > ${C}.data.gz" ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \ gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 38m22.760s # copy the phastCons8wayScores to: # /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all time nice -n +19 ls phastCons8wayScores/*.data.gz | xargs zcat \ | wigEncode -noOverlap stdin phastCons8way.wig phastCons8way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 23m55.813s time nice -n +19 cp -p *.wi? /cluster/data/ponAbe2/bed/multiz8way/cons/all # real 2m2.107s # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way/cons/all ln -s `pwd`/phastCons8way.wib /gbdb/ponAbe2/multiz8way/phastCons8way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/multiz8way ponAbe2 \ phastCons8way phastCons8way.wig # real 1m27.666s # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ponAbe2 phastCons8way > histogram.data 2>&1 # real 5m0.608s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Orangutan PonAbe2 Histogram phastCons8way track" set xlabel " phastCons8way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ### Create a phastCons data set for primates-only # XXX - this was only a test, this data was not used # setup primates-only run ssh pk cd /cluster/data/ponAbe2/bed/multiz8way/cons mkdir primates run.cons/primates cd primates # primates-only: exclude all but these for phastCons tree: /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \ --prune-all-but=ponAbe2,hg18,panTro2,rheMac2,calJac1 \ > primates.mod # and place the removed ones in the non-inf file so phastCons will # truly ignore them: echo "monDom4,ornAna1" > primates.non-inf cd ../run.cons/primates # root1 == chrom name, file1 == ss file name without .ss suffix # it may be better to have the result check file be the .pp file, since # this .bed file is sometimes empty even though there are .pp results # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Four of these jobs fail to produce any output: # chr5_h2_hap1/chr5_h2_hap1.1-191932.bed # chr6_cox_hap1_random/chr6_cox_hap1_random.1-239941.bed # chr6_qbl_hap2_random/chr6_qbl_hap2_random.1-120011.bed # chrM/chrM.1-16499.bed # this chrM missing one is interesting. It should have something ? # I don't know. They all have data in their pp files. # Completed: 364 of 368 jobs # Crashed: 4 jobs # CPU time in finished jobs: 11969s 199.49m 3.32h 0.14d 0.000 y # IO & Wait Time: 3757s 62.61m 1.04h 0.04d 0.000 y # Average job time: 43s 0.72m 0.01h 0.00d # Longest finished job: 56s 0.93m 0.02h 0.00d # Submission to last job: 2362s 39.37m 0.66h 0.03d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/euarchontoglires cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/ponAbe2/bed/multiz8way/cons/primates # load into database ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way/cons/primates time nice -n +19 hgLoadBed ponAbe2 phastConsElements8wayPrimates \ mostConserved.bed # Loaded 298598 elements of size 5 # real 0m8.768s # verify coverage featureBits ponAbe2 phastConsElements8wayPrimates # 92315637 bases of 3093572278 (2.984%) in intersection # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates mkdir downloads for D in pp/chr* do C=${D/pp\//} ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \ > downloads/${C}.primates.pp.data.gz echo $D $C done done # Create merged posterier probability file and wiggle track data files cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates time nice -n +19 ls downloads/chr*.data.gz | xargs zcat \ | wigEncode -noOverlap stdin \ phastCons8wayPrimates.wig phastCons8wayPrimates.wib # Converted stdin, upper limit 1.00, lower limit 0.00 ## load table with wiggle data ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way/cons/primates cp -p /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates/*.wi? . ln -s `pwd`/phastCons8wayEuarch.wib \ /gbdb/ponAbe2/multiz8way/phastCons8wayPrimates.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/multiz8way ponAbe2 \ phastCons8wayPrimates phastCons8wayPrimates.wig # real 0m44.161s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ponAbe2 phastCons8wayPrimates > histogram.data 2>&1 # real 5m15.581s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Orantutan PonAbe2 Histogram phastCons8wayPrimates track" set xlabel " phastCons8wayPrimates score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# ## Create downloads for 8-way business (DONE - 2007-12-27 - Hiram) ssh kkstore02 mkdir -p /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf cp -p /cluster/data/ponAbe2/bed/multiz8way/anno/maf/*.maf . time nice -n +19 gzip -v *.maf # creating upstream files ssh hgwdev cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf # creating upstream files from xenoRefGene, bash script: DB=ponAbe2 GENE=xenoRefGene NWAY=multiz8way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done # ssh kkstore02 cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf md5sum *.maf.gz > md5sum.txt cd .. ln -s ../../ponAbe2.8-way.nh ./8way.nh ln -s $HOME/kent/src/hg/makeDb/trackDb/orangutan/ponAbe2/multiz8way.html . # copy README.txt from mm9 30-way downloads and edit to represent # this directory cd .. mkdir -p phastCons8way/phastConsScores cd phastCons8way/phastConsScores cp -p \ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/phastCons8wayScores/*.data.gz . md5sum *.data.gz > md5sum.txt cd .. ln -s ../../cons/all/all.mod 8way.mod # copy README.txt from mm9 30way downloads and edit to represent # this directory # enable them for hgdownload rsync transfer ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/ponAbe2 mkdir multiz8way phastCons8way cd multiz8way ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/8way.nh . ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/*.html . ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/R*.txt . mkdir maf cd maf ln -s \ /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf/*.maf.gz . ln -s \ /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf/md5sum.txt cd ../../phastCons8way mkdir phastConsScores cd phastConsScores ln -s \ /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores/*.gz . ln -s \ /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores/md5sum.txt . cd .. ln -s \ /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/8way.mod . ln -s \ /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/README.txt . ########################################################################### # BLASTZ SWAP Human Hg18 (DONE - 2007-10-05 - Hiram) ssh kkstore02 screen # use a screen to control this job # the original blastz cd /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02 cat fb.hg18.chainPonAbe2Link.txt # 2676696124 bases of 2881515245 (92.892%) in intersection # And the swap mkdir /cluster/data/ponAbe2/bed/blastz.hg18.swap cd /cluster/data/ponAbe2/bed/blastz.hg18.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 123m9.197s cat fb.ponAbe2.chainHg18Link.txt # 2824501297 bases of 3093572278 (91.302%) in intersection # create syntenicNets for 7-way conservation maf use time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=syntenicNet -syntenicNet \ -swap -bigClusterHub=pk > synNet.log 2>&1 & # real 91m18.097s ########################################################################### # BLASTZ SWAP Chimp PanTro2 (DONE - 2007-10-16 - Hiram) ssh kkstore04 screen # use screen to control this job # the original blastz cd /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13 cat fb.panTro2.chainPonAbe2Link.txt # 2648603020 bases of 2909485072 (91.033%) in intersection # and, for the swap mkdir /cluster/data/ponAbe2/bed/blastz.panTro2.swap cd /cluster/data/ponAbe2/bed/blastz.panTro2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=kk > swap.log 2>&1 & # real 160m51.639s cat fb.ponAbe2.chainPanTro2Link.txt # 2766615040 bases of 3093572278 (89.431%) in intersection # real 2597m24.771s # create syntenicNets for 7-way conservation maf use time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \ -continue=syntenicNet -syntenicNet \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=kk > synNet.log 2>&1 & # real 131m30.045s ########################################################################### # BLASTZ SWAP Rhesus rheMac2 (DONE - 2007-10-18 - Hiram) ssh kkstore01 screen # use screen to control this job cd /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16 cat fb.rheMac2.chainPonAbe2Link.txt # 2333264093 bases of 2646704109 (88.157%) in intersection # and, for the swap mkdir /cluster/data/ponAbe2/bed/blastz.rheMac2.swap cd /cluster/data/ponAbe2/bed/blastz.rheMac2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > swap.log 2>&1 & # real 289m9.479s cat fb.ponAbe2.chainRheMac2Link.txt # 2556010573 bases of 3093572278 (82.623%) in intersection # create syntenicNets for 7-way conservation maf use time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \ -continue=syntenicNet -syntenicNet \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -bigClusterHub=kk > synNet.log 2>&1 & # real 62m10.816s ########################################################################### # BLASTZ SWAP Opossum monDom4 (DONE - 2007-11-26 - Hiram) ssh kkstore04 screen # use screen to control this job # the original blastz cd /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22 cat fb.monDom4.chainPonAbe2Link.txt # 395887393 bases of 3501643220 (11.306%) in intersection # and for the swap mkdir /cluster/data/ponAbe2/bed/blastz.monDom4.swap cd /cluster/data/ponAbe2/bed/blastz.monDom4.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \ /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22/DEF \ -swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > swap.log 2>&1 & # real 223m38.325s cat fb.ponAbe2.chainMonDom4Link.txt # 406409435 bases of 3093572278 (13.137%) in intersection ########################################################################### ## PUSHQ entry creation (DONE - 2007-12-12 - Hiram) ssh hgwdev mkdir /cluster/data/ponAbe2/pushQ cd /cluster/data/ponAbe2/pushQ # had to touch # goldenPath/ponAbe2/liftOver/ponAbe2ToPonAbe1.over.chain.gz # to get it to pass that check. (there isn't any ponAbe1 release) /cluster/bin/scripts/makePushQSql.pl -noGenbank ponAbe2 \ > ponAbe2.sql 2> stderr.out ## check the stderr.out for anything that needs to be fixed ## copy ponAbe2.sql to hgwbeta:/tmp scp ponAbe2.sql hgwbeta:/tmp ## then on hgwbeta ssh hgwbeta cd /tmp ### XXX remember to set the pushQ id number in the # last statement in this SQL file hgsql qapushq < ponAbe2.sql ########################################################################### # Blastz Chicken galGal3 (DONE - 2007-12-13 - Hiram) ssh kkstore03 screen # use screen to control this job mkdir /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13 cd /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13 cat << '_EOF_' > DEF # Orangutan vs. chicken BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Orangutan PonAbe2 SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom SEQ2_DIR=/scratch/hg/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # testing some changes to the script time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 407m0.026s # some of the jobs failed due to out of memory on kk nodes # did a para recover and ran the last set of jobs on pk # then, continuing and testing the new memk kluster: time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -continue=cat `pwd`/DEF -chainMinScore=5000 -smallClusterHub=memk \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > cat.log 2>&1 & # failed when it came to the kolossus run because kkr1u00 was out # which provides some filesystems to kolossus. Got that fixed # and finished the netChains.csh script manually. # real 6m2.271s # Then, continuing: time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -continue=load `pwd`/DEF -chainMinScore=5000 -smallClusterHub=memk \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > load.log 2>&1 & # real 4m21.364s cat fb.ponAbe2.chainGalGal3Link.txt # 134448942 bases of 3093572278 (4.346%) in intersection mkdir /cluster/data/galGal3/bed/blastz.ponAbe2.swap cd /cluster/data/galGal3/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13/DEF \ -chainMinScore=5000 -chainLinearGap=loose -smallClusterHub=memk \ -swap -bigClusterHub=kk > swap.log 2>&1 & cat fb.galGal3.chainPonAbe2Link.txt # 115578761 bases of 1042591351 (11.086%) in intersection ########################################################################### # Add quality score track (DONE - 2008-03-10 - Hiram) ssh kkstore02 cd /cluster/data/ponAbe2/wustl # use mScore=98 since the "Finished" contigs are not in the qual file time nice -n +19 zcat contigs.fa.qual.gz \ | qaToQac stdin stdout \ | qacAgpLift -mScore=98 ../ponAbe2.agp stdin ponAbe2.qual.qac # 3m35.075s mkdir /cluster/data/ponAbe2/bed/qual cd /cluster/data/ponAbe2/bed/qual time nice -n +19 qacToWig -fixed \ /cluster/data/ponAbe2/wustl/ponAbe2.qual.qac stdout \ | wigEncode stdin qual.wig qual.wib # real 21m33.899s ssh hgwdev cd /cluster/data/ponAbe2/bed/qual rm -f /gbdb/ponAbe2/wib/qual.wib ln -s /cluster/data/ponAbe2/bed/qual/qual.wib /gbdb/ponAbe2/wib/ time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/wib \ ponAbe2 quality qual.wig # real 1m30.877s ############################################################################# # N-SCAN gene predictions (nscanGene) - (2008-03-11 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/ponAbe2/bed/nscan/ wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/ponAbe2.gtf wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/ponAbe2.prot.fa wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/readme.html bzip2 ponAbe2.* chmod a-w * # load track gtfToGenePred -genePredExt ponAbe2.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt ponAbe2 nscanGene stdin hgPepPred ponAbe2 generic nscanPep ponAbe2.prot.fa.bz2 rm *.tab # update trackDb; need a ponAbe2-specific page to describe informants orangutan/ponAbe2/nscanGene.html (copy from readme.html) orangutan/ponAbe2/trackDb.ra # set search regex to termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9] ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # Create quality download files (DONE - 2008-10-14 - Hiram) cd /hive/data/genomes/ponAbe2/bed/qual qacToQa /hive/data/genomes/ponAbe2/wustl/ponAbe2.qual.qac ponAbe2.qa.fa cat << '_EOF_' > splitByName.pl #!/usr/bin/env perl use warnings; use strict; my $chrom = ""; while (my $line = <>) { if ($line =~ m/^>/) { close FH if (length($chrom) > 0); chomp $line; $chrom = $line; $chrom =~ s/>//; $chrom =~ s/\s+//g; open (FH, "| gzip -c > qa/$chrom.qa.fa.gz") or die "can not write to qa/$chrom.qa.fa.gz"; print FH ">$chrom\n"; } else { print FH "$line"; } } close (FH); '_EOF_' # << happy emacs chmod +x ./splitByName.pl mkdir qa ./splitByName.pl < ponAbe2.qa.fa cd qa tar cvzf ../ponAbe2.quality.fa.gz ./*.fa.gz # check byte counts with these files to make sure they are OK for F in *.fa.gz; do zcat $F; done | wc # 172337823 3446754951 10512603041 cd .. wc ponAbe2.qa.fa # 172337823 3446754951 10512603041 ponAbe2.qa.fa # identical counts, OK. ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11 cd /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11 # same kind of parameters as used in human<->marmoset cat << '_EOF_' > DEF # Orangutan vs. marmoset BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # and place those items here BLASTZ_O=600 BLASTZ_E=150 BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Orangutan PonAbe2 SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit SEQ1_LEN=/scratch/data/ponAbe2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Marmoset (calJac3) SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 329m55.081s cat fb.ponAbe2.chainCalJac3Link.txt # 2086557592 bases of 3093572278 (67.448%) in intersection mkdir /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap cd /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 146m12.301s cat fb.calJac3.chainHg19Link.txt # 1978857628 bases of 2752505800 (71.893%) in intersection ############################################################################ # LASTZ Gorilla GorGor3 (DONE - 2011-10-21,23 - Hiram) mkdir /hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21 cd /hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21 cat << '_EOF_' > DEF # Orangutan vs gorilla # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Orangutan PonAbe2 SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit SEQ1_LEN=/scratch/data/ponAbe2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Frog gorGor3 SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # 1441m58s cat fb.ponAbe2.chainGorGor3Link.txt # 2584628377 bases of 3093572278 (83.548%) in intersection cd /hive/data/genomes/ponAbe2/bed ln -s lastzGorGor3.2011-10-21 lastz.gorGor3 # running the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.ponAbe2.swap cd /hive/data/genomes/gorGor3/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 86m19.901s cat fb.gorGor3.chainPonAbe2Link.txt # 2437562544 bases of 2822760080 (86.354%) in intersection ############################################################################ # LASTZ Mouse Mm10 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08 cd /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08 cat << '_EOF_' > DEF # chimp vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Orangutan PonAbe2 SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit SEQ1_LEN=/scratch/data/ponAbe2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Mouse Mm10 SEQ2_DIR=/scratch/data/mm10/mm10.2bit SEQ2_LEN=/scratch/data/mm10/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S ponAbe2Mm10 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 636m35.769s cat fb.ponAbe2.chainMm10Link.txt # 946932454 bases of 3093572278 (30.610%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ponAbe2/bed ln -s lastzMm10.2012-03-08 lastz.mm10 mkdir /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap cd /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m38.550s cat fb.mm10.chainPonAbe2Link.txt # 915093866 bases of 2652783500 (34.496%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s blastz.ponAbe2.swap lastz.ponAbe2 ########################################################################## # LASTZ Rhesus rheMac3 Swap (DONE - 2012-04-26 - Chin) cd /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18 cat fb.rheMac3.chainPonAbe2Link.txt # 2354152230 bases of 2639145830 (89.201%) in intersection # and, for the swap mkdir /hive/data/genomes/ponAbe2/bed/blastz.rheMac3.swap cd /hive/data/genomes/ponAbe2/bed/blastz.rheMac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18/DEF \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 289m9.479s cat fb.ponAbe2.chainRheMac3Link.txt # 2565157739 bases of 3093572278 (82.919%) in intersection cd /hive/data/genomes/ponAbe2/bed ln -s blastz.rheMac3.swap lastz.rheMac3 ##############################################################################