# for emacs: -*- mode: sh; -*- # This file describes browser build for the Lancelet # genome, Branchiostoma floridae, March 2006, v.1.0 from JGI # # "$Id: braFlo1.txt,v 1.19 2009/11/25 21:48:38 hiram Exp $" # ########################################################################## ### Fetch sequence (DONE - 2007-03-26 - Hiram) ssh kkstore05 mkdir /cluster/store12/braFlo1 ln -s /cluster/store12/braFlo1 /cluster/data/braFlo1 mkdir /cluster/data/braFlo1/downloads cd /cluster/data/braFlo1/downloads wget --timestamping \ 'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/Branchiostoma.fasta.gz' wget --timestamping \ 'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/proteins.Brafl1.fasta.gz' wget --timestamping \ 'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/transcripts.Brafl1.fasta.gz' gunzip Branchiostoma.fasta.gz scaffoldFaToAgp Branchiostoma.fasta ########################################################################## ## Set up initial database (DONE - 2007-03-26 - Hiram) cd /cluster/data/braFlo1 cat << '_EOF_' > braFlo1.config.ra # Config parameters for makeGenomeDb.pl: db braFlo1 clade deuterostome genomeCladePriority 20 scientificName Branchiostoma floridae commonName Lancelet assemblyDate Mar. 2006 assemblyLabel JGI v.1.0 (March 2006) orderKey 799 mitoAcc 5881414 fastaFiles /cluster/data/braFlo1/downloads/Branchiostoma.fasta agpFiles /cluster/data/braFlo1/downloads/Branchiostoma.agp # qualFiles none dbDbSpeciesDir lancelet '_EOF_' time nice -n +19 makeGenomeDb.pl braFlo1.config.ra > makeGenomeDb.out 2>&1 hgsql -e 'INSERT INTO chrM_gold (bin, chrom, chromStart, chromEnd, ix, type, frag, fragStart, fragEnd, strand) VALUES (585, "chrM", 0, 15083, 1, "F", "NC_000834.1", 0, 15083, "+");' braFlo1 ## done with this sequence cd downloads gzip Branchiostoma.fasta ########################################################################## ## RepeatMasker (DONE - 2007-03-26 - Hiram) ssh kkstore05 mkdir /cluster/data/braFlo1/bed/RepeatMasker cd /cluster/data/braFlo1/bed/RepeatMasker time nice -n +10 doRepeatMasker.pl braFlo1 \ -buildDir=/cluster/data/braFlo1/bed/RepeatMasker > do.log 2>&1 & # about 11 minutes cd /cluster/data/braFlo1 twoBitToFa braFlo1.rmsk.2bit stdout | faSize -showPercent stdin # 926386587 bases (95184565 N's 831202022 real # 806238402 upper 24963620 lower) in 2 sequences in 1 files # twoBitToFa braFlo1.rmsk.2bit stdout | faSize -showPercent stdin # %2.69 masked total, %3.00 masked real ######################################################################### # SIMPLE REPEATS (TRF) (DONE 2007-01-22 - Hiram) ssh kolossus mkdir /cluster/data/braFlo1/bed/simpleRepeat cd /cluster/data/braFlo1/bed/simpleRepeat for C in chrM chrUn do mkdir /scratch/tmp/braFlo1.trf twoBitToFa -seq="${C}" ../../braFlo1.unmasked.2bit \ /scratch/tmp/braFlo1.trf/${C}.fa time nice -n 19 trfBig -trf=/cluster/bin/i386/trf \ /scratch/tmp/braFlo1.trf/${C}.fa /dev/null \ -bedAt=${C}.bed -tempDir=/scratch/tmp/braFlo1.trf > ${C}.log 2>&1 rm -fr /scratch/tmp/braFlo1.trf echo ${C} done done # real 187m42.772s # no repeats found on chrM # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' chrUn.bed > trfMask.bed # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed braFlo1 simpleRepeat \ /cluster/data/braFlo1/bed/simpleRepeat/chrUn.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql ################################################ ## WINDOWMASKER (DONE - 2007-03-26 - Hiram) ssh kkstore05 mkdir /cluster/data/braFlo1/bed/WindowMasker cd /cluster/data/braFlo1/bed/WindowMasker time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl braFlo1 \ -buildDir=/cluster/data/braFlo1/bed/WindowMasker \ -workhorse=kolossus > wmRun.log 2>&1 & # real 82m55.387s # Masking statistics twoBitToFa braFlo1.wmsk.sdust.2bit stdout | faSize stdin # 926386587 bases (95184565 N's 831202022 real # 611909619 upper 219292403 lower) in 2 sequences in 1 files # %23.67 masked total, %26.38 masked real ssh hgwdev time nice -n +19 \ hgLoadBed -strict braFlo1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 4263263 elements of size 3 # eliminate the gaps from the masking (WM bug) featureBits braFlo1 -not gap -bed=notGap.bed # 831202022 bases of 831202022 (100.000%) in intersection time nice -n +19 featureBits braFlo1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 219292403 bases of 831202022 (26.383%) in intersection # reload track to get it clean hgLoadBed braFlo1 windowmaskerSdust cleanWMask.bed.gz # Loaded 4255796 elements of size 4 featureBits braFlo1 windowmaskerSdust # 219292403 bases of 831202022 (26.383%) in intersection ############################################################################## ## combine trf mask with windowmasker (DONE - 2008-05-23 - Hiram) ssh kkstore05 cd /cluster/data/braFlo1 zcat bed/WindowMasker/cleanWMask.bed.gz \ | twoBitMask braFlo1.unmasked.2bit stdin \ -type=.bed braFlo1.cleanWMSdust.2bit cat bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed braFlo1.cleanWMSdust.2bit stdin braFlo1.2bit # can safely ignore the warning about BED file >= 13 fields # check it twoBitToFa braFlo1.2bit stdout | faSize stdin # 926386587 bases (95184565 N's 831202022 real 611655540 upper # 219546482 lower) in 2 sequences in 1 files # %23.70 masked total, %26.41 masked real # create gbdb symlink ssh hgwdev ln -s /cluster/data/braFlo1/braFlo1.2bit /gbdb/braFlo1 ######################################################################### ## Create masked scaffolds-only 2bit file (DONE - 2007-03-26 - Hiram) ssh kkstore05 mkdir /cluster/data/braFlo1/scaffolds cp -p /cluster/data/oryLat1/jkStuff/lft2BitToFa.pl \ /cluster/data/braFlo1/jkStuff cp -p /cluster/data/oryLat1/jkStuff/agpToLift.pl \ /cluster/data/braFlo1/jkStuff cd /cluster/data/braFlo1/scaffolds ln -s ../Un/chrUn.agp . ../jkStiff/agpToLift.pl chrUn.agp > braFlo1.lift ../jkStuff/lft2BitToFa.pl ../braFlo1.2bit braFlo1.lift \ > chrUn.scaffolds.fa twoBitToFa ../braFlo1.2bit chrM.fa faToTwoBit chrUn.scaffolds.fa chrM.fa braFlo1UnScaffolds.2bit twoBitInfo braFlo1UnScaffolds.2bit stdout | sort -k2nr \ > braFlo1UnScaffolds.sizes cp -p braFlo1UnScaffolds.sizes braFlo1UnScaffolds.2bit braFlo1.lift \ /san/sanvol1/scratch/braFlo1 ######################################################################### ## Reload rmsk to take care of empty chrM (DONE - 2007-03-26 - Hiram) ssh hgwdev cd /cluster/data/braFlo1/bed/RepeatMasker hgLoadOut -table=rmsk -nosplit braFlo1 braFlo1.fa.out hgsql -e "drop table chrUn_rmsk;" braFlo1 # before: featureBits braFlo1 rmsk # 24973684 bases of 923355587 (2.705%) in intersection # after: featureBits braFlo1 rmsk # 24973684 bases of 923355587 (2.705%) in intersection ######################################################################### ## BLASTZ HUMAN HG18 (DONE - 2007-03-26 - Hiram) # measurement on hg18 ssh kkstore02 cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 cat fb.hg18.chainBraFlo1Link.txt # 26455595 bases of 2881515245 (0.918%) in intersection # and now the swap mkdir /cluster/data/braFlo1/bed/blastz.hg18.swap cd /cluster/data/braFlo1/bed/blastz.hg18.swap time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \ /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 \ -swap > swap.log 2>&1 & # real 83m46.258s cat fb.braFlo1.chainHg18Link.txt # 30912893 bases of 923355587 (3.348%) in intersection # create reciprocal best chains/nets for 5-way maf alignments ssh hgwdev cd /cluster/data/braFlo1/bed/blastz.hg18.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 hg18 \ time nice -n +19 $HOME/kent/src/hg/utils/automation/doRecipBest.pl \ braFlo1 hg18 > rbest.log 2>&1 & # real 34m14.843s # real 32m11.695s # This did not work right. Something went haywire somewhere: # Error: braFlo1 rbest net coverage 14190116 != hg18 14270477 ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram) # Use -repMatch=328 (based on size -- for human we use 1024, and # lancelet size is ~32% of human judging by gapless braFlo1 vs. hg18 # genome sizes from featureBits. ssh kolossus cd /cluster/data/braFlo1 blat braFlo1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=328 # Wrote 9536 overused 11-mers to jkStuff/11.ooc cp -p jkStuff/11.ooc /san/sanvol1/scratch/braFlo1 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-03-26,27 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add braFlo1 just after gasAcu1 # braFlo1 (Branchiostoma floridae - Lancelet) braFlo1.serverGenome = /cluster/data/braFlo1/braFlo1.2bit braFlo1.clusterGenome = /san/sanvol1/scratch/braFlo1/braFlo1.2bit braFlo1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} braFlo1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} braFlo1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} braFlo1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} braFlo1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} braFlo1.ooc = /san/sanvol1/scratch/braFlo1/11.ooc braFlo1.lift = /cluster/data/braFlo1/jkStuff/liftAll.lft braFlo1.refseq.mrna.native.load = no braFlo1.refseq.mrna.xeno.load = yes braFlo1.genbank.mrna.xeno.load = yes braFlo1.genbank.est.native.load = yes braFlo1.downloadDir = braFlo1 braFlo1.perChromTables = no cvs ci -m "Added braFlo1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. cvs ci -m "Added Branchiostoma floridae (lancelet)." src/lib/gbGenome.c make install-server ssh kkstore05 cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial braFlo1 & # logFile: var/build/logs/2007.03.26-21:58:34.braFlo1.initalign.log # real 230m8.123s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad braFlo1 # logFile: var/dbload/hgwdev/logs/2007.03.27-07:52:23.dbload.log # real 6m57.557s ## re-load to get the est track after adding that to the .conf file # var/dbload/hgwdev/logs/2007.03.29-17:09:35.dbload.log # real 19m2.168s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add braFlo1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added braFlo1 - Branchiostoma floridae (lancelet)" \ etc/align.dbs etc/hgwdev.dbs make etc-update ############################################################################ ## Adding a photograph (DONE - 2007-03-28 - Hiram) ## From Hector Escriva: hector.escriva@obs-banyuls.fr # Dr. Clawson # All the european teams working with amphioxus have changed to B. # lanceolatum species and none of us is still working with B. floridae. We # are able here to induce spawnings everyday (on B. lanceolatum) and this # is impossible with the american species which limits very much the # amount of experiments that one can perform. So I have pictures of B. # lanceolatum, which is morphologically the same as floridae (even me I # cannot distinghish them). So I send you here a picture of a B. # lanceolatum juvenile in which we clearly see the main morphological # aspects of this animal. You should however write the correct name of the # species in order to be serious. # I would also ask you to write in "Photo courtesy of", the name of my # technicien who did the picture and myself. Michael Fuentes and Hector Escriva. # Let me know if you agree with this picture. # Sincerely Hector Escriva mkdir /cluster/data/braFlo1/photograph cd /cluster/data/braFlo1/photograph convert -sharpen 0 -normalize -geometry 350x200 \ PostmetamorphicLarvae.jpg Branchiostoma_lanceolatum.jpg ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-04-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("braFlo1", "blat11", "17784", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("braFlo1", "blat11", "17785", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## Default position set at IFG-1 (DONE - 2007-04-09 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrUn:619013791-619054719" where name="braFlo1";' hgcentraltest ############################################################################ # BLASTZ petMar1 Lamprey (DONE - 2008-04-11 - Hiram) ssh kkstore05 screen # use a screen to control this multi-day job mkdir /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11 cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11 cat << '_EOF_' > DEF # Lancelet vs Lamprey BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Lancelet braFlo1 - largest chunk big enough for largest scaffold # Largest scaffold 7,200,735 - 3032 scaffolds + chrM SEQ1_DIR=/san/sanvol1/scratch/braFlo1/braFlo1.2bit SEQ1_LEN=/san/sanvol1/scratch/braFlo1/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.sizes SEQ1_LIFT=/san/sanvol1/scratch/braFlo1/braFlo1.lift SEQ1_CHUNK=10000000 SEQ1_LIMIT=50 SEQ1_LAP=10000 # QUERY: Lamprey petMar1 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/cluster/data/petMar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl \ /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # Completed: 78590 of 78590 jobs # CPU time in finished jobs: 10056466s 167607.76m 2793.46h 116.39d 0.319 y # IO & Wait Time: 6633099s 110551.65m 1842.53h 76.77d 0.210 y # Average job time: 212s 3.54m 0.06h 0.00d # Longest finished job: 4572s 76.20m 1.27h 0.05d # Submission to last job: 107694s 1794.90m 29.91h 1.25d # continuing after various interruptions and finished the batch manually time nice -n +19 doBlastzChainNet.pl \ /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 & # real 12m43.792s cat fb.braFlo1.chainPetMar1Link.txt # 27418217 bases of 923355587 (2.969%) in intersection # create reciprocal best chains/nets for 5-way maf alignments ssh hgwdev cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 petMar1 \ > rbest.log 2>&1 & # real 30m11.990s # and the swap mkdir /cluster/data/petMar1/bed/blastz.braFlo1.swap cd /cluster/data/petMar1/bed/blastz.braFlo1.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 & # real 194m53.514s cat fb.petMar1.chainBraFlo1Link.txt # 27373264 bases of 831696438 (3.291%) in intersection ########################################################################## ## BLASTZ Chicken GalGal3 swap (DONE - 2008-04-16 - Hiram) # the original ssh kkstore03 cd /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15 cat fb.galGal3.chainBraFlo1Link.txt # 19795687 bases of 1042591351 (1.899%) in intersection # and the swap mkdir /cluster/data/braFlo1/bed/blastz.galGal3.swap cd /cluster/data/braFlo1/bed/blastz.galGal3.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk -verbose=2 > swap.log 2>&1 & # real 4m45.054s cat fb.braFlo1.chainGalGal3Link.txt # 30287175 bases of 923355587 (3.280%) in intersection # create reciprocal best chains/nets for 5-way maf alignments ssh hgwdev cd /cluster/data/braFlo1/bed/blastz.galGal3.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 galGal3 \ > rbest.log 2>&1 & # real 24m30.834s ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-21) # (auto) establish native host of /cluster/data/ df /cluster/data/braFlo1 ssh kkstore05 # bash if not using bash shell already mkdir /cluster/data/braFlo1/blastDb cd /cluster/data/braFlo1 awk '{if ($2 > 1000000) print $1}' scaffolds/braFlo1UnScaffolds.sizes > 1meg.lst if test -s 1meg.lst; then twoBitToFa -seqList=1meg.lst scaffolds/braFlo1UnScaffolds.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa fi rm 1meg.lst awk '{if ($2 <= 1000000) print $1}' scaffolds/braFlo1UnScaffolds.sizes > less1meg.lst if test -s less1meg.lst; then twoBitToFa -seqList=less1meg.lst scaffolds/braFlo1UnScaffolds.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa fi rm less1meg.lst cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 1063 mkdir -p /san/sanvol1/scratch/braFlo1/blastDb cd /cluster/data/braFlo1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/braFlo1/blastDb done mkdir -p /cluster/data/braFlo1/bed/tblastn.hg18KG cd /cluster/data/braFlo1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/braFlo1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 1063 query.lst # we want around 100,000 jobs per gig (0.0001 jobs per base) numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/braFlo1/chrom.sizes` # we want around numJobs jobs numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'` numQueries=`wc query.lst | awk '{print $1}'` numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' ` mkdir -p /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/braFlo1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then if test -s /cluster/data/braFlo1/blastDb.lft then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/braFlo1/blastDb.lft carry $f.2 else mv $f.2 $f.3 fi liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/braFlo1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 93544 of 93544 jobs # CPU time in finished jobs: 8499504s 141658.40m 2360.97h 98.37d 0.270 y # IO & Wait Time: 669830s 11163.84m 186.06h 7.75d 0.021 y # Average job time: 98s 1.63m 0.03h 0.00d # Longest finished job: 376s 6.27m 0.10h 0.00d # Submission to last job: 44890s 748.17m 12.47h 0.52d ssh kkstore05 cd /cluster/data/braFlo1/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh memk cd /cluster/data/braFlo1/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 88 of 88 jobs # CPU time in finished jobs: 906s 15.09m 0.25h 0.01d 0.000 y # IO & Wait Time: 19435s 323.92m 5.40h 0.22d 0.001 y # Average job time: 231s 3.85m 0.06h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 322s 5.37m 0.09h 0.00d # Submission to last job: 812s 13.53m 0.23h 0.01d ssh kkstore05 cd /cluster/data/braFlo1/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/braFlo1/bed/tblastn.hg18KG/preblastHg18KG.psl cd .. pslCheck preblastHg18KG.psl liftUp -nosort -type=".psl" -nohead blastHg18KG.psl /cluster/data/braFlo1/scaffolds/braFlo1.lift carry preblastHg18KG.psl # load table ssh hgwdev cd /cluster/data/braFlo1/bed/tblastn.hg18KG hgLoadPsl braFlo1 blastHg18KG.psl # check coverage featureBits braFlo1 blastHg18KG # 12239766 bases of 923355587 (1.326%) in intersection featureBits braFlo1 xenoRefGene:cds blastHg18KG -enrichment # xenoRefGene:cds 1.206%, blastHg18KG 1.326%, both 0.621%, cover 51.51%, enrich 38.86x ssh kkstore05 rm -rf /cluster/data/braFlo1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut #end tblastn ## BLASTZ Mouse mm9 swap (DONE - 2008-04-17 - Hiram) # the original ssh kkstore06 cd /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14 cat fb.mm9.chainBraFlo1Link.txt # 26725980 bases of 2620346127 (1.020%) in intersection # That is OK, now for the swap: mkdir /cluster/data/braFlo1/bed/blastz.mm9.swap cd /cluster/data/braFlo1/bed/blastz.mm9.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -bigClusterHub=kk > swap.log 2>&1 & # real 12m23.402s cat fb.braFlo1.chainMm9Link.txt # 31517169 bases of 923355587 (3.413%) in intersection # create reciprocal best chains/nets for 5-way maf alignments ssh hgwdev cd /cluster/data/braFlo1/bed/blastz.mm9.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 mm9 \ > rbest.log 2>&1 & # real 35m15.178s ############################################################################# ## 5-Way Multiz (DONE - 2008-04-17 - Hiram) ## ssh hgwdev mkdir /cluster/data/braFlo1/bed/multiz5way cd /cluster/data/braFlo1/bed/multiz5way # take the 6-way tree from mm9 and eliminate genomes not in # this alignment. Manually add in petMar1 and braFlo1 # arbitrarily somewhere outside of the fishes # rearrange to get braFlo1 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram # select the 5 organisms from the 6-way on Lamprey petMar1 /cluster/bin/phast/tree_doctor \ --prune-all-but \ Human_hg18,Mouse_mm9,Chicken_galGal3,Lamprey_petMar1,Lanclet_braFlo1 \ /cluster/data/petMar1/bed/multiz6way/petMar1.6-way.nh \ > 5-way.fullNames.nh # looks something like this: ((Lamprey_petMar1:1.600000,(Chicken_galGal3:0.488824, (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.725954):0.400000, Lanclet_braFlo1:2.000000); # rearrange to get Lanclet at the top, this leaves us with: cat << '_EOF_' > braFlo1.5-way.nh (Lanclet_braFlo1:2.0,(Lamprey_petMar1:1.600000,(Chicken_galGal3:0.488824, (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.725954):0.400000); '_EOF_' # << happy emacs # create a species list from that file: sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' braFlo1.5-way.nh \ | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # verify that has 5 db names in it # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:[012].[0-9]*//g; s/[,;]/ /g' braFlo1.5-way.nh \ | sed -e "s/ / /g"` > tree.5.nh # that looks like, as a single line: # (braFlo1 (petMar1 (galGal3 (mm9 hg18)))) # verify all blastz's exists cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/braFlo1/bed/multiz5way foreach db (`grep -v braFlo1 species.list`) set bdir = /cluster/data/braFlo1/bed/blastz.$db if (-e $bdir/mafRBestNet/chrUn.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/chrUn.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/chrUn.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh # see what it says, these should all be mafRBestNet ./listMafs.csh # galGal3 mafRBestNet # hg18 mafRBestNet # mm9 mafRBestNet # petMar1 mafRBestNet /cluster/bin/phast/all_dists braFlo1.5-way.nh > 5way.distances.txt grep -i braflo 5way.distances.txt | sort -k3,3n Lanclet_braFlo1 Chicken_galGal3 3.614778 Lanclet_braFlo1 Human_hg18 3.723612 Lanclet_braFlo1 Mouse_mm9 3.922529 Lanclet_braFlo1 Lamprey_petMar1 4.000000 # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. Zebrafish ends up before # tetraodon and fugu on the browser despite its distance. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainBraFlo1Link chain linearGap # distance on BraFlo1 on other minScore # 1 3.614778 Chicken_galGal3 (% 3.280) (% 1.899) 5000 loose # 2 3.723612 Human_hg18 (% 3.348) (% 0.918) 2000 loose # 3 3.922529 Mouse_mm9 (% 3.413) (% 1.020) 5000 loose # 4 4.000000 Lamprey_petMar1 (% 2.969) (% 3.291) 5000 loose # copy net mafs to cluster-friendly storage, splitting chroms # XXX - no need to split, there is only chrM and chrUn mkdir mafLinks cd mafLinks for D in galGal3 petMar1 mm9 hg18 do mkdir ${D} ln -s /cluster/data/braFlo1/bed/blastz.${D}/mafRBestNet/*.maf.gz ./${D} done # copy to kluster friendly san for multiz run ssh kkstore05 mkdir -p /san/sanvol1/scratch/braFlo1/multiz5way/mafSrc cd /cluster/data/braFlo1/bed/multiz5way/mafLinks rsync -a --progress -L ./ /san/sanvol1/scratch/braFlo1/multiz5way/mafSrc # ready for the multiz run ssh memk mkdir /cluster/data/braFlo1/bed/multiz5way/cons cd /cluster/data/braFlo1/bed/multiz5way/cons mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = braFlo1 set c = $1 set result = $2 set resultDir = $result:h set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz5way/mafSrc rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp ../../tree.5.nh ../../species.list $tmp pushd $tmp foreach s (`grep -v $db species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.5.nh`" $db.*.sing.maf $c.maf popd rm -f $result cp $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/braFlo1/bed/multiz5way/cons/maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs echo "chrM.maf" > mafList echo "chrUn.maf" >> mafList gensub2 mafList single template jobList para create jobList para try ... check ... push ... etc # Completed: 2 of 2 jobs # CPU time in finished jobs: 375s 6.25m 0.10h 0.00d 0.000 y # IO & Wait Time: 17s 0.28m 0.00h 0.00d 0.000 y # Average job time: 196s 3.27m 0.05h 0.00d # Longest finished job: 385s 6.42m 0.11h 0.00d # Submission to last job: 385s 6.42m 0.11h 0.00d # load tables for a look ssh hgwdev mkdir -p /gbdb/braFlo1/multiz5way/maf ln -s /cluster/data/braFlo1/bed/multiz5way/cons/maf/*.maf \ /gbdb/braFlo1/multiz5way/maf # this generates an immense multiz5way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/braFlo1/multiz5way/maf braFlo1 multiz5way # Loaded 220560 mafs in 2 files from /gbdb/braFlo1/multiz5way/maf # real 0m4.151s # load summary table time nice -n +19 cat /gbdb/braFlo1/multiz5way/maf/*.maf \ | hgLoadMafSummary braFlo1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz5waySummary stdin # Created 87678 summary blocks from 366073 components and 220521 mafs from stdin # real 0m4.911s # Gap Annotation # prepare bed files with gap info ssh kkstore05 mkdir /cluster/data/braFlo1/bed/multiz5way/anno cd /cluster/data/braFlo1/bed/multiz5way/anno mkdir maf run # these actually already all exist from previous multiple alignments # if there are done you will see an ls of them # or else the twoBitInfo command will be echoed, run it if you want for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `grep -v braFlo1 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh memk cd /cluster/data/braFlo1/bed/multiz5way/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set dir = /cluster/data/braFlo1/bed/multiz5way/cons set c = $1 cat $dir/maf/${c}.maf | nice \ mafAddIRows -nBeds=nBeds stdin /cluster/data/braFlo1/braFlo1.2bit $2 '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/braFlo1/chrom.sizes > chrom.list gensub2 chrom.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 2 of 2 jobs # CPU time in finished jobs: 7240s 120.66m 2.01h 0.08d 0.000 y # IO & Wait Time: 9s 0.16m 0.00h 0.00d 0.000 y # Average job time: 3625s 60.41m 1.01h 0.04d # Longest finished job: 7241s 120.68m 2.01h 0.08d # Submission to last job: 7241s 120.68m 2.01h 0.08d # load the results ssh hgwdev cd /cluster/data/braFlo1/bed/multiz5way/anno mkdir -p /gbdb/braFlo1/multiz5way/anno ln -s `pwd`/maf/*.maf /gbdb/braFlo1/multiz5way/anno # by loading this into the table multiz5way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/braFlo1/multiz5way/anno \ braFlo1 multiz5way # Loaded 311293 mafs in 2 files from /gbdb/braFlo1/multiz5way/anno # real 0m7.505s # normally filter this for chrom size > 1,000,000 and only load # those chroms. But this is a scaffold assembly, load everything. time nice -n +19 cat /gbdb/braFlo1/multiz5way/anno/*.maf \ | hgLoadMafSummary braFlo1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz5waySummary stdin # Created 87678 summary blocks from 366073 components and 311253 mafs from stdin # real 0m6.998s # by loading this into the table multiz5waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz5way*.tab files in this /scratch/tmp directory rm multiz5way*.tab # And, you can remove the previously loaded non-annotated maf file link: rm /gbdb/braFlo1/multiz5way/maf/*.maf rmdir /gbdb/braFlo1/multiz5way/maf ########################################################################### ## Annotate 5-way multiple alignment with gene annotations ## (DONE - 2008-05-01 - Hiram) # Gene frames ## following pattern used in Lamprey petMar1 # hg18 is in a bit of transition at the moment, so use ensGene # mm9 can use knownGene # xenoMrna for braFlo1, petMar1 # ensGene v49 for galGal3 ssh hgwdev mkdir /cluster/data/braFlo1/bed/multiz5way/frames cd /cluster/data/braFlo1/bed/multiz5way/frames mkdir genes # knownGene for DB in mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in hg18 galGal3 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # use xenoMrna for petMar1, braFlo1 for DB in petMar1 braFlo1 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${DB}.mrna-cds.${tmpExt} tmpMrna=${DB}.mrna.${tmpExt} tmpCds=${DB}.cds.${tmpExt} hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \ from xenoMrna,gbCdnaInfo,cds \ where (xenoMrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $DB > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz rm -f $tmpExt echo "${DB} done" done ls -og genes # -rw-rw-r-- 1 1040639 May 1 09:53 braFlo1.gp.gz # -rw-rw-r-- 1 1557911 May 1 09:50 galGal3.gp.gz # -rw-rw-r-- 1 2151412 May 1 09:50 hg18.gp.gz # -rw-rw-r-- 1 1965274 May 1 09:50 mm9.gp.gz # -rw-rw-r-- 1 596023 May 1 09:51 petMar1.gp.gz ssh kkstore05 cd /cluster/data/braFlo1/bed/multiz5way/frames # hg18 is in a bit of transition at the moment, so use ensGene # mm9 can use knownGene # xenoMrna for braFlo1, braFlo1 # ensGene v49 for galGal3, oryLat1 # anything to annotate is in a pair, e.g.: braFlo1 genes/braFlo1.gp.gz time (cat ../anno/maf/*.maf | nice -n +19 genePredToMafFrames braFlo1 stdin stdout braFlo1 genes/braFlo1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz petMar1 genes/petMar1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 44216 petMar1 # 71268 braFlo1 # 107089 galGal3 # 107857 hg18 # 111452 mm9 # load the resulting file ssh hgwdev cd /cluster/data/braFlo1/bed/multiz5way/frames time nice -n +19 hgLoadMafFrames braFlo1 multiz5wayFrames \ multiz5way.mafFrames.gz # real 0m7.902s # enable the trackDb entries: # frames multiz5wayFrames # irows on ############################################################################# # phastCons 5-way (DONE - 2008-05-01 - Hiram) # split 5way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir /cluster/data/braFlo1/bed/multiz5way/msa.split cd /cluster/data/braFlo1/bed/multiz5way/msa.split mkdir -p /san/sanvol1/scratch/braFlo1/multiz5way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/braFlo1/bed/multiz5way/anno/maf set WINDOWS = /san/sanvol1/scratch/braFlo1/multiz5way/cons/ss pushd $WINDOWS set c = $1 rm -f $c.out twoBitToFa -seq=$c /scratch/data/braFlo1/braFlo1.2bit /scratch/tmp/braFlo1.$c.fa /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /scratch/tmp/braFlo1.$c.fa \ -o SS -r $c -w 10000000,0 -I 1000 -B 5000 rm -f /scratch/tmp/braFlo1.$c.fa popd date > $c.out '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).out} #ENDLOOP '_EOF_' # << happy emacs # create list of maf files: (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 2 of 2 jobs # CPU time in finished jobs: 337s 5.61m 0.09h 0.00d 0.000 y # IO & Wait Time: 40s 0.67m 0.01h 0.00d 0.000 y # Average job time: 189s 3.14m 0.05h 0.00d # Longest finished job: 375s 6.25m 0.10h 0.00d # Submission to last job: 375s 6.25m 0.10h 0.00d # take the cons and noncons trees from the Lamprey 6-way # Estimates are not easy to make, probably more correctly, # take the 6-way .mod file, and re-use it here. ssh hgwdev cd /cluster/data/braFlo1/bed/multiz5way cp -p /cluster/data/petMar1/bed/multiz6way/cons/all/all.mod \ ./petMar1.6way.mod # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh memk mkdir -p /cluster/data/braFlo1/bed/multiz5way/cons/run.cons cd /cluster/data/braFlo1/bed/multiz5way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all gliers placentals # Well, that's what it was when used in the Mm9 30-way, # in this instance, there is only the directory "all" cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin set f = $1 set c = $1:r set len = $2 set cov = $3 set rho = $4 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/braFlo1/bed/multiz5way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/braFlo1/multiz5way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp cp -p $san/ss/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod $tmp cp -p $san/ss/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp $san/$grp/bed sleep 4 touch $san/$grp/pp $san/$grp/bed rm -f $san/$grp/pp/$f.pp rm -f $san/$grp/bed/$f.bed mv $tmp/$f.pp $san/$grp/pp mv $tmp/$f.bed $san/$grp/bed rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh cat << '_EOF_' > template #LOOP ../doPhast.csh $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/pp/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/braFlo1/multiz5way/cons find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \ > /cluster/data/braFlo1/bed/multiz5way/cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \ ../../petMar1.6way.mod \ --prune-all-but=braFlo1,hg18,galGal3,mm9,petMar1 \ > all.mod cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/pp/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 94 of 94 jobs # CPU time in finished jobs: 1945s 32.42m 0.54h 0.02d 0.000 y # IO & Wait Time: 810s 13.49m 0.22h 0.01d 0.000 y # Average job time: 29s 0.49m 0.01h 0.00d # Longest finished job: 34s 0.57m 0.01h 0.00d # Submission to last job: 201s 3.35m 0.06h 0.00d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all find ./bed -type f -name "chr*.bed" | xargs cat \ | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed cp -p mostConserved.bed /cluster/data/braFlo1/bed/multiz5way/cons/all # load into database ssh hgwdev cd /cluster/data/braFlo1/bed/multiz5way/cons/all time nice -n +19 hgLoadBed braFlo1 phastConsElements5way mostConserved.bed # Loaded 122683 elements of size 5 # real 0m2.299s # Try for 5% overall cov, and 70% CDS cov # We don't have any gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits braFlo1 phastConsElements5way # 35581251 bases of 923355587 (3.853%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all mkdir phastCons5wayScores cat pp/chrM*.pp | gzip > phastCons5wayScores/chrM.data.gz # this is a bit of a cheat on the sorting order here, but it works. ls pp/chrUn*.pp | sort -t. -k2n | xargs cat \ | gzip > phastCons5wayScores/chrUn.data.gz # copy the phastCons5wayScores to: mkdir -p \ /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way cp -p phastCons5wayScores/chr*.data.gz \ /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all zcat phastCons5wayScores/chr*.data.gz \ | wigEncode stdin phastCons5way.wig phastCons5way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 time nice -n +19 cp -p *.wi? /cluster/data/braFlo1/bed/multiz5way/cons/all # real 0m1.216s # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/braFlo1/bed/multiz5way/cons/all ln -s `pwd`/phastCons5way.wib /gbdb/braFlo1/multiz5way/phastCons5way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/braFlo1/multiz5way braFlo1 \ phastCons5way phastCons5way.wig # real 0m2.166s # remove garbage rm wiggle.tab # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/braFlo1/bed/multiz5way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=braFlo1 phastCons5way > histogram.data 2>&1 # real 5m0.608s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Lancelet BraFlo1 Histogram phastCons5way track" set xlabel " phastCons5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # it is a bit lopsided toward the highly conserved end, but that # is because there is so little bits being used # These trackDb entries turn on the wiggle phastCons data track: # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons5way # spanList 1 # autoScale Off # windowingFunction mean # pairwiseHeight 12 # yLineOnOff Off ############################################################################# # Downloads (DONE - 2008-05-01 - Hiram) # Let's see if the downloads will work ssh hgwdev cd /cluster/data/braFlo1 # expecting to find repeat masker .out file here: ln -s bed/RepeatMasker/braFlo1.fa.out . cd bed/simpleRepeat mkdir trfMaskChrom splitFileByColumn trfMask.bed trfMaskChrom touch trfMaskChrom/chrM.bed cd /cluster/data/braFlo1 time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \ -workhorse=hgwdev braFlo1 > jkStuff/downloads.log 2>&1 # real 11m53.475s # failed making upstream sequences: # featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout # setpriority: Permission denied. # the 'nice' from my bash shell causes trouble inside the csh # script which uses nice. Finish off the install step manually # with the mgcGenes upstreams ... ############################################################################# # PushQ entries (DONE - 2008-05-01 - Hiram) ssh hgwdev /cluster/data/braFlo1 /cluster/bin/scripts/makePushQSql.pl braFlo1 > jkStuff/pushQ.sql # output warnings: # braFlo1 does not have seq # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # gbWarn # multiz5wayFrames # phastCons5way scp -p jkStuff/pushQ.sql hgwbeta:/tmp/braFlo1.sql ssh hgwbeta cd /tmp hgsql qapushq < braFlo1.sql ############################################################################# # Create 5-way downloads (DONE - 2008-05-01 - Hiram) ssh hgwdev mkdir -p /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way cd /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way # if these haven't been copied here from before: cp -p \ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/phastCons5wayScores/* . ln -s ../../cons/all/all.mod ./5way.mod cp /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way/README.txt . # edit that README.txt to be correct for this 5-way alignment cd .. mkdir multiz5way cd multiz5way cp -p /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way/README.txt . # edit that README.txt to be correct for this 5-way alignment ssh kkstore05 cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way ln -s ../../braFlo1.5-way.nh ./5way.nh cp -p ../../anno/maf/chr*.maf . gzip chr*.maf ssh hgwdev cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way # creating upstream files from xenoRefGene, bash script: cat << '_EOF_' > mkUpstream.sh #!/bin/bash DB=braFlo1 GENE=xenoRefGene NWAY=multiz5way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done '_EOF_' # << happy emacs chmod +x ./mkUpstream.sh time nice -n +19 ./mkUpstream.sh # real 3m1.228s # -rw-rw-r-- 1 126900 May 1 16:51 upstream1000.maf.gz # -rw-rw-r-- 1 224320 May 1 16:52 upstream2000.maf.gz # -rw-rw-r-- 1 501687 May 1 16:53 upstream5000.maf.gz # check the names in these upstream files to ensure sanity: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | less # should be a list of the other 4 species with a high count, # then xenoRefGene names, e.g.: # 364 petMar1 # 364 mm9 # 364 hg18 # 364 galGal3 # 7 NM_069006 # 7 NM_054045 # 7 NM_021059 ssh kkstore05 cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way md5sum *.maf.gz > md5sum.txt cd ../phastCons5way md5sum *.data.gz *.mod > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way mkdir /usr/local/apache/htdocs/goldenPath/braFlo1/phastCons5way cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way rm /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way/mkUpstream.sh cd ../phastCons5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/braFlo1/phastCons5way # if your ln -s `pwd`/* made extra links to files you don't want there, # check the goldenPath locations and remove those extra links ############################################################################# # add strings of N's to the gap track (DONE - 2008-05-09 - Hiram) ssh kkstore05 mkdir /cluster/data/braFlo1/bed/gap cd /cluster/data/braFlo1/bed/gap time nice -n +19 findMotif -strand=+ -verbose=4 -motif=gattaca \ ../../braFlo1.2bit > braFlo1.gaps.txt 2>&1 # real 0m16.659s grep "^#GAP chrUn" braFlo1.gaps.txt | sed -e "s/#GAP //" > chrUn.gap.bed # what are their sizes like: ave -col=5 chrUn.gap.bed Q1 50.000000 median 60.000000 Q3 704.000000 average 1004.236543 min 1.000000 max 144292.000000 count 94786 total 95187565.000000 standard deviation 4497.943141 # don't let this overlap with the gap track already ssh hgwdev cd /cluster/data/braFlo1/bed/gap # already existing gap track featureBits braFlo1 gap # 3031000 bases of 923355587 (0.328%) in intersection # prove that it is a complete subset of this new set featureBits braFlo1 gap chrUn.gap.bed # 3031000 bases of 923355587 (0.328%) in intersection # let's make sure none of the regular gaps run-on into these new # found gaps hgsql -e "select chrom,chromStart,chromEnd from chrUn_gap;" braFlo1 \ | sort -k1,1 -k2,2n > gap.bed awk '{printf "%s\t%s\t%s\n", $1, $2, $3}' chrUn.gap.bed \ | sort -k1,1 -k2,2n > new.bed # after working with that for a while, indeed there is a problem # found in the existing gap track. There are "contig" gaps in the # the track which claim to be 1000 bases, but the DNA sequence # has 1001 N's in that location. Let's see if that is because the # scaffold itself ends in a single N. Yes, after some investigation, # it is found there are scaffolds which either begin with Ns, or # or end with N's, and it looks like they are single N's. # there are 30 gap locations where this happens. # So, let's make up a proper exclusive or of the new gaps: # intersect the new gaps with everything that is not in the current # gap track. Get the regions that are not in the gap track: featureBits -not -countGaps braFlo1 gap -chrom=chrUn -bed=notGap.bed # then, find only those bits in the new gaps that intersect that: featureBits -countGaps braFlo1 notGap.bed new.bed -bed=newGaps.bed # and profile their sizes: awk '{print $3-$2}' newGaps.bed | ave stdin Q1 50.000000 median 51.000000 Q3 591.000000 average 1004.037315 min 1.000000 max 144292.000000 count 91785 total 92155565.000000 standard deviation 4570.916383 # make the new gaps into a properly formatted gap table # currently, the last index number in the gap track is:o hgsql -e "select max(ix) from chrUn_gap;" braFlo1 # 6062 awk ' BEGIN{ ix=6063 } { printf "%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n", $1, $2, $3, ix, $3-$2 ++ix }' newGaps.bed > newGaps.gap.table # findMotif has made an error on the last one, there is an extra one # at the end that is not legitimate, take it off. # let's get the SQL definition: mkdir dump cd dump hgsqldump -c --tab=. braFlo1 chrUn_gap cd .. # and, make the bin column hgLoadBed -noLoad braFlo1 xyt newGaps.gap.table # count before: hgsql -e "select count(*) from chrUn_gap;" braFlo1 # 3031 # then, add this business to the existing table: hgLoadSqlTab -append braFlo1 chrUn_gap dump/chrUn_gap.sql bed.tab # and after hgsql -e "select count(*) from chrUn_gap;" braFlo1 # 94815 # that is the sum of old and new: wc -l gap.bed bed.tab # 3031 gap.bed # 91784 bed.tab # 94815 total ############################################################################# # GENBANK reload due to problems found by gbSanity (2008-05-09 markd) ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep3 -drop -initialLoad braFlo1& ############################################################################# ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: braFlo1.upstreamGeneTbl = xenoRefGene braFlo1.upstreamMaf = multiz5way /hive/data/genomes/braFlo1/bed/multiz5way/species.list