# for emacs: -*- mode: sh; -*- # $Id: calJac3.txt,v 1.16 2010/06/11 16:00:57 chinhli Exp $ # Marmoset sequence: # ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/ # Callithrix_jacchus/Callithrix_jacchus-3.2 # Callithrix jacchus # http://www.hgsc.bcm.tmc.edu/project-species-p-Marmoset.hgsc?pageLocation=Marmoset # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ACFV00 ########################################################################## # Download sequence (DONE - 2010-02-04 - Hiram) mkdir /hive/data/genomes/calJac3 cd /hive/data/genomes/calJac3 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Callithrix_jacchus/Callithrix_jacchus-3.2/*" mkdir ucscChr cd ucscChr # fixup the accession names to become UCSC chrom names zcat ../Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e "s/^>.*gb|\([A-Z]*[0-9]*\).1.*/>chrUn_\1/" > chrUn.fa zcat ../Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | sed -e "s/^\([A-Z]*[0-9]*\).1/chrUn_\1/" > chrUn.agp find ../Primary_Assembly/unlocalized_scaffolds/FASTA -type f \ | grep "unlocalized.scaf.fa.gz$" | head -1 | while read F do C=`basename ${F}` C=${C/.unlocalized.scaf.fa.gz} zcat "${F}" | sed -e "s/^>.*gb|\([A-Z]*[0-9]*\).1.*/>${C}_\1_random/" done > chr_randoms.fa find ../Primary_Assembly/unlocalized_scaffolds/AGP -type f | grep ".agp.gz$" \ | while read F do C=`basename ${F}` echo -n "${C} " C=${C/.unlocalized.scaf.agp.gz} echo "${C}" zcat "${F}" | sed -e "s/^\([A-Z]*[0-9]*\).1/${C}_\1_random/" done > chr_randoms.agp AC="../Primary_Assembly/assembled_chromosomes" for F in ${AC}/FASTA/chr*.fa.gz do C=`basename $F` C=${C/.fa.gz} echo -n "${C} " H=`zcat "${F}" | head -1` chrN=`echo $H | sed -e "s/.*Callithrix jacchus chromosome /chr/; s/, .*//"` A=`echo $H | sed -e "s/. Callithrix.*//; s/.*gb.//"` echo $chrN $A zcat ${AC}/AGP/${chrN}.comp.agp.gz \ | sed -e "s/^${A}/${chrN}/" > ${chrN}.agp echo ">${chrN}" > ${chrN}.fa zcat ${AC}/FASTA/${chrN}.fa.gz | grep -v "^>" >> ${chrN}.fa done ########################################################################## # Initial genome build (DONE - 2009-12-17 - Hiram) cd /hive/data/genomes/calJac3 cat << '_EOF_' > calJac3.config.ra # Config parameters for makeGenomeDb.pl: db calJac3 clade mammal genomeCladePriority 16 scientificName Callithrix jacchus commonName Marmoset assemblyDate Mar. 2009 assemblyLabel WUSTL 3.2 (GCA_000004665.1) orderKey 40 mitoAcc none fastaFiles /hive/data/genomes/calJac3/genbank/ucscChr/*.fa agpFiles /hive/data/genomes/calJac3/genbank/ucscChr/*.agp # qualFiles none dbDbSpeciesDir marmoset taxId 9483 '_EOF_' makeGenomeDb.pl -workhorse=hgwdev -stop=seq calJac3.config.ra > seq.out 2>&1 # real 4m5.924s makeGenomeDb.pl -continue=agp -stop=agp calJac3.config.ra > agp.out 2>&1 # real 0m20.968s makeGenomeDb.pl -continue=db -stop=db calJac3.config.ra > db.out 2>&1 # real 5m39.181s # XXX - chromInfo doesn't have large enough fields for the name keys # been fixed in later versions of makeGenomeDb.pl makeGenomeDb.pl -continue=dbDb -stop=dbDb calJac3.config.ra > dbDb.out 2>&1 makeGenomeDb.pl -continue=trackDb -stop=trackDb calJac3.config.ra > trackDb.out 2>&1 ########################################################################## # running repeat masker (DONE - 2010-02-02 - Hiram) mkdir /hive/data/genomes/calJac3/bed/repeatMasker cd /hive/data/genomes/calJac3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk calJac3 > do.log 2>&1 & # real 443m55.891s cat faSize.rmsk.txt # 2914958544 bases (162452744 N's 2752505800 real 1439949519 upper 1312556281 # lower) in 14205 sequences in 1 files # %45.03 masked total, %47.69 masked real ########################################################################## # running simple repeat (DONE - 2010-02-02 - Hiram) mkdir /hive/data/genomes/calJac3/bed/simpleRepeat cd /hive/data/genomes/calJac3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ calJac3 > do.log 2>&1 & # real 198m33.953s cat fb.simpleRepeat # 66211508 bases of 2752505800 (2.405%) in intersection cd /hive/data/genomes/calJac3 twoBitMask calJac3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed calJac3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa calJac3.2bit stdout | faSize stdin > faSize.calJac3.2bit.txt cat faSize.calJac3.2bit.txt # 2914958544 bases (162452744 N's 2752505800 real 1439244378 upper # 1313261422 lower) in 14205 sequences in 1 files # %45.05 masked total, %47.71 masked real rm /gbdb/calJac3/calJac3.2bit ln -s `pwd`/calJac3.2bit /gbdb/calJac3/calJac3.2bit ######################################################################### # MAKE 11.OOC FILES FOR BLAT (DONE - 2010-02-03 - Hiram) ssh kolossus # numerator is calJac3 gapless bases "real" as reported by faSize # denominator is hg17 gapless bases as reported by featureBits, # 1024 is threshold used for human -repMatch: calc \( 2752505800 / 2897310462 \) \* 1024 # ( 2752505800 / 2897310462 ) * 1024 = 972.821510 # ==> use -repMatch=950 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/calJac3 blat calJac3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/calJac3.11.ooc -repMatch=950 # Wrote 32908 overused 11-mers to jkStuff/calJac3.11.ooc mkdir /hive/data/staging/data/calJac3 cp -p calJac3.2bit chrom.sizes jkStuff/calJac3.11.ooc \ /hive/data/staging/data/calJac3 gapToLift -bedFile=jkStuff/nonBridgedGaps.bed calJac3 \ jkStuff/calJac3.nonBridged.lft ########################################################################## # BLATSERVERS ENTRY (DONE - 2009-12-23 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("calJac3", "blat12", "17804", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("calJac3", "blat12", "17805", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # reset position to RHO location as found from blat of hg19 RHO gene hgsql -e \ 'update dbDb set defaultPos="chr15:67439781-67471107" where name="calJac3";' \ hgcentraltest ############################################################################ # genbank run (DONE - 2010-02-04 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank # edit etc/genbank.conf to add this section just before calJac1: # Marmoset calJac3.serverGenome = /hive/data/genomes/calJac3/calJac3.2bit calJac3.clusterGenome = /scratch/data/calJac3/calJac3.2bit calJac3.ooc = /scratch/data/calJac3/calJac3.11.ooc calJac3.lift = no calJac3.perChromTables = no calJac3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} calJac3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} calJac3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} calJac3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} calJac3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} calJac3.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} calJac3.downloadDir = calJac3 calJac3.refseq.mrna.native.load = yes calJac3.refseq.mrna.xeno.load = yes calJac3.refseq.mrna.xeno.loadDesc = yes cvs ci -m "adding marmoset calJac3" etc/genbank.conf make etc-update ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial calJac3 & # var/build/logs/2010.02.10-09:19:20.calJac3.initalign.log # real 238m59.390s ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad calJac3 & # logFile: var/dbload/hgwdev/logs/2010.02.11-10:41:22.dbload.log # real 25m55.474s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add calJac3 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Adding calJac3 - Marmoset - Callithrix jacchus" \ etc/align.dbs etc/hgwdev.dbs make etc-update # done - 2010-02-11 - Hiram ############################################################################ # running cpgIsland business (DONE - 2010-02-11 - Hiram) mkdir /hive/data/genomes/calJac3/bed/cpgIsland cd /hive/data/genomes/calJac3/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../calJac3.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/calJac3/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs gensub2 chr.list single template jobList para create jobList para try para check ... etc para time # Completed: 14205 of 14205 jobs # CPU time in finished jobs: 205s 3.41m 0.06h 0.00d 0.000 y # IO & Wait Time: 38701s 645.02m 10.75h 0.45d 0.001 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 23s 0.38m 0.01h 0.00d # Submission to last job: 202s 3.37m 0.06h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed cd /hive/data/genomes/calJac3/bed/cpgIsland hgLoadBed calJac3 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 32732 elements of size 10 # Sorted # Saving bed.tab # Loading calJac3 # cleanup rm -fr hardMaskedFa ############################################################################ # LASTZ Human Swap (DONE - 2010-02-11 - Hiram) # original alignment on hg19: cd /hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11 cat fb.hg19.chainCalJac3Link.txt # 2047068864 bases of 2897316137 (70.654%) in intersection # and for this swap mkdir /hive/data/genomes/calJac3/bed/blastz.hg19.swap cd /hive/data/genomes/calJac3/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 120m42.991s cat fb.calJac3.chainHg19Link.txt # 2030475813 bases of 2752505800 (73.768%) in intersection ##################################################################### # LASTZ Rhesus Swap (DONE - 2010-02-11 - Hiram) # original alignment to rheMac2 cd /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11 cat fb.rheMac2.chainCalJac3Link.txt # 1871513554 bases of 2646704109 (70.711%) in intersection # and for this swap mkdir /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap cd /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 142m34.894s cat fb.calJac3.chainHg19Link.txt # 1916431926 bases of 2752505800 (69.625%) in intersection ############################################################################ # LASTZ Chimp Swap (DONE - 2010-02-11 - Hiram) # original alignment to panTro2 cd /hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11 cat fb.panTro2.chainCalJac3Link.txt # 2016331285 bases of 2909485072 (69.302%) in intersection # and this swap run mkdir /hive/data/genomes/calJac3/bed/blastz.panTro2.swap cd /hive/data/genomes/calJac3/bed/blastz.panTro2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 118m42.203s cat fb.calJac3.chainHg19Link.txt # 1990168262 bases of 2752505800 (72.304%) in intersection ############################################################################ # LASTZ Orangutan Swap (DONE - 2010-02-11 - Hiram) # original alignment to ponAbe2 cd /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11 cat fb.ponAbe2.chainCalJac3Link.txt # 2086557592 bases of 3093572278 (67.448%) in intersection # and this swap run mkdir /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap cd /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 146m12.301s cat fb.calJac3.chainHg19Link.txt # 1978857628 bases of 2752505800 (71.893%) in intersection ##################################################################### # lastz Dog CanFam2 Swap (DONE - 2010-02-14 - Hiram) # original alignment cd /hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12 cat fb.canFam2.chainCalJac3Link.txt # 1363307334 bases of 2384996543 (57.162%) in intersection # and for the swap mkdir /hive/data/genomes/calJac3/bed/blastz.canFam2.swap cd /hive/data/genomes/calJac3/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 129m56.144s cat fb.calJac3.chainHg19Link.txt # 1397333116 bases of 2752505800 (50.766%) in intersection ######################################################################### # lastz Mouse Mm9 Swap (DONE - 2010-02-14 - Hiram) # original alignment to mouse cd /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12 cat fb.mm9.chainCalJac3Link.txt # 859869647 bases of 2620346127 (32.815%) in intersection # and for the swap mkdir /hive/data/genomes/calJac3/bed/blastz.mm9.swap cd /hive/data/genomes/calJac3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 90m38.739s cat fb.calJac3.chainHg19Link.txt # 861811978 bases of 2752505800 (31.310%) in intersection ##################################################################### # lastz Opossum monDom5 Swap (DONE - 2010-02-14 - Hiram) # original alignment to Opossum cd /hive/data/genomes/monDom5/bed/lastzCalJac3.2010-02-12 cat fb.monDom5.chainCalJac3Link.txt # 216197506 bases of 3501660299 (6.174%) in intersection # and for the swap mkdir /hive/data/genomes/calJac3/bed/blastz.monDom5.swap cd /hive/data/genomes/calJac3/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzCalJac3.2010-02-12/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 110m13.435s cat fb.calJac3.chainMonDom5Link.txt # 217614612 bases of 2752505800 (7.906%) in intersection ############################################################################## # HUMAN (hg18) PROTEINS TRACK (working braney...) # bash if not using bash shell already cd /cluster/data/calJac3 mkdir /cluster/data/calJac3/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst calJac3.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst calJac3.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 3275 mkdir -p /cluster/data/calJac3/bed/tblastn.hg18KG cd /cluster/data/calJac3/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3275 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\) # 36727/(350000/3275) = 343.659786 mkdir -p kgfa split -l 344 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst wc kg.lst # 107 107 1391 kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/calJac3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/calJac3/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome exit ssh swarm cd /cluster/data/calJac3/bed/tblastn.hg18KG gensub2 query.lst kg.lst blastGsub blastSpec para create blastSpec # para try, check, push, check etc. para time # Completed: 100076 of 100076 jobs # CPU time in finished jobs: 2219419s 36990.31m 616.51h 25.69d 0.070 y # IO & Wait Time: 445463s 7424.39m 123.74h 5.16d 0.014 y # Average job time: 27s 0.44m 0.01h 0.00d # Longest finished job: 319s 5.32m 0.09h 0.00d # Submission to last job: 2791s 46.52m 0.78h 0.03d ssh swarm cd /cluster/data/calJac3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 254 of 254 jobs # CPU time in finished jobs: 590077s 9834.62m 163.91h 6.83d 0.019 y # IO & Wait Time: 20749s 345.81m 5.76h 0.24d 0.001 y # Average job time: 2405s 40.08m 0.67h 0.03d # Longest finished job: 36592s 609.87m 10.16h 0.42d # Submission to last job: 36604s 610.07m 10.17h 0.42d cd /cluster/data/calJac3/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 38517 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/calJac3/bed/tblastn.hg18KG hgLoadPsl calJac3 blastHg18KG.psl # check coverage featureBits calJac3 blastHg18KG # 18646943 bases of 332311746 (5.611%) in intersection featureBits calJac3 blastHg18KG ensGene -enrichment # blastHg18KG 5.611%, ensGene 9.520%, both 4.993%, cover 88.97%, enrich 9.35x rm -rf blastOut #end tblastn ############################################################################## # papHam1 Baboon LASTZ/CHAIN/NET (DONE - 2010-02-15 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/calJac3/bed/lastzPapHam1.2010-02-15 cd /hive/data/genomes/calJac3/bed/lastzPapHam1.2010-02-15 cat << '_EOF_' > DEF # baboon vs. marmoset # same paramters as human hg19 vs marmoset calJac3 BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Marmoset (calJac3) SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit SEQ1_LEN=/scratch/data/calJac3/chrom.sizes SEQ1_LIMIT=50 SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Baboon papHam1 SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit SEQ2_LEN=/scratch/data/papHam1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/hive/data/genomes/calJac3/bed/lastzPapHam1.2010-02-15 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF \ -verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # failed lastz run, finished manually # real 287m24.258s time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -continue=cat \ -verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > cat.log 2>&1 & # real 158m17.502s cat fb.calJac3.chainPapHam1Link.txt # 1928203329 bases of 2752505800 (70.053%) in intersection time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ calJac3 papHam1 > rbest.log 2>&1 # real 232m mkdir /hive/data/genomes/papHam1/bed/blastz.calJac3.swap cd /hive/data/genomes/papHam1/bed/blastz.calJac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/calJac3/bed/lastzPapHam1.2010-02-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 791m46.765s cat fb.papHam1.chainCalJac3Link.txt # 1908519637 bases of 2741867288 (69.607%) in intersection ############################################################################## # tarSyr1 Tarsier LASTZ/CHAIN/NET (DONE - 2010-02-21 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-21 cd /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-21 cat << '_EOF_' > DEF # tarsier vs. marmoset # same paramters as human hg19 vs tarsier tarSyr1 BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Marmoset (calJac3) SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit SEQ1_LEN=/scratch/data/calJac3/chrom.sizes SEQ1_LIMIT=50 SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Tarsier tarSyr1 SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-21 TMPDIR=/scratch/tmp TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl \ `pwd`/DEF \ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \ > do.log 2>&1 & cat fb.calJac3.chainTarSyr1Link.txt # 1286219755 bases of 2752505800 (46.729%) in intersection time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ calJac3 tarSyr1 > rbest.log 2>&1 & # real 532m mkdir /hive/data/genomes/tarSyr1/bed/blastz.calJac3.swap cd /hive/data/genomes/tarSyr1/bed/blastz.calJac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 120m42.991s cat fb.tarSyr1.chainCalJac3Link.txt # 2030475813 bases of 2752505800 (73.768%) in intersection ##################################################################### # micMur1 Mouse lemur LASTZ/CHAIN/NET (DONE - 2010-02-17,22 - Hiram) # Mouse lemur ( Microcebus murinus) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17 cd /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17 cat << '_EOF_' > DEF # mouse lemur vs. marmoset # same paramters as human hg19 vs Mouse lemur micMur1 BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Marmoset (calJac3) SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit SEQ1_LEN=/scratch/data/calJac3/chrom.sizes SEQ1_LIMIT=5 SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Mouse lemur micMur1 SEQ2_DIR=/hive/data/genomes/micMur1/micMur1.2bit SEQ2_LEN=/hive/data/genomes/micMur1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 5502m6.707s # some kluster difficulties, finished cat run manually, then: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=chainRun `pwd`/DEF \ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \ > chainRun.log 2>&1 & # real 374m19.587s calJac3 micMur1 02-17 cat fb.calJac3.chainMicMur1Link.txt # 1258616069 bases of 2752505800 (45.726%) in intersection time doRecipBest.pl -buildDir=`pwd` calJac3 micMur1 > rbest.log 2>&1 # real 235m55.179s calJac3 micMur1 mkdir /hive/data/genomes/micMur1/bed/blastz.calJac3.swap cd /hive/data/genomes/micMur1/bed/blastz.calJac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 455m11.215s micMur1 calJac3 swap cat fb.micMur1.chainCalJac3Link.txt # 1243785262 bases of 1852394361 (67.145%) in intersection ##################################################################### # otoGar1 Bushbaby LASTZ/CHAIN/NET (DONE - 2010-02-17,22 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17 cd /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17 cat << '_EOF_' > DEF # bushbaby vs. marmoset # same paramters as human hg19 vs Bushbaby otoGar1 BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Marmoset (calJac3) SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit SEQ1_LEN=/scratch/data/calJac3/chrom.sizes SEQ1_LIMIT=5 SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Bushbaby otoGar1 SEQ2_DIR=/scratch/data/otoGar1/otoGar1.rmsk.2bit SEQ2_LEN=/scratch/data/otoGar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 4722m38.163s # memk failed at the cat run, finish it manually, then, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=chainRun `pwd`/DEF \ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \ > chainRun.log 2>&1 & # real 285m58.314s cat fb.calJac3.chainOtoGar1Link.txt # 1176505967 bases of 2752505800 (42.743%) in intersection time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ calJac3 otoGar1 > rbest.log 2>&1 & # real 332m14.375s calJac3 otoGar1 mkdir /hive/data/genomes/otoGar1/bed/blastz.calJac3.swap cd /hive/data/genomes/otoGar1/bed/blastz.calJac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 310m4.077s cat fb.otoGar1.chainCalJac3Link.txt # 1158531484 bases of 1969052059 (58.837%) in intersection ##################################################################### ## 8-Way Multiz (DONE - 2009-06-09,2009-11-10 - Hiram) mkdir /hive/data/genomes/calJac3/bed/multiz8way cd /hive/data/genomes/calJac3/bed/multiz8way /cluster/bin/phast/tree_doctor \ --prune-all-but=calJac1,hg19,panTro2,rheMac2,ponAbe2,mm9,canFam2,monDom5 \ --rename="calJac1 -> calJac3 " \ /hive/data/genomes/hg19/bed/multiz46way/fixedTree/46wayFixed.nh > 8way.nh # *carefully* edit 8way.nh to get calJac3 at the top of this picture # resulting in this tree: (calJac3:0.066389,((rheMac2:0.057695,(ponAbe2:0.018342, (hg19:0.006591,panTro2:0.006639):0.012126):0.014256):0.010000, (mm9:0.352605,(canFam2:0.193569,monDom5:0.581923):0.020666) :0.088210):0.000001); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/calJac3_8way.gif /cluster/bin/phast/all_dists 8way.nh > 8way.distances.txt # Use this output to create the table below, with this perl script: cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "grep -y calJac3 8way.distances.txt | sort -k3,3n|") or die "can not read 8way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($calJac3, $D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/calJac3/bed/lastz.$D/fb.calJac3." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.calJac3/fb.${D}.chainCalJac3Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; if ($swapMeasure eq "N/A") { printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist, $orgName, $D, $chainLinkMeasure, $swapMeasure } else { printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist, $orgName, $D, $chainLinkMeasure, $swapMeasure } } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainCalJac3Link # distance on calJac3 on other # 01 0.1090 - Orangutan ponAbe2 (% 71.893) (% 67.448) # 02 0.1094 - Human hg19 (% 73.768) (% 70.654) # 03 0.1094 - Chimp panTro2 (% 72.304) (% 69.302) # 04 0.1341 - Rhesus rheMac2 (% 69.625) (% 70.711) # 05 0.3688 - Dog canFam2 (% 50.766) (% 57.162) # 06 0.5072 - Mouse mm9 (% 31.310) (% 32.815) # 07 0.7572 - Opossum monDom5 (% 7.906) (% 6.174) # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 8way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # bash shell syntax here ... mkdir -p mafLinks H="/hive/data/genomes/calJac3/bed/" for S in `sed -e "s/calJac3 //" species.list` do echo $S ls -og ${H}/lastz.${S}/axtChain/calJac3.${S}.synNet.maf.gz ln -s ${H}/lastz.${S}/axtChain/calJac3.${S}.synNet.maf.gz \ mafLinks/${S}.maf.gz done HERE=`pwd` export HERE PATH=${HERE}/penn:${PATH} export PATH rm -fr tmp mkdir -p tmp cd mafLinks time ../penn/autoMZ + T=${HERE}/tmp \ E=calJac3 "`cat ../tree.nh`" *.sing.maf result.maf # real 3584m8.094s mkdir /gbdb/calJac3/multiz8way ln -s `pwd`/mafLinks/result.maf /gbdb/calJac3/multiz8way/multiz8way.maf cd /scratch/tmp # Loaded 7475045 mafs in 1 files from /gbdb/calJac3/multiz8way time nice -n +19 hgLoadMaf calJac3 multiz8way time nice -n +19 hgLoadMafSummary -minSize=30000 -mergeGap=1500 \ -maxSize=200000 calJac3 multiz8waySummary multiz8way.maf | hgLoadMafSummary calJac1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz9waySummary stdin # Created 1313222 summary blocks from 34128178 components # and 7475045 mafs from multiz8way.maf # real 8m36.016s ############################################################################## # gorGor2 Bushbaby LASTZ/CHAIN/NET (DONE - 2010-02-22,24 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-22 cd /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-22 cat << '_EOF_' > DEF # Gorilla vs. marmoset # same paramters as human hg19 vs other nearby primates # without all the extra blastz parameters BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Marmoset (calJac3) SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit SEQ1_LEN=/scratch/data/calJac3/chrom.sizes SEQ1_LIMIT=20 SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Gorilla gorGor2 SEQ2_DIR=/scratch/data/gorGor2/gorGor2.2bit SEQ2_LEN=/scratch/data/gorGor2/chrom.sizes SEQ2_CHUNK=12000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/hive/data/genomes/calJac3/bed/lastzGorGor2.2010-02-22 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # Elapsed time: 1956m3.678s cat fb.calJac3.chainGorGor2Link.txt # 2101356280 bases of 2752505800 (76.343%) in intersection time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ calJac3 gorGor2 > rbest.log 2>&1 # about 4h16m mkdir /hive/data/genomes/gorGor2/bed/blastz.calJac3.swap cd /hive/data/genomes/gorGor2/bed/blastz.calJac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/calJac3/bed/lastzGorGor2.2010-02-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 250m57.089s cat fb.gorGor2.chainCalJac3Link.txt # 2135885920 bases of 2829687208 (75.481%) in intersection ##################################################################### ## 13-Way Multiz (DONE - 2010-02-23 - Hiram) ## Re-work 13-Way Multiz: (DONE 2011-04-19 - Chin) ## See redmine track #162 for more detail mv /hive/data/genomes/calJac3/bed/multiz13way \ /hive/data/genomes/calJac3/bed/multiz13way.2010-05-23 mkdir /hive/data/genomes/calJac3/bed/multiz13way cd /hive/data/genomes/calJac3/bed/multiz13way /cluster/bin/phast/tree_doctor \ --prune-all-but=calJac1,hg19,panTro2,rheMac2,ponAbe2,gorGor1,micMur1,otoGar1,papHam1,tarSyr1,mm9,canFam2,monDom5 \ --rename="calJac1 -> calJac3 ; gorGor1 -> gorGor2 " \ /hive/data/genomes/hg19/bed/multiz46way/fixedTree/46wayFixed.nh > 13way.nh # Note: # The rearrangement was not correct, the 13way.nh produced by tree_doctor can be used directly # for the multiz alignment without distance data. The distance will be generated (by 4d) # and used in the phastCon and phyloP steps. # rearrange calJac3 to the top, get some help from tree_doctor: /cluster/bin/phast/tree_doctor --name-ancestors --reroot calJac3 \ --with-branch 13way.nh # edit out the ancestors, and move calJac3 from the bottom to # the top, resulting in this tree: (calJac3:0.066389,(((((hg19:0.006591,panTro2:0.006639):0.002184, gorGor2:0.009411):0.009942,ponAbe2:0.018342):0.014256, (rheMac2:0.036199,papHam1:0.040000):0.021496):0.010000, ((((monDom5:0.581923,canFam2:0.193569):0.020666,mm9:0.352605):0.019992, (micMur1:0.091452,otoGar1:0.128984):0.035463):0.011307, tarSyr1:0.135169):0.056911):0.000001); # more rearranging after seeing what the distance table looks like # below to get them appearing as much as possible in their # distance order top to bottom: (calJac3:0.066389,(((ponAbe2:0.018342, ((hg19:0.006591,panTro2:0.006639):0.002184, gorGor2:0.009411):0.009942):0.014256, (rheMac2:0.036199,papHam1:0.040000):0.021496):0.010000, (tarSyr1:0.135169,((micMur1:0.091452,otoGar1:0.128984):0.035463, (mm9:0.352605, (canFam2:0.193569,monDom5:0.581923):0.020666):0.019992):0.011307):0.056911) :0.000001); # Use 13way.nh directly cat 13way.nh ((((((((((hg19:0.006591,panTro2:0.006639):0.002184,gorGor2:0.009411):0.009942,ponAbe2:0.018342):0.014256,(rheMac2:0.036199,papHam1:0.040000):0.021496):0.010000,calJac3:0.066389):0.056911,tarSyr1:0.135169):0.011307,(micMur1:0.091452,otoGar1:0.128984):0.035463):0.019992,mm9:0.352605):0.020666,canFam2:0.193569):0.156024,monDom5:0.425899); # Use this specification in the phyloGif tool after changing the names: /cluster/bin/phast/tree_doctor \ --rename="calJac3 -> Marmoset ; ponAbe2 -> Orangutan ; hg19 -> Human ; panTro2 -> Chimp ; gorGor2 -> Gorilla ; rheMac2 -> Rhesus ; papHam1 -> Baboon ; tarSyr1 -> Tarsier ; micMur1 -> Mouse_lemur ; otoGar1 -> Bushbaby ; canFam2 -> Dog ; mm9 -> Mouse ; monDom5 -> Opossum " 13way.nh # use the tree provided by Bob K. for display: cat << '_EOF_' > commonNames.13way.nh ((((((Marmoset,((((Human,Chimp),Gorilla),Orangutan), (Baboon,Rhesus))),Tarsier), (MouseLemur, Bushbaby)),Mouse),Dog),Opossum); '_EOF_' # << happy emacs # use http://genome.ucsc.edu/cgi-bin/phyloGif # to generate calJac3_13way.png image and copy over to # htdocs/images/phylo/ cp calJac3_13way.png /usr/local/apache/htdocs/images/phylo # Note the old a gif image for htdocs/images/phylo/calJac3_13way.gif # is wrong, remove it # rm /usr/local/apache/htdocs/images/phylo/calJac3_13way.gif /cluster/bin/phast/all_dists 13way.nh > 13way.distances.tmp # swap the calJac3 from field 2 to field 1 cat 13way.distances.tmp | \ awk '{if ($2 == "calJac3") print $2, " " $1, $3; else print $0;}' | \ sort > 13way.distances.txt # make sure all symlinks lastz.DB -> lastzDb-date # exist here and at the swap locations, the perl script expects this # in order to find featureBits numbers. # This is for our own bookkeeping business so we can # determine what is done. If there is any zero in the table, # then, we miss some steps such as no chain/net for a calJac3 to hg19. # Use 13way.distances.txt to create the table below # with this perl script: cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "grep -y calJac3 13way.distances.txt | sort -k3,3n|") or die "can not read 13way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($calJac3, $D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/calJac3/bed/lastz.$D/fb.calJac3." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.calJac3/fb.${D}.chainCalJac3Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; if ($swapMeasure eq "N/A") { printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist, $orgName, $D, $chainLinkMeasure, $swapMeasure } else { printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist, $orgName, $D, $chainLinkMeasure, $swapMeasure } } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # 01 0.1090 - Orangutan ponAbe2 (% 71.893) (% 67.448) # 02 0.1094 - Human hg19 (% 73.768) (% 70.654) # 03 0.1094 - Chimp panTro2 (% 72.304) (% 69.302) # 04 0.1100 - Gorilla gorGor2 (% 76.343) (N/A) # 05 0.1341 - Rhesus rheMac2 (% 69.625) (% 70.711) # 06 0.1379 - Baboon papHam1 (% 70.053) (% 69.607) # 07 0.2585 - Tarsier tarSyr1 (% 46.729) (N/A) # 08 0.2615 - Mouse lemur micMur1 (% 45.726) (% 67.145) # 09 0.2991 - Bushbaby otoGar1 (% 42.743) (% 58.837) # 10 0.3688 - Dog canFam2 (% 50.766) (% 57.162) # 11 0.5072 - Mouse mm9 (% 31.310) (% 32.815) # 12 0.7572 - Opossum monDom5 (% 7.906) (% 6.174) # create species list and stripped down tree for autoMZ # Note: the 13way.nh can not be used directly becasue calJac3 is # not on top. Insted use commonNames.13way.nh provided by Bob # to create species list and stripped down tree. # sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ # 13way.nh > tmp.nh sed -e 's/Marmoset/calJac3/' -e 's/Orangutan/ponAbe2/' \ -e 's/Human/hg19/' -e 's/Chimp/panTro2/' \ -e 's/Gorilla/gorGor2/' -e 's/Rhesus/rheMac2/' \ -e 's/Baboon/papHam1/' -e 's/Tarsier/tarSyr1/' \ -e 's/MouseLemur/micMur1/' -e 's/Bushbaby/otoGar1/' \ -e 's/Dog/canFam2/' -e 's/Mouse/mm9/' \ -e 's/Opossum/monDom5/' \ commonNames.13way.nh | sed 's/ //g; s/,/ /g; s/;//g;' > tree.nh sed 's/[()]//g; s/,/ /g; s/;//g;' tree.nh > species.list # create a tree of species name with comma fo 4d step later sed -e 's/Marmoset/calJac3/' -e 's/Orangutan/ponAbe2/' \ -e 's/Human/hg19/' -e 's/Chimp/panTro2/' \ -e 's/Gorilla/gorGor2/' -e 's/Rhesus/rheMac2/' \ -e 's/Baboon/papHam1/' -e 's/Tarsier/tarSyr1/' \ -e 's/MouseLemur/micMur1/' -e 's/Bushbaby/otoGar1/' \ -e 's/Dog/canFam2/' -e 's/Mouse/mm9/' \ -e 's/Opossum/monDom5/' \ commonNames.13way.nh | sed 's/ //g; s/;//g;' > tree-commas.nh # collect the single whole mafs into one place for splitting: mkdir singleMafs cd singleMafs ln -s ../../lastz.ponAbe2/axtChain/calJac3.ponAbe2.synNet.maf.gz . ln -s ../../lastz.hg19/axtChain/calJac3.hg19.synNet.maf.gz . ln -s ../../lastz.panTro2/axtChain/calJac3.panTro2.synNet.maf.gz . ln -s ../../lastz.gorGor2/axtChain/calJac3.gorGor2.synNet.maf.gz . ln -s ../../lastz.rheMac2/axtChain/calJac3.rheMac2.synNet.maf.gz . ln -s ../../lastz.papHam1/mafRBestNet/calJac3.papHam1.rbest.maf.gz . ln -s ../../lastz.tarSyr1/mafRBestNet/calJac3.tarSyr1.rbest.maf.gz . ln -s ../../lastz.micMur1/mafRBestNet/calJac3.micMur1.rbest.maf.gz . ln -s ../../lastz.otoGar1/mafRBestNet/calJac3.otoGar1.rbest.maf.gz . ln -s ../../lastz.mm9/axtChain/calJac3.mm9.synNet.maf.gz . ln -s ../../lastz.canFam2/axtChain/calJac3.canFam2.synNet.maf.gz . ln -s ../../lastz.monDom5/axtChain/calJac3.monDom5.synNet.maf.gz . mkdir /hive/data/genomes/calJac3/bed/multiz13way/splitMaf cd /hive/data/genomes/calJac3/bed/multiz13way/splitMaf for D in tarSyr1 papHam1 otoGar1 micMur1 do mkdir ${D} mafSplit -useHashedName=8 -byTarget /dev/null ${D}/ \ ../singleMafs/calJac3.${D}.rbest.maf.gz done for D in gorGor2 ponAbe2 hg19 panTro2 rheMac2 mm9 canFam2 monDom5 do mkdir ${D} mafSplit -useHashedName=8 -byTarget /dev/null ${D}/ \ ../singleMafs/calJac3.${D}.synNet.maf.gz done cd /hive/data/genomes/calJac3/bed/multiz13way mkdir run mkdir penn cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn cd /hive/data/genomes/calJac3/bed/multiz13way/run # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = calJac3 set topDir = /hive/data/genomes/$db/bed/multiz13way set c = $1 set result = $2 set pennBin = $topDir/penn set run = `/bin/pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = $topDir/splitMaf /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p $topDir/tree.nh $topDir/species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/^$db //" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($pennBin $path); rehash $pennBin/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/calJac3/bed/multiz13way/run/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs ssh swarm cd /hive/data/genomes/calJac3/bed/multiz13way/run mkdir maf find ../splitMaf -type f | grep "/[0-9][0-9][0-9].maf" \ | xargs -L 1 basename | sort -u > chr.part.list gensub2 chr.part.list single template jobList para -ram=8g create jobList # para try check time push stuff # put the split mafs back together into a single result # Completed: 256 of 256 jobs # CPU time in finished jobs: 312697s 5211.62m 86.86h 3.62d 0.010 y # IO & Wait Time: 23882s 398.03m 6.63h 0.28d 0.001 y # Average job time: 1315s 21.91m 0.37h 0.02d # Longest finished job: 25624s 427.07m 7.12h 0.30d # Submission to last job: 25750s 429.17m 7.15h 0.30d ssh hgwdev cd /hive/data/genomes/calJac3/bed/multiz13way/run head -q -n 1 maf/000.maf > calJac3.13way.maf for F in maf/*.maf do grep -h -v "^#" ${F} >> calJac3.13way.maf done tail -q -n 1 maf/000.maf >> calJac3.13way.maf # -rw-rw-r-- 1 30553880367 Apr 21 09:40 calJac3.13way.maf #split the /hive/data/genomes/calJac3/bed/multiz13way/run/calJac3.13way.maf # to other folder with fullName for phastCons # load into database ssh hgwdev mkdir -p /gbdb/calJac3/multiz13way/maf cd /hive/data/genomes/calJac3/bed/multiz13way/run ln -s `pwd`/calJac3.13way.maf \ /gbdb/calJac3/multiz13way/maf/multiz13way.maf # this generates an immense multiz13way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/calJac3/multiz13way/maf calJac3 multiz13way # Indexing and tabulating /gbdb/calJac3/multiz13way/maf/multiz13way.maf # Loading multiz13way into database # Loaded 13258003 mafs in 1 files from /gbdb/calJac3/multiz13way/maf # real 16m48.119s # load summary table time nice -n +19 cat /gbdb/calJac3/multiz13way/maf/*.maf \ | hgLoadMafSummary calJac3 -minSize=30000 -verbose=2 \ -mergeGap=1500 -maxSize=200000 multiz13waySummary stdin # Indexing and tabulating stdin # Created 2327841 summary blocks from 99934514 components # and 13258003 mafs from stdin # Loading into calJac3 table multiz13waySummary... # Loading complete # real 16m7.795s wc -l multiz13way*.tab # 13258003 multiz13way.tab # 2327841 multiz13waySummary.tab # 15585844 total rm multiz13way*.tab ############################################################################ # GAP ANNOTATE MULTIZ13WAY MAF AND LOAD TABLES (DONE 2011-05-04 - Chin) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. # use the /hive/data/genomes/calJac3/bed/multiz13way/run/calJac3.13way.maf # to split cd /hive/data/genomes/calJac3/bed/multiz13way/ mkdir mafSplit cd mafSplit mafSplit -byTarget -useFullSequenceName /dev/null . ../run/calJac3.13way.maf # Splitting 1 files by target sequence -- ignoring first argument # /dev/null ls -lt | wc -l # 10348 # got 10348 mafs named after their chrom/scaff .maf # although there are over 14205 chroms and scaffolds (wc -l # chrom.sizes), # some are too small or have nothing aligning. mkdir /hive/data/genomes/calJac3/bed/multiz13way/anno cd /hive/data/genomes/calJac3/bed/multiz13way/anno # most of these will already exist from previous multiple # alignments # remove the echo from in front of the twoBitInfo command to get # them # to run if this loop appears to be correct for DB in `cat ../species.list` do CDIR="/hive/data/genomes/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done for DB in `sed -e "s/calJac3 //" ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL mkdir mafSplit for F in ../mafSplit/*.maf do B=`basename ${F}` mafAddIRows -nBeds=nBeds ${F} /hive/data/genomes/calJac3/calJac3.2bit \ mafSplit/${B} echo $B done # about 5h 20m # combine into one file head -q -n 1 mafSplit/chr1.maf > calJac3.13way.maf for F in mafSplit/*.maf do grep -h -v "^#" ${F} done >> calJac3.13way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 mafSplit/chr1.maf >> calJac3.13way.maf # How about an official end marker: echo "##eof maf" >> calJac3.13way.maf # Load into database # by loading this into the table multiz13waySummary, it will # replace # the previously loaded table with the unannotated mafs rm /gbdb/calJac3/multiz13way/multiz13way.maf # remove old symlink ln -s `pwd`/calJac3.13way.maf /gbdb/calJac3/multiz13way/multiz13way.maf cd /scratch/tmp time nice -n +19 hgLoadMaf calJac3 multiz13way # Indexing and tabulating /gbdb/calJac3/multiz13way/multiz13way.maf # Loading multiz13way into database # Loaded 14067989 mafs in 1 files from /gbdb/calJac3/multiz13way # real 17m32.053s time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 calJac3 multiz13waySummary \ /gbdb/calJac3/multiz13way/multiz13way.maf # Indexing and tabulating /gbdb/calJac3/multiz13way/multiz13way.maf # Created 2327841 summary blocks from 99934514 components and # 14067989 mafs from /gbdb/calJac3/multiz13way/multiz13way.maf # Loading into calJac3 table multiz13waySummary... # Loading complete # real 21m4.436s # by loading this into the table multiz13waySummary, it will # replace # the previously loaded table with the unannotated mafs # remove the multiz13way*.tab files in this /scratch/tmp directory wc -l multiz13way*.tab # 14067989 multiz13way.tab rm multiz13way*.tab ##################################################################### # all.joiner update, downloads and in pushQ - (DONE - 2010-04-01 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=calJac3 -all all.joiner mkdir /hive/data/genomes/calJac3/goldenPath cd /hive/data/genomes/calJac3/goldenPath time nice -n +19 makeDownloads.pl calJac3 > do.log 2>&1 # real 22m30.329s # now ready for pushQ entry mkdir /hive/data/genomes/calJac3/pushQ cd /hive/data/genomes/calJac3/pushQ time nice -n +19 makePushQSql.pl calJac3 > calJac3.pushQ.sql 2> stderr.out real 2m52.193s # check for errors in stderr.out, some are OK, e.g.: # WARNING: calJac3 does not have seq # WARNING: calJac3 does not have extFile # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # bosTau4ChainPileUp # copy it to hgwbeta scp -p calJac3.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < calJac3.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly ############################################################################ # ctgPos2 track - showing clone sequence locations on chromosomes # (DONE - 2010-04-01 - Hiram) mkdir /hive/data/genomes/calJac3/bed/ctgPos2 cd /hive/data/genomes/calJac3/bed/ctgPos2 cat << '_EOF_' > agpToCtgPos2.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n"; exit 255; } my $agpFile = shift; open (FH, "<$agpFile") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my @a = split('\s+', $line); next if ($a[4] =~ m/^N$/); my $chrSize = $a[2]-$a[1]+1; my $ctgSize = $a[7]-$a[6]+1; die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize); printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4]; } close (FH); '_EOF_' # << happy emacs export S=../../genbank/Primary_Assembly/assembled_chromosomes/AGP cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` zcat ${S}/AGP/chr${C}.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab hgLoadSqlTab calJac3 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab ############################################################################ # N-SCAN gene predictions (nscanGene) - (2010-04-26 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/calJac3/bed/nscan wget -nv http://mblab.wustl.edu/predictions/marmoset/calJac3/readme.txt wget -nv http://mblab.wustl.edu/predictions/marmoset/calJac3/calJac3.prot.fa wget -nv http://mblab.wustl.edu/predictions/marmoset/calJac3/calJac3.gtf gzip calJac3.* chmod a-w * # load track gtfToGenePred -genePredExt calJac3.gtf.gz stdout | hgLoadGenePred -genePredExt calJac3 nscanGene stdin hgPepPred calJac3 generic nscanPep calJac3.prot.fa.gz rm *.tab # marmoset/calJac3/trackDb.ra, add: track nscanGene override informant Marmoset N-SCAN used mouse (mm9) as the informant for conservation. PASA clustered human ESTs were mapped to the marmoset genome with the use of the human/marmoset synteny chain track. These remapped clusters were then filtered for splice sites, and used as the EST track on marmoset. # veryify top-level search spec, should produce no results: hgsql -Ne 'select name from nscanGene' calJac3 | egrep -v '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |head ############################################################################ # Fixing references to old calJac1 date to be to March 2009 - (2010-05-12 mary) # trackDb cd /cluster/home/mary/kent/src/hg/makeDb/trackDb/marmoset find -name "*.html" | xargs grep 2007 # downloads cd /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3 find -name "*" | xargs grep 2007 # hgcentral hgsql hgcentraltest update dbDb set description="March 2009 (WUGSC 3.2/calJac3)" where name="calJac3" ############################################################################ ## Annotate 13-way multiple alignment with gene annotations ## (DONE - 2010-05-17 - Chin) ## use ensGene for calJac3 (DONE - 2010-07-22 - Chin) ## Re-work (DONE 2011-05-05 - Chin) # Gene frames ## survey all genomes to see what type of gene track to use ssh hgwdev mkdir /hive/data/genomes/calJac3/bed/multiz13way/frames cd /hive/data/genomes/calJac3/bed/multiz13way/frames # # survey all the genomes to find out what kinds of gene tracks # they have cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene" || $table == "xenoRefGene" || $table == "nscanGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql calJac3 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql calJac3 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh ./showGenes.csh # calJac3: ensGene: 55137, nscanGene: 28723, refGene: 53, xenoRefGene: 252282, Mrnas: 3789 # hg19: ensGene: 167071, knownGene: 77614, mgcGenes: 31354, nscanGene: 67448, refGene: 38868, xenoRefGene: 136935, Mrnas: 295362 # panTro2: ensGene: 41488, nscanGene: 22251, refGene: 1457, xenoRefGene: 233381, Mrnas: 2257 # gorGor2: xenoRefGene: 240083, Mrnas: 1 # ponAbe2: ensGene: 31566, nscanGene: 23499, refGene: 3549, xenoRefGene: 241018, Mrnas: 0 # papHam1: xenoRefGene: 267869, Mrnas: 71 # rheMac2: ensGene: 42820, nscanGene: 22003, refGene: 2563, xenoRefGene: 303624, Mrnas: 7297 # tarSyr1: ensGene: 45832, Mrnas: 8 # micMur1: ensGene: 37458, Mrnas: 53 # otoGar1: ensGene: 36560, Mrnas: 1 # mm9: ensGene: 93805, knownGene: 55419, mgcGenes: 26707, nscanGene: 21895, refGene: 28429, xenoRefGene: 132694, Mrnas: 261336 # canFam2: ensGene: 30914, nscanGene: 24615, refGene: 1205, xenoRefGene: 225414, Mrnas: 3476 # monDom5: ensGene: 35113, nscanGene: 45946, refGene: 467, xenoRefGene: 228506, Mrnas: 1719 # rearrange that output to create 3 sections: #1. knownGenes: hg19 mm9 #2. ensGene: calJac3,ponAbe2 panTro2 rheMac2 tarSyr1 micMur1 otoGar1 canFam2 monDom5 #3. xenoRefGene: papHam1 gorGor2 mkdir genes # knownGene echo "hg19 mm9" \ | sed -e "s/ */ /g" > knownGene.list for DB in hg19 mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done echo "calJac3 ponAbe2 panTro2 rheMac2 tarSyr1 micMur1 otoGar1 canFam2 monDom5" \ | sed -e "s/ */ /g" > ensGene.list # ensGene for DB in calJac3 ponAbe2 panTro2 rheMac2 tarSyr1 micMur1 otoGar1 canFam2 monDom5 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done echo "papHam1 gorGor2" > xenoRef.list # xenoRefGene for DB in papHam1 gorGor2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/calJac3.gp.gz: 20843 # genes/canFam2.gp.gz: 19245 # genes/gorGor2.gp.gz: 22444 # genes/hg19.gp.gz: 20597 # genes/micMur1.gp.gz: 16240 # genes/mm9.gp.gz: 20905 # genes/monDom5.gp.gz: 19188 # genes/otoGar1.gp.gz: 15388 # genes/panTro2.gp.gz: 19757 # genes/papHam1.gp.gz: 26534 # genes/ponAbe2.gp.gz: 19895 # genes/rheMac2.gp.gz: 21049 # genes/tarSyr1.gp.gz: 13617 ssh hgwdev cd /hive/data/genomes/calJac3/bed/multiz13way/frames cat ../anno/calJac3.13way.maf \ | nice -n +19 genePredToMafFrames calJac3 stdin stdout \ `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \ | gzip > multiz13way.mafFrames.gz # verify there are frames on everything: zcat multiz13way.mafFrames.gz | awk '{print $4}' | sort | uniq -c # 200715 calJac3 # 232951 canFam2 # 220393 gorGor2 # 205490 hg19 # 205084 micMur1 # 229058 mm9 # 200391 monDom5 # 212293 otoGar1 # 202695 panTro2 # 212325 papHam1 # 205185 ponAbe2 # 202823 rheMac2 # 157578 tarSyr1 ssh hgwdev cd /hive/data/genomes/calJac3/bed/multiz13way/frames time hgLoadMafFrames calJac3 multiz13wayFrames multiz13way.mafFrames.gz # real 0m28.294s # check it: hgsql -N -e "select src from multiz13wayFrames;" calJac3 \ | sort | uniq -c | grep calJac3 # 200715 calJac3 # enable the trackDb entries: # frames multiz13wayFrames # irows on ############################################################################# ## create upstream refGene maf files mkdir /hive/data/genomes/calJac3/bed/multiz13way/downloads/maf cd /hive/data/genomes/calJac3/bed/multiz13way/downloads/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits calJac3 xenoRefGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags calJac3 multiz13way \ stdin stdout \ -orgs=/hive/data/genomes/calJac3/bed/multiz13way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done # making upstream1000.maf # done upstream1000.maf.gz # making upstream2000.maf # done upstream2000.maf.gz # making upstream5000.maf # done upstream5000.maf.gz # maf files cp -p /hive/data/genomes/calJac3/bed/multiz13way/run/calJac3.13way.maf . gzip calJac3.13way.maf # mv up one level to downloads mv *.* ../ cd .. rm -r maf # download stuff cd /hive/data/genomes/calJac3/bed/multiz13way cp 13way.nh 13way.nh.original cp tree.nh 13way.nh cp 13way.nh downloads/. cp commonNames.13way.nh downloads/. cd downloads # create /hive/data/genomes/calJac3/bed/multiz13way/downloads/README.txt md5sum * > md5sum.txt mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/multiz13way cd /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/multiz13way ln -s /hive/data/genomes/calJac3/bed/multiz13way/downloads/*.gz . ln -s /hive/data/genomes/calJac3/bed/multiz13way/downloads/*.nh . ln -s /hive/data/genomes/calJac3/bed/multiz13way/downloads/README.txt . ln -s /hive/data/genomes/calJac3/bed/multiz13way/downloads/md5sum.txt . ######################################################################### # Phylogenetic tree from 13-way (DONE 2010-05-28 - Chin) # Re-work (DONE 2011-04-25 - Chin) # We need one tree for all chroms # use the /hive/data/genomes/calJac3/bed/multiz13way/run/calJac3.13way.maf # to split cd /hive/data/genomes/calJac3/bed/multiz13way/ mkdir mafSplit cd mafSplit mafSplit -byTarget -useFullSequenceName /dev/null . ../run/calJac3.13way.maf # Splitting 1 files by target sequence -- ignoring first argument /dev/null ls -lt | wc -l # 10348 # got 10348 mafs named after their chrom/scaff .maf # although there are over 14205 chroms and scaffolds (wc -l # chrom.sizes), # some are too small or have nothing aligning. mkdir /hive/data/genomes/calJac3/bed/multiz13way/4d cd /hive/data/genomes/calJac3/bed/multiz13way/4d # using ensGene for calJac3, only transcribed genes and nothing # from the randoms and other misc. hgsql calJac3 -Ne \ "select * from ensGene WHERE cdsEnd > cdsStart;" | cut -f 2-20 \ | egrep -E -v "chrM|random|chrUn" \ > ensGene.gp # verify chromosome selection, should just be the ordinary chroms: cut -f2 ensGene.gp | sort | uniq -c wc -l *.gp # 42677 ensGene.gp genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp wc -l ensGeneNR.gp # 19912 ensGeneNR.gp ssh memk mkdir /hive/data/genomes/calJac3/bed/multiz13way/4d/run cd /hive/data/genomes/calJac3/bed/multiz13way/4d/run mkdir ../mfa # whole chrom mafs version, using new version of # uses memory-efficient version of phast, from Melissa Hubisz at Cornell # mjhubisz at gmail.com cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/calJac3/bed/multiz13way" set c = $1 set infile = $r/mafSplit/$2 set outfile = $3 cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/ensGene.gp > $c.gp.tmp sed "s/\t$c\t/\tcalJac3.$c\t/g" $c.gp.tmp > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp.tmp $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/calJac3/bed/multiz13way/mafSplit/*.maf | \ egrep -E -v "chrM|random|chrUn" \ | sed -e "s#.*multiz13way/mafSplit/##" \ > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template stdout | tac > jobList para create jobList para try ... check ... push ... etc para time # Completed: 24 of 24 jobs # CPU time in finished jobs: 2913s 48.55m 0.81h 0.03d 0.000 y # IO & Wait Time: 187s 3.12m 0.05h 0.00d 0.000 y # Average job time: 129s 2.15m 0.04h 0.00d # Longest finished job: 238s 3.97m 0.07h 0.00d # Submission to last job: 421s 7.02m 0.12h 0.00d # combine mfa files ssh hgwdev cd /hive/data/genomes/calJac3/bed/multiz13way/4d # but first clean out junk 1-byte leftovers from above process. # Only 24 (real chrom) out of 2623 files have real data. cd mfa find -type f -size 1c | xargs -iX rm X find -type f -size 0c | xargs -iX rm X cd /hive/data/genomes/calJac3/bed/multiz13way/4d #want comma-less species.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa # >calJac3 # >hg19 # >panTro2 # >gorGor2 # >ponAbe2 # >papHam1 # >rheMac2 # >tarSyr1 # >micMur1 # >otoGar1 # >mm9 # >canFam2 # >monDom5 # cat ../tree-commas.nh # ((((((calJac3,((((hg19,panTro2),gorGor2),ponAbe2),(papHam1,rheMac2))),tarSyr1),(micMur1,otoGar1)),mm9),canFam2),monDom5) # use phyloFit to create tree model (output is phyloFit.mod) /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ../tree-commas.nh 4d.all.mfa # Reading alignment from 4d.all.mfa ... # Extracting sufficient statistics ... # Compacting sufficient statistics ... # Fitting tree model to 4d.all.mfa using REV ... # Writing model to phyloFit.mod ... # Done. ############################################################################# # phastCons 13-way (DONE 2010-08-11 - Chin) # Re-work with phast suite 2011-12-30 version (DONE 2011-04-27 - Chin) # split 13way mafs into 10M chunks and generate sufficient # statistics # files for # phastCons ssh swarm mkdir -p /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split/ss cd /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split mkdir done cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/calJac3/bed/multiz13way/mafSplit/$c.maf set WINDOWS = /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ done/$(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../mafSplit \ | sed -e "s/.maf//; s/calJac3_//" > maf.list gensub2 maf.list single template jobList para -ram=8g create jobList para try ... check ... etc # Completed: 10347 of 10347 jobs # CPU time in finished jobs: 3233s 53.88m 0.90h 0.04d 0.000 y # IO & Wait Time: 48949s 815.82m 13.60h 0.57d 0.002 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest finished job: 323s 5.38m 0.09h 0.00d # Submission to last job: 574s 9.57m 0.16h 0.01d # some scaffolds were too small to produce output. # expected 10769, found 4749 cd /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split/ss find -type f | wc -l # 4913 cd ../../done rm *.done # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh swarm mkdir -p /hive/data/genomes/calJac3/bed/multiz13way/cons/run.cons cd /hive/data/genomes/calJac3/bed/multiz13way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working # directory # $cwd:t which is the "grp" in this script. It is # all cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set cX = $1:r set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/calJac3/bed/multiz13way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons set useGrp = "$grp.mod" ln -s $ssSrc/msa.split/ss/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs ls -1S ../msa.split/ss/*/* | sed -e 's/.ss$//' > ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/calJac3/bed/multiz13way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/phyloFit.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. # Completed: 4913 of 4913 jobs # CPU time in finished jobs: 7560s 126.01m 2.10h 0.09d 0.000 y # IO & Wait Time: 44348s 739.13m 12.32h 0.51d 0.001 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest finished job: 57s 0.95m 0.02h 0.00d # Submission to last job: 295s 4.92m 0.08h 0.00d # create Most Conserved track ssh hgwdev cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all cat bed/*/*.bed | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time nice -n +19 hgLoadBed calJac3 phastConsElements13way mostConserved.bed # Reading mostConserved.bed # Loaded 1531649 elements of size 5 # Sorted # Creating table definition for phastConsElements13way # Saving bed.tab # Loading calJac3 # # real 0m12.260s # Try for 5% overall cov, and 70% CDS cov featureBits calJac3 -enrichment ensGene:cds phastConsElements13way # ensGene:cds 1.221%, phastConsElements13way 4.704%, #both 0.845%, cover 69.19%, enrich 14.71x # hg19 for comparison # refGene:cds 1.196%, phastConsElements46way 5.065%, # both 0.888%, cover 74.22%, enrich 14.65x # ensGene:cds 1.278%, phastConsElements46way 5.065%, # both 0.910%, cover 71.23%, enrich 14.06x # knownGene:cds 1.252%, phastConsElements46way 5.065%, # both 0.905%, cover 72.29%, enrich 14.27x # Create merged posterier probability file and wiggle track data # files cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all mkdir downloads find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phastCons13way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons13way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/calJac3/chrom.sizes \ phastCons13way.bw) > bigWig.log 2>&1 & grep real bigWig.log # real 28m26.948s grep VmPeak bigWig.log # pid=28269: VmPeak: 31261764 kB bigWigInfo phastCons13way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 4,004,563,748 # primaryIndexSize: 84,708,528 # zoomLevels: 10 # chromCount: 4648 # basesCovered: 2,302,681,633 # mean: 0.118453 # min: 0.000000 # max: 1.000000 # std: 0.238431 # encode those files into wiggle data time (zcat downloads/phastCons13way.wigFix.gz \ | wigEncode stdin phastCons13way.wig phastCons13way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 10m35.994s du -hsc *.wi? # 2.2G phastCons13way.wib # 255M phastCons13way.wig # 2.4G total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons13way.wib /gbdb/calJac3/multiz13way/phastCons13way.wib nice hgLoadWiggle -pathPrefix=/gbdb/calJac3/multiz13way calJac3 \ phastCons13way phastCons13way.wig # Connected to database calJac3 for track phastCons13way # Creating wiggle table definition in calJac3.phastCons13way # Saving wiggle.tab # Loading calJac3 # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh calJac3 phastCons13way ## db.table min max mean count sumData stdDev viewLimits # calJac3.phastCons13way 0 1 0.118453 2302681633 2.7276e+08 0.238431 # viewLimits=0:1 # Create histogram to get an overview of all the data hgWiggle -doHistogram -db=calJac3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons13way >& histogram.data # create plot of histogram: #orig set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff cat << '_EOF_' | gnuplot > histo.png set terminal png small size 1000,600 x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Marmoset calJac3 Histogram phastCons13way track" set xlabel " phastCons13way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # download stuff mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/phastCons13way cd /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/phastCons13way ln -s /hive/data/genomes/calJac3/bed/multiz13way/cons/all/all.mod . ln -s /hive/data/genomes/calJac3/bed/multiz13way/cons/all/histo.png . ln -s /hive/data/genomes/calJac3/bed/multiz13way/cons/all/README.txt . ln -s /hive/data/genomes/calJac3/bed/multiz13way/cons/all/downloads/phastCons13way.wigFix.gz . md5sum phastCons13way.wigFix.gz > md5sum.txt md5sum all.mod >> md5sum.txt md5sum histo.png >> md5sum.txt md5sum README.txt >> md5sum.txt ############################################################################# # Quality track (DONE - 2010-07-14 - Hiram) mkdir -p /hive/data/genomes/calJac3/bed/qual/genbank cd /hive/data/genomes/calJac3/bed/qual/genbank cd /hive/data/genomes/calJac3/bed/qual grep "^chr" ../../calJac3.agp \ | egrep -v "contig|fragment|telo|centrom|clone" \ | awk '{print $6}' | sort -u > contig.name.list cd genbank wget --timestamping \ ftp://ftp.ncbi.nih.gov/genbank/wgs/wgs.ACFV.*.qscore.gz zcat wgs.ACFV.*.qscore.gz | grep "^>" | sed -e "s/^>gb|//; s/|.*//" \ sort -u > contig.list wc -l ../contig.name.list contig.list # 201514 ../contig.name.list # 202483 contig.list # not all the contigs have quality scores here, but almost all: comm -12 ../contig.name.list contig.list | wc -l # 201433 cat << '_EOF_' > extractContigs.pl #!/usr/bin/env perl use strict; use warnings; sub usage() { printf STDERR "usage: ./ctContigs.pl ../contig.name.list > calJac3.qual.fa\n"; printf STDERR "after collecting the contig names from the given list\n"; printf STDERR "this is going to read wgs.ACFV.*.qscore.gz\n"; printf STDERR "and output any fasta elements that match the name list\n"; } my $argc = scalar(@ARGV); if ($argc != 1) { usage; exit 255; } my $file = shift; my %ctgNames; my $readNames = 0; open (FH, "<$file") or die "can not open $file"; while (my $line = ) { chomp $line; $ctgNames{$line} = 1; ++$readNames; } close (FH); printf STDERR "read $readNames from $file\n"; my $ctgCount = 0; foreach my $key (keys %ctgNames) { ++$ctgCount; } printf STDERR "found $ctgCount contig names in $file\n"; open (NF,">not.found.list") or die "can not write to not.found.list"; my $notFound = 0; my $ctgsDone = 0; for (my $i = 1; $i < 15; ++$i) { my $fname = sprintf("wgs.ACFV.%d.qscore.gz", $i); my $goodElement = 0; open (FH, "zcat $fname|") or die "can not read $fname"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; my $name = $line; $name =~ s/^>gb\|//; $name =~ s/\|.*//; # printf STDERR "name: %s\n", $name; if (exists($ctgNames{$name})) { $goodElement = 1; printf ">%s\n", $name; ++$ctgsDone; } else { printf NF "%s:%s\n", $name, $line; ++$notFound; $goodElement = 0; } } elsif ($goodElement > 0) { $line =~ s/^\s+//; printf "%s", $line; } } close (FH); } close(NF); printf STDERR "found $ctgsDone during processing\n"; printf STDERR "not found $notFound during processing\n"; '_EOF_' # << happy emacs chmod +x ./extractContigs.pl time ./extractContigs.pl ../contig.name.list \ | gzip -c > ../calJac3.contigs.qual.fa.gz # read 201514 from ../contig.name.list # found 201514 contig names in ../contig.name.list # found 201433 during processing # not found 1050 during processing cd .. time qaToQac calJac3.contigs.qual.fa.gz calJac3.contigs.qual.qac # real 1m28.846s time qacAgpLift -mScore=0 ../../calJac3.agp \ calJac3.contigs.qual.qac calJac3.qual.qac # real 1m16.994s time qacToWig -fixed calJac3.qual.qac stdout | gzip -c \ > calJac3.qual.wigFix.gz # real 7m21.203s # avoid memory size problem sizeG=50000000 export sizeG ulimit -d $sizeG ulimit -v $sizeG time wigToBigWig calJac3.qual.wigFix.gz ../../chrom.sizes calJac3.qual.bw # real 29m7.143s bigWigInfo -minMax calJac3.qual.bw # 0.000000 97.000000 bigWigInfo calJac3.qual.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 587,271,637 # primaryIndexSize: 91,661,572 # zoomLevels: 10 # chromCount: 14205 # basesCovered: 2,914,958,544 # mean: 87.707709 # min: 0.000000 # max: 97.000000 # std: 25.751106 mkdir /gbdb/calJac3/bbi ln -s `pwd`/calJac3.qual.bw /gbdb/calJac3/bbi hgsql calJac3 -e 'drop table if exists qualityBw; \ create table qualityBw \ (fileName varchar(255) not null); \ insert into qualityBw values ("/gbdb/calJac3/bbi/calJac3.qual.bw");' ############################################################################# # calJac3 - Marmoset - Ensembl Genes version 58 (DONE - 2010-07-20 - hiram) ssh hgwdev cd /hive/data/genomes/calJac3 cat << '_EOF_' > calJac3.ensGene.ra # required db variable db calJac3 nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=58 calJac3.ensGene.ra ssh hgwdev cd /hive/data/genomes/calJac3/bed/ensGene.58 featureBits calJac3 ensGene # 50565085 bases of 2752505800 (1.837%) in intersection ######################################################################### # phyloP conservation for 13-way (Done 2010-08-31 - Chin) # Re-Work ( DONE 2011-04-29 - Chin) # # split SS files into 1M chunks, this business needs smaller files # to complete mkdir /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP cd /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP mkdir run.split cd run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/calJac3/bed/multiz13way/mafSplit/$c.maf set WINDOWS = /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/run.split/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x doSplit.csh ls -1S -r ../../mafSplit | sed -e "s/.maf//" > maf.list cat << '_EOF_' > template #LOOP doSplit.csh $(path1) {check out exists+ done/$(path1).done} #ENDLOOP '_EOF_' # << happy emacs mkdir ss done ssh swarm cd /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/run.split gensub2 maf.list single template jobList para -ram=8g create jobList para try para push para check para time # Checking finished jobs # Completed: 10347 of 10347 jobs # CPU time in finished jobs: 3454s 57.57m 0.96h 0.04d 0.000 y # IO & Wait Time: 68218s 1136.97m 18.95h 0.79d 0.002 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest finished job: 354s 5.90m 0.10h 0.00d # Submission to last job: 654s 10.90m 0.18h 0.01d # run phyloP with --method LRT ssh swarm mkdir /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/run.phyloP cd /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/run.phyloP # Adjust model file base composition background and rate matrix to # be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.548 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.548 > all.mod cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set out = $2 set cName = $f:r:r set chrDir = $f:r set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "$cons/run.split/ss/$chrDir/$f" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $f.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$f.wigFix $out rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../run.split/ss -type f | sed -e "s/.ss$//; s#^../run.split/ss/##" \ > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(file1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/all cd /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... para time # Completed: 7411 of 7411 jobs # CPU time in finished jobs: 129126s 2152.10m 35.87h 1.49d 0.004 y # IO & Wait Time: 189014s 3150.24m 52.50h 2.19d 0.006 y # Average job time: 43s 0.72m 0.01h 0.00d # Longest finished job: 127s 2.12m 0.04h 0.00d # Submission to last job: 454s 7.57m 0.13h 0.01d ssh hgwdev cd /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/all find ./wigFix -type f \ | sed -e "s#^./##; s/\./ /g; s/-/ - /g" \ | sort -k1,1 -k2,2n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \ | wigEncode stdin phyloP13way.wig \ phyloP13way.wib > wigEncode.log 2>&1 & cat wigEncode.log # Converted stdin, upper limit 1.28, lower limit -9.33 # good test to make sure no overlapping coordinates, bigWig: # consumes massive amount of memory, in bash raise your memory # limits: ulimit -d 188743680 ulimit -v 188743680 time cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \ | wigToBigWig stdin ../../../../chrom.sizes phyloP13way.bw & # real 38m1.704s # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP13way.bw /gbdb/calJac3/bbi hgsql calJac3 -e 'drop table if exists phyloP13wayAll; \ create table phyloP13wayAll \ (fileName varchar(255) not null); \ insert into phyloP13wayAll values ("/gbdb/calJac3/bbi/phyloP13way.bw");' # loading the wiggle table: ln -s `pwd`/phyloP13way.wib /gbdb/calJac3/multiz13way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/calJac3/multiz13way calJac3 \ phyloP13wayAll phyloP13way.wig # Connected to database calJac3 for track phyloP13wayAll # Creating wiggle table definition in calJac3.phyloP13wayAll # Saving wiggle.tab # Loading calJac3 # # real 0m27.247s # create download files: cat << '_EOF_' > mkDown.csh #!/bin/csh -fe foreach F (`cat wigFile.list`) set C = $F:h:t:r cat $F | sed -e "s/__[0-9]//" >> downloads/${C}.wigFix end '_EOF_' # << happy emacs chmod +x ./mkDown.csh mkdir downloads time ./mkDown.csh # real 24m44.973s time gzip downloads/chr*.wigFix # real 30m14.304s cd downloads md5sum chr*.wigFix.gz >> md5sum.txt cd .. wigTableStats.sh calJac3 phyloP13wayAll # db.table min max mean count sumData # calJac3.phyloP13wayAll -9.328 1.283 0.0881734 2302681633 2.03035e+08 # stdDev 0.642244 # viewLimits=-3.12305:1.283 # that range is: 9.328 + 1.283 = 10.611 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.010611 -hBinCount=1000 -hMinVal=-9.32 -verbose=2 \ -db=calJac3 phyloP13wayAll > histogram.data 2>&1 # real 2m55.034s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small size 1000,600 x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Marmoset calJac3 phyloP13way track" set xlabel " phyloP13way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.04] set xrange [-2:2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # download stuff mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/phyloP13way cd /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/phyloP13way mkdir vertebrate cd vertebrate ln -s /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/all/downloads/chr*.wigFix.gz . ln -s /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/all/downloads/md5sum.txt . cd /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/phyloP13way ln -s /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/run.phyloP/all.mod . ln -s /hive/data/genomes/calJac3/bed/multiz13way/consPhyloP/README.txt md5sum all.mod > md5sum.txt md5sum README.txt >> md5sum.txt # packs tracks ############################################################################ # Add 13-Way conservation to pushQ /hive/data/genomes/calJac3/bed/multiz13way/quals/calJac3] ln -s /hive/data/genomes/calJac3/bed/qual/calJac3.qual.qac . /hive/data/genomes/calJac3/bed/multiz13way/quals/calJac3] ln -s /hive/data/genomes/calJac3/*.agp . qacAddGapIdx calJac3.agp calJac3.qual.qac calJac3.qac calJac3.qdx /hive/data/genomes/calJac3/bed/multiz13way/quals] time mafAddQRows 10way.list ../run/calJac3.13way.maf calJac3.quals.13way.maf ############################################################################ ## Fix gold/gap tables (DONE - 2010-09-22 - Hiram) # the hgGoldGapGl program has been fixed for chrom names > 16 characters mkdir /hive/data/genomes/calJac3/bed/goldGap cd /hive/data/genomes/calJac3/bed/goldGap # record table status before loading anything: hgsql -e "show table status;" calJac3 > before.status # verify SQL create statements with -noLoad option: hgGoldGapGl -noLoad -verbose=2 -noGl \ calJac3 /hive/data/genomes/calJac3/calJac3.agp # if that looks OK, load the two tables: hgGoldGapGl -verbose=2 -noGl \ calJac3 /hive/data/genomes/calJac3/calJac3.agp # verify only gold and gap tables have changed hgsql -e "show table status;" calJac3 > after.status # it appears only the gold table actually changed diff before.status after.status < gap MyISAM 10 Dynamic 187214 44 8366704 > gap MyISAM 10 Dynamic 187214 44 8366704 < gold MyISAM 10 Dynamic 194974 46 9129232 > gold MyISAM 10 Dynamic 201523 47 9496340 # gold and gap tables together now cover the entire genome: featureBits -countGaps -or calJac3 gold gap # 2914958544 bases of 2914958544 (100.000%) in intersection ############################################################################ # BacEnd track (DONE - 2010-07-12 - Hiram) # data obtained from Pat Minx (pminx at watson dot wustl dot edu) # and Wes Warren (wwarren at watson dot wustl dot edu) mkdir /hive/data/genomes/calJac3/bed/bacEnds cd /hive/data/genomes/calJac3/bed/bacEnds wget --timestamping \ http://genome.wustl.edu/pub/organism/Primates/Callithrix_jacchus/end_sequences/marmoset_bac_ends.3.2.bed.gz gunzip marmoset_bac_ends.3.2.bed.gz egrep -v "^track|^browser" marmoset_bac_ends.3.2.bed \ | hgLoadBed calJac3 bacEnds stdin checkTableCoords -verboseBlocks calJac3 bacEnds wget --timestamping \ http://genome.wustl.edu/pub/organism/Primates/Callithrix_jacchus/end_sequences/marmoset_singletons.3.2.bed.gz gunzip marmoset_singletons.3.2.bed.gz sort -k1,1 -k2,2n marmoset_singletons.3.2.bed \ | hgLoadBed calJac3 bacEndCalJac3Singles stdin checkTableCoords -verboseBlocks calJac3 bacEndCalJac3Singles ############################################################################ # Create liftOver files to and from: calJac3 <-> hg18 (DONE 2011-01-10 - Chin) # original alignment on hg19: cd /hive/data/genomes/hg18/bed/lastzCalJac3.2010-12-20 # and for this swap mkdir /hive/data/genomes/calJac3/bed/blastz.hg18.swap cd /hive/data/genomes/calJac3/bed/blastz.hg18.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg18/bed/lastzCalJac3.2010-12-20/DEF \ -swap -syntenicNet \ -stop net \ -workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 48m54s cd /hive/data/genomes/calJac3/bed/ ln -s blastz.hg18.swap lastz.hg18 cd /hive/data/genomes/calJac3/bed/lastz.hg18/axtChain cp calJac3.hg18.over.chain.gz ../../liftOver/. cd /hive/data/genomes/calJac3/bed/liftOver mv calJac3.hg18.over.chain.gz calJac3ToHg18.over.chain.gz md5sum *.gz > md5sum.txt cd /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/liftOver ln -s /hive/data/genomes/calJac3/bed/liftOver/calJac3ToHg18.over.chain.gz . ############################################################################ # construct ucscToEnsembl chrom name translation (2011-04-20 - Hiram) # the Ensembl genes v62 have new chrom names, construct translation: mkdir /hive/data/genomes/calJac3/bed/ucscToEnsembl cd /hive/data/genomes/calJac3/bed/ucscToEnsembl zcat ../ensGene.62/download/Callithrix_jacchus.C_jacchus3.2.1.62.gtf.gz \ | awk '{print $1}' | sort -u \ | sed -e 's/^\([0-9XY][0-9]*\)/chr\1/; s/\.1$//;' | sort > t.t cat t.t | while read C do WC=`grep "${C}" ../../chrom.sizes | wc -l` EN=`echo ${C} | sed -e "s/^chr//; s/^\(ACFV.*\)/\1.1/; s/^\(GL.*\)/\1.1/;"` if [ "${WC}" -eq 1 ]; then UCSC=`grep "${C}" ../../chrom.sizes | awk '{print $1}'` echo -e "${UCSC}\t${EN}" else WC=`egrep "^${C} " ../../chrom.sizes | wc -l` if [ "${WC}" -eq 1 ]; then UCSC=`egrep "^${C} " ../../chrom.sizes | awk '{print $1}'` echo -e "${UCSC}\t${EN}" else echo "ERROR: one only not found for: ${C} - ${WC}" fi fi done | sort > ensemblGenes.tab cut -f1 ensemblGenes.tab | sort > ucsc.done.list cut -f1 ../../chrom.sizes | sort > ucsc.all.list comm -13 ucsc.done.list ucsc.all.list | while read UCSC do EN=`echo $UCSC | sed -e 's/^chr[0-9XYU][n0-9]*_//; s/_random//; s/$/.1/'` echo -e "${UCSC}\t${EN}" done > noEnsemblGenes.tab sort ensemblGenes.tab noEnsemblGenes.tab > ucscToEnsembl.tab cat << '_EOF_' > ucscToEnsembl.sql # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(21)) ); '_EOF_' hgsql calJac3 < ucscToEnsembl.sql hgsql calJac3 \ -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl' ############################################################################ # calJac3 - Marmoset - Ensembl Genes version 62 (DONE - 2011-04-21 - hiram) # need a new lift file to translate names cd /hive/data/genomes/calJac3/jkStuff grep "^chr" ../chrom.sizes | while read L do size=`echo $L | awk '{print $2}'` ucName=`echo $L | awk '{print $1}'` ensName=`hgsql -N -e 'select ensembl from ucscToEnsembl where ucsc="'${ucName}'";' calJac3` echo -e "0\t${ensName}.1\t$size\t$ucName\t$size\n" done > ens.62.lft # place that file name in the calJac3.ensGene.ra file, then: ssh hgwdev cd /hive/data/genomes/calJac3 cat << '_EOF_' > calJac3.ensGene.ra # required db variable db calJac3 # nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/; # s/^GL\([0-9][0-9]*\).1/chrUn_GL\1/;" # name translation in Ensembl v62 liftUp /hive/data/genomes/calJac3/jkStuff/ens.62.lft '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=62 calJac3.ensGene.ra ssh hgwdev cd /hive/data/genomes/calJac3/bed/ensGene.62 featureBits calJac3 ensGene # 51944550 bases of 2752505800 (1.887%) in intersection ############################################################################ # hgPal downloads (DONE 2011-05-20 braney) # FASTA from 13way for refGene, ensGene, nscanGene ssh hgwdev screen bash rm -rf /cluster/data/calJac3/bed/multiz13way/pal mkdir /cluster/data/calJac3/bed/multiz13way/pal cd /cluster/data/calJac3/bed/multiz13way/pal for i in `cat ../species.list`; do echo $i; done > order.lst mz=multiz13way gp=refGene db=calJac3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 5m40.876s # user 2m32.229s # sys 2m48.508s mz=multiz13way gp=refGene db=calJac3 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz13way gp=refGene db=calJac3 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz13way gp=ensGene db=calJac3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 109m32.085s # user 11m13.926s # sys 7m8.591s mz=multiz13way gp=ensGene db=calJac3 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz13way gp=ensGene db=calJac3 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz13way gp=nscanGene db=calJac3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 76m43.791s # user 6m21.521s # sys 6m46.179s mz=multiz13way gp=nscanGene db=calJac3 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz13way gp=nscanGene db=calJac3 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ############################################################################ # LASTZ Gorilla calJac3 (DONE - 2011-10-21,27 - Hiram) mkdir /hive/data/genomes/calJac3/bed/lastzGorGor3.2011-10-21 cd /hive/data/genomes/calJac3/bed/lastzGorGor3.2011-10-21 cat << '_EOF_' > DEF # Gorilla vs Marmoset # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Marmoset calJac3 SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit SEQ1_LEN=/scratch/data/calJac3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=60 # QUERY: Gorilla gorGor3 SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/calJac3/bed/lastzGorGor3.2011-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # after recovering manually after a hive filesystem outage time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > cat.log 2>&1 & # real 147m39.439s cat fb.calJac3.chainGorGor3Link.txt # 1928768328 bases of 2752505800 (70.073%) in intersection cd /hive/data/genomes/calJac3/bed ln -s lastzGorGor3.2011-10-21 lastz.gorGor3 # running the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.calJac3.swap cd /hive/data/genomes/gorGor3/bed/blastz.calJac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/calJac3/bed/lastzGorGor3.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 86m1.559s cat fb.gorGor3.chainCalJac3Link.txt # 1933784568 bases of 2822760080 (68.507%) in intersection ############################################################################ # LASTZ Rhesus rheMac3 SWAP (DONE - 2012-07-19 - Chin) cd /hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19 cat fb.rheMac3.chainCalJac3Link.txt # 1893656467 bases of 2639145830 (71.753%) in intersection # running the swap mkdir /hive/data/genomes/calJac3/bed/blastz.rheMac3.swap cd /hive/data/genomes/calJac3/bed/blastz.rheMac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 84m31.344s cat fb.calJac3.chainRheMac3Link.txt # 1931533122 bases of 2752505800 (70.174%) in intersection cd /hive/data/genomes/calJac3/bed ln -s blastz.rheMac3.swap lastz.rheMac3 ############################################################################