# for emacs: -*- mode: sh; -*- # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-----------+-------------------------+ # | tableName | type | # +-----------+-------------------------+ # | refGene | genePred refPep refMrna | # | mgcGenes | genePred | # | genscan | genePred genscanPep | # +-----------+-------------------------+ # CREATE BUILD DIRECTORY (DONE 4/17/06 angie) # df -h /cluster/store*, choose the one with the most space... ssh kkstore04 mkdir /cluster/store8/xenTro2 ln -s /cluster/store8/xenTro2 /cluster/data/xenTro2 # DOWNLOAD SEQUENCE (DONE 4/17/06 angie) ssh kkstore04 mkdir /cluster/data/xenTro2/downloads cd /cluster/data/xenTro2/downloads wget --timestamp ftp://ftp.jgi-psf.org/pub/JGI_data/Frog/v4.1/Xentr4.fasta.gz faSize Xentr4.fasta.gz #1513925492 bases (154525475 N's 1359400017 real 1193882410 upper 165517607 lower) in 19759 sequences in 1 files #Total size: mean 76619.5 sd 381240.5 min 2001 (scaffold_20095) max 7817814 (scaffold_1) median 7909 #N count: mean 7820.5 sd 19478.8 #U count: mean 60422.2 sd 323863.1 #L count: mean 8376.8 sd 44842.2 # MAKE JKSTUFF AND BED DIRECTORIES (DONE 4/17/06 angie) # This used to hold scripts -- better to keep them inline here so # they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. 
mkdir /cluster/data/xenTro2/jkStuff # This is where most tracks will be built: mkdir /cluster/data/xenTro2/bed # REPEATMASKER (DONE 4/17/06 angie) ssh kkstore04 mkdir /cluster/data/xenTro2/RMRun cd /cluster/data/xenTro2/RMRun # Record RM version used: ls -l /cluster/bluearc/RepeatMasker #lrwxrwxrwx 1 angie protein 18 Mar 20 16:50 /cluster/bluearc/RepeatMasker -> RepeatMasker060320/ grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl #CC RELEASE 20060315; * # Run RepeatMasker on a dummy input, just to make it initialize its # species libraries once before the cluster run: /cluster/bluearc/RepeatMasker/RepeatMasker -spec "xenopus tropicalis" \ /dev/null #Building species libraries in: /cluster/bluearc/RepeatMasker060320/Libraries/20060315/xenopus_tropicalis # Apparently RepeatMasker's default lib is smaller than the lib that # was provided along with assembly version 3 (our xenTro1), so we will # run with both the default lib and JGI's lib. faSize /cluster/bluearc/RepeatMasker060320/Libraries/20060315/xenopus_tropicalis/specieslib #84811 bases (92 N's 84719 real 0 upper 84719 lower) in 85 sequences in 1 files faSize /cluster/data/xenTro1/downloads/xt3.lib1.fasta #505033 bases (113 N's 504920 real 504920 upper 0 lower) in 367 sequences in 1 files mkdir /cluster/bluearc/xenTro2 cp -p /cluster/data/xenTro1/downloads/xt3.lib1.fasta \ /cluster/bluearc/xenTro2/ /cluster/bluearc/RepeatMasker/RepeatMasker \ -lib /cluster/bluearc/xenTro2/xt3.lib1.fasta /dev/null #- Split sequence into 500kb chunks, at gaps if possible: mkdir /cluster/data/xenTro2/scaffoldsSplit500k cd /cluster/data/xenTro2/scaffoldsSplit500k faSplit -outDirDepth=2 -lift=ss500k.lft \ gap ../downloads/Xentr4.fasta.gz 500000 ss500k #22243 pieces of 22243 written #- Make the run directory and job list: # Run RepeatMasker twice, once with default xenopus lib and once # with the larger lib that was distributed with xenTro1... 
cd /cluster/data/xenTro2 cat << '_EOF_' > jkStuff/RMXenopus #!/bin/csh -fe set tmpDir = /scratch/tmp /bin/mkdir -p $tmpDir/xenTro2/$2 /bin/cp $1/$2 $tmpDir/xenTro2/$2/ cd $tmpDir/xenTro2/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -lib /cluster/bluearc/xenTro2/xt3.lib1.fasta $2 /bin/cp $tmpDir/xenTro2/$2/$2.out $3.jgi /bin/rm -fr $tmpDir/xenTro2/$2/* /bin/cp $1/$2 $tmpDir/xenTro2/$2/ /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec "xenopus tropicalis" $2 /bin/cp $tmpDir/xenTro2/$2/$2.out $3 /bin/rm -fr $tmpDir/xenTro2/$2/* /bin/rmdir --ignore-fail-on-non-empty $tmpDir/xenTro2/$2 /bin/rmdir --ignore-fail-on-non-empty $tmpDir/xenTro2 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMXenopus mkdir RMRun RMOut cp /dev/null RMRun/RMJobs foreach i (0 1 2 3 4 5 6 7 8 9) mkdir RMOut/$i foreach j (0 1 2 3 4 5 6 7 8 9) mkdir RMOut/$i/$j foreach f (/cluster/data/xenTro2/scaffoldsSplit500k/$i/$j/ss500k*.fa) echo /cluster/data/xenTro2/jkStuff/RMXenopus \ $f:h $f:t \ '{'check out line+ /cluster/data/xenTro2/RMOut/$i/$j/$f:t.out'}' \ >> RMRun/RMJobs end end end wc -l RMRun/RMJobs #22243 RMRun/RMJobs #- Do the run ssh pk cd /cluster/data/xenTro2/RMRun para make RMJobs; para time | mail -s 'RM cluster run finished' $USER para time #Completed: 22243 of 22243 jobs #CPU time in finished jobs: 6136121s 102268.68m 1704.48h 71.02d 0.195 y #IO & Wait Time: 66562s 1109.37m 18.49h 0.77d 0.002 y #Average job time: 279s 4.65m 0.08h 0.00d #Longest finished job: 2325s 38.75m 0.65h 0.03d #Submission to last job: 23238s 387.30m 6.46h 0.27d #- Lift up the 500KB chunk .out's # Also, remove the individual .out file headers and remove duplicate # items found by the two runs (too bad there's no -libOnly). The # duplicate items are unique for the first 97-105ish characters (up to # the sequential ID field which we ignore anyway). 
ssh kkstore04 cd /cluster/data/xenTro2/RMOut head -3 0/0/ss500k000.fa.out > scaffolds.out foreach i (0 1 2 3 4 5 6 7 8 9) foreach j (0 1 2 3 4 5 6 7 8 9) echo $i/$j/ foreach f ($i/$j/*.fa.out) liftUp -type=.out stdout \ /cluster/data/xenTro2/scaffoldsSplit500k/ss500k.lft warn \ $f $f.jgi \ | tail +4 \ | sort -k 5,5 -k 6n,6n \ | uniq -w 97 \ >> scaffolds.out end end end wc -l scaffolds.out #2001365 scaffolds.out #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/xenTro2/RMOut hgLoadOut xenTro2 -nosplit scaffolds.out featureBits -chrom=scaffold_1 xenTro2 rmsk #1222207 bases of 7578677 (16.127%) in intersection featureBits -chrom=scaffold_1 xenTro1 rmsk #704506 bases of 7406505 (9.512%) in intersection # Clean up unmasked split scaffolds. ssh kkstore04 rm -r /cluster/data/xenTro2/scaffoldsSplit500k # CREATING DATABASE (DONE 4/17/06 angie) ssh hgwdev hgsql '' -e 'create database xenTro2' # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql df -h /var/lib/mysql #/dev/sdc1 1.8T 1.5T 168G 90% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/17/06 angie) ssh hgwdev hgsql xenTro2 -e "create table grp (PRIMARY KEY(NAME)) select * from rn4.grp" # CREATE AGP FILES AND GAP/GOLD TABLES (DONE 4/17/06 angie) ssh kkstore04 cd /cluster/data/xenTro2 # Look for overrepresented round-number run-of-N sizes: faGapSizes -niceSizes=5,10,20,25,50,100,200,500,1000,2000,5000,10000,20000,50000,100000 downloads/Xentr4.fasta.gz # Of those, 50 is the only overwhelmingly overrepresented round number. # Use that as the -minContigGap (instead of the default 25). # There are 65 gaps > 50000 (none exactly 50k or 100k) but I don't see # any other obvious threshold to use for -minScaffoldGap and 65 is pretty # few in the grand scheme of things (all we have are scaffolds though). 
hgFakeAgp -minContigGap=50 downloads/Xentr4.fasta.gz xenTro2.agp ssh hgwdev cd /cluster/data/xenTro2 hgGoldGapGl -noGl xenTro2 xenTro2.agp # SIMPLE REPEATS (TRF) (DONE 4/17/06 angie) ssh kkr8u00 mkdir /cluster/data/xenTro2/bed/simpleRepeat cd /cluster/data/xenTro2/bed/simpleRepeat trfBig -trf=/cluster/bin/i386/trf ../../downloads/Xentr4.fasta.gz \ /dev/null -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp # Took just over 5 hours. # Load into the database: ssh hgwdev hgLoadBed xenTro2 simpleRepeat \ /cluster/data/xenTro2/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql nice featureBits -chrom=scaffold_1 xenTro2 simpleRepeat #138459 bases of 7578677 (1.827%) in intersection # Compare to xenTro1: nice featureBits -chrom=scaffold_1 xenTro1 simpleRepeat #123690 bases of 7406505 (1.670%) in intersection # CREATE MICROSAT TRACK (done 2006-7-5 JK) ssh hgwdev cd /cluster/data/xenTro2/bed mkdir microsat cd microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed /cluster/bin/i386/hgLoadBed xenTro2 microsat microsat.bed # PROCESS SIMPLE REPEATS INTO MASK (DONE 4/17/06 angie) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore04 cd /cluster/data/xenTro2/bed/simpleRepeat awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 4/17/06 angie) ssh kkstore04 cd /cluster/data/xenTro2 # Soft-mask scaffolds with RepeatMasker and filtered TRF: maskOutFa -soft downloads/Xentr4.fasta.gz bed/simpleRepeat/trfMask.bed \ xenTro2.fa maskOutFa -softAdd xenTro2.fa RMOut/scaffolds.out xenTro2.fa # See how many bases are lower-cased now: faSize xenTro2.fa #1513925492 bases (154525475 N's 1359400017 real 1092281194 upper 267118823 lower) in 19759 sequences in 1 files # ~19.6% of non-N bases are lower-cased, 
sounds reasonable. # Hard-mask scaffolds: maskOutFa xenTro2.fa hard xenTro2.fa.masked # Make 2bit (for hgBlat, browser): faToTwoBit xenTro2.fa xenTro2.2bit # PUT 2BIT ON /SCRATCH (DONE 4/17/06 angie) ssh kkstore04 mkdir /cluster/bluearc/scratch/hg/xenTro2 cp -p /cluster/data/xenTro2/xenTro2.2bit \ /cluster/bluearc/scratch/hg/xenTro2/ # Ask cluster-admin to distribute to /scratch on big & small cluster # MAKE GCPERCENT (DONE 4/17/06 angie) ssh kkr7u00 mkdir /cluster/data/xenTro2/bed/gc5Base cd /cluster/data/xenTro2/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 xenTro2 \ /cluster/data/xenTro2 \ | wigEncode stdin gc5Base.wig gc5Base.wib ssh hgwdev mkdir /gbdb/xenTro2/wib cd /cluster/data/xenTro2/bed/gc5Base ln -s `pwd`/gc5Base.wib /gbdb/xenTro2/wib hgLoadWiggle -pathPrefix=/gbdb/xenTro2/wib xenTro2 gc5Base gc5Base.wig # MAKE CHROMINFO TABLE WITH 2BIT (DONE 4/17/06 angie) ssh kkstore04 cd /cluster/data/xenTro2 mkdir bed/chromInfo twoBitInfo xenTro2.2bit stdout \ | awk '{print $1 "\t" $2 "\t/gbdb/xenTro2/xenTro2.2bit";}' \ > bed/chromInfo/chromInfo.tab # Link to 2bit from /gbdb/xenTro2/: ssh hgwdev cd /cluster/data/xenTro2 mkdir /gbdb/xenTro2 ln -s /cluster/data/xenTro2/xenTro2.2bit /gbdb/xenTro2/ # Load /gbdb/xenTro2/xenTro2.2bit paths into database and save size info. 
hgLoadSqlTab xenTro2 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql \ /cluster/data/xenTro2/bed/chromInfo/chromInfo.tab echo "select chrom,size from chromInfo" | hgsql -N xenTro2 > chrom.sizes # take a look at chrom.sizes size wc chrom.sizes # 19759 39518 392532 chrom.sizes # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 4/17/06 angie) # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvsup # Add trackDb directories and a description.html mkdir xenTro/xenTro2 cvs add xenTro/xenTro2 cvs add xenTro/xenTro2/description.html cvs ci xenTro/xenTro2 # Edit that makefile to add xenTro2 in all the right places and do make update DBS=xenTro2 mkdir /gbdb/xenTro2/html cvs ci makefile # Go public on genome-test. In a clean tree (no mods, up-to-date): cvs up makefile make alpha # Note: hgcentral*.genome values must correspond # with defaultDb.genome values hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("xenTro2", "Aug. 2005", "/gbdb/xenTro2", "X. tropicalis", \ "scaffold_19", 1, 36, "X. tropicalis", \ "Xenopus tropicalis", "/gbdb/xenTro2/html/description.html", \ 0, 0, "JGI v4.1");' ## re-work orderKey 2007-02-20 to get Lizard above frog - Hiram hgsql -e 'update dbDb set orderKey="445" where name="xenTro2";' \ hgcentraltest # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 4/17/06 angie) ssh kkr8u00 cd /cluster/data/xenTro2 #- Build the .tar.gz files -- no genbank for now. mkdir bigZips gzip -c xenTro2.fa > bigZips/xenTro2.fa.gz gzip -c xenTro2.fa.masked > bigZips/xenTro2.hardmasked.fa.gz gzip -c RMOut/scaffolds.out > bigZips/xenTro2.rmsk.out.gz gzip -c bed/simpleRepeat/simpleRepeat.bed > bigZips/xenTro2.trf.bed.gz cd bigZips md5sum *.gz > md5sum.txt # Make a README.txt #- Link the .gz and .txt files to hgwdev:/usr/local/apache/... 
ssh hgwdev set gp = /usr/local/apache/htdocs/goldenPath/xenTro2 mkdir -p $gp/bigZips ln -s /cluster/data/xenTro2/bigZips/*.{gz,txt} $gp/bigZips # Take a look at bigZips/* mkdir $gp/database # Create README.txt file in database/ to explain the files. # MAKE 11.OOC (DONE 4/17/06 angie) # Use -repMatch=540 as in makeXenTro1.doc (roughly scaled from human # repMatch by ratio of frog size to human size) ssh kkr7u00 cd /cluster/data/xenTro2 mkdir /cluster/bluearc/xenTro2 blat xenTro2.2bit /dev/null /dev/null \ -tileSize=11 -makeOoc=/cluster/bluearc/xenTro2/11.ooc -repMatch=540 #Wrote 25734 overused 11-mers to /cluster/bluearc/xenTro2/11.ooc # GENSCAN GENE PREDICTIONS (DONE 4/17/06 angie) ssh hgwdev mkdir /cluster/data/xenTro2/bed/genscan cd /cluster/data/xenTro2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # create hard masked .fa files ssh kkstore04 cd /cluster/data/xenTro2 mkdir hardMasked faSplit about xenTro2.fa.masked 5000000 hardMasked/ # Generate a list file, genome.list, of all the hard-masked contig chunks: ls -1S /cluster/data/xenTro2/hardMasked/* > bed/genscan/genome.list wc -l bed/genscan/genome.list #262 bed/genscan/genome.list # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/xenTro2/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Create template file, gsub, for gensub2. 
For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single gsub jobList para make jobList para time #Completed: 261 of 262 jobs #Crashed: 1 jobs #CPU time in finished jobs: 33758s 562.63m 9.38h 0.39d 0.001 y #IO & Wait Time: 905s 15.08m 0.25h 0.01d 0.000 y #Average job time: 133s 2.21m 0.04h 0.00d #Longest finished job: 320s 5.33m 0.09h 0.00d #Submission to last job: 3605s 60.08m 1.00h 0.04d # If there are crashes, diagnose with "para problems" and "para crashed". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". ssh kkr8u00 cd /cluster/data/xenTro2/bed/genscan /cluster/bin/x86_64/gsBig /cluster/data/xenTro2/hardMasked/30.fa gtf/30.gtf -trans=pep/30.pep -subopt=subopt/30.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 ls -1 gtf | wc -l #262 endsInLf gtf/* # Concatenate results: ssh kkstore04 cd /cluster/data/xenTro2/bed/genscan cat gtf/*.gtf > genscan.gtf cat pep/*.pep > genscan.pep cat subopt/*.bed > genscanSubopt.bed # Load into the database (without -genePredExt because no frame info): # Don't load the Pep anymore -- redundant since it's from genomic. ssh hgwdev cd /cluster/data/xenTro2/bed/genscan ldHgGene -gtf xenTro2 genscan genscan.gtf hgLoadBed xenTro2 genscanSubopt genscanSubopt.bed featureBits -chrom=scaffold_1 xenTro2 genscan #249658 bases of 7578677 (3.294%) in intersection featureBits -chrom=scaffold_1 xenTro1 genscan #275145 bases of 7406505 (3.715%) in intersection # Strange that the coverage dropped a bit. Well, more seq is masked...? 
# GENBANK AUTO UPDATE (DONE 4/18/06 angie) # align with revised genbank process. drop xeno ESTs. cd ~/kent/src/makeDb/genbank cvsup # edit etc/genbank.conf to add xenTro2 # xenTro2 (X. tropicalis) 19759 scaffolds xenTro2.serverGenome = /cluster/data/xenTro2/xenTro2.2bit xenTro2.clusterGenome = /scratch/hg/xenTro2/xenTro2.2bit xenTro2.ooc = /cluster/bluearc/xenTro2/11.ooc xenTro2.lift = no xenTro2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} xenTro2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} xenTro2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} xenTro2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} xenTro2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} xenTro2.refseq.mrna.native.load = no xenTro2.genbank.mrna.xeno.load = no xenTro2.downloadDir = xenTro2 xenTro2.perChromTables = no xenTro2.mgcTables.default = full xenTro2.mgcTables.mgc = all # N.B. above was changed later to include refseqs; don't just copy this or a spell will be # cast on your descendants. 
cvs ci etc/genbank.conf # update /cluster/data/genbank/ make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial xenTro2 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad xenTro2 & # enable daily alignment and update of hgwdev cd ~/kent/src/makeDb/genbank cvsup # add xenTro2 to: etc/align.dbs etc/hgwdev.dbs cvs commit make etc-update # enabled native refSeq: 2006-05-11 markd # set this in genbank.conf: xenTro2.refseq.mrna.native.load = yes # kick off an alignment to verify ssh kkstore02 cd /cluster/data/genbank (nice ./bin/gbAlignStep xenTro2)|&mail markd& # CPGISLANDS (WUSTL) (DONE 4/18/06 angie) ssh hgwdev mkdir -p /cluster/data/xenTro2/bed/cpgIsland cd /cluster/data/xenTro2/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make mv cpglh.exe /cluster/data/xenTro2/bed/cpgIsland/ ssh kkr8u00 cd /cluster/data/xenTro2/bed/cpgIsland ./cpglh.exe ../../xenTro2.fa > xenTro2.cpg # Transform cpglh output to bed + cat << '_EOF_' > filter.awk /* Input columns: */ /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */ /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */ /* Output columns: */ /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */ /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */ { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk *.cpg > cpgIsland.bed wc -l cpgIsland.bed # 42984 cpgIsland.bed # load into database: ssh hgwdev cd /cluster/data/xenTro2/bed/cpgIsland hgLoadBed xenTro2 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed featureBits -chrom=scaffold_1 xenTro2 cpgIslandExt #54127 bases of 7578677 
(0.714%) in intersection featureBits -chrom=scaffold_1 xenTro1 cpgIslandExt #48512 bases of 7406505 (0.655%) in intersection featureBits xenTro1 cpgIslandExt #19279778 bases of 1381238994 (1.396%) in intersection # CPGISLANDS (ANDY LAW) (DONE 4/18/06 angie) # See notes in makeGalGal2.doc ssh kkr8u00 mkdir /cluster/data/xenTro2/bed/cpgIslandGgfAndy cd /cluster/data/xenTro2/bed/cpgIslandGgfAndy # Build the preProcGgfAndy program in # kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE # Use soft-masked sequence since this is not a mammal... ~/bin/x86_64/preProcGgfAndy ../../xenTro2.fa \ | /cluster/home/angie/ggf-andy-cpg-island-ucsc.pl \ > cpgIslandGgfAndy.bed wc -l ../cpgIsland/cpgIsland.bed *bed # 42984 ../cpgIsland/cpgIsland.bed # 184980 cpgIslandGgfAndy.bed # load into database: ssh hgwdev cd /cluster/data/xenTro2/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql hgLoadBed xenTro2 cpgIslandGgfAndy -tab \ -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed featureBits -chrom=scaffold_1 xenTro2 cpgIslandExt #54127 bases of 7578677 (0.714%) in intersection featureBits -chrom=scaffold_1 xenTro2 cpgIslandGgfAndy #251543 bases of 7578677 (3.319%) in intersection # SWAP CHAINS/NET MM8 (DONE 4/21/06 hiram -- see makeMm8.doc) # SWAP CHAINS/NET HG18 (DONE 4/24/06 angie) ssh kkstore04 mkdir /cluster/data/xenTro2/bed/blastz.hg18.swap cd /cluster/data/xenTro2/bed/blastz.hg18.swap doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.xenTro2/DEF \ -workhorse kkr8u00 >& do.log & tail -f do.log ln -s blastz.hg18.swap /cluster/data/xenTro2/bed/blastz.hg18 # SWAP CHAINS/NET RN4 (DONE 4/24/06 angie) ssh kkstore04 mkdir /cluster/data/xenTro2/bed/blastz.rn4.swap cd /cluster/data/xenTro2/bed/blastz.rn4.swap doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.xenTro2/DEF \ -workhorse kkr8u00 >& do.log & tail -f do.log ln -s blastz.rn4.swap /cluster/data/xenTro2/bed/blastz.rn4 # SWAP CHAINS/NET 
MONDOM4 (DONE 4/27/06 angie) ssh kkstore04 mkdir /cluster/data/xenTro2/bed/blastz.monDom4.swap cd /cluster/data/xenTro2/bed/blastz.monDom4.swap doBlastzChainNet.pl -swap /cluster/data/monDom4/bed/blastz.xenTro2/DEF \ -workhorse kkr8u00 >& do.log & tail -f do.log # hgLoadChain ran out of memory while sorting the giant chain set. # Pre-sort on a machine with big RAM: ssh kkr8u00 cd /cluster/data/xenTro2/bed/blastz.monDom4.swap time nice chainSort -target axtChain/xenTro2.monDom4.all.chain.gz \ axtChain/all.tSorted.chain #176.750u 66.580s 4:36.82 87.9% 0+0k 0+0io 0pf+0w # -- it's a 5-minute job on a machine with sufficient memory, but will # thrash all day on a machine that doesn't have enough. # Manually run hgLoadChain -noSort, then run the rest of loadUp.csh: ssh hgwdev cd /cluster/data/xenTro2/bed/blastz.monDom4.swap/axtChain nice hgLoadChain -noSort -tIndex xenTro2 chainMonDom4 all.tSorted.chain #Loading 10580431 chains into xenTro2.chainMonDom4 grep -v hgLoadChain loadUp.csh > tmp.csh nice csh -efx tmp.csh >>& ../do.log & tail -f ../do.log # back on kkstore04 cd /cluster/data/xenTro2/bed/blastz.monDom4.swap doBlastzChainNet.pl -swap /cluster/data/monDom4/bed/blastz.xenTro2/DEF \ -continue download -workhorse kkr8u00 >>& do.log & tail -f do.log ln -s blastz.monDom4.swap /cluster/data/xenTro2/bed/blastz.monDom4 # SWAP CHAINS/NET GALGAL2 (DONE 4/27/06 angie) ssh kkstore04 mkdir /cluster/data/xenTro2/bed/blastz.galGal2.swap cd /cluster/data/xenTro2/bed/blastz.galGal2.swap doBlastzChainNet.pl -swap /cluster/data/galGal2/bed/blastz.xenTro2/DEF \ -workhorse kkr8u00 >& do.log & tail -f do.log ln -s blastz.galGal2.swap /cluster/data/xenTro2/bed/blastz.galGal2 # SWAP CHAINS/NET DANRER4 (DONE 4/27/06 angie) ssh kkstore04 mkdir /cluster/data/xenTro2/bed/blastz.danRer4.swap cd /cluster/data/xenTro2/bed/blastz.danRer4.swap doBlastzChainNet.pl -swap /cluster/data/danRer4/bed/blastz.xenTro2/DEF \ -workhorse kkr8u00 >& do.log & tail -f do.log ln -s blastz.danRer4.swap 
/cluster/data/xenTro2/bed/blastz.danRer4 # MULTIZ7WAY (DONE 4/28/06 angie) # ((xenTro2 (galGal2 (monDom4 (hg18 (mm8 rn4))))) danRer4) ssh kkstore04 mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27 cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27 # Prune the hg17 17way tree to just these 7 and update db names: /cluster/bin/phast/tree_doctor \ --prune-all-but=rat_rn3,mouse_mm7,human_hg17,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \ --rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4 ; xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \ /cluster/data/hg17/bed/multiz17way/17way.nh > 7way.nh # *carefully* edit 7way.nh to put frog first. # create species list and stripped down tree for autoMZ sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 7way.nh \ > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst # Split MAFs by sequence onto cluster-friendly server mkdir /cluster/bluearc/xenTro2/mafNet foreach s (galGal2 monDom4 hg18 mm8 rn4 danRer4) echo $s mafSplit -byTarget -outDirDepth=2 -useSequenceName \ dummyArg /cluster/bluearc/xenTro2/mafNet/$s/split \ /cluster/data/xenTro2/bed/blastz.$s/mafNet/* end # Change the split%05d names to scaffold_1 etc. so they exactly match # sequence names. 
cd /cluster/bluearc/xenTro2/mafNet foreach db ( danRer4 galGal2 hg18 mm8 monDom4 rn4 ) echo $db foreach d0 ($db/*) foreach f ($d0/*/*) set g = `echo $f | sed -e 's/split0*/scaffold_/'` mv $f $g end end end ssh kki cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = xenTro2 set d = $1 set c = $2 set mafOut = $3 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /cluster/bluearc/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) if ($s == $db) then continue endif set clusterMaf = $pairs/$s/$d/$c.maf set localMaf = $db.$s.sing.maf if (-e $clusterMaf) then cp $clusterMaf $localMaf else echo "##maf version=1 scoring=autoMZ" > $localMaf endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $mafOut rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/xenTro2/bed/multiz7way.2006-04-27/maf/$(dir1)/$(root1).maf} #ENDLOOP 'EOF' # << emacs # List scaffolds in the dir structure created by mafSplit above: perl -wpe 's/^scaffold_(\d+)\s+.*$//; $d0 = $1 % 10; $d1 = ($1/10) % 10; \ print "$d1/$d0/scaffold_$1";' \ /cluster/data/xenTro2/chrom.sizes > scaf.lst # Make corresponding output directory structure: foreach d1 (0 1 2 3 4 5 6 7 8 9) mkdir ../maf/$d1 foreach d0 (0 1 2 3 4 5 6 7 8 9) mkdir ../maf/$d1/$d0 end end gensub2 scaf.lst single spec jobList para make jobList para time #Completed: 19759 of 19759 jobs #CPU time in finished jobs: 8981s 149.68m 2.49h 0.10d 0.000 y #IO & Wait Time: 51309s 855.15m 14.25h 0.59d 0.002 y #Average job time: 
3s 0.05m 0.00h 0.00d #Longest finished job: 44s 0.73m 0.01h 0.00d #Submission to last job: 3771s 62.85m 1.05h 0.04d # ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE 4/28/2006 angie) # -- mafFilter'd and reloaded 6/9/2006 ssh kkr8u00 mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/xenTro2/bed/multiz7way.2006-04-27/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! -e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end ssh kki cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno/run cp /dev/null jobList foreach d1 (/cluster/data/xenTro2/bed/multiz7way.2006-04-27/maf/*) echo $d1 foreach f ($d1/*/*.maf) set maf = $f:t set d1 = $f:h:h:t set d0 = $f:h:t echo mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /scratch/hg/xenTro2/xenTro2.2bit ../maf/$d1/$d0/$maf >> jobList end end # Make corresponding output directory structure: foreach d1 (0 1 2 3 4 5 6 7 8 9) mkdir ../maf/$d1 foreach d0 (0 1 2 3 4 5 6 7 8 9) mkdir ../maf/$d1/$d0 end end para make jobList para time #Completed: 19759 of 19759 jobs #CPU time in finished jobs: 40336s 672.27m 11.20h 0.47d 0.001 y #IO & Wait Time: 50270s 837.83m 13.96h 0.58d 0.002 y #Average job time: 5s 0.08m 0.00h 0.00d #Longest finished job: 8s 0.13m 0.00h 0.00d #Submission to last job: 5667s 94.45m 1.57h 0.07d # Consolidate multi-level maf to monolithic file # No need to sort chunks by position because the chunk size is greater # than the largest scaffold size. That may not be true in other # assemblies. 
ssh kkstore04 cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno cp /dev/null xenTro2.maf foreach scaf (`awk '{print $1;}' /cluster/data/xenTro2/chrom.sizes`) set f = `echo $scaf | perl -wpe 's/scaffold_(\d+)//; \ $d0 = $1 % 10; $d1 = ($1/10) % 10; \ print "maf/$d1/$d0/scaffold_$1.maf";'` cat $f >> xenTro2.maf end # 6/9/2006 -- mafFilter rejected two single-row (xenTro2 only) blocks due to # its default minRow of 2. That's reasonable, so replace the original with # the filtered version (and reload the db tables based on the file). mafFilter -overlap -reject=rf xenTro2.maf > xenTro2.mf.maf #rejected 2 blocks gzip -c xenTro2.maf > xenTro2.preFilter.maf.gz mv xenTro2.mf.maf xenTro2.maf # Load annotated maf ssh hgwdev cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno mkdir -p /gbdb/xenTro2/multiz7way/anno ln -s /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno/xenTro2.maf \ /gbdb/xenTro2/multiz7way/anno/ time nice hgLoadMaf -pathPrefix=/gbdb/xenTro2/multiz7way/anno xenTro2 multiz7way #Loaded 1610296 mafs in 1 files from /gbdb/xenTro2/multiz7way/anno #old hgwdev: #126.780u 41.500s 7:15.57 38.6% 0+0k 0+0io 264pf+0w #new hgwdev: #31.754u 8.324s 1:12.56 55.2% 0+0k 0+0io 3pf+0w # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kkr8u00 cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno time nice hgLoadMafSummary xenTro2 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz7waySummary xenTro2.maf #Created 570630 summary blocks from 3284353 components and 1610296 mafs from xenTro2.maf #55.767u 11.404s 1:08.32 98.3% 0+0k 0+0io 4pf+0w ssh hgwdev cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz7waySummary.sql time nice hgLoadSqlTab xenTro2 multiz7waySummary \ /tmp/multiz7waySummary.sql multiz7waySummary.tab #old hgwdev: #0.000u 0.000s 2:01.79 0.0% 0+0k 0+0io 234pf+0w #new hgwdev: #0.000u 0.001s 
0:10.88 0.0% 0+0k 0+0io 3pf+0w rm *.tab ln -s multiz7way.2006-04-27 /cluster/data/xenTro2/bed/multiz7way # MULTIZ7WAY DOWNLOADABLES (DONE 6/9/2006 angie) # Annotated MAF is now documented, so use anno/maf for downloads. ssh hgwdev mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/mafDownloads cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/mafDownloads # upstream mafs cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits xenTro2 mgcGenes:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags xenTro2 multiz7way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' # << emacs time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log # old hgwdev: #628.650u 89.300s 14:54.56 80.2% 0+0k 0+0io 4617pf+0w # new hgwdev: #89.631u 24.492s 2:58.28 64.0% 0+0k 0+0io 0pf+0w # Make a gzipped version of the monolithic annotated maf file: ssh kkstore04 cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27 time nice gzip -c anno/xenTro2.maf > mafDownloads/xenTro2.maf.gz #416.982u 2.804s 7:10.50 97.5% 0+0k 0+0io 0pf+0w cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/mafDownloads time nice gzip up*.maf #6.301u 0.096s 0:06.50 98.3% 0+0k 0+0io 0pf+0w time nice md5sum *.gz > md5sum.txt #1.812u 0.369s 0:02.24 96.8% 0+0k 0+0io 0pf+0w ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/xenTro2/multiz7way mkdir $dir ln -s /cluster/data/xenTro2/bed/multiz7way.2006-04-27/mafDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/rn4/multiz9way/README.txt $dir # edit README.txt # MULTIZ7WAY MAF FRAMES (DONE 4/28/2006 angie - REDONE 2006-06-09 markd) ssh hgwdev mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/frames cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/frames # The following is adapted from MarkD's Makefile used for mm7... 
#------------------------------------------------------------------------
# get the genes for all genomes
# mRNAs with CDS.  single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: danRer4
mkdir genes
foreach queryDb (danRer4)
  # mktemp creates a placeholder file; its name is only used as a unique
  # suffix for the three temp files below, and is removed at loop end.
  set tmpExt = `mktemp temp.XXXXXX`
  set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
  set tmpMrna = ${queryDb}.mrna.${tmpExt}
  set tmpCds = ${queryDb}.cds.${tmpExt}
  echo $queryDb
  # Output columns: 1 = qName, 2 = cds.name, 3.. = all_mrna.* (the cuts
  # below take columns 1-2 as the CDS file and columns 4+ as the PSL).
  hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
               from all_mrna,gbCdnaInfo,cds \
               where (all_mrna.qName = gbCdnaInfo.acc) and \
                 (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
   ${queryDb} > ${tmpMrnaCds}
  cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
  cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
  mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
    stdout \
  | genePredSingleCover stdin stdout | gzip -2c \
    > /scratch/tmp/$queryDb.tmp.gz
  rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
  mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
  rm -f $tmpExt
end
# using mgcGenes for xenTro2
# using knownGene for rn4 mm8 hg18
# using refGene for galGal2
# no genes for monDom4
# genePreds: select exactly the 10 basic genePred columns by name.
# knownGene has extra trailing columns, and a genePred table loaded with a
# leading bin column would be shifted by one if we relied on
# "select * | cut -f 1-10" (exonEnds would be lost), so name the columns.
foreach queryDb (xenTro2 rn4 mm8 hg18 galGal2)
  if ($queryDb == "xenTro2") then
    set geneTbl = mgcGenes
  else if ($queryDb == "galGal2") then
    set geneTbl = refGene
  else
    set geneTbl = knownGene
  endif
  hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \
  | genePredSingleCover stdin stdout | gzip -2c \
    > /scratch/tmp/$queryDb.tmp.gz
  mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
  # (no temp-name file is created in this loop, so there is nothing to
  # remove here; referencing an unset $tmpExt would abort csh)
end

#------------------------------------------------------------------------
# create frames
set clusterDir = /cluster/bluearc/xenTro2/multiz7wayFrames
set multizDir = /cluster/data/xenTro2/bed/multiz7way.2006-04-27
set mafDir = $multizDir/mafDownloads
set geneDir = $multizDir/frames/genes
set clusterMafDir = ${clusterDir}/maf
set clusterGeneDir = ${clusterDir}/genes
set clusterFramesDir = ${clusterDir}/mafFrames.kki

# copy mafs to cluster storage
mkdir $clusterDir
ssh -x kkstore04 "rsync -av $mafDir/xenTro2.maf.gz $clusterMafDir/"

# copy genes to cluster storage
ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"

# run cluster jobs
set tmpExt = `mktemp temp.XXXXXX`
set paraDir = $multizDir/frames/para.${tmpExt}
mkdir mafFrames $paraDir
rm -f $paraDir/jobList
mkdir ${clusterFramesDir}
# one job per species that has a gene set (species without a .gp.gz are
# skipped by the -e test)
foreach queryDb (`cat /cluster/data/xenTro2/bed/multiz7way.2006-04-27/species.lst`)
  mkdir ${clusterFramesDir}/${queryDb}
  if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then
    echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} xenTro2 \
      ${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/xenTro2.maf.gz \
      ${clusterFramesDir}/${queryDb}/xenTro2.mafFrames \
      >> $paraDir/jobList
  endif
end
rm -f $tmpExt
ssh -x kki "cd ${paraDir} && para make jobList && para time"
#Completed: 12 of 12 jobs
#CPU time in finished jobs: 354s 5.89m 0.10h 0.00d 0.000 y
#IO & Wait Time: 46s 0.77m 0.01h 0.00d 0.000 y
#Average job time: 33s 0.56m 0.01h 0.00d
#Longest finished job: 37s 0.62m 0.01h 0.00d
#Submission to last job: 37s 0.62m 0.01h 0.00d

# combine results from cluster
foreach queryDb (`cat ../species.lst`)
  echo $queryDb
  ssh -x kolossus "cat ${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz"
end

#------------------------------------------------------------------------
# load the database
hgLoadMafFrames xenTro2 multiz7wayFrames mafFrames/*.mafFrames.gz

#------------------------------------------------------------------------
# clean up
rm -rf ${clusterDir}

###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore04
cd /cluster/data/xenTro2/bed/multiz7way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(find ../anno/maf -name '*.maf'|xargs cat | time genePredToMafFrames xenTro2 stdin stdout danRer4 genes/danRer4.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rn4 genes/rn4.gp.gz xenTro2 genes/xenTro2.gp.gz | gzip >multiz7way.mafFrames.gz)>&log&

ssh hgwdev
cd /cluster/data/xenTro2/bed/multiz7way/frames
hgLoadMafFrames xenTro2 multiz7wayFrames multiz7way.mafFrames.gz >&log&


# PHASTCONS (DONE 5/2/2006 angie - REDONE w/pruned ENCODE model 6/14/06)
# Using Kate's process from makeHg17.doc.
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track. Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.

ssh kkstore04
# Prune Elliot's model to just our 7 species:
mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons
cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons
/cluster/bin/phast/tree_doctor \
  --prune-all-but=rn3,mm7,hg17,monDom2,galGal2,xenTro1,danRer3 \
  --rename="rn3 -> rn4 ; mm7 -> mm8 ; hg17 -> hg18 ; monDom2 -> monDom4 ; \
 xenTro1 -> xenTro2 ; danRer3 -> danRer4" \
  /cluster/data/hg17/bed/multiz17way/cons/elliotsEncode.mod \
  > elliotsEncodePruned.mod

# Split MAF into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# 6/14/06: For a chrom-based genome we would run a splitting job on the
# small cluster. However, since this is scaffold-based and we have
# thousands of files (for sequences so small that they don't even get
# split), the I/O time of a cluster run is huge and wasteful.
# So run this directly on the fileserver.
cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons cat > doSplitOnFileserver.csh << '_EOF_' #!/bin/csh -fex set WINDOWS = /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ss set tmpDir = /scratch/tmp rm -fr $WINDOWS mkdir -p $WINDOWS date # Make directory structure corresponding to maf and fill with .ss: foreach d1 (0 1 2 3 4 5 6 7 8 9) mkdir $WINDOWS/$d1 foreach d0 (0 1 2 3 4 5 6 7 8 9) mkdir $WINDOWS/$d1/$d0 foreach f (../maf/$d1/$d0/scaf*.maf) # skip the maf files that have only comments -- those crash msa_split: if (`grep -v ^\# $f | wc -l`) then set c = $f:t:r twoBitToFa /cluster/data/xenTro2/xenTro2.2bit -seq=$c $tmpDir/$c.fa /cluster/bin/phast/$MACHTYPE/msa_split $f -i MAF \ -M $tmpDir/$c.fa \ -o SS -r $WINDOWS/$d1/$d0/$c -w 10000000,0 -I 1000 -B 5000 rm -f $tmpDir/$c.fa endif end end end date '_EOF_' # << emacs chmod a+x doSplitOnFileserver.csh nice ./doSplitOnFileserver.csh >& split.log & tail -f split.log # Took 15 minutes (was 1 hour on small cluster due to I/O) # check tree model on a single chunk, using params recommended by Adam, # (to verify branch lengths on 2X species -- though we aren't using any # of those here) ssh kolossus cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ss/0/1/scaffold_1.1-7817814.ss \ -o phyloFit.tree # Comment from makeHg17.doc: # # he ok'ed the results -- not necessary for next human run # 6/14/06: # The elliotsEncode.mod gives about twice as much coverage as # I thought we wanted, despite extremely low --target-coverage, # but Adam and Jim say the coverage from phyloFit.tree is too low. # When I loaded wiggles made with elliotsEncode and phyloFit into # the browser for visual comparison, the elliotsEncode wiggle was maxed # out (~1 wherever anything aligned) while the phyloFit wiggle showed a # lot more dynamics.
So initially I went with phyloFit for that reason, # but it was not a good reason. I'll rerun with elliotsEncode. # Run phastCons mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set d = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons cp -p $san/ss/$d/$f.ss ../elliotsEncodePruned.mod $tmp pushd $tmp > /dev/null set c = $f:r:r /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$d $san/bed/$d sleep 1 mv $tmp/$f.pp $san/pp/$d mv $tmp/$f.bed $san/bed/$d rm -fr $tmp 'EOF' # << emacs chmod a+x doPhast.csh # 6/14/06: Estimate rho on scaffold_1 /cluster/bin/phast/$MACHTYPE/phastCons --estimate-rho /tmp/estimatedRho.mod \ --target-coverage 0.005 --expected-length 12 --no-post-probs \ /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ss/0/1/scaffold_1.*.ss \ ../elliotsEncodePruned.mod #(rho = 0.255026) # for target-coverage 0.1: (rho = 0.223770) # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(dir1) $(file1) 12 .005 .26 #ENDLOOP 'EOF' # << emacs # Create parasol batch and run it ssh kki cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons pushd /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ss cp /dev/null /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons/in.list foreach d (*/*) ls -1S $d/*.ss | sed 's/.ss$//' \ >> /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons/in.list end popd gensub2 in.list single template jobList para make jobList para time #Completed: 5587 of 5587 jobs #CPU time in finished jobs: 3281s 54.68m 0.91h 0.04d 0.000 y #IO & Wait Time: 20478s 341.31m 5.69h 
0.24d 0.001 y #Average job time: 4s 0.07m 0.00h 0.00d #Longest finished job: 28s 0.47m 0.01h 0.00d #Submission to last job: 1489s 24.82m 0.41h 0.02d # create Most Conserved track ssh kolossus cd /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. Warning: the sort column # will depend on how deep you are in the dir find ./bed -name "*.bed" \ | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \ | sort -k7,7 -k9,9n \ | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \ | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements7way.bed cp -p phastConsElements7way.bed /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons # Measure coverage. If good, load elements into database and proceed with wiggle. # Try for somewhere in the neighborhood of 5% overall cov, and 70% CDS cov. 
# (Jim tried for 4% overall in xenTro1) ssh hgwdev cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons featureBits xenTro2 -enrichment mgcGenes:cds phastConsElements7way.bed # FIRST ITERATION: elliotsEncode, doPhast (len cov rho) = (14 .007 .27) #mgcGenes:cds 0.235%, phastConsElements7way.bed 10.972%, both 0.196%, cover 83.49%, enrich 7.61x mv phastConsElements7way.bed phastConsElements7way_14_007_27.bed # SECOND ITERATION: elliotsEncode, doPhast (len cov rho) = (12 .005 .27) #mgcGenes:cds 0.235%, phastConsElements7way.bed 10.561%, both 0.195%, cover 82.94%, enrich 7.85x # just scaffold_1: #mgcGenes:cds 0.246%, phastConsElements7way.bed 11.117%, both 0.212%, cover 86.33%, enrich 7.77x mv phastConsElements7way.bed phastConsElements7way_12_005_27.bed # THIRD ITERATION: phyloFit, doPhast (len cov rho) = (12 .100 .339) #mgcGenes:cds 0.235%, phastConsElements7way.bed 0.727%, both 0.059%, cover 25.02%, enrich 34.43x mv phastConsElements7way.bed phastConsElements7way_12_100_339.bed # FOURTH ITERATION: phyloFit, doPhast (len cov rho) = (12 .100 .3) #mgcGenes:cds 0.235%, phastConsElements7way.bed 0.683%, both 0.054%, cover 22.86%, enrich 33.45x mv phastConsElements7way.bed phastConsElements7way_12_100_300.bed # FIFTH ITERATION: phyloFit, 12 .50 .339 #mgcGenes:cds 0.235%, phastConsElements7way.bed 1.353%, both 0.091%, cover 38.78%, enrich 28.67x mv phastConsElements7way.bed phastConsElements7way_12_500_339.bed # SIXTH ITERATION: elliotsEncode, doPhast (len cov rho) = (12 .005 .26) #mgcGenes:cds 0.235%, phastConsElements7way.bed 10.377%, both 0.194%, cover 82.42%, enrich 7.94x mv phastConsElements7way.bed phastConsElements7way_12_005_26.bed # When happy: hgLoadBed -strict xenTro2 phastConsElements7way phastConsElements7way_12_005_26.bed # Create merged posterior probability file and wiggle track data files ssh kolossus cd /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ # sort by chromName, chromStart so that items are in numerical order # for wigEncode time 
find ./pp -name "*.pp" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ nice wigEncode -noOverlap stdin phastCons7way.wig phastCons7way.wib cp -p phastCons7way.wi? \ /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons ln -s `pwd`/phastCons7way.wib /gbdb/xenTro2/multiz7way/ hgLoadWiggle -pathPrefix=/gbdb/xenTro2/multiz7way xenTro2 \ phastCons7way phastCons7way.wig # Make .jpg for tree, check in to browser/images and install in # htdocs/images/phylo/... don't forget to request a push of that # file. The treeImage setting in trackDb.ra is # phylo/xenTro2_7way.jpg (relative to htdocs/images). # Use 7way.nh from the model used in the final run: # elliotsEncodePruned.mod . ssh hgwdev cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27 mv 7way.nh 7way.elliotsEncode.nh tail -1 phastCons/phyloFit.tree.mod | sed -e 's/^TREE: //' \ > 7way.phyloFit.nh /cluster/bin/phast/all_dists 7way.elliotsEncode.nh > 7way.distances.txt grep xenTro2 7way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt cat distances.txt #1.3604 chicken_galGal2 #1.4357 human_hg18 #1.4659 monodelphis_monDom4 #1.7936 mouse_mm8 #1.7983 rat_rn4 #1.8771 zebrafish_danRer4 # the order in the browser display will be by tree topology, # not by distance. # Just for reference, these were the distances from phyloFit.tree.mod... # Quite different! 
#0.4899 galGal2
#0.5621 monDom4
#0.5738 hg18
#0.6038 mm8
#0.6074 danRer4
#0.6075 rn4
/cluster/bin/phast/draw_tree 7way.elliotsEncode.nh > 7way.ps
# ps2pdf takes the output file as its second argument; redirecting stdout
# onto the same name would have the shell and ps2pdf both writing 7way.pdf.
ps2pdf 7way.ps 7way.pdf
pstopnm -stdout 7way.ps | pnmtojpeg > xenTro2_7way.jpg
# If you haven't already, check out the browser CVS tree in your ~/:
# (cd; cvs -d hgwdev:/projects/hg/cvsroot co browser)
cp xenTro2_7way.jpg ~/browser/images/phylo/
cd ~/browser/images/phylo
cvs add xenTro2_7way.jpg
cvs ci xenTro2_7way.jpg
cd ../..
cvsup
make alpha


# PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE 5/2/06 angie - REDONE 6/14/06)
ssh kolossus
mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastConsDownloads
cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastConsDownloads
set ppDir = /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/pp
# start from an empty file; per-scaffold scores are appended below
cp /dev/null xenTro2.pp
# No need to sort chunks by position because the chunk size is greater
# than the largest scaffold size. That may not be true in other
# assemblies.
foreach scaf (`awk '{print $1;}' /cluster/data/xenTro2/chrom.sizes`)
  # The .pp files live under $ppDir/<tens digit>/<ones digit> of the
  # scaffold number; compute that two-level subdirectory here.
  set d = `echo $scaf | perl -wpe 's/scaffold_(\d+)//; \
 $d0 = $1 % 10; $d1 = ($1/10) % 10; \
 print "$d1/$d0";'`
  # scaffolds with no scores have no .pp file; skip those
  set f = `ls -1 $ppDir/$d | egrep ^$scaf'\.1-[0-9]+\.pp'`
  if ("x$f" != "x") then
    cat $ppDir/$d/$f >> xenTro2.pp
  endif
end
nice gzip xenTro2.pp
md5sum xenTro2.pp.gz > md5sum.txt

ssh hgwdev
cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastConsDownloads
set dir = /usr/local/apache/htdocs/goldenPath/xenTro2/phastCons7way
mkdir $dir
ln -s /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastConsDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/rn4/phastCons9way/README.txt $dir
# edit README.txt

# Clean up after phastCons run.
ssh kkstore04
rm /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/*.tab
rm -r /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons


# ENSEMBL is on 4.0 not 4.1... do the coords mostly carry over???
########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney2006-06-16) ssh kkstore04 bash # if not using bash shell already mkdir -p /cluster/data/xenTro2/blastDb cd /cluster/data/xenTro2/blastDb faSplit sequence ../xenTro2.fa 500 x for i in *.fa do /cluster/bluearc/blast229/formatdb -p F -i $i done rm *.log *.fa mkdir -p /san/sanvol1/scratch/xenTro2/blastDb cd /cluster/data/xenTro2/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/xenTro2/blastDb done mkdir -p /cluster/data/xenTro2/bed/tblastn.hg18KG cd /cluster/data/xenTro2/bed/tblastn.hg18KG echo /san/sanvol1/scratch/xenTro2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 496 query.lst # we want around 150000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(150000/496) = 121.443947 mkdir -p /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/kgfa split -l 121 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut
# symlink named blastOut in the current (tblastn.hg18KG) directory
ln -s /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut
# one output directory per query chunk
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/xenTro2/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

cat << '_EOF_' > blastSome
#!/bin/sh
# usage: blastSome <blastDb> <queryFa> <outPsl>
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
# retry with progressively stricter e-values until blastall succeeds
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
    mv $f.8 $f.1
    break;
fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        # lift query (protein) coordinates of the PSL that blastToPsl
        # just wrote ($f.2); no intermediate $f.3/$f.4 is produced here
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        # exits 0 even when pslCheck fails; the {check out exists ...}
        # clause in blastGsub is what flags a missing output file
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit # back to bash

ssh pk
cd /cluster/data/xenTro2/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time # Completed: 150784 of 150784 jobs # CPU time in finished jobs: 12376974s 206282.91m 3438.05h 143.25d 0.392 y # IO & Wait Time: 603271s 10054.51m 167.58h 6.98d 0.019 y # Average job time: 86s 1.43m 0.02h 0.00d # Longest finished job: 462s 7.70m 0.13h 0.01d # Submission to last job: 80194s 1336.57m 22.28h 0.93d ssh kkstore04 cd /cluster/data/xenTro2/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/xenTro2/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 304 of 304 jobs # CPU time in finished jobs: 764s 12.73m 0.21h 0.01d 0.000 y # IO & Wait Time: 11950s 199.17m 3.32h 0.14d 0.000 y # Average job time: 42s 0.70m 0.01h 0.00d # Longest finished job: 77s 1.28m 0.02h 0.00d # Submission to last job: 2117s 35.28m 0.59h 0.02d ssh kkstore04 cd /cluster/data/xenTro2/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/xenTro2/bed/tblastn.hg18KG/blastHg18KG.psl pslCheck blastHg18KG.psl # this is ok. 
# load table ssh hgwdev cd /cluster/data/xenTro2/bed/tblastn.hg18KG hgLoadPsl xenTro2 blastHg18KG.psl # check coverage featureBits xenTro2 refGene:cds blastHg18KG -enrichment # refGene:cds 0.337%, blastHg18KG 1.477%, both 0.263%, cover 78.14%, enrich 52.91x ssh kkstore04 rm -rf /cluster/data/xenTro2/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## # SWAP CHAINS/NET GALGAL3 (DONE 7/20/06 angie) ssh kkstore04 mkdir /cluster/data/xenTro2/bed/blastz.galGal3.swap cd /cluster/data/xenTro2/bed/blastz.galGal3.swap doBlastzChainNet.pl -swap /cluster/data/galGal3/bed/blastz.xenTro2/DEF \ -workhorse kkr8u00 >& do.log & tail -f do.log ln -s blastz.galGal3.swap /cluster/data/xenTro2/bed/blastz.galGal3 ########################################################################## # GenBank gbMiscDiff table (markd 2007-01-10) # Supports `NCBI Clone Validation' section of mgcGenes details page # genbank release 157.0 now contains misc_diff fields for MGC clones # reloading mRNAs results in gbMiscDiff table being created. ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna xenTro2 ########################################################################## ## WindowMasker masked sequence (DONE - 2007-02-20 - Hiram) ssh kolossus ## This directory was placed on store12 since xenTro2 filesystem ## was full. This is actually a symlink mkdir /cluster/data/xenTro2/bed/WindowMasker.2007-02-19 cd /cluster/data/xenTro2/bed/WindowMasker.2007-02-19 ## copy the .csh scripts from mm8 WindowMasker run, edit to fixup # reference to correct DB and work directory. 
time nice -n +19 ./doCounts.csh > doCounts.out 2>&1 time nice -n +19 ./doSdust.csh > doSdust.out 2>&1 ssh kkstore05 cd /cluster/data/xenTro2/bed/WindowMasker.2007-02-19 gzip *.counts *.bed nice -n +19 ./applyMask.csh # this addTrf properly gets the n's changed to N which WM masked nice -n +19 ./addTrf.csh # measuring faSize of resulting xenTro2.sdTrf.2bit: # 1513925492 bases (154525475 N's 1359400017 real # 835902481 upper 523497536 lower) in 19759 sequences in 1 files # %38.51 = 523497536 / 1359400017 # vs. existing xenTro2.2bit # 1513925492 bases (154525475 N's 1359400017 real # 1092281194 upper 267118823 lower) in 19759 sequences in 1 files # %19.65 = 267118823 / 1359400017 ssh hgwdev cd /cluster/data/xenTro2/bed/WindowMasker.2007-02-19 time nice -n +19 ./load.csh > load.out 2>&1 # Loaded 7013938 elements of size 3 # real 2m29.659s ######################################################################### ## BLASTZ SWAP Lizard/anoCar1 - (DONE - 2007-02-22 - Hiram) # measurement of anoCar1 coverage by Frog time nice -n +19 featureBits anoCar1 chainXenTro2Link \ > fb.anoCar1.chainXenTro2Link.txt 2>&1 # real 11m33.086s # 83873500 bases of 1741478929 (4.816%) in intersection ## the swap ssh kkstore04 mkdir /cluster/data/xenTro2/bed/blastz.anoCar1.swap cd /cluster/data/xenTro2/bed/blastz.anoCar1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap > swap.log 2>&1 & # real 136m3.288s ssh hgwdev cd /cluster/data/xenTro2/bed/blastz.anoCar1.swap time nice -n +19 featureBits xenTro2 chainAnoCar1Link \ > fb.xenTro2.chainAnoCar1Link.txt 2>&1 # 84514985 bases of 1359412157 (6.217%) in intersection ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded 
by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: xenTro2.upstreamGeneTbl = mgcGenes xenTro2.upstreamMaf = multiz7way /hive/data/genomes/xenTro2/bed/multiz7way/species.lst ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. 
############################################################################ # LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-23 - Galt) # original alignment to danRer6 cd /hive/data/genomes/danRer6/bed/lastzXenTro2.2009-12-22 cat fb.danRer6.chainXenTro2Link.txt # 100078259 bases of 1506896106 (6.641%) in intersection # running the swap - DONE - 2009-12-23 mkdir /hive/data/genomes/xenTro2/bed/blastz.danRer6.swap cd /hive/data/genomes/xenTro2/bed/blastz.danRer6.swap time nice +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer6/bed/lastzXenTro2.2009-12-22/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap >& swap.log & cat fb.xenTro2.chainDanRer6Link.txt # 92089833 bases of 1359412157 (6.774%) in intersection ####################################################################### # lastz swap from danRer7 (DONE - 2010-12-20 - Hiram) # original alignment to danRer7 cd /hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17 cat fb.danRer7.chainXenTro2Link.txt # 90625809 bases of 1409770109 (6.428%) in intersection # running the swap mkdir /hive/data/genomes/xenTro2/bed/blastz.danRer7.swap cd /hive/data/genomes/xenTro2/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 32m57.901s cat fb.xenTro2.chainDanRer7Link.txt # 89862892 bases of 1359412157 (6.610%) in intersection ####################################################################### # lastz swap from melGal1 (DONE - 2011-04-02 - Chin) # original alignment to melGal1 cd /hive/data/genomes/melGal1/bed/lastzXenTro2.2011-04-02 cat fb.melGal1.chainXenTro2Link.txt # 36288270 bases of 935922386 (3.877%) in intersection # running the swap mkdir 
/hive/data/genomes/xenTro2/bed/blastz.melGal1.swap cd /hive/data/genomes/xenTro2/bed/blastz.melGal1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/melGal1/bed/lastzXenTro2.2011-04-02/DEF \ -swap \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 # real 3m40.138s cat fb.xenTro2.chainMelGal1Link.txt # 41967849 bases of 1359412157 (3.087%) in intersection cd /hive/data/genomes/xenTro2/bed ln -s blastz.melGal1.swap lastz.melGal1 ####################################################################### # lastz swap from anoCar2 (DONE - 2011-04-26 - Hiram) # original alignment cd /hive/data/genomes/anoCar2/bed/lastzXenTro2.2011-04-25 cat fb.anoCar2.chainXenTro2Link.txt # 85962319 bases of 1701353770 (5.053%) in intersection # running the swap - DONE - 2011-04-26 mkdir /hive/data/genomes/xenTro2/bed/blastz.anoCar2.swap cd /hive/data/genomes/xenTro2/bed/blastz.anoCar2.swap time nice -n +25 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/anoCar2/bed/lastzXenTro2.2011-04-25/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -syntenicNet -swap -qRepeats=windowmaskerSdust \ -tRepeats=windowmaskerSdust > swap.log 2>&1 & # real 29m36.856s cat fb.xenTro2.chainAnoCar2Link.txt # 91934327 bases of 1359412157 (6.763%) in intersection ############################################################################## # construct liftOver to xenTro3 (DONE - 2011-09-19 - Hiram) screen # manage this longish running job in a screen mkdir /hive/data/genomes/xenTro2/bed/blat.xenTro3.2011-09-19 cd /hive/data/genomes/xenTro2/bed/blat.xenTro3.2011-09-19 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/xenTro2/11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev xenTro2 xenTro3 > do.log 2>&1 # if that is 
OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/xenTro2/11.ooc \ -dbHost=hgwdev -workhorse=hgwdev xenTro2 xenTro3 > do.log 2>&1 # real 597m54.341s # verify this file exists: # /gbdb/xenTro2/liftOver/xenTro2ToXenTro3.over.chain.gz # and try out the conversion on genome-test from xenTro2 to xenTro3 ############################################################################ ##########################################################################pubStart # Publications track (DONE - 04-27-12 - Max) # article download and conversion is run every night on hgwdev: # 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh # the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/ # then converts them to text into /hive/data/outside/literature/{pmc,elsevier} # all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py # data processing was run manually like this export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/ # pmc cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc ssh swarm cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat steps:annot-tables exit pubBlat load # elsevier cd
/hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier ssh swarm cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat steps:annot-tables exit pubBlat load #--pubEnd