# for emacs: -*- mode: sh; -*- # IMPORTANT NOTE: # # DUE TO A SIMPLISTIC APPROACH/ERROR, THE chrUn WAS CONSTRUCTED WHICH DID NOT # PRESERVE THE BRIDGED GAPS OF ABOUT 1,000 SUPERCONTIGS WHICH ARE NOT PLACED IN # REGULAR CHROMOSOMES. # # WHEN LOADING 3RD PARTY DATA, PLEASE DO NOT LOAD chrUn. # # PLEASE SEE THE SECTION OF "CREATE A SINGLE chrUn" FOR MORE DETAILS. # (Fan, 5/10/07). # Equus caballus - Broad Institute 1.0 wget --timestamping http://www.genome.gov/Images/press_photos/lowres/20008-72.jpg ######################################################################### # DOWNLOAD SEQUENCE (DONE 2/6/07 Fan) ssh kkstore05 mkdir /cluster/store12/equCab1 ln -s /cluster/store12/equCab1 /cluster/data/equCab1 mkdir /cluster/data/equCab1/downloads cd /cluster/data/equCab1/downloads wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/BasicAssemblyOneLiner.out wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/Draft_v1.agp.chromosome.fasta.gz wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/Draft_v1.agp.chromosome.qual.gz wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/ForDistribution.command wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.agp wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.bases.gz wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.bases.split.tar.gz wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.links wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.quals.gz wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.quals.split.tar.gz wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.unplaced wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/chromosomes.agp wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/source wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/unplaced.fasta.gz faSize Draft_v1.agp.chromosome.fasta.gz #2454853977 bases (32946942 N's 2421907035 real 2421907035 upper 0 lower) in #10526 sequences in 1 files #Total size: mean 233218.1 sd 993878.5 min 3000 (scaffold_10140.1-3000) max #5000000 (scaffold_0.1-5000000) median 4087 #N count: mean 3130.1 sd 16192.7 #U count: mean 230088.1 sd 982249.2 #L count: mean 0.0 sd 0.0 # CREATE A SINGLE chrUn # PLEASE NOTE THE FOLLOWING STEPS THAT CREATED THE chrUn WAS # LESS-THAN-OPTIMAL, BECAUSE IT DID NOT TAKE INTO CONSIDERATION THAT SOME # chrUnXXXXX SCAFFOLDS CONTAINS MULTIPLE CONTIGS WITH KNOWN BRIDGED GAP LENGTHS. # A MORE ACCURATE APPROACH NEXT TIME WOULD BE TO CONSTRUCT THE .FA AND .AGP # FILES THAT PRESERVE THOSE KNOWN BRIDGED GAPS IN chrUnXXXX SCAFFOLDS # INSTEAD OF LETTING THE scaffoldFaToAgp PROGRAM TO ASSIGN THE DEFAULT # 1,000 BASE GAPS BETWEEN ALL CONTIGS. (Fan 5/10/07). ssh kkstore05 cd /cluster/data/equCab1 mkdir ucsc cd ucsc cat ../chromosomes.agp |grep Un |grep -v fragment |cut -f 1,6 >un.tab ssh hgwdev hgsql equCab1 CREATE TABLE horseUnContig ( unId varchar(40) NOT NULL, contigId varchar(40) NOT NULL, KEY (unId), KEY (contigId) ) TYPE=MyISAM; load data local infile "un.tab" into table horseUnContiq; quit hgsql equCab1 -N -e 'select s.* from horseContigSeq s, horseUnContig u where s.id=u.contigId' >unSeq.tab hgsql equCab1 -e 'load data local infile "unSeq.tab" into table horseUnContigSeq' pepPredToFa equCab1 horseUnContigSeq horseUnSeq.fa scaffoldFaToAgp horseUnSeq.fa cat ../chromosomes.agp |grep -v Un >j1.tmp cat j1.tmp horseUnSeq.agp >ucsc.agp qaToQac ../assembly.quals.gz assembly.quals.qac qacAgpLift ../ucsc.agp assembly.quals.qac ucscChrom.qac mv ucscChrom.qac .. ######################################################################### # MAKE GENOME DB (IN PROGRESS 2/8/07 Fan) ssh kkstore05 cd /cluster/data/equCab1 cat > equCab1.config.ra <& makeGenomeDb.log & tail -f makeGenomeDb.log # Follow the directions at the end of the log after #NOTES -- STUFF THAT YOU WILL HAVE TO DO -- ######################################################################### # REPEATMASKER (DONE 02/14/07, Fan) ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: doRepeatMasker.pl equCab1 -verbose 3 -debug # Run it for real and tail the log: mkdir /cluster/data/equCab1/bed/RepeatMasker.2007-02-13 cd /cluster/data/equCab1/bed/RepeatMasker.2007-02-13 doRepeatMasker.pl equCab1 -verbose 3 \ >& do.log & tail -f do.log # RM version info from do.log: #grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker ## October 6 2006 (open-3-1-6) version of RepeatMasker #grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl ##CC RELEASE 20061006; # Took almost 2 days. # Completed: 4942 of 4942 jobs # CPU time in finished jobs: 34711332s 578522.21m 9642.04h 401.75d 1.101 y # IO & Wait Time: 918255s 15304.24m 255.07h 10.63d 0.029 y # Average job time: 7210s 120.16m 2.00h 0.08d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 13135s 218.92m 3.65h 0.15d # Submission to last job: 125032s 2083.87m 34.73h 1.45d ssh hgwdev featureBits equCab1 rmsk # 994130286 bases of 2421923695 (41.047%) in intersection ######################################################################### # SIMPLE REPEATS (TRF) (DONE 2/15/07 Fan) ssh kolossus nice tcsh mkdir /cluster/data/equCab1/bed/simpleRepeat cd /cluster/data/equCab1/bed/simpleRepeat twoBitToFa ../../equCab1.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp \ >& trf.log & tail -f trf.log # ~2 hours # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed splitFileByColumn trfMask.bed trfMaskChrom # Load unfiltered repeats into the database: ssh hgwdev cd /cluster/data/equCab1/bed/simpleRepeat hgLoadBed equCab1 simpleRepeat \ /cluster/data/equCab1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits equCab1 simpleRepeat #40603979 bases of 2421923695 (1.677%) in intersection ######################################################################### # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 2/15/07 Fan) ssh kolossus cd /cluster/data/equCab1 time twoBitMask equCab1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed equCab1.2bit # This warning is OK -- the extra fields are not block coordinates. #Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks). 2.234u 4.142s 0:17.72 35.9% 0+0k 0+0io 5pf+0w # Link to it from /gbdb: ssh hgwdev ln -s /cluster/data/equCab1/equCab1.2bit /gbdb/equCab1/equCab1.2bit ######################################################################### # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 2/15/07 Fan) cd /cluster/data/equCab1 /cluster/bin/scripts/makeDownloads.pl equCab1 -verbose=2 \ >& jkStuff/downloads.log & tail -f jkStuff/downloads.log ######################################################################### ## Send sequence to san for kluster runs (DONE - 2007-02-15 - Fan) ssh kkstore05 cd /cluster/data/equCab1 mkdir /san/sanvol1/scratch/equCab1 cp -p equCab1.2bit /san/sanvol1/scratch/equCab1/ cp -p chrom.* /san/sanvol1/scratch/equCab1/ ######################################################################### ## Photograph (DONE - 2007-02-20 - Hiram) mkdir /cluster/data/equCab1/photograph wget --timestamping \ "http://www.genome.gov/Images/press_photos/highres/20008-300.jpg" \ -O Twilight_equCab1.jpg convert -quality 80 -sharpen 0 -normalize -geometry 320x200 \ Twilight_equCab1.jpg Equus_caballus.jpg # Add Equus_caballus.jpg to the browser/images source tree and copy # to /usr/local/apache/htdocs/images # Remember to use the -kb option when adding this binary data: cvs add -kb Equus_caballus.jpg ########################################################################## ## WindowMasker masked sequence (DONE - 2007-02-20 - Hiram) ssh kolossus mkdir /cluster/data/equCab1/bed/WindowMasker.2007-02-20 cd /cluster/data/equCab1/bed/WindowMasker.2007-02-20 ## copy the .csh scripts from xenTro2 WindowMasker run, edit to fixup # reference to correct DB and work directory. time nice -n +19 ./doCount.csh > doCount.out 2>&1 # real 78m24.832s # user 61m36.550s # sys 2m28.392s time nice -n +19 ./doSdust.csh > doSdust.out 2>&1 ### Surprisingly, this did not mask more than RepeatMasker ? ssh kkstore05 cd /cluster/data/equCab1/bed/WindowMasker.2007-02-20 gzip *.counts *.bed nice -n +19 ./applyMask.csh # this addTrf properly gets the n's changed to N which WM masked nice -n +19 ./addTrf.csh # measuring faSize of resulting equCab1.sdTrf.2bit: # 2462493800 bases (40570105 N's 2421923695 real 1686809847 upper # 735113848 lower) in 34 sequences in 1 files # %29.85 masked total, %30.35 masked real # vs. existing equCab1.2bit # 2462493800 bases (40570105 N's 2421923695 real # 1427706342 upper 994217353 lower) in 34 sequences in 1 files # %40.37 masked total, %41.05 masked real ssh hgwdev cd /cluster/data/equCab1/bed/WindowMasker.2007-02-20 time nice -n +19 ./load.csh > load.out 2>&1 # Loaded 14600828 elements of size 3 ########################################################################## ## ADD BLAT SERVER (DONE - 2007-02-20 - Fan) # ask admin to create blat server. echo 'INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("equCab1", "blat11", "17778", "1"); \ INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("equCab1", "blat11", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest ########################################################################## # MAKE 11.OOC FILE FOR BLAT (DONE 2/20/07, Fan) # Use -repMatch=860 (based on size -- for human we use 1024, and # horse size is 84% of human judging by twoBitInfo -noNs) ssh kolossus blat /cluster/data/equCab1/equCab1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/equCab1/11.ooc -repMatch=860 # Wrote 22800 overused 11-mers to /cluster/bluearc/equCab1/11.ooc ssh kkr1u00 mkdir /iscratch/i/equCab1 cd /iscratch/i/equCab1 cp -p /cluster/bluearc/equCab1/11.ooc . cp /cluster/data/equCab1/equCab1.2bit . -p iSync ########################################################################## # BLASTZ/CHAIN/NET RN4 (DONE 2/18/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.rn4.2007-02-18 cd /cluster/data/equCab1/bed/blastz.rn4.2007-02-18 cat << '_EOF_' > DEF # Horse vs. Rat BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Mouse rn4 SEQ2_DIR=/scratch/hg/rn4/nib SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.rn4.2007-02-18 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1Rn4 >& do.log & tail -f do.log ssh hgwdev cd /cluster/data/equCab1/bed ln -s blastz.rn4.2007-02-18 /cluster/data/equCab1/bed/blastz.rn4 nice featureBits equCab1 -chrom=chr1 chainRn4Link # 67408374 bases of 177498097 (37.977%) in intersection bash time nice -n 19 featureBits equCab1 chainRn4Link \ > fb.equCab1.chainRn4Link.txt 2>&1 # 859116859 bases of 2421923695 (35.472%) in intersection ssh kkstore05 mkdir /cluster/data/rn4/bed/blastz.equCab1.swap cd /cluster/data/rn4/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.rn4.2007-02-18/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -verbose=2 -swap -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/equCab1Rn4 > swap.log 2>&1 & ssh hgwdev cd /cluster/data/rn4/bed/blastz.equCab1.swap bash time nice -n 19 featureBits rn4 chainEquCab1Link \ > fb.rn4.chainEquCab1Link.txt 2>&1 # 867029914 bases of 2571531505 (33.716%) in intersection ######################################################################### # BLASTZ/CHAIN/NET CANFAM2 (DONE 2/19/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 cd /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 cat << '_EOF_' > DEF # Horse vs. Dog BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Dog canFam2 SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_LEN=/cluster/data/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do.log & tail -f do.log # 6 jobs failed. # Robert re-ran those 6 jobs. (2/22/07) doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -continue cat -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do2.log & tail -f do2.log ln -s blastz.canFam2.2007-02-18 /cluster/data/equCab1/bed/blastz.canFam2 ssh hgwdev nice featureBits equCab1 -chrom=chrI chainCanFam2Link # 128747357 bases of 177498097 (72.534%) in intersection bash time nice -n 19 featureBits equCab1 chainCanFam2Link \ > fb.equCab1.chainCanFam2Link.txt 2>&1 # 1717664968 bases of 2421923695 (70.922%) in intersection ssh kkstore04 mkdir /cluster/data/canFam2/bed/blastz.equCab1.swap cd /cluster/data/canFam2/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 & # real 115m55.880s cat *.txt # 1673177547 bases of 2384996543 (70.154%) in intersection ######################################################################### # BLASTZ/CHAIN/NET PANTRO2 (DONE 2/21/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19 cd /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19 cat << '_EOF_' > DEF # Horse vs. Chimp BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Chimp panTro2 SEQ2_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ2_LEN=/cluster/data/panTro2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.panTro2.2007-02-19 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1PanTro2 >& do.log & tail -f do.log ln -s blastz.panTro2.2007-02-19 /cluster/data/equCab1/bed/blastz.panTro2 nice featureBits equCab1 -chrom=chr1 chainPanTro2Link # 121391547 bases of 177498097 (68.390%) in intersection bash time nice -n 19 featureBits equCab1 chainPanTro2Link \ > fb.equCab1.chainPanTro2Link.txt 2>&1 # 1593914101 bases of 2421923695 (65.812%) in intersection ssh kkstore04 mkdir /cluster/data/panTro2/bed/blastz.equCab1.swap cd /cluster/data/panTro2/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -verbose=2 -swap -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/equCab1PanTro2 > swap.log 2>&1 & # The -blastzOutRoot /cluster/bluearc/equCab1PanTro2 option above # should not be there. It caused an error message during cleaning up. # Don't specify it for -swap next time. ssh hgwdev cd /cluster/data/panTro2/bed/blastz.equCab1.swap bash time nice -n 19 featureBits panTro2 chainEquCab1Link \ > fb.panTro2.chainEquCab1Link.txt 2>&1 # 1636782924 bases of 2909485072 (56.257%) in intersection ######################################################################### # BLASTZ/CHAIN/NET mm8 (DONE 2/21/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.mm8.2007-02-17 cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17 cat << '_EOF_' > DEF # Horse vs. Mouse BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Mouse mm8 SEQ2_DIR=/scratch/hg/mm8/mm8.2bit SEQ2_LEN=/cluster/data/mm8/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.mm8.2007-02-17 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab1/blastz.mm8 >& do.log & tail -f do.log ssh hgwdev cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17 ln -s blastz.mm8.2007-02-17 /cluster/data/equCab1/bed/blastz.mm8 nice featureBits equCab1 -chrom=chr1 chainMm8Link # 70800969 bases of 177498097 (39.888%) in intersection bash time nice -n 19 featureBits equCab1 chainMm8Link \ > fb.equCab1.chainMm8Link.txt 2>&1 # 903993981 bases of 2421923695 (37.325%) in intersection ssh kkstore05 mkdir /cluster/data/mm8/bed/blastz.equCab1.swap cd /cluster/data/mm8/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.mm8.2007-02-17/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 & tail -f swap.log # real 76m34.873s ssh hgwdev cd /cluster/data/mm8/bed/blastz.equCab1.swap bash time nice -n 19 featureBits mm8 chainEquCab1Link \ > fb.mm8.chainEquCab1Link.txt 2>&1 # 906568751 bases of 2567283971 (35.312%) in intersection ######################################################################### # BLASTZ/CHAIN/NET galGal3 (DONE 2/27/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22 cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22 cat << '_EOF_' > DEF # Horse vs. Chicken BLASTZ_M=50 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 SEQ2_DIR=/scratch/hg/galGal3/galGal3.2bit SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /cluster/bluearc/equCab1/blastz.galGal3 >& do.log & tail -f do.log ssh hgwdev cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22 ln -s blastz.galGal3.2007-02-22 /cluster/data/equCab1/bed/blastz.galGal3 nice featureBits equCab1 -chrom=chr1 chainGalGal3Link # 9136777 bases of 177498097 (5.148%) in intersection bash time nice -n 19 featureBits equCab1 chainGalGal3Link \ > fb.equCab1.chainGalGal3Link.txt 2>&1 # 114216918 bases of 2421923695 (4.716%) in intersection ssh kkstore05 mkdir /cluster/data/galGal3/bed/blastz.equCab1.swap cd /cluster/data/galGal3/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 & tail -f swap.log # the above script stopped with the following lines in the .log file: # netToAxt axtChain/net/chrM.net axtChain/chain/chrM.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout # axtSort stdin stdout # gzip -c # Processing chrM # end # netToAxt axtChain/net/chrUn_random.net axtChain/chain/chrUn_random.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout # axtSort stdin stdout # gzip -c # Processing chrUn_random # Hiram helped to created the following temp .csh: cat finiNet.csh #!/bin/csh -efx # This script was automatically generated by /cluster/bin/scripts/doBlastzChainNet.pl # from /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF # It is to be executed on kolossus in /cluster/data/galGal3/bed/blastz.equCab1.swap/axtChain . # It generates nets (without repeat/gap stats -- those are added later on # hgwdev) from chains, and generates axt, maf and .over.chain from the nets. # This script will fail if any of its commands fail. cd /cluster/data/galGal3/bed/blastz.equCab1.swap foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.galGal3.equCab1.net.axt.gz end # Make mafNet for multiz: one .maf per galGal3 seq. mkdir mafNet foreach f (axtNet/*.galGal3.equCab1.net.axt.gz) axtToMaf -tPrefix=galGal3. -qPrefix=equCab1. $f \ /cluster/data/galGal3/chrom.sizes /san/sanvol1/scratch/equCab1/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end ssh kolossus cd /cluster/data/galGal3/bed/blastz.equCab1.swap fininet.csh ssh hgwdev cd /cluster/data/galGal3/bed/blastz.equCab1.swap bash time nice -n 19 featureBits galGal3 chainEquCab1Link \ > fb.galGal3.chainEquCab1Link.txt 2>&1 # 104418042 bases of 1042591351 (10.015%) in intersection ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE Fan 2007-02-19) ssh kkstore05 bash mkdir /cluster/data/equCab1/blastDb cd /cluster/data/equCab1 zcat /cluster/data/equCab1/goldenPath/chromosomes/*.fa.gz > temp.fa faSplit sequence temp.fa 500 blastDb/ rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/equCab1/blastDb cd /cluster/data/equCab1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/equCab1/blastDb done mkdir -p /cluster/data/equCab1/bed/tblastn.hg18KG cd /cluster/data/equCab1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/equCab1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 34 query.lst # we want around 100000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(100000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(100000/34) = 12.487180 # 13 did not work, 50 did not work either, finally 100 seems to work mkdir -p /cluster/bluearc/equCab1/bed/tblastn.hg18KG/kgfa split -l 100 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/equCab1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/equCab1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/equCab1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit # back to bash ssh pk cd /cluster/data/equCab1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time #Completed: 12512 of 12512 jobs #CPU time in finished jobs: 20551564s 342526.06m 5708.77h 237.87d 0.652 y #IO & Wait Time: 119733s 1995.56m 33.26h 1.39d 0.004 y #Average job time: 1652s 27.54m 0.46h 0.02d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 29231s 487.18m 8.12h 0.34d #Submission to last job: 162262s 2704.37m 45.07h 1.88d ssh kkstore05 cd /cluster/data/equCab1/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/equCab1/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 368 of 368 jobs # CPU time in finished jobs: 297634s 4960.57m 82.68h 3.44d 0.009 y # IO & Wait Time: 221852s 3697.53m 61.63h 2.57d 0.007 y # Average job time: 1412s 23.53m 0.39h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 47807s 796.78m 13.28h 0.55d # Submission to last job: 47828s 797.13m 13.29h 0.55d ssh kkstore05 cd /cluster/data/equCab1/bed/tblastn.hg18KG/blastOut bash for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/equCab1/bed/tblastn.hg18KG/blastHg18KG.psl cd /cluster/data/equCab1/bed/tblastn.hg18KG pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/equCab1/bed/tblastn.hg18KG hgLoadPsl equCab1 blastHg18KG.psl # check coverage nice featureBits equCab1 blastHg18KG # 32605747 bases of 2421923695 (1.346%) in intersection # In comparison to cat and dog: nice featureBits felCat3 blastHg18KG # 15218612 bases of 1642698377 (0.926%) in intersection nice featureBits canFam2 blastHg18KG # 32565727 bases of 2384996543 (1.365%) in intersection featureBits equCab1 refGene:cds blastHg18KG -enrichment # refGene:cds 0.011%, blastHg18KG 1.346%, both 0.010%, cover 90.61%, enrich 67.31x ssh kkstore04 rm -rf /cluster/data/equCab1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut #end tblastn ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-02-21 - Fan) # Make a liftAll.lft that specifies 5M chunks for genbank: ssh kkstore05 cd /cluster/data/equCab1 simplePartition.pl equCab1.2bit 5000000 /tmp/equCab1P cat /tmp/equCab1P/*/*.lft > jkStuff/liftAll.lft rm -r /tmp/equCab1P cat equCab1.agp | agpToLift -revStrand > jkStuff/genbank.lft # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add equCab1 just after gasAcu1 # equCab1 (Equus caballus) equCab1.serverGenome = /cluster/data/equCab1/equCab1.2bit equCab1.clusterGenome = /iscratch/i/equCab1/equCab1.2bit equCab1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} equCab1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} equCab1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} equCab1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} equCab1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} equCab1.ooc = /iscratch/i/equCab1/11.ooc equCab1.lift = /cluster/data/equCab1/jkStuff/genbank.lft equCab1.align.unplacedChroms = chrUn equCab1.genbank.mrna.xeno.load = yes equCab1.downloadDir = equCab1 cvs ci -m "Added equCab1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Encountered a problem: # test -e /cluster/data/genbank/etc/CVS || cp -r etc/CVS /cluster/data/genbank/etc (cd /cluster/data/genbank/etc && cvs update -d) # Permission denied. # cvs [update aborted]: end of file from server (consult above messages if any) # make[1]: *** [install-etc] Error 1 # added the following line to my .cshrc fix the problem. setenv CVS_RSH ssh # Edit src/lib/gbGenome.c to add new species. cvs ci -m "Added equCab (horse)." src/lib/gbGenome.c make install-server ssh kkstore02 cd /cluster/data/genbank bash nice -n +19 bin/gbAlignStep -initial equCab1 & # cluster jobs crashed when KK shut down due to power outage # manually pushed to finish the remaining jobs. ssh kkstore02 cd /cluster/data/genbank bash nice -n +19 bin/gbAlignStep -initial -continue=finish equCab1 & # load database when finished ssh hgwdev cd /cluster/data/genbank bash time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad equCab1 # real 16m19.370s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add equCab1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added equCab1." etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # BLASTZ/CHAIN/NET HG18 PART I (STARTED 2/16/07, DONE 2/21/07, Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.hg18.2007-02-15 cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15 # NOTE: THE TARGET WAS ORIGINALLY INTENDED TO BE HORSE, BUT I DID NOT # DISCOVER THIS UNTIL THE TASK IS DONE. cat << '_EOF_' > DEF # Horse vs. Human BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse equCab1 SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ2_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.hg18.2007-02-15 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab1/blastz.hg18 >& do.log & tail -f do.log ln -s blastz.hg18.2007-02-15 /cluster/data/hg18/bed/blastz.equCab1 nice featureBits hg18 -chrom=chr1 chainEquCab1Link # 132947074 bases of 224999719 (59.088%) in intersection ssh hgwdev cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15 bash time nice -n 19 featureBits hg18 chainEquCab1Link \ > fb.hg18.chainEquCab1Link.txt 2>&1 & # 1643928877 bases of 2881515245 (57.051%) in intersection ######################################################################### # BLASTZ/CHAIN/NET HG18 PART II (DONE 2/27/07 Fan) # NOTE: THE PART II WAS NOT DONE BY USING -swap OPTION ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.hg18.2007-02-25 cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-25 cat << '_EOF_' > DEF # Horse vs. Human BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Human hg18 SEQ2_DIR=/scratch/hg/hg18/hg18.2bit SEQ2_LEN=/cluster/data/hg18/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.hg18.2007-02-25 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1Hg18 >& do.log & tail -f do.log rm /cluster/data/equCab1/bed/blastz.hg18 ln -s blastz.hg18.2007-02-25 /cluster/data/equCab1/bed/blastz.hg18 nice featureBits equCab1 -chrom=chr1 chainHg18Link # 122777285 bases of 177498097 (69.171%) in intersection bash time nice -n 19 featureBits equCab1 chainHg18Link \ > fb.equCab1.chainHg18Link.txt 2>&1 # 1618657165 bases of 2421923695 (66.834%) in intersection ############################################################################ ## Reset default position the HMGB1 gene (DONE - 2004-04-09 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr17:9817422-9822403" where name="equCab1";' hgcentraltest ########################################################################## # NSCAN track - (2007-03-08 markd) # uses hg18 as informat, with pseudogene masking # # remove old, alpha N-SCAN track built by robert cd /cluster/data/equCab1/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv http://mblab.wustl.edu/predictions/horse/equCab1.gtf wget -nv http://mblab.wustl.edu/predictions/horse/equCab1.prot.fa bzip2 equCab1.* chmod a-w *.bz2 # load tracks. ldHgGene -bin -gtf -genePredExt equCab1 nscanGene equCab1.gtf.bz2 hgPepPred equCab1 generic nscanPep equCab1.prot.fa.bz2 rm -f *.tab # update trackDb; need a equCab1-specific page to describe informants horse/equCab1/nscanGene.html horse/equCab1/trackDb.ra # request push of nscanGene nscanPep # QA NOTE (ASZ 06-11-2007): mytouch of nscanPep table to bring into line with nscanGene. # sudo mytouch equCab1 nscanPep sudo mytouch equCab1 nscanPep 200705081600.00 ########################################################################## ## ADD chrM record into chrM_gold. (DONE, 2007-05-02, Fan) # get a template record from oryLat1 hgsql oryLat1 -e 'select * from chrM_gold' equCab1.chrM_gold.tab # edit it with equCab1 chrM legth and accession. vi equCab1.chrM_gold.tab # load it into chrM_gold table hgsql equCab1 -e 'load data local infile "equCab1M.tab" into table chrM_gold' ########################################################################## # SGP GENES (DONE - 2007-12-20 - Hiram) ssh kkstore05 mkdir /cluster/data/equCab1/bed/sgp cd /cluster/data/equCab1/bed/sgp mkdir gtf for C in `awk '{print $1}' /cluster/data/equCab1/chrom.sizes` do wget --timestamping \ "http://genome.imim.es/genepredictions/E.caballus/ecJan2007/SGP/200603mm8/${C}.gtf" \ -O "gtf/${C}.gtf" done ssh hgwdev cd /cluster/data/equCab1/bed/sgp ldHgGene -gtf -genePredExt equCab1 sgpGene gtf/chr*.gtf # Read 30403 transcripts in 267753 lines in 33 files # 30403 groups 33 seqs 1 sources 3 feature types # 30403 gene predictions featureBits equCab1 sgpGene # 32915117 bases of 2421923695 (1.359%) in intersection featureBits equCab1 -enrichment refGene:CDS sgpGene # refGene:CDS 0.015%, sgpGene 1.359%, both 0.014%, cover 89.17%, enrich 65.61x ##################################################################### # LOAD GENEID GENES (DONE - 2007-12-20 - Hiram) ssh kkstore05 mkdir -p /cluster/data/equCab1/bed/geneid cd /cluster/data/equCab1/bed/geneid mkdir gtf prot for C in `awk '{print $1}' /cluster/data/equCab1/chrom.sizes` do echo $C wget --timestamping \ "http://genome.imim.es/genepredictions/E.caballus/ecJan2007/geneid_v1.2/${C}.gtf" \ -O "gtf/${C}.gtf" wget --timestamping \ "http://genome.imim.es/genepredictions/E.caballus/ecJan2007/geneid_v1.2/${C}.prot" \ -O "prot/${C}.prot" done # Add missing .1 to protein id's cd prot foreach f (*.prot) perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot end ssh hgwdev cd /cluster/data/equCab1/bed/geneid ldHgGene -genePredExt -gtf equCab1 geneid gtf/*.gtf # Read 35577 transcripts in 285553 lines in 33 files # 35577 groups 33 seqs 1 sources 3 feature types # 35577 gene predictions # load proteins hgPepPred equCab1 generic geneidPep prot/chr*-fixed.prot featureBits equCab1 geneid # 38938548 bases of 2421923695 (1.608%) in intersection featureBits equCab1 -enrichment refGene:CDS geneid # refGene:CDS 0.015%, geneid 1.608%, both 0.013%, cover 86.04%, enrich 53.52x ######################################################################### # EQUCAB1->EQUCAB2 LIFTOVER (DONE - 2008-07-08 - larrym) ssh kkstore02 cd /cluster/data/equCab1 cp -p /cluster/data/equCab2/equCab2.2bit /cluster/bluearc/equCab2 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug equCab1 equCab2 # Real run: cd /cluster/data/equCab1/bed/blat.equCab2.2008-07-08 doSameSpeciesLiftOver.pl equCab1 equCab2 >& do.log & tail -f do.log rm -f /cluster/bluearc/equCab2/equCab2.2bit ########################################################################## ######################################################################### # REDO EQUCAB1->EQUCAB2 LIFTOVER (DONE - 2008-09-09 - larrym) # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -workhorse=hgwdev -bigClusterHub=pk -debug equCab1 equCab2 # Real run: cd /cluster/data/equCab1/bed/blat.equCab2.2008-07-08 doSameSpeciesLiftOver.pl -ooc /-workhorse=hgwdev -bigClusterHub=pk equCab1 equCab2 >& do.log & tail -f do.log rm -f /cluster/bluearc/equCab2/equCab2.2bit # redo to make sure we used the right .ooc file: doSameSpeciesLiftOver.pl -ooc /hive/data/genomes/equCab2/equCab2.11.ooc -workhorse=hgwdev -bigClusterHub=pk -debug equCab1 equCab2 HgStepManager: executing from step 'align' through step 'cleanup'. HgStepManager: executing step 'align' Thu Sep 4 12:59:45 2008. align: can't find equCab2/equCab2.2bit in /cluster/bluearc/, /san/sanvol1/scratch/, /scratch/hg/ -- please distribute. # fix scratch data for equCab2 cd /san/sanvol1/scratch/equCab2 ln -s /hive/data/genomes/equCab2/equCab2.2bit . cd /cluster/data/equCab1 doSameSpeciesLiftOver.pl -ooc /hive/data/genomes/equCab2/equCab2.11.ooc -workhorse=hgwdev -bigClusterHub=pk equCab1 equCab2