# for emacs: -*- mode: sh; -*- # Gasterosteus aculeatus - Broad Institute 1.0 # Jeff McKinnon gave permission for photo 8/16/06 and asked for notification # when the browser is public -- # e-mail: mckinnoj@uww.edu http://facstaff.uww.edu/mckinnoj/mckinnon.html # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AANH01 ######################################################################### # DOWNLOAD SEQUENCE (DONE 8/15/06 angie) ssh kkstore05 mkdir /cluster/store12/gasAcu1 ln -s /cluster/store12/gasAcu1 /cluster/data/gasAcu1 mkdir /cluster/data/gasAcu1/downloads cd /cluster/data/gasAcu1/downloads foreach f (Stickleback1.0.agp \ Stickleback1.0.agp.chromosome.fasta.gz \ Stickleback1.0.agp.chromosome.qual.gz \ assembly.agp \ assembly.bases.gz \ assembly.ps) wget --timestamping \ ftp://ftp.broad.mit.edu/pub/assemblies/fish/stickleback/gasAcu1/$f end faSize Stickleback1.0.agp.chromosome.fasta.gz #463338706 bases (16726587 N's 446612119 real 446612119 upper 0 lower) in 105 sequences in 1 files #Total size: mean 4412749.6 sd 1314228.9 min 83130 (XIII.20000001-20083130) max 5000000 (I.1-5000000) median 5000000 #N count: mean 159300.8 sd 213368.9 #U count: mean 4253448.8 sd 1311287.4 #L count: mean 0.0 sd 0.0 # Doh! Their chromosome fasta is chunked into 5M pieces. Not so # useful for us. # AGP and FA use different names, too. AGP has "groupI" and FA has "I". # A lot of our software really wants to see "chr" at the beginning. # So make AGP with substituted names and use that to construct the # chrom FA. sed -e 's/^group/chr/' Stickleback1.0.agp > UCSC.gasAcu1.agp # Their chromosome qual is chunked into 5M pieces too. (There is also # an assembly.agp.qual.gz that has *scaffolds* chunked.) Unchunk, # relying on the fact that chunks are in order in the file: zcat Stickleback1.0.agp.chromosome.qual.gz \ | perl -wpe 's/^>(\S+)\.1-\d+.*/>chr$1/; s/^>\S+\.\d+-\d+.*\n$//;' \ > UCSC.gasAcu1.qual ######################################################################### # MAKE GENOME DB (DONE 8/17/06 angie) ssh kkstore05 cd /cluster/data/gasAcu1 cat > gasAcu1.config.ra <& makeGenomeDb.log & tail -f makeGenomeDb.log # Follow the directions at the end of the log after #NOTES -- STUFF THAT YOU WILL HAVE TO DO -- ######################################################################### # REPEATMASKER (DONE (but might need better libs??) 8/16/06 angie) ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 -debug # Run it for real and tail the log: cd /cluster/data/gasAcu1/bed/RepeatMasker.2006-08-16 ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 \ >& do.log & tail -f do.log # RM version info from do.log: #grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker ## March 20 2006 (open-3-1-5) version of RepeatMasker #grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl #CC RELEASE 20060315; * # The cluster job crashed because no repeats were found on chrM # so check out line+ failed. NBD. Manually made run.cluster/run.time # and continued: ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 -continue cat \ >>& do.log & tail -f do.log ssh hgwdev featureBits gasAcu1 rmsk #11005259 bases of 446627861 (2.464%) in intersection # Wow, that is awfully skimpy. Stickleback is a ~675M vertebrate genome, # probably pretty compact, but still... # RepeatMaskerLib.embl has only 2 stickleback-specific repeats. ######################################################################### # REPEATMASKER - FUGU LIBS (DONE 8/17/06 angie) # RepeatMasker with stickleback as the species had very low coverage. # I found a paper on stickleback sex determination where they studied # repeats and they said they used fugu libs, so give that a try... # PMID: 15324658 # http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6VRT-4D56PX0-N&_coverDate=08%2F24%2F2004&_alid=435874407&_rdoc=1&_fmt=&_orig=search&_qd=1&_cdi=6243&_sort=d&view=c&_acct=C000059601&_version=1&_urlVersion=0&_userid=4428&md5=e5d62090265bd7b92b95f8f4cda5e079 ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -species fugu -verbose 3 -debug # Run it for real and tail the log: cd /cluster/data/gasAcu1/bed/RepeatMasker.2006-08-17 ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -species fugu -verbose 3 \ >& do.log & tail -f do.log # RM version info from do.log: #grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker ## March 20 2006 (open-3-1-5) version of RepeatMasker #grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl #CC RELEASE 20060315; * featureBits gasAcu1 rmsk #16563793 bases of 446627861 (3.709%) in intersection # Still seems rather slim, but it's an improvement. ######################################################################### # SIMPLE REPEATS (TRF) (DONE 8/16/06 angie) ssh kolossus nice tcsh mkdir /cluster/data/gasAcu1/bed/simpleRepeat cd /cluster/data/gasAcu1/bed/simpleRepeat twoBitToFa ../../gasAcu1.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp \ >& trf.log & tail -f trf.log # ~40 minutes # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed splitFileByColumn trfMask.bed trfMaskChrom # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed gasAcu1 simpleRepeat \ /cluster/data/gasAcu1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits gasAcu1 simpleRepeat #11062299 bases of 446627861 (2.477%) in intersection ######################################################################### # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 8/16/06 angie) ssh kolossus cd /cluster/data/gasAcu1 time twoBitMask gasAcu1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed gasAcu1.2bit # This warning is OK -- the extra fields are not block coordinates. #Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks). #0.372u 0.716s 0:04.34 24.8% 0+0k 0+0io 0pf+0w # Link to it from /gbdb: ssh hgwdev ln -s /cluster/data/gasAcu1/gasAcu1.2bit /gbdb/gasAcu1/gasAcu1.2bit ######################################################################### # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 8/17/06 angie) cd /cluster/data/gasAcu1 ~/kent/src/utils/makeDownloads.pl gasAcu1 -verbose=2 \ >& jkStuff/downloads.log & tail -f jkStuff/downloads.log # Edit README.txt files when done: # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/gasAcu1/goldenPath/database/README.txt # /cluster/data/gasAcu1/goldenPath/bigZips/README.txt # /cluster/data/gasAcu1/goldenPath/chromosomes/README.txt ######################################################################### # BLASTZ/CHAIN/NET DANRER4 (DONE 10/17/06 angie) ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16 cd /cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16 cat << '_EOF_' > DEF # Stickleback vs. Zebrafish # Try "human-fugu" (more distant, less repeat-killed than mammal) params +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes # QUERY: Zebrafish danRer4 SEQ2_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit SEQ2_CTGDIR=/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit SEQ2_LIFT=/san/sanvol1/scratch/danRer4/liftNAandUnScaffoldsToChrom.lft SEQ2_LEN=/cluster/data/danRer4/chrom.sizes SEQ2_CTGLEN=/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/gasAcu1danRer4 >& do.log & tail -f do.log ln -s blastz.danRer4.2006-10-16 /cluster/data/gasAcu1/bed/blastz.danRer4 nice featureBits gasAcu1 -chrom=chrI chainDanRer4Link #9296061 bases of 27557028 (33.734%) in intersection time nice -n 19 featureBits gasAcu1 chainDanRer4Link \ > fb.gasAcu1.chainDanRer4Link.txt 2>&1 # 157049518 bases of 446627861 (35.163%) in intersection # I'm curious what a swap would look like on this one # (DONE - 2006-12-08 - Hiram) ssh kkstore04 mkdir /cluster/data/danRer4/bed/blastz.gasAcu1.swap cd /cluster/data/danRer4/bed/blastz.gasAcu1.swap time doBlastzChainNet.pl \ /cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -verbose=2 -swap -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/gasAcu1danRer4 > swap.log 2>&1 & # real 38m51.527s ssh hgwdev cd /cluster/data/danRer4/bed/blastz.gasAcu1.swap time nice -n 19 featureBits danRer4 chainGasAcu1Link \ > fb.danRer4.chainGasAcu1Link.txt 2>&1 # 213083380 bases of 1626093931 (13.104%) in intersection ######################################################################### # BLASTZ/CHAIN/NET TETNIG1 (DONE 10/17/06 angie) ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17 cd /cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17 cat << '_EOF_' > DEF # Stickleback vs. Tetraodon # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes # QUERY: Tetraodon tetNig1 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/gasAcu1tetNig1 >& do.log & tail -f do.log ln -s blastz.tetNig1.2006-10-17 /cluster/data/gasAcu1/bed/blastz.tetNig1 nice featureBits gasAcu1 -chrom=chrI chainTetNig1Link #11175314 bases of 27557028 (40.553%) in intersection time nice -n 19 featureBits gasAcu1 chainTetNig1Link \ > fb.gasAcu1.chainTetNig1Link.txt 2>&1 & # 187779775 bases of 446627861 (42.044%) in intersection # I'm curious what a swap would look like on this one # (DONE - 2006-12-08 - Hiram) ssh kkstore04 mkdir /cluster/data/tetNig1/bed/blastz.gasAcu1.swap cd /cluster/data/tetNig1/bed/blastz.gasAcu1.swap doBlastzChainNet.pl \ /cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -verbose=2 -swap -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/gasAcu1tetNig1 > swap.log 2>&1 & ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.gasAcu1.swap time nice -n 19 featureBits tetNig1 chainGasAcu1Link \ > fb.tetNig1.chainGasAcu1Link.txt 2>&1 & # 171216660 bases of 342403326 (50.004%) in intersection ######################################################################### # BLASTZ/CHAIN/NET HG18 (DONE 10/17/06 angie) ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17 cd /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17 cat << '_EOF_' > DEF # Stickleback vs. Human # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes # QUERY: Human hg18 SEQ2_DIR=/scratch/hg/hg18/hg18.2bit SEQ2_LEN=/cluster/data/hg18/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/gasAcu1hg18 >& do.log & tail -f do.log ln -s blastz.hg18.2006-10-17 /cluster/data/gasAcu1/bed/blastz.hg18 nice featureBits gasAcu1 -chrom=chrI chainHg18Link #2973001 bases of 27557028 (10.789%) in intersection ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17 time nice -n 19 featureBits gasAcu1 chainHg18Link \ > fb.gasAcu1.chainHg18Link.txt 2>&1 & # 50705450 bases of 446627861 (11.353%) in intersection # Been swapped ssh hgwdev cd /cluster/data/hg18/bed/blastz.gasAcu1.swap time nice -n 19 featureBits hg18 chainGasAcu1Link \ > fb.hg18.chainGasAcu1Link.txt 2>&1 & # 55424609 bases of 2881515245 (1.923%) in intersection ######################################################################### # BLASTZ/CHAIN/NET MM8 (DONE 10/17/06 angie) ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17 cd /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17 cat << '_EOF_' > DEF # Stickleback vs. Mouse # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes # QUERY: Mouse mm8 SEQ2_DIR=/scratch/hg/mm8/mm8.2bit SEQ2_LEN=/cluster/data/mm8/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/gasAcu1mm8 >& do.log & tail -f do.log ln -s blastz.mm8.2006-10-17 /cluster/data/gasAcu1/bed/blastz.mm8 nice featureBits gasAcu1 -chrom=chrI chainMm8Link #2981604 bases of 27557028 (10.820%) in intersection ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17 time nice -n 19 featureBits gasAcu1 chainMm8Link \ > fb.gasAcu1.chainMm8Link.txt 2>&1 & ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE 11/14/06 angie) # Use -repMatch=200 (based on size -- for human we use 1024, and # stickleback size is ~15% of human judging by gapless gasAcu1 vs. hg18 # genome sizes from featureBits. Bump it up a bit to be conservative. ssh kolossus blat /cluster/data/gasAcu1/gasAcu1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/gasAcu1/11.ooc -repMatch=200 #Wrote 5700 overused 11-mers to /cluster/bluearc/gasAcu1/11.ooc ######################################################################### # GENBANK AUTO UPDATE (DONE 11/15/06 angie) # Make a liftAll.lft that specifies 5M chunks for genbank: ssh kkstore05 cd /cluster/data/gasAcu1 simplePartition.pl gasAcu1.2bit 5000000 /tmp/gasAcu1P cat /tmp/gasAcu1P/*/*.lft > jkStuff/liftAll.lft rm -r /tmp/gasAcu1P # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add gasAcu1 just after dm2... # gasAcu1 (G. Aculeatus) gasAcu1.serverGenome = /cluster/data/gasAcu1/gasAcu1.2bit gasAcu1.clusterGenome = /iscratch/i/gasAcu1/gasAcu1.2bit gasAcu1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} gasAcu1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} gasAcu1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} gasAcu1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} gasAcu1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} gasAcu1.ooc = /cluster/bluearc/gasAcu1/11.ooc gasAcu1.lift = /cluster/data/gasAcu1/jkStuff/liftAll.lft gasAcu1.genbank.mrna.xeno.load = yes gasAcu1.downloadDir = gasAcu1 cvs ci -m "Added gasAcu1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. cvs ci -m "Added G. aculeatus (stickleback)." src/lib/gbGenome.c make install-server ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial gasAcu1 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad gasAcu1 & # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add gasAcu1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added gasAcu1." etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # EXTRACT CHROM TO SCAFFOLD LIFTUP (DONE 3/6/07 angie) # Ensembl genes use scaffold_* instead of chrUn. Broad doesn't seem to # provide a scaffold->chrom mapping in their download files, but here's # a way to extract one from ctg->chrom and ctg->scaffold agp... cd /cluster/data/gasAcu1/downloads # Distill contig coordinates to a single field that will be the same # for ctg->chrom and ctg->scaf maps: awk '$5 != "N" {print $6 ":" $7 ":" $8 "\t" $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $9;}' \ UCSC.gasAcu1.agp \ | sort > chrom.tmp awk '$5 != "N" {print $6 ":" $7 ":" $8 "\t" $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $9;}' \ assembly.agp \ | sort > scaf.tmp # Join the maps into what is AGP except it has a bunch of lines for # each scaffold that must be collapsed into one line per scaffold: join -t " " -o '1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 1.7' chrom.tmp scaf.tmp \ | sort -k 1,1 -k 2n,2n \ > tmp.agp # Make sure the join went right (same #lines in all): wc -l chrom.tmp scaf.tmp tmp.agp # 16967 chrom.tmp # 16967 scaf.tmp # 16967 tmp.agp # Now collapse the many per-contig lines for each scaffold into a single # line per scaffold. This produces gapless AGP with sequence numbers # that aren't consecutive (but still monotonically increasing): perl -we 'while (<>) { \ chomp; @w = split; \ if (! defined $lastScaf) { \ ($chr, $chrS, $chrE, $seq, $type, \ $lastScaf, $firstScafS, $firstScafE, $strand) = @w; \ $lastScafS = $firstScafS; $lastScafE = $firstScafE; \ } elsif ($lastScaf ne $w[5]) { \ $scafS = ($firstScafS < $lastScafS) ? $firstScafS : $lastScafS;\ $scafE = ($firstScafE > $lastScafE) ? $firstScafE : $lastScafE;\ print "$chr\t$chrS\t$chrE\t$seq\t$type\t" . \ "$lastScaf\t$scafS\t$scafE\t$strand\n"; \ ($chr, $chrS, $chrE, $seq, $type, \ $lastScaf, $firstScafS, $firstScafE, $strand) = @w; \ $lastScafS = $firstScafS; $lastScafE = $firstScafE; \ } else { \ ($thisChr,undef,$chrE,undef,undef, \ undef,$lastScafS,$lastScafE,undef) = @w; \ die "$lastScaf $lastScafE: $thisChr != $chr" \ if ($thisChr ne $chr); \ } \ } \ $scafS = ($firstScafS < $lastScafS) ? $firstScafS : $lastScafS;\ $scafE = ($firstScafE > $lastScafE) ? $firstScafE : $lastScafE;\ print "$chr\t$chrS\t$chrE\t$seq\t$type\t" . \ "$lastScaf\t$scafS\t$scafE\t$strand\n"; \ ' tmp.agp \ > UCSC.chromToScaffoldSansGaps.agp # fortunately agpToLift doesn't need gap lines in its input, just frags: agpToLift -revStrand < UCSC.chromToScaffoldSansGaps.agp \ > ../jkStuff/UCSC.chromToScaffoldSansGaps.lft ######################################################################### # ENSEMBL GENES (DONE 11/27/06 angie - RELOADED 3/8/07 angie) ssh hgwdev mkdir /cluster/data/gasAcu1/bed/ensembl cd /cluster/data/gasAcu1/bed/ensembl # Go to www.ensembl.org and click around their evolving interface # to get the following types of data: # 1. a tab-separated file that relates Ensembl gene, transcript and # protein IDs. Save as ensGtp.txt.gz # 2. peptide fasta with transcript IDs in header -> ensemblPep.fa.gz # They have started dumping GTF files so we can download that directly: wget 'ftp://ftp.ensembl.org/pub/current_gasterosteus_aculeatus/data/gtf/*' # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software. # Also need to lift scaffold coords up to chrUn: gunzip -c Gasterosteus_aculeatus.BROADS1.43.gtf.gz \ | sed -e 's/^group/chr/; s/^MT/chrM/;' \ | liftUp -type=.gtf stdout ../../jkStuff/UCSC.chromToScaffoldSansGaps.lft \ carry stdin \ > ensGene.gtf ldHgGene -gtf -genePredExt gasAcu1 ensGene \ /cluster/data/gasAcu1/bed/ensembl/ensGene.gtf # strip header line: gunzip -c ensGtp.txt.gz | tail +2 > ensGtp.txt hgLoadSqlTab gasAcu1 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt gunzip -c ensemblPep.fa.gz \ | perl -wpe 's/\|.*//' \ > ensPep.fa # ONE TIME ONLY... seven protein sequences were chopped up in the file # from BioMart and the fragments appeared as competing duplicate # sequences. Ensembl helpdesk is still looking at it but sent the # correct whole sequences for the seven, so I'm manually editing those # in. The ID's are ENSGACT00000000021, ENSGACT00000000176, # ENSGACT00000017572, ENSGACT00000011000, ENSGACT00000000042, # ENSGACT00000000087, ENSGACT00000022185. hgPepPred gasAcu1 generic ensPep ensPep.fa ######################################################################### ######################################################################### ######################################################################### # BLASTZ/CHAIN/NET FR1 (DONE - 2006-12-20 - Hiram) ## no chrUn in gasAcu1, and align to fr1 scaffolds, ## results lifted to fr1 chrUn coordinates ssh kkstore04 mkdir /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08 cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08 cat << '_EOF_' > DEF # Stickleback vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stippleback gasAcu1, no chrUn in this run SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=30 SEQ1_LAP=0 # QUERY: Fugu fr1 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/gasAcu1Fr1 > do.log 2>&1 & # real 272m33.395s # user 0m0.189s # sys 0m0.163s ssh hgwdev cd /cluster/data/gasAcu1/bed ln -s blastz.fr1.2006-12-08 blastz.fr1 cd blastz.fr1 time nice -n 19 featureBits gasAcu1 chainFr1Link \ > fb.gasAcu1.chainFr1Link.txt 2>&1 & # 146655780 bases of 446627861 (32.836%) in intersection mkdir /cluster/data/fr1/bed/blastz.gasAcu1.swap cd /cluster/data/fr1/bed/blastz.gasAcu1.swap time doBlastzChainNet.pl \ /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > do.log 2>&1 & time doBlastzChainNet.pl \ /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \ -continue=net -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > net.log 2>&1 & # real 24m33.761s # user 0m0.125s # sys 0m0.080s ssh hgwdev cd /cluster/data/fr1/bed/blastz.gasAcu1.swap time nice -n 19 featureBits fr1 chainGasAcu1Link \ > fb.fr1.chainGasAcu1Link.txt 2>&1 & # 148005715 bases of 315518167 (46.909%) in intersection ####################################################################### ## Swap of oryLat1 alignments to gasAcu1 (DONE - 2006-12-08 - Hiram) ## This sequence duplicated in oryLat1.txt too ssh kkstore04 cd /cluster/data/gasAcu1/bed rm blastz.oryLat1.swap mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.swap cd /cluster/data/gasAcu1/bed/blastz.oryLat1.swap time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-08/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap > swap.log 2>&1 & # real 26m32.549s ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.oryLat1.swap nice -n 19 featureBits -noRandom gasAcu1 chainOryLat1Link \ >fb.gasAcu1.oryLat1.txt 2>&1 & # 148335558 bases of 391398261 (37.899%) in intersection ########################################################################### ## checking measurements to determine which fish is close to which fish ## 2006-12-08 - Hiram # featureBits chainLink measures # chainGasAcu1Link chain linearGap # distance on gasAcu1 on other minScore # 1 0.xxxx - tetraodon tetNig1 (% 42.044) (% 50.004) 5000 loose # 3 0.xxxx - zebrafish danRer4 (% 35.163) (% 13.104) 5000 loose # 4 0.xxxx - fugu fr1 (% 32.836) (% 46.909) 2000 loose # 5 0.xxxx - human hg18 (% 11.353) (% 1.923) 2000 loose # 6 0.xxxx - mouse mm8 (% 11.070) (% x.923) 2000 # from oryLat1.txt: # 2 0.xxxx - medaka oryLat1 (% 37.899) (% 28.110) 2000 loose ###### danRer4 # 157049518 bases of 446627861 (35.163%) in intersection tetNig1 # 187779775 bases of 446627861 (42.044%) in intersection hg18 # 50705450 bases of 446627861 (11.353%) in intersection on hg18 # 55424609 bases of 2881515245 (1.923%) in intersection on tetNig1 # 171216660 bases of 342403326 (50.004%) in intersection on danRer4 # 213083380 bases of 1626093931 (13.104%) in intersection fr1 # 146655780 bases of 446627861 (32.836%) in intersection on fr1 # 148005715 bases of 315518167 (46.909%) in intersection ####################################################################### ## BLASTZ oryLat1 chrUn contigs/scaffolds only (DONE - 2006-12-20 - Hiram) ## This was an experiment, not used ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20 cd /cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20 cat << '_EOF_' > DEF # Stickleback vs. Medaka, both chrUn random contigs/scaffolds only # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 chrUn only # chrUn in contigs for this alignment run # The largest is 418,000 bases and there are 5,000 of them. SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.sizes SEQ1_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift SEQ1_CHUNK=1000000 SEQ1_LIMIT=100 SEQ1_LAP=0 # QUERY: Medaka oryLat1 chrUn only # chrUn in scaffolds for this alignment run # The largest scaffold is only 76,000 bases, there are 36,000 of them # and a total size of 119 million. SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20 TMPDIR=/scratch/tmp '_EOF_' time doBlastzChainNet.pl -verbose=2 DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/gasAcu1OryLat1 \ -bigClusterHub=kk > do.log 2>&1 & # real 106m42.410s # user 0m0.087s # sys 0m0.062s # Had some difficulty with the kk kluster, continuing after getting the # jobs completed time doBlastzChainNet.pl -verbose=2 DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/gasAcu1OryLat1 \ -continue=cat -bigClusterHub=kk > cat.log 2>&1 & ####################################################################### ## BLASTZ Stickleback chroms vs Medaka random contigs ## (DONE - 2006-12-20 - Hiram) ## This was an experiment, not used ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20 cd /cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20 cat << '_EOF_' > DEF # Stickleback chroms vs. Medaka random contigs # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms # The largest is 32 million bases, there are 22 of them SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Medaka oryLat1 chrUn only # chrUn in scaffolds for this alignment run # The largest scaffold is only 76,000 bases, there are 36,000 of them # and a total size of 119 million. SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LIMIT=40 SEQ2_LAP=0 BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20 TMPDIR=/scratch/tmp '_EOF_' time doBlastzChainNet.pl -verbose=2 DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/gasAcu1ChrOryLat1Un \ -bigClusterHub=kk > do.log 2>&1 & ####################################################################### ## BLASTZ Stickleback chroms vs Medaka chroms and random contigs ## (DONE - 2006-12-22 - Hiram) ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22 cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22 cat << '_EOF_' > DEF # Stickleback chroms vs. Medaka random contigs # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms # The largest is 32 million bases, there are 22 of them SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Medaka oryLat1 chroms and chrUn in scaffolds # The largest scaffold is only 76,000 bases, there are 36,000 of them # and a total size of 119 million. # The biggest chrom is just under 40 million SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=10000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/gasAcu1ChrOryLat1ChrUn \ -bigClusterHub=pk > do.log 2>&1 & # real 132m54.403s # user 0m0.213s # sys 0m0.138s ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22 time featureBits gasAcu1 chainOryLat1Link \ > fb.gasAcu1.chainOryLat1Link.txt 2>&1 # 164155231 bases of 446627861 (36.754%) in intersection ######################################################################### ## Reorder Fish organisms (DONE - 2006-12-22 - Hiram) hgsql -h genome-testdb hgcentraltest \ -e "update dbDb set orderKey = 470 where name = 'gasAcu1';" ######################################################################### ## 7-Way Multiz (DONE - 2006-12-22 - Hiram) ## replaced by 8-Way done below ssh hgwdev mkdir /cluster/data/gasAcu1/bed/multiz7way cd /cluster/data/gasAcu1/bed/multiz7way # Constructed this tree by pruining the 17-way tree from mm8 # multiz17way, and then adding in gasAcu1 and oryLat1 branch # Arbitrarily set 0.2 distances for this added branch # All other distances remain as specified in the 17way.nh cat << '_EOF_' > 7way.nh ((human_hg18:0.054922,mouse_mm8:0.412790):0.754413, ( ((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.2, (stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961, zebrafish_danRer4:0.782561):0.156067); '_EOF_' # << happy emacs /cluster/bin/phast/draw_tree 7way.nh > 7way.ps /cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt # Use this output to create the table below grep -y gasAcu1 7way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainGasAcu1Link chain linearGap # distance on gasAcu1 on other minScore # 1 0.4000 - medaka oryLat1 (% 38.754) (% 27.044) 2000 loose # 2 0.7994 - tetraodon tetNig1 (% 42.044) (% 50.004) 5000 loose # 3 0.8399 - fugu fr1 (% 32.836) (% 49.909) 2000 loose # 4 1.4755 - zebrafish danRer4 (% 35.163) (% 13.104) 5000 loose # 6 1.6584 - human hg18 (% 11.353) (% 1.923) 2000 loose # 7 2.0162 - mouse mm8 (% 11.070) (% 2.056) 2000 cd /cluster/data/gasAcu1/bed/multiz7way # bash shell syntax here ... export H=/cluster/data/gasAcu1/bed mkdir mafLinks for G in oryLat1 tetNig1 fr1 danRer4 hg18 mm8 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore05 mkdir /san/sanvol1/scratch/gasAcu1/multiz7way cd /san/sanvol1/scratch/gasAcu1/multiz7way time rsync -a --copy-links --progress \ /cluster/data/gasAcu1/bed/multiz7way/mafLinks/ . mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/gasAcu1/bed/multiz7way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 7way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = gasAcu1 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz7way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz7way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/data/gasAcu1/bed/multiz7way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 23 jobs para try ... check ... push ... etc ... # Completed: 23 of 23 jobs # CPU time in finished jobs: 9086s 151.43m 2.52h 0.11d 0.000 y # IO & Wait Time: 194s 3.24m 0.05h 0.00d 0.000 y # Average job time: 403s 6.72m 0.11h 0.00d # Longest finished job: 707s 11.78m 0.20h 0.01d # Submission to last job: 707s 11.78m 0.20h 0.01d # combine results into a single file for loading and gbdb reference ssh kkstore05 cd /cluster/data/gasAcu1/bed/multiz7way time catDir maf > multiz7way.maf # real 0m6.467s # makes an 1.5 Gb file: # -rw-rw-r-- 1 1577409197 Dec 22 14:49 multiz7way.maf # Create per-chrom individual maf files for downloads ssh kkstore05 cd /cluster/data/gasAcu1/bed/multiz7way mkdir mafDownloads time for M in maf/chr*.maf do B=`basename $M` cp -p ${M} mafDownloads/${B} gzip mafDownloads/${B} echo ${B} done done # real 4m58.923s # user 4m41.178s # sys 0m14.387s # deliver to downloads ssh hgwdev ln -s /cluster/data/gasAcu1/bed/multiz7way/mafDownloads \ /usr/local/apache/htdocs/goldenPath/gasAcu1/multiz7way # Load into database ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz7way mkdir /gbdb/gasAcu1/multiz7way ln -s /cluster/data/gasAcu1/bed/multiz7way/multiz7way.maf \ /gbdb/gasAcu1/multiz7way time nice -n +19 hgLoadMaf gasAcu1 multiz7way # Loaded 1925835 mafs in 1 files from /gbdb/gasAcu1/multiz7way # real 0m45.970s # user 0m25.438s # sys 0m8.453s time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 gasAcu1 multiz7waySummary multiz7way.maf # Created 684283 summary blocks from 5182575 components # and 1925835 mafs from multiz7way.maf # real 0m54.289s # user 0m48.004s # sys 0m1.779s # Create tree image for details page # You can get a better image from the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif cp 7way.nh treeImage.nh # Edit treeImage.nh to make the names as desired # Submit this treeImage.nh to the phyloGif program, or: /cluster/bin/phast/draw_tree -b -s treeImage.nh > treeImage.ps ############################################################################ # ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2006-12-22 - Hiram) ## replaced by 8-Way done below ssh kolossus mkdir /cluster/data/gasAcu1/bed/multiz7way/anno cd /cluster/data/gasAcu1/bed/multiz7way/anno mkdir maf run cd run rm -f sizes nBeds ln -s /cluster/data/gasAcu1/chrom.sizes gasAcu1.len twoBitInfo -nBed /cluster/data/gasAcu1/gasAcu1.sdTrf.{2bit,N.bed} ln -s /cluster/data/gasAcu1/gasAcu1.sdTrf.N.bed gasAcu1.bed for DB in `grep -v gasAcu1 /cluster/data/gasAcu1/bed/multiz7way/species.lst` do ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds echo ${DB}.len >> sizes echo $DB done echo '#!/bin/csh -ef' > jobs.csh echo date >> jobs.csh # do smaller jobs first so you can see some progress immediately: for F in `ls -1rS ../../maf/*.maf` do echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $F \ /cluster/data/gasAcu1/gasAcu1.sdTrf.2bit ../maf/`basename $F` >> jobs.csh echo "echo $F" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh time ./jobs.csh > jobs.log 2>&1 & # to watch progress; tail -f jobs.log # real 19m20.705s # user 7m11.818s # sys 11m31.508s # Load anno/maf ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz7way/anno/maf mkdir -p /gbdb/gasAcu1/multiz7way/anno/maf ln -s /cluster/data/gasAcu1/bed/multiz7way/anno/maf/*.maf \ /gbdb/gasAcu1/multiz7way/anno/maf time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/gasAcu1/multiz7way/anno/maf gasAcu1 multiz7way # Loaded 2269386 mafs in 23 files from /gbdb/gasAcu1/multiz7way/anno/maf # real 0m52.053s # user 0m35.608s # sys 0m3.762s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/gasAcu1/bed/multiz7way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary gasAcu1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz7waySummary stdin ####################################################################### ## BLASTZ Stickleback chroms vs chicken chroms and random contigs ## (DONE - 2006-12-28 - 2007-01-05 - Hiram) ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28 cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28 cat << '_EOF_' > DEF # Stickleback chroms vs. Chicken random contigs # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms # The largest is 32 million bases, there are 22 of them SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 chroms and randoms in contigs # The largest chrom is 200M bases, the largest contig 122,228 # The smallest chrom is 1028 bases, smallest contig 54 bases # there are 25,991 chroms and contig pieces # total size 1,089,690,673 bases # 47107538 N's 1042583135 real 834274605 upper 208308530 lower SEQ2_DIR=/san/sanvol1/scratch/galGal3/galGal3.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/galGal3/galGal3.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/galGal3/galGal3.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/galGal3/galGal3.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/galGal3/randomContigs.lft SEQ2_CHUNK=10000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/gasAcu1ChrGalGal3ChrUn \ -bigClusterHub=pk > do.log 2>&1 & # real 53m30.713s # failed during the load step due to difficulties on hgwdev # continuing 2007-01-02 cd axtChain ./loadUp.csh cd .. time doBlastzChainNet.pl -verbose=2 DEF \ -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/gasAcu1ChrGalGal3ChrUn \ -continue=download -bigClusterHub=pk > download.log 2>&1 & # And, to swap: mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.swap cd /cluster/data/galGal3/bed/blastz.gasAcu1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \ -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > swap.log 2>&1 & # failed on the net step: # HgStepManager: executing step 'net'. # netChains: looks like previous stage was not successful # (can't find [galGal3.gasAcu1.]all.chain[.gz]). time doBlastzChainNet.pl -verbose=2 \ /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \ -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 & # real 7m11.286s featureBits gasAcu1 chainGalGal3Link > fb.gasAcu1.chainGalGal2Link.txt # 37154459 bases of 446627861 (8.319%) in intersection featureBits galGal3 chainGasAcu1Link > fb.galGal3.chainGasAcu1Link.txt # 32781658 bases of 1042591351 (3.144%) in intersection # Adding the chrUn result from the galGal3 swapped alignments ssh kkstore05 cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain chainSplit chain gasAcu1.galGal3.all.chain.gz cp -p ../../blastz.galGal3.swap/axtChain/chain/chrUn.chain chain/chrUn.chain ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain/chain hgLoadChain gasAcu1 chrUn_chainGalGal3 chrUn.chain Loading 13904 chains into gasAcu1.chrUn_chainGalGal3 ssh kkstore05 cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain/chain chainMergeSort *.chain | gzip -c > ../gasAcu1.galGal3.all.chain.gz # This changes the IDs, but nothing else # reusing the existing netChains with some alterations cp netChains.csh netChainsUn.csh # fixup the reference to chrom.sizes and the 2bit file in this script # and running it after moving existing items out of the way ######################################################################### ## 8-Way Multiz (DONE - 2007-01-02 - 2007-01-05 - Hiram) ## re-done with new chrUn.maf 2007-01-13 - Hiram ## re-done with fr2 instead of fr1 - 2007-02-02 - Hiram ssh hgwdev mkdir /cluster/data/gasAcu1/bed/multiz8way cd /cluster/data/gasAcu1/bed/multiz8way /cluster/bin/phast/tree_doctor --prune chimp_panTro2,macaque_rheMac2,rat_rn4,rabbit_oryCun1,cow_bosTau2,dog_canFam2,armadillo_dasNov1,elephant_loxAfr1,tenrec_echTel1,monodelphis_monDom4,xenopus_xenTro2 17way.nh # use the output of that to manually construct this tree. # Arbitrarily set 0.2 distances for this added branch # All other distances remain as specified in the 17way.nh cat << '_EOF_' > 8way.nh (((human_hg18:0.054922,mouse_mm8:0.412790):0.475049, chicken_galGal3:0.454691):0.279364, (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2, (stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961, zebrafish_danRer4:0.782561):0.156067); '_EOF_' # << happy emacs # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/gasAcu1_8way.gif /cluster/bin/phast/all_dists 8way.nh > 8way.distances.txt # Use this output to create the table below grep -y gasAcu1 8way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainGasAcu1Link chain linearGap # distance on gasAcu1 on other minScore # 1 0.4000 - medaka oryLat1 (% 38.308) (% 27.044) 2000 loose # 2 0.7994 - tetraodon tetNig1 (% 42.044) (% 50.004) 5000 loose # 3 0.8399 - fugu fr2 (% 37.574) (% 40.269) 2000 loose # 3 0.8399 - fugu fr1 (% 35.925) (% 46.909) 2000 loose # 4 1.4755 - zebrafish danRer4 (% 35.163) (% 13.104) 5000 loose # 6 1.5831 - chicken galGal3 (% 9.181) (% 3.144) 2000 loose # 7 1.6584 - human hg18 (% 11.353) (% 1.923) 2000 loose # 8 2.0162 - mouse mm8 (% 11.070) (% 2.056) 2000 loose cd /cluster/data/gasAcu1/bed/multiz8way # bash shell syntax here ... export H=/cluster/data/gasAcu1/bed mkdir mafLinks for G in oryLat1 tetNig1 fr2 danRer4 galGal3 hg18 mm8 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore05 mkdir /san/sanvol1/scratch/gasAcu1/multiz8way cd /san/sanvol1/scratch/gasAcu1/multiz8way time rsync -a --copy-links --progress \ /cluster/data/gasAcu1/bed/multiz8way/mafLinks/ . mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/gasAcu1/bed/multiz8way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 8way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = gasAcu1 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz8way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz8way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/data/gasAcu1/bed/multiz8way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 23 jobs para try ... check ... push ... etc ... # Completed: 23 of 23 jobs # CPU time in finished jobs: 11075s 184.58m 3.08h 0.13d 0.000 y # IO & Wait Time: 155s 2.58m 0.04h 0.00d 0.000 y # Average job time: 488s 8.14m 0.14h 0.01d # Longest finished job: 1123s 18.72m 0.31h 0.01d # Submission to last job: 1123s 18.72m 0.31h 0.01d # combine results into a single file for loading and gbdb reference ssh kkstore05 cd /cluster/data/gasAcu1/bed/multiz8way time nice -n +19 catDir maf > multiz8way.maf # real 0m7.656s # makes an 1.7 Gb file: # -rw-rw-r-- 1 1742307109 Feb 2 16:58 multiz8way.maf # Create per-chrom individual maf files for downloads # NOT NECESSARY HERE - DONE LATER WITH THE ANNOTATED MAFS ssh kkstore05 cd /cluster/data/gasAcu1/bed/multiz8way mkdir mafDownloads time for M in maf/chr*.maf do B=`basename $M` cp -p ${M} mafDownloads/${B} gzip mafDownloads/${B} echo ${B} done done # real 5m9.273 # deliver to downloads *!* NOT NECESSARY HERE - DONE LATER WITH # THE ANNOTATED MAFS ssh hgwdev ln -s /cluster/data/gasAcu1/bed/multiz8way/mafDownloads \ /usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way # Load into database ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz8way mkdir /gbdb/gasAcu1/multiz8way ln -s /cluster/data/gasAcu1/bed/multiz8way/multiz8way.maf \ /gbdb/gasAcu1/multiz8way time nice -n +19 hgLoadMaf gasAcu1 multiz8way # Loaded 1989046 mafs in 1 files from /gbdb/gasAcu1/multiz8way # real 0m47.121s time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 gasAcu1 multiz8waySummary multiz8way.maf # Created 775687 summary blocks from 6070904 components # and 2047322 mafs from multiz8way.maf # real 1m0.331s # Create tree image for details page # You can get a better image from the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # cp 8way.nh treeImage.nh # Edit treeImage.nh to make the names as desired # Submit this treeImage.nh to the phyloGif program, or: # /cluster/bin/phast/draw_tree -b -s treeImage.nh > treeImage.ps ############################################################################ # ANNOTATE MULTIZ8WAY MAF AND LOAD TABLES (DONE - 2007-01-05 - Hiram) ## re-done with new chrUn.maf 2007-01-13 - Hiram ## re-done with fr2 instead of fr1 - 2007-02-02 - Hiram ## re-done with correct nBeds and sizes files, 2007-03-27 - Hiram ssh kolossus mkdir /cluster/data/gasAcu1/bed/multiz8way/anno cd /cluster/data/gasAcu1/bed/multiz8way/anno mkdir maf run cd run rm -f sizes nBeds ln -s /cluster/data/gasAcu1/chrom.sizes gasAcu1.len # This were done before during the 7-way # twoBitInfo -nBed /cluster/data/gasAcu1/gasAcu1.sdTrf.{2bit,N.bed} ln -s /cluster/data/gasAcu1/gasAcu1.sdTrf.N.bed gasAcu1.bed for DB in \ `cat /cluster/data/gasAcu1/bed/multiz8way/species.lst | sed -e "s/ gasAcu1//"` do ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds echo ${DB}.len >> sizes echo $DB done echo '#!/bin/csh -ef' > jobs.csh echo date >> jobs.csh # do smaller jobs first so you can see some progress immediately: for F in `ls -1rS ../../maf/*.maf` do echo mafAddIRows -nBeds=nBeds -sizes=sizes $F \ /cluster/data/gasAcu1/gasAcu1.sdTrf.2bit ../maf/`basename $F` >> jobs.csh echo "echo $F" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh time nice -n +19 ./jobs.csh > jobs.log 2>&1 & # to watch progress; tail -f jobs.log # this was run on titan (can not do this for larger genomes on titan) # real 48m40.965s # Load anno/maf ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf mkdir -p /gbdb/gasAcu1/multiz8way/anno/maf ln -s /cluster/data/gasAcu1/bed/multiz8way/anno/maf/*.maf \ /gbdb/gasAcu1/multiz8way/anno/maf time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/gasAcu1/multiz8way/anno/maf gasAcu1 multiz8way # Loaded 2395584 mafs in 23 files from /gbdb/gasAcu1/multiz8way/anno/maf # real 2m16.918s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary gasAcu1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz8waySummary stdin # Created 322344 summary blocks from 6111763 components # and 2395584 mafs from stdin # real 23m49.073s # -rw-rw-r-- 1 15063781 Mar 27 12:02 multiz8waySummary.tab ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf sed -e 's/mafSummary/multiz8waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz8waySummary.sql time nice -n +19 hgLoadSqlTab gasAcu1 multiz8waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz8waySummary.tab # real 0m4.525 ######################################################################### # Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram) # edit src/hg/makeDb/genbank/genbank.conf to add: gasAcu1.upstreamGeneTbl = ensGene gasAcu1.upstreamMaf = multiz8way /hive/data/genomes/gasAcu1/bed/multiz8way/species.lst ######################################################################### # MULTIZ8WAY DOWNLOADABLES (DONE - 2007-01-05 - Hiram) ## re-done with new chrUn.maf 2007-01-13 - Hiram ## re-done with fr2 in place of fr1 - 2007-02-03 - Hiram # Annotated MAF is now documented, so use anno/maf for downloads/ ssh hgwdev mkdir /cluster/data/gasAcu1/bed/multiz8way/mafDownloads cd /cluster/data/gasAcu1/bed/multiz8way # upstream mafs # rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst # did not have mm8 as the first one # There isn't any refGene table, using ensGene instead for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 gasAcu1 \ ensGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags gasAcu1 multiz8way \ stdin stdout -orgs=species.lst \ | gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz echo "done ensGene.upstream${S}.maf.gz" done ## re-done after correction to anno/maf files 2007-03-27 - Hiram ssh kkstore05 cd /cluster/data/gasAcu1/bed/multiz8way time for M in anno/maf/chr*.maf do B=`basename $M` nice -n +19 gzip -c ${M} > mafDownloads/${B}.gz echo ${B}.gz done done # real 6m1.454 cd mafDownloads nice -n +19 md5sum *.maf.gz > md5sum.txt # Make a README.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way cd /usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way ln -s /cluster/data/gasAcu1/bed/multiz8way/mafDownloads/{*.gz,*.txt} . ####################################################################### # MULTIZ8WAY MAF FRAMES (DONE - 2007-01-10 - Hiram) ## re-done with new chrUn.maf 2007-01-13 - Hiram ## re-donw with fr2 in place of fr1 - 2007-02-03 - Hiram ssh hgwdev mkdir /cluster/data/gasAcu1/bed/multiz8way/frames cd /cluster/data/gasAcu1/bed/multiz8way/frames mkdir genes # The following is adapted from the mm8 sequence #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using ensGene for gasAcu1 oryLat1 for qDB in danRer4 galGal3 do geneTbl=refGene echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done # using knownGene for mm8 hg18 # using genscan for tetNig1 # using ensGene for gasAcu1, oryLat1 and fr2 # genePreds; (must keep only the first 10 columns for knownGene) for qDB in mm8 hg18 gasAcu1 oryLat1 fr2 tetNig1 do if [ $qDB = "gasAcu1" -o $qDB = "oryLat1" -o $qDB = "fr2" ]; then geneTbl=ensGene elif [ $qDB = "tetNig1" ]; then geneTbl=genscan else geneTbl=knownGene fi echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done ### ssh kkstore05 cd /cluster/data/gasAcu1/bed/multiz8way/frames time cat ../maf/*.maf | nice -n +19 genePredToMafFrames gasAcu1 stdin stdout gasAcu1 genes/gasAcu1.gp.gz oryLat1 genes/oryLat1.gp.gz fr2 genes/fr2.gp.gz tetNig1 genes/tetNig1.gp.gz danRer4 genes/danRer4.gp.gz galGal3 genes/galGal3.gp.gz mm8 genes/mm8.gp.gz hg18 genes/hg18.gp.gz | gzip > multiz8way.mafFrames.gz # real 0m55.706 ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz8way/frames time nice -n +19 hgLoadMafFrames gasAcu1 multiz8wayFrames \ multiz8way.mafFrames.gz # real 0m32.220s ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS # (DONE - 2007-01-10 - 2007-01-17 - Hiram) ## re-done with fr2 in place of fr1 - 2007-02-03 - Hiram # Estimate phastCons parameters ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/multiz8way/cons cd /cluster/data/gasAcu1/bed/multiz8way/cons twoBitToFa -seq=chrIV ../../../gasAcu1.sdTrf.2bit chrIV.fa # Create a starting-tree.mod based on chrIV (the largest one) time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrIV.maf \ --refseq ./chrIV.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # 26 seconds time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "(((hg18,mm8),galGal3),(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer4))" \ --out-root starting-tree # Fitting tree model to s1.1-32632948.ss using REV ... # Writing model to starting-tree.mod ... # real 3m53.084 rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.449 # This 0.449 is used in the --gc argument below ## the fa files are needed for the sequence and they are created during # this loop if they haven't been done before # Create SS files on san filesystem # Using a size of 10,000,000 to slow down the phastCons pk jobs ssh kkstore05 mkdir -p /san/sanvol1/scratch/gasAcu1/cons/ss cd /san/sanvol1/scratch/gasAcu1/cons/ss time for C in `awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes` do if [ -s /cluster/data/gasAcu1/bed/multiz8way/maf/${C}.maf ]; then mkdir -p ${C} mkdir -p /cluster/data/gasAcu1/bed/multiz8way/cons/${C} echo msa_split $C if [ ! -s /cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa ]; then twoBitToFa -seq=${C} /cluster/data/gasAcu1/gasAcu1.sdTrf.2bit \ /cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa fi chrN=${C/chr/} chrN=${chrN/_random/} nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/gasAcu1/bed/multiz8way/maf/${C}.maf \ --refseq /cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa \ --in-format MAF --windows 10000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done & # real 6m31.721 # Create a random list of 50 1 mb regions (do not use the _randoms) cd /san/sanvol1/scratch/gasAcu1/cons/ss ls -1l chr*/chr*.ss | grep -v Un | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk mkdir /san/sanvol1/scratch/gasAcu1/cons/treeRun6 cd /san/sanvol1/scratch/gasAcu1/cons/treeRun6 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/gasAcu1/bed/multiz8way/cons/starting-tree.mod \ --gc 0.449 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 10 --target-coverage 0.20 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP ./makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 47 of 47 jobs # CPU time in finished jobs: 78013s 1300.22m 21.67h 0.90d 0.002 y # IO & Wait Time: 224s 3.73m 0.06h 0.00d 0.000 y # Average job time: 1665s 27.74m 0.46h 0.02d # Longest finished job: 2123s 35.38m 0.59h 0.02d # Submission to last job: 2123s 35.38m 0.59h 0.02d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh pk cd /san/sanvol1/scratch/gasAcu1/cons/treeRun6 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt 2>&1 & ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt cp -p ave.*.mod .. cd .. cp -p ave.*.mod /cluster/data/gasAcu1/bed/multiz8way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \ 0.20 10 ave.{cons,noncons}.mod ## 0.20 10 with fr2 ( Solving for new omega: 10.000000 12.945684 12.467518 12.456325 ) Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000 Relative entropy: H=1.364850 bits/site Expected min. length: L_min=6.767655 sites Expected max. length: L_max=4.721873 sites Phylogenetic information threshold: PIT=L_min*H=9.236837 bits Recommended expected length: omega=12.456325 sites (for L_min*H=9.783400) ## 0.20 10 ( Solving for new omega: 10.000000 12.922391 12.450309 12.439360 ) Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000 Relative entropy: H=1.358227 bits/site Expected min. length: L_min=6.803722 sites Expected max. length: L_max=4.741215 sites Phylogenetic information threshold: PIT=L_min*H=9.241002 bits Recommended expected length: omega=12.439360 sites (for L_min*H=9.783400) ## 0.01 12 does not convert, trying without --NH Transition parameters: gamma=0.010000, omega=12.000000, mu=0.083333, nu=0.000842 Relative entropy: H=1.012920 bits/site Expected min. length: L_min=15.386530 sites Expected max. length: L_max=10.103812 sites Phylogenetic information threshold: PIT=L_min*H=15.585327 bits ## 0.10 12 # ( Solving for new omega: 12.000000 7.303911 5.603783 4.937054 4.700330 4.651391 4.648962 4.648956 ) # Transition parameters: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259 # Relative entropy: H=1.200128 bits/site # Expected min. length: L_min=9.375672 sites # Expected max. length: L_max=6.463613 sites # Phylogenetic information threshold: PIT=L_min*H=11.252009 bits # Recommended expected length: omega=4.648956 sites (for L_min*H=9.783400) ## 0.17 12 # ( Solving for new omega: 12.000000 10.568744 10.445646 10.444628 ) # Trans parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 # Relative entropy: H=1.267443 bits/site # Expected min. length: L_min=7.976979 sites # Expected max. length: L_max=5.664374 sites # Phylogenetic information threshold: PIT=L_min*H=10.110368 bits # Recommended expected length: omega=10.444628 sites (for L_min*H=9.783400) /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \ ave.cons.mod ave.noncons.mod Transition parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 Relative entropy: H=1.253111 bits/site Expected min. length: L_min=8.076184 sites Expected max. length: L_max=5.736142 sites Phylogenetic information threshold: PIT=L_min*H=10.120357 bits #Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259 # Relative entropy: H=1.454874 bits/site # Required length: N=7.596943 sites # Total entropy: NH=11.052595 bits # SKIP to here passing by the tuning numbers ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/gasAcu1/cons/consRun5 cd /san/sanvol1/scratch/gasAcu1/cons/consRun5 mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.{cons,noncons}.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 10 --target-coverage 0.20 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP ./doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive, even on the san # they will hang it up as they work at full tilt. # Completed: 58 of 58 jobs # CPU time in finished jobs: 1739s 28.99m 0.48h 0.02d 0.000 y # IO & Wait Time: 302s 5.03m 0.08h 0.00d 0.000 y # Average job time: 35s 0.59m 0.01h 0.00d # Longest finished job: 47s 0.78m 0.01h 0.00d # Submission to last job: 77s 1.28m 0.02h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/gasAcu1/cons/consRun5 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less # if that looks right, then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved8Way.bed # ~ 16 seconds cp -p mostConserved8Way.bed /cluster/data/gasAcu1/bed/multiz8way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore05 cd /cluster/data/gasAcu1/bed/multiz8way/cons faSize chr*/*.fa # 463354448 bases (16726587 N's 446627861 real 348847599 # upper 97780262 lower) in 23 sequences in 23 files cd /san/sanvol1/scratch/gasAcu1/cons/consRun5 # The 446627861 comes from the non-n genome as counted above. awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/446627861\n",100.0*sum/446627861,sum}' \ mostConserved8Way.bed # % 11.58 = 100.0*51703035/446627861 --exp-len 10 --tar-cov 0.20 fr2 # % 11.49 = 100.0*51326297/446627861 --exp-len 10 --tar-cov 0.20 # % 8.36 = 100.0*37347218/446627861 --exp-len 12 --tar-cov 0.01 # % 10.24 = 100.0*45733412/446627861 --exp-len 12 --tar-cov 0.10 # % 11.45 = 100.0*51136218/446627861 --exp-len 12 --tar-cov 0.17 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits gasAcu1 \ -enrichment ensGene:cds mostConserved8Way.bed # --expected-length 10 --target-coverage 0.20 fr2 # ensGene:cds 6.618%, mostConserved8Way.bed 11.576%, both 4.070%, cover # 61.49%, enrich 5.31x # --expected-length 10 --target-coverage 0.20 # ensGene:cds 6.618%, mostConserved8Way.bed 11.492%, both 4.041%, cover # 61.07%, enrich 5.31x # --expected-length 12 --target-coverage 0.01 # ensGene:cds 6.618%, mostConserved8Way.bed 8.362%, both 3.628%, cover # 54.82%, enrich 6.56x # --expected-length 12 --target-coverage 0.10 # ensGene:cds 6.618%, mostConserved8Way.bed 10.240%, both 3.920%, cover # 59.24%, enrich 5.79x # --expected-length 12 --target-coverage 0.17 # ensGene:cds 6.618%, mostConserved8Way.bed 11.449%, both 4.092%, cover # 61.84%, enrich 5.40x # Load most conserved track into database ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz8way # ended up using the set: --expected-length 10 --target-coverage 0.20 time nice -n +19 hgLoadBed -strict gasAcu1 phastConsElements8way \ mostConserved8Way.bed # Loaded 1263039 elements of size 5 # real 0m29.697 # should measure the same as above time nice -n +19 \ featureBits gasAcu1 -enrichment ensGene:cds phastConsElements8way # At: --expected-length 10 --target-coverage 0.20 fr2 # ensGene:cds 6.618%, phastConsElements8way 11.576%, both 4.070%, cover # 61.49%, enrich 5.31x # At: --expected-length 10 --target-coverage 0.20 # ensGene:cds 6.618%, phastConsElements8way 11.492%, both 4.041%, cover # 61.07%, enrich 5.31x # At: --expected-length 12 --target-coverage 0.10 result: # ensGene:cds 6.618%, phastConsElements8way 10.240%, both 3.920%, cover # 59.24%, enrich 5.79x # At: --expected-length 12 --target-coverage 0.17 result: # ensGene:cds 6.618%, phastConsElements8way 11.449%, # both 4.092%, cover 61.84%, enrich 5.40x # Create merged posterier probability file and wiggle track data files ssh pk cd /san/sanvol1/scratch/gasAcu1/cons/consRun5 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \ phastCons8.wig phastCons8.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 3m55.780 # -rw-rw-r-- 1 297832681 Feb 3 14:50 phastCons8.wib # -rw-rw-r-- 1 49168315 Feb 3 14:50 phastCons8.wig time nice -n +19 cp -p phastCons8.wi? /cluster/data/gasAcu1/bed/multiz8way/ # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/gasAcu1/cons/consRun5 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons8Scores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons8Scores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 5m31.855s # copy them for downloads ssh kkstore05 # this directory is actually a symlink from store9 to store8 to # avoid the data full problem on store9 mkdir /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores cd /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores cp -p /san/sanvol1/scratch/gasAcu1/cons/consRun5/phastCons8Scores/* . # make a README.txt file here, and an md5sum ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/gasAcu1/phastCons8Scores cd /usr/local/apache/htdocs/goldenPath/gasAcu1/phastCons8Scores ln -s /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores/* . # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz8way ln -s `pwd`/phastCons8.wib /gbdb/gasAcu1/wib/phastCons8.wib # ended up using the set: --expected-length 10 --target-coverage 0.20 time nice -n +19 hgLoadWiggle gasAcu1 phastCons8 phastCons8.wig # real 0m9.256s # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/gasAcu1/bed/multiz8way time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=gasAcu1 phastCons8 > histogram.data 2>&1 # real 0m30.744 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Stickleback gasAcu1 Histogram phastCons8 track" set xlabel " phastCons8 score - --expected-length 10 --target-coverage 0.20" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # The mostConserved can also be characterized by a histogram awk '{print $3-$2}' mostConserved8Way.bed > mostCons.txt textHistogram -verbose=2 -autoScale=1000 -pValues mostCons.txt \ > mostCons.histo.txt cat << '_EOF_' | gnuplot > mostCons.png set terminal png small color \ xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Stickleback gasAcu1 histogram: lengths of mostConserved track elements" set xlabel " mostConserved element length - --expected-length 10 --target-coverage 0.20" set ylabel " # of elements at this length" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set xrange [0:200] set y2tics set boxwidth 2 set style fill solid plot "mostCons.histo.txt" using 2:3 title " # of elements" with boxes, \ "mostCons.histo.txt" using 2:6 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs ############################################################################## ## Medaka oryLat1 swap (DONE - 2007-01-12 - Hiram) ## And the swap ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.orylat1.swap cd /cluster/data/gasAcu1/bed/blastz.orylat1.swap time doBlastzChainNet.pl -verbose=2 -smallClusterHub=pk \ /cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -stop=net -bigClusterHub=pk > swap.log 2>&1 & time doBlastzChainNet.pl -verbose=2 -smallClusterHub=pk \ /cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -swap -stop=net -bigClusterHub=pk > net-swap.log 2>&1 & # real 18m21.951s ## Taking this chrUn chain result over to the other global result ssh kkstore05 # Note: previous result files temporarily moved out of the way cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22/axtChain chainSplit chain gasAcu1.oryLat1.all.chain.gz cp -p ../../blastz.orylat1.swap/axtChain/chain/chrUn.chain ./chain cd chain chainMergeSort *.chain | gzip -c > ../gasAcu1.oryLat1.all.chain.gz cd .. rm -fr chain # they have new IDs chainSplit chain gasAcu1.oryLat1.all.chain.gz # And then reusing the netChains.csh script, fixing up references # to the appropriate chrom.sizes and 2bit file for gasAcu1 cp netChains.csh netChainsUn.csh time ./netChainsUn.csh # real 30m9.385s ## reloading, reuse the load.csh script, fixup reference to correct ## chrom.sizes ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22/axtChain ########################################################################### ## SWAP FR1 BLASTZ (DONE - 2007-01-13 - Hiram) ssh kkstore02 mkdir /cluster/data/gasAcu1/bed/blastz.fr1.swap cd /cluster/data/gasAcu1/bed/blastz.fr1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -stop=net -bigClusterHub=pk > swap.log 2>&1 & # real 130m31.028s ## Take that chrUn result into the primary fr1 blastz result cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/axtChain chainSplit chain gasAcu1.fr1.all.chain.gz cd chain cp -p /cluster/data/gasAcu1/bed/blastz.fr1.swap/axtChain/chain/chrUn.chain . chainMergeSort *.chain | gzip -c > ../gasAcu1.fr1.all.chain.gz cd .. chainSplit chain gasAcu1.fr1.all.chain.gz # And then reusing the netChains.csh script, fixing up references # to the appropriate chrom.sizes and 2bit file for gasAcu1 ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/axtChain cp netChains.csh netChainsUn.csh time ./netChainsUn.csh # real 122m36.719s # And then reloading these, reusing the loadUp.csh script, fixup # reference to chrom.sizes cp loadUp.csh reLoad.csh time ./reLoad.csh > reLoad.out 2>&1 # real 3m38.799s # After moving previous stuff out of the way ./makeMd5sum.csh ./installDownloads.csh ######################################################################### # BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-23 - Hiram) ## no chrUn in gasAcu1, and align to fr2 scaffolds, ## results lifted to fr2 chrUn coordinates ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23 cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23 cat << '_EOF_' > DEF # Stickleback vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stippleback gasAcu1, no chrUn in this run SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=30 SEQ1_LAP=0 # QUERY: Fugu fr2 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/gasAcu1Fr2 > do.log 2>&1 & # real 27m33.395s # user 0m0.189s # sys 0m0.163s # Error because of inconsistent chrom sizes. We had attempted to modify # our lift file to remove a gap inserted at the end of the entire # sequence, which we should not have done. time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -continue=chainRun -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/gasAcu1Fr2 > chainRun.log 2>&1 & ###################################################### ## Now that it's finished, let's see how it did. This already ## set up chain and net tracks for fr2 on gasAcu1, though a modification ## to ~/kent/src/hg/makeDb/trackDb/trackDb.ra needed to be done to make ## sure fr2 got displayed in the browser. ## ## track chainFr2 ## shortLabel $o_db Chain ## longLabel $o_Organism ($o_date/$o_db) Chained Alignments ## group compGeno ## priority 140 ## visibility hide ## color 100,50,0 ## altColor 255,240,200 ## matrix 16 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 ## spectrum on ## type chain fr2 ## otherDb fr2 ## track netFr2 ## shortLabel $o_db Net ## longLabel $o_Organism ($o_date/$o_db) Alignment Net ## group compGeno ## priority 140.1 ## visibility hide ## spectrum on ## type netAlign fr2 chainFr2 ## otherDb fr2 ## To see how the matching did, let's use featureBits to find percentage of matching bases. ## This compares the gasAcu1 entries to only those in the Fr2 chain links that were created. ssh hgwdev cd /cluster/data/gasAcu1/bed ln -s blastz.fr2.2007-01-23 blastz.fr2 cd blastz.fr2 time nice -n 19 featureBits gasAcu1 chainFr2Link \ > fb.gasAcu1.chainFr2Link.txt 2>&1 & # 153570946 bases of 446627861 (34.385%) in intersection ## In addition, we want to put the chains and nets from gasAcu1 onto the ## fr2 browser, done below: # This is going to make the link back on the fr2 browser to # gasAcu1. mkdir /cluster/data/fr2/bed/blastz.gasAcu1.swap cd /cluster/data/fr2/bed/blastz.gasAcu1.swap time doBlastzChainNet.pl \ /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 24m33.761s # user 0m0.125s # sys 0m0.080s ## Now let's see how well these were matched back to Fr2. ssh hgwdev cd /cluster/data/fr2/bed/blastz.gasAcu1.swap time nice -n 19 featureBits fr2 chainGasAcu1Link \ > fb.fr2.chainGasAcu1Link.txt 2>&1 & # 158383996 bases of 393312790 (40.269%) in intersection ########################################################################### # Manually fixing up the fr2 alignments to gasAcu1 to get chrUn results ## (DONE - 2007-01-31 - 2007-02-02 - Hiram) ## Ran the blastz on fr2 and aligned just the chrUn sequence to fr2 ## swap back to gasAcu1 mkdir /cluster/data/gasAcu1/bed/blastz.fr2.swap cd /cluster/data/gasAcu1/bed/blastz.fr2.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -stop=chainMerge -bigClusterHub=pk > swap.log 2>&1 & ## Now, with that chain result in hand, place it manually back in with ## the full chroms chains and re-run the nets and so forth. ssh kkstore05 cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/axtChain # restore the chain directory which was removed by cleanUp.csh chainSplit chain gasAcu1.fr2.all.chain.gz # copy into here the chrUn.chain result cd chain cp -p \ /cluster/data/gasAcu1/bed/blastz.fr2.swap/axtChain/chain/chrUn.chain . time nice -n +19 chainMergeSort *.chain \ | gzip -c > ../gasAcu1.fr2.all.chain.gz cd .. rm -fr chain # they have new IDs chainSplit chain gasAcu1.fr2.all.chain.gz # And then reusing the netChains.csh script, fixing up references # to the appropriate chrom.sizes and 2bit file for gasAcu1 cp netChains.csh netChainsUn.csh time nice -n +19 ./netChainsUn.csh # real 30m9.385s ## reloading, reuse the load.csh script, fixup reference to correct ## chrom.sizes ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/axtChain time nice -n +19 ./reLoad.csh > reLoad.out 2>&1 # real 3m53.174s nice -n +19 featureBits gasAcu1 chainFr2Link \ > fb.gasAcu1.chainFr2Link.txt 2>&1 # 167817405 bases of 446627861 (37.574%) in intersection # without chrUn covered before, it was: # 153570946 bases of 446627861 (34.385%) in intersection # and fr2 is covered: nice -n +19 featureBits fr2 chainGasAcu1Link \ > fb.fr2.chainGasAcu1Link.txt 2>&1 # 158383996 bases of 393312790 (40.269%) in intersection ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-27) ssh kkstore05 bash # if not using bash shell already mkdir /cluster/data/gasAcu1/blastDb cd /cluster/data/gasAcu1 awk '/chrUn/ {if ($5 == "W") print $6}' downloads/UCSC.gasAcu1.agp > chrUnContigs.lst zcat downloads/assembly.bases.gz | faSomeRecords stdin chrUnContigs.lst chrUnContigs.fa cat noUn/chr*fa > temp.fa faSplit gap temp.fa 1000000 blastDb/ faSplit sequence chrUnContigs.fa 100 blastDb/y rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/gasAcu1/blastDb cd /cluster/data/gasAcu1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/gasAcu1/blastDb done mkdir -p /cluster/data/gasAcu1/bed/tblastn.hg18KG cd /cluster/data/gasAcu1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/gasAcu1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 609 query.lst # we want around 200000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(200000/609) = 111.833715 mkdir -p /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa split -l 112 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/gasAcu1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/gasAcu1/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit # back to bash ssh pk cd /cluster/data/gasAcu1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 199752 of 199752 jobs # CPU time in finished jobs: 4730447s 78840.78m 1314.01h 54.75d 0.150 y # IO & Wait Time: 929809s 15496.82m 258.28h 10.76d 0.029 y # Average job time: 28s 0.47m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 519s 8.65m 0.14h 0.01d # Submission to last job: 65891s 1098.18m 18.30h 0.76d ssh kkstore05 cd /cluster/data/gasAcu1/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/gasAcu1/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 328 of 328 jobs # CPU time in finished jobs: 25906s 431.77m 7.20h 0.30d 0.001 y # IO & Wait Time: 77171s 1286.18m 21.44h 0.89d 0.002 y # Average job time: 314s 5.24m 0.09h 0.00d # Longest finished job: 942s 15.70m 0.26h 0.01d # Submission to last job: 3630s 60.50m 1.01h 0.04d ssh kkstore05 cd /cluster/data/gasAcu1/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/gasAcu1/bed/tblastn.hg18KG/unliftBlastHg18KG.psl cd .. pslCheck unliftBlastHg18KG.psl liftUp blastHg18KG.psl ../../Un/chrUn.lft carry unliftBlastHg18KG.psl # load table ssh hgwdev cd /cluster/data/gasAcu1/bed/tblastn.hg18KG hgLoadPsl gasAcu1 blastHg18KG.psl # check coverage featureBits gasAcu1 blastHg18KG # 20127551 bases of 446627861 (4.507%) in intersection featureBits gasAcu1 ensGene:cds blastHg18KG -enrichment # ensGene:cds 6.618%, blastHg18KG 4.507%, both 3.790%, cover 57.26%, enrich 12.71x ssh kkstore05 rm -rf /cluster/data/gasAcu1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut #end tblastn ####################################################################### ## BLASTZ LIZARD ANOCAR1 - (DONE - 2007-02-18 - Hiram) ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18 cd /cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18 cat << '_EOF_' > DEF # stickleback vs. lizard # Use same params as used for gasAcu1-danRer4 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Stickleback gasAcu1 - largest chunk big enough # for all chroms except chrUn SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot=/san/sanvol1/scratch/gasAcu1AnoCar1 > do.log 2>&1 & time doBlastzChainNet.pl -verbose=2 DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot=/san/sanvol1/scratch/gasAcu1AnoCar1 \ -continue=cleanup > cleanup.log 2>&1 & ## measuring time nice -n +19 featureBits gasAcu1 chainAnoCar1Link \ > fb.gasAcu1.chainAnoCar1Link.txt 2>&1 # 56386298 bases of 446627861 (12.625%) in intersection ## swap documented in anoCar1.txt time doBlastzChainNet.pl -verbose=2 \ /cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-19/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & time nice -n +19 featureBits anoCar1 chainGasAcu1Link \ > fb.anoCar1.chainGasAcu1Link.txt 2>&1 # real 1m14.245s # 54464074 bases of 1741478929 (3.127%) in intersection ####################################################################### # SELF CHAINS (DONE 3/14/07 angie) # Requested by Gill. ssh kkstore05 mkdir /cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13 cd /cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13 cat << '_EOF_' > DEF # stickleback vs. self BLASTZ_H=2000 BLASTZ_M=200 # TARGET # Stickleback SEQ1_DIR=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.2bit SEQ1_LEN=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Stickleback SEQ2_DIR=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.2bit SEQ2_LEN=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13 TMPDIR=/scratch/tmp '_EOF_' # << emacs doBlastzChainNet.pl DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -chainMinScore=5000 -chainLinearGap=medium \ -blastzOutRoot=/cluster/bluearc/gasAcu1Self >& do.log & tail -f do.log # hgwdev can't see /iscratch, so the loadUp.csh script failed looking # for SEQ1_LEN. So I edited DEF to use /cluster/data location and # continued -load. ########################################################################## # NSCAN track - (2007-06-05 markd) # uses danRer4 as informant, no pseudogene masking # cd /cluster/data/gasAcu1/bed/nscan/ # obtainedf NSCAN predictions from michael brent's group # at WUSTL wget -nv http://mblab.wustl.edu/predictions/Gasterosteus_aculeatus/gasAcu1.gtf wget -nv http://mblab.wustl.edu/predictions/Gasterosteus_aculeatus/gasAcu1.prot.fa gzip -9 gasAcu1.* chmod a-w *.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt gasAcu1 nscanGene gasAcu1.gtf.gz hgPepPred gasAcu1 generic nscanPep gasAcu1.prot.fa.gz rm -f *.tab # update trackDb; need a gasAcu1-specific page to describe informants stickleback/gasAcu1/nscanGene.html stickleback/gasAcu1/trackDb.ra ######################################################################### # BLASTZ SWAP from Mouse Mm9 (DONE - 2007-09-07 - Hiram) # the original cd /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06 cat fb.mm9.chainGasAcu1Link.txt # 48448585 bases of 2620346127 (1.849%) in intersection # and the swap mkdir /cluster/data/gasAcu1/bed/blastz.mm9.swap cd /cluster/data/gasAcu1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \ /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06/DEF \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 & cat fb.gasAcu1.chainMm9Link.txt # 43730193 bases of 446627861 (9.791%) in intersection ######################################################################### ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram) ssh kkstore02 # not too important since everything moved to hive screen # use screen to control this job cd /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26 cat fb.oryLat2.chainGasAcu1Link.txt # 211589769 bases of 700386597 (30.210%) in intersection # And, for the swap: mkdir /cluster/data/gasAcu1/bed/blastz.oryLat2.swap cd /cluster/data/gasAcu1/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium -swap \ /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 31m44.711s cat fb.gasAcu1.chainOryLat2Link.txt # 189793612 bases of 446627861 (42.495%) in intersection ############################################################################ # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="Broad Institute gasAcu1 (1.0) (NCBI project 13579, AANH01000000)" where name="gasAcu1";' hgcentraltest ########################################################################### ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ # BLASTZ/CHAIN/NET TetNig2 (DONE - 2009-08-10,09-15 - Hiram) # create contigs only sequence to align properly to gasAcu1 contigs mkdir /hive/data/genomes/gasAcu1/nonBridged cd /hive/data/genomes/gasAcu1/nonBridged gapToLift -verbose=2 gasAcu1 gasAcu1.contigs.lift \ -bedFile=gasAcu1.contigs.bed # chrom count: 23 # WARNING: gap at end of chromosome not telomere at # chrUn:62549211-62550211, type: clone # found 16945 gaps # bed output requested to gasAcu1.contigs.bed # no gaps on chrom: chrM, size: 15742 ~/kent/src/hg/utils/lft2BitToFa.pl ../gasAcu1.2bit gasAcu1.contigs.lift \ | gzip -c > gasAcu1.contigs.fa.gz # make sure nothing was destroyed: faCount *.fa.gz > faCount.contigs.txt 2>&1 twoBitToFa ../gasAcu1.2bit stdout | faCount stdin > faCount.2bit.txt 2>&1 tail -1 faCount.contigs.txt # total 461441448 123670916 99610982 99564587 # 123781376 14813587 14615136 tail -1 faCount.2bit.txt # total 463354448 123670916 99610982 99564587 # 123781376 16726587 14615136 # only the total size and N count are different faToTwoBit gasAcu1.contigs.fa.gz gasAcu1.contigs.2bit twoBitInfo gasAcu1.contigs.2bit stdout | sort -k2nr > gasAcu1.contigs.sizes cp -p gasAcu1.contigs.2bit gasAcu1.contigs.sizes gasAcu1.contigs.lift \ /hive/data/staging/data/gasAcu1 mkdir /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10 cd /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10 cat << '_EOF_' > DEF # Stickleback vs. Tetraodon # TARGET: Stickleback gasAcu1, chunk large enough to run largest piece SEQ1_DIR=/scratch/data/gasAcu1/gasAcu1.2bit SEQ1_LEN=/scratch/data/gasAcu1/chrom.sizes SEQ1_CTGDIR=/scratch/data/gasAcu1/gasAcu1.contigs.2bit SEQ1_CTGLEN=/scratch/data/gasAcu1/gasAcu1.contigs.sizes SEQ1_LIFT=/scratch/data/gasAcu1/gasAcu1.contigs.lift SEQ1_CHUNK=22000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: Tetraodon TetNig2 - single chunk big enough to run single largest item SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # about 72 minutes # forgot to indicate type of repeats, continuing the load: cd /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10/axtChain netClass -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=0 -noAr noClass.net gasAcu1 tetNig2 gasAcu1.tetNig2.net netFilter -minGap=10 gasAcu1.tetNig2.net \ | hgLoadNet -verbose=0 gasAcu1 netTetNig2 stdin cd .. featureBits gasAcu1 chainTetNig2Link >&fb.gasAcu1.chainTetNig2Link.txt cat fb.gasAcu1.chainTetNig2Link.txt # 134497679 bases of 446627861 (30.114%) in intersection # and finish the downloads time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -continue=download > downloads.log 2>&1 mkdir /hive/data/genomes/tetNig2/bed/blastz.gasAcu1.swap cd /hive/data/genomes/tetNig2/bed/blastz.gasAcu1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10/DEF \ -swap -qRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > swap.log 2>&1 & ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ######################################################################### # LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-21 - Galt) # original alignment to danRer6 cd /hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21 cat fb.danRer6.chainGasAcu1Link.txt # 147479454 bases of 1506896106 (9.787%) in intersection # running the swap - DONE - 2009-12-21 mkdir /hive/data/genomes/gasAcu1/bed/blastz.danRer6.swap cd /hive/data/genomes/gasAcu1/bed/blastz.danRer6.swap time nice +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap >& swap.log & cat fb.gasAcu1.chainDanRer6Link.txt # 114451338 bases of 446627861 (25.626%) in intersection ####################################################################### # lastz swap danRer7 (DONE - 2010-12-17 - Hiram # original alignment to danRer7 cd /hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17 cat fb.danRer7.chainGasAcu1Link.txt # 144782335 bases of 1409770109 (10.270%) in intersection # running the swap mkdir /hive/data/genomes/gasAcu1/bed/blastz.danRer7.swap cd /hive/data/genomes/gasAcu1/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17/DEF \ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 25m35.024s cat fb.gasAcu1.chainDanRer7Link.txt # 122918766 bases of 446627861 (27.522%) in intersection ######################################################################### # lastz swap lizard anoCar2 (DONE - 2011-04-27 - Hiram) # the original alignment cd /cluster/data/anoCar2/bed/lastzGasAcu1.2011-04-26 cat fb.anoCar2.chainGasAcu1Link.txt # 53478872 bases of 1701353770 (3.143%) in intersection ## and running the swap mkdir /hive/data/genomes/gasAcu1/bed/blastz.anoCar2.swap cd /hive/data/genomes/gasAcu1/bed/blastz.anoCar2.swap time nice -n +25 doBlastzChainNet.pl -verbose=2 \ /cluster/data/anoCar2/bed/lastzGasAcu1.2011-04-26/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \ -bigClusterHub=swarm -tRepeats=windowmaskerSdust \ -swap -qRepeats=windowmaskerSdust > swap.log 2>&1 & # real 15m28.724s cat fb.gasAcu1.chainAnoCar2Link.txt # 55990919 bases of 446627861 (12.536%) in intersection ####################################################################### ## WINDOWMASKER (DONE - 2011-12-01 - Hiram) # this was actually run in 2006 without the extra step of eliminating # the gap masking. The table was not reloaded with this result since # the old one is on the RR. This business was done to run an experiment # with the fr3 lastz to see if it actually matters or not if the # query genome here is more masked or less. mkdir /hive/data/genomes/gasAcu1/bed/windowMasker cd /hive/data/genomes/gasAcu1/bed/windowMasker time doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev gasAcu1 > do.log 2>&1 & # real 21m10.950s # Masking statistics twoBitToFa gasAcu1.wmsk.sdust.2bit stdout | faSize stdin # 463354448 bases (16726587 N's 446627861 real 349071712 upper # 97556149 lower) in 23 sequences in 1 files # %21.05 masked total, %21.84 masked real hgLoadBed gasAcu1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 2285358 elements of size 3 featureBits -countGaps gasAcu1 windowmaskerSdust # 114282736 bases of 463354448 (24.664%) in intersection # eliminate the gaps from the masking featureBits gasAcu1 -not gap -bed=notGap.bed # 446627861 bases of 446627861 (100.000%) in intersection time nice -n +19 featureBits gasAcu1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 97556149 bases of 446627861 (21.843%) in intersection featureBits -countGaps gasAcu1 cleanWMask.bed.gz # 97556149 bases of 463354448 (21.054%) in intersection zcat cleanWMask.bed.gz \ | twoBitMask ../../gasAcu1.unmasked.2bit stdin \ -type=.bed gasAcu1.cleanWMSdust.2bit twoBitToFa gasAcu1.cleanWMSdust.2bit stdout | faSize stdin \ > gasAcu1.cleanWMSdust.faSize.txt cat gasAcu1.cleanWMSdust.faSize.txt # 463354448 bases (16726587 N's 446627861 real 349071712 upper # 97556149 lower) in 23 sequences in 1 files # %21.05 masked total, %21.84 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps gasAcu1 rmsk cleanWMask.bed.gz # 13193723 bases of 463354448 (2.847%) in intersection # add TRF mask: twoBitMask gasAcu1.cleanWMSdust.2bit \ -add ../simpleRepeat/trfMask.bed gasAcu1.TRF.WMSdust.2bit twoBitToFa gasAcu1.TRF.WMSdust.2bit stdout | faSize stdin \ > gasAcu1.TRF.WMSdust.faSize.txt # 463354448 bases (16726587 N's 446627861 real 348847599 upper # 97780262 lower) in 23 sequences in 1 files # %21.10 masked total, %21.89 masked real ##########################################################################