# for emacs: -*- mode: sh; -*- # Drosophila virilis -- Agencourt "CAF1" via Eisen's 12-fly site # THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT ######################################################################### # DOWNLOAD SEQUENCE (DONE 9/21/06 angie) ssh kkstore05 mkdir /cluster/store12/droVir3 ln -s /cluster/store12/droVir3 /cluster/data/droVir3 mkdir /cluster/data/droVir3/downloads cd /cluster/data/droVir3/downloads wget http://rana.lbl.gov/drosophila/caf1/dvir_caf1.tar.gz tar xvzf dvir_caf1.tar.gz cd dvir faSize scaffolds.bases #206026697 bases (16820834 N's 189205863 real 189205863 upper 0 lower) in 13530 sequences in 1 files #Total size: mean 15227.4 sd 412436.0 min 43 (scaffold_474) max 25233164 (scaffold_13049) median 1215 #N count: mean 1243.2 sd 14416.5 #U count: mean 13984.2 sd 405263.9 #L count: mean 0.0 sd 0.0 ######################################################################### # MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/21/06 angie) ssh kkstore05 cd /cluster/data/droVir3 cat > droVir3.config.ra <& makeGenomeDb.log & tail -f makeGenomeDb.log ######################################################################### # REPEATMASKER (DONE 9/21/06 angie) ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: doRepeatMasker.pl droVir3 -verbose 3 -debug # Run it for real and tail the log: doRepeatMasker.pl droVir3 -species drosophila -verbose 3 \ >& /cluster/data/droVir3/bed/RepeatMasker.2006-09-21/do.log & tail -f /cluster/data/droVir3/bed/RepeatMasker.2006-09-21/do.log # RepeatMasker and lib version from do.log: # March 20 2006 (open-3-1-5) version of RepeatMasker #CC RELEASE 20060315; * # Compare coverage to previous assembly: featureBits -chrom=scaffold_13049 droVir3 rmsk #2148620 bases of 24930185 (8.619%) in intersection featureBits -chrom=scaffold_13049 droVir2 rmsk #2157147 bases of 24956889 (8.643%) in intersection ######################################################################### # SIMPLE REPEATS (TRF) (DONE 9/21/06 angie) ssh kolossus nice tcsh mkdir /cluster/data/droVir3/bed/simpleRepeat cd /cluster/data/droVir3/bed/simpleRepeat twoBitToFa ../../droVir3.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp \ >& trf.log & tail -f trf.log # ~100 minutes (longer than D. mel, must be because of the scaffolds) # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed droVir3 simpleRepeat \ /cluster/data/droVir3/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Compare coverage to previous assembly: featureBits -chrom=scaffold_13049 droVir3 simpleRepeat #935315 bases of 24930185 (3.752%) in intersection featureBits -chrom=scaffold_13049 droVir2 simpleRepeat #933182 bases of 24956889 (3.739%) in intersection ######################################################################### # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/21/06 angie) ssh kolossus cd /cluster/data/droVir3 time twoBitMask droVir3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed droVir3.2bit # This warning is OK -- the extra fields are not block coordinates. #Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks). #0.301u 0.301s 0:01.91 31.4% 0+0k 0+0io 0pf+0w # Because this is a no-browser build (just masking for alignment) # I did not make the usual /gbdb/$db/$db.2bit link. ########################################################################### # WINDOWMASKER EXPERIMENT (DONE 10/23/06 angie) # The droVir3-droGri2 blastz run was just destroyed by mega-output, # even with -chainFilterMinScore=10000 and M=50 (trying M=20 but not # too hopeful)... so let's try a de-novo masker before alignment: ssh kolossus mkdir /cluster/data/droVir3/bed/windowmasker.2006-10-23 cd /cluster/data/droVir3/bed/windowmasker.2006-10-23 twoBitToFa /cluster/data/droVir3/droVir3.2bit tmp.fa # First, collect counts: time /cluster/bin/x86_64/windowmasker -mk_counts true -input tmp.fa \ -output wm.counts #116.091u 3.329s 2:01.89 97.9% 0+0k 0+0io 115pf+0w # Then use those counts to mask sequence: time /cluster/bin/x86_64/windowmasker -ustat wm.counts -input tmp.fa \ -output wm.intervals #194.555u 1.303s 3:17.49 99.1% 0+0k 0+0io 0pf+0w perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \ if (/^(\d+) - (\d+)/) { \ $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \ }' wm.intervals > windowmasker.bed # Quick coverage: awk '{print $3 - $2;}' windowmasker.bed | total #66922828 awk '{print $2;}' ../../chrom.sizes | total #206026697 calc 66922828 / 206026697 #66922828 / 206026697 = 0.324826 # Make a masked .2bit: twoBitMask ../../droVir3.2bit windowmasker.bed ../../droVir3.WM.2bit # Now try with -sdust to additionally mask low-complexity sequence: time /cluster/bin/x86_64/windowmasker -ustat wm.counts -sdust true \ -input tmp.fa -output wm.sdust.intervals #545.704u 1.433s 9:09.64 99.5% 0+0k 0+0io 0pf+0w perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \ if (/^(\d+) - (\d+)/) { \ $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \ }' wm.sdust.intervals > windowmasker.sdust.bed awk '{print $3 - $2;}' windowmasker.sdust.bed | total #86036805 calc 86036805 / 206026697 #86036805 / 206026697 = 0.417600 # Make a masked .2bit (even if we don't end up needing it): twoBitMask ../../droVir3.2bit windowmasker.sdust.bed \ ../../droVir3.WMSDust.2bit rm tmp.fa ########################################################################### # BLASTZ/CHAIN/NET DROGRI2 (DONE 10/23/06 angie) # Using WindowMasker'd sequence since prior runs died due to massive # output (netChainSubset out of memory), even with chainMinScore=10000 # or chainMinScore=8000 + M=20. ssh kkstore05 mkdir /cluster/data/droVir3/bed/blastz.droGri2.2006-10-23 cd /cluster/data/droVir3/bed/blastz.droGri2.2006-10-23 cat << '_EOF_' > DEF # D. virilis vs. D. grimshawi BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. virilis SEQ1_DIR=/san/sanvol1/scratch/droVir3/droVir3.WM.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/san/sanvol1/scratch/droVir3/chrom.sizes SEQ1_LIMIT=50 # QUERY - D. grimshawi SEQ2_DIR=/san/sanvol1/scratch/droGri2/droGri2.WM.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/san/sanvol1/scratch/droGri2/chrom.sizes SEQ2_LIMIT=200 BASE=/cluster/data/droVir3/bed/blastz.droGri2.2006-10-23 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/droVir3droGri2 >& do.log & tail -f do.log ln -s blastz.droGri2.2006-10-23 /cluster/data/droVir3/bed/blastz.droGri2 ######################################################################### # SWAP DM3 CHAIN/NET (DONE 4/6/09 angie) mkdir /hive/data/genomes/droVir3/bed/blastz.dm3.swap cd /hive/data/genomes/droVir3/bed/blastz.dm3.swap doBlastzChainNet.pl -swap -bigClusterHub swarm -smallClusterHub memk \ -workhorse kolossus \ /hive/data/genomes/dm3/bed/blastz.droVir3/DEF.hive >& do.log & tail -f do.log ln -s blastz.dm3.swap /hive/data/genomes/droVir3/bed/blastz.dm3 ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2011-01-17 -Chin) # Use -repMatch=100 (based on size -- for human we use 1024, and # worm size is ~6.5% of human judging by gapless ce4 vs. hg18 genome # size from featureBits. So we would use 64, but that yields a very # high number of tiles to ignore, especially for a small more compact # genome. Bump that up a bit to be more conservative. cd /hive/data/genomes/droVir3 blat droVir3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/droVir3.11.ooc -repMatch=100 # Wrote 14348 overused 11-mers to jkStuff/droVir3.11.ooc # Wrote 8514 overused 11-mers to jkStuff/11.ooc mkdir /hive/data/staging/data/droVir3 cd /hive/data/genomes/droVir3 cp -p droVir3.2bit jkStuff/droVir3.11.ooc chrom.sizes /hive/data/staging/data/droVir3 # request rsync to cluster nodes ############################################################################ # LIFTOVER TO droVir2 (DONE - 2011-01-17 - Chin ) mkdir /hive/data/genomes/droVir3/bed/blat.droVir2.2011-01-15 cd /hive/data/genomes/droVir3/bed/blat.droVir2.2011-01-15 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/droVir3/jkStuff/droVir3.11.ooc -debug droVir3 droVir2 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/droVir3/jkStuff/droVir3.11.ooc droVir3 droVir2 > do.log 2>&1 # real 29m40.917s cd /usr/local/apache/htdocs-hgdownload/goldenPath/droVir3/liftOver md5sum *.gz > md5sum.txt #########################################################################