# for emacs: -*- mode: sh; -*- # Drosophila ananassae -- Agencourt "CAF1" via Eisen's 12-fly site # THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT ######################################################################### # DOWNLOAD SEQUENCE (DONE 9/20/06 angie) ssh kkstore05 mkdir /cluster/store12/droAna3 ln -s /cluster/store12/droAna3 /cluster/data/droAna3 mkdir /cluster/data/droAna3/downloads cd /cluster/data/droAna3/downloads wget http://rana.lbl.gov/drosophila/caf1/dana_caf1.tar.gz tar xvzf dana_caf1.tar.gz cd dana faSize scaffolds.bases #230993012 bases (17074195 N's 213918817 real 213918817 upper 0 lower) in 13749 sequences in 1 files #Total size: mean 16800.7 sd 387178.2 min 55 (scaffold_13714) max 23697760 (scaffold_13340) median 1517 #N count: mean 1241.8 sd 9836.0 #U count: mean 15558.9 sd 382128.9 #L count: mean 0.0 sd 0.0 ######################################################################### # MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/20/06 angie) ssh kkstore05 cd /cluster/data/droAna3 cat > droAna3.config.ra <& makeGenomeDb.log & tail -f makeGenomeDb.log ######################################################################### # REPEATMASKER (DONE 9/20/06 angie) ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: doRepeatMasker.pl droAna3 -verbose 3 -debug # Run it for real and tail the log: doRepeatMasker.pl droAna3 -species drosophila -verbose 3 \ >& /cluster/data/droAna3/bed/RepeatMasker.2006-09-20/do.log & tail -f /cluster/data/droAna3/bed/RepeatMasker.2006-09-20/do.log # RepeatMasker and lib version from do.log: # March 20 2006 (open-3-1-5) version of RepeatMasker #CC RELEASE 20060315; * # Compare coverage to previous assembly: featureBits -chrom=scaffold_13340 droAna3 rmsk #688661 bases of 23471240 (2.934%) in intersection featureBits -chrom=scaffold_13340 droAna2 rmsk #682697 bases of 23466339 (2.909%) in intersection nice featureBits droAna3 rmsk #35293237 bases of 213918817 (16.498%) in intersection ######################################################################### # SIMPLE REPEATS (TRF) (DONE 9/20/06 angie) ssh kolossus nice tcsh mkdir /cluster/data/droAna3/bed/simpleRepeat cd /cluster/data/droAna3/bed/simpleRepeat twoBitToFa ../../droAna3.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp \ >& trf.log & tail -f trf.log # ~110 minutes (longer than D. mel, must be because of the scaffolds) # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed droAna3 simpleRepeat \ /cluster/data/droAna3/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Compare coverage to previous assembly: featureBits -chrom=scaffold_13340 droAna3 simpleRepeat #520511 bases of 23471240 (2.218%) in intersection featureBits -chrom=scaffold_13340 droAna2 simpleRepeat #515863 bases of 23466339 (2.198%) in intersection ######################################################################### # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/20/06 angie) ssh kolossus cd /cluster/data/droAna3 time twoBitMask droAna3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed droAna3.2bit # This warning is OK -- the extra fields are not block coordinates. #Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks). #0.266u 0.341s 0:02.01 29.8% 0+0k 0+0io 0pf+0w # Because this is a no-browser build (just masking for alignment) # I did not make the usual /gbdb/$db/$db.2bit link. ########################################################################### # WINDOWMASKER EXPERIMENT (DONE 10/17/06 angie) # The blastz run vs. droWil1 was just destroyed by mega-output, # even with -chainFilterMinScore=10000 and M=50 (trying M=20 but not # too hopeful)... so let's try a de-novo masker before alignment: ssh kolossus mkdir /cluster/data/droAna3/bed/windowmasker.2006-10-17 cd /cluster/data/droAna3/bed/windowmasker.2006-10-17 # Instructions for running windowmasker: wget ftp://ftp.ncbi.nlm.nih.gov/pub/agarwala/windowmasker/README.windowmasker twoBitToFa /cluster/data/droAna3/droAna3.2bit tmp.fa # First, collect counts: /cluster/bin/x86_64/windowmasker -mk_counts true -input tmp.fa \ -output wm.counts # Then use those counts to mask sequence: # Note: this can't run on a small cluster host because it dies if it # can't open a socket to www.ncbi.nlm.nih.gov:80 ! /cluster/bin/x86_64/windowmasker -ustat wm.counts -input tmp.fa \ -output wm.intervals perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \ if (/^(\d+) - (\d+)/) { \ $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \ }' wm.intervals > windowmasker.bed # Quick coverage: awk '{print $3 - $2;}' windowmasker.bed | total #77984750 awk '{print $2;}' ../../chrom.sizes | total #230993012 calc 77984750 / 230993012 #77984750 / 230993012 = 0.337607 # and that's including gaps -- Much better than the 16.498% coverage # from RepeatMasker! # Make a masked .2bit: twoBitMask ../../droAna3.2bit windowmasker.bed ../../droAna3.WM.2bit # Now try with -sdust to additionally mask low-complexity sequence: /cluster/bin/x86_64/windowmasker -ustat wm.counts -sdust true \ -input tmp.fa -output wm.sdust.intervals perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \ if (/^(\d+) - (\d+)/) { \ $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \ }' wm.sdust.intervals > windowmasker.sdust.bed awk '{print $3 - $2;}' windowmasker.sdust.bed | total #97909778 calc 97909778 / 230993012 #97909778 / 230993012 = 0.423865 # Make a masked .2bit (even if we don't end up needing it): twoBitMask ../../droAna3.2bit windowmasker.sdust.bed \ ../../droAna3.WMSDust.2bit ssh hgwdev cd /cluster/data/droAna3/bed/windowmasker.2006-10-17 hgLoadBed droAna3 windowmasker windowmasker.bed hgLoadBed droAna3 windowmaskerSdust windowmasker.sdust.bed nice featureBits droAna3 rmsk windowmasker #20661102 bases of 213918817 (9.658%) in intersection nice featureBits droAna3 rmsk \!windowmasker #14632135 bases of 213918817 (6.840%) in intersection nice featureBits droAna3 \!rmsk windowmasker #57323648 bases of 213918817 (26.797%) in intersection # Sanity checks on windowmasker+Sdust being a superset of windowmasker: nice featureBits droAna3 windowmasker windowmaskerSdust #77984750 bases of 213918817 (36.455%) in intersection nice featureBits droAna3 windowmasker \!windowmaskerSdust #0 bases of 213918817 (0.000%) in intersection nice featureBits droAna3 \!windowmasker windowmaskerSdust #19925028 bases of 213918817 (9.314%) in intersection rm tmp.fa bed.tab ########################################################################### # BLASTZ/CHAIN/NET DROWIL1 (DONE 10/20/06 angie) # Using WindowMasker'd sequence since prior runs died due to massive # output (netChainSubset out of memory), even with chainMinScore=10000 # or chainMinScore=8000 + M=20. ssh kkstore05 mkdir /cluster/data/droAna3/bed/blastz.droWil1.2006-10-19 cd /cluster/data/droAna3/bed/blastz.droWil1.2006-10-19 cat << '_EOF_' > DEF # D. ananassae vs. D. willistoni BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. ananassae SEQ1_DIR=/san/sanvol1/scratch/droAna3/droAna3.WM.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/san/sanvol1/scratch/droAna3/chrom.sizes SEQ1_LIMIT=50 # QUERY - D. willistoni SEQ2_DIR=/san/sanvol1/scratch/droWil1/droWil1.WM.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/san/sanvol1/scratch/droWil1/chrom.sizes SEQ2_LIMIT=200 BASE=/cluster/data/droAna3/bed/blastz.droWil1.2006-10-19 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/droAna3droWil1 >& do.log & tail -f do.log ln -s blastz.droWil1.2006-10-19 /cluster/data/droAna3/bed/blastz.droWil1 ######################################################################### # SWAP DM3 CHAIN/NET (DONE 4/2/09 angie) mkdir /hive/data/genomes/droAna3/bed/blastz.dm3.swap cd /hive/data/genomes/droAna3/bed/blastz.dm3.swap doBlastzChainNet.pl -swap -bigClusterHub swarm -smallClusterHub memk \ -workhorse kolossus \ /hive/data/genomes/dm3/bed/blastz.droAna3/DEF >& do.log & tail -f do.log ln -s blastz.dm3.swap /hive/data/genomes/droAna3/bed/blastz.dm3 #########################################################################