# for emacs: -*- mode: sh; -*-

# Drosophila pseudoobscura -- Baylor "CAF1" via Eisen's 12-fly site
# (Identical to FlyBase dpse_r2.1_FB2008_02)

# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT

#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/26/06 angie)
    ssh kkstore05
    mkdir /cluster/store12/dp4
    ln -s /cluster/store12/dp4 /cluster/data/dp4
    mkdir /cluster/data/dp4/downloads
    cd /cluster/data/dp4/downloads
    wget http://rana.lbl.gov/drosophila/caf1/dpse_caf1.tar.gz
    tar xvzf dpse_caf1.tar.gz
    cd dpse
    faSize scaffolds.fa
#152738921 bases (6681752 N's 146057169 real 146057169 upper 0 lower) in 4896 sequences in 1 files
#Total size: mean 31196.7 sd 649312.0 min 101 (Unknown_singleton_1691) max 30794189 (Ch2) median 1734
#N count: mean 1364.7 sd 21011.7
#U count: mean 29831.9 sd 628747.5
#L count: mean 0.0 sd 0.0
    # Tweak their funny chromosome names (Ch*) to our pattern (chr*):
    sed -e 's/^>Ch/>chr/' scaffolds.fa > UCSC.fa
    sed -e 's/^Ch/chr/' assembly.agp > UCSC.agp


#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/27/06 angie)
    ssh kkstore05
    cd /cluster/data/dp4
    cat > dp4.config.ra <& makeGenomeDb.log &
    # NOTE: the line above is garbled in this copy of the file.  The text
    # between "<" and ">& makeGenomeDb.log" -- i.e. the here-document body
    # written to dp4.config.ra and the command whose output went to
    # makeGenomeDb.log (presumably the makeGenomeDb script run up to the
    # db step, per this section's title) -- is missing and must be
    # recovered from another copy before this step can be repeated.
    tail -f makeGenomeDb.log
    # Because of the extremely long sequence names for unplaced sequences,
    # the chromInfo load command failed because the index string length was
    # too short.  So I temporarily modified my ~/kent/src/hg/lib/chromInfo.sql
    # to lengthen the index, dropped the dp4 database and ran -continue db.

#########################################################################
# REPEATMASKER (DONE 9/27/06 angie)
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    doRepeatMasker.pl dp4 -verbose 3 -debug
    # Run it for real and tail the log:
    doRepeatMasker.pl dp4 -species drosophila -verbose 3 \
      >& /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log &
    tail -f /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log
    # RepeatMasker and lib version from do.log:
# March 20 2006 (open-3-1-5) version of RepeatMasker
#CC RELEASE 20060315; *
    # Compare coverage to previous assembly:
    featureBits -chrom=chr2 dp4 rmsk
#1245499 bases of 29873196 (4.169%) in intersection
    featureBits -chrom=chr2 dp3 rmsk
#1140300 bases of 29702756 (3.839%) in intersection


#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/27/06 angie)
    ssh kolossus
    nice tcsh
    mkdir /cluster/data/dp4/bed/simpleRepeat
    cd /cluster/data/dp4/bed/simpleRepeat
    twoBitToFa ../../dp4.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log &
    tail -f trf.log
    # ~50 minutes (longer than D. mel, must be because of the scaffolds)

    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed dp4 simpleRepeat \
      /cluster/data/dp4/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Compare coverage to previous assembly:
    featureBits -chrom=chr2 dp4 simpleRepeat
#530609 bases of 29873196 (1.776%) in intersection
    featureBits -chrom=chr2 dp3 simpleRepeat
#517077 bases of 29702756 (1.741%) in intersection


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/27/06 angie)
    ssh kolossus
    cd /cluster/data/dp4
    time twoBitMask dp4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dp4.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.201u 0.367s 0:01.78 31.4% 0+0k 0+0io 1pf+0w

    # Because this is a no-browser build (just masking for alignment)
    # I did not make the usual /gbdb/$db/$db.2bit link.


###########################################################################
# BLASTZ/CHAIN/NET DROANA3 (DONE 10/3/06 angie)
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
    cd /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
    cat << '_EOF_' > DEF
# D. pseudoobscura vs. D. ananassae

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. pseudoobscura
SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dp4/chrom.sizes

# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna3/chrom.sizes

BASE=/cluster/data/dp4/bed/blastz.droAna3.2006-10-02
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dp4droAna3 >& do.log &
    tail -f do.log
    ln -s blastz.droAna3.2006-10-02 /cluster/data/dp4/bed/blastz.droAna3


###########################################################################
# BLASTZ/CHAIN/NET DROWIL1 (DONE 10/3/06 angie)
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
    cd /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
    cat << '_EOF_' > DEF
# D. pseudoobscura vs. D. willistoni

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. pseudoobscura
SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dp4/chrom.sizes

# QUERY - D. willistoni
SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droWil1/chrom.sizes

BASE=/cluster/data/dp4/bed/blastz.droWil1.2006-10-02
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dp4droWil1 >& do.log &
    tail -f do.log
    ln -s blastz.droWil1.2006-10-02 /cluster/data/dp4/bed/blastz.droWil1


#########################################################################
# SWAP DM3 CHAIN/NET (DONE 6/30/08 angie)
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.dm3.swap
    cd /cluster/data/dp4/bed/blastz.dm3.swap
    doBlastzChainNet.pl -swap \
      /cluster/data/dm3/bed/blastz.dp4.2006-12-04/DEF >& do.log &
    tail -f do.log
    ln -s blastz.dm3.swap /cluster/data/dp4/bed/blastz.dm3


#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 9/18/08 angie)
    # Use -repMatch=85 -- increased from dp3's 75 because this assembly
    # loses 9272 tiles w/75, as opposed to dp3's 6790 -- even if dp4 has
    # more repetitive reads than dp3, that seems like too big of an increase
    # (would lose sensitivity).
    ssh kolossus
    blat /hive/data/genomes/dp4/dp4.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/hive/data/genomes/dp4/11.ooc -repMatch=85
#Wrote 7218 overused 11-mers to /hive/data/genomes/dp4/11.ooc


#########################################################################
# LIFTOVER TO DP3 (DONE 9/18/08)
    doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
      -ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3 -debug
# *** Steps were performed in /cluster/data/dp4/bed/blat.dp3.2008-09-18
    cd /cluster/data/dp4/bed/blat.dp3.2008-09-18
    #NOTE FOR NEXT TIME: save log file! (oops)
    doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
      -ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3

#########################################################################