# for emacs: -*- mode: sh; -*-

# Drosophila erecta -- Agencourt "CAF1" via Eisen's 12-fly site

# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT

# Layout of these notes: lines without a leading '#' are commands, and
# '#' lines directly after a command record the output it produced.
# A bare "ssh <host>" opens a login on that host; presumably the
# commands that follow it were typed in that session.

#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/20/06 angie)
ssh kkstore05
mkdir /cluster/store12/droEre2
ln -s /cluster/store12/droEre2 /cluster/data/droEre2
mkdir /cluster/data/droEre2/downloads
cd /cluster/data/droEre2/downloads
wget http://rana.lbl.gov/drosophila/caf1/dere_caf1.tar.gz
tar xvzf dere_caf1.tar.gz
cd dere
faSize scaffolds.bases
#152712140 bases (7628121 N's 145084019 real 145084019 upper 0 lower) in 5124 sequences in 1 files
#Total size: mean 29803.3 sd 721223.0 min 154 (scaffold_5056) max 26641161 (scaffold_4929) median 1729
#N count: mean 1488.7 sd 9323.5
#U count: mean 28314.6 sd 717773.6
#L count: mean 0.0 sd 0.0

#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/20/06 angie)
ssh kkstore05
cd /cluster/data/droEre2
# NOTE(review): the command recorded here is damaged in this copy of the
# notes.  It appears to have been a here-document that wrote
# droEre2.config.ra, followed by a makeGenomeDb.pl run (stopping at the
# db step, per the section title) whose output went to makeGenomeDb.log
# -- TODO confirm and restore from the original notes.  The here-document
# body and the makeGenomeDb.pl command line cannot be recovered from this
# file, so the surviving fragment is kept as a comment rather than as a
# live command (as written it is neither a valid here-document nor a
# usable redirection):
#   cat > droEre2.config.ra <& makeGenomeDb.log &
tail -f makeGenomeDb.log

#########################################################################
# REPEATMASKER (DONE 9/20/06 angie)
ssh kkstore05
# Run -debug to create the dir structure and preview the scripts:
doRepeatMasker.pl droEre2 -verbose 3 -debug
# Run it for real and tail the log:
doRepeatMasker.pl droEre2 -species drosophila -verbose 3 \
  >& /cluster/data/droEre2/bed/RepeatMasker.2006-09-20/do.log &
tail -f /cluster/data/droEre2/bed/RepeatMasker.2006-09-20/do.log
# RepeatMasker and lib version from do.log:
#    March 20 2006 (open-3-1-5) version of RepeatMasker
#CC RELEASE 20060315; *
# Compare coverage to previous assembly:
featureBits droEre2 rmsk
#18539872 bases of 145084019 (12.779%) in intersection
featureBits droEre1 rmsk
#18339464 bases of 145196048 (12.631%) in intersection

#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/20/06 angie)
ssh kolossus
nice tcsh
mkdir /cluster/data/droEre2/bed/simpleRepeat
cd /cluster/data/droEre2/bed/simpleRepeat
# Stream the unmasked assembly as FASTA into trfBig; stdout and stderr of
# the background pipeline are collected in trf.log.
twoBitToFa ../../droEre2.unmasked.2bit stdout \
  | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
  >& trf.log &
tail -f trf.log
# ~52 minutes (longer than D. mel, must be because of the scaffolds)

# Make a filtered version for sequence masking:
# (keeps only the rows of simpleRepeat.bed whose 5th column is <= 12)
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

# Load unfiltered repeats into the database:
ssh hgwdev
hgLoadBed droEre2 simpleRepeat \
  /cluster/data/droEre2/bed/simpleRepeat/simpleRepeat.bed \
  -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Compare coverage to previous assembly:
featureBits droEre2 simpleRepeat
#9028488 bases of 145084019 (6.223%) in intersection
featureBits droEre1 simpleRepeat
#9071572 bases of 145196048 (6.248%) in intersection

#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/20/06 angie)
ssh kolossus
cd /cluster/data/droEre2
# Input is the RepeatMasker-masked droEre2.rmsk.2bit; the filtered TRF
# intervals are added and the result is written to droEre2.2bit.
time twoBitMask droEre2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed droEre2.2bit
# This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.161u 0.368s 0:01.61 32.2% 0+0k 0+0io 1pf+0w

# Because this is a no-browser build (just masking for alignment)
# I did not make the usual /gbdb/$db/$db.2bit link.
#########################################################################
# SWAP DM3 CHAIN/NET (DONE 4/3/09 angie)
mkdir /hive/data/genomes/droEre2/bed/blastz.dm3.swap
cd /hive/data/genomes/droEre2/bed/blastz.dm3.swap
# The DEF file passed here is the one in the dm3 tree (bed/blastz.droEre2);
# all output of the background run is collected in do.log.
doBlastzChainNet.pl -swap -bigClusterHub swarm -smallClusterHub memk \
  -workhorse kolossus \
  /hive/data/genomes/dm3/bed/blastz.droEre2/DEF >& do.log &
tail -f do.log
# Relative symlink: bed/blastz.dm3 -> blastz.dm3.swap
ln -s blastz.dm3.swap /hive/data/genomes/droEre2/bed/blastz.dm3

#########################################################################