# for emacs: -*- mode: sh; -*-

# Drosophila grimshawi -- Agencourt "CAF1" via Eisen's 12-fly site

# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT

# Layout of this record: section banners and recorded program output start
# in column 0; the commands that were run (and remarks about them) are
# indented.

#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/26/06 angie)
    ssh kkstore05
    # Create the assembly directory on store12 and expose it under
    # /cluster/data via a symlink.
    mkdir /cluster/store12/droGri2
    ln -s /cluster/store12/droGri2 /cluster/data/droGri2
    mkdir /cluster/data/droGri2/downloads
    cd /cluster/data/droGri2/downloads
    wget http://rana.lbl.gov/drosophila/caf1/dgri_caf1.tar.gz
    tar xvzf dgri_caf1.tar.gz
    cd dgri
    # Size summary of the downloaded scaffolds (output recorded below):
    faSize scaffolds.bases
#200467819 bases (14377150 N's 186090669 real 186090669 upper 0 lower) in 17440 sequences in 1 files
#Total size: mean 11494.7 sd 331870.4 min 80 (scaffold_16070) max 24565398 (scaffold_15110) median 1702
#N count: mean 824.4 sd 8307.0
#U count: mean 10670.3 sd 326457.5
#L count: mean 0.0 sd 0.0


#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/26/06 angie)
    ssh kkstore05
    cd /cluster/data/droGri2
    # NOTE(review): the next recorded line looks truncated -- presumably the
    # here-document body for droGri2.config.ra and the command that produced
    # makeGenomeDb.log were lost.  It is kept verbatim but commented out,
    # because as written it is not a usable command; restore it from the
    # original record before reuse.
    #   cat > droGri2.config.ra <& makeGenomeDb.log &
    tail -f makeGenomeDb.log


#########################################################################
# REPEATMASKER (DONE 9/26/06 angie)
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    doRepeatMasker.pl droGri2 -verbose 3 -debug
    # Run it for real and tail the log:
    doRepeatMasker.pl droGri2 -species drosophila -verbose 3 \
      >& /cluster/data/droGri2/bed/RepeatMasker.2006-09-26/do.log &
    tail -f /cluster/data/droGri2/bed/RepeatMasker.2006-09-26/do.log
    # RepeatMasker and lib version from do.log:
    # March 20 2006 (open-3-1-5) version of RepeatMasker
    #CC RELEASE 20060315; *
    # Compare coverage to previous assembly (largest scaf in each, not apples-apples):
    featureBits -chrom=scaffold_15110 droGri2 rmsk
#1755374 bases of 24078664 (7.290%) in intersection
    featureBits -chrom=scaffold_25013 droGri1 rmsk
#1231873 bases of 14091279 (8.742%) in intersection

#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/26/06 angie)
    ssh kolossus
    nice tcsh
    mkdir /cluster/data/droGri2/bed/simpleRepeat
    cd /cluster/data/droGri2/bed/simpleRepeat
    # Stream the unmasked assembly as FASTA into trfBig; BED output goes to
    # simpleRepeat.bed, and stdout/stderr are captured in trf.log.
    twoBitToFa ../../droGri2.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log &
    tail -f trf.log
    # ~120 minutes (longer than D. mel, must be because of the scaffolds)

    # Make a filtered version for sequence masking:
    # (keeps only rows whose 5th column is <= 12 -- presumably the repeat
    # period; confirm against simpleRepeat.sql)
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed droGri2 simpleRepeat \
      /cluster/data/droGri2/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Compare coverage to previous assembly (largest scaf in each, not apples-apples):
    featureBits -chrom=scaffold_15110 droGri2 simpleRepeat
#559069 bases of 24078664 (2.322%) in intersection
    featureBits -chrom=scaffold_25013 droGri1 simpleRepeat
#251144 bases of 14091279 (1.782%) in intersection


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/26/06 angie)
    ssh kolossus
    cd /cluster/data/droGri2
    # Add the filtered TRF intervals on top of the RepeatMasker-masked
    # sequence, writing the result to droGri2.2bit.
    time twoBitMask droGri2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed droGri2.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.345u 0.405s 0:02.33 31.7% 0+0k 0+0io 1pf+0w

    # Because this is a no-browser build (just masking for alignment)
    # I did not make the usual /gbdb/$db/$db.2bit link.

###########################################################################
# WINDOWMASKER EXPERIMENT (DONE 10/23/06 angie)
    # The droVir3-droGri2 blastz run was just destroyed by mega-output,
    # even with -chainFilterMinScore=10000 and M=50 (trying M=20 but not
    # too hopeful)... so let's try a de-novo masker before alignment:
    ssh kolossus
    mkdir /cluster/data/droGri2/bed/windowmasker.2006-10-23
    cd /cluster/data/droGri2/bed/windowmasker.2006-10-23
    # Temporary FASTA copy of the masked assembly; removed at the end of
    # this section.
    twoBitToFa /cluster/data/droGri2/droGri2.2bit tmp.fa
    # First, collect counts:
    time /cluster/bin/x86_64/windowmasker -mk_counts true -input tmp.fa \
      -output wm.counts
#109.210u 2.980s 1:54.25 98.1% 0+0k 0+0io 0pf+0w
    # Then use those counts to mask sequence:
    time /cluster/bin/x86_64/windowmasker -ustat wm.counts -input tmp.fa \
      -output wm.intervals
#188.383u 1.445s 3:11.42 99.1% 0+0k 0+0io 0pf+0w
    # Convert the interval listing to 3-column BED: a ">lcl|NAME ..." header
    # line sets the current sequence name and is dropped; each
    # "START - END" line becomes NAME<tab>START<tab>END+1 (presumably turning
    # an inclusive end into BED's exclusive end -- confirm).
    # NOTE(review): the backslash-newlines inside the quoted perl script look
    # like csh-style quoting (the timings recorded here resemble csh `time`
    # output); confirm before running this under a different shell.
    perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
      if (/^(\d+) - (\d+)/) { \
        $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
      }' wm.intervals > windowmasker.bed
    # Quick coverage:
    awk '{print $3 - $2;}' windowmasker.bed | total
#76056110
    awk '{print $2;}' ../../chrom.sizes | total
#200467819
    calc 76056110 / 200467819
#76056110 / 200467819 = 0.379393
    # Make a masked .2bit:
    twoBitMask ../../droGri2.2bit windowmasker.bed ../../droGri2.WM.2bit

    # Now try with -sdust to additionally mask low-complexity sequence:
    time /cluster/bin/x86_64/windowmasker -ustat wm.counts -sdust true \
      -input tmp.fa -output wm.sdust.intervals
#488.269u 1.617s 8:11.81 99.6% 0+0k 0+0io 0pf+0w
    # Same interval-to-BED conversion as for wm.intervals.
    perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
      if (/^(\d+) - (\d+)/) { \
        $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
      }' wm.sdust.intervals > windowmasker.sdust.bed
    awk '{print $3 - $2;}' windowmasker.sdust.bed | total
#92936045
    calc 92936045 / 200467819
#92936045 / 200467819 = 0.463596
    # Make a masked .2bit (even if we don't end up needing it):
    twoBitMask ../../droGri2.2bit windowmasker.sdust.bed \
      ../../droGri2.WMSDust.2bit
    rm tmp.fa

#########################################################################
# SWAP DM3 CHAIN/NET (DONE 4/3/09 angie)
    mkdir /hive/data/genomes/droGri2/bed/blastz.dm3.swap
    cd /hive/data/genomes/droGri2/bed/blastz.dm3.swap
    # Run the swap from the DEF file of the dm3-side droGri2 alignment,
    # logging to do.log in this directory.
    doBlastzChainNet.pl -swap -bigClusterHub swarm -smallClusterHub memk \
      -workhorse kolossus \
      /hive/data/genomes/dm3/bed/blastz.droGri2/DEF >& do.log &
    tail -f do.log
    # Relative symlink so that bed/blastz.dm3 resolves to the swap directory.
    ln -s blastz.dm3.swap /hive/data/genomes/droGri2/bed/blastz.dm3

#########################################################################