# for emacs: -*- mode: sh; -*-

# armadillo ( Dasypus novemcinctus )
#	http://www.ncbi.nlm.nih.gov/bioproject/12594
#	http://www.ncbi.nlm.nih.gov/genome/235
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAGV00

# This file is a record of the commands used to build the dasNov2 browser
# database.  The steps hop between hosts (ssh), start background jobs and
# wait on cluster runs, so it reads as a step-by-step log rather than a
# script to execute top to bottom.  Lines starting with "#" that follow a
# command record the output observed when that command was run.

#########################################################################
# DOWNLOAD SEQUENCE (working braney)
    ssh kolossus
    mkdir /hive/data/genomes/dasNov2
    # replace the old /cluster/data entry with a symlink into /hive
    rm /cluster/data/dasNov2
    ln -s /hive/data/genomes/dasNov2 /cluster/data
    mkdir /cluster/data/dasNov2/broad
    cd /cluster/data/dasNov2/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/armadillo/dasNov2/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/armadillo/dasNov2/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/armadillo/dasNov2/assembly.quals.gz
    md5sum ass* > assembly.md5sum

    # convert the quality scores to .qac and lift them to scaffold
    # coordinates via the AGP
    qaToQac assembly.quals.gz stdout \
        | qacAgpLift assembly.agp stdin dasNov2.qual.qac

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/armadillo/dasNov2/BasicStats.out
    # no BasicStats.out

    cut -f 1 assembly.agp | uniq -c | wc -l
    # Number of scaffolds: 292141

#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-09-22)
    ssh kolossus
    cd /cluster/data/dasNov2
    # the here-document body and its terminator must start in column 0
    cat << '_EOF_' >dasNov2.config.ra
# Config parameters for makeGenomeDb.pl:
db dasNov2
clade mammal
genomeCladePriority 35
scientificName Dasypus novemcinctus
commonName Armadillo
assemblyDate Jul. 2008
assemblyLabel Broad Institute dasNov2
orderKey 346.1
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/dasNov2/broad/assembly.bases.gz
agpFiles /cluster/data/dasNov2/broad/assembly.agp
qualFiles /cluster/data/dasNov2/broad/dasNov2.qual.qac
dbDbSpeciesDir armadillo
_EOF_

    makeGenomeDb.pl -workhorse=kolossus -verbose=2 dasNov2.config.ra \
        > makeGenomeDb.out 2>&1 &

    # when done
    cut -f 2 chrom.sizes | ave stdin
    # Q1 2152.000000
    # median 4946.000000
    # Q3 10566.000000
    # average 16477.740413
    # min 600.000000
    # max 985764.000000
    # count 292141
    # total 4813823562.000000
    # standard deviation 32801.578215

#########################################################################
# REPEATMASKER (DONE braney 2008-10-07)
    ssh kolossus
    screen # use a screen to manage this job
    mkdir /cluster/data/dasNov2/bed/repeatMasker
    cd /cluster/data/dasNov2/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/dasNov2/bed/repeatMasker \
        dasNov2 > do.log 2>&1 &
    # Completed: 10555 of 10555 jobs
    # CPU time in finished jobs:   22325655s  372094.25m  6201.57h  258.40d  0.708 y
    # IO & Wait Time:                 86043s    1434.05m    23.90h    1.00d  0.003 y
    # Average job time:                2123s      35.39m     0.59h    0.02d
    # Longest finished job:            7949s     132.48m     2.21h    0.09d
    # Submission to last job:         69062s    1151.03m    19.18h    0.80d

    # resume the pipeline at its "cat" step
    doRepeatMasker.pl -buildDir=/cluster/data/dasNov2/bed/repeatMasker \
        -continue cat dasNov2 > do2.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    ssh hgwdev
    cd /cluster/data/dasNov2/bed/repeatMasker
    time nice -n +19 featureBits dasNov2 rmsk > fb.dasNov2.rmsk.txt 2>&1 &
    # 888874669 bases of 2371493872 (37.482%) in intersection

#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-10-07)
    ssh kolossus
    screen # use a screen to manage this job
    mkdir /cluster/data/dasNov2/bed/simpleRepeat
    cd /cluster/data/dasNov2/bed/simpleRepeat
    # NOTE(review): in the flattened text this command sat directly behind
    # a bare "#".  It is restored as a live command because this section is
    # marked DONE and records the job statistics below -- confirm.
    doSimpleRepeat.pl -buildDir=/cluster/data/dasNov2/bed/simpleRepeat \
        dasNov2 > do.log 2>&1 &

    #### When done
    ssh pk
    para time
    # Completed: 97 of 97 jobs
    # CPU time in finished jobs:      61803s    1030.05m    17.17h    0.72d  0.002 y
    # IO & Wait Time:                  3111s      51.85m     0.86h    0.04d  0.000 y
    # Average job time:                 669s      11.15m     0.19h    0.01d
    # Longest finished job:            7949s     132.48m     2.21h    0.09d
    # Submission to last job:          7957s     132.62m     2.21h    0.09d

    featureBits dasNov2 simpleRepeat
    # 26484090 bases of 2371493872 (1.117%) in intersection

    # after RM run is done, add this mask:
    cd /cluster/data/dasNov2
    twoBitMask dasNov2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dasNov2.2bit

    # masking after adding the TRF mask
    twoBitToFa dasNov2.2bit stdout | faSize stdin
    # 4813823562 bases (2442329690 N's 2371493872 real 1488699780 upper 882794092
    # lower) in 292141 sequences in 1 files
    # Total size: mean 16477.7 sd 32801.6 min 600 (scaffold_292140) max 985764
    # (scaffold_0) median 4946
    # N count: mean 8360.1 sd 19087.4
    # U count: mean 5095.8 sd 10731.3
    # L count: mean 3021.8 sd 5626.4
    # %18.34 masked total, %37.23 masked real

    # masking from RepeatMasker alone, for comparison
    twoBitToFa dasNov2.rmsk.2bit stdout | faSize stdin
    # 4813823562 bases (2442329690 N's 2371493872 real 1489221550 upper 882272322
    # lower) in 292141 sequences in 1 files
    # Total size: mean 16477.7 sd 32801.6 min 600 (scaffold_292140) max 985764
    # (scaffold_0) median 4946
    # N count: mean 8360.1 sd 19087.4
    # U count: mean 5097.6 sd 10734.5
    # L count: mean 3020.0 sd 5623.4
    # %18.33 masked total, %37.20 masked real

    # Link to it from /gbdb
    ssh hgwdev
    rm -f /gbdb/dasNov2/dasNov2.2bit
    ln -s /cluster/data/dasNov2/dasNov2.2bit /gbdb/dasNov2/dasNov2.2bit

#########################################################################
# prepare for kluster runs (DONE - 2008-10-22 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	dasNov2: 2371493872
    # thus: 1024 * 2371493872/2881421696 = 842
    #	rounding up to 900 for a bit more conservative masking
    cd /hive/data/genomes/dasNov2
    time blat dasNov2.2bit \
        /dev/null /dev/null -tileSize=11 -makeOoc=dasNov2.11.ooc -repMatch=900
    # Wrote 30934 overused 11-mers to dasNov2.11.ooc
    # real 2m49.196s

    # and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/dasNov2
    cp -p dasNov2.2bit chrom.sizes dasNov2.11.ooc \
        /hive/data/staging/data/dasNov2
    # request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-22 - Hiram)
    hgsql -e 'update dbDb set sourceName="Broad Institute dasNov2 (NCBI project 12594, AAGV020000000)" where name="dasNov2";' hgcentraltest

###########################################################################