# for emacs: -*- mode: sh; -*-

# elephant (Loxodonta africana)
#
# Build notes for the loxAfr2 genome database.  Each section below lists the
# commands for one step, followed by the output that was recorded for that
# step in '#' comments.  The bare "ssh <host>" lines appear to mark the host
# on which the following commands were typed; this file is a transcript, not
# a script to be executed top to bottom.

#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
    ssh kolossus
    mkdir /hive/data/genomes/loxAfr2
    ln -s /hive/data/genomes/loxAfr2 /cluster/data
    mkdir /cluster/data/loxAfr2/broad
    cd /cluster/data/loxAfr2/broad

    wget --timestamping \
        ftp://ftp.broad.mit.edu/pub/assemblies/mammals/elephant/loxAfr2/assembly.agp \
        ftp://ftp.broad.mit.edu/pub/assemblies/mammals/elephant/loxAfr2/assembly.bases.gz \
        ftp://ftp.broad.mit.edu/pub/assemblies/mammals/elephant/loxAfr2/assembly.quals.gz
    # List the downloaded files explicitly: a glob such as ass* would also
    # match assembly.md5sum itself if this step is ever repeated.
    md5sum assembly.agp assembly.bases.gz assembly.quals.gz > assembly.md5sum

    # Convert the quality scores and lift them through the AGP.
    qaToQac assembly.quals.gz stdout \
        | qacAgpLift assembly.agp stdin loxAfr2.qual.qac

    wget --timestamping \
        ftp://ftp.broad.mit.edu/pub/assemblies/mammals/elephant/loxAfr2/BasicStats.out
    # no BasicStats.out

    cut -f 1 assembly.agp | uniq -c | wc -l
    # Number of scaffolds: 225378

#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07)
    ssh hgwdev
    # Removes any existing loxAfr2 database before the build.
    echo "drop database loxAfr2" | hgsql ""

    ssh kolossus
    cd /cluster/data/loxAfr2
    # The here-document body and its terminator must stay at column 0.
    cat << _EOF_ >loxAfr2.config.ra
# Config parameters for makeGenomeDb.pl:
db loxAfr2
clade mammal
genomeCladePriority 35
scientificName Loxodonta africana
commonName Elephant
assemblyDate Jul. 2008
assemblyLabel Broad Institute loxAfr2
orderKey 346.1
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/loxAfr2/broad/assembly.bases.gz
agpFiles /cluster/data/loxAfr2/broad/assembly.agp
qualFiles /cluster/data/loxAfr2/broad/loxAfr2.qual.qac
dbDbSpeciesDir elephant
_EOF_

    screen
    makeGenomeDb.pl -verbose=2 loxAfr2.config.ra > makeGenomeDb.out 2>&1 &

    # when done
    cut -f 2 chrom.sizes | ave stdin
    # Q1 2133.000000
    # median 5499.000000
    # Q3 13596.000000
    # average 18504.090248
    # min 568.000000
    # max 644881.000000
    # count 225378
    # total 4170414852.000000
    # standard deviation 35140.147724

#########################################################################
# REPEATMASKER (DONE braney 2008-10-07)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/loxAfr2/bed/repeatMasker
    cd /cluster/data/loxAfr2/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/loxAfr2/bed/repeatMasker \
        loxAfr2 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    ssh pk
    para time
    # Completed: 9212 of 9212 jobs
    # CPU time in finished jobs: 22535275s 375587.92m 6259.80h 260.82d 0.715 y
    # IO & Wait Time: 75555s 1259.25m 20.99h 0.87d 0.002 y
    # Average job time: 2454s 40.91m 0.68h 0.03d
    # Longest finished job: 7588s 126.47m 2.11h 0.09d
    # Submission to last job: 80995s 1349.92m 22.50h 0.94d

    # Second invocation, resumed at the "cat" step with kolossus as workhorse.
    cd /cluster/data/loxAfr2/bed/repeatMasker
    doRepeatMasker.pl -workhorse kolossus -buildDir=/cluster/data/loxAfr2/bed/repeatMasker \
        loxAfr2 -continue cat > do2.log 2>&1 &

    ssh hgwdev
    cd /cluster/data/loxAfr2/bed/repeatMasker
    time nice -n +19 featureBits loxAfr2 rmsk > fb.loxAfr2.rmsk.txt 2>&1 &
    # 1088787467 bases of 2444975542 (44.532%) in intersection

#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-10-07)
    ssh kolossus
    screen # use a screen to manage this job
    mkdir /cluster/data/loxAfr2/bed/simpleRepeat
    cd /cluster/data/loxAfr2/bed/simpleRepeat
    #
    doSimpleRepeat.pl -buildDir=/cluster/data/loxAfr2/bed/simpleRepeat \
        loxAfr2 > do.log 2>&1 &

    ssh pk
    para time
    # Completed: 84 of 84 jobs
    # CPU time in finished jobs: 43572s 726.19m 12.10h 0.50d 0.001 y
    # IO & Wait Time: 6346s 105.77m 1.76h 0.07d 0.000 y
    # Average job time: 594s 9.90m 0.17h 0.01d
    # Longest finished job: 5352s 89.20m 1.49h 0.06d
    # Submission to last job: 5409s 90.15m 1.50h 0.06d

    # Second invocation, resumed at the "filter" step.
    doSimpleRepeat.pl -buildDir=/cluster/data/loxAfr2/bed/simpleRepeat \
        loxAfr2 -continue filter > do2.log 2>&1 &

    featureBits loxAfr2 simpleRepeat
    # 41337206 bases of 2444975542 (1.691%) in intersection

    # after RM run is done, add this mask:
    cd /cluster/data/loxAfr2
    twoBitMask loxAfr2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed loxAfr2.2bit

    # Sizes after adding the simple-repeat mask.
    twoBitToFa loxAfr2.2bit stdout | faSize stdin
    # 4170414852 bases (1725439310 N's 2444975542 real 1366083755 upper 1078891787
    # lower) in 225378 sequences in 1 files
    # Total size: mean 18504.1 sd 35140.2 min 568 (scaffold_218703) max 644881
    # (scaffold_0) median 5499
    # N count: mean 7655.8 sd 14579.3
    # U count: mean 6061.3 sd 13409.2
    # L count: mean 4787.0 sd 9801.0
    # %25.87 masked total, %44.13 masked real

    # Sizes with the RepeatMasker mask only, for comparison.
    twoBitToFa loxAfr2.rmsk.2bit stdout | faSize stdin
    # 4170414852 bases (1725439310 N's 2444975542 real 1366661766 upper 1078313776
    # lower) in 225378 sequences in 1 files
    # Total size: mean 18504.1 sd 35140.2 min 568 (scaffold_218703) max 644881
    # (scaffold_0) median 5499
    # N count: mean 7655.8 sd 14579.3
    # U count: mean 6063.9 sd 13412.5
    # L count: mean 4784.5 sd 9797.5
    # %25.86 masked total, %44.10 masked real

    ssh hgwdev
    # Link to it from /gbdb
    rm -f /gbdb/loxAfr2/loxAfr2.2bit
    ln -s /cluster/data/loxAfr2/loxAfr2.2bit /gbdb/loxAfr2/loxAfr2.2bit

#########################################################################
# prepare for kluster runs (DONE - 2008-10-22 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #   hg18: 2881421696
    #   loxAfr2: 2444975542
    # thus: 1024 * 2444975542/2881421696 = 868
    #   rounding up to 900 for a bit more conservative masking
    cd /hive/data/genomes/loxAfr2
    time blat loxAfr2.2bit \
        /dev/null /dev/null -tileSize=11 -makeOoc=loxAfr2.11.ooc -repMatch=900
    #   Wrote 38872 overused 11-mers to loxAfr2.11.ooc
    #   real 2m46.741s

    # and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/loxAfr2
    cp -p loxAfr2.2bit chrom.sizes loxAfr2.11.ooc \
        /hive/data/staging/data/loxAfr2
    # request to cluster admin to push this to the kluster nodes
    #   /scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-22 - Hiram)
    hgsql -e 'update dbDb set sourceName="Broad Institute loxAfr2 (NCBI project 12569, AAGU02000000)" where name="loxAfr2";' hgcentraltest

###########################################################################