# for emacs: -*- mode: sh; -*-

# microbat ( Myotis lucifugus )
#
# Record of the commands used to build the myoLuc1 genome database, with the
# output of each step pasted back in as comments.  The bare "ssh <host>"
# lines name the machine the following commands were typed on, so this reads
# as a step-by-step log rather than a script to run from top to bottom.
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-07-11)
    ssh kkstore05
    mkdir /cluster/store12/myoLuc1
    ln -s /cluster/store12/myoLuc1 /cluster/data
    mkdir /cluster/data/myoLuc1/broad
    cd /cluster/data/myoLuc1/broad

    wget --timestamping \
        ftp://ftp.broad.mit.edu/pub/assemblies/mammals/microbat/myoLuc1/assembly.agp \
        ftp://ftp.broad.mit.edu/pub/assemblies/mammals/microbat/myoLuc1/assembly.bases.gz \
        ftp://ftp.broad.mit.edu/pub/assemblies/mammals/microbat/myoLuc1/assembly.quals.gz
    # checksum everything downloaded so far (the three assembly.* files)
    md5sum ass* > assembly.md5sum

    # produces myoLuc1.qual.qac, which is the qualFiles entry in the
    # makeGenomeDb.pl config below
    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin myoLuc1.qual.qac

    wget --timestamping \
        ftp://ftp.broad.mit.edu/pub/assemblies/mammals/microbat/myoLuc1/BasicStats.out

# --------------------------------------------------------------------------------
# Thu Mar 02 15:01:30 2006 run, based on Tue Feb 28 15:18:13 EST 2006 make
# BasicStats PRE=/wga/dev1/WGAdata DATA=projects/Bat RUN=run/work \
# SUBDIR=assisted_5 QUAL_STATS=True
#--------------------------------------------------------------------------------
#
#Supercontigs having < 3 reads or < 1kb sequence are ignored.
#8056 gaps <= -1000; 2733 gaps <= -10000; 4 gaps <= -100000
#fraction of gaps < -10kb or more than 4 deviations below zero: 1.76%
#33071 gaps > 10kb, 15 gaps > 50kb, 0 gaps > 200kb, 0 gaps > 1Mb
#81% of reads were used in the assembly (84.16% of bases, 85.3% of Q20 bases)
#0% of reads were used multiply in the assembly
#584023 contigs, having N50 length 3206
#total contig length: 1617176597, spanning 2780941149 bases (with 41.8% in
#gaps)
#136582 supercontigs, having N50 length 53069 (not including gaps)
#76.6% of assembly in supers of size < 200000 (2131474931 bases)
#Assembly base coverage: 1.72X. Assembly Q20 coverage: 1.48X.
#99.75% of bases have q >= 1
#95.27% of bases have q >= 20
#89.72% of bases have q >= 30
#82.51% of bases have q >= 40
#72.94% of bases have q >= 50

    cut -f 1 assembly.agp | uniq -c | wc -l
    # Number of scaffolds: 193323

#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh kkstore05
    cd /cluster/data/myoLuc1
    # here-doc body and terminator stay at column 0: they are file content
    cat << _EOF_ >myoLuc1.config.ra
# Config parameters for makeGenomeDb.pl:
db myoLuc1
clade mammal
genomeCladePriority 35
scientificName Myotis lucifugus
commonName Microbat
assemblyDate Mar. 2006
assemblyLabel Broad Institute myoLuc1
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/myoLuc1/broad/assembly.bases.gz
agpFiles /cluster/data/myoLuc1/broad/assembly.agp
qualFiles /cluster/data/myoLuc1/broad/myoLuc1.qual.qac
dbDbSpeciesDir microbat
_EOF_

    # use 'screen' make sure on kkstore05
    makeGenomeDb.pl -verbose=2 myoLuc1.config.ra > makeGenomeDb.out 2>&1 &
    # 'ctl-a ctl-d' returns to previous shell

    cut -f 2 chrom.sizes | ave stdin
    # Q1 1154.000000
    # median 2190.000000
    # Q3 7139.000000
    # average 14742.433953
    # min 200.000000
    # max 1293446.000000
    # count 193323
    # total 2850051559.000000
    # standard deviation 41575.958603

#########################################################################
# REPEATMASKER (DONE braney 2008-07-29)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/myoLuc1/bed/repeatMasker
    cd /cluster/data/myoLuc1/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/myoLuc1/bed/repeatMasker \
        myoLuc1 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    ssh pk
    para time
    # Completed: 6580 of 6580 jobs
    # CPU time in finished jobs: 16608254s 276804.23m 4613.40h 192.23d 0.527 y
    # IO & Wait Time: 77692s 1294.87m 21.58h 0.90d 0.002 y
    # Average job time: 2536s 42.26m 0.70h 0.03d
    # Longest finished job: 6707s 111.78m 1.86h 0.08d
    # Submission to last job: 138115s 2301.92m 38.37h 1.60d

    time nice -n +19 featureBits myoLuc1 rmsk > fb.myoLuc1.rmsk.txt 2>&1 &
    # 416753531 bases of 1673855868 (24.898%) in intersection

    # RepeatMasker and lib version from do.log:
    # Jun 13 2008 (open-3-2-5) version of RepeatMasker
    # CC RELEASE 20080611;

#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-07-29)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/myoLuc1/bed/simpleRepeat
    cd /cluster/data/myoLuc1/bed/simpleRepeat
    #
    # NOTE(review): the flattened source shows a bare '#' directly before this
    # command.  It is restored as a live command because the "para time"
    # output below reports its jobs as completed -- confirm against the
    # original layout.
    doSimpleRepeat.pl -buildDir=/cluster/data/myoLuc1/bed/simpleRepeat \
        myoLuc1 > do.log 2>&1 &

    #### When done
    ssh pk
    para time
    # Completed: 58 of 58 jobs
    # CPU time in finished jobs: 36439s 607.32m 10.12h 0.42d 0.001 y
    # IO & Wait Time: 211s 3.52m 0.06h 0.00d 0.000 y
    # Average job time: 632s 10.53m 0.18h 0.01d
    # Longest finished job: 5886s 98.10m 1.64h 0.07d
    # Submission to last job: 6114s 101.90m 1.70h 0.07d

    featureBits myoLuc1 simpleRepeat
    # 33491779 bases of 1673855868 (2.001%) in intersection

    # after RM run is done, add this mask:
    cd /cluster/data/myoLuc1
    twoBitMask myoLuc1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed myoLuc1.2bit

    # sizes of the combined (RepeatMasker + simpleRepeat) masked sequence
    twoBitToFa myoLuc1.2bit stdout | faSize stdin
    # 2850051559 bases (1176195691 N's 1673855868 real 1256986222 upper 416869646
    # lower) in 193323 sequences in 1 files
    # Total size: mean 14742.4 sd 41576.1 min 200 (scaffold_48228) max 1293446
    # (scaffold_148345) median 2190
    # N count: mean 6084.1 sd 18925.1
    # U count: mean 6502.0 sd 18734.7
    # L count: mean 2156.3 sd 6031.6
    # %14.63 masked total, %24.90 masked real

    # same measurement on the RepeatMasker-only file, for comparison
    twoBitToFa myoLuc1.rmsk.2bit stdout | faSize stdin
    # 2850051559 bases (1176195691 N's 1673855868 real 1257573734 upper 416282134
    # lower) in 193323 sequences in 1 files
    # Total size: mean 14742.4 sd 41576.1 min 200 (scaffold_48228) max 1293446
    # (scaffold_148345) median 2190
    # N count: mean 6084.1 sd 18925.1
    # U count: mean 6505.0 sd 18741.6
    # L count: mean 2153.3 sd 6025.1
    # %14.61 masked total, %24.87 masked real

# Link to it from /gbdb
    ssh hgwdev
    ln -s /cluster/data/myoLuc1/myoLuc1.2bit /gbdb/myoLuc1/myoLuc1.2bit

    #
    # NOTE(review): the flattened source shows a bare '#' directly before this
    # mkdir.  It is restored as a live command because both cp steps below
    # copy into this directory -- confirm against the original layout.
    mkdir /san/sanvol1/scratch/myoLuc1
    cp /cluster/data/myoLuc1/myoLuc1.2bit /san/sanvol1/scratch/myoLuc1
    cp /cluster/data/myoLuc1/chrom.sizes /san/sanvol1/scratch/myoLuc1

############################################################################
# myoLuc1 - Microbat - Ensembl Genes version 51 (DONE - 2008-12-03 - hiram)
    ssh kolossus
    cd /hive/data/genomes/myoLuc1
    # NOTE(review): the closing delimiter below is written with its quotes
    # ('_EOF_').  That looks like the csh/tcsh here-document form; sh/bash
    # would expect a bare _EOF_ terminator -- confirm which shell these steps
    # are pasted into before changing it.
    cat << '_EOF_' > myoLuc1.ensGene.ra
# required db variable
db myoLuc1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
# names that are not in the UCSC assembly
skipInvalid yes
# ignore the three genes that have invalid structures from Ensembl:
# 1265: ENSMLUT00000004658 no exonFrame on CDS exon 1
# 17770: ENSMLUT00000003427 no exonFrame on CDS exon 10
# 32743: ENSMLUT00000009601 no exonFrame on CDS exon 1
'_EOF_'
# << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 myoLuc1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/myoLuc1/bed/ensGene.51
    featureBits myoLuc1 ensGene
    # 24559555 bases of 1673855868 (1.467%) in intersection

    # Pasted tool output, kept as comments so it is never read as a command:
    # *** All done! (through the 'makeDoc' step) ***
    # Steps were performed in /hive/data/genomes/myoLuc1/bed/ensGene.51
############################################################################