# for emacs: -*- mode: sh; -*-

# Build log for the speTri1 (squirrel) genome database.
# Commands are listed in the order they were run; output copied from the
# tools is kept as '#' comments directly below the command that produced it.
# NOTE(review): the 'ssh <host>' and 'screen' lines look like markers for
# where the following commands were typed interactively, not steps of one
# unattended script - confirm before running this file as a whole.

# http://www.ncbi.nlm.nih.gov/bioproject/13937
# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAQQ01
# http://www.ncbi.nlm.nih.gov/nuccore/107785164
# http://www.ncbi.nlm.nih.gov/genome/472

# squirrel (Spermophilus tridecemlineatus)

#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-07-31)
ssh kkstore05
mkdir /cluster/store12/speTri1
ln -s /cluster/store12/speTri1 /cluster/data
mkdir /cluster/data/speTri1/broad
cd /cluster/data/speTri1/broad

wget --timestamping \
  ftp://ftp.broad.mit.edu/pub/assemblies/mammals/squirrel/speTri1/assembly.agp \
  ftp://ftp.broad.mit.edu/pub/assemblies/mammals/squirrel/speTri1/assembly.bases.gz \
  ftp://ftp.broad.mit.edu/pub/assemblies/mammals/squirrel/speTri1/assembly.quals.gz
md5sum ass* > assembly.md5sum

# Convert the quality scores to .qac and lift them through the AGP.
qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin speTri1.qual.qac

wget --timestamping \
  ftp://ftp.broad.mit.edu/pub/assemblies/mammals/squirrel/speTri1/BasicStats.out

# Contents of BasicStats.out:
# --------------------------------------------------------------------------------
# Tue May 16 00:08:35 2006 run, based on Mon May 8 09:51:44 EDT 2006 make
# BasicStats PRE=/wga/dev1/WGAdata DATA=projects/Squirrel RUN=run/work ## \
# SUBDIR=assisted_3 QUAL_STATS=True
# --------------------------------------------------------------------------------
#
# Supercontigs having < 3 reads or < 1kb sequence are ignored.
# 4229 gaps <= -1000; 25 gaps <= -10000; 0 gaps <= -100000
# fraction of gaps < -10kb or more than 4 deviations below zero: 0.384%
# 42487 gaps > 10kb, 47 gaps > 50kb, 0 gaps > 200kb, 0 gaps > 1Mb
# 82.4% of reads were used in the assembly (84.67% of bases, 85.56% of Q20
# bases)
# 0% of reads were used multiply in the assembly
# 758936 contigs, having N50 length 2799
# total contig length: 1873262772, spanning 3431133042 bases (with 45.4% in
# gaps)
# 122351 supercontigs, having N50 length 55167 (not including gaps)
# 79.3% of assembly in supers of size < 200000 (2722264171 bases)
# Assembly base coverage: 1.37X. Assembly Q20 coverage: 1.18X.
# 99.87% of bases have q >= 1
# 94.72% of bases have q >= 20
# 87.83% of bases have q >= 30
# 79.33% of bases have q >= 40
# 68.22% of bases have q >= 50

cut -f 1 assembly.agp | uniq -c | wc -l
# Number of scaffolds: 160889

#########################################################################
# Create .ra file and run makeGenomeDb.pl
ssh kkstore05
cd /cluster/data/speTri1

# The here-doc body below is written verbatim into speTri1.config.ra, so it
# must stay at column 0 and must not be edited for style.
cat << _EOF_ >speTri1.config.ra
# Config parameters for makeGenomeDb.pl:
db speTri1
clade mammal
genomeCladePriority 35
scientificName Spermophilus tridecemlineatus
commonName Squirrel
assemblyDate Feb. 2008
assemblyLabel Broad Institute speTri1
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/speTri1/broad/assembly.bases.gz
agpFiles /cluster/data/speTri1/broad/assembly.agp
qualFiles /cluster/data/speTri1/broad/speTri1.qual.qac
dbDbSpeciesDir squirrel
_EOF_

# use 'screen'; make sure you are on kkstore05
makeGenomeDb.pl -verbose=2 speTri1.config.ra > makeGenomeDb.out 2>&1 &
# 'ctl-a ctl-d' returns to previous shell

cut -f 2 chrom.sizes | ave stdin
# Q1 1345.000000
# median 4617.000000
# Q3 12707.500000
# average 21684.320196
# min 200.000000
# max 996791.000000
# count 160889
# total 3488768592.000000
# standard deviation 47336.311846

#########################################################################
# REPEATMASKER (DONE braney 2008-08-03)
ssh kkstore05
screen # use a screen to manage this job
mkdir /cluster/data/speTri1/bed/repeatMasker
cd /cluster/data/speTri1/bed/repeatMasker
doRepeatMasker.pl -buildDir=/cluster/data/speTri1/bed/repeatMasker \
  speTri1 > do.log 2>&1 &

# Note: can run simpleRepeats simultaneously

#### When done with RM:
time nice -n +19 featureBits speTri1 rmsk > fb.speTri1.rmsk.txt 2>&1 &
# 514833923 bases of 1913367893 (26.907%) in intersection

# RepeatMasker and lib version from do.log:
# Jun 13 2008 (open-3-2-5) version of RepeatMasker
# CC RELEASE 20080611;

#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-08-03)
ssh kkstore05
screen # use a screen to manage this job
mkdir /cluster/data/speTri1/bed/simpleRepeat
cd /cluster/data/speTri1/bed/simpleRepeat
#
# NOTE(review): restored as a live command; the twoBitMask step below reads
# bed/simpleRepeat/trfMask.bed, so this step is assumed to have been run.
doSimpleRepeat.pl -buildDir=/cluster/data/speTri1/bed/simpleRepeat \
  speTri1 > do.log 2>&1 &

#### When done
ssh hgwdev
featureBits speTri1 simpleRepeat
# 150142628 bases of 1913367893 (7.847%) in intersection

# after RM run is done, add this mask:
cd /cluster/data/speTri1
twoBitMask speTri1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed speTri1.2bit

twoBitToFa speTri1.2bit stdout | faSize stdin
# 3488768592 bases (1575400699 N's 1913367893 real 1398742896 upper 514624997
# lower) in 160889 sequences in 1 files
# Total size: mean 21684.3 sd 47336.5 min 200 (scaffold_63211) max 996791
# (scaffold_113960) median 4617
# N count: mean 9791.8 sd 22439.9
# U count: mean 8693.8 sd 19894.4
# L count: mean 3198.6 sd 8123.2
# %14.75 masked total, %26.90 masked real

twoBitToFa speTri1.rmsk.2bit stdout | faSize stdin
# 3488768592 bases (1575400699 N's 1913367893 real 1399153971 upper 514213922
# lower) in 160889 sequences in 1 files
# Total size: mean 21684.3 sd 47336.5 min 200 (scaffold_63211) max 996791
# (scaffold_113960) median 4617
# N count: mean 9791.8 sd 22439.9
# U count: mean 8696.4 sd 19900.2
# L count: mean 3196.1 sd 8117.5
# %14.74 masked total, %26.87 masked real

# Link to it from /gbdb
ln -s /cluster/data/speTri1/speTri1.2bit /gbdb/speTri1/speTri1.2bit

#
# NOTE(review): restored as a live command; the two cp commands below copy
# into this directory, so it is assumed to have been created here.
mkdir /san/sanvol1/scratch/speTri1
cp /cluster/data/speTri1/speTri1.2bit /san/sanvol1/scratch/speTri1
cp /cluster/data/speTri1/chrom.sizes /san/sanvol1/scratch/speTri1

############################################################################
# speTri1 - Squirrel - Ensembl Genes version 51 (DONE - 2008-12-03 - hiram)
ssh kkr14u07
cd /hive/data/genomes/speTri1

# The delimiter is left unquoted, matching the config.ra here-doc above: in
# sh a here-doc opened with a quoted delimiter is not closed by a line that
# still carries the quotes. The body contains no '$', backquote or backslash,
# so the file written is the same either way.
cat << _EOF_ > speTri1.ensGene.ra
# required db variable
db speTri1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
# names that are not in the UCSC assembly
skipInvalid yes
# ignore the single gene that has an invalid structure from Ensembl:
# 1071: ENSSTOT00000007455 no exonFrame on CDS exon 1
_EOF_

doEnsGeneUpdate.pl -ensVersion=51 speTri1.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/speTri1/bed/ensGene.51
featureBits speTri1 ensGene
# 21525994 bases of 1913367893 (1.125%) in intersection

# Pasted tool output (presumably from doEnsGeneUpdate.pl - confirm), kept as
# a comment so that it is never interpreted as a command:
# *** All done! (through the 'makeDoc' step)
# *** Steps were performed in /hive/data/genomes/speTri1/bed/ensGene.51
############################################################################