# for emacs: -*- mode: sh; -*-

# Vicugna vicugna

#########################################################################
# DOWNLOAD SEQUENCE

# Set up the assembly directory on the store and expose it under
# /cluster/data, then work in a "broad" subdirectory for the raw downloads.
ssh kkstore05
mkdir /cluster/store12/vicVic1
ln -s /cluster/store12/vicVic1 /cluster/data
mkdir /cluster/data/vicVic1/broad
cd /cluster/data/vicVic1/broad

# Fetch the AGP, the bases and the quality scores from the Broad FTP site.
wget --timestamping \
  ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/assembly.agp \
  ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/assembly.bases.gz \
  ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/assembly.quals.gz

# Compress the quality scores to .qac and lift them through the AGP;
# the result is the qualFiles input used when building the database.
qaToQac assembly.quals.gz stdout \
  | qacAgpLift assembly.agp stdin vicVic1.qual.qac

# Fetch the assembly statistics published alongside the assembly.
wget --timestamping \
  ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/BasicStats.out

# Contents of BasicStats.out, recorded here for reference:
# --------------------------------------------------------------------------------
# Sun May 25 15:19:02 2008 run (pid=20827) using Mon May 19 16:28:59 EDT 2008 make
# BasicStats PRE=/wga/dev/WGAdata DATA=projects/LowCoverage/Vicugna \
#   RUN=run/work SUBDIR=assisted_2.5 QUAL_STATS=True \
#   OUTFILE=BasicStats.out
# --------------------------------------------------------------------------------
#
# Supercontigs having < 3 reads or < 1kb sequence are ignored.
# 4189 gaps <= -1000; 7 gaps <= -10000; 0 gaps <= -100000
# fraction of gaps < -10kb or more than 4 deviations below zero: 0.812%
# 27886 gaps > 10kb, 0 gaps > 50kb, 0 gaps > 200kb, 0 gaps > 1Mb
# 80.92% of reads were used in the assembly (83.85% of bases, 85.43% of Q20 bases)
# 0% of reads were used multiply in the assembly
# 621891 contigs, having N50 length 4346
# total contig length: 1826863513, spanning 2856433242 bases (with 36% in gaps)
# 199050 supercontigs, having N50 length 203550 (not including gaps)
# 45.6% of assembly in supers of size < 200000 (1302053740 bases)
# Assembly base coverage: 2.17X.  Assembly Q20 coverage: 1.93X.
# 99.98% of bases have q >= 1
# 96.62% of bases have q >= 20
# 93.57% of bases have q >= 30
# 90.13% of bases have q >= 40
# 86.95% of bases have q >= 50

# Count the distinct scaffold names in column 1 of the AGP.  Plain uniq is
# enough here: only the number of groups matters, not the per-group counts.
cut -f 1 assembly.agp | uniq | wc -l
# Number of scaffolds: 298420

#########################################################################
# Create .ra file and run makeGenomeDb.pl
ssh kkstore05
cd /cluster/data/vicVic1

# NOTE(review): scientificName below is "Vicugna vicugna" while commonName
# and dbDbSpeciesDir say alpaca -- confirm which species name is intended.
# The here-document delimiter is quoted so the config text is written
# literally, with no shell expansion.
cat << '_EOF_' > vicVic1.config.ra
# Config parameters for makeGenomeDb.pl:
db vicVic1
clade mammal
genomeCladePriority 35
scientificName Vicugna vicugna
commonName Alpaca
assemblyDate Jul. 2008
assemblyLabel Broad Institute vicVic1
orderKey 233.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/vicVic1/broad/assembly.bases.gz
agpFiles /cluster/data/vicVic1/broad/assembly.agp
qualFiles /cluster/data/vicVic1/broad/vicVic1.qual.qac
dbDbSpeciesDir alpaca
_EOF_

# use 'screen' make sure on kkstore05
makeGenomeDb.pl -verbose=2 vicVic1.config.ra > makeGenomeDb.out 2>&1 &
# 'ctl-a ctl-d' returns to previous shell

# Summary statistics of the scaffold sizes (column 2 of chrom.sizes).
cut -f 2 chrom.sizes | ave stdin
# Q1 1042.000000
# median 1372.000000
# Q3 2292.000000
# average 9926.782253
# min 600.000000
# max 5516956.000000
# count 298420
# total 2962350360.000000
# standard deviation 67784.633210

#########################################################################
# REPEATMASKER (DONE - 2008-07-13 braney )
ssh kkstore05
screen # use a screen to manage this job
# -p: also works when bed/ does not exist yet, and when the step is re-run.
mkdir -p /cluster/data/vicVic1/bed/repeatMasker
cd /cluster/data/vicVic1/bed/repeatMasker
doRepeatMasker.pl -buildDir=/cluster/data/vicVic1/bed/repeatMasker \
  vicVic1 > do.log 2>&1 &
# Note: can run simpleRepeats simultaneously

#### When done with RM:
ssh pk
para time
# Completed: 7141 of 7141 jobs
# CPU time in finished jobs:   19375689s  322928.14m  5382.14h  224.26d  0.614 y
# IO & Wait Time:                185390s    3089.84m    51.50h    2.15d  0.006 y
# Average job time:                2739s      45.65m     0.76h    0.03d
# Longest finished job:           13925s     232.08m     3.87h    0.16d
# Submission to last job:        175075s    2917.92m    48.63h    2.03d

time nice -n +19 featureBits vicVic1 rmsk > fb.vicVic1.rmsk.txt 2>&1 &
# 614774346 bases of 1923010363 (31.969%) in intersection

# RepeatMasker and lib version from do.log:
# Jun 13 2008 (open-3-2-5) version of RepeatMasker
# CC RELEASE 20080611;

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-07-11 braney)
ssh kkstore05
screen # use a screen to manage this job
# -p: also works when bed/ does not exist yet, and when the step is re-run.
mkdir -p /cluster/data/vicVic1/bed/simpleRepeat
cd /cluster/data/vicVic1/bed/simpleRepeat
# had to change genus/species to "vicugna genus" in dummyRun
doSimpleRepeat.pl -buildDir=/cluster/data/vicVic1/bed/simpleRepeat \
  vicVic1 > do.log 2>&1 &

#### When done
ssh pk
para time
# Completed: 60 of 60 jobs
# CPU time in finished jobs:      71877s    1197.95m    19.97h    0.83d  0.002 y
# IO & Wait Time:                  2811s      46.85m     0.78h    0.03d  0.000 y
# Average job time:                1245s      20.75m     0.35h    0.01d
# Longest finished job:           12819s     213.65m     3.56h    0.15d
# Submission to last job:         13133s     218.88m     3.65h    0.15d

featureBits vicVic1 simpleRepeat
# 77931003 bases of 1923010363 (4.053%) in intersection

# after RM run is done, add this mask:
cd /cluster/data/vicVic1
twoBitMask vicVic1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed vicVic1.2bit

# Masking summary after adding the TRF mask (RepeatMasker + simple repeats).
twoBitToFa vicVic1.2bit stdout | faSize stdin
# 2962350360 bases (1039339997 N's 1923010363 real 1305704427 upper 617305936
# lower) in 298420 sequences in 1 files
# Total size: mean 9926.8 sd 67784.7 min 600 (scaffold_298419) max 5516956
# (scaffold_0) median 1372
# N count: mean 3482.8 sd 21549.2
# U count: mean 4375.4 sd 33856.2
# L count: mean 2068.6 sd 15148.2
# %20.84 masked total, %32.10 masked real

# Masking summary for the RepeatMasker-only 2bit, for comparison.
twoBitToFa vicVic1.rmsk.2bit stdout | faSize stdin
# 2962350360 bases (1039339997 N's 1923010363 real 1309436724 upper 613573639
# lower) in 298420 sequences in 1 files
# Total size: mean 9926.8 sd 67784.7 min 600 (scaffold_298419) max 5516956
# (scaffold_0) median 1372
# N count: mean 3482.8 sd 21549.2
# U count: mean 4387.9 sd 33863.9
# L count: mean 2056.1 sd 15140.4
# %20.71 masked total, %31.91 masked real

# Publish the masked 2bit and copy the files to cluster scratch.
# Create the target directories first: without an existing scratch directory
# the first cp would create a plain file named "vicVic1" and the second cp
# would overwrite it with chrom.sizes.  The trailing slashes make cp fail
# loudly instead if the destination is ever not a directory.
mkdir -p /gbdb/vicVic1 /san/sanvol1/scratch/vicVic1
ln -s /cluster/data/vicVic1/vicVic1.2bit /gbdb/vicVic1/vicVic1.2bit
cp /cluster/data/vicVic1/vicVic1.2bit /san/sanvol1/scratch/vicVic1/
cp /cluster/data/vicVic1/chrom.sizes /san/sanvol1/scratch/vicVic1/