# for emacs: -*- mode: sh; -*-

# Tursiops truncatus  Bottle-nosed dolphin
#	http://www.ncbi.nlm.nih.gov/bioproject/20367
#	http://www.ncbi.nlm.nih.gov/genome/769
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRN00
#
# This file is a step-by-step record of the commands used to build the
# turTru1 browser database.  Lines starting with '#' directly below a
# command record the output that command produced when it was run.

#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
    ssh kkstore05
    mkdir /cluster/store12/turTru1
    ln -s /cluster/store12/turTru1 /cluster/data
    mkdir /cluster/data/turTru1/broad
    cd /cluster/data/turTru1/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/bottlenosedDolphin/Turtru1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/bottlenosedDolphin/Turtru1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/bottlenosedDolphin/Turtru1.0/assembly.quals.gz
    md5sum ass* > assembly.md5sum

    # lift the per-contig quality scores to scaffold coordinates via the AGP
    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin turTru1.qual.qac

    cut -f 1 assembly.agp | uniq -c | wc -l
    # Number of scaffolds: 116467

#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07)
    ssh kolossus
    cd /cluster/data/turTru1
    cat << _EOF_ >turTru1.config.ra
# Config parameters for makeGenomeDb.pl:
db turTru1
clade mammal
genomeCladePriority 35
scientificName Tursiops truncatus
commonName Dolphin
assemblyDate Feb. 2008
assemblyLabel Broad Institute turTru1
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/turTru1/broad/assembly.bases.gz
agpFiles /cluster/data/turTru1/broad/assembly.agp
qualFiles /cluster/data/turTru1/broad/turTru1.qual.qac
dbDbSpeciesDir dolphin
_EOF_

    # use 'screen' make sure on kkstore05
    makeGenomeDb.pl -workhorse kolossus -verbose=2 turTru1.config.ra > makeGenomeDb.out 2>&1 &

    cut -f 2 chrom.sizes | ave stdin
    # Q1 1243.000000
    # median 1593.000000
    # Q3 6511.000000
    # average 21628.860415
    # min 336.000000
    # max 1406907.000000
    # count 116467
    # total 2519048486.000000
    # standard deviation 63795.083568

#########################################################################
## Repeat Masker (DONE - 2008-10-16 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/turTru1/bed/repeatMasker
    cd /hive/data/genomes/turTru1/bed/repeatMasker
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` turTru1 > do.log 2>&1 &
    #	real    797m7.301s
    twoBitToFa turTru1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
    # 2519048486 bases (220604396 N's 2298444090 real 1290940163 upper 1007503927
    # lower) in 116467 sequences in 1 files
    # %40.00 masked total, %43.83 masked real

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-15 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/turTru1/bed/simpleRepeat
    cd /hive/data/genomes/turTru1/bed/simpleRepeat
    #
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/turTru1/bed/simpleRepeat turTru1 > do.log 2>&1 &
    #	real    74m17.044s
    cat fb.simpleRepeat
    #	36542185 bases of 2298444090 (1.590%) in intersection

    # after RM run is done, add this mask:
    cd /hive/data/genomes/turTru1
    rm turTru1.2bit
    twoBitMask turTru1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed turTru1.2bit
    # can safely ignore warning about >=13 fields in bed file

    twoBitToFa turTru1.2bit stdout | faSize stdin > turTru1.2bit.faSize.txt
    # 2519048486 bases (220604396 N's 2298444090 real 1290364660 upper 1008079430
    # lower) in 116467 sequences in 1 files
    # %40.02 masked total, %43.86 masked real

    # link to gbdb
    ln -s `pwd`/turTru1.2bit /gbdb/turTru1

###########################################################################
# prepare for kluster runs (DONE - 2008-10-16 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	turTru1: 2298444090
    # thus: 1024 * 2298444090/2881421696 = 816
    #	rounding up to 850 for a more conservative masking
    cd /hive/data/genomes/turTru1
    time blat turTru1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=turTru1.11.ooc -repMatch=850
    #	Wrote 31521 overused 11-mers to turTru1.11.ooc
    #	real    2m9.494s

    # and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/turTru1
    cp -p turTru1.2bit chrom.sizes turTru1.11.ooc \
	/hive/data/staging/data/turTru1
    # request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set sourceName="Broad Institute turTru1 (NCBI project 20367, ABRN01000000)" where name="turTru1";' hgcentraltest

############################################################################
# turTru1 - Dolphin - Ensembl Genes version 51 (DONE - 2008-12-04 - hiram)
    ssh swarm
    cd /hive/data/genomes/turTru1
    # The delimiter is quoted on the 'cat' line so the body is taken
    # literally; the terminating line itself must be the bare word _EOF_.
    cat << '_EOF_' > turTru1.ensGene.ra
# required db variable
db turTru1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#	names that are not in the UCSC assembly
skipInvalid yes
# ignore the 14,543 genes (of 26,340) that do not map to UCSC coordinates
_EOF_
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 turTru1.ensGene.ra
    # This did not pass the check during load of coverage of
    #	peptides vs. gene counts
    ssh hgwdev
    cd /hive/data/genomes/turTru1/bed/ensGene.51
    featureBits turTru1 ensGene
    # 10421083 bases of 2298444090 (0.453%) in intersection

    # *** All done! (through the 'makeDoc' step)
    # *** Steps were performed in /hive/data/genomes/turTru1/bed/ensGene.51

############################################################################
# SWAP mm10 lastz (DONE - 2012-03-19 - Hiram)
    # original alignment to mm10
    cat /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16/fb.mm10.chainTurTru1Link.txt
    #	762961671 bases of 2652783500 (28.761%) in intersection

    # and this swap
    mkdir /hive/data/genomes/turTru1/bed/blastz.mm10.swap
    cd /hive/data/genomes/turTru1/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    79m38.703s
    cat fb.turTru1.chainMm10Link.txt
    #	744359707 bases of 2298444090 (32.385%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/turTru1/bed
    ln -s blastz.mm10.swap lastz.mm10

##############################################################################