# for emacs: -*- mode: sh; -*- # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRO01 # Dipodomys ordii Kangaroo rat ######################################################################### # DOWNLOAD SEQUENCE (DONE braney 2008-10-07) ssh kkstore05 mkdir /cluster/store12/dipOrd1 ln -s /cluster/store12/dipOrd1 /cluster/data mkdir /cluster/data/dipOrd1/broad cd /cluster/data/dipOrd1/broad wget --timestamping \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/kangarooRat/Dipord1.0/assembly.agp \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/kangarooRat/Dipord1.0/assembly.bases.gz \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/kangarooRat/Dipord1.0/assembly.quals.gz md5sum ass* > assembly.md5sum qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin dipOrd1.qual.qac cut -f 1 assembly.agp | uniq -c | wc -l # Number of scaffolds: 210053 ######################################################################### # Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07) ssh kolossus cd /cluster/data/dipOrd1 cat << _EOF_ >dipOrd1.config.ra # Config parameters for makeGenomeDb.pl: db dipOrd1 clade mammal genomeCladePriority 35 scientificName Dipodomys ordii commonName Kangaroo rat assemblyDate Jul. 
2008 assemblyLabel Broad Institute orderKey 236.5 #mitoAcc AJ222767 mitoAcc none fastaFiles /cluster/data/dipOrd1/broad/assembly.bases.gz agpFiles /cluster/data/dipOrd1/broad/assembly.agp qualFiles /cluster/data/dipOrd1/broad/dipOrd1.qual.qac dbDbSpeciesDir kangarooRat _EOF_ # use 'screen'; make sure you are on kkstore05 makeGenomeDb.pl -workhorse kolossus dipOrd1.config.ra > makeGenomeDb.out 2>&1 & cut -f 2 chrom.sizes | ave stdin # Q1 1417.000000 # median 2509.000000 # Q3 8139.000000 # average 10275.987955 # min 554.000000 # max 11421927.000000 # count 210053 # total 2158502098.000000 # standard deviation 40860.463605 ######################################################################### ## Repeat Masker (DONE - 2008-10-14 - Hiram) screen # to manage this job mkdir /hive/data/genomes/dipOrd1/bed/repeatMasker cd /hive/data/genomes/dipOrd1/bed/repeatMasker time doRepeatMasker.pl -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` dipOrd1 > do.log 2>&1 & # real 663m46.841s # failed in doMask.csh due to no dipOrd1.unmasked.2bit file ? # which was there at the beginning, now it is gone ? # the broken doRepeatMasker.pl script removed it due to hive confusion # went back to the jkStuff/makeUnmasked script and re-ran part of that. 
# then ran the doMask.csh script to finish that, and continuing: time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -continue=install -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` dipOrd1 > install.log 2>&1 & # real 15m17.199s twoBitToFa dipOrd1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 2158502098 bases (313540677 N's 1844961421 real 1404133178 upper 440828243 # lower) in 210053 sequences in 1 files # %20.42 masked total, %23.89 masked real ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-10-17 - Hiram) screen # use a screen to manage this job mkdir /cluster/data/dipOrd1/bed/simpleRepeat cd /cluster/data/dipOrd1/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/dipOrd1/bed/simpleRepeat \ dipOrd1 > do.log 2>&1 & # real 2630m12.078s featureBits dipOrd1 simpleRepeat # 211605979 bases of 1844961421 (11.469%) in intersection # after RM run is done, add this mask: cd /cluster/data/dipOrd1 twoBitMask dipOrd1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dipOrd1.2bit twoBitToFa dipOrd1.2bit stdout | faSize stdin > dipOrd1.2bit.faSize.txt # 2158502098 bases (313540677 N's 1844961421 real 1323236442 upper 521724979 # lower) in 210053 sequences in 1 files # %24.17 masked total, %28.28 masked real # link to gbdb ln -s `pwd`/dipOrd1.2bit /gbdb/dipOrd1 ########################################################################### # prepare for kluster runs (DONE - 2008-10-17 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # dipOrd1: 1844961421 # thus: 1024 * 1844961421/2881421696 = 655 # rounding up to 700 to be a bit conservative cd /hive/data/genomes/dipOrd1 time blat dipOrd1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=dipOrd1.11.ooc -repMatch=700 # Wrote 33062 overused 11-mers to dipOrd1.11.ooc # real 1m34.993s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/dipOrd1 cp -p 
dipOrd1.2bit chrom.sizes dipOrd1.11.ooc \ /hive/data/staging/data/dipOrd1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="Broad Institute dipOrd1 (NCBI project 20385, ABRO01000000)" where name="dipOrd1";' hgcentraltest ########################################################################### # dipOrd1 - Kangaroo rat - Ensembl Genes version 51 (DONE - 2008-12-02 - # hiram) ssh hgwdev cd /hive/data/genomes/dipOrd1 cat << '_EOF_' > dipOrd1.ensGene.ra # required db variable db dipOrd1 # do we need to translate geneScaffold coordinates geneScaffolds yes # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the single gene that have invalid structures from Ensembl: # 11275: ENSDORT00000004734 no exonFrame on CDS exon 12 '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 dipOrd1.ensGene.ra ssh hgwdev cd /hive/data/genomes/dipOrd1/bed/ensGene.51 featureBits dipOrd1 ensGene # 25129085 bases of 1844961421 (1.362%) in intersection *** All done! 
(through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/dipOrd1/bed/ensGene.51 ############################################################################ # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/dipOrd1/bed/cpgIslands cd /hive/data/genomes/dipOrd1/bed/cpgIslands time doCpgIslands.pl dipOrd1 > do.log 2>&1 # real 635m52.950s # fixing broken command in script: time ./doLoadCpg.csh # real 2m18.163s time doCpgIslands.pl -continue=cleanup dipOrd1 > cleanup.log 2>&1 # real 97m27.451s cat fb.dipOrd1.cpgIslandExt.txt # 16112003 bases of 1844961421 (0.873%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/dipOrd1/bed/genscan cd /hive/data/genomes/dipOrd1/bed/genscan time doGenscan.pl dipOrd1 > do.log 2>&1 # recovering from power failure, kluster run had just finished # and it had just started on makeBed: time ./doMakeBed.csh # real 317m41.669s time doGenscan.pl -continue=load dipOrd1 > load.log 2>&1 # real 14m39.736s # failed out of inodes on hive: time doGenscan.pl -continue=cleanup dipOrd1 > cleanup.log 2>&1 # real 43m1.037s cat fb.dipOrd1.genscan.txt # 46775156 bases of 1844961421 (2.535%) in intersection cat fb.dipOrd1.genscanSubopt.txt # 46026502 bases of 1844961421 (2.495%) in intersection ######################################################################### # windowMasker - (DONE - 2012-05-02 - Hiram) mkdir /hive/data/genomes/dipOrd1/bed/windowMasker cd /hive/data/genomes/dipOrd1/bed/windowMasker # trying out new version of the script that does all the usual steps # that used to be performed manually after it was done time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev dipOrd1 > do.log 2>&1 # Elapsed time: 574m42s sed -e 's/^/ #\t/' fb.dipOrd1.windowmaskerSdust.beforeClean.txt # 1092394968 bases of 2158502098 (50.609%) in intersection sed -e 
's/^/ #\t/' fb.dipOrd1.windowmaskerSdust.clean.txt # 778854291 bases of 2158502098 (36.083%) in intersection sed -e 's/^/ #\t/' fb.dipOrd1.rmsk.windowmaskerSdust.txt # 294876541 bases of 2158502098 (13.661%) in intersection ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Dipodomys ordii - does not exist in this file # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # dipOrd1 (Kangaroo rat) dipOrd1.serverGenome = /hive/data/genomes/dipOrd1/dipOrd1.2bit dipOrd1.clusterGenome = /scratch/data/dipOrd1/dipOrd1.2bit dipOrd1.ooc = /scratch/data/dipOrd1/dipOrd1.11.ooc dipOrd1.lift = no dipOrd1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} dipOrd1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} dipOrd1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} dipOrd1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} dipOrd1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} dipOrd1.refseq.mrna.native.load = no dipOrd1.refseq.mrna.xeno.load = yes dipOrd1.genbank.mrna.xeno.load = yes dipOrd1.genbank.est.native.load = no dipOrd1.downloadDir = dipOrd1 dipOrd1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding dipOrd1 kangaroo rat" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. 
git commit -m "adding definition for dipOrdNames Dipodomys ordii" \ src/lib/gbGenome.c # Kangaroo rat - Dipodomys ordii - Jul. 2008 git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S dipOrd1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial dipOrd1 & # var/build/logs/2012.05.03-16:19:09.dipOrd1.initalign.log # real 6078m6.247s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad dipOrd1 & # logFile: var/dbload/hgwdev/logs/2012.05.08-15:08:30.dbload.log # real 38m3.733s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add dipOrd1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added dipOrd1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to OPN5 gene displays (DONE - 2012-07-20 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold_7358:2,192-33,995" where name="dipOrd1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-20 - Hiram) mkdir /hive/data/genomes/dipOrd1/pushQ cd /hive/data/genomes/dipOrd1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl dipOrd1 2> stderr.txt | grep -v transMap > dipOrd1.sql scp -p dipOrd1.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < dipOrd1.sql ############################################################################