# for emacs: -*- mode: sh; -*- # rock hyrax ( Procavia capensis ) # http://www.ncbi.nlm.nih.gov/genome/2488 # http://www.ncbi.nlm.nih.gov/bioproject/13972 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRQ00 ######################################################################### # DOWNLOAD SEQUENCE (DONE braney 2008-10-07) ssh kkstore05 mkdir /cluster/store12/proCap1 ln -s /cluster/store12/proCap1 /cluster/data mkdir /cluster/data/proCap1/broad cd /cluster/data/proCap1/broad wget --timestamping \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/Procap1.0/assembly.agp \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/Procap1.0/assembly.bases.gz \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/Procap1.0/assembly.quals.gz md5sum ass* > assembly.md5sum qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin proCap1.qual.qac cut -f 1 assembly.agp | uniq -c | wc -l # Number of scaffolds: 295006 ######################################################################### # Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07) ssh kolossus cd /cluster/data/proCap1 cat << _EOF_ >proCap1.config.ra # Config parameters for makeGenomeDb.pl: db proCap1 clade mammal genomeCladePriority 35 scientificName Procavia capensis commonName Rock hyrax assemblyDate Jul. 2008 assemblyLabel Broad Institute proCap1 orderKey 236.5 #mitoAcc AJ222767 mitoAcc none fastaFiles /cluster/data/proCap1/broad/assembly.bases.gz agpFiles /cluster/data/proCap1/broad/assembly.agp qualFiles /cluster/data/proCap1/broad/proCap1.qual.qac dbDbSpeciesDir rockHyrax _EOF_ makeGenomeDb.pl -workhorse kolossus proCap1.config.ra > makeGenomeDb.out 2>&1 & cut -f 2 chrom.sizes | ave stdin # Q1 1549.000000 # median 4249.000000 # Q3 11283.000000 # average 10119.316214 # min 315.000000 # max 390036.000000 # count 295006 # total 2985258999.000000 # standard deviation 16728.518926 ######################################################################### ## Repeat Masker (DONE - 2008-10-16 - Hiram) screen # to manage this several day job mkdir /hive/data/genomes/proCap1/bed/repeatMasker cd /hive/data/genomes/proCap1/bed/repeatMasker time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` proCap1 > do.log 2>&1 & # real 867m49.502s twoBitToFa proCap1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 2985258999 bases (577411318 N's 2407847681 real 1731549681 upper 676298000 # lower) in 295006 sequences in 1 files # %22.65 masked total, %28.09 masked rea ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-10-16 - Hiram) screen # use a screen to manage this job mkdir /cluster/data/proCap1/bed/simpleRepeat cd /cluster/data/proCap1/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/proCap1/bed/simpleRepeat proCap1 > do.log 2>&1 & # real 67m53.007s # batch failed on pk, finish it manually # this was an unusual error, when TrfRun.csh was splitting up the .lft # file, it ended up with a single file in the last split, and that # single file had no results for it, so the lift of that non-existent # fail failed. Temporarily changed, in TrfRun.csh: # split -l 500 $inLft SplitLft. # to be: # split -l 400 $inLft SplitLft. # and ran TrfRun.csh manually: time ./TrfRun.csh /san/sanvol1/scratch/proCap1/TrfPart/039/039.lst.bed # now, continuing: time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -continue=filter \ -buildDir=/cluster/data/proCap1/bed/simpleRepeat proCap1 \ > filter.log 2>&1 & cat fb.simpleRepeat # 24599087 bases of 2407847681 (1.022%) in intersection # after RM run is done, add this mask: cd /hive/data/genomes/proCap1 rm proCap1.2bit twoBitMask proCap1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed proCap1.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa proCap1.2bit stdout | faSize stdin > proCap1.2bit.faSize.txt # 2985258999 bases (577411318 N's 2407847681 real 1731119427 upper 676728254 # lower) in 295006 sequences in 1 files # %22.67 masked total, %28.11 masked rea # link to gbdb ln -s `pwd`/proCap1.2bit /gbdb/proCap1 ########################################################################### # prepare for kluster runs (DONE 2008-10-16 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # proCap1: 2407847681 # thus: 1024 * 2407847681/2881421696 = 855 # rounding up to 900 to be conservative cd /hive/data/genomes/proCap1 time blat proCap1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=proCap1.11.ooc -repMatch=900 # Wrote 30662 overused 11-mers to proCap1.11.ooc # real 2m12.136s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/proCap1 cp -p proCap1.2bit chrom.sizes proCap1.11.ooc \ /hive/data/staging/data/proCap1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="Broad Institute proCap1 (NCBI project 13972, ABRQ01000000)" where name="proCap1";' hgcentraltest ############################################################################ # proCap1 - Rock hyrax - Ensembl Genes version 51 (DONE - 2008-12-02 - # hiram) ssh hgwdev cd /hive/data/genomes/proCap1 cat << '_EOF_' > proCap1.ensGene.ra # required db variable db proCap1 # do we need to translate geneScaffold coordinates geneScaffolds yes # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the two genes that have invalid structures from Ensembl: # 4595: ENSPCAT00000007286 no exonFrame on CDS exon 1 # 28894: ENSPCAT00000000699 no exonFrame on CDS exon 4 '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 proCap1.ensGene.ra ssh hgwdev cd /hive/data/genomes/proCap1/bed/ensGene.51 featureBits proCap1 ensGene # 25217399 bases of 2407847681 (1.047%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/proCap1/bed/ensGene.51 ########################################################################### # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/proCap1/bed/cpgIslands cd /hive/data/genomes/proCap1/bed/cpgIslands time doCpgIslands.pl proCap1 > do.log 2>&1 # real 884m46.543s # fixing broken command in script: time ./doLoadCpg.csh # real 3m6.649s time doCpgIslands.pl -continue=cleanup proCap1 > cleanup.log 2>&1 # real 105m41.557s cat fb.proCap1.cpgIslandExt.txt # 13143269 bases of 2407847681 (0.546%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/proCap1/bed/genscan cd /hive/data/genomes/proCap1/bed/genscan time doGenscan.pl proCap1 > do.log 2>&1 # recovering from power failure, kluster run had just finished # and it had just started on makeBed: time ./doMakeBed.csh # real 483m14.978s time doGenscan.pl -continue=load proCap1 > load.log 2>&1 # real 39m19.361s cat fb.proCap1.genscan.txt # 66421419 bases of 2407847681 (2.759%) in intersection cat fb.proCap1.genscanSubopt.txt # 74279950 bases of 2407847681 (3.085%) in intersection ######################################################################### # windowMasker - (DONE - 2012-05-02 - Hiram) screen -S proCap1 mkdir /hive/data/genomes/proCap1/bed/windowMasker cd /hive/data/genomes/proCap1/bed/windowMasker # trying out new version of the script that does all the usual steps # that used to be performed manually after it was done time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev proCap1 > do.log 2>&1 # real 1091m9.829s # fixing a broken command in the doLoad step: time ./lastLoad.csh # real 11m17.751s time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \ -dbHost=hgwdev proCap1 > cleanup.log 2>&1 # real 2m21.973s sed -e 's/^/ #\t/' fb.proCap1.windowmaskerSdust.beforeClean.txt # 1550951798 bases of 2985258999 (51.954%) in intersection sed -e 's/^/ #\t/' fb.proCap1.windowmaskerSdust.clean.txt # 973540480 bases of 2985258999 (32.612%) in intersection sed -e 's/^/ #\t/' fb.proCap1.rmsk.windowmaskerSdust.txt # 389997236 bases of 2985258999 (13.064%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2407847681 / 2897316137 \) \* 1024 # ( 2407847681 / 2897316137 ) * 1024 = 851.006900 # round up to 900 cd /hive/data/genomes/proCap1 time blat proCap1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/proCap1.11.ooc -repMatch=900 # Wrote 30662 overused 11-mers to jkStuff/proCap1.11.ooc # real 0m50.748s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" proCap1 | sort | uniq -c # 552392 yes # cd /hive/data/genomes/proCap1/jkStuff # gapToLift proCap1 proCap1.nonBridged.lift -bedFile=proCap1.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' proCap1.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Procavia capensis not found on the list # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add proCap1 just after ce2 # proCap1 (Rock hyrax) proCap1.serverGenome = /hive/data/genomes/proCap1/proCap1.2bit proCap1.clusterGenome = /hive/data/genomes/proCap1/proCap1.2bit proCap1.ooc = /hive/data/genomes/proCap1/jkStuff/proCap1.11.ooc proCap1.lift = no proCap1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} proCap1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} proCap1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} proCap1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} proCap1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} proCap1.refseq.mrna.native.load = no proCap1.refseq.mrna.xeno.load = yes proCap1.genbank.mrna.xeno.load = yes proCap1.genbank.est.native.load = no proCap1.downloadDir = proCap1 proCap1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding proCap1 Rock hyrax" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for proCapNames" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S proCap1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial proCap1 & # var/build/logs/2012.06.08-09:53:54.proCap1.initalign.log # real 2549m58.923s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad proCap1 & # logFile: var/dbload/hgwdev/logs/2012.06.11-13:27:43.dbload.log # real 8m41.825s # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add proCap1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added proCap1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-24 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold_15515:19471-24894" where name="proCap1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-24 - Hiram) mkdir /hive/data/genomes/proCap1/pushQ cd /hive/data/genomes/proCap1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl proCap1 2> stderr.txt | grep -v transMap > proCap1.sql # real 3m42.061s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/proCap1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/proCap1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/proCap1/bbi/quality.bw # WARNING: proCap1 does not have seq # WARNING: proCap1 does not have extFile # WARNING: proCap1 does not have estOrientInfo scp -p proCap1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/proCap1.sql" ############################################################################