######################################################################### # Sorex araneus -- Common Shrew # Broad sorAra1 2X release # http://www.ncbi.nlm.nih.gov/genome/91 ######################################################################### # Download sequence (2006-03-05 kate) ssh kkstore05 mkdir -p /cluster/store12/sorAra1 ln -s /cluster/store12/sorAra1 /cluster/data/sorAra1 cd /cluster/data/sorAra1 mkdir downloads cd downloads wget -r -nd ftp://ftp.broad.mit.edu/pub/assemblies/mammals/commonShrew/sorAra1 ######################################################################### # Create .ra file and run makeGenomeDb.pl ssh hgwdev cd /cluster/data/sorAra1 cat << '_EOF_' >sorAra1.config.ra # Config parameters for makeGenomeDb.pl: db sorAra1 clade mammal genomeCladePriority 36 scientificName Sorex araneus commonName Shrew assemblyDate June 2006 assemblyLabel Broad Institute sorAra1 (Draft_v2) orderKey 100 # search Entrez nucleotide for 'sorex araneus mitochondrion complete genome, use GI #' # not found (only sorex unguiculatus) mitoAcc none fastaFiles /cluster/data/sorAra1/downloads/assembly.bases.gz agpFiles /cluster/data/sorAra1/downloads/assembly.agp qualFiles /cluster/data/sorAra1/downloads/scaffold.lifted.qac dbDbSpeciesDir shrew '_EOF_' makeGenomeDb.pl -verbose=2 -stop=seq sorAra1.config.ra >& makeGenomeDb.out & # Need dbDb entry for name lookup hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("sorAra1", "June 2006", "/gbdb/sorAra1", "Shrew", \ "", 0, 100, "Shrew", \ "Sorex araneus", "", 0, 0, \ "Broad Institute sorAra (Draft_v2)")' -h localhost hgcentraltest ################################################ ## WINDOWMASKER (2006-03-04 kate) ssh kkstore05 cd /cluster/data/sorAra1/bed/ # note: only runs on kolossus, due to network connection to NCBI # requested Andy hardcode this. 
nice /cluster/bin/scripts/doWindowMasker.pl sorAra1 \ -workhorse=kolossus >& wmRun.log & ln -s WindowMasker.2007-03-05 WMRun mv wmRun.log WMRun cd WMRun # upper-case n's left by WM (request to Andy to fix this) twoBitToFa sorAra1.wmsk.sdust.2bit stdout | tr n N | \ faToTwoBit stdin /cluster/data/sorAra1/sorAra1.2bit # stats on masking cd /cluster/data/sorAra1 twoBitToFa sorAra1.2bit stdout | nice faSize stdin >& faSize.log & # 2936119008 bases (1103254311 N's 1832864697 real 1116732269 upper 716132428 lower) in 262057 sequences in 1 files # Total size: mean 11204.1 sd 25927.2 min 203 (scaffold_107701) max 791662 (scaffold_254556) median 4789 # 39.1% masked # Mouse and rat are ~43%, from RM. TRF gives them an extra 2%, skip here mkdir -p /san/sanvol1/scratch/sorAra1 cp -p sorAra1.2bit chrom.sizes /san/sanvol1/scratch/sorAra1 # load this table after creating a db (DONE - 2008-10-21 - Hiram) cd /hive/data/genomes/sorAra1/bed/WindowMasker.2007-03-05 time hgLoadBed sorAra1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 11038904 elements of size 3 # real 3m20.424s ################################################ # DOWNLOADS (2007-06-05 kate) ssh kkstore05 cd /cluster/data/sorAra1 mkdir bigZips cd bigZips nice twoBitToFa ../sorAra1.2bit sorAra1.fa cp ../downloads/assembly.agp sorAra1.agp nice gzip sorAra1.fa sorAra1.agp md5sum *.gz > md5sum.txt ssh hgwdev set d = /usr/local/apache/htdocs/goldenPath set bd = /cluster/data/sorAra1 cp $d/felCat3/bigZips/README.txt $bd/bigZips # EDIT mkdir -p $d/sorAra1/bigZips ln -s $bd/bigZips/{*.gz,md5sum.txt,README.txt} $d/sorAra1/bigZips ############################################################################## # creating DB to make it easier to work with this (DONE - 2008-10-20 - Hiram) cd /hive/data/genomes/sorAra1/downloads qaToQac assembly.quals.gz stdout \ | qacAgpLift assembly.agp stdin sorAra1.quals.qac cd /hive/data/genomes/sorAra1 # edit sorAra1.config.ra to set: # qualFiles 
/cluster/data/sorAra1/downloads/sorAra1.quals.qac makeGenomeDb.pl -workhorse=hgwdev -continue=agp -stop=agp \ sorAra1.config.ra > makeAgp.log 2>&1 time makeGenomeDb.pl -workhorse=hgwdev -continue=db -stop=db \ sorAra1.config.ra > makeDb.log 2>&1 # real 27m48.147s time makeGenomeDb.pl -workhorse=hgwdev -continue=dbDb -stop=dbDb \ sorAra1.config.ra > makeDbDb.log 2>&1 # real 0m1.294s ############################################################################## ## Repeat Masker (DONE - 2008-10-20 - Hiram) screen # to manage this several day job mkdir /hive/data/genomes/sorAra1/bed/repeatMasker cd /hive/data/genomes/sorAra1/bed/repeatMasker time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` sorAra1 > do.log 2>&1 & # real 258m6.902s twoBitToFa sorAra1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 2936119008 bases (1103254311 N's 1832864697 real 1411267433 upper 421597264 # lower) in 262057 sequences in 1 files # %14.36 masked total, %23.00 masked real ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-10-21 - Hiram) screen # use a screen to manage this job mkdir /hive/data/genomes/sorAra1/bed/simpleRepeat cd /hive/data/genomes/sorAra1/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/sorAra1/bed/simpleRepeat sorAra1 > do.log 2>&1 & # real 140m0.001s cat fb.simpleRepeat # 32481599 bases of 1832864697 (1.772%) in intersection # after RM run is done, add this mask: cd /hive/data/genomes/sorAra1 rm sorAra1.2bit twoBitMask sorAra1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed \ sorAra1.rmTrf.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa sorAra1.rmTrf.2bit stdout \ | faSize stdin > sorAra1.rmTrf.faSize.txt # 2936119008 bases (1103254311 N's 1832864697 real 1410119486 upper 422745211 # lower) in 262057 sequences in 1 files # %14.40 masked total, %23.06 masked real # leaving 
the windowmasker masked sequence in place, do not # overwrite with this one # link to gbdb # ln -s `pwd`/sorAra1.2bit /gbdb/sorAra1 ######################################################################### # prepare for kluster runs (DONE - 2008-10-16 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # sorAra1: 1832864697 # thus: 1024 * 1832864697/2881421696 = 651 # rounding up to 700 for a bit more conservative masking cd /hive/data/genomes/sorAra1 time blat sorAra1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=sorAra1.11.ooc -repMatch=700 # Wrote 26119 overused 11-mers to sorAra1.11.ooc # real 1m57.876s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/sorAra1 cp -p sorAra1.2bit chrom.sizes sorAra1.11.ooc \ /hive/data/staging/data/sorAra1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="Broad Institute sorAra1 (NCBI project 20325, ABRP01000000)" where name="sorAra1";' hgcentraltest ############################################################################ # sorAra1 - Shrew - Ensembl Genes version 51 (DONE - 2008-12-02 - hiram) ssh kkr14u06 cd /hive/data/genomes/sorAra1 cat << '_EOF_' > sorAra1.ensGene.ra # required db variable db sorAra1 # do we need to translate geneScaffold coordinates geneScaffolds yes '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 sorAra1.ensGene.ra ssh hgwdev cd /hive/data/genomes/sorAra1/bed/ensGene.51 featureBits sorAra1 ensGene # 19400431 bases of 1832864697 (1.058%) in intersection *** All done! 
(through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/sorAra1/bed/ensGene.51 ############################################################################ # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/sorAra1/bed/cpgIslands cd /hive/data/genomes/sorAra1/bed/cpgIslands time doCpgIslands.pl sorAra1 > do.log 2>&1 # real 823m38.651s # fixing broken command in script: time ./doLoadCpg.csh # real 2m45.268s time doCpgIslands.pl -continue=cleanup sorAra1 > cleanup.log 2>&1 # real 117m41.101s cat fb.sorAra1.cpgIslandExt.txt # 19811608 bases of 1832864697 (1.081%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/sorAra1/bed/genscan cd /hive/data/genomes/sorAra1/bed/genscan time doGenscan.pl sorAra1 > do.log 2>&1 # recovering from power failure, kluster run had just finished # and it had just started on makeBed: time ./doMakeBed.csh # real 400m44.222s # continuing: time doGenscan.pl -continue=load sorAra1 > load.log 2>&1 # real 27m31.386s cat fb.sorAra1.genscan.txt # 27964827 bases of 1832864697 (1.526%) in intersection cat fb.sorAra1.genscanSubopt.txt # 28734625 bases of 1832864697 (1.568%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram) # Use -repMatch=700, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1832864697 / 2897316137 \) \* 1024 # ( 1832864697 / 2897316137 ) * 1024 = 647.790355 # round up to 700 cd /hive/data/genomes/sorAra1 time blat sorAra1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/sorAra1.11.ooc -repMatch=700 # Wrote 26119 overused 11-mers to jkStuff/sorAra1.11.ooc # real 0m45.014s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" 
sorAra1 | sort | uniq -c # 403896 yes # cd /hive/data/genomes/sorAra1/jkStuff # gapToLift sorAra1 sorAra1.nonBridged.lift -bedFile=sorAra1.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' sorAra1.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Sorex araneus 2 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add sorAra1 # sorAra1 (common shrew) sorAra1.serverGenome = /hive/data/genomes/sorAra1/sorAra1.2bit sorAra1.clusterGenome = /hive/data/genomes/sorAra1/sorAra1.2bit sorAra1.ooc = /hive/data/genomes/sorAra1/jkStuff/sorAra1.11.ooc sorAra1.lift = no sorAra1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} sorAra1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} sorAra1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} sorAra1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} sorAra1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} sorAra1.refseq.mrna.native.load = no sorAra1.refseq.mrna.xeno.load = yes sorAra1.genbank.mrna.xeno.load = yes sorAra1.genbank.est.native.load = no sorAra1.downloadDir = sorAra1 sorAra1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding sorAra1 common shrew" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. 
git commit -m "adding definition for sorAraNames" src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S sorAra1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial sorAra1 & # var/build/logs/2012.06.08-09:57:32.sorAra1.initalign.log # real 2505m42.331s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sorAra1 & # logFile: var/dbload/hgwdev/logs/2012.06.12-13:15:38.dbload.log # real 19m9.715s # enable daily alignment and update of hgwdev (DONE - 2012-06-12 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add sorAra1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added sorAra1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-26 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold_163319:8348-15207" where name="sorAra1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/sorAra1/pushQ cd /hive/data/genomes/sorAra1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl sorAra1 2> stderr.txt | grep -v transMap > sorAra1.sql # real 3m42.169s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/sorAra1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/sorAra1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/sorAra1/bbi/quality.bw # WARNING: sorAra1 does not have seq # WARNING: sorAra1 does not have extFile # WARNING: sorAra1 does not have estOrientInfo scp -p sorAra1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/sorAra1.sql" ############################################################################