######################################################################### # Erinaceus europaeus -- European Hedgehog # Broad eriEur1 2X release # http://www.ncbi.nlm.nih.gov/genome/227 # http://www.ncbi.nlm.nih.gov/bioproject/74585 - Illumina # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AANN00 - the 2X # http://www.ncbi.nlm.nih.gov/bioproject/11862 - chrMt ######################################################################### # Download sequence (2006-03-04 kate) ssh kkstore05 mkdir -p /cluster/store12/eriEur1 ln -s /cluster/store12/eriEur1 /cluster/data/eriEur1 cd /cluster/data/eriEur1 mkdir downloads cd downloads wget -nd -r ftp://ftp.broad.mit.edu/pub/assemblies/mammals/hedgehog/eriEur1 ######################################################################### # Create .ra file and run makeGenomeDb.pl ssh hgwdev cd /cluster/data/eriEur1 cat << '_EOF_' >eriEur1.config.ra # Config parameters for makeGenomeDb.pl: db eriEur1 clade mammal genomeCladePriority 37 scientificName Erinaceus europaeus commonName hedgehog assemblyDate June 2006 assemblyLabel Broad Institute eriEur1 (Draft_v1) orderKey 200 mitoAcc 5835792 fastaFiles /cluster/data/eriEur1/downloads/assembly.bases.gz agpFiles /cluster/data/eriEur1/downloads/assembly.agp qualFiles /cluster/data/eriEur1/downloads/scaffold.lifted.qac dbDbSpeciesDir hedgehog '_EOF_' makeGenomeDb.pl -verbose=2 -stop=seq eriEur1.config.ra > makeGenomeDb.out & # Need dbDb entry for name lookup hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("eriEur1", "June 2006", "/gbdb/eriEur1", "Hedgehog", \ "", 0, 100, "Hedgehog", \ "Erinaceus europaeus", "", 0, 0, \ "Broad Institute eriEur1 (Draft_v1)")' -h localhost hgcentraltest ################################################ ## WINDOWMASKER (2006-03-04 kate) ssh kkstore05 cd /cluster/data/eriEur1/bed/ nice /cluster/bin/scripts/doWindowMasker.pl \ -workhorse=kolossus 
eriEur1 >& wmRun.log & ln -s WindowMasker.2007-03-05 WMRun mv wmRun.log WMRun cd WMRun # upper-case n's left by WM (request to Andy to fix this) twoBitToFa eriEur1.wmsk.sdust.2bit stdout | tr n N | \ faToTwoBit stdin /cluster/data/eriEur1/eriEur1.2bit # stats on masking cd /cluster/data/eriEur1 twoBitToFa eriEur1.2bit stdout | nice faSize stdin >& faSize.log & #3367787358 bases (1234652522 N's 2133134836 real 1047679039 upper 1085455797 lower) in 379802 sequences in 1 files #Total size: mean 8867.2 sd 18005.2 min 201 (scaffold_126826) max 555390 (scaffold_371434) median 2901 # 50.6% masked # Mouse and rat are ~43%, from RM. TRF gives them an extra 2%, skip here mkdir -p /san/sanvol1/scratch/eriEur1 cp -p eriEur1.2bit chrom.sizes /san/sanvol1/scratch/eriEur1 # load this table after creating a db (DONE - 2008-10-21 - Hiram) cd /hive/data/genomes/eriEur1/bed/WindowMasker.2007-03-05 hgLoadBed eriEur1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 10735601 elements of size 3 # real 3m25.230s ################################################ # DOWNLOADS (2007-06-05 kate) ssh kkstore05 cd /cluster/data/eriEur1 mkdir bigZips cd bigZips nice twoBitToFa ../eriEur1.2bit eriEur1.fa cp ../downloads/assembly.agp eriEur1.agp nice gzip eriEur1.fa eriEur1.agp md5sum *.gz > md5sum.txt ssh hgwdev set d = /usr/local/apache/htdocs/goldenPath set bd = /cluster/data/eriEur1 cp $d/sorAra1/bigZips/README.txt $bd/bigZips # edit mkdir -p $d/eriEur1/bigZips ln -s $bd/bigZips/{*.gz,md5sum.txt,README.txt} $d/eriEur1/bigZips ############################################################################## ## Repeat Masker (DONE - 2008-10-21 - Hiram) screen # to manage this several day job mkdir /hive/data/genomes/eriEur1/bed/repeatMasker cd /hive/data/genomes/eriEur1/bed/repeatMasker time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` eriEur1 > do.log 2>&1 & # real 307m30.737s twoBitToFa eriEur1.rmsk.2bit stdout | faSize stdin > 
faSize.rmsk.txt # 3367787358 bases (1234652522 N's 2133134836 real 1749034540 upper 384100296 # lower) in 379802 sequences in 1 files # %11.41 masked total, %18.01 masked real ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-10-21 - Hiram) screen # use a screen to manage this job mkdir /hive/data/genomes/eriEur1/bed/simpleRepeat cd /hive/data/genomes/eriEur1/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/eriEur1/bed/simpleRepeat eriEur1 > do.log 2>&1 & # real 74m17.044s # failed during load due to no eriEur1 database present, add that # (see below) and run the doLoad.csh manually: cd /hive/data/genomes/eriEur1/bed/simpleRepeat time ./doLoad.csh # real 2m34.038s cat fb.simpleRepeat # 47016125 bases of 2133134836 (2.204%) in intersection # after RM run is done, add this mask: cd /hive/data/genomes/eriEur1 # do NOT disturb the existing WindowMasked sequence twoBitMask eriEur1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed \ eriEur1.rmTrf.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa eriEur1.rmTrf.2bit stdout | faSize stdin \ > eriEur1.rmTrf.faSize.txt # 3367787358 bases (1234652522 N's 2133134836 real 1747981326 upper 385153510 # lower) in 379802 sequences in 1 files # %11.44 masked total, %18.06 masked real # link to gbdb # do NOT replace the existing WindowMasked sequence # ln -s `pwd`/eriEur1.2bit /gbdb/eriEur1 ######################################################################### # creating DB to make it easier to work with this (DONE - 2008-10-20 - Hiram) cd /hive/data/genomes/eriEur1/downloads qaToQac assembly.quals.gz stdout \ | qacAgpLift assembly.agp stdin eriEur1.quals.qac cd /hive/data/genomes/eriEur1 # edit eriEur1.config.ra to set: # qualFiles /cluster/data/eriEur1/downloads/eriEur1.quals.qac makeGenomeDb.pl -workhorse=hgwdev -continue=agp -stop=agp \ eriEur1.config.ra > makeAgp.log 2>&1 time makeGenomeDb.pl
-workhorse=hgwdev -continue=db -stop=db \ eriEur1.config.ra > makeDb.log 2>&1 # real 32m3.304s makeGenomeDb.pl -workhorse=hgwdev -continue=dbDb -stop=dbDb \ eriEur1.config.ra > makeDbDb.log 2>&1 ######################################################################### # prepare for kluster runs (DONE - 2008-10-21 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # eriEur1: 2133134836 # thus: 1024 * 2133134836/2881421696 = 758 # rounding up to 800 for a bit more conservative masking cd /hive/data/genomes/eriEur1 time blat eriEur1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=eriEur1.11.ooc -repMatch=800 # Wrote 43503 overused 11-mers to eriEur1.11.ooc # real 2m23.915s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/eriEur1 cp -p eriEur1.2bit chrom.sizes eriEur1.11.ooc \ /hive/data/staging/data/eriEur1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # add NCBI identifiers to the dbDb (DONE - 2008-11-17 - Hiram) hgsql -e 'update dbDb set sourceName="Broad Institute eriEur1 (Draft_v1, NCBI project 12575)" where name="eriEur1";' hgcentraltest hgsql -e 'update dbDb set organism="Hedgehog" where name="eriEur1";' hgcentraltest ############################################################################ # eriEur1 - Hedgehog - Ensembl Genes version 51 (DONE - 2008-12-04 - hiram) ssh kkr14u08 cd /hive/data/genomes/eriEur1 cat << '_EOF_' > eriEur1.ensGene.ra # required db variable db eriEur1 # do we need to translate geneScaffold coordinates geneScaffolds yes # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the three genes that have invalid structures from Ensembl: # 4691: ENSEEUT00000004188 no exonFrame on CDS exon 7 # 35795: ENSEEUT00000003156 no exonFrame on CDS exon 4 # 40908: ENSEEUT00000001064 no exonFrame on CDS exon
2 '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 eriEur1.ensGene.ra ssh hgwdev cd /hive/data/genomes/eriEur1/bed/ensGene.51 featureBits eriEur1 ensGene # 21932124 bases of 2133134836 (1.028%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/eriEur1/bed/ensGene.51 ############################################################################ # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/eriEur1/bed/cpgIslands cd /hive/data/genomes/eriEur1/bed/cpgIslands time doCpgIslands.pl eriEur1 > do.log 2>&1 # fixing broken script, manually: time ./doLoadCpg.csh # real 2m21.892s # continuing time doCpgIslands.pl -continue=cleanup eriEur1 > cleanup.log 2>&1 # real 25m45.187s cat fb.eriEur1.cpgIslandExt.txt # 8392822 bases of 2133134836 (0.393%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/eriEur1/bed/genscan cd /hive/data/genomes/eriEur1/bed/genscan time doGenscan.pl eriEur1 > do.log 2>&1 # real 852m24.551s cat fb.eriEur1.genscan.txt # 23586367 bases of 2133134836 (1.106%) in intersection cat fb.eriEur1.genscanSubopt.txt # 20779803 bases of 2133134836 (0.974%) in intersection ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Erinaceus europaeus 10 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add eriEur1 just after ce2 # eriEur1 (European Hedgehog) eriEur1.serverGenome = /hive/data/genomes/eriEur1/eriEur1.2bit eriEur1.clusterGenome = /scratch/data/eriEur1/eriEur1.2bit eriEur1.ooc = /scratch/data/eriEur1/eriEur1.11.ooc eriEur1.lift = no 
eriEur1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} eriEur1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} eriEur1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} eriEur1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} eriEur1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} eriEur1.refseq.mrna.native.load = no eriEur1.refseq.mrna.xeno.load = yes eriEur1.genbank.mrna.xeno.load = yes eriEur1.genbank.est.native.load = no eriEur1.downloadDir = eriEur1 eriEur1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding eriEur1 European hedgehog" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for eriEurNames" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S eriEur1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial eriEur1 & # var/build/logs/2012.05.29-09:59:15.eriEur1.initalign.log # real 8831m21.557s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad eriEur1 & # logFile: var/dbload/hgwdev/logs/2012.06.07-16:03:53.dbload.log # real 13m24.324s # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add eriEur1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added eriEur1." 
etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-20 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold_373857:3457-13237" where name="eriEur1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-20 - Hiram) mkdir /hive/data/genomes/eriEur1/pushQ cd /hive/data/genomes/eriEur1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl eriEur1 2> stderr.txt | grep -v transMap > eriEur1.sql scp -p eriEur1.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < eriEur1.sql ############################################################################