# Tupaia belangeri -- 2X Tree Shrew 2X assembly from the Broad # Assembly named: tupBel1 # Sequence downloaded and masked by Robert Baertsch, 12/06 # http://www.ncbi.nlm.nih.gov/bioproject/13971 # http://www.ncbi.nlm.nih.gov/genome/473 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAPY00 ################################################ # DOWNLOADS (2007-06-05 kate) # Posted masked sequence to download site since # it is used in the 28way multiz on hg18 ssh kkstore05 cd /cluster/data/tupBel1 mkdir bigZips cd bigZips nice twoBitToFa ../tupBel1.2bit tupBel1.fa nice gzip tupBel1.fa md5sum *.gz > md5sum.txt ssh hgwdev set d = /usr/local/apache/htdocs/goldenPath set bd = /cluster/data/tupBel1 cp $d/sorAra1/bigZips/README.txt $bd/bigZips # EDIT mkdir -p $d/tupBel1/bigZips ln -s $bd/bigZips/{*.gz,md5sum.txt,README.txt} $d/tupBel1/bigZips ############################################################################ ## BLASTZ swap from mm9 alignments (2007-11-11 - markd) # ERROR: chainMm9Link failed, build not complete ssh hgwdev mkdir /cluster/data/tupBel1/bed/blastz.mm9.swap cd /cluster/data/tupBel1/bed/blastz.mm9.swap ln -s blastz.mm9.swap ../blastz.mm9 /cluster/bin/scripts/doBlastzChainNet.pl \ -swap /cluster/data/mm9/bed/blastz.tupBel1/DEF >& swap.out& # GOT ERROR: load data local infile 'link.tab' into table chainMm9Link mySQL error 1114: The table 'chainMm9Link' is full # fb.tupBel1.chainMm9Link.txt: # # fixing the broken tables: cd /hive/data/genomes/tupBel1/bed/blastz.mm9.swap/axtChain # fixups done to finish: Thu May 10 19:18:48 PDT 2012 - Hiram # this failed during the load because the chainLink table became # too large. These were loaded manually with an sql statement to # start the table definition, then a load data local infile using # the .tab files left over from the failed load. Note the extra # definitions on the chainMm9Link table # need max length of tName for tName KEY size: cut -f3 chain.tab | awk '{print length($0)}' | sort -rn | head CREATE TABLE chainMm9 ( bin smallint(5) unsigned NOT NULL default '0', score double not null, # score of chain tName varchar(255) not null, # Target sequence name tSize int unsigned not null, # Target sequence size tStart int unsigned not null, # Alignment start position in target tEnd int unsigned not null, # Alignment end position in target qName varchar(255) not null, # Query sequence name qSize int unsigned not null, # Query sequence size qStrand char(1) not null, # Query strand qStart int unsigned not null, # Alignment start position in query qEnd int unsigned not null, # Alignment end position in query id int unsigned not null, # chain id #Indices KEY tName (tName(25),bin), KEY id (id) ) TYPE=MyISAM; hgsql -e "drop table chainMm9;" tupBel1 hgsql tupBel1 < chainMm9.sql time nice -n +19 hgsql -e \ "load data local infile \"chain.tab\" into table chainMm9;" tupBel1 # real 1m2.412s CREATE TABLE chainMm9Link ( bin smallint(5) unsigned NOT NULL default 0, tName varchar(255) NOT NULL default '', tStart int(10) unsigned NOT NULL default 0, tEnd int(10) unsigned NOT NULL default 0, qStart int(10) unsigned NOT NULL default 0, chainId int(10) unsigned NOT NULL default 0, KEY tName (tName(25),bin), KEY chainId (chainId) ) ENGINE=MyISAM max_rows=160000000 avg_row_length=55 pack_keys=1 CHARSET=latin1; time nice -n +19 hgsql -e \ "load data local infile \"link.tab\" into table chainMm9Link;" tupBel1 # real 14m51.892s # finish off the nets: time netClass -verbose=0 -noAr noClass.net tupBel1 mm9 tupBel1.mm9.net # real 5m1.569s time netFilter -minGap=10 tupBel1.mm9.net \ | hgLoadNet -verbose=0 tupBel1 netMm9 stdin # real 1m49.538s cd /hive/data/genomes/tupBel1/bed/blastz.mm9.swap time featureBits tupBel1 chainMm9Link > fb.tupBel1.chainMm9Link.txt 2>&1 # real 12m53.574s cat fb.tupBel1.chainMm9Link.txt # 639266386 bases of 2137225476 (29.911%) in intersection # the DEF file was so old, it had invalid path names cp /hive/data/genomes/mm9/bed/blastz.tupBel1/DEF . # fix path name: # SEQ2_DIR=/hive/data/genomes/tupBel1/tupBel1.2bit /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \ -continue=download -swap > download.out 2>&1 # Elapsed time: 1m24s ############################################################################ ## BLASTZ swap from hg18 alignments (2007-11-11 - markd) ssh hgwdev mkdir /cluster/data/tupBel1/bed/blastz.hg18.swap cd /cluster/data/tupBel1/bed/blastz.hg18.swap ln -s blastz.hg18.swap ../blastz.hg18 /cluster/bin/scripts/doBlastzChainNet.pl \ -swap /cluster/data/hg18/bed/blastz.tupBel1/DEF >& swap.out& # crashed: could be out of memory # fb.tupBel1.chainHg18Link.txt: # ######################################################################### ## Repeat Masker (WORKING - 2008-10-14 - Hiram) screen # to manage this several day job mkdir /hive/data/genomes/tupBel1/bed/repeatMasker cd /hive/data/genomes/tupBel1/bed/repeatMasker time doRepeatMasker.pl -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` tupBel1 > do.log 2>&1 & # real 660m58.642s # failed in doMask.csh due to no tupBel1.unmasked.2bit file ? # which was there at the beginning, now it is gone ? # the broken doRepeatMasker.pl script removed it due to hive confusion # went back to the jkStuff/makeUnmasked script and re-ran part of that. # then ran the doMask.csh script to finish that, and continuing: time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -continue=install -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` tupBel1 > install.log 2>&1 & # real 23m37.870s twoBitToFa tupBel1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 3660774957 bases (1523549481 N's 2137225476 real 1701215604 upper 436009872 # lower) in 150851 sequences in 1 files # %11.91 masked total, %20.40 masked real ########################################################################### # SIMPLE REPEATS TRF (DONE - 2008-10-15 - Hiram) screen # use a screen to manage this job mkdir /cluster/data/tupBel1/bed/simpleRepeat cd /cluster/data/tupBel1/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/tupBel1/bed/simpleRepeat \ tupBel1 > do.log 2>&1 & # real 81m6.278s cat fb.simpleRepeat # 77116433 bases of 2137225476 (3.608%) in intersection # after RM run is done, add this mask: cd /cluster/data/tupBel1 rm tupBel1.2bit twoBitMask tupBel1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed tupBel1.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa tupBel1.2bit stdout | faSize stdin > tupBel.2bit.faSize.txt # 3660774957 bases (1523549481 N's 2137225476 real 1700569122 upper 436656354 # lower) in 150851 sequences in 1 files # %11.93 masked total, %20.43 masked real # link to gbdb ln -s `pwd`/tupBel1.2bit /gbdb/tupBel1 ########################################################################### # prepare for kluster runs (DONE _ 2008-10-15 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # turTru1: 2137225476 # thus: 1024 * 2137225476/2881421696 = 759 # rounding up to 800 to be conservative cd /hive/data/genomes/tupBel1 time blat tupBel1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=tupBel1.11.ooc -repMatch=800 # real 2m18.283s # Wrote 27284 overused 11-mers to tupBel1.11.ooc # and staging data for push to kluster nodes mkdir /hive/data/staging/data/tupBel1 cp -p tupBel1.2bit chrom.sizes tupBel1.11.ooc \ /hive/data/staging/data/tupBel1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ############################################################################ # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="Broad Institute tupBel1 (NCBI project 13971, AAPY01000000)" where name="tupBel1";' hgcentraltest ############################################################################ # tupBel1 - TreeShrew - Ensembl Genes version 51 (DONE - 2008-12-03 - hiram) ssh kkr14u07 cd /hive/data/genomes/tupBel1 cat << '_EOF_' > tupBel1.ensGene.ra # required db variable db tupBel1 # do we need to translate geneScaffold coordinates geneScaffolds yes # after geneScaffold conversions, change Ensembl chrom names to UCSC names liftUp /cluster/data/tupBel1/jkStuff/ensGene.lft # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the two genes that have invalid structures from Ensembl: # 2993: ENSTBET00000015831 no exonFrame on CDS exon 11 # 3556: ENSTBET00000013522 no exonFrame on CDS exon 1 '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 tupBel1.ensGene.ra ssh hgwdev cd /hive/data/genomes/tupBel1/bed/ensGene.51 featureBits tupBel1 ensGene # 22746299 bases of 2137225476 (1.064%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/tupBel1/bed/ensGene.51 ############################################################################ # SWAP lastz mm10 (DONE - 2012-03-19 - Hiram) # original alignment to mm10 cat /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16/fb.mm10.chainTupBel1Link.txt # 524337666 bases of 2652783500 (19.766%) in intersection # and this swap mkdir /hive/data/genomes/tupBel1/bed/blastz.mm10.swap cd /hive/data/genomes/tupBel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 136m7.163s cat fb.tupBel1.chainMm10Link.txt # 537379661 bases of 2137225476 (25.144%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tupBel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # cpgIslands - (DONE - 2011-04-24 - Hiram) mkdir /hive/data/genomes/tupBel1/bed/cpgIslands cd /hive/data/genomes/tupBel1/bed/cpgIslands time doCpgIslands.pl tupBel1 > do.log 2>&1 # real 296m4.437s cat fb.tupBel1.cpgIslandExt.txt # 23173299 bases of 2137225476 (1.084%) in intersection ######################################################################### # genscan - (DONE - 2011-04-27 - Hiram) mkdir /hive/data/genomes/tupBel1/bed/genscan cd /hive/data/genomes/tupBel1/bed/genscan time doGenscan.pl tupBel1 > do.log 2>&1 # real 60m14.339s # failed kluster run due to out of disk space # finished manually: # Completed: 150851 of 150851 jobs # CPU time in finished jobs: 51736s 862.26m 14.37h 0.60d 0.002 y # IO & Wait Time: 1645536s 27425.61m 457.09h 19.05d 0.052 y # Average job time: 11s 0.19m 0.00h 0.00d # Longest finished job: 92s 1.53m 0.03h 0.00d # Submission to last job: 241720s 4028.67m 67.14h 2.80d # continuing: time doGenscan.pl -workhorse=hgwdev -continue=makeBed tupBel1 \ > makeBed.log 2>&1 # real 83m4.947s cat fb.tupBel1.genscan.txt # 57907224 bases of 2137225476 (2.709%) in intersection cat fb.tupBel1.genscanSubopt.txt # 76071378 bases of 2137225476 (3.559%) in intersection ######################################################################### # windowMasker - (DONE - 2012-05-02 - Hiram) screen -S tupBel1 mkdir /hive/data/genomes/tupBel1/bed/windowMasker cd /hive/data/genomes/tupBel1/bed/windowMasker # trying out new version of the script that does all the usual steps # that used to be performed manually after it was done time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev tupBel1 > do.log 2>&1 # real 989m54.597s # fixing a broken command in the doLoad step: time ./lastLoad.csh # real 7m1.961s time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \ -dbHost=hgwdev tupBel1 > cleanup.log 2>&1 # real 2m10.270s sed -e 's/^/ #\t/' fb.tupBel1.windowmaskerSdust.beforeClean.txt # 2375760953 bases of 3660774957 (64.898%) in intersection sed -e 's/^/ #\t/' fb.tupBel1.windowmaskerSdust.clean.txt # 852211472 bases of 3660774957 (23.280%) in intersection sed -e 's/^/ #\t/' fb.tupBel1.rmsk.windowmaskerSdust.txt # 191436221 bases of 3660774957 (5.229%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-08 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2137225476 / 2897316137 \) \* 1024 # ( 2137225476 / 2897316137 ) * 1024 = 755.360749 # round up to 800 cd /hive/data/genomes/tupBel1 time blat tupBel1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/tupBel1.11.ooc -repMatch=800 # Wrote 27284 overused 11-mers to jkStuff/tupBel1.11.ooc # real 0m55.398s # there are non-bridged gaps, construct lift file needed for genbank hgsql -N -e "select bridge from gap;" tupBel1 | sort | uniq -c # 95346 no # 594809 yes cd /hive/data/genomes/tupBel1/jkStuff gapToLift tupBel1 tupBel1.nonBridged.lift -bedFile=tupBel1.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' tupBel1.nonBridged.bed | sort -nr | head # 301412 scaffold_113710.1-440055 109313 410725 scaffold_113710.1-440055.02 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Tupaia belangeri 63 2365 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add tupBel1 # tupBel1 (tree shrew) tupBel1.serverGenome = /hive/data/genomes/tupBel1/tupBel1.2bit tupBel1.clusterGenome = /scratch/data/tupBel1/tupBel1.2bit tupBel1.ooc = /scratch/data/tupBel1/tupBel1.11.ooc tupBel1.lift = /hive/data/genomes/tupBel1/jkStuff/tupBel1.nonBridged.lift tupBel1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} tupBel1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} tupBel1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} tupBel1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} tupBel1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} tupBel1.refseq.mrna.native.load = no tupBel1.refseq.mrna.xeno.load = yes tupBel1.genbank.mrna.xeno.load = yes tupBel1.genbank.est.native.load = yes tupBel1.downloadDir = tupBel1 tupBel1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding tupBel1 tree shrew" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for tupBelNames" src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S tupBel1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial tupBel1 & # var/build/logs/2012.06.08-10:03:29.tupBel1.initalign.log # real 2861m50.257s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad tupBel1 & # logFile: var/dbload/hgwdev/logs/2012.06.11-17:15:26.dbload.log # real 27m16.356s # enable daily alignment and update of hgwdev (DONE - 2012-05-12 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add tupBel1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added tupBel1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-26 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold_149294.1-299378:275786-282564" where name="tupBel1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/tupBel1/pushQ cd /hive/data/genomes/tupBel1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl tupBel1 2> stderr.txt | grep -v transMap > tupBel1.sql # real 3m48.507s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/tupBel1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/tupBel1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/tupBel1/bbi/quality.bw # WARNING: tupBel1 does not have seq # WARNING: tupBel1 does not have extFile # WARNING: tupBel1 does not have estOrientInfo scp -p tupBel1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/tupBel1.sql" ############################################################################