# Bos Taurus -- University of Maryland UMD Release 3.0 (Aug 25 2009)
#
#	"$Id: bosTauMd3.txt,v 1.4 2009/11/30 19:10:04 galt Exp $"
#
# creating minimal build in order to make liftover files between MD3 and Btau4
#
# The commands in this file use tcsh syntax (e.g. "> &" redirection).

###########################################################################
# set up main genome directory
ssh hgwdev
cd /hive/data/genomes
mkdir bosTauMd3
cd bosTauMd3

# DOWNLOAD SEQUENCE (DONE - 2009-11-18 - Galt)
mkdir download
cd download

# get sequence from UMD
wget --timestamping -nd \
    'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.agp'
wget --timestamping -nd \
    'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.fa.gz'

# fixup ids for agp:
# keep the downloaded file as bos_taurus.orig.agp and write a copy in which
# a leading "Chr" on each line is changed to "chr"
mv bos_taurus.agp bos_taurus.orig.agp
sed 's/^Chr/chr/' bos_taurus.orig.agp > bos_taurus.agp

# back to the main directory
cd /hive/data/genomes/bosTauMd3

# Run automation to make the basic genome
cat << '_EOF_' > bosTauMd3.config.ra
# Config parameters for makeGenomeDb.pl:
db bosTauMd3
clade mammal
scientificName Bos Taurus
assemblyDate Aug. 2009
assemblyLabel Univ. Maryland Release 3.0
orderKey 236
dbDbSpeciesDir cow
mitoAcc 60101824
commonName Cow
taxId 9913
fastaFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.fa.gz
agpFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.agp
subsetLittleIds Y
'_EOF_'
# << happy emacs

# runs in the background; stdout and stderr both go to makeGenomeDb.pl.out
time makeGenomeDb.pl bosTauMd3.config.ra > & makeGenomeDb.pl.out &
# took 11 minutes

# note: I added subsetLittleIds option to config.ra
# because I had checked the .agp which was ok,
# but the script complained that there were some extra
# sequences in the fasta file.  This option allows
# the system to ignore those, as long as everything
# in column 6 of agp is found in the fasta file(s).
featureBits -countGaps bosTauMd3 gap
#20737263 bases of 2660922743 (0.779%) in intersection

# running total of column 2 of chrom.sizes; the last line carries the genome total
awk '{sum+=$2;print sum,$0}' chrom.sizes
#2660922743
# similar total

# TODO
# Organism Image
# NOTE(review): the command recorded here fetches and installs a picture named
# Aplysia_californica.jpg, which does not match this cow assembly.  It is left
# commented out until a Bos taurus image (source URL and target file name)
# has been chosen.
#wget -O /usr/local/apache/htdocs/images/Aplysia_californica.jpg \
#    'http://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Aplysia_californica.jpg/250px-Aplysia_californica.jpg'

# TODO
# Edit and check-in templates for description.html, gold.html, gap.html, bosTauMd3/trackDb.ra

# repeat mask
# for next time, put this under the bed/ dir
mkdir repeatMasker
cd repeatMasker
time doRepeatMasker.pl bosTauMd3 -buildDir=`pwd` > & doRepeatMasker.pl.out &
cat faSize.rmsk.txt
#%47.84 masked total, %48.22 masked real
featureBits -countGaps bosTauMd3 rmsk
#1273525727 bases of 2660922743 (47.860%) in intersection

# simple repeat masker trf
cd /hive/data/genomes/bosTauMd3/bed
mkdir simpleRepeat
cd simpleRepeat
time doSimpleRepeat.pl bosTauMd3 -buildDir=`pwd` > & doSimpleRepeat.pl.out &
# failed only on chrM which doesn't matter
# perhaps someone will look into why bigTrf fails on chrM
ssh swarm
cd /hive/data/genomes/bosTauMd3/bed/simpleRepeat/run.cluster
/parasol/bin/para time > run.time
# NOTE(review): as recorded, the next command follows a cd into run.cluster,
# so `pwd` would name run.cluster rather than bed/simpleRepeat.  Presumably it
# was run from bed/simpleRepeat after leaving the swarm session -- confirm
# before re-running.
time doSimpleRepeat.pl -continue filter -buildDir `pwd` bosTauMd3 \
    >> & doSimpleRepeat.pl.out &
featureBits -countGaps bosTauMd3 simpleRepeat
#29450984 bases of 2660922743 (1.107%) in intersection

# make final masked .2bit
cd /hive/data/genomes/bosTauMd3
twoBitMask bosTauMd3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTauMd3.2bit
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
#might contain block coordinates, but this program uses only the first three
#fields
# (the entire span -- no support for blocks).
# this seems to be generic output we always see.
############################################################################
# prepare cluster data (DONE - 2009-11-20 - Galt)
ssh hgwdev
cd /hive/data/genomes/bosTauMd3

# create gbdb symlink
ln -s `pwd`/bosTauMd3.2bit /gbdb/bosTauMd3/

# write the overused 11-mer list (11.ooc) for the masked 2bit
time blat bosTauMd3.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
#Wrote 27298 overused 11-mers to 11.ooc
#151.299u 3.434s 3:06.07 83.1% 0+0k 0+0io 3pf+0w

# copy the sequence, ooc file and chrom sizes to the staging area
mkdir /hive/data/staging/data/bosTauMd3
cp -p bosTauMd3.2bit 11.ooc chrom.sizes /hive/data/staging/data/bosTauMd3
# ask admin to sync this directory: /hive/data/staging/data/bosTauMd3/
# to the kluster nodes /scratch/data/bosTauMd3/

#############################################################################
# LIFTOVER TO bosTau4 (DONE - 2009-11-23 - Galt )
mkdir /hive/data/genomes/bosTauMd3/bed/blat.bosTau4.2009-11-23
cd /hive/data/genomes/bosTauMd3/bed/blat.bosTau4.2009-11-23
time nice +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
    bosTauMd3 bosTau4 >& do.log
# it actually ran out of space on /scratch/tmp (used over 16GB temporarily)
# and I moved the temp dir somewhere else and
# finished the net step.  Then I did the load and cleanup steps.

###################################################
#Change bosTauMd3 in hgcentraltest.dbDb to active=0:
# (each statement is passed to hgsql with -e so the line can be run as-is
#  from the shell instead of being typed at the SQL prompt)
hgsql hgcentraltest -e "update dbDb set active=0 where name='bosTauMd3';"
# changed because liftover is confusing otherwise
# when people are expecting successive versions
# of the same assembly rather than alternate assemblies.
hgsql hgcentraltest -e "update dbDb set description='Aug. 2009 (UMD3)' where name='bosTauMd3';"