# for emacs: -*- mode: sh; -*-

# Caenorhabditis briggsae
#	Washington University School of Medicine GSC and Sanger Institute
#
#	It isn't clear which strain this sequence belongs to:
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CAAC02
#
# This file is a build log: each section lists the commands that were run
# and, in comments, the output or timing that was observed.  Recorded
# output is always commented out so that it can never be mistaken for a
# command.

###########################################################################
## Download sequence (DONE - 2011-05-24 - Hiram)
    mkdir /hive/data/genomes/cb4
    cd /hive/data/genomes/cb4
    mkdir ws225
    cd ws225
    wget --no-parent --timestamping -m -nH --cut-dirs=5 \
        ftp://ftp.sanger.ac.uk/pub/wormbase/WS225/genomes/c_briggsae/

    # only one chr name needs to change for UCSC names: chrun -> chrUn
    zcat sequences/dna/c_briggsae.WS225.dna.fa.gz \
        | sed -e "s/chrun/chrUn/" | gzip -c > ws225.cb4.fa.gz
    sed -e "s/chrun/chrUn/g" sequences/dna/c_briggsae.WS225.agp > ws225.cb4.agp

    # this WS225 sequence:
    faCount sequences/dna/c_briggsae.WS225.dna.fa.gz
#seq	len	A	C	G	T	N	cpg
# chrI	15455979	4674815	2814496	2823698	4685612	457358	515219
# chrI_random	43675	13205	8196	7334	13918	1022	1189
# chrII	16627154	5014506	3007279	2999594	5039234	566541	523789
# chrIII	14578851	4425093	2660000	2650645	4435169	407944	476040
# chrIII_random	105344	30820	18762	18708	30552	6502	3373
# chrIV	17485439	5341252	3129458	3132848	5340787	541094	549865
# chrIV_random	25117	7259	4832	4883	6642	1501	1006
# chrV	19495157	5944973	3554042	3563651	5952440	480051	614609
# chrV_random	85292	26967	16100	15801	25572	852	2433
# chrX	21540570	6597031	3953736	3955488	6613637	420678	679173
# chrX_random	28673	9151	5734	5452	8335	1	972
# chrun	2948414	899162	512035	515443	902192	119582	87954
# total	108419665	32984234	19684670	19693545	33054090	3003126	3455622

    # verify you get the same in ws225.cb4.fa.gz
    # (the only expected difference is the chrun -> chrUn rename done above)
    faCount ws225.cb4.fa.gz

###########################################################################
## Initial sequence (DONE - 2011-05-24 - Hiram)
    cd /hive/data/genomes/cb4
    # The quoted delimiter keeps the body literal (no expansion).  Under
    # sh/bash the closing line must be the bare word _EOF_: quote removal is
    # applied to the delimiter, so a closing line of '_EOF_' (with quotes)
    # would never match and the here-document would swallow everything below.
    # NOTE(review): the line breaks inside this config were reconstructed as
    # one setting per line -- confirm against the cb4.config.ra on disk.
    # NOTE(review): agpFiles uses /cluster/data/cb4 while fastaFiles uses
    # /hive/data/genomes/cb4 -- presumably the same directory via a symlink;
    # paths are left exactly as recorded.
    cat << '_EOF_' > cb4.config.ra
# Config parameters for makeGenomeDb.pl:
db cb4
# clade worm
# genomeCladePriority 10
scientificName Caenorhabditis briggsae
commonName C. briggsae
assemblyDate Oct. 2010
assemblyShortLabel WS225
assemblyLabel Washington University School of Medicine GSC and Sanger Institute cb4
orderKey 867
# used to be: AC186293 == 95102164
mitoAcc NC_009885.1
fastaFiles /hive/data/genomes/cb4/ws225/ws225.cb4.fa.gz
agpFiles /cluster/data/cb4/ws225/ws225.cb4.agp
# qualFiles /dev/null
dbDbSpeciesDir worm
taxId 6238
_EOF_
    # << happy emacs

    mkdir jkStuff
    # run just to AGP to make sure things are sane first
    time nice -n +19 makeGenomeDb.pl cb4.config.ra -stop=agp \
        > jkStuff/makeGenomeDb.agp.log 2>&1
    # real 0m21.703s
    # check that log to verify it has no errors

    # now, continuing to make the Db and all
    time nice -n +19 makeGenomeDb.pl cb4.config.ra -continue=db \
        > jkStuff/makeGenomeDb.db.log 2>&1
    # real 1m3.490s
    # take the trackDb business there and check it into the source tree
    # fixup the description, gap and gold html page descriptions

###########################################################################
## RepeatMasker (DONE - 2011-05-24 - Hiram)
    mkdir /hive/data/genomes/cb4/bed/repeatMasker
    cd /hive/data/genomes/cb4/bed/repeatMasker
    # started in the background; let it finish before reading its results
    time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=swarm \
        -buildDir="$(pwd)" cb4 > do.log 2>&1 &
    # real 24m19.647s

    # from the do.log:
    # RepeatMasker version development-$Id: RepeatMasker,v
    #   1.25 2010/09/08 21:32:26 angie Exp $
    # CC RELEASE 20090604;

    cat faSize.rmsk.txt
    # 108434085 bases (3003126 N's 105430959 real 84307872 upper 21123087 lower)
    #   in 13 sequences in 1 files
    # %19.48 masked total, %20.03 masked real

###########################################################################
## Simple Repeats (DONE - 2011-05-24 - Hiram)
    # NOTE(review): this section works under /cluster/data/cb4 while the
    # others use /hive/data/genomes/cb4 -- presumably the same directory via
    # a symlink; paths are left exactly as recorded.
    mkdir /cluster/data/cb4/bed/simpleRepeat
    cd /cluster/data/cb4/bed/simpleRepeat
    # started in the background; let it finish before reading its results
    time nice -n +19 doSimpleRepeat.pl -smallClusterHub=memk \
        -workhorse=hgwdev -buildDir="$(pwd)" cb4 > do.log 2>&1 &
    # real 12m38.324s

    cat fb.simpleRepeat
    # 4521985 bases of 108371485 (4.173%) in intersection
###########################################################################
## WindowMasker (DONE - 2011-05-24 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/cb4/bed/windowMasker
    cd /hive/data/genomes/cb4/bed/windowMasker
    # started in the background; let it finish before reading its results
    time nice -n +19 doWindowMasker.pl -verbose=2 -buildDir="$(pwd)" \
        -workhorse=hgwdev cb4 > do.log 2>&1 &
    # real 3m39.332s

    twoBitToFa cb4.wmsk.sdust.2bit stdout | faSize stdin
    # 108434085 bases (3003126 N's 105430959 real 67017090 upper 38413869 lower)
    #   in 13 sequences in 1 files
    # %35.43 masked total, %36.44 masked real

    # load this initial data to get ready to clean it
    cd /hive/data/genomes/cb4/bed/windowMasker
    hgLoadBed cb4 windowmaskerSdust windowmasker.sdust.bed.gz
    # Loaded 819950 elements of size 3
    featureBits -countGaps cb4 windowmaskerSdust
    # 41415268 bases of 108434085 (38.194%) in intersection

    # eliminate the gaps from the masking
    featureBits cb4 -not gap -bed=notGap.bed
    # 108371485 bases of 108371485 (100.000%) in intersection
    time nice -n +19 featureBits cb4 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    # 41352668 bases of 108371485 (38.158%) in intersection

    # reload track to get it clean
    hgLoadBed cb4 windowmaskerSdust cleanWMask.bed.gz
    # Loaded 820028 elements of size 4
    featureBits -countGaps cb4 windowmaskerSdust
    # 41352668 bases of 108434085 (38.136%) in intersection

    # mask the sequence with this clean mask
    zcat cleanWMask.bed.gz \
        | twoBitMask ../../cb4.unmasked.2bit stdin \
            -type=.bed cb4.cleanWMSdust.2bit
    twoBitToFa cb4.cleanWMSdust.2bit stdout | faSize stdin \
        > cb4.cleanWMSdust.faSize.txt
    cat cb4.cleanWMSdust.faSize.txt
    # 108434085 bases (3003126 N's 105430959 real 67017090 upper 38413869 lower)
    #   in 13 sequences in 1 files
    # %35.43 masked total, %36.44 masked real

########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2011-05-24 - Hiram)
    cd /hive/data/genomes/cb4
    twoBitMask -add bed/windowMasker/cb4.cleanWMSdust.2bit \
        bed/simpleRepeat/trfMask.bed cb4.2bit
    # safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa cb4.2bit stdout | faSize stdin > faSize.cb4.txt
    cat faSize.cb4.txt
    # 108434085 bases (3003126 N's 105430959 real 66928053 upper 38502906 lower)
    #   in 13 sequences in 1 files
    # %35.51 masked total, %36.52 masked real

    # create symlink to gbdb
    # The link target is spelled out instead of using `pwd`: right after
    # "ssh hgwdev" the working directory is no longer the build directory
    # where cb4.2bit was just written.
    ssh hgwdev
    rm /gbdb/cb4/cb4.2bit
    ln -s /hive/data/genomes/cb4/cb4.2bit /gbdb/cb4/cb4.2bit

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2011-05-24 - Hiram)
    # Use -repMatch=100 (based on size -- for human we use 1024, and
    # C. briggsae size is ~3.4% of human judging by gapless cb4 vs. hg18
    # genome sizes from featureBits).  So we would use 34, but that yields a
    # very high number of tiles to ignore, especially for a small more compact
    # genome.  Bump that up a bit to be more conservative.
    cd /cluster/data/cb4
    blat cb4.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=jkStuff/cb4.11.ooc -repMatch=100
    # Wrote 8141 overused 11-mers to jkStuff/cb4.11.ooc

    # lift file for the genbank alignment step (cb4.lift in genbank.conf)
    cd jkStuff
    gapToLift -verbose=2 cb4 cb4.nonBridged.lift -bedFile=cb4.nonBridged.bed
    # chrom count: 13
    # found 626 gaps of size >= 0
    # bed output requested to cb4.nonBridged.bed
    # no gaps on chrom: chrX_random, size: 28673
    # no gaps on chrom: chrIV_random, size: 25117
    # no gaps on chrom: chrV_random, size: 85292
    # no gaps on chrom: chrM, size: 14420
    cd ..

    # stage the files the cluster jobs need; the lift file lives in jkStuff/
    # (it was created there by gapToLift just above)
    mkdir /hive/data/staging/data/cb4
    cp -p chrom.sizes cb4.2bit jkStuff/cb4.11.ooc jkStuff/cb4.nonBridged.lift \
        /hive/data/staging/data/cb4

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2011-05-24,25 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add cb4 just before ce6
    # (the lines below are genbank.conf content, not shell commands)

# cb4 (C. briggsae)
cb4.serverGenome = /hive/data/genomes/cb4/cb4.2bit
cb4.clusterGenome = /scratch/data/cb4/cb4.2bit
cb4.ooc = /scratch/data/cb4/cb4.11.ooc
cb4.lift = /hive/data/genomes/cb4/jkStuff/cb4.nonBridged.lift
cb4.align.unplacedChroms = chrUn,chr*_random
cb4.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
cb4.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
cb4.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
cb4.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
cb4.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
cb4.refseq.mrna.native.load = no
cb4.refseq.mrna.xeno.load = yes
cb4.genbank.mrna.xeno.load = yes
cb4.downloadDir = cb4

    # NOTE(review): the message says "C. elegans" although cb4 is
    # C. briggsae; kept as recorded because it documents the commit as made.
    git commit -m "Added cb4 C. elegans WS225" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    screen	# use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial cb4 &
    # logFile: var/build/logs/2011.05.25-10:55:02.cb4.initalign.log
    # real 291m13.960s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad cb4
    # logFile: var/dbload/hgwdev/logs/2011.05.25-15:46:59.dbload.log
    # real 22m27.349s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add cb4 to:
    #   etc/align.dbs etc/hgwdev.dbs
    # NOTE(review): the message says "WS220" although this build is WS225;
    # kept as recorded because it documents the commit as made.
    git commit -m "Added cb4 - C. briggsae WS220" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# lastz ce10 to cb4 swap (DONE - 2011-06-07 - Hiram)
    # the original alignment on ce10
    cd /hive/data/genomes/ce10/bed/lastzCb4.2011-06-07
    cat fb.ce10.chainCb4Link.txt
    # 39499136 bases of 100286070 (39.386%) in intersection

    mkdir /hive/data/genomes/cb4/bed/blastz.ce10.swap
    cd /hive/data/genomes/cb4/bed/blastz.ce10.swap
    # started in the background; let it finish before reading its results
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/ce10/bed/lastzCb4.2011-06-07/DEF \
        -syntenicNet -workhorse=hgwdev -bigClusterHub=swarm \
        -smallClusterHub=encodek -swap > swap.log 2>&1 &
    # real 2m17.430s
    cat fb.cb4.chainCe10Link.txt
    # 39219709 bases of 108371485 (36.190%) in intersection

############################################################################
# Constructing Downloads (DONE - 2011-06-10 - Hiram)
    cd /hive/data/genomes/cb4
    time makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 cb4 \
        > downloads.log 2>&1
    # real 1m33.231s
    # fixup the README files constructed in goldenPath/*/README.txt

    # add window masker bed file:
    cp -p bed/windowMasker/cleanWMask.bed.gz \
        goldenPath/bigZips/chromWMSdust.bed.gz

############################################################################