# for emacs: -*- mode: sh; -*- # This file describes browser build for the otoGar3 # Otolemur garnettii - Bushbaby genome: March 2011 # http://www.ncbi.nlm.nih.gov/genome/assembly/249878/ # http://www.ncbi.nlm.nih.gov/genome/451 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAQR03 # Input coverage estimates: Fragments: 53x; Jumps: 78x; Fosmids: 6x; TOTAL: 137x ############################################################################# # Fetch sequence from genbank (DONE - 2012-01-06 - Hiram) mkdir -p /hive/data/genomes/otoGar3/genbank cd /hive/data/genomes/otoGar3/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Otolemur_garnettii/OtoGar3/ ./ # measure total sequence in this assembly faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2519724550 bases (160194097 N's 2359530453 real 2359530453 upper 0 lower) in # 7793 sequences in 1 files # Total size: mean 323331.8 sd 2530369.2 # min 1000 (gi|325699471|gb|AAQR03200240.1|) # max 73491278 (gi|326327706|gb|GL873520.1|) median 1593 ############################################################################# # process into UCSC naming scheme (DONE - 2012-03-12 - Hiram) cd /hive/data/genomes/otoGar3/genbank # watch out for the pattern match below: s/.*gb\|//; # depends upon what string is in the header of the fasta file cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 1m1.270s # compress these files gzip *.fa *.agp # verify nothing has changed in the sequence, should be the same as above: faSize unplaced.fa.gz # 2519724550 bases (160194097 N's 2359530453 real 2359530453 upper 0 lower) # in 7793 sequences in 1 files # Total size: mean 323331.8 sd 2530369.2 min 1000 (AAQR03200240) # max 73491278 (GL873520) median 1593 ############################################################################# # Initial database build (DONE - 2012-03-12 - Hiram) cd /hive/data/genomes/otoGar3 cat << '_EOF_' > otoGar3.config.ra # Config parameters for makeGenomeDb.pl: db otoGar3 clade mammal genomeCladePriority 16 scientificName Otolemur garnettii commonName Bushbaby assemblyDate Mar. 2011 assemblyLabel Broad Institute (GCA_000181295.3) assemblyShortLabel Broad OtoGar3 orderKey 464 mitoAcc none fastaFiles /hive/data/genomes/otoGar3/genbank/unplaced.fa.gz agpFiles /hive/data/genomes/otoGar3/genbank/unplaced.agp.gz dbDbSpeciesDir bushbaby taxId 30611 ncbiAssemblyId 249878 ncbiAssemblyName OtoGar3 # http://www.ncbi.nlm.nih.gov/genome/assembly/249878/ '_EOF_' # << happy emacs # first verify the sequence and AGP files are OK time makeGenomeDb.pl -stop=agp -workhorse=hgwdev otoGar3.config.ra \ > agp.log 2>&1 # real 2m34.742s # verify that was OK, look at the agp.log file time makeGenomeDb.pl -continue=db -workhorse=hgwdev otoGar3.config.ra \ > db.log 2>&1 # real 18m32.134s # verify that was OK, look at the do.log file # copy the trackDb business to the source tree, check it in and add # to the trackDb/makefile ############################################################################# # running repeat masker (DONE - 2012-03-12 - Hiram) mkdir /hive/data/genomes/otoGar3/bed/repeatMasker cd /hive/data/genomes/otoGar3/bed/repeatMasker # had a missing i from the name in the dbDb table, fixed that after # started with this species argument. time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -species "otolemur garnettii" \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek otoGar3 > do.log 2>&1 & # real 1015m56.518s cat faSize.rmsk.txt # 2519724550 bases (160194097 N's 2359530453 real 1427970093 upper # 931560360 lower) in 7793 sequences in 1 files # Total size: mean 323331.8 sd 2530369.2 min 1000 (AAQR03200240) # max 73491278 (GL873520) median 1593 grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker time featureBits -countGaps otoGar3 rmsk # 933290813 bases of 2519724550 (37.039%) in intersection # real 0m28.081s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-03-12 - Hiram) mkdir /hive/data/genomes/otoGar3/bed/simpleRepeat cd /hive/data/genomes/otoGar3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ otoGar3 > do.log 2>&1 & # real 11m58.743s cat fb.simpleRepeat # 27508759 bases of 2359530453 (1.166%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/otoGar3 twoBitMask otoGar3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed otoGar3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa otoGar3.2bit stdout | faSize stdin > faSize.otoGar3.2bit.txt cat faSize.otoGar3.2bit.txt # 2519724550 bases (160194097 N's 2359530453 real 1427186952 upper # 932343501 lower) in 7793 sequences in 1 files # Total size: mean 323331.8 sd 2530369.2 min 1000 (AAQR03200240) # max 73491278 (GL873520) median 1593 # %37.00 masked total, %39.51 masked real rm /gbdb/otoGar3/otoGar3.2bit ln -s `pwd`/otoGar3.2bit /gbdb/otoGar3/otoGar3.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-03-12 - Hiram) mkdir /hive/data/genomes/otoGar3/bed/gap cd /hive/data/genomes/otoGar3/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../otoGar3.unmasked.2bit > findMotif.txt 2>&1 # real real 1m7.907s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits otoGar3 -not gap -bed=notGap.bed # real 0m14.265s # 2359530453 bases of 2359530453 (100.000%) in intersection time featureBits otoGar3 allGaps.bed notGap.bed -bed=new.gaps.bed # real 174m56.543s # 0 bases of 2359530453 (0.000%) in intersection # no new gaps here hgsql -N -e "select bridge from gap;" otoGar3 | sort | uniq -c # 192447 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-03-12 - Hiram) mkdir /hive/data/genomes/otoGar3/bed/windowMasker cd /hive/data/genomes/otoGar3/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev otoGar3 > do.log 2>&1 & # about 235 minutes # Masking statistics twoBitToFa otoGar3.wmsk.2bit stdout | faSize stdin # 2519724550 bases (160194097 N's 2359530453 real 1606796836 upper # 752733617 lower) in 7793 sequences in 1 files # Total size: mean 323331.8 sd 2530369.2 min 1000 (AAQR03200240) # max 73491278 (GL873520) median 1593 # %29.87 masked total, %31.90 masked real twoBitToFa otoGar3.wmsk.sdust.2bit stdout | faSize stdin # 2519724550 bases (160194097 N's 2359530453 real 1594373851 upper # 765156602 lower) in 7793 sequences in 1 files # Total size: mean 323331.8 sd 2530369.2 min 1000 (AAQR03200240) # max 73491278 (GL873520) median 1593 # %30.37 masked total, %32.43 masked real hgLoadBed otoGar3 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 13510985 elements of size 3 time featureBits -countGaps otoGar3 windowmaskerSdust # 925350699 bases of 2519724550 (36.724%) in intersection # real 1m16.823s # eliminate the gaps from the masking time featureBits otoGar3 -not gap -bed=notGap.bed # 2359530453 bases of 2359530453 (100.000%) in intersection # real 0m16.975s time nice -n +19 featureBits otoGar3 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 765156602 bases of 2359530453 (32.428%) in intersection # real 5m47.021s hgLoadBed otoGar3 windowmaskerSdust cleanWMask.bed.gz # Loaded 13582557 elements of size 4 featureBits -countGaps otoGar3 windowmaskerSdust # 765156602 bases of 2519724550 (30.367%) in intersection # DO NOT NEED TO mask the sequence with this clean mask # The RepeatMasker did a good job, using that masked sequence. # zcat cleanWMask.bed.gz \ # | twoBitMask ../../otoGar3.unmasked.2bit stdin \ # -type=.bed otoGar3.cleanWMSdust.2bit # twoBitToFa otoGar3.cleanWMSdust.2bit stdout | faSize stdin \ # > otoGar3.cleanWMSdust.faSize.txt # cat otoGar3.cleanWMSdust.faSize.txt # 824327835 bases (216289280 N's 608038555 real 418637635 upper # 189400920 lower) in 427428 sequences in 1 files # %22.98 masked total, %31.15 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps otoGar3 rmsk windowmaskerSdust # 487674435 bases of 2519724550 (19.354%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-03-14 - Hiram) # Do not need to do this since Repeat Masker was used ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/otoGar3/bed/cpgIslands cd /hive/data/genomes/otoGar3/bed/cpgIslands time doCpgIslands.pl otoGar3 > do.log 2>&1 # real 29m34.875s cat fb.otoGar3.cpgIslandExt.txt # 14292453 bases of 2359530453 (0.606%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/otoGar3/bed/genscan cd /hive/data/genomes/otoGar3/bed/genscan time doGenscan.pl otoGar3 > do.log 2>&1 # real 25m26.159s # one broken job: ./runGsBig.csh GL873659 000 gtf/000/GL873659.gtf pep/000/GL873659.pep subopt/000/GL873659.bed # rerunning with 2000000 window size # about 2 minutes # continuing: time doGenscan.pl -continue=makeBed otoGar3 > makeBed.log 2>&1 # real 4m12.143s cat fb.otoGar3.genscan.txt # 55174547 bases of 2359530453 (2.338%) in intersection cat fb.otoGar3.genscanSubopt.txt # 58681737 bases of 2359530453 (2.487%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-05 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2359530453 / 2897316137 \) \* 1024 # ( 2359530453 / 2897316137 ) * 1024 = 833.930117 # round up to 850 cd /hive/data/genomes/otoGar3 time blat otoGar3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/otoGar3.11.ooc -repMatch=850 # Wrote 23673 overused 11-mers to jkStuff/otoGar3.11.ooc # real 0m44.369s # there are no non-bridged gaps, do not create lift file for genbank hgsql -N -e "select bridge from gap;" otoGar3 | sort | uniq -c # 192447 yes # cd /hive/data/genomes/otoGar3/jkStuff # gapToLift otoGar3 otoGar3.nonBridged.lift -bedFile=otoGar3.nonBridged.bed # this assembly has gaps abutting each other which produces warnings # from this gapToLift program. # largest non-bridged contig: # awk '{print $3-$2,$0}' otoGar3.nonBridged.bed | sort -nr | head # 3862550 chr13 35251702 39114252 chr13.72 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-05 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Otolemur garnettii 3 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add: # otoGar3 (bushbaby) otoGar3.serverGenome = /hive/data/genomes/otoGar3/otoGar3.2bit otoGar3.clusterGenome = /hive/data/genomes/otoGar3/otoGar3.2bit otoGar3.ooc = /hive/data/genomes/otoGar3/jkStuff/otoGar3.11.ooc otoGar3.perChromTables = no otoGar3.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} otoGar3.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} otoGar3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} otoGar3.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} otoGar3.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} otoGar3.genbank.est.xeno.pslCDnaFilter = ${lowCover.genbank.est.xeno.pslCDnaFilter} otoGar3.downloadDir = otoGar3 otoGar3.refseq.mrna.native.load = no otoGar3.refseq.mrna.xeno.load = yes otoGar3.genbank.mrna.xeno.load = yes otoGar3.genbank.mrna.xeno.loadDesc = no otoGar3.genbank.est.native.load = no # end of section added to etc/genbank.conf git commit -m "adding otoGar3 bushbaby" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for otoGarNames Otolemur garnettii" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S otoGar3 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial otoGar3 & # var/build/logs/2012.05.07-09:43:51.otoGar3.initalign.log # real 1551m45.359s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad otoGar3 & # logFile: var/dbload/hgwdev/logs/2012.05.09-09:49:13.dbload.log # real 74m22.846s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add otoGar3 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added otoGar3." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-24 - Hiram) hgsql -e \ 'update dbDb set defaultPos="GL873620:5980653-5987990" where name="otoGar3";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-24 - Hiram) mkdir /hive/data/genomes/otoGar3/pushQ cd /hive/data/genomes/otoGar3/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl otoGar3 2> stderr.txt | grep -v transMap > otoGar3.sql # real 3m40.599s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/otoGar3/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/otoGar3/wib/quality.wib # WARNING: hgwdev does not have /gbdb/otoGar3/bbi/quality.bw # WARNING: otoGar3 does not have seq # WARNING: otoGar3 does not have extFile # WARNING: otoGar3 does not have estOrientInfo scp -p otoGar3.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/otoGar3.sql" ############################################################################