# for emacs: -*- mode: sh; -*- # This file describes browser build for the saiBol1 # Squirrel monkey - Saimiri boliviensis - Oct 2011 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AGCE01 # 80X coverage WGS # http://www.ncbi.nlm.nih.gov/bioproject/67945 # http://www.ncbi.nlm.nih.gov/genome/6907 # http://www.ncbi.nlm.nih.gov/genome/assembly/420378 ############################################################################# # Fetch sequence from genbank (DONE - 2012-01-06 - Hiram) mkdir -p /hive/data/genomes/saiBol1/genbank cd /hive/data/genomes/saiBol1/genbank time wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Saimiri_boliviensis/SaiBol1.0/*" # Downloaded: 13 files, 1.0G in 13m 58s (1.23 MB/s) # real 14m3.742s # measure sequence to be used here faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2608572064 bases (131440969 N's 2477131095 real 2477131095 upper 0 # lower) in 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 min 1003 # (gi|356714203|gb|AGCE01151413.1|) max 72162052 # (gi|357434258|gb|JH378105.1|) median 1553 ############################################################################# # process into UCSC naming scheme (DONE - 2012-01-06 - Hiram) mkdir /hive/data/genomes/saiBol1/ucsc cd /hive/data/genomes/saiBol1/ucsc # verify we don't have any .acc numbers different from .1 zcat ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | cut -f1 | egrep "^JH|AGCE" \ | sed -e 's/^JH[0-9][0-9]*//; s/^AGCE[0-9][0-9]*//' | sort | uniq -c # 300141 .1 # this is like the unplaced.pl script in other assemblies except it # does not add chrUn_ to the names since they are all just scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 1m21.710s time gzip *.fa *.agp # real 13m29.352s # -rw-rw-r-- 1 15919402 Jan 6 15:44 unplaced.agp # -rw-rw-r-- 1 2645873375 Jan 6 15:45 unplaced.fa # verify nothing lost in the translation, should be the same as above # except for the name translations faSize *.fa.gz # 2608572064 bases (131440969 N's 2477131095 real 2477131095 upper 0 lower) in # 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413) max 72162052 (JH378105) median 1553 ############################################################################# # Initial browser build (DONE - 2012-01-06 - Hiram) cd /hive/data/genomes/saiBol1 cat << '_EOF_' > saiBol1.config.ra # Config parameters for makeGenomeDb.pl: db saiBol1 clade mammal genomeCladePriority 16 scientificName Saimiri boliviensis commonName Squirrel monkey assemblyDate Oct. 2011 assemblyLabel Broad SaiBol1.0 (GCA_000235385.1) assemblyShortLabel SaiBol1.0 orderKey 40 mitoAcc none fastaFiles /hive/data/genomes/saiBol1/ucsc/*.fa.gz agpFiles /hive/data/genomes/saiBol1/ucsc/*.agp.gz dbDbSpeciesDir saiBol taxId 39432 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp saiBol1.config.ra > agp.log 2>&1 # real 3m14.814s # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db saiBol1.config.ra > db.log 2>&1 # real 21m32.613s ############################################################################# # running repeat masker (DONE - 2012-01-06 - Hiram) mkdir /hive/data/genomes/saiBol1/bed/repeatMasker cd /hive/data/genomes/saiBol1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk saiBol1 > do.log 2>&1 & # real 751m55.197s cat faSize.rmsk.txt # 2608572064 bases (131440969 N's 2477131095 real 1321507940 upper # 1155623155 lower) in 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413) # max 72162052 (JH378105) median 1553 # %44.30 masked total, %46.65 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps saiBol1 rmsk # 1157555619 bases of 2608572064 (44.375%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-01-06 - Hiram) mkdir /hive/data/genomes/saiBol1/bed/simpleRepeat cd /hive/data/genomes/saiBol1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ saiBol1 > do.log 2>&1 & # real 20m15.294s cat fb.simpleRepeat # 38276602 bases of 2477131095 (1.545%) in intersection # not going to add to rmsk here, using the window masker instead since # it masks more sequence cd /hive/data/genomes/saiBol1 twoBitMask saiBol1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed saiBol1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa saiBol1.2bit stdout | faSize stdin > faSize.saiBol1.2bit.txt cat faSize.saiBol1.2bit.txt # 2608572064 bases (131440969 N's 2477131095 real 1320629270 upper # 1156501825 lower) in 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413) # max 72162052 (JH378105) median 1553 rm /gbdb/saiBol1/saiBol1.2bit ln -s `pwd`/saiBol1.2bit /gbdb/saiBol1/saiBol1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-01-04 - Hiram) mkdir /hive/data/genomes/saiBol1/bed/gap cd /hive/data/genomes/saiBol1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../saiBol1.unmasked.2bit > findMotif.txt 2>&1 # real 0m46.931s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits -countGaps saiBol1 -not gap -bed=notGap.bed # 2969184009 bases of 3309577922 (89.715%) in intersection # real 0m26.597s time featureBits -countGaps saiBol1 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 2608572064 (0.000%) in intersection # real 8m30.686s # no unannotated gaps, no more to run here ########################################################################## ## WINDOWMASKER (DONE - 2012-01-06 - Hiram) mkdir /hive/data/genomes/saiBol1/bed/windowMasker cd /hive/data/genomes/saiBol1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev saiBol1 > do.log 2>&1 & # about 3 hours # Masking statistics twoBitToFa saiBol1.wmsk.2bit stdout | faSize stdin # 2608572064 bases (131440969 N's 2477131095 real 1630475943 upper # 846655152 lower) in 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 # min 1003 (AGCE01151413) max 72162052 (JH378105) median 1553 # %32.46 masked total, %34.18 masked real twoBitToFa saiBol1.wmsk.sdust.2bit stdout | faSize stdin # 2608572064 bases (131440969 N's 2477131095 real 1617331205 upper # 859799890 lower) in 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 # min 1003 (AGCE01151413) max 72162052 (JH378105) median 1553 # %32.96 masked total, %34.71 masked real hgLoadBed saiBol1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 14414165 elements of size 3 featureBits -countGaps saiBol1 windowmaskerSdust # 991240859 bases of 2608572064 (37.999%) in intersection # eliminate the gaps from the masking featureBits saiBol1 -not gap -bed=notGap.bed # 2477131095 bases of 2477131095 (100.000%) in intersection time nice -n +19 featureBits saiBol1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 859799890 bases of 2477131095 (34.710%) in intersection # real 5m21.813s # reload track to get it clean hgLoadBed saiBol1 windowmaskerSdust cleanWMask.bed.gz # Loaded 14489638 elements of size 4 time featureBits -countGaps saiBol1 windowmaskerSdust # 859799890 bases of 2608572064 (32.961%) in intersection # real 1m24.055s # mask with this clean result zcat cleanWMask.bed.gz \ | twoBitMask ../../saiBol1.unmasked.2bit stdin \ -type=.bed saiBol1.cleanWMSdust.2bit twoBitToFa saiBol1.cleanWMSdust.2bit stdout | faSize stdin \ > saiBol1.cleanWMSdust.faSize.txt cat saiBol1.cleanWMSdust.faSize.txt # 2608572064 bases (131440969 N's 2477131095 real 1617331205 upper # 859799890 lower) in 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 # min 1003 (AGCE01151413) max 72162052 (JH378105) median 1553 # %32.96 masked total, %34.71 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps saiBol1 rmsk windowmaskerSdust # 627944117 bases of 2608572064 (24.072%) in intersection ######################################################################### # cpgIslands - (DONE - 2011-04-20 - Hiram) mkdir /hive/data/genomes/saiBol1/bed/cpgIslands cd /hive/data/genomes/saiBol1/bed/cpgIslands time doCpgIslands.pl saiBol1 > do.log 2>&1 # Elapsed time: 11m10s cat fb.saiBol1.cpgIslandExt.txt # 17439071 bases of 2477131095 (0.704%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/saiBol1/bed/genscan cd /hive/data/genomes/saiBol1/bed/genscan # while running this procedure, constructed the script to do this # automatically, the steps were run manually for this instance. doGenscan.pl hetGla2 # one job failed during the genscan kluster run. # to recover, edit the runGsBig.csh script and lower the size of chunks # for gsBig, in this case from 2400000 to 2000000: /cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/hive/data/staging/data/genscan/genscan -par=/hive/data/staging/data/genscan/HumanIso.smat -tmp=/tmp -window=2000000 # then run the job on hgwdev that uses this script: ./runGsBig.csh JH378249 000 gtf/000/JH378249.gtf \ pep/000/JH378249.pep subopt/000/JH378249.bed # may need to adjust the number even smaller, or in the worst case, # actually partition the bit in question. # when all finished: cat fb.saiBol1.genscan.txt # 51449399 bases of 2477131095 (2.077%) in intersection cat fb.saiBol1.genscanSubopt.txt # 50823007 bases of 2477131095 (2.052%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2477131095 / 2897316137 \) \* 1024 # ( 2477131095 / 2897316137 ) * 1024 = 875.493775 # round up to 900 cd /hive/data/genomes/saiBol1 time blat saiBol1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/saiBol1.11.ooc -repMatch=900 # Wrote 25485 overused 11-mers to jkStuff/saiBol1.11.ooc # real 1m7.834s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" saiBol1 | sort | uniq -c # 148728 yes # cd /hive/data/genomes/saiBol1/jkStuff # gapToLift saiBol1 saiBol1.nonBridged.lift -bedFile=saiBol1.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' saiBol1.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Saimiri boliviensis 52 0 0 # Saimiri boliviensis boliviensis 8 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add saiBol1 # saiBol1 (squirrel monkey) saiBol1.serverGenome = /hive/data/genomes/saiBol1/saiBol1.2bit saiBol1.clusterGenome = /hive/data/genomes/saiBol1/saiBol1.2bit saiBol1.ooc = /hive/data/genomes/saiBol1/jkStuff/saiBol1.11.ooc saiBol1.lift = no saiBol1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} saiBol1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} saiBol1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} saiBol1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} saiBol1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} saiBol1.refseq.mrna.native.load = no saiBol1.refseq.mrna.xeno.load = yes saiBol1.genbank.mrna.xeno.load = yes saiBol1.genbank.est.native.load = no saiBol1.downloadDir = saiBol1 saiBol1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding saiBol1 squirrel monkey" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for saiBolNames" src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S saiBol1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial saiBol1 & # var/build/logs/2012.05.08-14:42:12.saiBol1.initalign.log # real 3156m37.952s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad saiBol1 & # logFile: var/dbload/hgwdev/logs/2012.05.10-19:23:54.dbload.log # real 6m6.352s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add saiBol1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added saiBol1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-26 - Hiram) hgsql -e \ 'update dbDb set defaultPos="JH378122:1872708-1880068" where name="saiBol1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/saiBol1/pushQ cd /hive/data/genomes/saiBol1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl saiBol1 2> stderr.txt | grep -v transMap > saiBol1.sql # real 3m42.169s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/saiBol1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/saiBol1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/saiBol1/bbi/quality.bw # WARNING: saiBol1 does not have seq # WARNING: saiBol1 does not have extFile # WARNING: saiBol1 does not have estOrientInfo scp -p saiBol1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/saiBol1.sql" ############################################################################