# for emacs: -*- mode: sh; -*- # Pteropus vampyrus # http://www.ncbi.nlm.nih.gov/bioproject/20325 # http://www.ncbi.nlm.nih.gov/genome/757 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRP00 ######################################################################### # DOWNLOAD SEQUENCE ssh kkstore05 mkdir /cluster/store12/pteVam1 ln -s /cluster/store12/pteVam1 /cluster/data mkdir /cluster/data/pteVam1/broad cd /cluster/data/pteVam1/broad wget --timestamping \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.agp \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.bases.gz \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.quals.gz md5sum ass* > assembly.md5sum qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin pteVam1.qual.qac cut -f 1 assembly.agp | uniq -c | wc -l # Number of scaffolds: 96944 ######################################################################### # Create .ra file and run makeGenomeDb.pl ssh kolossus cd /cluster/data/pteVam1 cat << _EOF_ >pteVam1.config.ra # Config parameters for makeGenomeDb.pl: db pteVam1 clade mammal genomeCladePriority 35 scientificName Pteropus vampyrus commonName Megabat assemblyDate Jul. 2008 assemblyLabel Broad Institute pteVam1 orderKey 233.5 #mitoAcc AJ222767 mitoAcc none fastaFiles /cluster/data/pteVam1/broad/assembly.bases.gz agpFiles /cluster/data/pteVam1/broad/assembly.agp qualFiles /cluster/data/pteVam1/broad/pteVam1.qual.qac dbDbSpeciesDir megabat _EOF_ # use 'screen' make sure on kkstore05 makeGenomeDb.pl -workhorse kolossus pteVam1.config.ra > makeGenomeDb.out 2>&1 & cut -f 2 chrom.sizes | ave stdin # Q1 1162.000000 # median 1777.000000 # Q3 9074.500000 # average 20589.994327 # min 601.000000 # max 1019178.000000 # count 96944 # total 1996076410.000000 # standard deviation 54685.855293 ######################################################################### ## Repeat Masker (DONE - 2008-10-16 - Hiram) screen # to manage this several day job mkdir /hive/data/genomes/pteVam1/bed/repeatMasker cd /hive/data/genomes/pteVam1/bed/repeatMasker time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` pteVam1 > do.log 2>&1 & # real 806m48.065s twoBitToFa pteVam1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 1996076410 bases (156639750 N's 1839436660 real 1256029530 upper # 583407130 lower) in 96944 sequences in 1 files # %29.23 masked total, %31.72 masked real ######################################################################### # SIMPLE REPEATS TRF (WORKING - 2008-10-15 - Hiram) screen # use a screen to manage this job mkdir /cluster/data/pteVam1/bed/simpleRepeat cd /cluster/data/pteVam1/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/pteVam1/bed/simpleRepeat pteVam1 > do.log 2>&1 & # real 98m59.097 cat fb.simpleRepeat # 33179383 bases of 1839436660 (1.804%) in intersection XXX - waiting for RM to complete 2008-10-15 13:32 # after RM run is done, add this mask: cd /hive/data/genomes/pteVam1 rm pteVam1.2bit twoBitMask pteVam1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed pteVam1.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa pteVam1.2bit stdout | faSize stdin > pteVam1.2bit.faSize.txt # 1996076410 bases (156639750 N's 1839436660 real 1255425444 upper 584011216 # lower) in 96944 sequences in 1 files # %29.26 masked total, %31.75 masked rea # link to gbdb ln -s `pwd`/pteVam1.2bit /gbdb/pteVam1 ########################################################################### # prepare for kluster runs (DONE _ 2008-10-16 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # pteVam1: 1839436660 # thus: 1024 * 1839436660/2881421696 = 653 # rounding up to 700 for a bit more conservative masking cd /hive/data/genomes/pteVam1 time blat pteVam1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=pteVam1.11.ooc -repMatch=700 # Wrote 19840 overused 11-mers to pteVam1.11.ooc # real 1m44.676s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/pteVam1 cp -p pteVam1.2bit chrom.sizes pteVam1.11.ooc \ /hive/data/staging/data/pteVam1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="Broad Institute pteVam1 (NCBI project 20325, ABRP01000000)" where name="pteVam1";' hgcentraltest ############################################################################ # pteVam1 - Megabat - Ensembl Genes version 51 (DONE - 2008-12-02 - hiram) ssh kkr14u07 cd /hive/data/genomes/pteVam1 cat << '_EOF_' > pteVam1.ensGene.ra # required db variable db pteVam1 # do we need to translate geneScaffold coordinates geneScaffolds yes # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the two genes that have invalid structures from Ensembl: # 6381: ENSPVAT00000012919 no exonFrame on CDS exon 14 # 23522: ENSPVAT00000010661 no exonFrame on CDS exon 0 '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 pteVam1.ensGene.ra ssh hgwdev cd /hive/data/genomes/pteVam1/bed/ensGene.51 featureBits pteVam1 ensGene # 28722584 bases of 1839436660 (1.561%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/pteVam1/bed/ensGene.51 ############################################################################ # cpgIslands - (DONE - 2011-04-24 - Hiram) mkdir /hive/data/genomes/pteVam1/bed/cpgIslands cd /hive/data/genomes/pteVam1/bed/cpgIslands time doCpgIslands.pl pteVam1 > do.log 2>&1 # real 239m18.855s cat fb.pteVam1.cpgIslandExt.txt # 49493178 bases of 1839436660 (2.691%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/pteVam1/bed/genscan cd /hive/data/genomes/pteVam1/bed/genscan time doGenscan.pl pteVam1 > do.log 2>&1 # Elapsed time: 356m2s cat fb.pteVam1.genscan.txt # 55912473 bases of 1839436660 (3.040%) in intersection cat fb.pteVam1.genscanSubopt.txt # 60619613 bases of 1839436660 (3.296%) in intersection ######################################################################### # windowMasker - (DONE - 2012-05-02 - Hiram) screen -S pteVam1 mkdir /hive/data/genomes/pteVam1/bed/windowMasker cd /hive/data/genomes/pteVam1/bed/windowMasker # trying out new version of the script that does all the usual steps # that used to be performed manually after it was done time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev pteVam1 > do.log 2>&1 # real 278m34.535s # fixing the last couple of commands from doLoadClean step time ./lastClean.csh # real 4m47.606s cat fb.pteVam1.windowmaskerSdust.clean.txt # 514392905 bases of 1996076410 (25.770%) in intersection cat faSize.pteVam1.cleanWMSdust.txt # 1996076410 bases (156639750 N's 1839436660 real 1325043755 upper 514392905 lower) in 96944 sequences in 1 files # Total size: mean 20590.0 sd 54686.1 min 601 (scaffold_96943) max 1019178 (scaffold_0) median 1777 # %25.77 masked total, %27.96 masked real cat fb.pteVam1.rmsk.windowmaskerSdust.txt # 238443817 bases of 1996076410 (11.946%) in intersection time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \ -dbHost=hgwdev pteVam1 > cleanup.log 2>&1 # real 2m19.430s ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1839436660 / 2897316137 \) \* 1024 # ( 1839436660 / 2897316137 ) * 1024 = 650.113088 # round up to 700 cd /hive/data/genomes/pteVam1 time blat pteVam1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/pteVam1.11.ooc -repMatch=700 # Wrote 19840 overused 11-mers to jkStuff/pteVam1.11.ooc # real 0m31.002s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" pteVam1 | sort | uniq -c # 291864 yes # cd /hive/data/genomes/pteVam1/jkStuff # gapToLift pteVam1 pteVam1.nonBridged.lift -bedFile=pteVam1.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' pteVam1.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Pteropus vampyrus 2 1 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add pteVam1 just after ce2 # pteVam1 (megabat) pteVam1.serverGenome = /hive/data/genomes/pteVam1/pteVam1.2bit pteVam1.clusterGenome = /hive/data/genomes/pteVam1/pteVam1.2bit pteVam1.ooc = /hive/data/genomes/pteVam1/jkStuff/pteVam1.11.ooc pteVam1.lift = no pteVam1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} pteVam1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} pteVam1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} pteVam1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} pteVam1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} pteVam1.refseq.mrna.native.load = no pteVam1.refseq.mrna.xeno.load = yes pteVam1.genbank.mrna.xeno.load = yes pteVam1.genbank.est.native.load = no pteVam1.downloadDir = pteVam1 pteVam1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding pteVam1 megabat" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for pteVamNames" src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S pteVam1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial pteVam1 & # var/build/logs/2012.06.08-09:55:28.pteVam1.initalign.log # real 1862m46.698s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad pteVam1 & # logFile: var/dbload/hgwdev/logs/2012.06.11-13:09:05.dbload.log # real 14m48.000s # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add pteVam1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added pteVam1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-24 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold_8765:35746-39000" where name="pteVam1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-24 - Hiram) mkdir /hive/data/genomes/pteVam1/pushQ cd /hive/data/genomes/pteVam1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl pteVam1 2> stderr.txt | grep -v transMap > pteVam1.sql # real 3m39.590s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/pteVam1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/pteVam1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/pteVam1/bbi/quality.bw # WARNING: pteVam1 does not have seq # WARNING: pteVam1 does not have extFile # WARNING: pteVam1 does not have estOrientInfo scp -p pteVam1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/pteVam1.sql" ############################################################################