# for emacs: -*- mode: sh; -*- # Caenorhabditis japonica # Washington University School of Medicine GSC and Sanger Institute # WUSTL version 7.0.1 Sep 2010 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABLE03 ############################################################################## ## Fetch sequence (DONE - 2010-09-20 - Hiram) wget --timestamping -O wgs.ABLE.1.fsa_nt.gz \ ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABLE.1.fsa_nt.gz wget --timestamping -O wgs.ABLE.1.qscore.gz \ ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABLE.1.qscore.gz wget --timestamping -O wgs.ABLE.1.gbff.gz \ ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABLE.1.gbff.gz zcat wgs.ABLE.1.qscore.gz | sed -e 's#^>gb|#>#; s#.1|.*##' \ | gzip -c > ABLE000.ucsc.qa.gz zcat wgs.ABLE.1.fsa_nt.gz | sed -e 's#^>gi.*ABLE#>ABLE#; s#.1|.*##' \ | gzip -c > ABLE000.ucsc.fa.gz cat << '_EOF_' > replaceNames.pl #!/bin/env perl use strict; use warnings; my %ableToContig; open (FH, ") { chomp $line; my ($ableName, $contigName) = split('\s+', $line); $ableName =~ s/.1$//; $ableToContig{$ableName} = $contigName; } close (FH); open (FH, ") { chomp $line; my ($acc, $start, $end, $id, $type, $ctg, $ctgStart, $ctgEnd, $strand) = split('\s+', $line); $acc =~ s/.1$//; printf "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $acc, $start, $end, $id, $type, $ableToContig{$acc}, $ctgStart, $ctgEnd, $strand; } close (FH); '_EOF_' # << happy emacs chmod +x replaceNames.pl ./replaceNames.pl > contig.ucsc.agp ############################################################################## ## Initial browser build (DONE - 2010-09-20 - Hiram) cd /hive/data/genomes/caeJap3 cat << '_EOF_' > caeJap3.config.ra # Config parameters for makeGenomeDb.pl: db caeJap3 clade worm genomeCladePriority 70 scientificName Caenorhabditis japonica commonName C. japonica assemblyDate Sep. 2010 assemblyLabel Washington University School of Medicine GSC C. japonica 7.0.1 assemblyShortLabel C. japonica 7.0.1 orderKey 880 mitoAcc none fastaFiles /hive/data/genomes/caeJap3/able0.3/ABLE000.ucsc.fa.gz agpFiles /hive/data/genomes/caeJap3/able0.3/contig.ucsc.agp qualFiles /hive/data/genomes/caeJap3/able0.3/ABLE000.ucsc.qac dbDbSpeciesDir worm taxId 281687 '_EOF_' # << happy emacs time makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \ caeJap3.config.ra > makeGenomeDb.log 2>&1 # real 2m45.252s ############################################################################## # REPEATMASKER (DONE - 2010-09-21 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/caeJap3/bed/repeatMasker cd /hive/data/genomes/caeJap3/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \ -buildDir=`pwd` caeJap3 > do.log 2>&1 & # real 58m59.433s # from the do.log: # RepeatMasker version development-$Id: # RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp # CC RELEASE 20090604; * ######################################################################### # SIMPLE REPEATS (DONE - 2010-09-21 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/caeJap3/bed/simpleRepeat cd /hive/data/genomes/caeJap3/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \ -smallClusterHub=memk -buildDir=`pwd` caeJap3 > do.log 2>&1 & # real 30m8.251s ######################################################################### # run window masker (DONE - 2010-09-21 - Hiram) mkdir /hive/data/genomes/caeJap3/bed/windowMasker cd /hive/data/genomes/caeJap3/bed/windowMasker time doWindowMasker.pl -verbose=2 -workhorse=kolossus \ -buildDir=`pwd` caeJap3 > do.log 2>&1 & # real 4m14.628s twoBitToFa caeJap3.wmsk.sdust.2bit stdout | faSize stdin # 154057934 bases (0 N's 154057934 real 82158768 upper 71899166 lower) # in 35931 sequences in 1 files # %46.67 masked total, %46.67 masked real hgLoadBed caeJap3 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 1023323 elements of size 3 featureBits caeJap3 windowmaskerSdust # 71899166 bases of 154057934 (46.670%) in intersection # this is the mask for the genome cd /hive/data/genomes/caeJap3 zcat bed/windowMasker/windowmasker.sdust.bed.gz \ | twoBitMask caeJap3.unmasked.2bit stdin \ -type=.bed caeJap3.2bit twoBitToFa caeJap3.2bit stdout | faSize stdin # 154057934 bases (0 N's 154057934 real 82158768 upper 71899166 lower) # in 35931 sequences in 1 files # %46.67 masked total, %46.67 masked real ln -s `pwd`/caeJap3.2bit /gbdb/caeJap3/caeJap3.2bit ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-21 - Hiram) # numerator is caeJap3 gapless bases "real" as reported by faSize # denominator is hg19 gapless bases as reported by featureBits, # featureBits hg19 gap # 1024 is threshold used for human -repMatch: calc \( 154057934 / 2897310462 \) \* 1024 # ( 154057934 / 2897310462 ) * 1024 = 54.448885 # 54 is way too small, use 100 to keep the number of overused # 11-mers to a smaller number cd /hive/data/genomes/caeJap3 blat caeJap3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=caeJap3.11.ooc -repMatch=100 # Wrote 21362 overused 11-mers to caeJap3.11.ooc mkdir /hive/data/staging/data/caeJap3 cp -p caeJap3.2bit chrom.sizes caeJap3.11.ooc \ /hive/data/staging/data/caeJap3 ######################################################################### # SWAP LASTZ ce9 (DONE - 2010-09-23 - Hiram) # original alignment cd /hive/data/genomes/ce9/bed/blastzCaeJap1.2010-09-20 cat fb.ce9.chainCaeJap1Link.txt # 26876526 bases of 100286004 (26.800%) in intersection # swap, this is also in caeJap1.txt mkdir /hive/data/genomes/caeJap1/bed/blastz.ce9.swap cd /hive/data/genomes/caeJap1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCaeJap1.2010-09-20/DEF \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 & # real 1m18.062s cat fb.caeJap1.chainCe9Link.txt # 26200691 bases of 129347181 (20.256%) in intersection ######################################################################### # GENBANK AUTO UPDATE (DONE - 2010-09-27 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add caeJap3 just before caeJap2 # caeJap3 (C. japonica) caeJap3.serverGenome = /hive/data/genomes/caeJap3/caeJap3.2bit caeJap3.clusterGenome = /scratch/data/caeJap3/caeJap3.2bit caeJap3.ooc = /scratch/data/caeJap3/caeJap3.11.ooc caeJap3.lift = no caeJap3.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} caeJap3.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} caeJap3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} caeJap3.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} caeJap3.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} caeJap3.refseq.mrna.native.load = yes caeJap3.refseq.mrna.xeno.load = yes caeJap3.refseq.mrna.xeno.loadDesc = yes caeJap3.genbank.mrna.xeno.load = yes caeJap3.genbank.est.native.load = yes caeJap3.genbank.est.native.loadDesc = no caeJap3.downloadDir = caeJap3 caeJap3.perChromTables = no git commit -m "Added caeJap3 C. japonica WUGSC 7.0.1" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial caeJap3 & # logFile: var/build/logs/2010.09.27-13:22:49.caeJap3.initalign.log # real 548m31.324s # this had a temporary failure: # Cannot allocate memory # can't fork # command failed: gbAlignInstall -noMigrate -workdir=work/initial.caeJap3/align -orgCats=native genbank.179.0 daily.0827 mrna caeJap3 at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 272. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 272. # continuing time ./etc/align-genbank -finish -workdir work/initial.caeJap3/align # logFile: var/build/logs/2010.09.28-10:08:30.align.log # real 5m47.281s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caeJap3 # logFile: var/dbload/hgwdev/logs/2010.09.28-10:27:34.dbload.log # real 17m14.683s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add caeJap3 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added caeJap3 - C. elegans WS210" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("caeJap3", "blat9", "17790", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("caeJap3", "blat9", "17791", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # reset default position to ZC101.2e protein blat result ssh hgwdev hgsql -e 'update dbDb set defaultPos="ABLE03026714:2543-32231" where name="caeJap3";' hgcentraltest # there isn't much there on this small contig, better to use a larger # contig with many gene definitions: hgsql -e 'update dbDb set defaultPos="ABLE03030821:1-151,170" where name="caeJap3";' hgcentraltest ############################################################################ # ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-07 - Hiram) cd /hive/data/genomes/caeJap3 mkdir blastDb twoBitToFa caeJap3.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 35931 pieces of 35931 written rm temp.fa cd blastDb for i in *.fa do /scratch/data/blast-2.2.11/bin/formatdb -i $i -p F done rm *.fa ## create the query protein set mkdir -p /hive/data/genomes/caeJap3/bed/tblastn.ce9SG cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG echo /hive/data/genomes/caeJap3/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 35931 query.lst # we want around 200000 jobs calc `wc /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 28103/(200000/35931) = 5048.844465 mkdir sgfa split -l 5000 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh DB=caeJap3 BLASTMAT=/scratch/data/blast-2.2.11/data SCR="/scratch/tmp/${DB}" g=`basename $2` D=`basename $1` export BLASTMAT DB SCR g D mkdir -p ${SCR} cp -p $1.* ${SCR} f=${SCR}/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \ -e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/x86_64/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 \ /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \ /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh swarm cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG gensub2 query.lst sg.lst template jobList para create jobList para try, check, push, check etc. # Completed: 215586 of 215586 jobs # CPU time in finished jobs: 15523183s 258719.71m 4312.00h 179.67d 0.492 y # IO & Wait Time: 1986539s 33108.99m 551.82h 22.99d 0.063 y # Average job time: 81s 1.35m 0.02h 0.00d # Longest finished job: 185s 3.08m 0.05h 0.00d # Submission to last job: 83070s 1384.50m 23.07h 0.96d # do the cluster run for chaining ssh swarm mkdir /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/chainRun cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne #!/bin/csh -fe cd $1 set b = $1:t cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \ /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/blastOut/c.$b.psl '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/chainRun para create jobList para try, check, push, check etc. # Completed: 6 of 6 jobs # CPU time in finished jobs: 336s 5.60m 0.09h 0.00d 0.000 y # IO & Wait Time: 6832s 113.87m 1.90h 0.08d 0.000 y # Average job time: 1195s 19.91m 0.33h 0.01d # Longest finished job: 1198s 19.97m 0.33h 0.01d # Submission to last job: 1204s 20.07m 0.33h 0.01d cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > ../blastCe9SG.psl cd .. pslCheck blastCe9SG.psl # checked: 31430 failed: 0 errors: 0 # load table ssh hgwdev cd /hive/data/genomes/caeJap3/bed/tblastn.ce9SG hgLoadPsl caeJap3 blastCe9SG.psl # check coverage featureBits caeJap3 blastCe9SG # 12894398 bases of 154057934 (8.370%) in intersection featureBits cb3 blastCe9SG # 18490367 bases of 108433446 (17.052%) in intersection featureBits caeRem3 blastCe9SG # 20302540 bases of 138406388 (14.669%) in intersection featureBits caePb2 blastCe9SG # 23730009 bases of 170473138 (13.920%) in intersection featureBits melHap1 blastCe9SG # 4376245 bases of 53017507 (8.254%) in intersection featureBits melInc1 blastCe9SG # 3882043 bases of 82095019 (4.729%) in intersection featureBits priPac2 blastCe9SG # 5436779 bases of 133634773 (4.068%) in intersection featureBits bruMal1 blastCe9SG # 4424694 bases of 89235536 (4.958%) in intersection featureBits haeCon1 blastCe9SG # 4990746 bases of 278844984 (1.790%) in intersection featureBits ce9 sangerGene # 28689552 bases of 100286004 (28.608%) in intersection rm -rf blastOut ######################################################################### # set this as the defaultDb (DONE - 2010-10-20 - Hiram) hgsql -e 'update defaultDb set name="caeJap3" where name="caeJap1";' \ hgcentraltest ######################################################################### # liftOver sangerGene WS210 to this caeJap3 (DONE - 2010-10-15 - Hiram) mkdir /hive/data/genomes/caeJap3/bed/ws210Gene cd /hive/data/genomes/caeJap3/bed/ws210Gene ln -s /hive/data/genomes/caeJap2a/bed/ws210Gene/caeJap2a.ws210Gene.gp . ln -s \ /hive/data/genomes/caeJap2a/bed/liftOver/caeJap2aToCaeJap3.over.chain.gz . export DB=caeJap3 liftOver -genePred caeJap2a.ws210Gene.gp caeJap2aToCaeJap3.over.chain.gz \ ${DB}.ws210Gene.gp ws210gene.unmapped.gp genePredCheck -db=${DB} ${DB}.ws210Gene.gp # checked: 16233 failed: 0 hgLoadGenePred ${DB} ws210Gene ${DB}.ws210Gene.gp ######################################################################### # verify all.joiner has everything (DONE - 2010-10-21 - Hiram) # edit all.joiner until all these commands are successful cd $HOME/kent/src/hg/makeDb/schema joinerCheck -tableCoverage -database=caeJap3 ./all.joiner joinerCheck -keys -database=caeJap3 ./all.joiner joinerCheck -times -database=caeJap3 ./all.joiner joinerCheck -all -database=caeJap3 ./all.joiner # the -all command will complain about other databases on hgwdev # that are not specified in all.joiner. There are a lot of test # databases on hgwdev ######################################################################### # construct downloads files (DONE - 2010-10-21 - Hiram) cd /hive/data/genomes/caeJap3 makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 caeJap3 \ > downloads.log 2>&1 ######################################################################### ## Creating pushQ (DONE - 2010-10-21 - Hiram) ssh hgwdev mkdir /hive/data/genomes/caeJap3/pushQ cd /hive/data/genomes/caeJap3/pushQ makePushQSql.pl caeJap3 > caeJap3.sql 2> errorLog.out ## check the errorLog.out for anything that needs to be fixed ## copy caeJap3.sql to hgwbeta:/tmp ## then on hgwbeta: hgsql qapushq < caeJap3.sql #######################################################################