# for emacs: -*- mode: sh; -*- # Meloidogyne incognita # Sanger and Genoscope # http://www.nematode.net/Species.Summaries/Meloidogyne.incognita/index.php # Genbank: CABB00000000 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CABB01 ############################################################################## ## Fetch sequence (DONE - 2010-09-20 - Hiram) mkdir -p /hive/data/genomes/melInc1/ws210 cd /hive/data/genomes/melInc1/ws210 wget --no-parent --timestamping -m -nH --cut-dirs=5 \ ftp://ftp.sanger.ac.uk/pub/wormbase/WS210/genomes/m_incognita/ hgFakeAgp -minContigGap=1 \ sequences/dna/b_malayi.WS210.dna.fa.gz ucsc.fake.melInc1.agp ############################################################################## ## Initial browser build (DONE - 2010-09-20 - Hiram) cd /hive/data/genomes/melInc1 cat << '_EOF_' > melInc1.config.ra # Config parameters for makeGenomeDb.pl: db melInc1 clade worm genomeCladePriority 84 scientificName Meloidogyne incognita commonName M. incognita assemblyDate Feb. 2008 assemblyLabel Sanger and Genoscope Genbank: CABB00000000 assemblyShortLabel M. incognita WS210 orderKey 897 mitoAcc none fastaFiles /hive/data/genomes/melInc1/ws210/sequences/dna/m_incognita.WS210.dna.fa.gz agpFiles /hive/data/genomes/melInc1/ws210/ucsc.fake.melInc1.agp # qualFiles none dbDbSpeciesDir worm taxId 6306 '_EOF_' # << happy emacs makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \ -stop=agp melInc1.config.ra > agp.log 2>&1 makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \ -continue=db melInc1.config.ra > db.log 2>&1 ############################################################################## # REPEATMASKER (DONE - 2010-09-21 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/melInc1/bed/repeatMasker cd /hive/data/genomes/melInc1/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \ -buildDir=`pwd` melInc1 > do.log 2>&1 & # real 53m51.544s # from the do.log: # RepeatMasker version development-$Id: # RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp # CC RELEASE 20090604; * cat faSize.rmsk.txt # 82095019 bases (0 N's 82095019 real 73879883 upper 8215136 lower) # in 9538 sequences in 1 files # %10.01 masked total, %10.01 masked real ######################################################################### # SIMPLE REPEATS (DONE - 2010-09-21 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/melInc1/bed/simpleRepeat cd /hive/data/genomes/melInc1/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \ -smallClusterHub=memk -buildDir=`pwd` melInc1 > do.log 2>&1 & # real 14m29.878s cat fb.simpleRepeat # 3947412 bases of 82095019 (4.808%) in intersection ######################################################################### # run window masker (DONE - 2010-09-21 - Hiram) mkdir /hive/data/genomes/melInc1/bed/windowMasker cd /hive/data/genomes/melInc1/bed/windowMasker time doWindowMasker.pl -verbose=2 -workhorse=kolossus \ -buildDir=`pwd` melInc1 > do.log 2>&1 & # about 14 minutes twoBitToFa melInc1.wmsk.sdust.2bit stdout | faSize stdin # 82095019 bases (0 N's 82095019 real 44822089 upper 37272930 lower) # in 9538 sequences in 1 files # %45.40 masked total, %45.40 masked real hgLoadBed melInc1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 775123 elements of size 3 featureBits melInc1 windowmaskerSdust # 37272930 bases of 82095019 (45.402%) in intersection # this is the mask for the genome cd /hive/data/genomes/melInc1 zcat bed/windowMasker/windowmasker.sdust.bed.gz \ | twoBitMask melInc1.unmasked.2bit stdin \ -type=.bed melInc1.2bit twoBitToFa melInc1.2bit stdout | faSize stdin # 82095019 bases (0 N's 82095019 real 44822089 upper 37272930 lower) # in 9538 sequences in 1 files # %45.40 masked total, %45.40 masked real ln -s `pwd`/melInc1.2bit /gbdb/melInc1/melInc1.2bit ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-21 - Hiram) # numerator is melInc1 gapless bases "real" as reported by faSize # denominator is hg19 gapless bases as reported by featureBits, # featureBits hg19 gap # 1024 is threshold used for human -repMatch: calc \( 82095019 / 2897310462 \) \* 1024 # ( 82095019 / 2897310462 ) * 1024 = 29.014944 # 29 is way too small, use 100 to keep the number of overused # 11-mers to a smaller number cd /hive/data/genomes/melInc1 blat melInc1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=melInc1.11.ooc -repMatch=100 # Wrote 8613 overused 11-mers to melInc1.11.ooc mkdir /hive/data/staging/data/melInc1 cp -p melInc1.2bit chrom.sizes melInc1.11.ooc \ /hive/data/staging/data/melInc1 ######################################################################### # LASTZ SWAP ce9 (DONE - 2010-09-23 - Hiram) # original alignment cd /hive/data/genomes/ce9/bed/blastzMelInc1.2010-09-22 cat fb.ce9.chainMelInc1Link.txt # 3310201 bases of 100286004 (3.301%) in intersection # running swap mkdir /hive/data/genomes/melInc1/bed/blastz.ce9.swap cd /hive/data/genomes/melInc1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzMelInc1.2010-09-22/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 & # real 0m36.095s cat fb.melInc1.chainCe9Link.txt # 4240790 bases of 82095019 (5.166%) in intersection ######################################################################### # GENBANK AUTO UPDATE (DONE - 2010-09-27 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add melInc1 just before caeJap2 # melInc1 (Meloidogyne incognita) melInc1.serverGenome = /hive/data/genomes/melInc1/melInc1.2bit melInc1.clusterGenome = /scratch/data/melInc1/melInc1.2bit melInc1.ooc = /scratch/data/melInc1/melInc1.11.ooc melInc1.lift = no melInc1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} melInc1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} melInc1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} melInc1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} melInc1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} melInc1.refseq.mrna.native.load = yes melInc1.refseq.mrna.xeno.load = yes melInc1.refseq.mrna.xeno.loadDesc = yes melInc1.genbank.mrna.xeno.load = yes melInc1.genbank.est.native.load = yes melInc1.genbank.est.native.loadDesc = no melInc1.downloadDir = melInc1 melInc1.perChromTables = no git commit -m "Added melInc1 M. incognita WS210" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial melInc1 & # logFile: var/build/logs/2010.09.27-13:31:08.melInc1.initalign.log # real 388m50.914s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad melInc1 # logFile: var/dbload/hgwdev/logs/2010.09.28-10:56:39.dbload.log # real 28m22.985s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add melInc1 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added melInc1" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("melInc1", "blat5", "17790", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("melInc1", "blat5", "17791", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # reset default position to ZC101.2e protein blat result ssh hgwdev hgsql -e 'update dbDb set defaultPos="Minc_Contig26:7019-50820" where name="melInc1";' hgcentraltest ############################################################################ # ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-08 - Hiram) cd /hive/data/genomes/melInc1 mkdir blastDb twoBitToFa melInc1.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 9538 pieces of 9538 written rm temp.fa cd blastDb for i in *.fa do /scratch/data/blast-2.2.11/bin/formatdb -i $i -p F done rm *.fa ## create the query protein set mkdir -p /hive/data/genomes/melInc1/bed/tblastn.ce9SG cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG echo /hive/data/genomes/melInc1/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 9538 query.lst # we want around 200000 jobs calc `wc /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 28103/(200000/9538) = 1340.232070 mkdir sgfa split -l 1300 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh DB=melInc1 BLASTMAT=/scratch/data/blast-2.2.11/data SCR="/scratch/tmp/${DB}" g=`basename $2` D=`basename $1` export BLASTMAT DB SCR g D mkdir -p ${SCR} cp -p $1.* ${SCR} f=${SCR}/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \ -e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/x86_64/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 \ /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \ /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh swarm cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG gensub2 query.lst sg.lst template jobList para create jobList para try, check, push, check etc. # Completed: 209836 of 209836 jobs # CPU time in finished jobs: 4293845s 71564.08m 1192.73h 49.70d 0.136 y # IO & Wait Time: 968639s 16143.98m 269.07h 11.21d 0.031 y # Average job time: 25s 0.42m 0.01h 0.00d # Longest finished job: 76s 1.27m 0.02h 0.00d # Submission to last job: 66006s 1100.10m 18.34h 0.76d # do the cluster run for chaining ssh swarm mkdir /hive/data/genomes/melInc1/bed/tblastn.ce9SG/chainRun cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne #!/bin/csh -fe cd $1 set b = $1:t cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \ /hive/data/genomes/melInc1/bed/tblastn.ce9SG/blastOut/c.$b.psl '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /hive/data/genomes/melInc1/bed/tblastn.ce9SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG/chainRun para create jobList para -maxJob=30 push para try, check, push, check etc. # Completed: 22 of 22 jobs # CPU time in finished jobs: 139s 2.32m 0.04h 0.00d 0.000 y # IO & Wait Time: 11262s 187.70m 3.13h 0.13d 0.000 y # Average job time: 518s 8.64m 0.14h 0.01d # Longest finished job: 537s 8.95m 0.15h 0.01d # Submission to last job: 562s 9.37m 0.16h 0.01d cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > ../blastCe9SG.psl cd .. pslCheck blastCe9SG.psl # checked: 15615 failed: 0 errors: 0 # load table ssh hgwdev cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG hgLoadPsl melInc1 blastCe9SG.psl # check coverage featureBits melInc1 blastCe9SG # 3882043 bases of 82095019 (4.729%) in intersection featureBits melHap1 blastCe9SG # 4376245 bases of 53017507 (8.254%) in intersection featureBits bruMal1 blastCe9SG # 4424694 bases of 89235536 (4.958%) in intersection featureBits priPac2 blastCe9SG # 5436779 bases of 133634773 (4.068%) in intersection featureBits haeCon1 blastCe9SG # 4990746 bases of 278844984 (1.790%) in intersection featureBits caeJap3 blastCe9SG # 12894398 bases of 154057934 (8.370%) in intersection featureBits caePb2 blastCe9SG # 23730009 bases of 170473138 (13.920%) in intersection featureBits caeRem3 blastCe9SG # 20302540 bases of 138406388 (14.669%) in intersection featureBits cb3 blastCe9SG # 18490367 bases of 108433446 (17.052%) in intersection featureBits ce9 sangerGene # 28689552 bases of 100286004 (28.608%) in intersection rm -rf blastOut ######################################################################### # verify all.joiner has everything (DONE - 2010-10-21 - Hiram) # edit all.joiner until all these commands are successful cd $HOME/kent/src/hg/makeDb/schema joinerCheck -tableCoverage -database=melInc1 ./all.joiner joinerCheck -keys -database=melInc1 ./all.joiner joinerCheck -times -database=melInc1 ./all.joiner joinerCheck -all -database=melInc1 ./all.joiner # the -all command will complain about other databases on hgwdev # that are not specified in all.joiner. There are a lot of test # databases on hgwdev ######################################################################### # construct downloads files (DONE - 2010-10-21 - Hiram) cd /hive/data/genomes/melInc1 makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 melInc1 \ > downloads.log 2>&1 ######################################################################### ## Creating pushQ (DONE - 2010-10-21 - Hiram) ssh hgwdev mkdir /hive/data/genomes/melInc1/pushQ cd /hive/data/genomes/melInc1/pushQ makePushQSql.pl melInc1 > melInc1.sql 2> errorLog.out ## check the errorLog.out for anything that needs to be fixed ## copy melInc1.sql to hgwbeta:/tmp ## then on hgwbeta: hgsql qapushq < melInc1.sql #######################################################################