# for emacs: -*- mode: sh; -*- # Meloidogyne hapla # The Center for the Biology of Nematode Parasitism, N.C. State # http://www.pngg.org/cbnp/index.php # "Sequence and genetic map of Meloidogyne hapla: # A compact nematode genome for plant parasitism" # http://www.pnas.org/content/105/39/14802.full # PNAS September 30, 2008 vol. 105 no. 39 14802-14807 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABLG01 ############################################################################## ## Fetch sequence (DONE - 2010-09-20 - Hiram) mkdir -p /hive/data/genomes/melHap1/ws210 cd /hive/data/genomes/melHap1/ws210 wget --no-parent --timestamping -m -nH --cut-dirs=5 \ ftp://ftp.sanger.ac.uk/pub/wormbase/WS210/genomes/m_hapla/ hgFakeAgp -minContigGap=1 \ sequences/dna/b_malayi.WS210.dna.fa.gz ucsc.fake.melHap1.agp ############################################################################## ## Initial browser build (DONE - 2010-09-20 - Hiram) cd /hive/data/genomes/melHap1 cat << '_EOF_' > melHap1.config.ra # Config parameters for makeGenomeDb.pl: db melHap1 clade worm genomeCladePriority 82 scientificName Meloidogyne hapla commonName M. hapla assemblyDate Sep. 2008 assemblyLabel The Center for the Biology of Nematode Parasitism, N.C. State assemblyShortLabel M. hapla VW9 WS210 orderKey 895 mitoAcc none fastaFiles /hive/data/genomes/melHap1/ws210/sequences/dna/m_hapla.WS210.dna.fa.gz agpFiles /hive/data/genomes/melHap1/ws210/ucsc.fake.melHap1.agp # qualFiles none dbDbSpeciesDir worm taxId 6305 '_EOF_' # << happy emacs time makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \ melHap1.config.ra > makeGenomeDb.log 2>&1 # real 0m42.028s ############################################################################## # REPEATMASKER (DONE - 2010-09-21 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/melHap1/bed/repeatMasker cd /hive/data/genomes/melHap1/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \ -buildDir=`pwd` melHap1 > do.log 2>&1 & # real 23m56.249s # from the do.log: # RepeatMasker version development-$Id: # RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp # CC RELEASE 20090604; * cat faSize.rmsk.txt # 53017507 bases (0 N's 53017507 real 43261440 upper 9756067 lower) # in 3452 sequences in 1 files # %18.40 masked total, %18.40 masked real ######################################################################### # SIMPLE REPEATS (DONE - 2010-09-21 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/melHap1/bed/simpleRepeat cd /hive/data/genomes/melHap1/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \ -smallClusterHub=memk -buildDir=`pwd` melHap1 > do.log 2>&1 & # real 11m40.808s cat fb.simpleRepeat # 2456208 bases of 53017507 (4.633%) in intersection ######################################################################### # run window masker (DONE - 2010-09-21 - Hiram) mkdir /hive/data/genomes/melHap1/bed/windowMasker cd /hive/data/genomes/melHap1/bed/windowMasker time doWindowMasker.pl -verbose=2 -workhorse=kolossus \ -buildDir=`pwd` melHap1 > do.log 2>&1 & # real 1m59.029s twoBitToFa melHap1.wmsk.sdust.2bit stdout | faSize stdin # 53017507 bases (0 N's 53017507 real 24940979 upper 28076528 lower) # in 3452 sequences in 1 files # %52.96 masked total, %52.96 masked real hgLoadBed melHap1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 477268 elements of size 3 featureBits melHap1 windowmaskerSdust # 28076528 bases of 53017507 (52.957%) in intersection # this is the mask for the genome cd /hive/data/genomes/melHap1 zcat bed/windowMasker/windowmasker.sdust.bed.gz \ | twoBitMask melHap1.unmasked.2bit stdin \ -type=.bed melHap1.2bit twoBitToFa melHap1.2bit stdout | faSize stdin # 53017507 bases (0 N's 53017507 real 24940979 upper 28076528 lower) # in 3452 sequences in 1 files # %52.96 masked total, %52.96 masked real ln -s `pwd`/melHap1.2bit /gbdb/melHap1/melHap1.2bit ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-21 - Hiram) # numerator is melHap1 gapless bases "real" as reported by faSize # denominator is hg19 gapless bases as reported by featureBits, # featureBits hg19 gap # 1024 is threshold used for human -repMatch: calc \( 53017507 / 2897310462 \) \* 1024 # ( 53017507 / 2897310462 ) * 1024 = 18.738043 # 18 is way too small, use 100 to keep the number of overused # 11-mers to a smaller number cd /hive/data/genomes/melHap1 blat melHap1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=melHap1.11.ooc -repMatch=50 # Wrote 6149 overused 11-mers to melHap1.11.ooc mkdir /hive/data/staging/data/melHap1 cp -p melHap1.2bit chrom.sizes melHap1.11.ooc \ /hive/data/staging/data/melHap1 ######################################################################### # SWAP LASTZ ce9 (DONE - 2010-09-23 - Hiram) # original alignment cd /hive/data/genomes/ce9/bed/blastzMelHap1.2010-09-22 cat fb.ce9.chainMelHap1Link.txt # 3933301 bases of 100286004 (3.922%) in intersection # running swap mkdir /hive/data/genomes/melHap1/bed/blastz.ce9.swap cd /hive/data/genomes/melHap1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzMelHap1.2010-09-22/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 & # real 0m23.154s cat fb.melHap1.chainCe9Link.txt # 3631046 bases of 53017507 (6.849%) in intersection ######################################################################### # GENBANK AUTO UPDATE (DONE - 2010-09-27 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add melHap1 just before caeJap2 # melHap1 (Meloidogyne hapla) melHap1.serverGenome = /hive/data/genomes/melHap1/melHap1.2bit melHap1.clusterGenome = /scratch/data/melHap1/melHap1.2bit melHap1.ooc = /scratch/data/melHap1/melHap1.11.ooc melHap1.lift = no melHap1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} melHap1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} melHap1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} melHap1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} melHap1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} melHap1.refseq.mrna.native.load = yes melHap1.refseq.mrna.xeno.load = yes melHap1.refseq.mrna.xeno.loadDesc = yes melHap1.genbank.mrna.xeno.load = yes melHap1.genbank.est.native.load = yes melHap1.genbank.est.native.loadDesc = no melHap1.downloadDir = melHap1 melHap1.perChromTables = no git commit -m "Added melHap1 M. hapla WS210" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial melHap1 & # logFile: var/build/logs/2010.09.27-13:30:40.melHap1.initalign.log # real 119m17.816s # this had a failure during the fetch: # Exception: process signaled 9: "gbAlignGet # -workdir=work/initial.melHap1/align -orgCats=xeno,native -polyASizes # -fasize=4194304 -noMigrate genbank.179.0 full mrna melHap1" # remove the work/initial.melHap1 directory and start over: time nice -n +19 bin/gbAlignStep -initial melHap1 & # logFile: var/build/logs/2010.09.28-10:10:02.melHap1.initalign.log # real 201m42.832s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad melHap1 # logFile: var/dbload/hgwdev/logs/2010.09.28-14:33:02.dbload.log # real 20m26.836s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add melHap1 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added melHap1" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("melHap1", "blat5", "17788", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("melHap1", "blat5", "17789", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # reset default position to ZC101.2e protein blat result ssh hgwdev hgsql -e 'update dbDb set defaultPos="MhA1_Contig210:36435-59955" where name="melHap1";' hgcentraltest ############################################################################ # ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-08 - Hiram) cd /hive/data/genomes/melHap1 mkdir blastDb twoBitToFa melHap1.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 3452 pieces of 3452 written rm temp.fa cd blastDb for i in *.fa do /scratch/data/blast-2.2.11/bin/formatdb -i $i -p F done rm *.fa ## create the query protein set mkdir -p /hive/data/genomes/melHap1/bed/tblastn.ce9SG cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG echo /hive/data/genomes/melHap1/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 3452 query.lst # we want around 50000 jobs calc `wc /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\) # 28103/(50000/3452) = 1940.231120 mkdir -p sgfa split -l 1900 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir -p blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh DB=melHap1 BLASTMAT=/scratch/data/blast-2.2.11/data SCR="/scratch/tmp/${DB}" g=`basename $2` D=`basename $1` export BLASTMAT DB SCR g D mkdir -p ${SCR} cp -p $1.* ${SCR} f=${SCR}/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \ -e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/x86_64/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 \ /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \ /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh swarm cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG gensub2 query.lst sg.lst template jobList para create jobList para try, check, push, check etc. # Completed: 51780 of 51780 jobs # CPU time in finished jobs: 1643951s 27399.18m 456.65h 19.03d 0.052 y # IO & Wait Time: 283698s 4728.31m 78.81h 3.28d 0.009 y # Average job time: 37s 0.62m 0.01h 0.00d # Longest finished job: 127s 2.12m 0.04h 0.00d # Submission to last job: 65982s 1099.70m 18.33h 0.76d # do the cluster run for chaining ssh swarm mkdir /hive/data/genomes/melHap1/bed/tblastn.ce9SG/chainRun cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne #!/bin/csh -fe cd $1 set b = $1:t cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \ /hive/data/genomes/melHap1/bed/tblastn.ce9SG/blastOut/c.$b.psl '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /hive/data/genomes/melHap1/bed/tblastn.ce9SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG/chainRun para create jobList para -maxJob=30 push para try, check, push, check etc. # Completed: 15 of 15 jobs # CPU time in finished jobs: 47s 0.79m 0.01h 0.00d 0.000 y # IO & Wait Time: 966s 16.09m 0.27h 0.01d 0.000 y # Average job time: 68s 1.13m 0.02h 0.00d # Longest finished job: 90s 1.50m 0.03h 0.00d # Submission to last job: 257s 4.28m 0.07h 0.00d cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > /hive/data/genomes/melHap1/bed/tblastn.ce9SG/blastCe9SG.psl cd .. pslCheck blastCe9SG.psl # checked: 14315 failed: 0 errors: 0 # load table ssh hgwdev cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG hgLoadPsl melHap1 blastCe9SG.psl # check coverage featureBits melHap1 blastCe9SG # 4376245 bases of 53017507 (8.254%) in intersection featureBits melInc1 blastCe9SG # 3882043 bases of 82095019 (4.729%) in intersection featureBits haeCon1 blastCe9SG # 4990746 bases of 278844984 (1.790%) in intersection featureBits priPac2 blastCe9SG # 5436779 bases of 133634773 (4.068%) in intersection featureBits bruMal1 blastCe9SG # 4424694 bases of 89235536 (4.958%) in intersection featureBits caeJap3 blastCe9SG # 12894398 bases of 154057934 (8.370%) in intersection featureBits caePb2 blastCe9SG # 23730009 bases of 170473138 (13.920%) in intersection featureBits caeRem3 blastCe9SG # 20302540 bases of 138406388 (14.669%) in intersection featureBits cb3 blastCe9SG # 18490367 bases of 108433446 (17.052%) in intersection featureBits ce9 sangerGene # 28689552 bases of 100286004 (28.608%) in intersection rm -rf blastOut ######################################################################### # verify all.joiner has everything (DONE - 2010-10-25 - Hiram) # edit all.joiner until all these commands are successful cd $HOME/kent/src/hg/makeDb/schema joinerCheck -tableCoverage -database=melHap1 ./all.joiner joinerCheck -keys -database=melHap1 ./all.joiner joinerCheck -times -database=melHap1 ./all.joiner joinerCheck -all -database=melHap1 ./all.joiner # the -all command will complain about other databases on hgwdev # that are not specified in all.joiner. There are a lot of test # databases on hgwdev ######################################################################### # construct downloads files (DONE - 2010-10-25 - Hiram) cd /hive/data/genomes/melHap1 makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 melHap1 \ > downloads.log 2>&1 ######################################################################### ## Creating pushQ (DONE - 2010-10-25 - Hiram) ssh hgwdev mkdir /hive/data/genomes/melHap1/pushQ cd /hive/data/genomes/melHap1/pushQ makePushQSql.pl melHap1 > melHap1.sql 2> errorLog.out ## check the errorLog.out for anything that needs to be fixed ## copy melHap1.sql to hgwbeta:/tmp ## then on hgwbeta: hgsql qapushq < melHap1.sql #######################################################################