# Nematostella vectensis DOE JGI Nemve1 # $Id: nemVec1.txt,v 1.7 2008/08/11 23:51:00 kord Exp $ # $Source: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/nemVec1.txt,v $ ####################################################################### # CREATE BUILD DIRECTORY (DONE 2008-Jul-01 - Kord) # what store? ssh hgwdev df -h # Filesystem Size Used Avail Use% Mounted on # /dev/sda3 9.7G 5.9G 3.3G 65% / # /dev/sda1 99M 36M 58M 39% /boot # none 16G 0 16G 0% /dev/shm # /dev/sda5 20G 15G 3.5G 81% /scratch # /dev/sdd1 1.5T 626G 767G 45% /data # /dev/sde1 2.0T 1.8T 120G 94% /data/mysql # kkhome-10:/export/cluster/home # 766G 415G 291G 59% /cluster/home # ... # kkstore06-10:/export/cluster/store3 # 2.3T 1.9T 302G 87% /cluster/store3 # ... ssh kkstore06 mkdir /cluster/store3/nemVec1 ln -s /cluster/store3/nemVec1 /cluster/data/nemVec1 ####################################################################### # DOWNLOAD THE ASSEMBLY (DONE - 2008-Jul-05 - Kord) ssh kkstore06 cd /cluster/data/nemVec1 mkdir jkStuff mkdir bed mkdir /cluster/data/nemVec1/downloads cd /cluster/data/nemVec1/downloads # GenBank WGS sequences and Scaffolds: # http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=nucleotide&val=ABAV01000000 # WGS ABAV01000001-ABAV01059149 # WGS_SCAFLD DS469507-DS480310 # JGI supplied contigs # http://genome.jgi-psf.org/Nemve1/Nemve1.download.ftp.html # Unmasked wget --timestamp \ ftp://ftp.jgi-psf.org/pub/JGI_data/Nematostella_vectensis/v1.0/assembly/Nemve1.fasta.gz faSize Nemve1.fasta.gz # 356613585 bases (59221913 N's 297391672 real 297391672 upper 0 lower) in 10804 sequences in 1 files # Total size: mean 33007.6 sd 149445.1 min 626 (scaffold_6071) max 3256212 (scaffold_1) median 6708 # N count: mean 5481.5 sd 10915.3 # U count: mean 27526.1 sd 141670.8 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real # Masked wget --timestamp \ ftp://ftp.jgi-psf.org/pub/JGI_data/Nematostella_vectensis/v1.0/assembly/Nemve1.allmasked.gz faSize Nemve1.allmasked.gz # 356613585 
bases (59221913 N's 297391672 real 216553990 upper 80837682 lower) in 10804 sequences in 1 files # Total size: mean 33007.6 sd 149445.1 min 626 (scaffold_6071) max 3256212 (scaffold_1) median 6708 # N count: mean 5481.5 sd 10915.3 # U count: mean 20043.9 sd 107371.3 # L count: mean 7482.2 sd 35162.7 # %22.67 masked total, %27.18 masked real # Tandemly repeated wget --timestamp \ ftp://ftp.jgi-psf.org/pub/JGI_data/Nematostella_vectensis/annotation/v1.0/NvTRjug.fasta.gz faSize NvTRjug.fasta.gz # 14414 bases (0 N's 14414 real 14414 upper 0 lower) in 10 sequences in 1 files # Total size: mean 1441.4 sd 2104.8 min 191 (GTGTTTGTGGTGTTTTjuggernaut) max 7162 (AAAAAAAAATCGAACAjuggernaut) median 786 # N count: mean 0.0 sd 0.0 # U count: mean 1441.4 sd 2104.8 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real ####################################################################### # run the makeGenomeDb procedure to create the db and unmasked sequence # (DONE - 2008-Jul-14 - Kord) # create a new genomeClade? # Nope - Jim doesn't want one-browser clades in the menu # So, it's into "other" for Nematostella until Hydra magnipapillata is done # generate a fake AGP for use in downstream tools ssh kkstore06 cd /cluster/data/nemVec1/downloads hgFakeAgp -minContigGap=50 Nemve1.fasta.gz nemVec1.agp cat << _EOF_ > nemVec1.config.ra # Config parameters for makeGenomeDb.pl: db nemVec1 # new clade? cnidarian clade other genomeCladePriority 18 scientificName Nematostella vectensis commonName Starlet sea anemone assemblyDate Jun. 
2007 assemblyLabel US DOE JGI-PGF Nemve1 orderKey 227 mitoAcc none fastaFiles /cluster/data/nemVec1/downloads/Nemve1.fasta.gz agpFiles /cluster/data/nemVec1/downloads/nemVec1.agp # qualFiles /dev/null dbDbSpeciesDir other _EOF_ # run just to the AGP to make sure things are sane first nice time -p makeGenomeDb.pl nemVec1.config.ra -stop agp \ |tee -a jkStuff/makeGenomeDb.ajp 2>&1 # now, continuing to make the Db and all time -p nice -n +19 makeGenomeDb.pl nemVec1.config.ra -continue db \ >jkStuff/makeGenomeDb.log 2>&1 # real 223.58 # user 0.20 # sys 0.16 cd /cluster/data/nemVec1/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/other/nemVec1 # Updated the html # description.html /cluster/data/nemVec1/html/{trackDb.ra,gap.html,gold.html} cd ../.. # (to trackDb/) and # - edit makefile to add nemVec1 to DBS. # - (if necessary) cvs add other # - cvs add other/nemVec1 # - cvs add other/nemVec1/*.{ra,html} # - cvs ci -m "Added nemVec1 to DBS." makefile # - cvs ci -m "Initial descriptions for nemVec1." other/nemVec1 # - (if necessary) cvs ci other # - Run make update DBS=nemVec1 and make alpha when done. # - (optional) Clean up /cluster/data/nemVec1/TemporaryTrackDbCheckout # - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there. 
####################################################################### # REPEATMASKER (DONE - 2008-Jul-16 - Kord) ssh kkstore06 screen mkdir /cluster/data/nemVec1/bed/repeatMasker cd /cluster/data/nemVec1/bed/repeatMasker time -p nice -n +19 doRepeatMasker.pl -bigClusterHub=pk \ -buildDir=/cluster/data/nemVec1/bed/repeatMasker nemVec1 > do.log 2>&1 & # versions from the do.log # grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker # Jun 13 2008 (open-3-2-5) version of RepeatMasker # grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl # CC RELEASE 20080611; ssh pk cd /cluster/data/nemVec1/bed/repeatMasker/run.cluster para time > para.time 2>&1 exit ssh kkstore06 cd /cluster/data/nemVec1/bed/repeatMasker cat run.cluster/para.time # 879 jobs in batch # 29017 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # Completed: 879 of 879 jobs # CPU time in finished jobs: 3593738s 59895.63m 998.26h 41.59d 0.114 y # IO & Wait Time: 44003s 733.38m 12.22h 0.51d 0.001 y # Average job time: 4138s 68.97m 1.15h 0.05d # Longest finished job: 9437s 157.28m 2.62h 0.11d # Submission to last job: 75500s 1258.33m 20.97h 0.87d # Estimated complete: 0s 0.00m 0.00h 0.00d # continue the script after fixing crashed and failed cluster jobs date > run.cluster/run.time time -p nice -n +19 doRepeatMasker.pl -bigClusterHub=pk \ -buildDir=/cluster/data/nemVec1/bed/repeatMasker \ -continue cat nemVec1 > do-cat.log 2>&1 & # *** All done! # *** Steps were performed in /cluster/data/nemVec1/bed/repeatMasker # *** Please check the log file to see if hgLoadOut had warnings that we # should pass on to Arian and Robert. # *** Please also spot-check some repeats in the browser, run twoBitToFa on # a portion of nemVec1.2bit to make sure there's some masking, etc. 
####################################################################### # SIMPLE REPEATS (DONE - 2008-Jul-16 - Kord) ssh kkstore06 screen ssh memk parasol status # CPUs free: 20 # CPUs busy: 0 # CPUs dead: 8 # Jobs running: 0 # Jobs waiting: 0 # Jobs finished: 6334 # Jobs crashed: 88 # Spokes free: 30 # Spokes busy: 0 # Spokes dead: 0 # Active batches: 0 # Total batches: 26 # Active users: 0 # Total users: 7 # Days up: 27.866007 # Version: 11 mkdir /cluster/data/nemVec1/bed/simpleRepeat cd /cluster/data/nemVec1/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -smallClusterHub=memk \ -buildDir=/cluster/data/nemVec1/bed/simpleRepeat nemVec1 > do.log 2>&1 & # generated a permissions error on mkdir /san/sanvol1 # So, I'll run it on kk with the new parasol version (12.08) time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kk \ -buildDir=/cluster/data/nemVec1/bed/simpleRepeat nemVec1 > do.log 2>&1 & #real 342m39.774s wc -l /san/sanvol1/scratch/nemVec1/TrfPart/00*/00*.lst # 24 /san/sanvol1/scratch/nemVec1/TrfPart/000/000.lst # 40 /san/sanvol1/scratch/nemVec1/TrfPart/001/001.lst # 62 /san/sanvol1/scratch/nemVec1/TrfPart/002/002.lst # 102 /san/sanvol1/scratch/nemVec1/TrfPart/003/003.lst # 201 /san/sanvol1/scratch/nemVec1/TrfPart/004/004.lst # 1016 /san/sanvol1/scratch/nemVec1/TrfPart/005/005.lst # 5428 /san/sanvol1/scratch/nemVec1/TrfPart/006/006.lst # 3931 /san/sanvol1/scratch/nemVec1/TrfPart/007/007.lst # 10804 total ssh kk cd /cluster/data/nemVec1/bed/simpleRepeat/run.cluster para time > para.time 2>&1 exit cd /cluster/data/nemVec1/bed/simpleRepeat cat run.cluster/para.time # 8 jobs in batch # 2 jobs (including everybody's) in Parasol queue or running. 
# Checking finished jobs # Completed: 7 of 8 jobs # Crashed: 1 jobs # CPU time in finished jobs: 11930s 198.84m 3.31h 0.14d 0.000 y # IO & Wait Time: 9131s 152.18m 2.54h 0.11d 0.000 y # Average job time: 3009s 50.15m 0.84h 0.03d # Longest finished job: 9775s 162.92m 2.72h 0.11d # Submission to last job: 20268s 337.80m 5.63h 0.23d # Estimated complete: 0s 0.00m 0.00h 0.00d ssh kk cd /cluster/data/nemVec1/bed/simpleRepeat/run.cluster para crashed # 8 jobs in batch # 2 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # ./TrfRun.csh /san/sanvol1/scratch/nemVec1/TrfPart/003/003.lst.bed # generate new list cp para.time para.time.orig cp jobList jobList.orig # remove the other jobs from the list and recreate batch rm para.bookmark para make jobList # pick up where we left off... time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kk \ -continue trf \ -buildDir=/cluster/data/nemVec1/bed/simpleRepeat nemVec1 > do3.log 2>&1 & ssh pk cd /cluster/data/nemVec1/bed/simpleRepeat/run.cluster para time # 8 jobs in batch # 0 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # Completed: 8 of 8 jobs # CPU time in finished jobs: 13551s 225.86m 3.76h 0.16d 0.000 y # IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y # Average job time: 1692s 28.20m 0.47h 0.02d # Longest finished job: 2353s 39.22m 0.65h 0.03d # Submission to last job: 2357s 39.28m 0.65h 0.03d # Estimated complete: 0s 0.00m 0.00h 0.00d # pick up where we left off... the right way... 
time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kk \ -continue filter \ -buildDir=/cluster/data/nemVec1/bed/simpleRepeat nemVec1 > do4.log 2>&1 & ####################################################################### # MASK SEQUENCE WITH RM+TRF (DONE - 2008-Jul-16 - Kord) ssh kolossus cd /cluster/data/nemVec1 twoBitMask nemVec1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed nemVec1.2bit # ignoring extra BED column warning twoBitToFa nemVec1.2bit stdout | faSize stdin # 356613585 bases (59221913 N's 297391672 real 257024600 upper 40367072 lower) in 10804 sequences in 1 files # Total size: mean 33007.6 sd 149445.1 min 626 (scaffold_6071) max 3256212 (scaffold_1) median 6708 # N count: mean 5481.5 sd 10915.3 # U count: mean 23789.8 sd 125342.3 # L count: mean 3736.3 sd 17397.9 # %11.32 masked total, %13.57 masked real # set the symlink on hgwdev to /gbdb/nemVec1 ssh hgwdev cd /gbdb/nemVec1 ln -s /cluster/data/nemVec1/nemVec1.2bit . ####################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2008-Aug-01 - Kord) ssh kkstore06 cd /cluster/data/nemVec1 # Human uses 1024 # nemVec1 is 0.357/3.1 = 0.115 human # 0.115 * 1024 = 118 (fr2.txt uses 128) blat nemVec1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=128 # Wrote 15776 overused 11-mers to jkStuff/11.ooc # As a note... # A previous run with -repMatch=300 # Wrote 1377 overused 11-mers to jkStuff/11.ooc # copy all of this stuff to the Iservers ssh kkr1u00 mkdir -p /iscratch/i/nemVec1 cd /iscratch/i/nemVec1 cp -p /cluster/data/nemVec1/nemVec1.2bit . cp -p /cluster/data/nemVec1/jkStuff/11.ooc . cp -p /cluster/data/nemVec1/chrom.sizes . 
exit cp -p jkStuff/11.ooc /san/sanvol1/scratch/nemVec1 ####################################################################### # GENBANK AUTO-UPDATE ( DONE - 2008-Aug-11 - Kord ) # align with latest genbank process ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add nemVec1 # nemVec1 (Starlet Sea Anemone) nemVec1.serverGenome = /cluster/data/nemVec1/nemVec1.2bit nemVec1.clusterGenome = /san/sanvol1/scratch/nemVec1/nemVec1.2bit nemVec1.ooc = /san/sanvol1/scratch/nemVec1/11.ooc # pslCDnaFilter lines should have ordered instead of finished # (only human and mouse are finished) nemVec1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} nemVec1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} nemVec1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} nemVec1.maxIntron = 100000 nemVec1.lift = no # Per Mark: only load if >10^3 available # check /cluster/data/genbank/data/organism.lst # organism mrnaCnt estCnt refSeqCnt # Nematostella vectensis 147 163314 0 nemVec1.genbank.est.native.load = yes nemVec1.genbank.est.xeno.load = no nemVec1.genbank.mrna.native.load = no nemVec1.genbank.mrna.xeno.load = yes nemVec1.refseq.mrna.xeno.load = yes nemVec1.refseq.mrna.native.load = no nemVec1.perChromTables = no nemVec1.downloadDir = nemVec1 cvs com -m "Added nemVec1" etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial nemVec1 & # var/build/logs/2008.08.01-16\:40\:20.nemVec1.initalign.log # genbank 2008.08.01-17:13:29 nemVec1.initalign: command: ./bin/gbAlignStep -initial nemVec1 # no species defined for database "nemVec1"; edit gbGenome.c to add definition # update gbGenome.c ssh hgwdev cd ~/kent/src/hg/makeDb/genbank/ cvsup vi src/lib/gbGenome.c # added: # static char *strPurNames[] = {"Strongylocentrotus purpuratus", NULL}; # {"nemVec", 
nemVecNames}, cvs com -m "nemVec" gbGenome.c make install-server exit # delete previous failed run rm -rf /cluster/data/genbank/build/work/initial.nemVec1/align time nice -n +19 ./bin/gbAlignStep -initial nemVec1 & # logFile: var/build/logs/2008.08.08-15:13:58.nemVec1.initalign.log # real 51m43.328s # user 37m44.638s # sys 11m55.862s # updated job database on disk # total sick machines: 7 failures: 25 # Sick batch! Correct problem and then run para clearSickNodes. # command failed: ssh -x pk cd /cluster/data/genbank/build/work/initial.nemVec1/align\; para make # -maxPush=200000 align.jobs = sick batch threshold (25) # Checking finished jobs # unsubmitted jobs: 35835 # crashed: 32 # total jobs in batch: 41202 # total sick machines: 7 failures: 25 # Sick batch! Correct problem and then run para clearSickNodes. # It looks like all are sick nodes para problems | egrep failure | awk '{print $3}' | sort | uniq -c # 32 crash para try # 41202 jobs in batch # 21559 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # updated job database on disk # Pushed Jobs: 10 # Can't open /iscratch/i/nemVec1/nemVec1.2bit to read: No such file or directory para problem | egrep 2bit | egrep Can | wc -l # 41202 jobs in batch # 20652 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # 52 ssh hgwdev ls -l /iscratch/i/nemVec1/nemVec1.2bit # -rw-rw-r-- 1 kord protein 91277763 Jul 16 21:25 /iscratch/i/nemVec1/nemVec1.2bit # rsync to the iservers - this takes a long time these days # iservers not available? 
# Using /san/sanvol1/scratch/nemVec1/ cp /cluster/data/nemVec1/nemVec1.2bit /san/sanvol1/scratch/nemVec1/ cp /cluster/data/nemVec1/jkStuff/11.ooc /san/sanvol1/scratch/nemVec1/ ssh kkr10u26.kilokluster.ucsc.edu "ls -l /san/sanvol1/scratch/nemVec1/" total 89708 # -rw-rw-r-- 1 kord protein 63112 Aug 11 10:10 11.ooc # drwxrwxr-x 4 kord protein 38912 Jul 16 10:29 RMPart # drwxrwxr-x 4 kord protein 2048 Jul 16 21:01 TrfPart # -rw-rw-r-- 1 kord protein 91277763 Aug 11 10:10 nemVec1.2bit # Updated kent/src/hg/makeDb/genbank/etc/genbank.conf with new paths (and updated # doc above) cd /cluster/home/kord/kent/src/hg/makeDb/genbank vi etc/genbank.conf cvs com -m "updated nemVec1 2bit and ooc path" etc/genbank.conf make etc-update ssh genbank screen cd /cluster/data/genbank egrep nemVec etc/genbank.conf # nemVec1 (Starlet Sea Anemone) # nemVec1.serverGenome = /cluster/data/nemVec1/nemVec1.2bit # nemVec1.clusterGenome = /san/sanvol1/scratch/nemVec1/nemVec1.2bit # nemVec1.ooc = /san/sanvol1/scratch/nemVec1/11.ooc # ... rm -rf /cluster/data/genbank/build/work/initial.nemVec1/align time nice -n +19 ./bin/gbAlignStep -initial nemVec1 & # logFile: var/build/logs/2008.08.11-10:15:07.nemVec1.initalign.log # real 52m22.977s # user 38m37.753s # sys 12m28.625s # updated job database on disk # total sick machines: 3 failures: 10 # Sick batch! Correct problem and then run para clearSickNodes. # command failed: ssh -x pk cd /cluster/data/genbank/build/work/initial.nemVec1/align\; para make -maxPush=200000 align.jobs /dev/null; if [ $? == 0 ]; then echo $f >> genome.list fi done wc -l genome.list # 157 genome.list # use kk ssh kk parasol status # CPUs total: 62 # CPUs free: 62 # CPUs busy: 0 # Nodes total: 31 # ... 
cd /cluster/data/nemVec1/bed/genscan cat << _EOF_ > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP _EOF_ # ended up using pk instead of kk ssh pk /parasol/bin/gensub2 genome.list single gsub jobList para create jobList # Checking input files # ....... # 157 jobs written to /cluster/store3/nemVec1/bed/genscan/batch para try # 157 jobs in batch # 218392 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # updated job database on disk # Pushed Jobs: 10 para check # 157 jobs in batch # 218209 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # unsubmitted jobs: 147 # ranOk: 10 # total jobs in batch: 157 para push # 157 jobs in batch # 218193 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # ....... # updated job database on disk # Pushed Jobs: 147 para check # 157 jobs in batch # 217861 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # ...................... # ranOk: 157 # total jobs in batch: 157 para time > para.time 2>&1 cat para.time # 157 jobs in batch # 217819 jobs (including everybody's) in Parasol queue or running. 
# Checking finished jobs # Completed: 157 of 157 jobs # CPU time in finished jobs: 8635s 143.92m 2.40h 0.10d 0.000 y # IO & Wait Time: 499s 8.31m 0.14h 0.01d 0.000 y # Average job time: 58s 0.97m 0.02h 0.00d # Longest finished job: 101s 1.68m 0.03h 0.00d # Submission to last job: 303s 5.05m 0.08h 0.00d # Estimated complete: 0s 0.00m 0.00h 0.00d # Concatenate ssh kkstore06 cd /cluster/data/nemVec1/bed/genscan cat gtf/*gtf > genscan.gtf cat pep/*pep > genscan.pep cat subopt/*.bed > genscanSubopt.bed # Load into the database ssh hgwdev cd /cluster/data/nemVec1/bed/genscan ldHgGene -gtf nemVec1 genscan genscan.gtf # Reading genscan.gtf # Read 30239 transcripts in 183271 lines in 1 files # 30239 groups 7812 seqs 1 sources 1 feature types # 30239 gene predictions hgPepPred nemVec1 generic genscanPep genscan.pep # Processing genscan.pep hgLoadBed nemVec1 genscanSubopt genscanSubopt.bed # Reading genscanSubopt.bed # Loaded 131648 elements of size 6 # Sorted # Creating table definition for genscanSubopt # Saving bed.tab # Loading nemVec1 featureBits nemVec1 genscan # 43681106 bases of 297398104 (14.688%) in intersection featureBits nemVec1 genscanSubopt # 22102365 bases of 297398104 (7.432%) in intersection featureBits nemVec1 genscan rmsk # 630 bases of 297398104 (0.000%) in intersection ####################################################################### # Updating the description with an image (DONE 2008-Aug-11 - Kord) # checking out the browser CVS tree ssh hgwdev cvs co browser/images # add the image cd browser/images mv ~/Nematostella_vectensis.jpg . cvs add ~/Nematostella_vectensis.jpg cvs com ~/Nematostella_vectensis.jpg # update the description page cd ~/kent/src/hg/makeDb/trackDb cvs com description.html