# for emacs: -*- mode: sh; -*- # Pristionchus pacificus # Washington University School of Medicine GSC and Sanger Institute # # $Id: priPac1.txt,v 1.16 2008/08/15 21:53:32 hiram Exp $ ########################################################################### ## Download sequence - (DONE - 2007-02-25 - Hiram) ssh kkstore02 mkdir /cluster/store5/priPac1 ln -s /cluster/store5/priPac1 /cluster/data/priPac1 mkdir /cluster/data/priPac1/downloads cd /cluster/data/priPac1/downloads wget --timestamping -np -nd \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Pristionchus_pacificus/assembly/draft/Pristionchus_pacificus-5.0/ASSEMBLY" wget --timestamping -np -nd \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Pristionchus_pacificus/assembly/draft/Pristionchus_pacificus-5.0/README" rm -f README.priPac1 mv README README.priPac1 wget --timestamping -m -np -nd \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Pristionchus_pacificus/assembly/draft/Pristionchus_pacificus-5.0/output/" ############################################################################ ## Create chrUn agp and lift files - (DONE - 2007-03-04 - Hiram ssh kkstore02 cd /cluster/data/priPac1/downloads cat << '_EOF_' > mkSuperLift.pl #!/usr/bin/env perl use strict; use warnings; my $start = 1; my $end = 1; my $itemCount = 1; my $agpItemCount = 1; my $curContig = ""; my $firstTime = 1; my $scaffoldGapSize = 1000; my $scafEnd = 1; my $superStart = 0; my $superEnd = 1; my $superLength = 0; my $chrUnSize = 174852139; my $chrUnName = "chrUn"; open (CT,">ctgPos2.tab") or die "Can not write to ctgPos2.tab"; open (AG,">priPac1.chrUn.agp") or die "Can not write to priPac1.chrUn.agp"; open (GL,">priPac1.gold.tab") or die "Can not write to gold.tab"; open (FH,'zcat supercontigs.agp.gz|') or die "can not open zcat supercontigs.agp.gz"; while (my $line=) { chomp $line; if ($line =~ m/fragment/) { my ($name, $gStart, $gEnd, $gapCounter, $gapWN, $gapLen, $frag, $yesNo) = split('\s+',$line); $end = $start + $gapLen - 1; $scafEnd += $gapLen; printf AG "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $start, $end, $agpItemCount++, $gapWN, $gapLen, $frag, $yesNo; $start = $end + 1; } else { my ($ctgName, $ctgStart, $ctgEnd, $ctgCounter, $ctgWN, $name, $cStart, $cEnd, $strand) = split('\s+',$line); my $ctgLen = $ctgEnd - $ctgStart + 1; my $cLen = $cEnd - $cStart + 1; die "not matching start, end:\n$line" if ($ctgLen != $cLen); if (!$firstTime) { if ($ctgName ne $curContig) { $superLength = $superEnd - $superStart; $end = $start + $scaffoldGapSize - 1; printf AG "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", $start, $end, $agpItemCount++, $scaffoldGapSize; printf "%d\t%s\t%d\t%s\t%d\n", $superStart, $curContig, $superLength, $chrUnName, $chrUnSize; printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n", $chrUnName, $superStart, $superEnd, $itemCount++, $curContig, $superLength; $start = $end + 1; $scafEnd = $cStart - 1; $superStart = $start - 1; } } else { $firstTime = 0; $scafEnd = 0; } $scafEnd += $ctgLen; my $fragStart = $scafEnd - $ctgLen + 1; $end = $start + $ctgLen - 1; $superEnd = $end; printf AG "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $start, $end, $agpItemCount++, $ctgWN, $ctgName, $fragStart, $scafEnd, $strand; printf CT "%s\t%d\tchrUn\t%d\t%d\tW\n", $name, $ctgLen, $start-1, $end; $start = $end + 1; $curContig = $ctgName; } } close (FH); close (AG); $superLength = $superEnd - $superStart; printf "%d\t%s\t%d\t%s\t%d\n", $superStart, $curContig, $superLength, $chrUnName, $chrUnSize; printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n", $chrUnName, $superStart, $superEnd, $itemCount++, $curContig, $superLength; close (GL); '_EOF_' # << happy emacs chmod +x ./mkSuperLift.pl ./mkSuperLift.pl > super.lift qaToQac contigs.fa.qual.gz stdout \ | qacAgpLift chrUn.agp.gz stdin chrUn.qac ########################################################################## ## Initial browser build (DONE - 2007-03-04 - Hiram) ssh kkstore02 cd /cluster/data/priPac1 cat << '_EOF_' > priPac1.config.ra # Config parameters for makeGenomeDb.pl: db priPac1 clade worm genomeCladePriority 76 scientificName Pristionchus pacificus commonName P. pacificus assemblyDate Feb. 2007 assemblyLabel Washington University School of Medicine GSC Pristionchus pacificus 5.0 orderKey 885 mitoAcc none fastaFiles /cluster/data/priPac1/downloads/supercontigs.fa.gz agpFiles /cluster/data/priPac1/downloads/priPac1.chrUn.agp qualFiles /cluster/data/priPac1/downloads/chrUn.qac dbDbSpeciesDir worm '_EOF_' # << happy emacs makeGenomeDb.pl priPac1.config.ra > makeGenomeDb.out 2>&1 ssh hgwdev cd /cluster/data/priPac1/downloads hgLoadBed -sqlTable=chrUn_gold.sql priPac1 chrUn_gold priPac1.gold.tab hgLoadSqlTab priPac1 ctgPos2 ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab ########################################################################### ## Photograph added (DONE - 2007-04-03 - Hiram) # Obtained via email from metta.riebesell@tuebingen.mpg.de Metta Riebesell # from the lab of: Ralf J. Sommer # http://www.eb.tuebingen.mpg.de/departments/4-evolutionary-biology/department-4-evolutionary-biology # http://www.eb.tuebingen.mpg.de/core-facilities/microscopy-unit/sem_pictures/scanning-electron-microscopic-pictures # Credits to Juergen Berger and Ralf J. Sommer mkdir /cluster/data/priPac1/photograph/ # copy the file from email to this directory convert -quality 100 -crop "1920x2080+320+160" \ Pristionchus_pacificus_RS312_red.jpg priPac1_crop.jpg convert -quality 80 -geometry "14%" priPac1_crop.jpg priPac1_3x3.jpg identify priPac1_3x3.jpg # priPac1_3x3.jpg JPEG 269x291 269x291+0+0 DirectClass 6e+01kb # add priPac1_3x3.jpg to the browser doc source tree, calling the # file: Pristionchus_pacificus.jpg ############################################################################ ## Default position (DONE - 2007-04-09 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrUn:66109664-66116326" where name="priPac1";' hgcentraltest ########################################################################### ## RepeatMasker (DONE - 2007-03-04 - Hiram) ssh kkstore02 mkdir /cluster/data/priPac1/bed/RepeatMasker cd /cluster/data/priPac1/bed/RepeatMasker time nice -n +19 doRepeatMasker.pl \ -buildDir=/cluster/data/priPac1/bed/RepeatMaskerpriPac1 \ > doRepeatMasker.out 2>&1 & # real 271m29.185s twoBitToFa ../../priPac1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 174852139 bases (28905831 N's 145946308 real # 143274679 upper 2671629 lower) in 1 sequences in 1 files # %1.53 masked total, %1.83 masked real ########################################################################### ## Simple Repeats (DONE - 2007-03-14 - Hiram) ## Documented 31 may 2007 ssh kolossus mkdir /cluster/data/priPac1/bed/simpleRepeat cd /cluster/data/priPac1/bed/simpleRepeat twoBitToFa ../../priPac1.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ssh hgwdev cd /cluster/data/priPac1/bed/simpleRepeat nice -n +19 hgLoadBed priPac1 simpleRepeat \ /cluster/data/priPac1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits priPac1 simpleRepeat > fb.priPac1.simpleRepeat.txt 2>&1 # 3043060 bases of 145948246 (2.085%) in intersection ############################################################################ ## Add TRF to RMSK (DONE - 2007-03-04 - Hiram) ssh kkstore02 cd /cluster/data/priPac1 cat bed/simpleRepeat/trfMask.bed \ twoBitMask -add -type=.bed priPac1.rmsk.2bit stdin priPac1.rmskTrf.2bit twoBitToFa priPac1.rmskTrf.2bit stdout | faSize stdin \ > faSize.priPac1.rmskTrf.txt # 174852139 bases (28905831 N's 145946308 real # 143130041 upper 2816267 lower) in 1 sequences in 1 files # %1.61 masked total, %1.93 masked real ############################################################################ ## BLASTZ swap ce4 (DONE - 2007-04-06 - Hiram) ssh kkstore02 cd /cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04 cat fb.ce4.chainPriPac1Link.txt # 8671813 bases of 100281244 (8.647%) in intersection mkdir /cluster/data/priPac1/bed/blastz.ce4.swap cd /cluster/data/priPac1/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04/DEF \ -bigClusterHub=pk -swap > swap.log 2>&1 & cat fb.priPac1.chainCe4Link.txt # 9730314 bases of 145948246 (6.667%) in intersection ############################################################################## ## BLASTZ caeRem2 (DONE - 2007-04-20,21 - Hiram) mkdir /cluster/data/priPac1/bed/blastz.caeRem2.2007-04-20 cd /cluster/data/priPac1/bed/blastz.caeRem2.2007-04-20 cat << '_EOF_' > DEF # priPac1 vs caeRem2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: Pristionchus pacificus priPac1, unmasked sequence SEQ1_DIR=/san/sanvol1/scratch/worms/priPac1/priPac1.unmasked.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/priPac1/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: briggsae caeRem2, 9,660 contigs, longest 5,925,111 SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/caeRem2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.sizes SEQ2_LIFT=/san/sanvol1/scratch/worms/caeRem2/caeRem2.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/priPac1/bed/blastz.caeRem2.2007-04-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=kk \ -blastzOutRoot /cluster/bluearc/priPac1CaeRem2 > do.log 2>&1 & # real 238m33.642s cat fb.priPac1.chainCaeRem2Link.txt # 9047547 bases of 145948246 (6.199%) in intersection ## swap to caeRem2 - also in caeRem2.txt mkdir /cluster/data/caeRem2/bed/blastz.priPac1.swap cd /cluster/data/caeRem2/bed/blastz.priPac1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=kk \ /cluster/data/priPac1/bed/blastz.caeRem2.2007-04-20/DEF \ -swap > swap.log 2>&1 & # less than 2 minutes cat fb.caeRem2.chainPriPac1Link.txt # 8971992 bases of 146898439 (6.108%) in intersection ########################################################################## ## SWAP BLASTZ caePb1 (DONE - 2007-04-21 - Hiram) ssh kkstore02 cd /cluster/data/caePb1/bed/blastz.priPac1.2007-04-20 cat fb.caePb1.chainPriPac1Link.txt # 13589188 bases of 175247318 (7.754%) in intersection mkdir /cluster/data/priPac1/bed/blastz.caePb1.swap cd /cluster/data/priPac1/bed/blastz.caePb1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ /cluster/data/caePb1/bed/blastz.priPac1.2007-04-20/DEF \ -swap > swap.log 2>&1 & # real 9m18.554s cat fb.priPac1.chainCaePb1Link.txt # 11713567 bases of 145948246 (8.026%) in intersection ########################################################################## ## SWAP BLASTZ cb3 (DONE - 2007-04-21 - Hiram) ssh kkstore02 cd /cluster/data/cb3/bed/blastz.priPac1.um.2007-04-21 cat fb.cb3.chainPriPac1Link.txt # 7075879 bases of 108433446 (6.526%) in intersection mkdir /cluster/data/priPac1/bed/blastz.cb3.swap cd /cluster/data/priPac1/bed/blastz.cb3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ /cluster/data/cb3/bed/blastz.priPac1.um.2007-04-21/DEF \ -swap > swap.log 2>&1 & # real 0m48.872s cat fb.priPac1.chainCb3Link.txt # 7800313 bases of 145948246 (5.345%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2007-04-23 - Hiram) # Use -repMatch=42 (based on size -- for human we use 1024, and # P. pacificus size is ~4.1% of human judging by gapless priPac1 vs. hg18 # genome sizes from featureBits. ssh kolossus cd /cluster/data/priPac1 blat priPac1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=42 # Wrote 32320 overused 11-mers to jkStuff/11.ooc cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/priPac1 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-04-23,25 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add priPac1 just before caePb1 # priPac1 (P. pacificus) priPac1.serverGenome = /cluster/data/priPac1/priPac1.2bit priPac1.clusterGenome = /iscratch/i/worms/priPac1/priPac1.2bit priPac1.ooc = /iscratch/i/worms/priPac1/11.ooc priPac1.lift = /iscratch/i/worms/priPac1/priPac1.supercontigs.lift priPac1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} priPac1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} priPac1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} priPac1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} priPac1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} priPac1.refseq.mrna.native.load = no priPac1.refseq.mrna.xeno.load = yes priPac1.refseq.mrna.xeno.loadDesc = yes priPac1.genbank.mrna.xeno.load = yes priPac1.genbank.est.native.load = yes priPac1.genbank.est.native.loadDesc = no priPac1.downloadDir = priPac1 priPac1.perChromTables = no cvs ci -m "Added priPac1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. cvs ci -m "Added P. pacificus." src/lib/gbGenome.c make install-server ssh kkstore01 cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial priPac1 & # logFile: var/build/logs/2007.07.10-17:00:50.priPac1.initalign.log # real 329m0.244s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad priPac1 # logFile: var/dbload/hgwdev/logs/2007.07.11-13:04:12.dbload.log # logFile: var/dbload/hgwdev/logs/2007.04.25-09:26:37.dbload.log # real 20m2.686s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add priPac1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added priPac1 - P. pacificus" etc/align.dbs etc/hgwdev.dbs make etc-update ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-04-23 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("priPac1", "blat11", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("priPac1", "blat11", "17787", "0", "1");' \ hgcentraltest # test it with some sequence ########################################################################## ## summarize chainLink measurements (2007-04-25 - Hiram) # org on priPac1 on other # caePb1 8.026 7.754 # ce4 6.667 8.647 # caeRem2 6.199 6.108 # cb3 5.345 6.526 ########################################################################## # Set default position to cover the 16S ribosomal RNA area ## (DONE - 2007-04-26 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrUn:64713500-64721000" where name="priPac1"' hgcentraltest ########################################################################## ## Creating downloads (DONE - 2007-05-01 - Hiram) # There is only one chrom, make its trfMaskChrom file exist ssh hgwdev mkdir /cluster/data/priPac1/bed/simpleRepeat/trfMaskChrom cd /cluster/data/priPac1/bed/simpleRepeat/trfMaskChrom ## symlink didn't work here, the symlink ended up in the tar file cp -p ../trfMask.bed ./chrUn.bed cd /cluster/data/priPac1 nice -n +19 /cluster/bin/scripts/makeDownloads.pl priPac1 ## *!* EDIT THE README.txt FILES *!* ## ## make the links to the liftOver files: cd /cluster/data/priPac1/bed/liftOver md5sum *.gz > md5sum.txt cd /usr/local/apache/htdocs/goldenPath/priPac1/liftOver ln -s /cluster/data/priPac1/bed/liftOver/* . ########################################################################## ## Creating pushQ (DONE - 2007-05-01 - Hiram) ssh hgwdev mkdir /cluster/data/priPac1/pushQ cd /cluster/data/priPac1/pushQ /cluster/bin/scripts/makePushQSql.pl priPac1 > priPac1.sql 2> stderr.out ## check the stderr.out for anything that needs to be fixed ## copy priPac1.sql to hgwbeta:/tmp ## then on hgwbeta hgsql qapushq < priPac1.sql ########################################################################### # ELEGANS (ce4) PROTEINS TRACK (DONE - Hiram - 2007-05-02) ssh kkstore02 # breaking up this target genome into manageable pieces mkdir /cluster/data/priPac1/blastDb cd /cluster/data/priPac1 twoBitToFa priPac1.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 185 pieces of 185 written rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ## copy to san for kluster access mkdir -p /san/sanvol1/scratch/worms/priPac1/blastDb cd /san/sanvol1/scratch/worms/priPac1/blastDb rsync -a --progress --stats /cluster/data/priPac1/blastDb/ . ## create the query protein set mkdir -p /cluster/data/priPac1/bed/tblastn.ce4SG cd /cluster/data/priPac1/bed/tblastn.ce4SG echo /san/sanvol1/scratch/worms/priPac1/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 185 query.lst # we want around 50000 jobs calc `wc /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\) 23192/(50000/185) = 85.810400 # round up this number a bit, and use in the split here mkdir -p /cluster/bluearc/worms/priPac1/bed/tblastn.ce4SG/sgfa split -l 100 /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl \ /cluster/bluearc/worms/priPac1/bed/tblastn.ce4SG/sgfa/sg ln -s /cluster/bluearc/worms/priPac1/bed/tblastn.ce4SG/sgfa sgfa cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir -p /cluster/bluearc/worms/priPac1/bed/tblastn.ce4SG/blastOut ln -s /cluster/bluearc/worms/priPac1/bed/tblastn.ce4SG/blastOut ./blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cd /cluster/data/priPac1/bed/tblastn.ce4SG cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/priPac1/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce4/bed/blat.ce4SG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh kk cd /cluster/data/priPac1/bed/tblastn.ce4SG gensub2 query.lst sg.lst template jobList para create jobList # para try, check, push, check etc. # Completed: 42920 of 42920 jobs # CPU time in finished jobs: 2397247s 39954.12m 665.90h 27.75d 0.076 y # IO & Wait Time: 327951s 5465.85m 91.10h 3.80d 0.010 y # Average job time: 63s 1.06m 0.02h 0.00d # Longest finished job: 238s 3.97m 0.07h 0.00d # Submission to last job: 63130s 1052.17m 17.54h 0.73d # do the cluster run for chaining ssh kk mkdir /cluster/data/priPac1/bed/tblastn.ce4SG/chainRun cd /cluster/data/priPac1/bed/tblastn.ce4SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/worms/priPac1/bed/tblastn.ce4SG/blastOut/c.`basename $1`.psl) '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /cluster/bluearc/worms/priPac1/bed/tblastn.ce4SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList para create jobList para maxNode 30 para try, check, push, check etc. # Completed: 232 of 232 jobs # CPU time in finished jobs: 45956s 765.93m 12.77h 0.53d 0.001 y # IO & Wait Time: 15391s 256.52m 4.28h 0.18d 0.000 y # Average job time: 264s 4.41m 0.07h 0.00d # Longest finished job: 13411s 223.52m 3.73h 0.16d # Submission to last job: 13415s 223.58m 3.73h 0.16d ssh kkstore02 cd /cluster/data/priPac1/bed/tblastn.ce4SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > /cluster/data/priPac1/bed/tblastn.ce4SG/blastCe4SG.psl cd .. pslCheck blastCe4SG.psl # checked: 12671 failed: 0 # load table ssh hgwdev cd /cluster/data/priPac1/bed/tblastn.ce4SG hgLoadPsl priPac1 blastCe4SG.psl # check coverage featureBits priPac1 blastCe4SG # 5617285 bases of 145948246 (3.849%) in intersection featureBits caePb1 blastCe4SG # 22988044 bases of 175247318 (13.117%) in intersection featureBits caeRem2 blastCe4SG # 19763359 bases of 146898439 (13.454%) in intersection featureBits cb3 blastCe4SG # 18218293 bases of 108433446 (16.801%) in intersection featureBits ce4 sangerGene # 27906202 bases of 100281244 (27.828%) in intersection ssh kkstore02 rm -rf /cluster/data/priPac1/bed/tblastn.ce4SG/blastOut rm -rf /cluster/bluearc/worms/priPac1/bed/tblastn.ce4SG rmdir /cluster/bluearc/worms/priPac1/bed rmdir /cluster/bluearc/worms/priPac1 #end tblastn ########################################################################### ## Window Masker (DONE - 2008-06-09 - Hiram) ssh kolossus mkdir /cluster/data/priPac1/bed/windowMasker cd /cluster/data/priPac1/bed/windowMasker time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse kolossus \ -buildDir=/cluster/data/priPac1/bed/windowMasker priPac1 > do.log 2>&1 # real 16m19.271s twoBitToFa priPac1.wmsk.sdust.2bit stdout \ | faSize stdin > priPac1.wmsk.sdust.2bit.faSize 2>&1 cat priPac1.wmsk.sdust.2bit.faSize # 174852139 bases (28905831 N's 145946308 real # 112045482 upper 33900826 lower) in 1 sequences in 1 files # %19.39 masked total, %23.23 masked real ssh hgwdev cd /cluster/data/priPac1/bed/windowMasker hgLoadBed priPac1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 869830 elements of size 3 # eliminate the gaps from the masking (WM bug) featureBits priPac1 -not gap -bed=notGap.bed # 145948246 bases of 145948246 (100.000%) in intersection time nice -n +19 featureBits priPac1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 33901913 bases of 145948246 (23.229%) in intersection # reload track to get it clean hgLoadBed priPac1 windowmaskerSdust cleanWMask.bed.gz # Loaded 867522 elements of size 4 featureBits priPac1 windowmaskerSdust # 33901913 bases of 145948246 (23.229%) in intersection ############################################################################## ## combine trf mask with windowmasker (DONE - 2008-06-09 - Hiram) ssh kkstore02 cd /cluster/data/priPac1 zcat bed/windowMasker/cleanWMask.bed.gz \ | twoBitMask priPac1.unmasked.2bit stdin \ -type=.bed priPac1.cleanWMSdust.2bit cat bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed priPac1.cleanWMSdust.2bit stdin \ priPac1.TrfWM.2bit # can safely ignore the warning about BED file >= 13 fields # check it twoBitToFa priPac1.TrfWM.2bit stdout | faSize stdin # 174852139 bases (28905831 N's 145946308 real # 112012655 upper 33933653 lower) in 1 sequences in 1 files # %19.41 masked total, %23.25 masked real ########################################################################### ## create masked contigs for blastz runs (DONE - 2008-06-09 - Hiram) ssh kkstore02 mkdir /cluster/data/priPac1/trfWM.maskedContigs cd /cluster/data/priPac1/trfWM.maskedContigs ln -s ../downloads/super.lift . ../jkStuff/lft2BitToFa.pl ../priPac1.TrfWM.2bit super.lift > super.fa faToTwoBit super.fa priPac1.TrfWM.supers.2bit faSize super.fa # 169747139 bases (23800831 N's 145946308 real # 112012655 upper 33933653 lower) in 5106 sequences in 1 files # %19.99 masked total, %23.25 masked real twoBitToFa priPac1.TrfWM.supers.2bit stdout | faSize stdin # 169747139 bases (23800831 N's 145946308 real # 112012655 upper 33933653 lower) in 5106 sequences in 1 files # %19.99 masked total, %23.25 masked real twoBitInfo priPac1.TrfWM.supers.2bit stdout \ | sort -k2rn > priPac1.TrfWM.supers.sizes ########################################################################### # ELEGANS (ce6) PROTEINS TRACK (DONE - Hiram - 2008-06-11) ssh kkstore02 # trying a better split scheme this time compared to the Ce4 run mkdir /cluster/data/priPac1/blastDb cd /cluster/data/priPac1 twoBitToFa -noMask trfWM.maskedContigs/priPac1.TrfWM.supers.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 5160 pieces of 5160 written rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ## copy to san for kluster access mkdir -p /san/sanvol1/scratch/worms/priPac1/blastDb cd /san/sanvol1/scratch/worms/priPac1/blastDb rsync -a --progress --stats /cluster/data/priPac1/blastDb/ . ## create the query protein set mkdir -p /cluster/data/priPac1/bed/tblastn.ce6SG cd /cluster/data/priPac1/bed/tblastn.ce6SG echo /san/sanvol1/scratch/worms/priPac1/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 5160 query.lst # we want around 80000 jobs calc `wc /cluster/data/ce6/bed/blat.ce6SG/ce6SG.psl | awk "{print \\\$1}"`/\(80000/`wc query.lst | awk "{print \\\$1}"`\) # 23741/(80000/5160) = 1531.294500 mkdir -p /cluster/bluearc/worms/priPac1/bed/tblastn.ce6SG/sgfa split -l 1500 /cluster/data/ce6/bed/blat.ce6SG/ce6SG.psl \ /cluster/bluearc/worms/priPac1/bed/tblastn.ce6SG/sgfa/sg ln -s /cluster/bluearc/worms/priPac1/bed/tblastn.ce6SG/sgfa sgfa cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir -p /cluster/bluearc/worms/priPac1/bed/tblastn.ce6SG/blastOut ln -s /cluster/bluearc/worms/priPac1/bed/tblastn.ce6SG/blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cd /cluster/data/priPac1/bed/tblastn.ce6SG cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh DB=priPac1 BLASTMAT=/scratch/data/blast229/data SCR="/scratch/tmp/${DB}" g=`basename $2` D=`basename $1` export BLASTMAT DB SCR g D mkdir -p ${SCR} cp -p $1.* ${SCR} f=${SCR}/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/data/blast229/blastall -M BLOSUM80 -m 0 -F no \ -e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 \ /cluster/data/${DB}/blastDb.lft carry $f.2 > /dev/null liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \ /cluster/data/ce6/bed/blat.ce6SG/protein.lft warn $f.3 > /dev/null if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh kk cd /cluster/data/priPac1/bed/tblastn.ce6SG gensub2 query.lst sg.lst template jobList para create jobList # para try, check, push, check etc. # Completed: 82560 of 82560 jobs # CPU time in finished jobs: 13104748s 218412.47m 3640.21h 151.68d 0.416 y # IO & Wait Time: 518235s 8637.25m 143.95h 6.00d 0.016 y # Average job time: 165s 2.75m 0.05h 0.00d # Longest finished job: 2798s 46.63m 0.78h 0.03d # Submission to last job: 85527s 1425.45m 23.76h 0.99d # do the cluster run for chaining ssh kk mkdir /cluster/data/priPac1/bed/tblastn.ce6SG/chainRun cd /cluster/data/priPac1/bed/tblastn.ce6SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/worms/priPac1/bed/tblastn.ce6SG/blastOut/c.`basename $1`.psl) '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /cluster/bluearc/worms/priPac1/bed/tblastn.ce6SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList cd /cluster/data/priPac1/bed/tblastn.ce6SG/chainRun para create jobList para maxNode 30 para try, check, push, check etc. # Completed: 16 of 16 jobs # CPU time in finished jobs: 558s 9.30m 0.16h 0.01d 0.000 y # IO & Wait Time: 2841s 47.35m 0.79h 0.03d 0.000 y # Average job time: 212s 3.54m 0.06h 0.00d # Longest finished job: 228s 3.80m 0.06h 0.00d # Submission to last job: 243s 4.05m 0.07h 0.00d ssh kkstore02 cd /cluster/data/priPac1/bed/tblastn.ce6SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > /cluster/data/priPac1/bed/tblastn.ce6SG/blastCe6SG.psl cd .. pslCheck blastCe6SG.psl # checked: 15089 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/priPac1/bed/tblastn.ce6SG liftUp -nohead -type=.psl stdout ../../jkStuff/priPac1.supercontigs.lift \ error blastCe6SG.psl | sort -k14,14 -k16,16n \ | hgLoadPsl -table=blastCe6SG priPac1 stdin # check coverage featureBits priPac1 blastCe6SG # 5853433 bases of 145948246 (4.011%) in intersection featureBits priPac1 blastCe4SG # 5617285 bases of 145948246 (3.849%) in intersection featureBits ce6 sangerGene # 28134889 bases of 100281426 (28.056%) in intersection ssh kkstore02 rm -rf /cluster/data/priPac1/bed/tblastn.ce6SG/blastOut rm -rf /cluster/bluearc/worms/priPac1/bed/tblastn.ce6SG ######################################################################### # SWAP LASTZ ce9 (DONR - 2010-09-23 - Hiram) # original alignment cd /hive/data/genomes/ce9/bed/blastzPriPac1.2010-09-23 cat fb.ce9.chainPriPac1Link.txt # 6251889 bases of 100286004 (6.234%) in intersection # swap, this is also in priPac1.txt mkdir /hive/data/genomes/priPac1/bed/blastz.ce9.swap cd /hive/data/genomes/priPac1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzPriPac1.2010-09-23/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk -smallClusterHub=memk \ -swap > swap.log 2>&1 & # real 0m43.464s cat fb.priPac1.chainCe9Link.txt # 6799226 bases of 145948246 (4.659%) in intersection ############################################################################ # LIFTOVER TO priPac2 (DONE - 2010-02-11 - Hiram ) mkdir /hive/data/genomes/priPac1/bed/blat.priPac2.2010-10-14 cd /hive/data/genomes/priPac1/bed/blat.priPac2.2010-10-14 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl \ -ooc=/hive/data/genomes/priPac1/jkStuff/11.ooc -debug priPac1 priPac2 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/priPac1/jkStuff/11.ooc \ priPac1 priPac2 > do.log 2>&1 # real 4m16.827s #############################################################################