# for emacs: -*- mode: sh; -*- # Caenorhabditis remanei # From Wash U GSC # http://genome.wustl.edu/genome.cgi?GENOME=Caenorhabditis+remanei # $Id: caeRem2.txt,v 1.15 2008/05/23 23:07:19 hiram Exp $ ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2007-03-16 - Hiram) ssh kkstore01 mkdir /cluster/store10/caeRem2 ln -s /cluster/store10/caeRem2 /cluster/data/caeRem2 cd /cluster/data/caeRem2 mkdir downloads cd downloads wget --timestamping \ ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_remanei/assembly/submitted/Caenorhabditis_remanei-1.0/ASSEMBLY wget --timestamping \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_remanei/assembly/submitted/Caenorhabditis_remanei-1.0/README" \ -O README.caeRem2 wget --timestamping -m -np -nd \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_remanei/assembly/submitted/Caenorhabditis_remanei-1.0/output/" ########################################################################## ## Create chrUn.agp file (DONE - 2007-03-16 - Hiram) ssh kkstore01 cd /cluster/data/caeRem2/downloads cat << '_EOF_' > mkSuperLift.pl #!/usr/bin/env perl use strict; use warnings; my $start = 1; my $end = 1; my $itemCount = 1; my $agpItemCount = 1; my $curContig = ""; my $firstTime = 1; my $scaffoldGapSize = 1000; my $scafEnd = 1; my $superStart = 0; my $superEnd = 1; my $superLength = 0; my $chrUnSize = 161944676; my $chrUnName = "chrUn"; my $ctgFragCount = 1; open (CT,">caeRem2.ctgPos2.tab") or die "Can not write to caeRem2.ctgPos2.tab"; open (SC,">caeRem2.CtgScaf.agp") or die "Can not write to caeRem2.CtgScaf.agp"; open (AG,">caeRem2.chrUn.agp") or die "Can not write to caeRem2.chrUn.agp"; open (GL,">caeRem2.gold.tab") or die "Can not write to caeRem2.gold.tab"; open (FH,'zcat supercontigs.agp.gz|') or die "can not open zcat supercontigs.agp.gz"; while (my $line=) { chomp $line; if ($line =~ m/fragment/) { my ($name, $gStart, $gEnd, $gapCounter, $gapWN, $gapLen, $frag, $yesNo) = split('\s+',$line); $end = $start + $gapLen - 1; $scafEnd += $gapLen; printf SC "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $start, $end, $agpItemCount, $gapWN, $gapLen, $frag, $yesNo; printf AG "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $start, $end, $agpItemCount++, $gapWN, $gapLen, $frag, $yesNo; $start = $end + 1; } else { my ($ctgName, $ctgStart, $ctgEnd, $ctgCounter, $ctgWN, $name, $cStart, $cEnd, $strand) = split('\s+',$line); my $ctgLen = $ctgEnd - $ctgStart + 1; my $cLen = $cEnd - $cStart + 1; die "not matching start, end:\n$line" if ($ctgLen != $cLen); if (!$firstTime) { if ($ctgName ne $curContig) { $superLength = $superEnd - $superStart; $end = $start + $scaffoldGapSize - 1; printf SC "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", $start, $end, $agpItemCount, $scaffoldGapSize; printf AG "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", $start, $end, $agpItemCount++, $scaffoldGapSize; printf "%d\t%s\t%d\t%s\t%d\n", $superStart, $curContig, $superLength, $chrUnName, $chrUnSize; printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n", $chrUnName, $superStart, $superEnd, $itemCount++, $curContig, $superLength; $start = $end + 1; $scafEnd = $cStart - 1; $superStart = $start - 1; $ctgFragCount = 1; } } else { $firstTime = 0; $scafEnd = 0; } $scafEnd += $ctgLen; my $fragStart = $scafEnd - $ctgLen + 1; $end = $start + $ctgLen - 1; $superEnd = $end; my $ctgFragName = sprintf("%s.%d", $ctgName, $ctgFragCount++); my $ctgFragStart = 1; my $ctgFragEnd = $scafEnd - $fragStart + 1; printf SC "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $start, $end, $agpItemCount, $ctgWN, $ctgFragName, $ctgFragStart, $ctgFragEnd, $strand; printf AG "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $start, $end, $agpItemCount++, $ctgWN, $ctgName, $fragStart, $scafEnd, $strand; printf CT "%s\t%d\tchrUn\t%d\t%d\tW\n", $name, $ctgLen, $start-1, $end; $start = $end + 1; $curContig = $ctgName; } } close (FH); close (AG); close (SC); $superLength = $superEnd - $superStart; printf "%d\t%s\t%d\t%s\t%d\n", $superStart, $curContig, $superLength, $chrUnName, $chrUnSize; printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n", $chrUnName, $superStart, $superEnd, $itemCount++, $curContig, $superLength; close (GL); '_EOF_' # << happy emacs chmod +x mkSuperLift.pl ./mkSuperLift.pl > caeRem2.chrUn.lift qaToQac contigs.fa.qual.gz stdout \ | qacAgpLift caeRem2.CtgScaf.agp stdin chrUn.qac ########################################################################### ## Initial genome build (DONE - 2007-03-29 - Hiram) ssh kkstore01 cd /cluster/data/caeRem2 cat << '_EOF_' > caeRem2.config.ra # Config parameters for makeGenomeDb.pl: db caeRem2 clade worm genomeCladePriority 10 scientificName Caenorhabditis remanei commonName C. remanei assemblyDate Mar. 2006 assemblyLabel Washington University School of Medicine GSC C. remanei 1.0 orderKey 879 mitoAcc none fastaFiles /cluster/data/caeRem2/downloads/supercontigs.fa.gz agpFiles /cluster/data/caeRem2/downloads/caeRem2.chrUn.agp # qualFiles /cluster/data/caeRem2/downloads/chrUn.qac dbDbSpeciesDir worm '_EOF_' # << happy emacs time nice -n +19 makeGenomeDb.pl caeRem2.config.ra \ > makeGenomeDb.out 2>&1 ssh hgwdev cd /cluster/data/caeRem2/downloads sed -e "s/agpFrag/chrUn_gold/" $HOME/kent/src/hg/lib/agpFrag.sql \ > chrUn_gold.sql # edit that .sql file to add the bin column # bin smallint(6) NOT NULL default '0', hgLoadBed -sqlTable=chrUn_gold.sql caeRem2 chrUn_gold caeRem2.gold.tab hgLoadSqlTab caeRem2 ctgPos2 ~/kent/src/hg/lib/ctgPos2.sql \ caeRem2.ctgPos2.tab ########################################################################### ## Repeat Masker (DONE - 2007-03-29 - Hiram) ssh kkstore01 mkdir /cluster/data/caeRem2/bed/RepeatMasker cd /cluster/data/caeRem2/bed/RepeatMasker time nice -n +19 doRepeatMasker.pl caeRem2 \ -buildDir=/cluster/data/caeRem2/bed/RepeatMasker > do.log 2>&1 cd /cluster/data/caeRem2 twoBitToFa caeRem2.rmsk.2bit stdout | faSize stdin \ > caeRem2.rmsk.2bit.faSize cat caeRem2.rmsk.2bit.faSize # 161944676 bases (15087728 N's 146856948 real # 145425741 upper 1431207 lower) in 1 sequences in 1 files # %0.88 masked total, %0.97 masked real ln -s caeRem2.rmsk.2bit caeRem2.2bit mkdir /san/sanvol1/scratch/worms/caeRem2 cp -p caeRem2.2bit /san/sanvol1/scratch/worms/caeRem2 cp -p chrom.sizes /san/sanvol1/scratch/worms/caeRem2 cp -p downloads/caeRem2.chrUn.lift /san/sanvol1/scratch/worms/caeRem2 ########################################################################### ## Window Masker (DONE - 2007-03-29 - Hiram) ssh kolossus mkdir /cluster/data/caeRem2/bed/WindowMasker cd /cluster/data/caeRem2/bed/WindowMasker time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse kolossus \ -buildDir=/cluster/data/caeRem2/bed/WindowMasker caeRem2 > do.log 2>&1 & # real 12m19.926s twoBitToFa caeRem2.wmsk.sdust.2bit stdout \ | faSize stdin > caeRem2.wmsk.sdust.2bit.faSize 2>&1 cat caeRem2.wmsk.sdust.2bit.faSize # 161944676 bases (15087728 N's 146856948 real # 100232042 upper 46624906 lower) in 1 sequences in 1 files # %28.79 masked total, %31.75 masked real ssh hgwdev cd /cluster/data/caeRem2/bed/WindowMasker hgLoadBed -strict caeRem2 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 1122372 elements of size 3 ########################################################################### ## Simple Repeats (DONE - 2007-03-29 - Hiram) ssh titan mkdir /cluster/data/caeRem2/bed/simpleRepeat cd /cluster/data/caeRem2/bed/simpleRepeat time twoBitToFa ../../caeRem2.unmasked.2bit stdout \ | nice -n +19 trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp > do.out 2>&1 awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed # real 71m21.810s ssh hgwdev cd /cluster/data/caeRem2/bed/simpleRepeat nice -n +19 hgLoadBed caeRem2 simpleRepeat \ simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 37049 elements of size 16 featureBits caeRem2 simpleRepeat > fb.caeRem2.simpleRepeat.txt 2>&1 # 8604824 bases of 146898439 (5.858%) in intersection ssh kkstore01 cd /cluster/data/caeRem2 cat bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed caeRem2.rmsk.2bit stdin \ caeRem2.rmskTrf.2bit twoBitToFa caeRem2.rmskTrf.2bit stdout | faSize stdin \ > faSize.caeRem2.rmskTrf.txt cat faSize.caeRem2.rmskTrf.txt # 161944676 bases (15087728 N's 146856948 real # 145166254 upper 1690694 lower) in 1 sequences in 1 files # %1.04 masked total, %1.15 masked real ########################################################################### ### prepare contig 2bit file for blastz runs ssh kkstore01 mkdir /cluster/data/caeRem2/maskedContigs cd /cluster/data/caeRem2/maskedContigs ln -s ../downloads/caeRem2.chrUn.lift . ~/kent/src/hg/utils/lft2BitToFa.pl ../caeRem2.2bit \ caeRem2.chrUn.lift > caeRem2.contigs.fa faToTwoBit caeRem2.contigs.fa caeRem2.contigs.2bit twoBitInfo caeRem2.contigs.2bit stdout | sort -k2nr > caeRem2.contigs.sizes cp -p caeRem2.contigs.2bit caeRem2.chrUn.lift caeRem2.contigs.sizes \ /san/sanvol1/scratch/worms/caeRem2 ############################################################################ ## Default position (DONE - 2007-04-09 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrUn:34567-45678" where name="caeRem2";' hgcentraltest ############################################################################ ## SWAP CB3 briggsae chain/net - (DONE - 2007-04-13 - Hiram) ssh kkstore02 cd /cluster/data/cb3/bed/blastz.caeRem2.2007-04-13 cat fb.cb3.chainCaeRem2Link.txt # 53199792 bases of 108433446 (49.062%) in intersection mkdir /cluster/data/caeRem2/bed/blastz.cb3.swap cd /cluster/data/caeRem2/bed/blastz.cb3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=kk \ -swap \ /cluster/data/cb3/bed/blastz.caeRem2.2007-04-13/DEF > swap.log 2>&1 & # The typical failure: # netChains: looks like previous stage was not successful (can't find # [caeRem2.cb3.]all.chain[.gz]). time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=kk \ -continue=net -swap \ /cluster/data/cb3/bed/blastz.caeRem2.2007-04-13/DEF > net.log 2>&1 & # real 9m6.800s cat fb.caeRem2.chainCb3Link.txt # 63292118 bases of 146898439 (43.086%) in intersection ############################################################################ ## SWAP caePb1 chain/net - (DONE - 2007-04-20 - Hiram) ssh kkstore02 cd /cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19 cat fb.caePb1.chainCaeRem2Link.txt # 93139445 bases of 175247318 (53.147%) in intersection # swap to caeRem2 mkdir /cluster/data/caeRem2/bed/blastz.caePb1.swap cd /cluster/data/caeRem2/bed/blastz.caePb1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19/DEF \ -bigClusterHub=kk -swap > swap.log 2>&1 & cat fb.caeRem2.chainCaePb1Link.txt # 78402339 bases of 146898439 (53.372%) in intersection ############################################################################ ## reset defaultDb to caeRem2 (DONE - 2007-04-21 - Hiram) ssh hgwdev hgsql -e 'update defaultDb set name="caeRem2" where genome="C. remanei";' \ hgcentraltest ############################################################################ ## SWAP ce4 chain/net - (DONE - 2007-04-02 - Hiram) cd /cluster/data/ce4/bed/blastz.caeRem2.2007-03-29 cat fb.ce4.chainCaeRem2Link.txt # 45160539 bases of 100281244 (45.034%) in intersection mkdir /cluster/data/caeRem2/bed/blastz.ce4.swap cd /cluster/data/caeRem2/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ /cluster/data/ce4/bed/blastz.caeRem2.2007-03-29/DEF \ -swap > swap.log 2>&1 & # real 3m52.831s cat fb.caeRem2.chainCe4Link.txt # 52254587 bases of 146898439 (35.572%) in intersection ############################################################################ ## SWAP priPac1 chain/net - (DONE - 2007-04-21 - Hiram) cd /cluster/data/priPac1/bed/blastz.caeRem2.2007-04-20 cat fb.priPac1.chainCaeRem2Link.txt # 9047547 bases of 145948246 (6.199%) in intersection mkdir /cluster/data/caeRem2/bed/blastz.priPac1.swap cd /cluster/data/caeRem2/bed/blastz.priPac1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=kk \ /cluster/data/priPac1/bed/blastz.caeRem2.2007-04-20/DEF \ -swap > swap.log 2>&1 & # less than 2 minutes cat fb.caeRem2.chainPriPac1Link.txt # 8971992 bases of 146898439 (6.108%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2007-04-23 - Hiram) # Use -repMatch=50 (based on size -- for human we use 1024, and # C. remanei size is ~5.1% of human judging by gapless caeRem2 vs. hg18 # genome sizes from featureBits. ssh kolossus cd /cluster/data/caeRem2 blat caeRem2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=50 # Wrote 46981 overused 11-mers to jkStuff/11.ooc cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/caeRem2 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-04-23,25 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add caeRem2 just before caeRem1 # caeRem2 (C. remanei) caeRem2.serverGenome = /cluster/data/caeRem2/caeRem2.2bit caeRem2.clusterGenome = /iscratch/i/worms/caeRem2/caeRem2.2bit caeRem2.ooc = /iscratch/i/worms/caeRem2/11.ooc caeRem2.lift = /iscratch/i/worms/caeRem2/caeRem2.chrUn.lift caeRem2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} caeRem2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} caeRem2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} caeRem2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} caeRem2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} caeRem2.refseq.mrna.native.load = yes caeRem2.refseq.mrna.xeno.load = yes caeRem2.refseq.mrna.xeno.loadDesc = yes caeRem2.genbank.mrna.xeno.load = yes caeRem2.genbank.est.native.load = yes caeRem2.genbank.est.native.loadDesc = no caeRem2.downloadDir = caeRem2 caeRem2.perChromTables = no cvs ci -m "Added caeRem2." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh kkstore01 cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial caeRem2 & # logFile: var/build/logs/2007.07.10-17:00:24.caeRem2.initalign.log # real 255m14.345s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caeRem2 # logFile: var/dbload/hgwdev/logs/2007.07.11-13:49:20.dbload.log # 17:10.13 # logFile: var/dbload/hgwdev/logs/2007.04.23-14:31:51.dbload.log # real 6m57.557s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add caeRem2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added caeRem2 - C. remanei" etc/align.dbs etc/hgwdev.dbs make etc-update ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-04-23 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("caeRem2", "blat10", "17788", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("caeRem2", "blat10", "17789", "0", "1");' \ hgcentraltest # test it with some sequence ########################################################################## ## summarize chainLink measurements (2007-04-25 - Hiram) # org on caeRem2 on other # caePb1 53.372 53.147 # cb3 43.086 49.062 # ce4 35.572 45.034 # priPac1 6.108 6.199 ########################################################################## # Set default position to cover the 16S ribosomal RNA area ## (DONE - 2007-04-26 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrUn:103531185-103538203" where name="caeRem2"' hgcentraltest ############################################################################ ## Adding a Photograph, from Eric Hagg in email 2007-04-07 12:04 # -rw-r--r-- 1 369754 Jan 1 2000 remanei_midsection.jpg # remanei_midsection.jpg JPEG 716x972 DirectClass 361kb 0.000u 0:01 mkdir /cluster/data/caeRem2/photograph cd /cluster/data/caeRem2/photograph convert -sharpen 0 -normalize \ -geometry "400x400" "remanei_midsection.jpg" \ Caenorhabditis_remanei.jpg # check this .jpg file into the browser doc source tree images directory cvs add -kb Caenorhabditis_remanei.jpg cvs commit Caenorhabditis_remanei.jpg cp -p Caenorhabditis_remanei.jpg /usr/local/apache/htdocs/images ########################################################################## ## Creating downloads (DONE - 2007-05-01 - Hiram) # There is only one chrom, make its trfMaskChrom file exist ssh hgwdev mkdir /cluster/data/caeRem2/bed/simpleRepeat/trfMaskChrom cd /cluster/data/caeRem2/bed/simpleRepeat/trfMaskChrom ## symlink didn't work here, the symlink ended up in the tar file cp -p ../trfMask.bed ./chrUn.bed cd /cluster/data/caeRem2 nice -n +19 /cluster/bin/scripts/makeDownloads.pl caeRem2 ## *!* EDIT THE README.txt FILES *!* ## ## Also add the WindowMasker business to bigZips cd /cluster/data/caeRem2/goldenPath/bigZips mkdir chrUn cp -p ../../bed/WindowMasker/windowmasker.sdust.bed.gz ./chrUn/ gunzip chrUn/*.gz tar cvzf ./chromWMSdust.bed.tar.gz ./chrUn rm -fr chrUn md5sum *.gz > md5sum.txt ## add description to README.txt # chromWMSdust.bed.tar.gz - per-chrom bed files used as repeat areas # identified by WindowMasker with the -sdust option. ## make the links to the liftOver files: cd /cluster/data/caeRem2/bed/liftOver md5sum *.gz > md5sum.txt cd /usr/local/apache/htdocs/goldenPath/caeRem2/liftOver ln -s /cluster/data/caeRem2/bed/liftOver/* . ########################################################################## ## Creating pushQ (DONE - 2007-05-01 - Hiram) ssh hgwdev mkdir /cluster/data/caeRem2/pushQ cd /cluster/data/caeRem2/pushQ /cluster/bin/scripts/makePushQSql.pl caeRem2 > caeRem2.sql 2> stderr.out ## check the stderr.out for anything that needs to be fixed ## copy caeRem2.sql to hgwbeta:/tmp ## then on hgwbeta hgsql qapushq < caeRem2.sql ########################################################################### # ELEGANS (ce4) PROTEINS TRACK (DONE - Hiram - 2007-05-01) ssh kkstore01 # breaking up this target genome into manageable pieces mkdir /cluster/data/caeRem2/blastDb cd /cluster/data/caeRem2 twoBitToFa caeRem2.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 210 pieces of 210 written rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ## copy to san for kluster access mkdir -p /san/sanvol1/scratch/worms/caeRem2/blastDb cd /san/sanvol1/scratch/worms/caeRem2/blastDb rsync -a --progress --stats /cluster/data/caeRem2/blastDb/ . ## create the query protein set mkdir -p /cluster/data/caeRem2/bed/tblastn.ce4SG cd /cluster/data/caeRem2/bed/tblastn.ce4SG echo /san/sanvol1/scratch/worms/caeRem2/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 210 query.lst # we want around 50000 jobs calc `wc /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\) # 23192/(50000/210) = 97.406400 mkdir -p /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/sgfa split -l 98 /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl \ /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/sgfa/sg ln -s /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/sgfa sgfa cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir -p /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/blastOut ln -s /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/blastOut ./blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cd /cluster/data/caeRem2/bed/tblastn.ce4SG cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/caeRem2/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce4/bed/blat.ce4SG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh pk cd /cluster/data/caeRem2/bed/tblastn.ce4SG gensub2 query.lst sg.lst template jobList para create jobList # para try, check, push, check etc. # Completed: 49770 of 49770 jobs # CPU time in finished jobs: 773169s 12886.15m 214.77h 8.95d 0.025 y # IO & Wait Time: 173488s 2891.47m 48.19h 2.01d 0.006 y # Average job time: 19s 0.32m 0.01h 0.00d # Longest finished job: 1778s 29.63m 0.49h 0.02d # Submission to last job: 6918s 115.30m 1.92h 0.08d # do the cluster run for chaining ssh pk mkdir /cluster/data/caeRem2/bed/tblastn.ce4SG/chainRun cd /cluster/data/caeRem2/bed/tblastn.ce4SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/blastOut/c.`basename $1`.psl) '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList para create jobList para maxNode 30 para try, check, push, check etc. # Completed: 237 of 237 jobs # CPU time in finished jobs: 7408s 123.46m 2.06h 0.09d 0.000 y # IO & Wait Time: 1415s 23.59m 0.39h 0.02d 0.000 y # Average job time: 37s 0.62m 0.01h 0.00d # Longest finished job: 867s 14.45m 0.24h 0.01d # Submission to last job: 1227s 20.45m 0.34h 0.01d ssh kkstore01 cd /cluster/data/caeRem2/bed/tblastn.ce4SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > /cluster/data/caeRem2/bed/tblastn.ce4SG/blastCe4SG.psl cd .. pslCheck blastCe4SG.psl # load table ssh hgwdev cd /cluster/data/caeRem2/bed/tblastn.ce4SG hgLoadPsl caeRem2 blastCe4SG.psl # check coverage featureBits caeRem2 blastCe4SG # 19763359 bases of 146898439 (13.454%) in intersection featureBits caePb1 blastCe4SG # 22988044 bases of 175247318 (13.117%) in intersection featureBits priPac1 blastCe4SG # 5617285 bases of 145948246 (3.849%) in intersection featureBits cb3 blastCe4SG # 18218293 bases of 108433446 (16.801%) in intersection featureBits ce4 sangerGene # 27906202 bases of 100281244 (27.828%) in intersection ssh kkstore01 rm -rf /cluster/data/caeRem2/bed/tblastn.ce4SG/blastOut rm -rf /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG rmdir /cluster/bluearc/worms/caeRem2/bed rmdir /cluster/bluearc/worms/caeRem2 #end tblastn ############################################################################# # LIFTOVER TO caeRem3 (WORKING - 2008-05-23 - Hiram) ssh kkstore01 screen -r -d # use screen to control this job # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug caeRem2 caeRem3 \ -ooc /cluster/data/caeRem2/jkStuff/11.ooc # Real run: cd /cluster/data/caeRem2/bed/blat.caeRem3.2008-05-23 time nice -n +19 doSameSpeciesLiftOver.pl caeRem2 caeRem3 \ -ooc /cluster/data/caeRem2/jkStuff/11.ooc # real 8m15.850s # a scaffold to scaffold lift over file was made manually # for Sheldon McKay. # see also the scripts left in: # /cluster/data/caeRem2/bed/blat.caeRem3.supers/run.blat # /cluster/data/caeRem2/bed/blat.caeRem3.supers/run.chain #############################################################################