# for emacs: -*- mode: sh; -*- # Caenorhabditis PB2801 # This is the actual name, it is sometimes known as: # Caenorhabditis n. sp. PB2801 # From Wash U GSC # http://genome.wustl.edu/genome.cgi?GENOME=Caenorhabditis+n.+sp.+PB2801 ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2007-02-16 - Hiram) ssh kkstore01 mkdir /cluster/store10/caePb1 ln -s /cluster/store10/caePb1 /cluster/data/caePb1 cd /cluster/data/caePb1 mkdir downloads cd downloads wget --timestamping \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/draft/Caenorhabditis_PB2801-4.0/ASSEMBLY" wget --timestamping \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/draft/Caenorhabditis_PB2801-4.0/README" -O README.caePb1 wget --timestamping -m -nd -np \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/draft/Caenorhabditis_PB2801-4.0/output/" # Going to manipulate the given supercontigs.agp.gz file to produce a # chrUn.agp.gz file, although this ends up being too detailed. A second # super.lift and gold.tab will also be produced to get the chrUn_gold # table loaded with appropriate supercontigs without all the bridged # gaps. This first chrUn.agp.gz file will be used with the initial # build so that the gap table will be constructed properly with all the # bridged gaps. faCount contigs.fa.gz > faCount.contigs.fa.txt mkdir ../jkStuff cat << '_EOF_' > ../jkStuff/mkChrUnAgp.pl #!/usr/bin/env perl use strict; use warnings; my $start = 1; my $end = 1; my $itemCount = 1; my $curContig = ""; my $firstTime = 1; my $scaffoldGapSize = 1000; open (FH,'zcat supercontigs.agp.gz|') or die "can not open zcat supercontigs.agp.gz"; while (my $line=) { chomp $line; if ($line =~ m/fragment/) { my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) = split('\s+',$line); $end = $start + $gapLen - 1; printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo; $start = $end + 1; } else { my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) = split('\s+',$line); my $ctgLen = $ctgEnd - $ctgStart + 1; my $cLen = $cEnd - $cStart + 1; die "not matching start, end:\n$line" if ($ctgLen != $cLen); if (!$firstTime) { if ($ctgName ne $curContig) { $end = $start + $scaffoldGapSize - 1; printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", $start, $end, $itemCount++, $scaffoldGapSize; $start = $end + 1; } } else { $firstTime = 0; } $end = $start + $ctgLen - 1; printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $start, $end, $itemCount++, $WN, $name, $cStart, $cEnd, $strand; $start = $end + 1; $curContig = $ctgName; } } close (FH); '_EOF_' # << happy emacs chmod +x ../jkStuff/mkChrUnAgp.pl ../jkStuff/mkChrUnAgp.pl | gzip > chrUn.agp.gz qaToQac contigs.fa.qual.gz stdout \ | qacAgpLift chrUn.agp.gz stdin chrUn.qac ########################################################################## ## Initial database construction and beginning tables ## (DONE - 2007-03-01 - Hiram) ssh kkstore01 cd /cluster/data/caePb1 cat << '_EOF_' > caePb1.config.ra # Config parameters for makeGenomeDb.pl: db caePb1 clade worm genomeCladePriority 66 scientificName Caenorhabditis PB2801 commonName C. PB2801 assemblyDate Jan. 2007 assemblyLabel Washington University School of Medicine GSC Caenorhabditis n. sp. PB2801 orderKey 876 mitoAcc none fastaFiles /cluster/data/caePb1/downloads/supercontigs.fa.gz agpFiles /cluster/data/caePb1/downloads/scaffold.agp.gz qualFiles /cluster/data/caePb1/downloads/chrUn.qac dbDbSpeciesDir worm '_EOF_' # << happy emacs time makeGenomeDb.pl -verbose=2 caePb1.config.ra > makeGenomeDb.out 2>&1 ########################################################################## ## Fixup chrUn_gold table, and add ctgPos2 table ssh kkstore01 cd /cluster/data/caePb1/downloads # create a ctgPos2.tab table cat << '_EOF_' > ../jkStuff/mkScafAgp.pl #!/usr/bin/env perl use strict; use warnings; my $start = 1; my $end = 1; my $itemCount = 1; my $curContig = ""; my $firstTime = 1; my $scaffoldGapSize = 1000; my $scafEnd = 1; open (CT,">ctgPos2.tab") or die "can not write to ctgPos2.tab"; open (FH,'zcat supercontigs.agp.gz|') or die "can not open zcat supercontigs.agp.gz"; while (my $line=) { chomp $line; if ($line =~ m/fragment/) { my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) = split('\s+',$line); $end = $start + $gapLen - 1; $scafEnd += $gapLen; printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo; $start = $end + 1; } else { my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) = split('\s+',$line); my $ctgLen = $ctgEnd - $ctgStart + 1; my $cLen = $cEnd - $cStart + 1; die "not matching start, end:\n$line" if ($ctgLen != $cLen); if (!$firstTime) { if ($ctgName ne $curContig) { $end = $start + $scaffoldGapSize - 1; printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", $start, $end, $itemCount++, $scaffoldGapSize; $start = $end + 1; $scafEnd = $cStart - 1; } } else { $firstTime = 0; $scafEnd = 0; } $scafEnd += $ctgLen; my $fragStart = $scafEnd - $ctgLen + 1; $end = $start + $ctgLen - 1; printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $start, $end, $itemCount++, $WN, $ctgName, $fragStart, $scafEnd, $strand; printf CT "%s\t%d\tchrUn\t%d\t%d\tW\n", $name, $ctgLen, $start-1, $end; $start = $end + 1; $curContig = $ctgName; } } close (FH); close (CT); '_EOF_" # << happy emacs chmod +x ../jkStuff/mkScafAgp.pl nice -n +19 ../jkStuff/mkScafAgp.pl ## and load that ctgPos2 table: ssh hgwdev cd /cluster/data/caePb1/downloads hgLoadSqlTab caePb1 ctgPos2 ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab ## and the super.lift to get supercontigs lifted to chrUn ssh kkstore01 cd /cluster/data/caePb1/downloads cat << '_EOF_' > ../jkStuff/mkSuperLift.pl #!/usr/bin/env perl use strict; use warnings; my $start = 1; my $end = 1; my $itemCount = 1; my $curContig = ""; my $firstTime = 1; my $scaffoldGapSize = 1000; my $scafEnd = 1; my $superStart = 0; my $superEnd = 1; my $superLength = 0; my $chrUnSize = 207333000; my $chrUnName = "chrUn"; open (GL,">gold.tab") or die "Can not write to gold.tab"; open (FH,'zcat supercontigs.agp.gz|') or die "can not open zcat supercontigs.agp.gz"; while (my $line=) { chomp $line; if ($line =~ m/fragment/) { my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) = split('\s+',$line); $end = $start + $gapLen - 1; $scafEnd += $gapLen; # printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", # $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo; $start = $end + 1; } else { my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) = split('\s+',$line); my $ctgLen = $ctgEnd - $ctgStart + 1; my $cLen = $cEnd - $cStart + 1; die "not matching start, end:\n$line" if ($ctgLen != $cLen); if (!$firstTime) { if ($ctgName ne $curContig) { $superLength = $superEnd - $superStart; $end = $start + $scaffoldGapSize - 1; # printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", # $start, $end, $itemCount++, $scaffoldGapSize; printf "%d\t%s\t%d\t%s\t%d\n", $superStart, $curContig, $superLength, $chrUnName, $chrUnSize; printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n", $chrUnName, $superStart, $superEnd, $itemCount++, $curContig, $superLength; $start = $end + 1; $scafEnd = $cStart - 1; $superStart = $start - 1; } } else { $firstTime = 0; $scafEnd = 0; } $scafEnd += $ctgLen; my $fragStart = $scafEnd - $ctgLen + 1; $end = $start + $ctgLen - 1; $superEnd = $end; # printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", # $start, $end, $itemCount++, $WN, $ctgName, # $fragStart, $scafEnd, $strand; # printf SL "%s\t%d\tchrUn\t%d\t%d\tW\n", # $name, $ctgLen, $start-1, $end; $start = $end + 1; $curContig = $ctgName; } } close (FH); $superLength = $superEnd - $superStart; printf "%d\t%s\t%d\t%s\t%d\n", $superStart, $curContig, $superLength, $chrUnName, $chrUnSize; printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n", $chrUnName, $superStart, $superEnd, $itemCount++, $curContig, $superLength; close (GL); '_EOF_' # << happy emacs chmod +x ../jkStuff/mkSuperLift.pl nice -n +19 ../jkStuff/mkSuperLift.pl > super.lift ## and load that ctgPos2 table: ssh hgwdev cd /cluster/data/caePb1/downloads sed -e "s/agpFrag/chrUn_gold/" $HOME/kent/src/hg/lib/agpFrag.sql \ > chrUn_gold.sql # edit that .sql file to add the bin column # bin smallint(6) NOT NULL default '0', hgLoadBed -sqlTable=chrUn_gold.sql caePb1 chrUn_gold gold.tab ############################################################################## ## Repeat Masker does not recognize given species: # Species "Caenorhabditis PB2801" is not known to RepeatMasker. ############################################################################## ## WindowMasker masking (DONE - 2007-03-02 - Hiram) ssh kolossus mkdir /cluster/data/caePb1/bed/WindowMasker cd /cluster/data/caePb1/bed/WindowMasker time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl \ caePb1 -buildDir=/cluster/data/caePb1/bed/WindowMasker \ -workhorse=kolossus > doWM.out 2>&1 & ## load result ssh hgwdev cd /cluster/data/caePb1/bed/WindowMasker hgLoadBed -strict caePb1 windowmaskerSdust windowmasker.sdust.bed.gz ############################################################################## ## simpleRepeat masking (DONE - 2007-03-02 - Hiram) ssh kolossus mkdir /cluster/data/caePb1/bed/simpleRepeat cd /cluster/data/caePb1/bed/simpleRepeat nice -n +19 twoBitToFa ../../caePb1.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ssh hgwdev cd /cluster/data/caePb1/bed/simpleRepeat nice -n +19 hgLoadBed caePb1 simpleRepeat \ simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits caePb1 simpleRepeat # 4938308 bases of 175247318 (2.818%) in intersection ############################################################################## ## combine trf mask with windowmasker (DONE - 2007-03-03 - Hiram) ssh kkstore02 cd /cluster/data/caePb1 cat << '_EOF_' > addTrf.csh #!/bin/csh -efx # This script will fail if any of its commands fail. set DB = caePb1 set WORK_DIR = /cluster/data/${DB} set WM_DIR = /cluster/data/${DB}/bed/WindowMasker cd ${WORK_DIR} set inputTwoBit = ${WM_DIR}/${DB}.wmsk.sdust.2bit set tmpTwoBit = ${WM_DIR}/${DB}.sdTrf.n.2bit set outputTwoBit = ${WORK_DIR}/${DB}.sdTrf.2bit cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed ${inputTwoBit} stdin ${tmpTwoBit} twoBitToFa ${tmpTwoBit} stdout | sed -e "s/n/N/g" \ | sed -e "s/chrUN/chrUn/" | faToTwoBit stdin ${outputTwoBit} twoBitToFa ${outputTwoBit} stdout | faSize -showPercent stdin \ > faSize.${DB}.sdTrf.txt rm -f ${tmpTwoBit} cat faSize.${DB}.sdTrf.txt '_EOF_' # << happy emacs chmod +x ./addTrf.csh nice -n +19 ./addTrf.csh # produces this masked file: # -rw-rw-r-- 1 63185508 Mar 3 09:18 caePb1.sdTrf.2bit # faSize -showPercent on it indicates: # 207333000 bases (32085682 N's 175247318 real # 124302731 upper 50944587 lower) in 1 sequences in 1 files # %24.57 masked total, %29.07 masked real ############################################################################## ## create masked contigs for blastz (DONE - 2007-03-04 - Hiram) ssh kkstore02 mkdir /cluster/data/caePb1/maskedContigs cd /cluster/data/caePb1/maskedContigs ln -s ../downloads/super.lift . # copy over lft2BitToFa.pl from /cluster/data/gasAcu1/jkStuff ../jkStuff/lft2BitToFa.pl ../caePb1.2bit super.lift > super.fa faToTwoBit super.fa caePb1.supercontigs.sdTrf.2bit twoBitInfo caePb1.supercontigs.sdTrf.2bit stdout | sort -k2nr \ > caePb1.supercontigs.sizes # Copy this 2bit file and sizes over to /san/sanvol1/scratch/worms/caePb1 # the lft2BitToFa.pl script is: cat << '_EOF_' #!/usr/bin/env perl use strict; use warnings; use File::Basename; sub usage() { printf "usage: %s [more_files.lft]\n", basename($0); printf "\tfasta output is to stdout, therefore route stdout to result file\n"; exit 255; } my $argc = scalar(@ARGV); usage if ($argc < 2); my $twoBitFile = shift; while (my $liftFile = shift) { open (FH,"<$liftFile") or die "Can not open $liftFile"; while (my $line=) { chomp $line; my ($start, $contig, $length, $chrom, $chrom_length) = split('\s',$line); my $cmd=sprintf("twoBitToFa $twoBitFile:%s:%d-%d stdout", $chrom, $start, $start+$length); print `$cmd | sed -e "s#^>.*#>$contig#"`; } close (FH); } '_EOF_' # << happy emacs ############################################################################ ## Default position (DONE - 2007-04-09 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrUn:123456-234567" where name="caePb1";' hgcentraltest ############################################################################ ## SWAP BLASTZ cb3 alignments (DONE - 2007-04-18 - Hiram) ssh kkstore01 cd /cluster/data/cb3/bed/blastz.caePb1.2007-04-18 cat fb.cb3.chainCaePb1Link.txt # 42772225 bases of 108433446 (39.446%) in intersection mkdir /cluster/data/caePb1/bed/blastz.cb3.swap cd /cluster/data/caePb1/bed/blastz.cb3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/cb3/bed/blastz.caePb1.2007-04-18/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=kk \ -swap > swap.log 2>&1 & # real 3m57.854s cat fb.caePb1.chainCb3Link.txt # 59366018 bases of 175247318 (33.876%) in intersection ############################################################################## ## Repeat masker with -species "caenorhabditis sp. 4 (ce-ii)" ## (DONE - 2007-04-19 - Hiram) ssh kkstore01 mkdir /cluster/data/caePb1/bed/RepeatMasker cd /cluster/data/caePb1/bed/RepeatMasker time nice -n +19 doRepeatMasker.pl -species "caenorhabditis sp. 4 (ce-ii)" time nice -n +19 doRepeatMasker.pl -species "caenorhabditis" \ -bigClusterHub=kk \ -buildDir=/cluster/data/caePb1/bed/RepeatMasker caePb1 > do.log 2>&1 & # real 118m40.879s cd /cluster/data/caePb1 twoBitToFa caePb1.rmsk.2bit stdout | faSize stdin # 207333000 bases (32085682 N's 175247318 real # 169630672 upper 5616646 lower) in 1 sequences in 1 files # %2.71 masked total, %3.20 masked real ssh hgwdev featureBits caePb1 rmsk # 5618476 bases of 175247318 (3.206%) in intersection ############################################################################## ## BLASTZ swap ce4 (DONE - 2007-03-04 -- Hiram) ssh kkstore01 cd /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04 cat fb.ce4.chainCaePb1Link.txt # 52594760 bases of 100281244 (52.447%) in intersection mkdir /cluster/data/caePb1/bed/blastz.ce4.swap cd /cluster/data/caePb1/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust \ /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04/DEF \ -bigClusterHub=pk -swap > swap.log 2>&1 & # real 22m40.205s cat fb.caePb1.chainCe4Link.txt # 74758640 bases of 175247318 (42.659%) in intersection ############################################################################## ## BLASTZ caeRem2 (DONE - 2007-04-19,20 - Hiram) ssh kkstore01 mkdir /cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19 cd /cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19 cat << '_EOF_' > DEF # caePb1 vs caeRem2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: C. PB2801 caePb1 unmasked sequence SEQ1_DIR=/san/sanvol1/scratch/worms/caePb1/caePb1.unmasked.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/caePb1/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: briggsae caeRem2, 9,660 contigs, longest 5,925,111 SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/caeRem2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.sizes SEQ2_LIFT=/san/sanvol1/scratch/worms/caeRem2/caeRem2.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=kk \ -blastzOutRoot /cluster/bluearc/caePb1CaeRem2 > do.log 2>&1 & # real 569m7.643s cat fb.caePb1.chainCaeRem2Link.txt # 93139445 bases of 175247318 (53.147%) in intersection # swap to caeRem2 (also in caeRem2.txt) mkdir /cluster/data/caeRem2/bed/blastz.caePb1.swap cd /cluster/data/caeRem2/bed/blastz.caePb1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19/DEF \ -bigClusterHub=kk -swap > swap.log 2>&1 & # real 32m51.630s cat fb.caeRem2.chainCaePb1Link.txt # 78402339 bases of 146898439 (53.372%) in intersection ############################################################################## ## BLASTZ priPac1 (DONE - 2007-04-20,21 - Hiram) mkdir /cluster/data/caePb1/bed/blastz.priPac1.2007-04-20 cd /cluster/data/caePb1/bed/blastz.priPac1.2007-04-20 cat << '_EOF_' > DEF # caePb1 vs priPac1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: C. PB2801 caePb1 unmasked sequence SEQ1_DIR=/san/sanvol1/scratch/worms/caePb1/caePb1.unmasked.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/caePb1/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: Pristionchus pacificus priPac1 SEQ2_DIR=/san/sanvol1/scratch/worms/priPac1/priPac1.unmasked.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/priPac1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/caePb1/bed/blastz.priPac1.2007-04-20 TMPDIR=/scratch/tmp '_EOF_' time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/caePb1PriPac1 > do.log 2>&1 & # real 406m52.445s cat fb.caePb1.chainPriPac1Link.txt # 13589188 bases of 175247318 (7.754%) in intersection ## swap to priPac1 - also in priPac1.txt mkdir /cluster/data/priPac1/bed/blastz.caePb1.swap cd /cluster/data/priPac1/bed/blastz.caePb1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ /cluster/data/caePb1/bed/blastz.priPac1.2007-04-20/DEF \ -swap > swap.log 2>&1 & # real 9m18.554s cat fb.priPac1.chainCaePb1Link.txt # 11713567 bases of 145948246 (8.026%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2007-04-23 - Hiram) # Use -repMatch=62 (based on size -- for human we use 1024, and # C. PB2801 size is ~6.1% of human judging by gapless caePb1 vs. hg18 # genome sizes from featureBits. ssh kolossus cd /cluster/data/caePb1 blat caePb1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=62 # Wrote 37423 overused 11-mers to jkStuff/11.ooc cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/caePb1 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-04-23,25 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add caePb1 just before caeRem2 # caePb1 (C. PB2801) caePb1.serverGenome = /cluster/data/caePb1/caePb1.2bit caePb1.clusterGenome = /iscratch/i/worms/caePb1/caePb1.2bit caePb1.ooc = /iscratch/i/worms/caePb1/11.ooc caePb1.lift = /iscratch/i/worms/caePb1/caePb1.supercontigs.lift caePb1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} caePb1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} caePb1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} caePb1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} caePb1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} caePb1.refseq.mrna.native.load = no caePb1.refseq.mrna.xeno.load = yes caePb1.refseq.mrna.xeno.loadDesc = yes caePb1.genbank.mrna.xeno.load = yes caePb1.genbank.est.native.load = yes caePb1.genbank.est.native.loadDesc = no caePb1.downloadDir = caePb1 caePb1.perChromTables = no cvs ci -m "Added caePb1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. cvs ci -m "Added C. PB2801." src/lib/gbGenome.c make install-server ssh kkstore01 cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial caePb1 & # logFile: var/build/logs/2007.07.10-17:01:24.caePb1.initalign.log # real 353m16.891s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caePb1 # logFile: var/dbload/hgwdev/logs/2007.07.11-12:52:25.dbload.log # real 7m49.325s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add caePb1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added caePb1 - C. PB2801" etc/align.dbs etc/hgwdev.dbs make etc-update ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-04-23 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("caePb1", "blat10", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("caePb1", "blat10", "17787", "0", "1");' \ hgcentraltest # test it with some sequence ########################################################################## ## summarize chainLink measurements (2007-04-25 - Hiram) # org on caePb1 on other # caeRem2 53.147 53.372 # ce4 42.659 52.447 # cb3 33.876 39.446 # priPac1 7.754 8.026 ########################################################################## # Set default position to cover the 16S ribosomal RNA area ## (DONE - 2007-04-26 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrUn:97956213-97960870" where name="caePb1"' hgcentraltest ########################################################################## ## This nematode now has an official name (DONE - 2007-04-27 - Hiram) ssh hgwdev hgsql -e 'update dbDb set organism="C. brenneri" where name="caePb1"' hgcentraltest hgsql -e 'update dbDb set genome="C. brenneri" where name="caePb1"' hgcentraltest hgsql -e 'update dbDb set scientificName="Caenorhabditis brenneri" where name="caePb1"' hgcentraltest hgsql -e 'update defaultDb set genome="C. brenneri" where name="caePb1"' hgcentraltest hgsql -e 'update genomeClade set genome="C. brenneri" where genome="C. PB2801"' hgcentraltest ########################################################################## ## Creating downloads (DONE - 2007-04-30 - Hiram) # There is only one chrom, make its trfMaskChrom file exist ssh hgwdev mkdir /cluster/data/caePb1/bed/simpleRepeat/trfMaskChrom cd /cluster/data/caePb1/bed/simpleRepeat/trfMaskChrom ## symlink didn't work here, the symlink ended up in the tar file cp -p ../trfMask.bed ./chrUn.bed cd /cluster/data/caePb1 /cluster/bin/scripts/makeDownloads.pl caePb1 ## *!* EDIT THE README.txt FILES *!* ## ########################################################################## ## Creating pushQ (DONE - 2007-04-30 - Hiram) ssh hgwdev mkdir /cluster/data/caePb1/pushQ cd /cluster/data/caePb1/pushQ /cluster/bin/scripts/makePushQSql.pl caePb1 > caePb1.sql 2> stderr.out ########################################################################### # ELEGANS (ce4) PROTEINS TRACK (DONE - Hiram - 2007-05-02) ssh kkstore01 # breaking up this target genome into manageable pieces mkdir /cluster/data/caePb1/blastDb cd /cluster/data/caePb1 twoBitToFa caePb1.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 222 pieces of 222 written rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ## copy to san for kluster access mkdir -p /san/sanvol1/scratch/worms/caePb1/blastDb cd /san/sanvol1/scratch/worms/caePb1/blastDb rsync -a --progress --stats /cluster/data/caePb1/blastDb/ . ## create the query protein set mkdir -p /cluster/data/caePb1/bed/tblastn.ce4SG cd /cluster/data/caePb1/bed/tblastn.ce4SG echo /san/sanvol1/scratch/worms/caePb1/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 222 query.lst # we want around 50000 jobs calc `wc /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\) 23192/(50000/222) = 102.972480 # round up this number a bit, and use in the split here mkdir -p /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/sgfa split -l 103 /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl \ /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/sgfa/sg ln -s /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/sgfa sgfa cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir -p /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/blastOut ln -s /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/blastOut ./blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cd /cluster/data/caePb1/bed/tblastn.ce4SG cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/caePb1/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce4/bed/blat.ce4SG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh pk cd /cluster/data/caePb1/bed/tblastn.ce4SG gensub2 query.lst sg.lst template jobList para create jobList # para try, check, push, check etc. # Completed: 50172 of 50172 jobs # CPU time in finished jobs: 944883s 15748.06m 262.47h 10.94d 0.030 y # IO & Wait Time: 193584s 3226.39m 53.77h 2.24d 0.006 y # Average job time: 23s 0.38m 0.01h 0.00d # Longest finished job: 306s 5.10m 0.09h 0.00d # Submission to last job: 3663s 61.05m 1.02h 0.04d # do the cluster run for chaining ssh pk mkdir /cluster/data/caePb1/bed/tblastn.ce4SG/chainRun cd /cluster/data/caePb1/bed/tblastn.ce4SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/blastOut/c.`basename $1`.psl) '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList para create jobList para maxNode 30 para try, check, push, check etc. # Completed: 226 of 226 jobs # CPU time in finished jobs: 13628s 227.13m 3.79h 0.16d 0.000 y # IO & Wait Time: 18809s 313.48m 5.22h 0.22d 0.001 y # Average job time: 144s 2.39m 0.04h 0.00d # Longest finished job: 1492s 24.87m 0.41h 0.02d # Submission to last job: 1499s 24.98m 0.42h 0.02d ssh kkstore01 cd /cluster/data/caePb1/bed/tblastn.ce4SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > /cluster/data/caePb1/bed/tblastn.ce4SG/blastCe4SG.psl cd .. pslCheck blastCe4SG.psl # checked: 32292 failed: 0 # load table ssh hgwdev cd /cluster/data/caePb1/bed/tblastn.ce4SG hgLoadPsl caePb1 blastCe4SG.psl # check coverage featureBits caePb1 blastCe4SG # 22988044 bases of 175247318 (13.117%) in intersection featureBits priPac1 blastCe4SG # 5617285 bases of 145948246 (3.849%) in intersection featureBits caeRem2 blastCe4SG # 19763359 bases of 146898439 (13.454%) in intersection featureBits cb3 blastCe4SG # 18218293 bases of 108433446 (16.801%) in intersection featureBits ce4 sangerGene # 27906202 bases of 100281244 (27.828%) in intersection ssh kkstore01 rm -rf /cluster/data/caePb1/bed/tblastn.ce4SG/blastOut rm -rf /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG rmdir /cluster/bluearc/worms/caePb1/bed rmdir /cluster/bluearc/worms/caePb1 #end tblastn ############################################################################# # LIFTOVER TO caePb2 (WORKING - 2008-05-14 - Hiram) ssh kkstore01 screen -r -d # use screen to control this job # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug caePb1 caePb2 \ -ooc /cluster/data/caePb1/jkStuff/11.ooc # Real run: cd /cluster/data/caePb1/bed/blat.caePb2.2008-05-13 time nice -n +19 doSameSpeciesLiftOver.pl caePb1 caePb2 \ -ooc /cluster/data/caePb1/jkStuff/11.ooc > do.log 2>&1 & # real 14m13.501s # a scaffold to scaffold lift over file was made manually # for Sheldon McKay. # see also the scripts left in: # /cluster/data/caePb1/bed/blat.caePb2.supers/run.blat # /cluster/data/caePb1/bed/blat.caePb2.supers/run.chain