# for emacs: -*- mode: sh; -*- # Caenorhabditis PB2801 # This is the actual name, it is sometimes known as: # Caenorhabditis n. sp. PB2801 # From Wash U GSC # http://genome.wustl.edu/genome.cgi?GENOME=Caenorhabditis+n.+sp.+PB2801 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABEG02 ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2008-05-13 - Hiram) ssh kkstore01 mkdir /cluster/store1/caePb2 ln -s /cluster/store1/caePb2 /cluster/data/caePb2 mkdir /cluster/data/caePb2/downloads cd /cluster/data/caePb2/downloads wget --timestamping \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/Caenorhabditis_PB2801-6.0.1/ASSEMBLY" wget --timestamping \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/Caenorhabditis_PB2801-6.0.1/README" -O README.caePb2 wget --timestamping -m -nd -np \ "ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/Caenorhabditis_PB2801-6.0.1/output/" # Going to manipulate the given supercontigs.agp.gz file to produce a # chrUn.agp.gz file, although this ends up being too detailed. A second # super.lift and gold.tab will also be produced to get the chrUn_gold # table loaded with appropriate supercontigs without all the bridged # gaps. This first chrUn.agp.gz file will be used with the initial # build so that the gap table will be constructed properly with all the # bridged gaps. faCount contigs.fa.gz > faCount.contigs.fa.txt mkdir ../jkStuff cat << '_EOF_' > ../jkStuff/mkChrUnAgp.pl #!/usr/bin/env perl use strict; use warnings; my $start = 1; my $end = 1; my $itemCount = 1; my $curContig = ""; my $firstTime = 1; my $scaffoldGapSize = 1000; open (FH,'zcat supercontigs.agp.gz|') or die "can not open zcat supercontigs.agp.gz"; while (my $line=) { chomp $line; if ($line =~ m/fragment/) { my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) = split('\s+',$line); $end = $start + $gapLen - 1; printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo; $start = $end + 1; } else { my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) = split('\s+',$line); my $ctgLen = $ctgEnd - $ctgStart + 1; my $cLen = $cEnd - $cStart + 1; die "not matching start, end:\n$line" if ($ctgLen != $cLen); if (!$firstTime) { if ($ctgName ne $curContig) { $end = $start + $scaffoldGapSize - 1; printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", $start, $end, $itemCount++, $scaffoldGapSize; $start = $end + 1; } } else { $firstTime = 0; } $end = $start + $ctgLen - 1; printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $start, $end, $itemCount++, $WN, $name, $cStart, $cEnd, $strand; $start = $end + 1; $curContig = $ctgName; } } close (FH); '_EOF_' # << happy emacs chmod +x ../jkStuff/mkChrUnAgp.pl ../jkStuff/mkChrUnAgp.pl | gzip > chrUn.agp.gz qaToQac contigs.fa.qual.gz stdout \ | qacAgpLift chrUn.agp.gz stdin chrUn.qac ########################################################################## ## Initial database construction and beginning tables ## (DONE - 2008-05-13 - Hiram) ssh kkstore01 cd /cluster/data/caePb2 cat << '_EOF_' > caePb2.config.ra # Config parameters for makeGenomeDb.pl: db caePb2 clade worm genomeCladePriority 66 scientificName Caenorhabditis brenneri commonName C. brenneri assemblyDate Apr. 2008 assemblyLabel Washington University School of Medicine GSC Caenorhabditis brenneri(PB2801) 6.0.1 orderKey 839 mitoAcc none fastaFiles /cluster/data/caePb2/downloads/contigs.fa.gz agpFiles /cluster/data/caePb2/downloads/chrUn.agp.gz # qualFiles none dbDbSpeciesDir worm '_EOF_' # << happy emacs time makeGenomeDb.pl -verbose=2 caePb2.config.ra > makeGenomeDb.out 2>&1 # real 3m24.338s ########################################################################## ## Fixup chrUn_gold table, and add ctgPos2 table ssh kkstore01 cd /cluster/data/caePb2/downloads # create a ctgPos2.tab table cat << '_EOF_' > ../jkStuff/mkScafAgp.pl #!/usr/bin/env perl use strict; use warnings; my $start = 1; my $end = 1; my $itemCount = 1; my $curContig = ""; my $firstTime = 1; my $scaffoldGapSize = 1000; my $scafEnd = 1; open (CT,">ctgPos2.tab") or die "can not write to ctgPos2.tab"; open (FH,'zcat supercontigs.agp.gz|') or die "can not open zcat supercontigs.agp.gz"; while (my $line=) { chomp $line; if ($line =~ m/fragment/) { my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) = split('\s+',$line); $end = $start + $gapLen - 1; $scafEnd += $gapLen; printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo; $start = $end + 1; } else { my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) = split('\s+',$line); my $ctgLen = $ctgEnd - $ctgStart + 1; my $cLen = $cEnd - $cStart + 1; die "not matching start, end:\n$line" if ($ctgLen != $cLen); if (!$firstTime) { if ($ctgName ne $curContig) { $end = $start + $scaffoldGapSize - 1; printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", $start, $end, $itemCount++, $scaffoldGapSize; $start = $end + 1; $scafEnd = $cStart - 1; } } else { $firstTime = 0; $scafEnd = 0; } $scafEnd += $ctgLen; my $fragStart = $scafEnd - $ctgLen + 1; $end = $start + $ctgLen - 1; printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $start, $end, $itemCount++, $WN, $ctgName, $fragStart, $scafEnd, $strand; printf CT "%s\t%d\tchrUn\t%d\t%d\tW\n", $name, $ctgLen, $start-1, $end; $start = $end + 1; $curContig = $ctgName; } } close (FH); close (CT); '_EOF_' # << happy emacs chmod +x ../jkStuff/mkScafAgp.pl nice -n +19 ../jkStuff/mkScafAgp.pl ## and load that ctgPos2 table: ssh hgwdev cd /cluster/data/caePb2/downloads hgLoadSqlTab caePb2 ctgPos2 ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab ## and the super.lift to get supercontigs lifted to chrUn ssh kkstore01 cd /cluster/data/caePb2/downloads # beware of the chrUnSize specified below cat << '_EOF_' > ../jkStuff/mkSuperLift.pl #!/usr/bin/env perl use strict; use warnings; my $start = 1; my $end = 1; my $itemCount = 1; my $curContig = ""; my $firstTime = 1; my $scaffoldGapSize = 1000; my $scafEnd = 1; my $superStart = 0; my $superEnd = 1; my $superLength = 0; my $chrUnSize = 194283334; my $chrUnName = "chrUn"; open (GL,">gold.tab") or die "Can not write to gold.tab"; open (FH,'zcat supercontigs.agp.gz|') or die "can not open zcat supercontigs.agp.gz"; while (my $line=) { chomp $line; if ($line =~ m/fragment/) { my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) = split('\s+',$line); $end = $start + $gapLen - 1; $scafEnd += $gapLen; # printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", # $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo; $start = $end + 1; } else { my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) = split('\s+',$line); my $ctgLen = $ctgEnd - $ctgStart + 1; my $cLen = $cEnd - $cStart + 1; die "not matching start, end:\n$line" if ($ctgLen != $cLen); if (!$firstTime) { if ($ctgName ne $curContig) { $superLength = $superEnd - $superStart; $end = $start + $scaffoldGapSize - 1; # printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n", # $start, $end, $itemCount++, $scaffoldGapSize; printf "%d\t%s\t%d\t%s\t%d\n", $superStart, $curContig, $superLength, $chrUnName, $chrUnSize; printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n", $chrUnName, $superStart, $superEnd, $itemCount++, $curContig, $superLength; $start = $end + 1; $scafEnd = $cStart - 1; $superStart = $start - 1; } } else { $firstTime = 0; $scafEnd = 0; } $scafEnd += $ctgLen; my $fragStart = $scafEnd - $ctgLen + 1; $end = $start + $ctgLen - 1; $superEnd = $end; # printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", # $start, $end, $itemCount++, $WN, $ctgName, # $fragStart, $scafEnd, $strand; # printf SL "%s\t%d\tchrUn\t%d\t%d\tW\n", # $name, $ctgLen, $start-1, $end; $start = $end + 1; $curContig = $ctgName; } } close (FH); $superLength = $superEnd - $superStart; printf "%d\t%s\t%d\t%s\t%d\n", $superStart, $curContig, $superLength, $chrUnName, $chrUnSize; printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n", $chrUnName, $superStart, $superEnd, $itemCount++, $curContig, $superLength; close (GL); '_EOF_' # << happy emacs chmod +x ../jkStuff/mkSuperLift.pl nice -n +19 ../jkStuff/mkSuperLift.pl > super.lift ## and load this new gold table ssh hgwdev cd /cluster/data/caePb2/downloads sed -e "s/agpFrag/chrUn_gold/" $HOME/kent/src/hg/lib/agpFrag.sql \ > chrUn_gold.sql # edit that .sql file to add the bin column # bin smallint(6) NOT NULL default '0', hgLoadBed -sqlTable=chrUn_gold.sql caePb2 chrUn_gold gold.tab ############################################################################## ## WindowMasker masking (DONE - 2008-05-13 - Hiram) ssh kolossus mkdir /cluster/data/caePb2/bed/WindowMasker cd /cluster/data/caePb2/bed/WindowMasker time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl \ caePb2 -buildDir=/cluster/data/caePb2/bed/WindowMasker \ -workhorse=kolossus > doWM.out 2>&1 & # real 16m34.556s ## load result to prep for cleaning ssh hgwdev cd /cluster/data/caePb2/bed/WindowMasker hgLoadBed caePb2 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 1346187 elements of size 3 featureBits caePb2 windowmaskerSdust # 73264992 bases of 170473138 (42.977%) in intersection # eliminate the gaps from the masking (WM bug) featureBits caePb2 -not gap -bed=notGap.bed # 170473138 bases of 170473138 (100.000%) in intersection time nice -n +19 featureBits caePb2 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 49454796 bases of 170473138 (29.010%) in intersection # reload track to get it clean hgLoadBed caePb2 windowmaskerSdust cleanWMask.bed.gz # Loaded 1343764 elements of size 4 featureBits caePb2 windowmaskerSdust # 49454796 bases of 170473138 (29.010%) in intersection ############################################################################## ## simpleRepeat masking (DONE - 2008-05-13 - Hiram) ssh kolossus mkdir /cluster/data/caePb2/bed/simpleRepeat cd /cluster/data/caePb2/bed/simpleRepeat nice -n +19 twoBitToFa ../../caePb2.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp # about 30 minutes awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ssh hgwdev cd /cluster/data/caePb2/bed/simpleRepeat nice -n +19 hgLoadBed caePb2 simpleRepeat \ simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits caePb2 simpleRepeat # 5083084 bases of 170473138 (2.982%) in intersection ############################################################################## ## combine trf mask with windowmasker (DONE - 2008-05-13 - Hiram) ssh kkstore01 cd /cluster/data/caePb2 zcat bed/WindowMasker/cleanWMask.bed.gz \ | twoBitMask caePb2.unmasked.2bit stdin \ -type=.bed caePb2.cleanWMSdust.2bit cat bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed caePb2.cleanWMSdust.2bit stdin caePb2.2bit # can safely ignore the warning about BED file >= 13 fields # check it twoBitToFa caePb2.2bit stdout | faSize stdin # 194283334 bases (23810210 N's 170473124 real 120922179 upper # 49550945 lower) in 1 sequences in 1 files # %25.50 masked total, %29.07 masked real ############################################################################## ## create masked contigs for blastz (DONE - 2008-05-13 - Hiram) ssh kkstore01 mkdir /cluster/data/caePb2/maskedContigs cd /cluster/data/caePb2/maskedContigs # copy over lft2BitToFa.pl from /cluster/data/caePb1/jkStuff cp -p /cluster/data/caePb1/jkStuff/lft2BitToFa.pl ../jkStuff ln -s ../downloads/super.lift . time nice -n +19 ../jkStuff/lft2BitToFa.pl ../caePb2.2bit super.lift \ > super.fa # real 1m55.813s # verify it is OK, should have same numbers of real, upper and lower as # measured in the caePb2.2bit, only N's are different: faSize super.fa # 190774334 bases (20301210 N's 170473124 real 120922179 upper 49550945 lower) in 3510 sequences in 1 files # Total size: mean 54351.7 sd 179245.2 min 327 (Contig8561) max 4147112 (Contig0) median 7338 # %25.97 masked total, %29.07 masked real # in fact, the missing N's should be: # (3510-1)*1000 = 23810210-20301210 = 3509000 faToTwoBit super.fa caePb2.supercontigs.2bit twoBitInfo caePb2.supercontigs.2bit stdout | sort -k2nr \ > caePb2.supercontigs.sizes # Copy this 2bit file and sizes over to /san/sanvol1/scratch/worms/caePb2 # the lft2BitToFa.pl script is: cat << '_EOF_' #!/usr/bin/env perl use strict; use warnings; use File::Basename; sub usage() { printf "usage: %s [more_files.lft]\n", basename($0); printf "\tfasta output is to stdout, therefore route stdout to result file\n"; exit 255; } my $argc = scalar(@ARGV); usage if ($argc < 2); my $twoBitFile = shift; while (my $liftFile = shift) { open (FH,"<$liftFile") or die "Can not open $liftFile"; while (my $line=) { chomp $line; my ($start, $contig, $length, $chrom, $chrom_length) = split('\s',$line); my $cmd=sprintf("twoBitToFa $twoBitFile:%s:%d-%d stdout", $chrom, $start, $start+$length); print `$cmd | sed -e "s#^>.*#>$contig#"`; } close (FH); } '_EOF_' # << happy emacs ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2008-05-13 - Hiram) # Use -repMatch=62 (based on size -- for human we use 1024, and # C. PB2801 size is ~6.1% of human judging by gapless caePb2 vs. hg18 # genome sizes from featureBits. So we would use 62, but that yields a very # high number of tiles to ignore, especially for a small more compact # genome. Bump that up a bit to be more conservative. ssh kolossus cd /cluster/data/caePb2 blat caePb2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=120 # Wrote 8586 overused 11-mers to jkStuff/11.ooc cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/caePb2 ############################################################################ ## Default position (DONE - 2008-05-13 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrUn:117559143-117563800" where name="caePb2";' hgcentraltest ######################################################################### # GENBANK AUTO UPDATE (DONE - 2008-05-13 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add caePb2 just before caeRem2 # caePb2 (C. brenneri) caePb2.serverGenome = /cluster/data/caePb2/caePb2.2bit caePb2.clusterGenome = /iscratch/i/worms/caePb2/caePb2.2bit caePb2.ooc = /iscratch/i/worms/caePb2/11.ooc caePb2.lift = /iscratch/i/worms/caePb2/caePb2.supercontigs.lift caePb2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} caePb2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} caePb2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} caePb2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} caePb2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} caePb2.refseq.mrna.native.load = no caePb2.refseq.mrna.xeno.load = yes caePb2.refseq.mrna.xeno.loadDesc = yes caePb2.genbank.mrna.xeno.load = yes caePb2.genbank.est.native.load = yes caePb2.genbank.est.native.loadDesc = no caePb2.downloadDir = caePb2 caePb2.perChromTables = no cvs ci -m "Added caePb2." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial caePb2 & # logFile: var/build/logs/2008.06.16-10:03:05.caePb2.initalign.log # real 85m23.353s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caePb2 # real 16m2.684s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add caePb2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added caePb2 - C. brenneri" etc/align.dbs etc/hgwdev.dbs make etc-update ########################################################################## ## Creating downloads (DONE - 2008-05-13 - Hiram) # There is only one chrom, make its trfMaskChrom file exist ssh hgwdev mkdir /cluster/data/caePb2/bed/simpleRepeat/trfMaskChrom cd /cluster/data/caePb2/bed/simpleRepeat/trfMaskChrom ## symlink didn't work here, the symlink ended up in the tar file cp -p ../trfMask.bed ./chrUn.bed cd /cluster/data/caePb2 # there is no RM file touch Un/chrUn.fa.out /cluster/bin/scripts/makeDownloads.pl caePb2 ## *!* EDIT THE README.txt FILES *!* ## ########################################################################## ## Creating pushQ (DONE - 2008-05-13 - Hiram) ssh hgwdev mkdir /cluster/data/caePb2/pushQ cd /cluster/data/caePb2/pushQ /cluster/bin/scripts/makePushQSql.pl -noGenbank caePb2 \ > caePb2.sql 2> stderr.out ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-05-13 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("caePb2", "blat5", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("caePb2", "blat5", "17787", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # Fixup assembly date after confirmation from John Spieth # 2008-05-27 - DONE - Hiram ssh hgwdev hgsql -e 'update dbDb set description="Feb. 2008" where name="caePb2"' \ hgcentraltest ######################################################################### # ELEGANS (ce6) PROTEINS TRACK (DONE - Hiram - 2008-06-11) ssh kkstore01 # breaking up this target genome into manageable pieces # using the contigs so we don't have genes spanning non-bridged gaps mkdir /cluster/data/caePb2/blastDb cd /cluster/data/caePb2 twoBitToFa -noMask maskedContigs/caePb2.supercontigs.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 3545 pieces of 3545 written rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ## copy to san for kluster access mkdir -p /san/sanvol1/scratch/worms/caePb2/blastDb cd /san/sanvol1/scratch/worms/caePb2/blastDb rsync -a --progress --stats /cluster/data/caePb2/blastDb/ . ## create the query protein set mkdir -p /cluster/data/caePb2/bed/tblastn.ce6SG cd /cluster/data/caePb2/bed/tblastn.ce6SG echo /san/sanvol1/scratch/worms/caePb2/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 3545 query.lst # we want around 80000 jobs calc `wc /cluster/data/ce6/bed/blat.ce6SG/ce6SG.psl | awk "{print \\\$1}"`/\(80000/`wc query.lst | awk "{print \\\$1}"`\) # 23741/(80000/3545) = 1052.023062 mkdir -p /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/sgfa split -l 1000 /cluster/data/ce6/bed/blat.ce6SG/ce6SG.psl \ /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/sgfa/sg ln -s /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/sgfa sgfa cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir -p /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/blastOut ln -s /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cd /cluster/data/caePb2/bed/tblastn.ce6SG cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh CBIN=/cluster/bin/i386 DB=caePb2 BLASTMAT=/scratch/data/blast229/data SCR="/scratch/tmp/${DB}" g=`/bin/basename $2` D=`/bin/basename $1` inputFile=${SCR}/${D}_${g} export CBIN DB BLASTMAT SCR g D /bin/mkdir -p ${SCR} /bin/cp -p $1.* ${SCR} /bin/cp -p $2 ${inputFile} /bin/cp -p /cluster/store1/worm/${DB}/blastDb.lft ${SCR}/${D}_blastDb.lft /bin/cp -p /cluster/store3/worm/ce6/bed/blat.ce6SG/protein.lft \ ${SCR}/${D}_protein.lft f=${SCR}/`/bin/basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/data/blast229/blastall -M BLOSUM80 -m 0 -F no \ -e $eVal -p tblastn -d ${SCR}/${D} -i ${inputFile} -o $f.8 then /bin/mv $f.8 $f.1 break; fi done if test -f $f.1 then if ${CBIN}/blastToPsl $f.1 $f.2 then ${CBIN}/liftUp -nosort -type=".psl" -nohead $f.3 \ ${SCR}/${D}_blastDb.lft carry $f.2 > /dev/null ${CBIN}/liftUp -nosort -type=".psl" -pslQ -nohead ${SCR}/$3.tmp \ ${SCR}/${D}_protein.lft warn $f.3 > /dev/null if ${CBIN}/pslCheck -prot ${SCR}/$3.tmp then /bin/rm -f $3 /bin/mv ${SCR}/$3.tmp $3 /bin/rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/${D}.* ${inputFile} \ ${SCR}/${D}_protein.lft ${SCR}/${D}_blastDb.lft /bin/rmdir --ignore-fail-on-non-empty ${SCR} fi exit 0 fi fi /bin/rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/${D}.* ${inputFile} \ ${SCR}/${D}_protein.lft ${SCR}/${D}_blastDb.lft /bin/rmdir --ignore-fail-on-non-empty ${SCR} exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh kk cd /cluster/data/caePb2/bed/tblastn.ce6SG gensub2 query.lst sg.lst template jobList para create jobList # para try, check, push, check etc. # Completed: 85080 of 85080 jobs # CPU time in finished jobs: 9631747s 160529.12m 2675.49h 111.48d 0.305 y # IO & Wait Time: 401660s 6694.33m 111.57h 4.65d 0.013 y # Average job time: 118s 1.97m 0.03h 0.00d # Longest finished job: 1032s 17.20m 0.29h 0.01d # Submission to last job: 75506s 1258.43m 20.97h 0.87d # do the cluster run for chaining ssh kk mkdir /cluster/data/caePb2/bed/tblastn.ce6SG/chainRun cd /cluster/data/caePb2/bed/tblastn.ce6SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/blastOut/c.`basename $1`.psl) '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList cd /cluster/data/caePb2/bed/tblastn.ce6SG/chainRun para create jobList # para maxNode 30 there are only 24 jobs para try, check, push, check etc. # Completed: 24 of 24 jobs # CPU time in finished jobs: 497s 8.28m 0.14h 0.01d 0.000 y # IO & Wait Time: 2371s 39.52m 0.66h 0.03d 0.000 y # Average job time: 120s 1.99m 0.03h 0.00d # Longest finished job: 125s 2.08m 0.03h 0.00d # Submission to last job: 161s 2.68m 0.04h 0.00d ssh kkstore01 cd /cluster/data/caePb2/bed/tblastn.ce6SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > /cluster/data/caePb2/bed/tblastn.ce6SG/blastCe6SG.psl cd .. pslCheck blastCe6SG.psl # checked: 35496 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/caePb2/bed/tblastn.ce6SG liftUp -nohead -type=.psl stdout ../../jkStuff/caePb2.supercontigs.lift \ error blastCe6SG.psl | sort -k14,14 -k16,16n \ | hgLoadPsl -table=blastCe6SG caePb2 stdin hgLoadPsl caePb2 blastCe6SG.psl # check coverage featureBits caePb2 blastCe6SG # 23871394 bases of 170473138 (14.003%) in intersection featureBits caePb1 blastCe4SG # 22988044 bases of 175247318 (13.117%) in intersection featureBits ce6 sangerGene # 28134889 bases of 100281426 (28.056%) in intersection ssh kkstore01 cd /cluster/data/caePb2/bed/tblastn.ce6SG rm -rf blastOut sgfa rm -rf /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG ########################################################################### # SWAP LASTZ ce9 (DONE - 2010-09-23 - Hiram) # original alignment cd /hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20 cat fb.ce9.chainCaePb2Link.txt # 40793086 bases of 100286004 (40.677%) in intersection # running swap mkdir /hive/data/genomes/caePb2/bed/blastz.ce9.swap cd /hive/data/genomes/caePb2/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20/DEF \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 & # real 2m36.529s cat fb.caePb2.chainCe9Link.txt # 55084649 bases of 170473138 (32.313%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20/DEF \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ -continue=syntenicNet -syntenicNet \ -bigClusterHub=pk -smallClusterHub=memk -swap > synNet.log 2>&1 & # real 0m42.368s ########################################################################### # ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-08 - Hiram) cd /hive/data/genomes/caePb2 mv blastDb blastDb.2008-06-16 mv blastDb.lft blastDb.lft.2008-06-16 mkdir blastDb twoBitToFa caePb2.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 212 pieces of 212 written rm temp.fa cd blastDb for i in *.fa do /scratch/data/blast-2.2.11/bin/formatdb -i $i -p F done rm *.fa ## create the query protein set mkdir -p /hive/data/genomes/caePb2/bed/tblastn.ce9SG cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG echo /hive/data/genomes/caePb2/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 212 query.lst # we want around 50000 jobs calc `wc -l /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl | cut -d" " -f1`/\(50000/`wc -l query.lst | cut -d" " -f1`\) # 28103/(50000/212) = 119.156720 mkdir sgfa split -l 100 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh DB=caePb2 BLASTMAT=/scratch/data/blast-2.2.11/data SCR="/scratch/tmp/${DB}" g=`basename $2` D=`basename $1` export BLASTMAT DB SCR g D mkdir -p ${SCR} cp -p $1.* ${SCR} f=${SCR}/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \ -e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/x86_64/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 \ /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \ /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh swarm cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG gensub2 query.lst sg.lst template jobList para create jobList para try, check, push, check etc. # Completed: 59784 of 59784 jobs # CPU time in finished jobs: 577335s 9622.26m 160.37h 6.68d 0.018 y # IO & Wait Time: 192124s 3202.06m 53.37h 2.22d 0.006 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 15337s 255.62m 4.26h 0.18d # do the cluster run for chaining ssh swarm mkdir /hive/data/genomes/caePb2/bed/tblastn.ce9SG/chainRun cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne #!/bin/csh -fe cd $1 set b = $1:t cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \ /hive/data/genomes/caePb2/bed/tblastn.ce9SG/blastOut/c.$b.psl '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /hive/data/genomes/caePb2/bed/tblastn.ce9SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG/chainRun para create jobList para try, check, push, check etc. # Completed: 282 of 282 jobs # CPU time in finished jobs: 9777s 162.95m 2.72h 0.11d 0.000 y # IO & Wait Time: 9172s 152.86m 2.55h 0.11d 0.000 y # Average job time: 67s 1.12m 0.02h 0.00d # Longest finished job: 781s 13.02m 0.22h 0.01d # Submission to last job: 839s 13.98m 0.23h 0.01d cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > ../blastCe9SG.psl cd .. pslCheck blastCe9SG.psl # checked: 41278 failed: 0 errors: 0 # load table ssh hgwdev cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG hgLoadPsl caePb2 blastCe9SG.psl # check coverage featureBits caePb2 blastCe9SG # 23730009 bases of 170473138 (13.920%) in intersection featureBits caeRem3 blastCe9SG # 20302540 bases of 138406388 (14.669%) in intersection featureBits cb3 blastCe9SG # 18490367 bases of 108433446 (17.052%) in intersection featureBits caeJap3 blastCe9SG # 12894398 bases of 154057934 (8.370%) in intersection featureBits melHap1 blastCe9SG # 4376245 bases of 53017507 (8.254%) in intersection featureBits melInc1 blastCe9SG # 3882043 bases of 82095019 (4.729%) in intersection featureBits priPac2 blastCe9SG # 5436779 bases of 133634773 (4.068%) in intersection featureBits bruMal1 blastCe9SG # 4424694 bases of 89235536 (4.958%) in intersection featureBits haeCon1 blastCe9SG # 4990746 bases of 278844984 (1.790%) in intersection featureBits ce9 sangerGene # 28689552 bases of 100286004 (28.608%) in intersection rm -rf blastOut #########################################################################