# for emacs: -*- mode: sh; -*- # This file describes how to make the browser database for the # worm C. briggsae ########################################################################### # DOWNLOAD SEQUENCE (DONE, 2005-04-29, hiram) ssh kkstore02 mkdir /cluster/store5/worm/cb2 cd /cluster/store5/worm/cb2 mkdir wustl cd wustl wget "ftp://genome.wustl.edu/private/lhillier/old/cb2.tar.gz" . tar --strip-path=1 -xvzf cb2.tar.gz faSize cb25.agp8mod.fasta # 108124579 bases (2974785 N's 105149794 real 105149794 upper 0 lower) in # 607 sequences in 1 files faCount cb25.agp8mod.fasta > contigs.faCount.txt grep "^>" cb25.agp8mod.fasta > contig.names mkdir contigs cd contigs faSplit byname ../cb25.agp8mod.fasta . # There was a broken sequence cb25.fpc0071c.fa # in this original file, the corrected sequence was received # from LaDeana Hillier 2005-09-08 and placed into # this contigs directory. And then, rebuild this # cb25.agp8mod.fasta file: cd /cluster/data/cb2/wustl mv cb25.agp8mod.fasta cb25.agp8mod.fasta.broken gzip cb25.agp8mod.fasta.broken & cat contigs/c*.fa > cb25.agp8mod.fasta cd /cluster/store5/worm/cb2 # Create chrom fasta records, all upper case time for A in wustl/chr*.agp do AGP=`basename $A` CHR=${AGP/.agp/} echo $AGP $CHR mkdir -p ${CHR} $HOME/bin/i386/agpToFa -verbose=2 -simpleMulti ${A} ${CHR} \ stdout wustl/cb25.agp8mod.fasta | \ tr '[a-z]' '[A-Z]' | \ sed -e "s/^>CHRUN/>chrUn/; s/^>CHR/>chr/; s/RANDOM/random/" \ > ${CHR}/${CHR}.fa rm -f ./${CHR}/${AGP} ln -s ../wustl/${AGP} ./${CHR}/${AGP} done # 1m30s # A single error (This has been fixed as mentioned above.) # chrI.agp chrI # cb25.fpc0071c start:0 end:1956661 seqSize: 1662314 # 1 illegal coordinates found in agp files. (it was bad sequence, # not agp error) # Fragment copy is more than available fragment sequence. 
# Artificially fix this until a corrected agp is delivered: < OK original chrI.agp > broken chrI.agp 15,16c15,16 < chrI 3303228 5259888 15 W cb25.fpc0071c 1 1956661 + < chrI 5259889 5261888 16 N 2000 contig no --- > chrI 3303228 4965541 15 W cb25.fpc0071c 1 1662314 + > chrI 4965542 5261888 16 N 294437 contig no faCount chr*/chr*.fa # #seq len A C G T N cpg # chrI 11066658 3330566 2012567 2016656 3341307 365562 369196 # chrII 14273684 4309027 2591324 2597030 4309946 466357 453226 # chrIII 13311297 4030612 2419456 2407118 4032755 421356 432253 # chrIII_random 1149121 348122 206481 205194 348271 41053 36576 # chrII_random 2403442 717077 420926 420733 716887 127819 72445 # chrI_random 3767006 1139118 682120 683610 1145925 116233 123073 # chrIV 15085352 4579142 2700476 2693260 4578635 533839 473913 # chrIV_random 884002 265220 157795 155141 269484 36362 27411 # chrUn 7825149 2272582 1301502 1301660 2278995 670410 226977 # chrV 15759610 4783092 2884362 2881565 4796302 414289 497767 # chrV_random 2980273 910601 539084 535605 914408 80575 93300 # chrX 20107906 6145898 3698455 3673518 6159052 430983 632930 # chrX_random 530426 165643 98518 99104 165320 1841 16683 # total 109143926 32996700 19713066 19670194 33057287 3706679 3455750 faToTwoBit chr*/chr*.fa cb2.2bit twoBitInfo cb2.2bit stdout | awk '{printf "%s\t%s\t/gbdb/cb2/cb2.2bit\n", $1,$2}' \ > chromInfo.tab twoBitInfo cb2.2bit stdout | sort -rn +1 > chrom.sizes # Back on hgwdev to create cluster data symlinks and start database ssh hgwdev ln -s /cluster/store5/worm/cb2 /cluster/data/cb2 mkdir /gbdb/cb2 ln -s /cluster/data/cb2/cb2.2bit /gbdb/cb2 cd /cluster/data/cb2 hgsql -e "create database cb2;" mysql hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" cb2 hgsql cb2 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' cb2 # Enter cb2 into dbDb and defaultDb so test browser knows about # it: hgsql -e 'INSERT INTO dbDb (name, description, 
nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("cb2", "Aug 2005", "/gbdb/cb2", "C. briggsae", \ "chrI:10000-110000", 1, 69, "C. briggsae", \ "Caenorhabditis briggsae", \ "/gbdb/cb2/html/description.html", 0, 0, \ "WUSTL Aug05");' \ -h localhost hgcentraltest # Update the defaultDb entry hgsql -e 'UPDATE defaultDb set name="cb2" where name="cb1";' hgcentraltest mkdir html ln -s /cluster/data/cb2/html /gbdb/cb2/html # Add cb2 to the trackDb/makefile and an empty directory in # trackDb/worm/cb2 # Perform a make in trackDb directory to get the trackDb # initialized for cb2 ls -d chr* | egrep -v "chromInfo|chromList" > chromList ########################################################################### # Load the gold/gap tables (DONE - 2005-08-10 - Hiram) # Redone - 2005-09-08 - Hiram cat chr*/chr*.agp | hgGoldGapGl cb2 stdin # !!! *** The indexes do not get built when hgGoldGapGl is run like this # To check: hgsql -e "show index from gold;" cb2 hgsql -e "analyze table gold;" cb2 hgsql -e "show index from gold;" cb2 hgsql -e "show index from gap;" cb2 hgsql -e "analyze table gap;" cb2 hgsql -e "show index from gap;" cb2 # The show index after the analyze will show more numbers in the # Cardinality of the index featureBits cb2 gap # 725000 bases of 108418926 (0.669%) in intersection ########################################################################### # Prepare scratch area for cluster runs (DONE, 2005-08-09 - Hiram) # Redone - 2005-09-08 - Hiram ssh hgwdev mkdir -p /san/sanvol1/scratch/worms/cb2 cd /san/sanvol1/scratch/worms/cb2 mkdir chroms cp -p /cluster/data/cb2/chr*/chr*.fa ./chroms cp -p /cluster/data/cb2/cb2.2bit . ########################################################################### # PREPARE Split contigs into 100,000 bp chunks for cluster runs # The chroms are split. The randoms and chrUn simply use their # fragments as is. 
Anytime these fragment results need to be put # back together, use the lift files created from the agp via the # perl script as used below. # (DONE, 2005-08-05, Hiram) # Redone - 2005-09-08 - Hiram # next machine ssh kkstore02 mkdir -p /san/sanvol1/scratch/worms/cb2/split cd /san/sanvol1/scratch/worms/cb2 time for C in I II III IV V X do mkdir split/${C} faSplit size chroms/chr${C}.fa 100000 split/${C}/${C} \ -lift=split/chr${C}.lft done time for C in I_random II_random III_random IV_random V_random X_random Un do CHR="chr${C}" rm -fr split/${C} mkdir split/${C} grep -v contig /cluster/data/cb2/${CHR}/${CHR}.agp \ | sed -e "/^$/d" | awk '{print $6}' \ | while read FN do cp -p /cluster/data/cb2/wustl/contigs/${FN}.fa ./split/${C} done /cluster/data/cb2/scripts/agpToLift.pl \ /cluster/data/cb2/${CHR}/${CHR}.agp > \ split/${CHR}.lft echo "done with ${CHR}" done cat split/c*I.lft split/c*V.lft split/c*X.lft > liftChroms.lft cat split/*_random.lft split/chrUn.lft > liftRandoms.lft # copy the contigs for the randoms back here from the bluearc # for masking and blastz runs ssh kkstore02 cd /cluster/data/cb2 mkdir randomContigs for C in I_random II_random III_random IV_random V_random X_random Un do rsync -a --progress /san/sanvol1/scratch/worms/cb2/split/${C}/ \ `pwd`/randomContigs/${C}/ done ############################################################################ # Run RepeatMasker on the chromosomes (DONE - 2005-08-09 - Hiram) # RE-DONE - 2005-09-08 - Hiram ssh pk cd /cluster/data/cb2 # make run directory and job list, create the script to use # for the RepeatMasker run cat << '_EOF_' > scripts/RMWorm #!/bin/csh -fe # # This is a slight rearrangement of the # RMChicken script used in makeGalGal2.doc # The results here need to go to a different location # $1 == chrom name: I II III IV V X M # $2 == directory where split contig .fa is found # $3 == name of contig .fa file cd $1 pushd . 
cd $2 /bin/mkdir -p /tmp/cb2/$3/$1 /bin/cp $3 /tmp/cb2/$3/$1 cd /tmp/cb2/$3/$1 /cluster/bluearc/RepeatMasker050305/RepeatMasker -alignments -s -species elegans $3 popd /bin/cp /tmp/cb2/$3/$1/$3.out ./ if (-e /tmp/cb2/$3/$1/$3.align) /bin/cp /tmp/cb2/$3/$1/$3.align ./ if (-e /tmp/cb2/$3/$1/$3.tbl) /bin/cp /tmp/cb2/$3/$1/$3.tbl ./ if (-e /tmp/cb2/$3/$1/$3.cat) /bin/cp /tmp/cb2/$3/$1/$3.cat ./ /bin/rm -r /tmp/cb2/$3/$1 /bin/rmdir --ignore-fail-on-non-empty /tmp/cb2/$3 /bin/rmdir --ignore-fail-on-non-empty /tmp/cb2 '_EOF_' # emacs happy chmod +x scripts/RMWorm # create job list mkdir RMRun rm -f RMRun/jobList for C in I II III IV V X Un II_random I_random III_random IV_random \ V_random X_random do mkdir /cluster/data/cb2/RMRun/${C} for T in /san/sanvol1/scratch/worms/cb2/split/$C/*.fa do D=`dirname $T` F=`basename $T` echo /cluster/data/cb2/scripts/RMWorm ${C} ${D} ${F} \ '{'check out line+ /cluster/data/cb2/RMRun/$C/${F}.out'}' done >> RMRun/jobList done # Do the run ssh pk cd /cluster/data/cb2/RMRun para create jobList para try, para check, para check, para push, para check, ... XXX - running - 2005-09-08 Completed: 1369 of 1369 jobs CPU time in finished jobs: 713871s 11897.85m 198.30h 8.26d 0.023 y IO & Wait Time: 9868s 164.47m 2.74h 0.11d 0.000 y Average job time: 529s 8.81m 0.15h 0.01d Longest finished job: 12428s 207.13m 3.45h 0.14d Submission to last job: 28879s 481.32m 8.02h 0.33d # Looks like the big contigs are the outliers here. Most # everything else finishes in about 4 minutes. 
# when they are finished, liftUp and load the .out files into the database: # next machine ssh kkstore02 cd /cluster/data/cb2/RMRun for C in I II III IV V X Un II_random I_random III_random IV_random \ V_random X_random do liftUp chr${C}.fa.out \ /san/sanvol1/scratch/worms/cb2/split/chr${C}.lft warn ${C}/*.fa.out done cat chrI.fa.out > rmsk.fa.out tail +4 chrII.fa.out >> rmsk.fa.out tail +4 chrIII.fa.out >> rmsk.fa.out tail +4 chrIV.fa.out >> rmsk.fa.out tail +4 chrV.fa.out >> rmsk.fa.out tail +4 chrX.fa.out >> rmsk.fa.out tail +4 chrUn.fa.out >> rmsk.fa.out tail --silent --lines=+4 chr*_random.fa.out >> rmsk.fa.out ssh hgwdev cd /cluster/data/cb2/RMRun hgLoadOut -nosplit -verbose=2 cb2 rmsk.fa.out # bad rep range [480, 441] line 6473 of rmsk.fa.out # bad rep range [330, 281] line 34048 of rmsk.fa.out # bad rep range [282, 274] line 34050 of rmsk.fa.out # bad rep range [333, 268] line 51955 of rmsk.fa.out # note: 4 records dropped due to repStart > repEnd featureBits cb2 rmsk # 16115005 bases of 108418926 (14.864%) in intersection featureBits -countGaps cb2 rmsk # 16115005 bases of 109143926 (14.765%) in intersection ####################################################################### # SIMPLE REPEAT [TRF] TRACK (DONE - 2005-08-10 - Hiram) # RE-DONE - 2005-09-09 - Hiram ssh kki mkdir -p /cluster/data/cb2/bed/simpleRepeat cd /cluster/data/cb2/bed/simpleRepeat mkdir trf ls -1S /san/sanvol1/scratch/worms/cb2/chroms/chr*.fa > genome.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp #ENDLOOP '_EOF_' # happy emacs gensub2 genome.lst single gsub jobList para create jobList para try # only 2 CPUs today: # Completed: 13 of 13 jobs # CPU time in finished jobs: 1065s 17.75m 0.30h 0.01d 0.000 y # IO & Wait Time: 446s 7.43m 0.12h 0.01d 0.000 y # Average job time: 116s 1.94m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished 
job: 408s 6.80m 0.11h 0.00d # Submission to last job: 756s 12.60m 0.21h 0.01d # When cluster run is done, combine into one: sort -k1,1 -k2,2n trf/*.bed > simpleRepeat.bed # Load into the database: # next machine ssh hgwdev cd /cluster/data/cb2/bed/simpleRepeat hgLoadBed -strict cb2 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 32690 elements of size 16 featureBits cb2 simpleRepeat # 3977808 bases of 108418926 (3.669%) in intersection # And run trf on the contigs for the separate masking and blastz runs ssh kki mkdir /cluster/data/cb2/bed/simpleRepeat/randomContigs cd /cluster/data/cb2/bed/simpleRepeat/randomContigs mkdir trf ls -1S /san/sanvol1/scratch/worms/cb2/split/*_random/*.fa \ /san/sanvol1/scratch/worms/cb2/split/Un/*.fa > contig.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp #ENDLOOP '_EOF_' # happy emacs gensub2 contig.lst single gsub jobList para create jobList para try # only 2 CPUs today: # Completed: 470 of 470 jobs # CPU time in finished jobs: 414s 6.90m 0.11h 0.00d 0.000 y # IO & Wait Time: 2522s 42.03m 0.70h 0.03d 0.000 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 73s 1.22m 0.02h 0.00d # Submission to last job: 1468s 24.47m 0.41h 0.02d mkdir -p trfMask for F in trf/*.bed do T=${F#trf/} echo "${F} > trfMask/${T}" awk '{if ($5 <= 12) print;}' ${F} > trfMask/${T} done ####################################################################### # PROCESS SIMPLE REPEATS AND RMSK INTO MASK (DONE, 2005-08-10 - Hiram) # RE-DONE - 2005-09-09 - Hiram # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore02 cd /cluster/data/cb2/bed/simpleRepeat mkdir -p trfMask for F in trf/*.bed do T=${F#trf/} echo "${F} > trfMask/${T}" awk '{if ($5 <= 12) print;}' ${F} > 
trfMask/${T} done # create Soft and Hard masks from RepeatMasker and TRF outputs: # and rebuild the 2bit file using the soft masking in the fa. # Might need the nibs for something, so make those too. # next machine ssh kkstore02 cd /cluster/data/cb2 mkdir softMask nib for C in `cat chromList` do echo -n "masking ${C} " rm -f softMask/${C}.fa maskOutFa ${C}/${C}.fa RMRun/${C}.fa.out \ softMask/${C}.fa -soft maskOutFa softMask/${C}.fa \ bed/simpleRepeat/trfMask/${C}.bed \ softMask/${C}.fa -softAdd rm -f nib/${C}.nib faToNib -softMask softMask/${C}.fa nib/${C}.nib done # masking chrI Writing 11066658 bases in 5533337 bytes # masking chrII Writing 14273684 bases in 7136850 bytes # masking chrIII Writing 13311297 bases in 6655657 bytes # masking chrIII_random Writing 1149121 bases in 574569 bytes # masking chrII_random Writing 2403442 bases in 1201729 bytes # masking chrI_random Writing 3767006 bases in 1883511 bytes # masking chrIV Writing 15085352 bases in 7542684 bytes # masking chrIV_random Writing 884002 bases in 442009 bytes # masking chrUn Writing 7825149 bases in 3912583 bytes # masking chrV Writing 15759610 bases in 7879813 bytes # masking chrV_random Writing 2980273 bases in 1490145 bytes # masking chrX Writing 20107906 bases in 10053961 bytes # masking chrX_random Writing 530426 bases in 265221 bytes # re-create the 2bit file rm -f cb2.2bit faToTwoBit softMask/chr*.fa cb2.2bit # create hard masks mkdir hardMask for C in `cat chromList` do echo "masking ${C}" maskOutFa softMask/${C}.fa hard hardMask/${C}.fa done mkdir randomContigs/softMask randomContigs/hardMask randomContigs/nib for C in I_random II_random III_random IV_random V_random X_random Un do for F in randomContigs/${C}/*.fa do B=`basename ${F}` T=${B/.fa/} echo "${B} ${T}" maskOutFa ${F} RMRun/${C}/${B}.out \ randomContigs/softMask/${B} -soft maskOutFa randomContigs/softMask/${B} \ bed/simpleRepeat/randomContigs/trfMask/${T}.bed \ randomContigs/softMask/${B} -softAdd maskOutFa 
randomContigs/softMask/${B} hard \ randomContigs/hardMask/${B} faToNib -softMask randomContigs/softMask/${B} \ randomContigs/nib/${T}.nib done done mkdir randomContigs/nib for C in I_random II_random III_random IV_random V_random X_random Un do for F in randomContigs/${C}/*.fa do B=`basename ${F}` T=${B/.fa/} echo "${B} ${T}" faToNib -softMask randomContigs/softMask/${B} \ randomContigs/nib/${T}.nib done done faToTwoBit softMask/chrI.fa softMask/chrII.fa softMask/chrIII.fa \ softMask/chrIV.fa softMask/chrV.fa softMask/chrX.fa \ randomContigs/softMask/*.fa chrRandomContigs.2bit # Check that all the sequence is there: twoBitToFa chrRandomContigs.2bit stdout | faSize stdin # 108680926 bases (3243679 N's 105437247 real 89174354 upper 16262893 lower) in 476 sequences in 1 files faSize softMask/c*.fa # 109143926 bases (3706679 N's 105437247 real 89174354 upper 16262893 lower) in 13 sequences in 13 files # Note the real, upper and lower numbers are the same, only the # N's are different # copy to san for cluster runs ssh kkstore02 mkdir -p /san/sanvol1/scratch/worms/cb2/chromNib mkdir /san/sanvol1/scratch/worms/cb2/randomContigs mkdir /san/sanvol1/scratch/worms/cb2/nib cd /cluster/data/cb2/nib cp -p c*I.nib c*V.nib c*X.nib /san/sanvol1/scratch/worms/cb2/chromNib cp -p c*.nib /san/sanvol1/scratch/worms/cb2/nib cd /cluster/data/cb2 cp -p cb2.2bit /san/sanvol1/scratch/worms/cb2 cp -p chrRandomContigs.2bit /san/sanvol1/scratch/worms/cb2 rsync -a --progress /cluster/data/cb2/randomContigs/softMask/ \ /san/sanvol1/scratch/worms/cb2/randomContigs/softMask/ rsync -a --progress /cluster/data/cb2/randomContigs/hardMask/ \ /san/sanvol1/scratch/worms/cb2/randomContigs/hardMask/ ####################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE 2005-08-17 Hiram) # RE-DONE - 2005-09-09 - Hiram # Use -repMatch=100 (based on size -- for human we use 1024, and # this worm size is ~3.7% of human judging by gapless cb2 genome size from # featureBits -- 
we would use 37, but bump that up a bit to be more # conservative). # featureBits hg17 rmsk # 1390952984 bases of 2866216770 (48.529%) in intersection # featureBits cb2 rmsk # 16092866 bases of 108124579 (14.884%) in intersection # 108124579 / 2866216770 = 0.037723 == 3.7% ssh kkstore02 cd /cluster/data/cb2 blat cb2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=100 # Wrote 8125 overused 11-mers to 11.ooc cp -p 11.ooc /san/sanvol1/scratch/worms/cb2 ####################################################################### # GC5BASE (DONE - 2005-08-10 - Hiram) # RE-DONE - 2005-09-09 - Hiram ssh kkstore02 mkdir /cluster/data/cb2/bed/gc5Base cd /cluster/data/cb2/bed/gc5Base time hgGcPercent -wigOut -doGaps -file=stdout -win=5 cb2 \ /cluster/data/cb2 | wigEncode stdin gc5Base.wig gc5Base.wib # 40 seconds ssh hgwdev cd /cluster/data/cb2/bed/gc5Base mkdir /gbdb/cb2/wib ln -s `pwd`/gc5Base.wib /gbdb/cb2/wib hgLoadWiggle cb2 gc5Base gc5Base.wig rm wiggle.tab ####################################################################### # sangerGeneWS140 - mapping Ce3 sangerGene onto this sequence # (DONE - 2005-08-17 - Hiram) # (RE-DONE - 2005-09-09 - Hiram) # fetched blast-2.2.11 x86_64 executable from: # ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/ # Installed in /cluster/bluearc/blast2211x86_64 # The i386 versions wouldn't work on these machines. 
# ssh kkstore02 # create .fa sequences for blastDb mkdir /san/sanvol1/scratch/worms/cb2/blastDb cd /san/sanvol1/scratch/worms/cb2/blastDb for C in I II III IV V X do CHR="chr${C}" mkdir ${CHR} faSplit size /cluster/data/cb2/${CHR}/${CHR}.fa 100000 ${CHR}/${CHR}_ \ -lift=${CHR}.lft done # some of the randomContigs are too large, so split them too mkdir randomContigs for C in I_random II_random III_random IV_random V_random X_random Un do CHR="chr${C}" mkdir ${CHR} mkdir randomContigs/${CHR} grep -v contig /cluster/data/cb2/${CHR}/${CHR}.agp | sed -e "/^$/d" | \ awk '{print $6}' | while read FN do CTG=${FN#cb25.} faSplit size /cluster/data/cb2/wustl/contigs/${FN}.fa \ 100000 ${CHR}/${CTG}_ -lift=randomContigs/${CHR}/${CTG}.lft done cat randomContigs/${CHR}/*.lft > randomContigs/${CHR}.lft /cluster/data/cb2/scripts/agpToLift.pl \ /cluster/data/cb2/${CHR}/${CHR}.agp > ${CHR}.lft echo "done with ${CHR}" done # consolidate the lift files cat c*I.lft c*V.lft c*X.lft > liftChroms.lft cat c*_random.lft chrUn.lft > liftRandoms.lft cat liftChroms.lft liftRandoms.lft randomContigs/*.lft > liftAll.lft # Make all the .fa files exist in one directory mkdir fasta cd fasta ls ../chr*/*.fa | wc # 1484 1484 32470 # put those 1,484 files together into 10 files to allow for # reasonably sized kluster jobs cat ../chr*/*.fa > tmp.fa faSplit sequence tmp.fa 10 c_ rm -f tmp.fa # And construct the blast database for i in *.fa do /cluster/bluearc/blast2211x86_64/bin/formatdb -i $i -p F done mkdir -p /cluster/data/cb2/bed/tblastn.sangerGene cd /cluster/data/cb2/bed/tblastn.sangerGene ls -1S /san/sanvol1/scratch/worms/cb2/blastDb/fasta/*.fa > query.lst ls -1S /san/sanvol1/scratch/worms/ce3/splitPep/*.fa > pep.lst mkdir blastOut for i in `cat pep.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check in line+ $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # happy emacs cat << '_EOF_' > blastSome 
#!/bin/sh BLASTMAT=/cluster/bluearc/blast2211x86_64/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/worms/cb2/blastDb/liftAll.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /san/sanvol1/scratch/worms/cb2/blastDb/liftAll.lft carry $f.3 if pslCheck -prot $f.4 then cp -p $f.4 $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $f.8 $f.3 $f.4 exit 1 '_EOF_' # happy emacs chmod +x blastSome # Now, to the kluster run ssh pk cd /cluster/data/cb2/bed/tblastn.sangerGene gensub2 query.lst pep.lst gsub jobList para create jobList para try; push; check ... etc ... # with lots of pk competition: # Completed: 8170 of 8170 jobs # CPU time in finished jobs: 89221s 1487.02m 24.78h 1.03d 0.003 y # IO & Wait Time: 33532s 558.86m 9.31h 0.39d 0.001 y # Average job time: 15s 0.25m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 63s 1.05m 0.02h 0.00d # Submission to last job: 6937s 115.62m 1.93h 0.08d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' # happy emacs cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=7000 stdin ../c.`basename $1`.psl) '_EOF_' # happy emacs chmod +x chainSome ls -1dS `pwd`/blastOut/wp???? > chain.lst gensub2 chain.lst single chainGsub chainJobs para create chainJobs para try; push; check ... etc ... 
# with lots of pk contention # Completed: 817 of 817 jobs # CPU time in finished jobs: 46s 0.77m 0.01h 0.00d 0.000 y # IO & Wait Time: 3635s 60.58m 1.01h 0.04d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 13s 0.22m 0.00h 0.00d # Submission to last job: 266s 4.43m 0.07h 0.00d ssh kkstore02 cd /cluster/data/cb2/bed/tblastn.sangerGene/blastOut for i in wp???? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 {print}" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n \ | uniq > ../preblastSangerGene.psl XXXX cd .. # haven't tried this yet - 2005-08-12 - should be interesting, need a # psl file indicating where the sangerGene's are on Ce3 and alias name list blatDir=/cluster/data/hg16/bed/blat.hg16KG protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl XXXX ssh hgwdev cd /cluster/data/cb2/bed/tblastn.sangerGene hgLoadPsl -table=blastSangerGene cb2 preblastSangerGene.psl # clean up ssh kkstore02 cd /cluster/data/cb2/bed/tblastn.sangerGene rm -rf blastOut # Is it sane: featureBits cb2 blastSangerGene # 13676954 bases of 108418926 (12.615%) in intersection ######################################################################### # BLASTZ C. remanei caeRem1 (WORKING - 2005-09-14 Hiram) # ssh pk mkdir /cluster/data/cb2/bed/blastzCaeRem1.2005_09_14 cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_14 # utilizing small target chunks and a single query chunk so that # the dynamic masking with parameter M can function. 
cat << '_EOF_' > DEF # Cb2 vs caeRem1 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_M=50 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Cb2 SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit SEQ1_2BIT=/san/sanvol1/scratch/worms/cb2/cb2.2bit SEQ1_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=1000000 SEQ1_LAP=100 # QUERY: remanei caeRem1 SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit SEQ2_2BIT=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=150000000 SEQ2_LAP=0 BASE=/cluster/data/cb2/bed/blastzCaeRem1.2005_09_14 SEQ1_LEN=$BASE/S1.len SEQ1_CTGLEN=$BASE/S1ctg.len SEQ2_LEN=$BASE/S2.len TMPDIR=/scratch/tmp '_EOF_' # << keep emacs coloring happy nibSize /san/sanvol1/scratch/worms/cb2/nib/*.nib \ | awk '{printf "%s\t%s\n", $2, $3}' > S1.len twoBitInfo /san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit S1ctg.len twoBitInfo /san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit S2.len # establish a screen to control this job screen cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_14 time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -stop=blastz \ `pwd`/DEF > blastz.out 2>&1 & # STARTED 2005-09-14 16:15 ######################################################################### # BLASTZ C. remanei caeRem1 (WORKING - 2005-08-15 Hiram) # RE-DONE 2005-09-09 - Hiram # ssh kk mkdir /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09 cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09 # utilizing small target chunks and a single query chunk so that # the dynamic masking with parameter M can function. 
cat << '_EOF_' > DEF # Cb2 vs caeRem1 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 BLASTZ_H=2000 BLASTZ_M=50 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Cb2 - full chroms only, no randoms SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/chromNib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=500000 SEQ1_LAP=50 # QUERY: remanei caeRem1 SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=150000000 SEQ2_LAP=0 BASE=/cluster/data/cb2/bed/blastzCaeRem1.2005_09_09 SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/cb2/chrom.sizes ./S1.len twoBitInfo /san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit S2.len # establish a screen to control this job screen cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09 time /cluster/bin/scripts/doBlastzChainNet.pl -stop chainMerge \ `pwd`/DEF > blast.run.out 2>&1 & # STARTED 2005-08-15 # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kkstore02 screen -d -r # STARTED - 2005-09-09 15:27 XXXX time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=cat -stop=chainMerge \ `pwd`/DEF > thruChainMerge.out 2>&1 & ########## And one for the randomContigs mkdir /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09/randomContigs cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09/randomContigs # utilizing small target chunks and a single query chunk so that # the dynamic masking with parameter M can function. 
cat << '_EOF_' > DEF # Cb2 vs caeRem1 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 BLASTZ_H=2000 BLASTZ_M=50 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Cb2 - random contigs only SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/randomContigs/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=500000 SEQ1_LAP=50 # QUERY: remanei caeRem1 SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=150000000 SEQ2_LAP=0 BASE=/cluster/data/cb2/bed/blastzCaeRem1.2005_09_09/randomContigs SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << happy emacs nibSize /san/sanvol1/scratch/worms/cb2/randomContigs/nib/*.nib | \ awk '{printf "%s\t%s\n", $2, $3}' > S1.len twoBitInfo /san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit S2.len # establish a screen to control this job screen cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09/randomContigs time /cluster/bin/scripts/doBlastzChainNet.pl -stop chainMerge \ `pwd`/DEF > blast.run.out 2>&1 & # STARTED 2005-09-09 15:40 time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=cat -stop=chainMerge \ `pwd`/DEF > thruChainMerge.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=net \ `pwd`/DEF > continueNet.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=load -stop=load \ `pwd`/DEF > load.out 2>&1 & XXXX # swap results to place cb2 alignments onto cb1 ssh kkstore02 cd /cluster/data/cb2/bed/blastzCb1.2005_05_02 time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \ swap.run.out 2>&1 & ############################################################################# # BLAT SERVER SETUP (DONE - 2005-09-09 - Hiram) ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES("cb2", "blat6", 17780, 1, 0);' \ -h localhost hgcentraltest hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES("cb2", "blat6", 17781, 0, 1);' \ -h localhost 
hgcentraltest ############################################################################# # BLASTZ Ce3 (WORKING - 2005-09-12 - Hiram) ssh kk mkdir /cluster/data/cb2/bed/blastzCe3.2005-09-12 cd /cluster/data/cb2/bed ln -s blastzCe3.2005-09-12 blastz.ce3 cd blastzCe3.2005-09-12 # Utilizing tiny target chunks and a single query chunk to use the # dynamic masking available via the BLASTZ_M parameter cat << '_EOF_' > DEF # cb2 vs ce3 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 BLASTZ_H=2000 BLASTZ_M=50 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: briggsae Cb2 SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit SEQ1_2BIT=/san/sanvol1/scratch/worms/cb2/cb2.2bit SEQ1_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=1000000 SEQ1_LAP=100 # QUERY: elegans Ce3 SEQ2_DIR=/san/sanvol1/scratch/worms/ce3/nib SEQ2_2BIT=/san/sanvol1/scratch/worms/ce3/ce3.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/cb2/bed/blastzCe3.2005-09-12 SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # happy emacs twoBitInfo /san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit S1.len twoBitInfo /san/sanvol1/scratch/worms/cb2/cb2.2bit stdout \ | grep random >> S1.len twoBitInfo /san/sanvol1/scratch/worms/cb2/cb2.2bit stdout \ | grep chrUn >> S1.len nibSize /san/sanvol1/scratch/worms/ce3/nib/*.nib | \ awk '{printf "%s\t%s\n", $2, $3}' | sort -rn +1 > S2.len XXXX - working on modifications to doBlastzChainNet.pl 2005-09-06 XXXX - to do this random contigs business properly # establish a screen to control this job screen time ./doBlastzChainNet.pl -verbose=2 -stop=cat \ -bigClusterHub=kk \ `pwd`/DEF > stopCat.run.out 2>&1 & # STARTED - 2005-09-09 - 16:18 # with lots of kluster contention # Completed: 104 of 104 jobs # CPU time in finished jobs: 125426s 2090.44m 34.84h 1.45d 0.004 y # IO & Wait Time: 1821s 30.35m 0.51h 0.02d 0.000 y # Average job 
time: 1224s 20.39m 0.34h 0.01d # Longest finished job: 2849s 47.48m 0.79h 0.03d # Submission to last job: 18148s 302.47m 5.04h 0.21d time ./doBlastzChainNet.pl -continue cat -stop chainRun \ -bigClusterHub=kk \ `pwd`/DEF > toChainRun.run.out 2>&1 & # Completed: 104 of 104 jobs # CPU time in finished jobs: 31s 0.52m 0.01h 0.00d 0.000 y # IO & Wait Time: 351s 5.84m 0.10h 0.00d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 12s 0.20m 0.00h 0.00d # Submission to last job: 211s 3.52m 0.06h 0.00d time ./doBlastzChainNet.pl -continue chainRun -stop chainMerge \ -bigClusterHub=kk \ `pwd`/DEF > toChainMerge.run.out 2>&1 & #Completed: 7 of 7 jobs #CPU time in finished jobs: 347s 5.78m 0.10h 0.00d 0.000 y #IO & Wait Time: 65s 1.08m 0.02h 0.00d 0.000 y #Average job time: 59s 0.98m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 120s 2.00m 0.03h 0.00d #Submission to last job: 696s 11.60m 0.19h 0.01d time ./doBlastzChainNet.pl -continue chainMerge -stop net \ -bigClusterHub=kk \ `pwd`/DEF > toNet.run.out 2>&1 & time ./doBlastzChainNet.pl -continue load -stop load \ -bigClusterHub=kk \ `pwd`/DEF > loadStep.run.out 2>&1 & time ./doBlastzChainNet.pl -continue download -stop cleanup \ -bigClusterHub=kk \ `pwd`/DEF > thruCleanup.run.out 2>&1 & # swap results to place cb2 alignments onto ce3 time ./doBlastzChainNet.pl -swap \ -bigClusterHub=kk \ `pwd`/DEF > \ swap.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ############################################################################# # BLASTZ SELF (experiments to test doBlastzChainNet.pl and to test # dynamic masking M parameter) ssh pk mkdir /cluster/data/cb2/bed/blastzSelfM50 cd /cluster/data/cb2/bed/blastzSelfM50 cat << '_EOF_' > DEF # cb2 vs cb2 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64 BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: briggsae Cb2 
SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/nib SEQ1_CTGDIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit SEQ1_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft SEQ1_LEN=/san/sanvol1/scratch/worms/cb2/chrom.sizes SEQ1_CTGLEN=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=500000 SEQ1_LAP=100 # QUERY: briggsae Cb2 SEQ2_DIR=/san/sanvol1/scratch/worms/cb2/nib SEQ2_CTGDIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit SEQ2_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft SEQ2_LEN=/san/sanvol1/scratch/worms/cb2/chrom.sizes SEQ2_CTGLEN=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.sizes SEQ2_SELF=1 SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=110000000 SEQ2_LAP=0 BASE=/cluster/data/cb2/bed/blastzSelfM50 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/cb2/bed/blastzSelfM50 time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -stop=load \ `pwd`/DEF > swapLoadReady.out 2>&1 & # Completed: 231 of 231 jobs # CPU time in finished jobs: 336879s 5614.65m 93.58h 3.90d 0.011 y # IO & Wait Time: 14146s 235.77m 3.93h 0.16d 0.000 y # Average job time: 1520s 25.33m 0.42h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 29772s 496.20m 8.27h 0.34d # Submission to last job: 38823s 647.05m 10.78h 0.45d # Completed: 231 of 231 jobs # CPU time in finished jobs: 171s 2.85m 0.05h 0.00d 0.000 y # IO & Wait Time: 1070s 17.83m 0.30h 0.01d 0.000 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 11s 0.18m 0.00h 0.00d # Submission to last job: 80s 1.33m 0.02h 0.00d # Completed: 45 of 45 jobs # CPU time in finished jobs: 4293s 71.54m 1.19h 0.05d 0.000 y # IO & Wait Time: 268s 4.47m 0.07h 0.00d 0.000 y # Average job time: 101s 1.69m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1605s 26.75m 0.45h 0.02d # Submission to last job: 1639s 27.32m 0.46h 0.02d # Crashed on one of the net steps, fix the script and continue: 
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -continue=net -stop=load \
        `pwd`/DEF > netToLoad.out 2>&1 &
    # real 38m41.919s
    # user 0m0.049s
    # sys 0m0.041s

    # Re-Load the tables under a special name:
    ssh hgwdev
    cd /cluster/data/cb2/bed/blastzSelfM50/axtChain
    hgLoadChain -tIndex cb2 M50chainSelf cb2.cb2.all.chain.gz
    # Loading 5934642 chains into cb2.M50chainSelf
    netFilter -minGap=10 cb2.cb2.net \
        | hgLoadNet -verbose=0 cb2 M50netSelf stdin
    # real 234m14.863s
    # user 4m57.370s
    # sys 2m34.100s
    time featureBits cb2 M50chainSelfLink
    # 57461471 bases of 108418926 (52.999%) in intersection

#############################################################################
# BLASTZ SELF (experiments to test doBlastzChainNet.pl and to test
#   dynamic masking M parameter)
#   Repeat experiment with M=1000

    ssh pk
    mkdir /cluster/data/cb2/bed/blastzSelfM1000
    cd /cluster/data/cb2/bed/blastzSelfM1000

    # Same DEF as the M=50 run except for BLASTZ_M and BASE
    cat << '_EOF_' > DEF
# cb2 vs cb2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=1000

# TARGET: briggsae Cb2
SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/nib
SEQ1_CTGDIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ1_LEN=/san/sanvol1/scratch/worms/cb2/chrom.sizes
SEQ1_CTGLEN=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=100

# QUERY: briggsae Cb2
SEQ2_DIR=/san/sanvol1/scratch/worms/cb2/nib
SEQ2_CTGDIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ2_LEN=/san/sanvol1/scratch/worms/cb2/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.sizes
SEQ2_SELF=1
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=110000000
SEQ2_LAP=0

BASE=/cluster/data/cb2/bed/blastzSelfM1000
TMPDIR=/scratch/tmp
'_EOF_'
    # happy emacs

    cd /cluster/data/cb2/bed/blastzSelfM1000
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -stop=load \
        `pwd`/DEF > swapLoadReady.out 2>&1 &

    featureBits cb2 chainSelf
    # 108554651 bases of 108418926 (100.125%) in intersection

    # Load the tables under a special name:
    #   one table per input file, named <file minus .chain>_M1000chainSelf
    #   (Bourne-shell loop, matching the for/do/done loops used elsewhere
    #   in this file)
    ssh hgwdev
    cd /cluster/data/cb2/bed/blastzSelfM1000/axtChain/chain
    for f in *.chain
    do
        c=${f%.chain}
        hgLoadChain cb2 "${c}_M1000chainSelf" "$f"
    done

    cd /cluster/data/cb2/bed/blastzSelfM1000/axtChain
    # Load nets:
    netFilter -minGap=10 cb2.cb2.net \
        | hgLoadNet -verbose=0 cb2 M1000netSelf stdin
    # real 181m56.670s
    # user 23m58.570s
    # sys 6m56.920s

    # featureBits runs out of memory on this on hgwdev
    ssh kolossus
    cd /cluster/data/cb2/bed/blastzSelfM1000
    time HGDB_CONF=~/.hg.conf.read-only featureBits cb2 \
        M1000chainSelfLink > fbCb2.M1000chainSelfLink 2>&1
    # 73156472 bases of 108418926 (67.476%) in intersection
    # real 42m33.104s
    # user 15m42.360s
    # sys 5m35.500s

# The comparison numbers:
#   pk kluster runs, x86_64 blastz binary
#   Target chunk size of 500,000 overlap 100
#   Query chunk size, whole genome = 110,000,000
#   231 kluster jobs
#
# BLASTZ_M parameter      M=50                    M=1000
# Average job time:       25 min                  85 min
# Hippos out to:          8.3 hr                  7.4 hr
# chainSelf table size    5,934,642 rows          27,659,493 rows
#                         527,006 Kb              2,340,000 Kb
# chainSelfLink table     33,604,807 rows         183,315,214 rows
#                         1,695 Mb                7,639 Mb
# netSelf table           274,211 rows            314,441 rows
#                         28,967 Kb               33,345 Kb
# featureBits SelfLink    57,461,471 bases        73,156,472 bases
#                         % 52.999 intersection   % 67.476 intersection

###########################################################################
# Preparing downloads

    ssh kkstore02
    mkdir /cluster/data/cb2/goldenPath
    cd /cluster/data/cb2/softMask
    mkdir ../goldenPath/bigZips
    tar cvzf ../goldenPath/bigZips/chromFa.tar.gz ./chr*.fa

    # bigZips was already created above; move into it so that the
    # relative path ../../RMRun resolves to /cluster/data/cb2/RMRun
    cd /cluster/data/cb2/goldenPath/bigZips
    cp -p ../../RMRun/rmsk.fa.out ./rmsk.out
    gzip rmsk.out

    mkdir /cluster/data/cb2/goldenPath/chromosomes
    cd /cluster/data/cb2/goldenPath/chromosomes
    cp -p ../../softMask/chr*.fa .
    gzip chr*.fa

    # Expose the download files through the web server tree via symlinks
    ssh hgwdev
    # -p: also creates the goldenPath/cb2 parent if it does not exist yet
    mkdir -p /usr/local/apache/htdocs/goldenPath/cb2/bigZips
    cd /usr/local/apache/htdocs/goldenPath/cb2/bigZips
    ln -s /cluster/data/cb2/goldenPath/bigZips/* .
    mkdir -p /usr/local/apache/htdocs/goldenPath/cb2/chromosomes
    cd /usr/local/apache/htdocs/goldenPath/cb2/chromosomes
    ln -s /cluster/data/cb2/goldenPath/chromosomes/chr*.fa.gz .