# for emacs: -*- mode: sh; -*- # Creating the assembly for Monodelphis domestica # South American, Short-tailed Opossum # http://www.genome.gov/11510687 # http://www.genome.gov/12512285 ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2005-08-18 - Hiram) ssh kkstore01 mkdir -p /cluster/store1/monDom2/broad.mit.edu ln -s /cluster/store1/monDom2 /cluster/data/monDom2 cd /cluster/data/monDom2/broad.mit.edu wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=4 --no-parent \ --no-host-directories --cut-dirs=4 \ ftp://ftp.broad.mit.edu/distribution/assemblies/mammals/monDom2/* # That takes a number of hours, ending up with: # -rw-rw-r-- 1 843 Jun 14 17:34 ForDistribution.command # -rw-rw-r-- 1 1199352275 Jun 14 17:35 V3.0_prel.agp.chromosome.fasta.gz # -rw-rw-r-- 1 6918810 Jun 14 17:36 assembly.agp # -rw-rw-r-- 1 470824048 Jun 14 17:36 V3.0_prel.agp.chromosome.qual.gz # -rw-rw-r-- 1 467679128 Jun 14 17:37 assembly.quals.gz # -rw-rw-r-- 1 14839262 Jun 14 17:37 assembly.markup # -rw-rw-r-- 1 2577871 Jun 14 17:37 assembly.links # -rw-rw-r-- 1 1196979152 Jun 14 17:37 assembly.bases.gz # -rw-rw-r-- 1 64 Jun 14 17:41 source # -rw-rw-r-- 1 175152051 Jun 14 17:41 assembly.unplaced # -rw-rw-r-- 1 3568712708 Jun 14 17:41 assembly.reads # -rw-rw-r-- 1 760535060 Jun 14 17:42 unplaced.fasta.gz # -rw-rw-r-- 1 2092672289 Jun 14 17:43 unplaced.qual.gz ######################################################################### # DATA INTEGRITY VERIFICATION (DONE - 2005-08-18 - Hiram) ssh kkstore01 mkdir /cluster/data/monDom2/dataCheck cd /cluster/data/monDom2/dataCheck zcat ../broad.mit.edu/assembly.bases.gz | grep "^>" \ | sed -e "s/^>//" > contigs.names # should be no duplicates sort contigs.names | wc # 65946 65946 846188 sort -u contigs.names | wc # 65946 65946 846188 sort contigs.names | head -3 # contig_0 # contig_1 # contig_10 sort contigs.names | tail -3 # contig_9997 # 
contig_9998 # contig_9999 wc contigs.names # 65946 65946 846188 contigs.names zcat ../broad.mit.edu/assembly.quals.gz | grep "^>" \ | sed -e "s/^>//" > contigs.quals.names # should be the same: sum -r contigs* # 39126 827 contigs.names # 39126 827 contigs.quals.names awk '{print $6}' ../broad.mit.edu/assembly.agp \ | grep contig_ > assembly.agp.contig.names # The assembly agp contig list appears to be the same contig list sum -r assembly.agp.contig.names # 39126 827 awk '{print $1}' ../broad.mit.edu/assembly.agp \ | sort -u > assembly.agp.scaffold.names wc assembly.agp.scaffold.names # 5077 5077 69968 assembly.agp.scaffold.names head -3 assembly.agp.scaffold.names # scaffold_0 # scaffold_1 # scaffold_10 tail -3 assembly.agp.scaffold.names # scaffold_997 # scaffold_998 # scaffold_999 zcat ../broad.mit.edu/V3.0_prel.agp.chromosome.fasta.gz | \ grep "^>" | sed -e "s/^>//; s/\..*//" > assembly.fasta.scaffold.names # 600 duplicate names in this listing: sort assembly.fasta.scaffold.names | wc # 5677 5677 76915 sort -u assembly.fasta.scaffold.names | wc # 5077 5077 69968 # This is due to their partitioning of large sequences into # 5 Mb chunks, e.g: # >scaffold_0.1-5000000 (TestV3.0_prel) # >scaffold_0.5000001-10000000 (TestV3.0_prel) # >scaffold_0.10000001-15000000 (TestV3.0_prel) faCount ../broad.mit.edu/V3.0_prel.agp.chromosome.fasta.gz \ > faCount.assembly.agp.fasta # seq len A C G # total 3575239585 1087726608 661068673 661056691 # T N cpg # 1086593681 78793932 16830562 # Shortest contig: sort -k2,2n faCount.assembly.agp.fasta | head # scaffold_5076 at 2002 bases # And many at the 5,000,000 chunk size. 
faCount ../broad.mit.edu/assembly.bases.gz > faCount.contigs # there are 65,946 contigs # seq len A C G # total 3496445653 1087726608 661068673 661056691 # T N cpg # 1086593681 0 16830563 # shortest contig is 2000 bases (about 10 of this size) # longest is 1,015,767 (contig_37046) grep "^contig" faCount.contigs | awk '{print $2}' \ | textHistogram -binSize=1000 -minVal=2000 -maxBinCount=1000 stdin \ | head # beginning of histogram: # 2000 ************************************************************ 4175 # 3000 ******************************************** 3072 # 4000 *************************** 1862 # 5000 ******************************** 2199 # 6000 ****************************** 2121 # 7000 ***************************** 1986 # 8000 ************************ 1662 # 9000 ******************* 1335 #10000 ***************** 1164 #11000 **************** 1126 ######################################################################### # COMBINE scaffold fasta pieces into single chunks, remove the extra # base-range info from the names. 
(DONE - Hiram - 2005-08-19) ssh kkstore01 cd /cluster/data/monDom2/broad.mit.edu cat << '_EOF_' > collapseScaffolds.sh #!/bin/sh # zcat V3.0_prel.agp.chromosome.fasta.gz | awk ' BEGIN { name="" } /^>/ { id=$1 sub(">","",id) sub("\\..*","",id) if (!match(name,id)) { printf ">%s\n", id name=id } } /^[^>]/ { print } ' '_EOF_' # happy emacs chmod +x collapseScaffolds.sh time ./collapseScaffolds.sh | gzip > scaffolds.fa.gz # 17 minutes # make sure we have not damaged the sequence: time faCount scaffolds.fa.gz > faCount.scaffolds.fa # 2 minutes tail -1 faCount.scaffolds.fa # seq len A C G # total 3575239585 1087726608 661068673 661056691 # T N cpg # 1086593681 78793932 16830563 # Those are the same numbers as above (well, one more cpg) # that could happen if one more is found at a # boundary break ######################################################################### # MAKE 2BIT NIB FILE (DONE - Hiram - 2005-08-19) # This is rebuilt again after repeat and simple masking are applied. # This first copy is just to get a browser running. ssh kkstore01 cd /cluster/data/monDom2 time faToTwoBit broad.mit.edu/scaffolds.fa.gz monDom2.2bit # 2 minutes # check that the sequence hasn't been damaged time twoBitToFa monDom2.2bit stdout | faCount stdin > faCount.2bit tail -1 faCount.2bit # seq len A C G # total 3575239585 1087726608 661068673 661056691 # T N cpg # 1086593681 78793932 16830563 # still the same numbers, looking good # The largest scaffold (0) is 199,827,570 ! 
# with seven of them over 100,000,000 ######################################################################### # CREATE DATABASE (DONE - Hiram - 2004-12-02) ssh hgwdev cd /cluster/data/monDom2 mkdir /gbdb/monDom2 ln -s /cluster/data/monDom2/monDom2.2bit /gbdb/monDom2 twoBitInfo monDom2.2bit stdout | awk '{printf "%s\t%s\t/gbdb/monDom2/monDom2.2bit\n", $1,$2}' \ > chromInfo.tab hgsql -e "create database monDom2;" mysql hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \ monDom2 hgsql monDom2 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \ monDom2 # generate chrom.sizes list in order by size, biggest first # This listing is going to be used in a variety of procedures # below twoBitInfo monDom2.2bit stdout | sort -rn +1 > chrom.sizes hgsql monDom2 -N -e 'select chrom from chromInfo' > chrom.lst # Enter monDom2 into dbDb and defaultDb so test browser knows about # it: hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("monDom2", "June 2005", "/gbdb/monDom2", "Opossum", \ "scaffold_0:1000000-11000000", 1, 34, "Opossum", \ "Monodelphis domestica", \ "/gbdb/monDom2/html/description.html", 0, 0, \ "Broad Inst. 
V3 Prelim Jun05");' \ -h localhost hgcentraltest # do this update later when it is time to make this be the default hgsql -e 'update defaultDb set name="monDom2" where genome="Opossum";' \ hgcentraltest hgsql -e 'update dbDb set defaultPos="scaffold_0:1000000-11000000" where name="monDom2";' \ hgcentraltest # Make trackDb table so browser knows what tracks to expect mkdir /cluster/data/monDom2/html ln -s /cluster/data/monDom2/html /gbdb/monDom2/html mkdir $HOME/kent/src/hg/makeDb/trackDb/opossum/monDom2 cd $HOME/kent/src/hg/makeDb/trackDb/opossum cvs add monDom2 cd monDom2 cp -p /cluster/data/monDom1/html/description.html ./description.html # edit that description to update cvs add description.html cvs commit cd $HOME/kent/src/hg/makeDb/trackDb # add monDom2 to the makefile, then make DBS=monDom2 ####################################################################### # LOAD GAP & GOLD TABLES FROM AGP (DONE - 2005-08-22 - Hiram) ssh hgwdev cd /cluster/data/monDom2 hgGoldGapGl -noGl monDom2 broad.mit.edu/assembly.agp # Verify sanity of indexes hgsql -e "show index from gold;" monDom2 hgsql -e "show index from gap;" monDom2 # Look at the Cardinality column, it should have some healthy # numbers for all indices # For some reason, the indices do not get built correctly -- # "show index from gap/gold" shows NULL cardinalities for chrom. # Rebuild indices with "analyze table". hgsql -e "analyze table gold; analyze table gap;" monDom2 ####################################################################### # CHUNK up scaffolds for repeat masking (DONE - Hiram - 2005-08-19) ssh kkstore01 cd /cluster/data/monDom2 mkdir scripts # breaks all scaffolds at 500,000 boundaries, result is to a # single file. The tiny scaffolds under that size remain intact. 
time faSplit size broad.mit.edu/scaffolds.fa.gz 500000 -oneFile \ scaffoldsSplit -lift=scripts/scaffoldsSplit.lft # 2 minutes # Now split that single file into 200,000 chunks, keeping fa record # boundaries, this has the effect of putting all the tiny # scaffolds together in larger single files. The 500,000 scaffold # chunks will be their own separate files. mkdir chunks500k time faSplit about scaffoldsSplit.fa 200000 chunks500k/chunk_ # 2 minutes, makes over 7000 files: ls chunks500k | wc # 7344 7344 101716 # copy to bluearc for efficient transfer to iservers mkdir /cluster/bluearc/monDom2 rsync -a --progress `pwd`/chunks500k/ /cluster/bluearc/monDom2/chunks500k/ # Then to the iservers ssh kkr1u00 mkdir /iscratch/i/monDom2 cd /iscratch/i/monDom2 rsync -a --progress /cluster/bluearc/monDom2/chunks500k/ ./chunks500k/ # And the special repeat masker library from previous opossum cp -p /iscratch/i/monDom1/monDom1.novel.RM.lib . # efficient rsync, just this data for i in 2 3 4 5 6 7 8 do rsync -a --progress --delete --rsh=rsh /iscratch/i/monDom2/ \ kkr${i}u00:/iscratch/i/monDom2/ done ####################################################################### # GC5BASE (DONE - 2005-08-19 - Hiram) # This can be done while the split above and copy is taking place. 
ssh kkstore01 mkdir /cluster/data/monDom2/bed/gc5Base cd /cluster/data/monDom2/bed/gc5Base time hgGcPercent -wigOut -doGaps -file=stdout -win=5 monDom2 \ /cluster/data/monDom2 | wigEncode stdin gc5Base.wig gc5Base.wib # 19 minutes wc gc5Base.wig # 693243 9012159 72153841 gc5Base.wig ssh hgwdev cd /cluster/data/monDom2/bed/gc5Base mkdir /gbdb/monDom2/wib ln -s `pwd`/gc5Base.wib /gbdb/monDom2/wib hgLoadWiggle monDom2 gc5Base gc5Base.wig ####################################################################### # RUN REPEAT MASKER second time (DONE - 2006-01-20 - 2006-01-23 - Hiram) ssh pk # move the older RM directories aside, then start anew: mkdir /cluster/data/monDom2/RMRun cd /cluster/data/monDom2/RMRun # Initialize the RM libraries for this species on this new repeat # masker /cluster/bluearc/RepeatMasker060120/RepeatMasker \ -species "Monodelphis domestica" /dev/null # RepeatMasker version development-$Id: RepeatMasker,v 1.11 # 2006/01/20 21:10:59 hiram Exp $ # Search engine: Crossmatch # Using Master Database: # /cluster/bluearc/RepeatMasker060120/Libraries/RepeatMaskerLib.embl # Using general libraries in: # /cluster/bluearc/RepeatMasker060120/Libraries/20060120/general # Building species libraries in: # /cluster/bluearc/RepeatMasker060120/Libraries/20060120/monodelphis_domestica # # That creates these five files: ls -og \ /cluster/bluearc/RepeatMasker060120/Libraries/20060120/monodelphis_domestica # -rw-rw-r-- 1 57952 Jan 20 14:22 shortcutlib # -rw-rw-r-- 1 20772 Jan 20 14:22 cutlib # -rw-rw-r-- 1 86793 Jan 20 14:22 shortlib # -rw-rw-r-- 1 89451 Jan 20 14:22 longlib # -rw-rw-r-- 1 194233 Jan 20 14:22 retrolib cd /cluster/data/monDom2 cat << '_EOF_' > scripts/RMOpossumV3 #!/bin/csh -fe cd $1 /bin/mkdir -p /scratch/tmp/monDom2/$2 /bin/cp /san/sanvol1/scratch/monDom2/chunks500k/$2 /scratch/tmp/monDom2/$2/ pushd /scratch/tmp/monDom2/$2 /cluster/bluearc/RepeatMasker060120/RepeatMasker -s \ -species "Monodelphis domestica" $2 popd /bin/cp -p 
/scratch/tmp/monDom2/$2/$2.out ./$2.out /bin/rm -fr /scratch/tmp/monDom2/$2/* /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom2/$2 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom2 '_EOF_' # happy emacs chmod +x scripts/RMOpossumV3 mkdir RMRun RMOut ls chunks500k | while read chunk do echo ../scripts/RMOpossumV3 /cluster/data/monDom2/RMOut $chunk \ '{'check in line+ /san/sanvol1/scratch/monDom2/chunks500k/$chunk'}' \ '{'check out line+ /cluster/data/monDom2/RMOut/$chunk.out'}' done > RMRun/jobList cd RMRun para create jobList para try ... check ... push ... etc ... XXXX - running 2006-01-20 Completed: 7344 of 7344 jobs CPU time in finished jobs: 26250196s 437503.27m 7291.72h 303.82d 0.832 y IO & Wait Time: 1295145s 21585.75m 359.76h 14.99d 0.041 y Average job time: 3751s 62.51m 1.04h 0.04d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 7619s 126.98m 2.12h 0.09d Submission to last job: 248204s 4136.73m 68.95h 2.87d # There were two failures that pushed out the total time # Without any failures it would take about 12 hours ssh kkstore01 cd /cluster/data/monDom2 # Place all these results together into a single file # Start with just the header head -3 RMOut/chunk_00.fa.out > RMRun/scaffolds.fa.RM.out # Then everything but the header time find ./RMOut -type f | sort -t_ -k2,2n | while read F do tail +4 ${F} >> RMRun/scaffolds.fa.RM.out done # real 2m23.526s # 7 Million lines, 900 Mb file wc RMRun/scaffolds.fa.RM.out # 7239721 108831708 933301569 RMRun/scaffolds.fa.RM.out # lift the split scaffolds to scaffold coordinates cd RMRun time liftUp -type=.out rmsk.out \ ../scripts/scaffoldsSplit.lft warn scaffolds.fa.RM.out # And now, load them up ssh hgwdev cd /cluster/data/monDom2/RMRun time hgLoadOut -verbose=2 -nosplit monDom2 rmsk.out > load.out 2>&1 # note: 75 records dropped due to repStart > repEnd # 15 minutes time featureBits monDom2 rmsk # 1811406819 bases of 3496445653 (51.807%) in intersection # real 6m19.242s # the first RM run 
had a featureBits of: # 1770704342 bases of 3496445653 (50.643%) in intersection # the second RM run had a featureBits of: # 1769890042 bases of 3496445653 (50.620%) in intersection # real 13m1.867s head -3 RMOutExtra/chunk_00.fa.out > RMExtra1/extra.out time find ./RMOutExtra -type f | sort -t_ -k2,2n | while read F do tail +4 ${F} >> RMExtra1/extra.out done ####################################################################### # RUN REPEAT MASKER (DONE - 2005-08-19 - Hiram)(OBSOLETE, see above) # Reusing the special repeat masker addition from monDom1 # Will run both -mammal and this one, and combine the results. # the library is a copy from monDom1/jkStuff/monDom1.novel.RM.lib # ### !!! *** Next time it would be better to add this library # into the RM set of libraries so only one run would be needed. # ### !!! *** ssh pk mkdir /san/sanvol1/scratch/monDom2 cd /san/sanvol1/scratch/monDom2 rsync -a --progress /cluster/bluearc/monDom2/ ./ cp -p /cluster/data/monDom1/jkStuff/monDom1.novel.RM.lib . mkdir /cluster/data/monDom2/RMOut cd /cluster/data/monDom2 # This script could be smarter and do the job of removing # duplicates from the combination of the two results. Next time # This can be done as it is done below with the awk script during # the liftUp. 
cat << '_EOF_' > scripts/RMOpossum #!/bin/csh -fe cd $1 /bin/mkdir -p /tmp/monDom2/$2 /bin/cp /san/sanvol1/scratch/monDom2/chunks500k/$2 /tmp/monDom2/$2/ pushd /tmp/monDom2/$2 /cluster/bluearc/RepeatMasker050305/RepeatMasker -s -lib /san/sanvol1/scratch/monDom2/monDom1.novel.RM.lib $2 mv $2.out $2.novel.out /cluster/bluearc/RepeatMasker050305/RepeatMasker -s -species "Monodelphis domestica" $2 tail +4 $2.novel.out >> $2.out popd /bin/cp -p /tmp/monDom2/$2/$2.out ./ /bin/rm -fr /tmp/monDom2/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/monDom2/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/monDom2 '_EOF_' # happy emacs chmod +x scripts/RMOpossum mkdir RMRun ls chunks500k | while read chunk do echo ../scripts/RMOpossum /cluster/data/monDom2/RMOut $chunk \ '{'check in line+ /san/sanvol1/scratch/monDom2/chunks500k/$chunk'}' \ '{'check out line+ /cluster/data/monDom2/RMOut/$chunk.out'}' done > RMRun/jobList cd RMRun para create jobList para try ... check ... push ... etc ... # This takes extra long compared to normal because of the two # passes being run on the data # Completed: 7344 of 7344 jobs # CPU time in finished jobs: 12118042s 201967.37m 3366.12h 140.26d 0.384 y # IO & Wait Time: 46258s 770.97m 12.85h 0.54d 0.001 y # Average job time: 1656s 27.61m 0.46h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2420s 40.33m 0.67h 0.03d # Submission to last job: 67144s 1119.07m 18.65h 0.78d # For comparison, this job on monDom1 on the kk kluster was: # Completed: 7627 of 7627 jobs # CPU time in finished jobs: 117842813s 1964046.88m 32734.11h 1363.92d 3.737 y # IO & Wait Time: 704870s 11747.84m 195.80h 8.16d 0.022 y # Average job time: 15543s 259.05m 4.32h 0.18d # Longest job: 24045s 400.75m 6.68h 0.28d # Submission to last job: 185727s 3095.45m 51.59h 2.15d # the double run above produces duplicate entries for the same # repeat, need to unique the results, and then lift up to scaffold # coordinates (next time eliminate this step by placing the 
extra # library into the standard RM libraries) ssh kkstore01 cd /cluster/data/monDom2 cat << '_EOF_' > scripts/uniqueRMOut.sh #!/bin/sh FN=$1 if [ ! -f ${FN} ]; then echo "Can not find file ${FN}" exit 255 fi # start with initial header lines head --lines=3 ${FN} # This awk removes the ID number from the end, # and it verifies all the lines are of the correct length. # The sort following the awk sorts by scaffold part name and # chrom start position, the uniq removes duplicate lines, and then # the awk following that adds back the ID number (simple sequence # count) awk ' { if (NF > 14) { if (! ((NF == 15) || (NF == 16))) { printf "ERROR: bad line:\n" > "/dev/stderr" print $0 > "/dev/stderr" } for (i=1; i < NF-1; ++i) {printf "%s ", $i} i = NF-1 printf "%s\n", $i; } else { if (! ((NF == 0) || (NF == 13) || (NF == 14))) { printf "ERROR: bad line:\n" > "/dev/stderr" print $0 > "/dev/stderr" } } } ' ${FN} | sort -k5,5 -k6,6n | uniq | awk ' BEGIN { i = 1 } { printf "%s %d\n", $0, i++ } ' '_EOF_' # happy emacs chmod +x scripts/uniqueRMOut.sh mkdir RMOutLifted find ./RMOut -type f | while read FN do BN=`basename ${FN}` echo "${BN}" ./scripts/uniqueRMOut.sh ${FN} \ | liftUp -type=.out RMOutLifted/${BN} \ ./scripts/scaffoldsSplit.lft warn stdin done # Place all these results together into a single file # Start with just the header head -3 RMOutLifted/chunk_00.fa.out > RMRun/scaffolds.fa.RM.out # Then everything but the header find ./RMOutLifted -type f | sort -t_ -k2,2n | while read F do tail +4 ${F} >> RMRun/scaffolds.fa.RM.out done # 17 Million lines, 2.2 Gb file # And now, load them up ssh hgwdev cd /cluster/data/monDom2/RMRun time hgLoadOut -nosplit monDom2 scaffolds.fa.RM.out # note: 74 records dropped due to repStart > repEnd # 28 minutes time featureBits monDom2 rmsk # 1770704342 bases of 3496445653 (50.643%) in intersection # real 15m19.287s featureBits monDom1 rmsk # 1775646140 bases of 3492108230 (50.847%) in intersection time featureBits -countGaps monDom1 rmsk 
# 1775646140 bases of 3563247205 (49.832%) in intersection # real 43m3.513s time featureBits -countGaps monDom2 rmsk # 1770704342 bases of 3575239585 (49.527%) in intersection # real 15m32.322s ####################################################################### # SIMPLE REPEAT [TRF] TRACK (DONE - 2005-08-19 - Hiram) # This can be done while the above repeat masker run is working ssh kk mkdir /cluster/data/monDom2/bed/simpleRepeat cd /cluster/data/monDom2/bed/simpleRepeat mkdir trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set path1 = $1 set inputFN = $1:t set outpath = $2 set outputFN = $2:t mkdir -p /tmp/$outputFN cp $path1 /tmp/$outputFN pushd . cd /tmp/$outputFN /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp popd rm -f $outpath cp -p /tmp/$outputFN/$outputFN $outpath rm -fr /tmp/$outputFN/* rmdir --ignore-fail-on-non-empty /tmp/$outputFN '_EOF_' # << this line makes emacs coloring happy chmod +x runTrf cat << '_EOF_' > gsub #LOOP ./runTrf $(path1) {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Normally would be checking input file, but doesn't work today on kk # ./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed} # funny business here to work around todays kluster difficulties # kk doesn't seem to have a good connection to iscratch since it # comes from kkr2 which is off-line ls -1S /cluster/bluearc/monDom2/chunks500k \ | sed -e "s#^#/iscratch/i/monDom2/chunks500k/#" > genome.lst gensub2 genome.lst single gsub jobList para create jobList para try ... check ... push ... etc ... 
# there was one unusual long-running job, not sure what happened there # Completed: 7344 of 7344 jobs # CPU time in finished jobs: 53229s 887.15m 14.79h 0.62d 0.002 y # IO & Wait Time: 19793s 329.89m 5.50h 0.23d 0.001 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 4650s 77.50m 1.29h 0.05d # Submission to last job: 6054s 100.90m 1.68h 0.07d # lift to scaffold coordinates ssh kkstore01 cd /cluster/data/monDom2/bed/simpleRepeat find ./trf -type f | xargs cat | \ liftUp simpleRepeat.bed \ /cluster/data/monDom2/scripts/scaffoldsSplit.lft \ warn stdin > lu.out 2>&1 # Load into the database: ssh hgwdev cd /cluster/data/monDom2/bed/simpleRepeat hgLoadBed monDom2 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 707995 elements of size 16 # Compare with previous assembly time featureBits monDom2 simpleRepeat # 69239555 bases of 3496445653 (1.980%) in intersection # real 2m34.603s time featureBits monDom1 simpleRepeat # 55000238 bases of 3492108230 (1.575%) in intersection # 30 minutes @ ### !!! *** Why are the featureBits on monDom1 taking so long ? ####################################################################### # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-08-22 - Hiram) # After the simpleRepeats track has been built, make a filtered # version # of the trf output: keep trf's with period <= 12: ssh kkstore01 cd /cluster/data/monDom2/bed/simpleRepeat awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ####################################################################### # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2005-08-22 - Hiram) # RE-DONE with new RM output 2006-01-23 # RE-DONE with new RM output 2005-12-16 # maskOutFa is trying to read in the entire input sequence to get # it ready for masking. This doesn't work on the ordinary memory # machines since scaffolds.fa is the whole thing. 
ssh kolossus mkdir /scratch/monDom2 cd /scratch/monDom2 cp -p /cluster/store1/monDom2/broad.mit.edu/scaffolds.fa.gz . cp -p /cluster/store1/monDom2/bed/simpleRepeat/trfMask.bed . rsync -a --progress kkstore01:/cluster/store1/monDom2/RMRun/rmsk.out . maskOutFa -soft scaffolds.fa.gz trfMask.bed maskedScaffolds.fa maskOutFa -clip -softAdd maskedScaffolds.fa rmsk.out maskedScaffolds.fa # looks good: # WARNING: negative rEnd: -15 scaffold_0:42772383-42772698 L1_Mars1 # WARNING: negative rEnd: -569 scaffold_1:112455185-112455386 L1_Mars1 # WARNING: negative rEnd: -27 scaffold_8:83916755-83916961 L1_Mars1 # WARNING: negative rEnd: -679 scaffold_8:83917313-83917408 L1_Mars1 # WARNING: negative rEnd: -16 scaffold_12:60303711-60303766 L3 # WARNING: negative rEnd: -1 scaffold_23:22193902-22193931 L1_Mdo1 # WARNING: negative rEnd: -289 scaffold_28:5288792-5288941 L1_Mars1a # WARNING: negative rEnd: -501 scaffold_28:5289048-5290587 L1_Mars1a # WARNING: negative rEnd: -67 scaffold_39:3078357-3078508 L1_Mdo5 # WARNING: negative rEnd: -76 scaffold_116:2053077-2053119 L1_Mdo5 # WARNING: negative rEnd: -322 scaffold_127:944681-944774 HAL1_Opos1 # real 3m45.815s # May as well create a 2bit while we are here: time faToTwoBit maskedScaffolds.fa masked.2bit # 2 minutes # Then, check it: twoBitToFa masked.2bit stdout | faCount stdin | tail -1 # looks the same as it was before # seq len A C G # total 3575239585 1087726608 661068673 661056691 # T N cpg # 1086593681 78793932 16830563 # compress the masked fa to prepare it to be copied back: time gzip maskedScaffolds.fa # real 30m47.082s # Copy the masked 2bit file back to kksilo ssh kkstore01 cd /cluster/data/monDom2 time rsync -a --progress kolossus:/scratch/monDom2/masked.2bit . # 22 seconds # Since this 2bit file is pointed to from /gbdb/monDom2 # replace it quickly: rm monDom2.2bit; mv masked.2bit monDom2.2bit time rsync -a --progress kolossus:/scratch/monDom2/maskedScaffolds.fa . 
# 1m 20s # clean up unneeded .fa rm scaffoldsSplit.fa # And clean up kolossus scratch ssh kolossus cd /scratch/monDom2 rm -fr * cd rmdir /scratch/monDom2 # Copy it to bluearc scratch/hg for rsync to kluster nodes: ssh kkstore01 cd /cluster/data/monDom2 ####################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2005-08-22 - Hiram) ssh kkstore01 cd /cluster/data/monDom2 # use repMatch of 1228 as this genome is ~ 20% larger than human # 1024 + (1024 * 0.2) = 1228 time blat monDom2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228 # Wrote 43908 overused 11-mers to 11.ooc # real 2m41.153s # copy to san filesystem for pk kluster use ssh pk cd /san/sanvol1/scratch/monDom2 cp -p /cluster/data/monDom2/11.ooc . cp -p /cluster/data/monDom2/monDom2.2bit . ########################################################################## # BLASTZ HUMAN hg18 (STARTED - 2006-01-23 - Hiram) ssh pk mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 cat << '_EOF_' > DEF # human vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom2 SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit SEQ2_LEN=/san/sanvol1/scratch/monDom2/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom2.2006-01-23 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # real 912m22.818s # This failed during the load of the chains due to the size of # chr19.chain. 
So, go to kolossus: ssh kolossus # There isn't any hg18 db here yet, get it established with a # chromInfo and a 2bit sequence: hgsql -e "create database hg18;" mysql cd /cluster/data/hg18 twoBitInfo hg18.2bit stdout | awk '{printf "%s\t%s\t/gbdb/hg18/hg18.2bit\n", $1,$2}' \ > chromInfo.kolossus.tab hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql hg18 \ -e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;' mkdir /gbdb/hg18 ln -s /cluster/data/hg18/hg18.2bit /gbdb/hg18/hg18.2bit # now, loading only chr19: cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain hgLoadChain hg18 chr19_chainMonDom2 chain/chr19.chain # while that is running, back on hgwdev, get the other chains loaded ssh hgwdev cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain cp loadUp.csh loadUp.noChr19.csh # change the foreach line to eliminate the chr19.chain: diff loadUp.csh loadUp.noChr19.csh < foreach f (*.chain) --- > foreach f (`ls *.chain | grep -v chr19.chain`) # And then run that script time ./loadUp.noChr19.csh > load.noChr19.out 2>&1 # When the kolossus load finishes, email to push-request and ask # for the two tables to be pushed from kolossus to hgwdev: # chr19_chainMonDom2 # chr19_chainMonDom2Link # then, continuing: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=download -bigClusterHub=pk -chainMinScore=5000 \ -chainLinearGap=loose `pwd`/DEF > download.out 2>&1 & # real 2m42.505s ########################################################################## # BLASTZ HUMAN hg17 (OBSOLETE - 2006-01-23 see above - 2005-08-22 - Hiram) ssh pk mkdir /cluster/data/hg17/bed/blastzMonDom2.2005_08_22 cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22 cat << '_EOF_' > DEF # Hg17 vs monDom2 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 # Set up blastz parameters using parameters between (chicken and fish ?) 
# but not abridging repeats since can't do that with scaffolds, and # it's not very relevant at this evolutionary distance. # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Hg17 SEQ1_DIR=/san/sanvol1/scratch/hg17/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: MonDom2 SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg17/bed/blastzMonDom2.2005_08_22 SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp -p /cluster/data/hg17/chrom.sizes ./S1.len twoBitInfo /san/sanvol1/scratch/monDom2/monDom2.2bit stdout | \ sort -rn +1 > S2.len # establish a screen to control this job # altered local copy of doBlastzChainNet.pl to fix santest location # and run everything on pk since the san location is unique here screen # to detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh pk screen -d -r time ./doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -fileServer=kkstore02 \ `pwd`/DEF > blast.run.out 2>&1 & # Completed: 140151 of 140151 jobs # CPU time in finished jobs: 17095319s 284921.98m 4748.70h 197.86d 0.542 y # IO & Wait Time: 549405s 9156.76m 152.61h 6.36d 0.017 y # Average job time: 126s 2.10m 0.03h 0.00d # Longest finished job: 11323s 188.72m 3.15h 0.13d # Submission to last job: 82333s 1372.22m 22.87h 0.95d # something failed with the kluster, finished off the kluster run # manually, and then start with the next step: cat time ./doBlastzChainNet.pl \ -continue=cat \ -bigClusterHub=pk \ -smallClusterHub=pk \ -fileServer=kkstore02 \ `pwd`/DEF > cat.run.out 2>&1 & # 34 minutes to the next failure: cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22/axtChain/run # to allow script to continue after we fixup this broken part para time > run.time cat run.time # 
Completed: 45 of 46 jobs # Crashed: 1 jobs # CPU time in finished jobs: 20950s 349.17m 5.82h 0.24d 0.001 y # IO & Wait Time: 782s 13.03m 0.22h 0.01d 0.000 y # Average job time: 483s 8.05m 0.13h 0.01d # Longest finished job: 2779s 46.32m 0.77h 0.03d # Submission to last job: 2841s 47.35m 0.79h 0.03d # 34 minutes to the next failure: # Batch failed after 4 tries on chain.csh chr19.nib:chr19: chain/chr19.nib:chr19:.chain # Command failed: # ssh -x pk /cluster/data/hg17/bed/blastzMonDom2.2005_08_22/axtChain/run/doChainRun.csh # This is most likely because we aren't working on the kki cluster # Need to get to larger machines. Go to kolossus to run the last job ssh kolossus cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22/axtChain/run time ./chain.csh chr19.nib:chr19: chain/chr19.nib:chr19:.chain # real 477m4.130s # user 228m6.042s # sys 162m31.903s # That is a lot of time on kolossus, almost 8 hours ! # Creating a 2 Gb file: # -rw-rw-r-- 1 2030235712 Aug 23 22:48 chr19.nib:chr19:.chain # To continue the run, back to pk ssh pk cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22 time ./doBlastzChainNet.pl \ -continue=chainMerge \ -bigClusterHub=pk \ -smallClusterHub=pk \ -fileServer=kkstore02 \ `pwd`/DEF > chainMerge.run.out 2>&1 & # real 195m24.430s # But it failed ! 
hgLoadChain hg17 chr19_chainMonDom2 chr19.chain Out of memory needMem - request size 28 bytes # Need to go back and rescore the chains at 5000 # Remove previous results ssh kkstore01 cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22 time ./doBlastzChainNet.pl \ -continue=chainRun \ -chainMinScore=5000 \ -bigClusterHub=pk \ -smallClusterHub=pk \ -fileServer=kkstore02 \ `pwd`/DEF > chainRun.run.out 2>&1 & # Completed: 45 of 46 jobs # Crashed: 1 jobs # CPU time in finished jobs: 13835s 230.58m 3.84h 0.16d 0.000 y # IO & Wait Time: 2943s 49.06m 0.82h 0.03d 0.000 y # Average job time: 373s 6.21m 0.10h 0.00d # Longest finished job: 1798s 29.97m 0.50h 0.02d # Submission to last job: 1802s 30.03m 0.50h 0.02d # Same problem as before with chr19 ssh kolossus cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22/axtChain/run time ./chain.csh chr19.nib:chr19: chain/chr19.nib:chr19:.chain # real 262m56.122s # user 193m29.527s # sys 68m42.523s # -rw-rw-r-- 1 1450265901 Aug 26 15:27 chain/chr19.nib:chr19:.chain # To continue the run, back to pk ssh pk cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22 time ./doBlastzChainNet.pl \ -continue=chainMerge \ -chainMinScore=5000 \ -bigClusterHub=pk \ -smallClusterHub=pk \ -fileServer=kkstore02 \ `pwd`/DEF > chainMerge.run.out 2>&1 & XXXX # swap results to place hg17 alignments onto monDom2 cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22 time ./doBlastzChainNet.pl -swap \ -bigClusterHub=pk \ -smallClusterHub=pk \ -fileServer=kkstore02 \ `pwd`/DEF > swap.run.out 2>&1 & # real 17m13.957s # user 0m0.395s # sys 0m0.148s ########################################################################## # SELF BLASTZ (TBD - 2005-08-22 - Hiram) # SELF track not actually that useful on this assembly, not done. # Going to need a set of nibs for this thing to use as SEQ1. # There's only about 5077 of them, so this is feasible. 
# This wouldn't work if there were a lot more. ssh kkstore01 mkdir /cluster/data/monDom2/maskedScaffolds cd /cluster/data/monDom2/maskedScaffolds faSplit byname ../maskedScaffolds.fa.gz . mkdir /cluster/data/monDom2/maskedNibs cd /cluster/data/monDom2/maskedNibs ls ../maskedScaffolds | while read S do N=${S/.fa/} echo faToNib -softMask ../maskedScaffolds/${S} ./${N}.nib faToNib -softMask ../maskedScaffolds/${S} ./${N}.nib done # place them on the san filesystem for pk kluster run ssh pk rsync -a --progress /cluster/data/monDom2/maskedNibs/ \ /san/sanvol1/scratch/monDom2/maskedNibs/ ########################################################################## # QUALITY (DONE - 2005-08-22 - Hiram) ssh kkstore01 mkdir /cluster/data/monDom2/bed/quality cd /cluster/data/monDom2/bed/quality cp -p /cluster/data/monDom1/bed/quality/qaToWigEncode.pl . # using perl script from previous build # Update the file name in this file: my $AGP_FILE="/cluster/data/monDom2/broad.mit.edu/assembly.agp"; # convert the qual.fa to wiggle single column input and then wigEncode: zcat ../../broad.mit.edu/V3.0_prel.agp.chromosome.qual.gz \ | ./qaToWigEncode.pl /dev/stdin \ | wigEncode stdin quality.wig quality.wib # Converted stdin, upper limit 50.00, lower limit 0.00 # This is interesting, in monDom1 we had an upper limit of 68 # 1 hour 34 minutes of processing time. # Resulting files: ls -og quality.wi? 
# -rw-rw-r-- 1 3496445653 Aug 22 17:51 quality.wib # -rw-rw-r-- 1 360668080 Aug 22 17:51 quality.wig # Note how the byte size of the .wib file is exactly equal # to the number bases in a featureBits run without -countGaps # from the rmsk featureBits above: # 1770704342 bases of 3496445653 (50.643%) in intersection # This means we have a valid quality score for every base that is # not in a gap, which is what we want ssh hgwdev cd /cluster/data/monDom2/bed/quality ln -s `pwd`/quality.wib /gbdb/monDom2/wib time hgLoadWiggle -pathPrefix=/gbdb/monDom2/wib monDom2 quality quality.wig # 5 minute load time # verify index: hgsql -e "show index from quality;" monDom2 # check that Cardinality is a number and is not NULL ########################################################################## # PRODUCING GENSCAN PREDICTIONS (DONE - 2005-08-23 - Hiram) # RE-DONE with new masked sequence 2006-01-23 - Hiram # Need a hard masked file ssh kolossus mkdir /scratch/monDom2 cd /scratch/monDom2 rsync -a --progress kkstore01:/cluster/data/monDom2/maskedScaffolds.fa . time maskOutFa maskedScaffolds.fa hard hardMaskedScaffolds.fa # 1m 25sec # send the result back (these rsync operations are faster than # simply copying over NFS) ssh kkstore01 cd /cluster/data/monDom2 time rsync -a --progress kolossus:/scratch/monDom2/hardMaskedScaffolds.fa . # break all these hard masked scaffolds at 5,000,000 boundaries, # result is to a single file. The tiny scaffolds under that # size remain intact. time faSplit -lift=genscan.lft -oneFile size \ hardMaskedScaffolds.fa 5000000 genscan_ # 5677 pieces of 5677 written # real 2m1.146s # save lift file for later liftUp cp -p genscan.lft /cluster/data/monDom2/scripts # Now split that single file into 200,000 chunks, keeping fa record # boundaries, this has the effect of putting all the tiny # scaffolds together in larger single files. The 5,000,000 scaffold # chunks will be their own separate files. 
mkdir hardMasked200K time faSplit about genscan_.fa 200000 hardMasked200K/c_ # real 2m5.636s # makes 1169 files: ls hardMasked200K | wc # 1169 1169 10590 # Clean out the /cluster/bluearc/monDom2/ files that were used # during repeat masking, then, this copy mkdir /san/sanvol1/scratch/monDom2/hardMasked200K/ cd /san/sanvol1/scratch/monDom2/hardMasked200K/ rsync -a --progress /cluster/data/monDom2/hardMasked200K/ . ssh hgwdev mkdir /cluster/data/monDom2/bed/genscan cd /cluster/data/monDom2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Make 3 subdirectories for genscan to place output files mkdir gtf pep subopt # This should not be done on the pk - these 5,000,000 sized chunks # cause too much memory to be used for the kk nodes. # These jobs are too large to run on the kk - they occupy about 1 Gb # of memory which is the memory size of a kk node, two of them on # the same node cause too much swapping. ssh pk cd /cluster/data/monDom2/bed/genscan ls -1S /san/sanvol1/scratch/monDom2/hardMasked200K/c*.fa > chunks.list cat << '_EOF_' > template #LOOP gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 chunks.list single template jobList para create jobList para try, check, push, check, ... 
XXX running 2006-01-23 # Completed: 1169 of 1169 jobs # CPU time in finished jobs: 228087s 3801.45m 63.36h 2.64d 0.007 y # IO & Wait Time: 1300618s 21676.96m 361.28h 15.05d 0.041 y # Average job time: 1308s 21.80m 0.36h 0.02d # Longest finished job: 7290s 121.50m 2.02h 0.08d # Submission to last job: 7372s 122.87m 2.05h 0.09d # Concatenate scaffold-level results: ssh kkstore01 cd /cluster/data/monDom2/bed/genscan time cat gtf/*.gtf | liftUp genscan.gtf \ /cluster/data/monDom2/scripts/genscan.lft error stdin time cat subopt/*.bed | liftUp genscanSubopt.bed \ /cluster/data/monDom2/scripts/genscan.lft error stdin cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/monDom2/bed/genscan ldHgGene -gtf monDom2 genscan genscan.gtf # 40051 gene predictions hgPepPred monDom2 generic genscanPep genscan.pep hgLoadBed monDom2 genscanSubopt genscanSubopt.bed # Loaded 492210 elements of size 6 # Compare with other assemblies: featureBits monDom2 genscan # 48628209 bases of 3496445653 (1.391%) in intersection featureBits monDom1 genscan # 47170795 bases of 3492108230 (1.351%) in intersection featureBits monDom2 genscanSubopt # 52714123 bases of 3496445653 (1.508%) in intersection featureBits monDom1 genscanSubopt # 51805835 bases of 3492108230 (1.484%) in intersection featureBits hg17 genscan # 55323340 bases of 2866216770 (1.930%) in intersection featureBits hg17 genscanSubopt # 55986178 bases of 2866216770 (1.953%) in intersection # Clean up cd /san/sanvol1/scratch/monDom2 rm -fr hardMasked200K # This kolossus clean up was actually left until much later. # I found these copies here to be useful later on. ssh kolossus cd /scratch/monDom2 rm -f * cd .. 
    rmdir monDom2

##########################################################################
# MAKE Human Proteins track (WORKING - 2005-08-24 - Hiram)
    ssh eieio
    mkdir -p /cluster/data/monDom2/blastDb
    cd /cluster/data/monDom2/blastDb
    faSplit sequence ../broad.mit.edu/scaffolds.fa.unmasked 500 x
    for i in x*
    do
        formatdb -i $i -p F
    done
    rm *.log *.fa
    ssh kkr1u00
    mkdir -p /iscratch/i/monDom2/blastDb
    cp /cluster/data/monDom2/blastDb/* /iscratch/i/monDom2/blastDb
    (iSync) > sync.out
    exit
    # back to eieio
    mkdir -p /cluster/data/monDom2/bed/tblastn.hg17KG
    cd /cluster/data/monDom2/bed/tblastn.hg17KG
    ls -1S /iscratch/i/monDom2/blastDb/*.nsq | sed "s/\.nsq//" > query.lst
    # estimate how many protein queries to place in each batch file:
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
    # 42156/(350000/499) = 60.102411
    mkdir -p /cluster/bluearc/monDom2/bed/tblastn.hg17KG/kgfa
    split -l 65 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/monDom2/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/monDom2/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
    then
        mv $f.8 $f.1
        break;
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    ssh kk
    cd /cluster/data/monDom2/bed/tblastn.hg17KG
    para create blastSpec
    para push
    # Completed: 323851 of 323851 jobs
    # CPU time in finished jobs: 113015986s 1883599.76m 31393.33h 1308.06d 3.584 y
    # IO & Wait Time: 2732896s 45548.27m 759.14h 31.63d 0.087 y
    # Average job time: 357s 5.96m 0.10h 0.00d
    # Longest job: 4911s 81.85m 1.36h 0.06d
    # Submission to last job: 274144s 4569.07m 76.15h 3.17d
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome
    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    ssh kki
    cd /cluster/data/monDom2/bed/tblastn.hg17KG
    para create chainSpec
    para push
    # Completed: 649 of 649 jobs
    # CPU time in finished jobs: 4726s 78.76m 1.31h 0.05d 0.000 y
    # IO & Wait Time: 76824s 1280.41m 21.34h 0.89d 0.002 y
    # Average job time: 126s 2.09m 0.03h 0.00d
    # Longest job: 312s 5.20m 0.09h 0.00d
    # Submission to last job: 7487s 124.78m 2.08h 0.09d
    exit
    # back to eieio
    cd /cluster/data/monDom2/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do
        awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
        echo $i
    done
    # NOTE(review): "-k 17,17n" is specified twice in the sort below; the
    # second occurrence is redundant (possibly a typo for another column)
    # but does not change the result -- left exactly as run.
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/monDom2/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..
    ssh hgwdev
    cd /cluster/data/monDom2/bed/tblastn.hg17KG
    hgLoadPsl monDom2 blastHg17KG.psl
    # back to eieio
    rm -rf blastOut
# End tblastn

##########################################################################
# blastz self (NOT POSSIBLE - 2005-08-25 - Hiram)
#	This does not work because it requires over 25,000,000
#	kluster jobs.  This procedure would have to be done in a
#	different way.
#	We do not need the self blastz on this assembly, it is not useful.
ssh pk mkdir /cluster/data/monDom2/bed/blastzSelf cd /cluster/data/monDom2/bed/blastzSelf cat << '_EOF_' > DEF # monDom2 vs monDom2 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 # Since we are running this without lineage specific repeats, # set BLASTZ_M to limit a single sequence matching 1000 times BLASTZ_M=1000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: MonDom2 SEQ1_DIR=/san/sanvol1/scratch/monDom2/maskedNibs SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: MonDom2 SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/monDom2/bed/blastzSelf SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # happy emacs twoBitInfo /san/sanvol1/scratch/monDom2/monDom2.2bit stdout | \ sort -rn +1 > S1.len cp -p S1.len S2.len # establish a screen to control this job # altered local copy of doBlastzChainNet.pl to fix santest location # and run everything on pk since the san location is unique here screen # to detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh pk screen -d -r time ./doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -fileServer=kkstore01 \ `pwd`/DEF > blast.run.out 2>&1 & # Fails on the gensub2 trying to create the jobList file # when it reaches some memory limit ########################################################################## # BLASTZ Mouse mm7 (WORKING - 2006-01-24 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzMonDom2.2006-01-24 cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24 cat << '_EOF_' > DEF # Mouse vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse (mm7) SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom2 SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzMonDom2.2006-01-24 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # Started 2006-01-24 13:55 ########################################################################## # BLASTZ Mouse mm6 (STARTED - 2005-08-25 - Hiram)(OBSOLETE - see above Mm7) # copied mm6 nib files from /cluster/data/mm6/nib to # /san/sanvol1/scratch/mm6/nib ssh pk mkdir /cluster/data/mm6/bed/blastzMonDom2.2005_08_25 cd /cluster/data/mm6/bed/blastzMonDom2.2005_08_25 cat << '_EOF_' > DEF # Mm6 vs monDom2 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 # Set up blastz parameters using parameters between (chicken and fish ?) # but not abridging repeats since can't do that with scaffolds, and # it's not very relevant at this evolutionary distance. 
# Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mm6 SEQ1_DIR=/san/sanvol1/scratch/mm6/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: MonDom2 SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzMonDom2.2005_08_25 SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp -p /cluster/data/mm6/chrom.sizes ./S1.len twoBitInfo /san/sanvol1/scratch/monDom2/monDom2.2bit stdout | \ sort -rn +1 > S2.len # establish a screen to control this job # altered local copy of doBlastzChainNet.pl to fix santest location # and run everything on pk since the san location is unique here screen # to detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh pk screen -d -r time ./doBlastzChainNet.pl \ -chainMinScore=5000 \ -bigClusterHub=pk \ -smallClusterHub=pk \ -fileServer=kkstore01 \ `pwd`/DEF > blast.run.out 2>&1 & XXX - STARTED - 2005-08-25 16:32 ############################################################################ # BLATSERVERS ENTRY (DONE - 2005-08-31 - Hiram) ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("monDom2", "blat16", "17780", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("monDom2", "blat16", "17781", "0", "1");' \ hgcentraltest ############################################################################ # BLASTZ SWAP FOR ZEBRAFISH (danRer3) (DONE, 2005-10-19, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS # do swap of danRer3 vs. monDom2 chain and net alignments to # create monDom2 vs. danRer3. see makeDanRer3.doc for details. 
ssh hgwdev cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun mkdir -p /san/sanvol1/scratch/monDom2/monDom2vsdanRer3Out nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/monDom2/monDom2vsdanRer3Out \ -swap -chainMinScore=5000 >& doSwap.log & # Start: Tue Oct 18 21:36 Finished: Oct 18 22:22 # Parameters used for danRer3 vs monDom2 BLASTZ: # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=10000 # BLASTZ_K=2200 # BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # BLASTZ_ABRIDGE_REPEATS=0 # Add entries for danRer3 chains and net to trackDb.ra for monDom2 and # modify descriptions to describe the process using scaffolds for danRer3 # chrNA and chrUn.