# for emacs: -*- mode: sh; -*-

# Creating the assembly for Monodelphis domestica
# South American, Short-tailed Opossum
#	http://www.genome.gov/11510687
#	http://www.genome.gov/12512285

# DOWNLOAD SEQUENCE (DONE - 2004-12-01 - Hiram)
    ssh kksilo
    mkdir -p /cluster/store8/monDom1/broad.mit.edu
    ln -s /cluster/store8/monDom1 /cluster/data/monDom1
    cd /cluster/data/monDom1/broad.mit.edu
    # Set user name and password in your $HOME/.netrc
    wget --timestamping "ftp://ftp.broad.mit.edu/*"
    # takes a couple hours
    # What we have:
    ls -ogrt
    # total 4071636
    # -rw-rw-r--  1       3992 Oct 18 16:45 assembly.format
    # -rw-rw-r--  1 1196876365 Oct 18 16:52 contigs.bases.gz
    # -rw-rw-r--  1  539154819 Oct 18 17:24 contigs.quals.gz
    # -rwxrwxr-x  1    4418102 Oct 18 17:52 supercontigs.summary
    # -rwxrwxrwx  1    3910827 Oct 18 17:52 supercontigs
    # -rw-rw-r--  1   23299859 Oct 18 17:52 reads.unplaced.gz
    # -rw-rw-r--  1  645129395 Oct 18 17:52 reads.placed.gz
    # -rw-rw-r--  1   10757748 Oct 18 17:52 assembly.agp
    # -rw-rw-r--  1 1199683639 Oct 19 17:15 assembly.agp.fasta.gz
    # -rw-rw-r--  1  541975023 Oct 19 17:34 assembly.agp.qual.gz
    time gunzip *.gz
    # That took almost 2 hours on a busy kksilo
    ls -ogrt
    # total 29983148
    # -rw-rw-r--  1        3992 Oct 18 16:45 assembly.format
    # -rw-rw-r--  1  3537338236 Oct 18 16:52 contigs.bases
    # -rw-rw-r--  1 10472323694 Oct 18 17:24 contigs.quals
    # -rwxrwxr-x  1     4418102 Oct 18 17:52 supercontigs.summary
    # -rwxrwxrwx  1     3910827 Oct 18 17:52 supercontigs
    # -rw-rw-r--  1   146964359 Oct 18 17:52 reads.unplaced
    # -rw-rw-r--  1  2273830873 Oct 18 17:52 reads.placed
    # -rw-rw-r--  1    10757748 Oct 18 17:52 assembly.agp
    # -rw-rw-r--  1  3608989613 Oct 19 17:15 assembly.agp.fasta
    # -rw-rw-r--  1 10614160078 Oct 19 17:34 assembly.agp.qual

# DATA INTEGRITY VERIFICATION (DONE - Hiram - 2004-12-02)
    ssh kksilo
    mkdir /cluster/data/monDom1/dataCheck
    cd /cluster/data/monDom1/dataCheck
    grep "^>" ../broad.mit.edu/contigs.bases | sed -e "s/^>//" > contigs.names
    # 2 minutes
    # Check for duplicates, should be same line count
    sort contigs.names | wc
    # 109065  109065 1415800
    sort -u contigs.names | wc
    # 109065  109065 1415800
    sort contigs.names | head -3
    # contig_0
    # contig_1
    # contig_10
    sort contigs.names | tail -3
    # contig_99997
    # contig_99998
    # contig_99999
    wc contigs.names
    # 109065  109065 1415800 contigs.names
    grep "^>" ../broad.mit.edu/contigs.quals \
	| sed -e "s/^>//" > contigs.quals.names
    # 6 minutes
    sum -r contigs*
    # 35350  1383 contigs.names
    # 35350  1383 contigs.quals.names
    awk '{print $6}' ../broad.mit.edu/assembly.agp \
	| grep contig_ > assembly.agp.contig.names
    # The assembly agp contig list appears to be the same contig list
    sort -u contigs.names | sum -r
    # 18868  1383
    sort -u assembly.agp.contig.names | sum -r
    # 18868  1383
    awk '{print $1}' ../broad.mit.edu/assembly.agp \
	| sort -u > assembly.agp.scaffold.names
    wc assembly.agp.scaffold.names
    # 19348  19348 279110 assembly.agp.scaffold.names
    head -3 assembly.agp.scaffold.names
    # scaffold_0
    # scaffold_1
    # scaffold_10
    tail -3 assembly.agp.scaffold.names
    # scaffold_9997
    # scaffold_9998
    # scaffold_9999
    grep "^>" ../broad.mit.edu/assembly.agp.fasta | \
	sed -e "s/^>//; s/\..*//" > assembly.fasta.scaffold.names
    # 2 minutes
    # There are some duplicates in this fasta file:
    sort -u assembly.fasta.scaffold.names | wc
    # 19348  19348 279110
    wc assembly.fasta.scaffold.names
    # 19576  19576 282523 assembly.fasta.scaffold.names
    # For example:
    grep "^>scaffold_13303" ../broad.mit.edu/assembly.agp.fasta
    # >scaffold_13303.1-5000000 (MonodelphisPreliminaryAssemblyOct04)
    # >scaffold_13303.5000001-10000000 (MonodelphisPreliminaryAssemblyOct04)
    # >scaffold_13303.10000001-15000000 (MonodelphisPreliminaryAssemblyOct04)
    # >scaffold_13303.15000001-20000000 (MonodelphisPreliminaryAssemblyOct04)
    # >scaffold_13303.20000001-22286839 (MonodelphisPreliminaryAssemblyOct04)
    faCount ../broad.mit.edu/assembly.agp.fasta > faCount.assembly.agp.fasta
    # seq   len   A   C   G
    # total 3563247205 1085390300 660684418 660336426
    # T   N   cpg
    # 1085697086 71138975 16821709
    grep scaffold_ faCount.assembly.agp.fasta > scaffolds.faCount
    # Taking a look at those counts, the smallest scaffold is 1000
    # bases.  The largest ones are broken into 5,000,000 base chunks
    # as evidenced by the duplicate example above.
    faCount ../broad.mit.edu/contigs.bases > faCount.contigs
    # seq   len   A   C   G
    # total 3492108230 1085390300 660684418 660336426
    # T   N   cpg
    # 1085697086 0 16821709
    grep contig_ faCount.contigs > contigs.faCount
    # The contigs range in size from 85 bases to 658,080
    # The bulk of them start at a size of 1000

# COMBINE scaffold fasta pieces into single chunks, remove the extra
# base-range info from the names.  (DONE - Hiram - 2004-12-02)
    ssh kksilo
    cd /cluster/data/monDom1/broad.mit.edu
    cat << '_EOF_' > collapseScaffolds.sh
#!/bin/sh
#
awk '
BEGIN { name="" }
/^>/ {
    id=$1
    sub(">","",id)
    sub("\\..*","",id)
    if (!match(name,id)) {
	printf ">%s\n", id
	name=id
    }
}
/^[^>]/ { print }
' assembly.agp.fasta
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x collapseScaffolds.sh
    time ./collapseScaffolds.sh > scaffolds.fa
    # 11 minutes
    # make sure we have not damaged the sequence:
    time faCount scaffolds.fa | tail -1
    # 4 minutes
    # total 3563247205 1085390300 660684418 660336426
    # 1085697086 71138975 16821709
    # Those are the same numbers as above

# MAKE 2BIT NIB FILE (DONE - Hiram - 2004-12-02)
    ssh kksilo
    cd /cluster/data/monDom1
    faToTwoBit broad.mit.edu/scaffolds.fa monDom1.2bit
    # check that the sequence hasn't been damaged
    twoBitToFa monDom1.2bit stdout | faCount stdin | tail -1
    # total 3563247205 1085390300 660684418 660336426
    # 1085697086 71138975 16821709
    # still the same numbers

# CREATE DATABASE (DONE - Hiram - 2004-12-02)
    ssh hgwdev
    cd /cluster/data/monDom1
    mkdir /gbdb/monDom1
    ln -s /cluster/data/monDom1/monDom1.2bit /gbdb/monDom1
    twoBitInfo monDom1.2bit stdout \
	| awk '{printf "%s\t%s\t/gbdb/monDom1/monDom1.2bit\n", $1,$2}' \
	> chromInfo.tab
    hgsql -e "create database monDom1;" mysql
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \
	monDom1
    hgsql monDom1 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \
	monDom1
    # generate chrom.sizes list in order by size, biggest first
    # This listing is going to be used in a variety of procedures below
    twoBitInfo monDom1.2bit stdout | sort -rn +1 > chrom.sizes
    hgsql monDom1 -N -e 'select chrom from chromInfo' > chrom.lst

    # Enter monDom1 into dbDb and defaultDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
	defaultPos, active, orderKey, genome, scientificName, \
	htmlPath, hgNearOk, hgPbOk, sourceName) \
	VALUES("monDom1", "Oct 2004", "/gbdb/monDom1", "M. domestica", \
	"scaffold_13303:1000000-11000000", 1, 33, "Opossum", \
	"Monodelphis domestica", \
	"/gbdb/monDom1/html/description.html", 0, 0, \
	"Broad Inst. Prelim Oct04");' \
	-h localhost hgcentraltest
    hgsql -e 'INSERT INTO defaultDb (name, genome) \
	VALUES("monDom1", "Opossum")' \
	-h localhost hgcentraltest

# Make trackDb table so browser knows what tracks to expect (DONE - Hiram)
    cd $HOME/kent/src/hg/makeDb/trackDb
    mkdir /cluster/data/monDom1/html
    ln -s /cluster/data/monDom1/html /gbdb/monDom1/html
    mkdir opossum
    cvs add opossum
    cd opossum
    mkdir monDom1
    cvs add monDom1
    cd monDom1
    cat << '_EOF_' > description.html
Monodelphis domestica

Description page to be done for Monodelphis domestica

'_EOF_'
    # << this line keeps emacs coloring happy
    cvs add description.html
    cvs commit
    cd ../..
    make DBS="monDom1" ZOO_DBS=""

# CHUNK up scaffolds for repeat masking (DONE - Hiram - 2004-12-03)
    # See also: comments about this in: makeApiMel1.doc
    ssh kksilo
    cd /cluster/data/monDom1
    mkdir -p jkStuff
    faSplit size broad.mit.edu/scaffolds.fa 500000 -oneFile \
	scaffoldsSplit -lift=jkStuff/scaffoldsSplit.lft
    time faSplit about scaffoldsSplit.fa 200000 chunks500k/chunk_
    # about 9 minutes for each of those two faSplits
    mkdir -p /cluster/bluearc/monDom1
    # copy to bluearc for cluster run
    cp -Rp ./chunks500k /cluster/bluearc/monDom1
    # this business in /cluster/bluearc/monDom1/ was later removed
    # and to iservers:
    ssh kkr1u00
    mkdir /iscratch/i/monDom1
    cd /iscratch/i/monDom1
    cp -Rp /cluster/bluearc/monDom1/chunks500k .
    /cluster/bin/iSync

# LOAD GAP & GOLD TABLES FROM AGP (DONE - 2004-12-03 - Hiram)
    ssh hgwdev
    cd /cluster/data/monDom1
    hgGoldGapGl -noGl monDom1 broad.mit.edu/assembly.agp
    # Verify sanity of indexes
    hgsql -e "show index from gold;" monDom1
    hgsql -e "show index from gap;" monDom1
    # Look at the Cardinality column, it should have some healthy
    # numbers for all indices
    # For some reason, the indices do not get built correctly --
    # "show index from gap/gold" shows NULL cardinalities for chrom.
    # Rebuild indices with "analyze table".
    hgsql -e "analyze table gold; analyze table gap;" monDom1

# RUN REPEAT MASKER (DONE - 2004-12-06 - Hiram)
    # Received a library from Michele Clamp, which she says Damian Keefe
    # at Ensembl created, to use in addition to the generic mammal library.
    # Will run both -mammal and this one, and combine the results.
    # have placed the library in monDom1/jkStuff/monDom1.novel.RM.lib
    # DIFFERENT fileserver, new disk space
    ssh kksilo
    mkdir /cluster/store1/monDom1
    mkdir /cluster/store1/monDom1/RMOut
    ln -s /cluster/store1/monDom1/RMOut /cluster/store8/monDom1
    ln -s /cluster/store8/monDom1/chunks500k /cluster/store1/monDom1
    ln -s /cluster/store8/monDom1/jkStuff /cluster/store1/monDom1
    #
    cd /cluster/data/monDom1
    cat << '_EOF_' > jkStuff/RMOpossum
#!/bin/csh -fe
cd $1
/bin/mkdir -p /tmp/monDom1/$2
/bin/cp ../chunks500k/$2 /tmp/monDom1/$2/
pushd /tmp/monDom1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -lib /iscratch/i/monDom1/monDom1.novel.RM.lib $2
mv $2.out $2.novel.out
/cluster/bluearc/RepeatMasker/RepeatMasker -s -species "Monodelphis domestica" $2
tail +4 $2.novel.out >> $2.out
popd
/bin/cp -p /tmp/monDom1/$2/$2.out ./
/bin/rm -fr /tmp/monDom1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/monDom1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/monDom1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x jkStuff/RMOpossum
    mkdir RMRun
    bash	# if you are csh
    ls chunks500k | while read chunk
do
    echo ../jkStuff/RMOpossum /cluster/data/monDom1/RMOut $chunk \
	'{'check in line+ /iscratch/i/monDom1/chunks500k/$chunk'}' \
	'{'check out line+ /cluster/data/monDom1/RMOut/$chunk.out'}'
done > RMRun/RMJobs
    exit	# out of bash back to csh
    # do the run
    ssh kk
    mkdir /cluster/data/monDom1/RMRun
    cd /cluster/data/monDom1/RMRun
    para create RMJobs
    para try, check, push, check,...
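    # (optional completeness check, not part of the original run: before
    # lifting the results below, there should be one .out file in RMOut
    # for every chunk that was submitted)
    ls /cluster/data/monDom1/RMOut/*.out | wc -l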
    # This takes extra long compared to normal because of the two
    # passes being run on the data
    # Completed: 7627 of 7627 jobs
    # CPU time in finished jobs: 117842813s 1964046.88m 32734.11h 1363.92d 3.737 y
    # IO & Wait Time:               704870s   11747.84m   195.80h    8.16d 0.022 y
    # Average job time:     15543s  259.05m  4.32h 0.18d
    # Longest job:          24045s  400.75m  6.68h 0.28d
    # Submission to last job: 185727s 3095.45m 51.59h 2.15d

    # Lift up the split-scaffold .out's to scaffold .out's
    ssh eieio
    cd /cluster/store1/monDom1
    mkdir RMOutLifted
    ln -s `pwd`/RMOutLifted /cluster/data/monDom1
    bash	# if you are csh
    find ./RMOut -type f | while read F
do
    B=${F/.\/RMOut\//}
    echo "$F -> RMOutLifted/$B"
    liftUp RMOutLifted/$B jkStuff/scaffoldsSplit.lft warn $F
done
    exit	# the bash session if you are csh
    # Cat all those lifted .out files together into one single file
    head -3 RMOutLifted/chunk_00.fa.out > RMRun/scaffolds.fa.RM.out
    find ./RMOutLifted -type f | while read f
do
    tail +4 $f >> RMRun/scaffolds.fa.RM.out
done
    # I tried the following sort in the above loop to see if
    # the sort mentioned below could be avoided:
    #	tail +4 $f | sort -u >> scaffolds.unique.fa.RM.out
    # but this sort was not thorough enough.  It ended up with more
    # records loaded into the database, but with the same amount
    # of featureBits coverage.
    # Eliminate the duplicates from the double RepeatMasker run.
    # When run twice, it finds the Simple and Low Complexity repeats
    # twice.
    hgLoadOut -tabFile=hgLoadOut.tab monDom1 -nosplit scaffolds.fa.RM.out
    # A few "Strange perc. field" outputs, which always happens, but
    # not too many.  e.g.:
    # Strange perc. field -1.4 line 1939539 of scaffolds.fa.RM.out
    # Strange perc. field -2.5 line 3078381 of scaffolds.fa.RM.out
    # Strange perc. field -2.1 line 3078381 of scaffolds.fa.RM.out
    # Strange perc. field -40.9 line 3442991 of scaffolds.fa.RM.out
    # Strange perc. field -3.9 line 3442991 of scaffolds.fa.RM.out
    # ... etc ...
    # note: 3 records dropped due to repStart > repEnd
    ls -og scaffolds.fa.RM.out hgLoadOut.tab
    # -rw-rw-r--  1 2553485808 Dec  6 00:05 scaffolds.fa.RM.out
    # -rw-rw-r--  1 1935604506 Dec  6 00:16 hgLoadOut.tab
    sort -k 6,6 -k7n,7n hgLoadOut.tab | uniq > hgLoadOutUniq.tab
    # takes almost an hour
    ssh hgwdev
    cd /cluster/store1/monDom1
    # Get the structure of the table started
    hgLoadOut -nosplit -table=rmsk monDom1 RMOutLifted/chunk_1000.fa.out
    # Now, reload it fully with the unique set
    hgsql -e 'delete from rmsk;' monDom1
    hgsql -e 'load data local infile "hgLoadOutUniq.tab" into table rmsk;' \
	monDom1
    # This load takes an hour; 12 minutes for a featureBits:
    featureBits monDom1 rmsk
    # 1775646140 bases of 3492108230 (50.847%) in intersection
    featureBits hg17 rmsk
    # 1390952984 bases of 2866216770 (48.529%) in intersection
    featureBits canFam1 rmsk
    # 896773874 bases of 2359845093 (38.001%) in intersection
    featureBits mm5 rmsk
    # 1137310280 bases of 2615483787 (43.484%) in intersection
    featureBits rn3 rmsk
    # 1117483165 bases of 2571104688 (43.463%) in intersection
    featureBits panTro1 rmsk
    # 1311281819 bases of 2733948177 (47.963%) in intersection
    # Verify the indexes are correct
    hgsql -e "show index from rmsk;" monDom1
    # Should be a good cardinality count for each one
    # It does not: it shows NULL for the (genoName(14), bin) index
    # and 18744904 for both (genoName(14), genoStart) and (genoName(14), genoEnd)
    hgsql monDom1 -e 'drop index genoName on rmsk; \
	drop index genoName_2 on rmsk; \
	drop index genoName_3 on rmsk; \
	create index bin on rmsk (genoName(14), bin); \
	create index genoStart on rmsk (genoName(14), genoStart); \
	create index genoEnd on rmsk (genoName(14), genoEnd);'
    # This index creation took 76 minutes
    # After that, the "bin" index has a cardinality of 48,561
    # genoStart and genoEnd are still at 18,744,904

# SIMPLE REPEAT [TRF] TRACK (DONE - 2004-12-03 - Hiram)
    # Run this on the rack 9 cluster
    ssh kk9
    mkdir /cluster/data/monDom1/bed/simpleRepeat
    cd /cluster/data/monDom1/bed/simpleRepeat
    mkdir trf
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runTrf
    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1S /iscratch/i/monDom1/chunks500k \
	| sed -e "s#^#/iscratch/i/monDom1/chunks500k/#" > genome.lst
    gensub2 genome.lst single gsub jobList
    para create jobList
    para try
    para check
    para push
    # Completed: 7627 of 7627 jobs
    # CPU time in finished jobs:  24845s  414.08m  6.90h 0.29d 0.001 y
    # IO & Wait Time:             19407s  323.45m  5.39h 0.22d 0.001 y
    # Average job time:       6s  0.10m  0.00h 0.00d
    # Longest job:            49s  0.82m  0.01h 0.00d
    # Submission to last job: 1309s 21.82m  0.36h 0.02d
    find ./trf -type f | xargs cat | \
	liftUp simpleRepeat.bed \
	/cluster/data/monDom1/jkStuff/scaffoldsSplit.lft \
	warn stdin > lu.out 2>&1

    # Load into the database:
    ssh hgwdev
    cd /cluster/data/monDom1/bed/simpleRepeat
    hgLoadBed monDom1 simpleRepeat simpleRepeat.bed \
	-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Loaded 697668 elements of size 16
    # Compare with some other assemblies
    featureBits monDom1 simpleRepeat
    # 55000238 bases of 3492108230 (1.575%) in intersection
    featureBits rn3 simpleRepeat
    # 70073656 bases of 2571104688 (2.725%) in intersection
    featureBits mm5 simpleRepeat
    # 81414259 bases of 2615483787 (3.113%) in intersection
    featureBits hg17 simpleRepeat
    # 54952425 bases of 2866216770 (1.917%) in intersection

# CREATE MICROSAT TRACK (done 2006-7-5 JK)
    ssh hgwdev
    cd /cluster/data/monDom1/bed
    mkdir microsat
    cd microsat
    # Select perfect di- and tri-nucleotide repeats with at least 15 copies:
    # period ($5) of 2 or 3, copy count ($6) >= 15, 100% match ($8), no
    # indels ($9); name each one <copies>x<motif> from the repeat sequence ($16)
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
	../simpleRepeat/simpleRepeat.bed > microsat.bed
    /cluster/bin/i386/hgLoadBed monDom1 microsat microsat.bed

# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-12-06 - Hiram)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd /cluster/data/monDom1/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2004-12-06 - Hiram)
    # maskOutFa reads the entire input sequence into memory to get it
    # ready for masking.  This doesn't work on the ordinary-memory
    # machines since scaffolds.fa is the whole assembly in one file.
    ssh kolossus
    mkdir /scratch/monDom1
    cd /scratch/monDom1
    cp -p /cluster/store8/monDom1/broad.mit.edu/scaffolds.fa .
    cp -p /cluster/store1/monDom1/bed/simpleRepeat/trfMask.bed .
    cp -p /cluster/store1/monDom1/RMRun/scaffolds.fa.RM.out .
    maskOutFa -soft scaffolds.fa trfMask.bed maskedScaffolds.fa
    maskOutFa -clip -softAdd maskedScaffolds.fa scaffolds.fa.RM.out \
	maskedScaffolds.fa
    # those two mask operations each take 3 to 4 minutes.
    # There were a number of negative rEnd warnings on scaffold_13261,
    # all on type L1Md_T, one warning each on
    # scaffold_13244, scaffold_13338 for type L1M4c,
    # and one on scaffold_13498 for type monodelphis_domestica_23_1
    # and one warning:
    #	Mask record out of range line 6686033 of scaffolds.fa.RM.out
    #	scaffold_13485 size is 3678616, range is 3678568-3678618
    # This last error makes it stop without doing anything.  Need to
    # fix this.  Use the -clip option.
    # That helps it past that, and now more warnings are seen,
    # on scaffold_16808 for type Lx5,
    # and one warning on scaffold_14879 for type monodelphis_domestica_48
    # May as well create a 2bit while we are here:
    time faToTwoBit maskedScaffolds.fa masked.2bit
    # 3 minutes
    # Then, check it:
    twoBitToFa masked.2bit stdout | faCount stdin | tail -1
    # 4 minutes, seems to have lost 2 A's and 1 C:
    # total 3563247202 1085390298 660684418 660336425
    # 1085697086 71138975 16821709
    # previously:
    # total 3563247205 1085390300 660684418 660336426
    # 1085697086 71138975 16821709
    # I went back through this sequence and could not duplicate this
    # failure of missing three bases ...
    # Copy the masked 2bit file back to kksilo
    ssh kksilo
    cd /cluster/data/monDom1
    scp -p kolossus:/scratch/hiram/monDom1/masked.2bit .
    rm monDom1.2bit
    mv masked.2bit monDom1.2bit
    # Ask for the blat servers to be restarted with this new masked
    # sequence
    # Now clean up the unmasked split scaffolds to avoid confusion later.
    rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft

# MAKE 11.OOC FILE FOR BLAT (TBD)
    ssh kolossus
    cd /cluster/bluearc/monDom1
    # use repMatch of 1228 as this genome is ~20% larger than human
    # 1024 + (1024 * 0.2) = 1228
    blat /cluster/data/monDom1/monDom1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
    # Wrote 43126 overused 11-mers to 11.ooc
    # -rw-rw-r--  1 172512 Dec  7 17:22 11.ooc
    ssh kkr1u00
    cd /iscratch/i/monDom1
    cp -p /cluster/bluearc/monDom1/11.ooc .
    /cluster/bin/iSync
    # at repMatch=1024
    # Wrote 61187 overused 11-mers to 11.ooc
    # at repMatch=540
    # Wrote 207412 overused 11-mers to 11.ooc
    # For comparison, Hg17 has:
    # Wrote 30077 overused 11-mers to 11.ooc

# PUT SEQUENCE ON SCRATCH FOR BLASTZ (DONE - 2004-12-07 - Hiram)
    # These chunks do not need to correspond to the 500,000 chunks
    # used in the repeat mask run because we are not going to use the
    # repeat mask .out files as lineage-specific repeats.
    ssh kksilo
    cd /tmp
    # Make sure you have enough room for this: (~ 4 Gb)
    df -h .
    # Filesystem   Size  Used Avail Use% Mounted on
    # /dev/sda2     32G   23G  8.0G  74% /
    # split scaffolds that are over 2,000,000
    faSplit size /cluster/data/monDom1/maskedScaffolds.fa 2000000 -oneFile \
	scaf_ \
	-lift=/cluster/data/monDom1/jkStuff/chunkedScaffolds.lft
    mkdir /cluster/bluearc/scratch/monDom1
    cd /cluster/bluearc/scratch/monDom1
    mkdir chunks
    # Collect the split scaffolds into reasonably sized chunks
    # This produces a few hundred chunk files of roughly 10,000,000 bases
    # each, a few smaller, a few larger
    faSplit about /tmp/scaf_.fa 10000000 chunks/chunk_
    # done with this temp file
    rm /tmp/scaf_.fa
    # a list for convenience
    ls -1S chunks > chunk.list
    cp -p /cluster/data/monDom1/jkStuff/chunkedScaffolds.lft .
    cp -p /cluster/data/monDom1/monDom1.2bit .
    # And create a chunks.2bit for the lavToAxt operations
    ssh kkr1u00
    cd /iscratch/i/monDom1/chunks
    faToTwoBit chunk_*.fa ../chunks.2bit
    cd ..
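    # (optional check, not part of the original run: a quick sanity pass in
    # the spirit of the faCount comparisons above -- the base totals in
    # chunks.2bit should agree with the totals recorded for the masked
    # scaffolds earlier)
    twoBitToFa chunks.2bit stdout | faCount stdin | tail -1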
    twoBitInfo chunks.2bit chunks.info
    /cluster/bin/iSync
    # request rsync of scratch to the cluster

# PRODUCING GENSCAN PREDICTIONS (DONE - 2004-12-07 - Hiram)
# (REDONE - 2005-01-07 - Angie; note added by kuhn)
# (genscanPep table touched by alisq to satisfy joinerCheck 2005-03-32 - kuhn)
    ssh kolossus
    cd /scratch/monDom1
    time maskOutFa maskedScaffolds.fa hard hardMaskedScaffolds.fa
    # Clean out the /cluster/bluearc/monDom1/ files that were used
    # during repeat masking, then make this copy:
    cp -p hardMaskedScaffolds.fa /cluster/bluearc/monDom1
    cd /cluster/bluearc/monDom1
    mkdir hardChunks
    faSplit about hardMaskedScaffolds.fa 200000 hardChunks/chunk_
    # This leaves some rather large pieces, will have to see if
    # genscan can handle it.  (2086 files)
    ssh hgwdev
    mkdir /cluster/data/monDom1/bed/genscan
    cd /cluster/data/monDom1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to place output files
    mkdir gtf pep subopt
    ssh kk9
    cd /cluster/data/monDom1/bed/genscan
    ls -1S /cluster/bluearc/monDom1/hardChunks/chunk*.fa > chunks.list
    cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 chunks.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
    # Completed: 2086 of 2086 jobs
    # CPU time in finished jobs:  130316s  2171.94m  36.20h 1.51d 0.004 y
    # IO & Wait Time:              17821s   297.01m   4.95h 0.21d 0.001 y
    # Average job time:      71s  1.18m  0.02h 0.00d
    # Longest job:         2509s 41.82m  0.70h 0.03d
    # Submission to last job: 2573s 42.88m 0.71h 0.03d

    # Concatenate scaffold-level results:
    ssh eieio
    cd /cluster/data/monDom1/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat subopt/*.bed > genscanSubopt.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/monDom1/bed/genscan
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf monDom1 genscan genscan.gtf
    hgPepPred monDom1 generic genscanPep genscan.pep
    hgLoadBed monDom1 genscanSubopt genscanSubopt.bed
    # 41090 sequences added
    # Compare with other assemblies:
    featureBits monDom1 genscan
    # 47170795 bases of 3492108230 (1.351%) in intersection
    featureBits monDom1 genscanSubopt
    # 51805835 bases of 3492108230 (1.484%) in intersection
    featureBits hg17 genscan
    # 55323340 bases of 2866216770 (1.930%) in intersection
    featureBits hg17 genscanSubopt
    # 55986178 bases of 2866216770 (1.953%) in intersection

    # Clean up
    # Save a copy of the hardMask for later .zip prep:
    ssh kksilo
    cp -p /cluster/bluearc/monDom1/hardMaskedScaffolds.fa .
    ssh eieio
    cd /cluster/bluearc/monDom1
    rm -fr hardChunks hardMaskedScaffolds.fa
    # This kolossus clean up was actually left until much later.
    # I found these copies here to be useful later on.
    ssh kolossus
    cd /scratch/monDom1
    rm -f *

# BLAT SERVER SETUP (DONE - 2004-12-07 - Hiram)
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES("monDom1", "blat14", 17786, 1, 0);' \
	-h localhost hgcentraltest
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES("monDom1", "blat14", 17787, 0, 0);' \
	-h localhost hgcentraltest

# Make trackDb table so browser knows what tracks to expect (DONE - Hiram)

# AUTO UPDATE GENBANK MRNA RUN (DONE - 2004-12-15 - Hiram)
    ssh hgwdev
    # In the source tree, since this is a new organism, go to:
    #	kent/src/hg/makeDb/genbank/src/lib
    # And edit gbGenome.c to add monDom in two places for two lists:
    #	static char *monDomNames[] = {"Monodelphis domestica", NULL};
    #	{"monDom", monDomNames, NULL},
    # Genbank species list is kept in:
    #	/cluster/data/genbank/data/species.lst
    # This one is already on there.
    # And add an entry for the ooc file in:
    #	hg/makeDb/genbank/src/align/gbBlat
    # In a clean source tree, go to hg/makeDb/genbank
    # and run a 'make install-server'
    cd /cluster/data/genbank
    # This is a new organism, edit the etc/genbank.conf file and add:
# monDom1
monDom1.genome = /scratch/monDom1/monDom1.2bit
monDom1.refseq.mrna.native.load = no
monDom1.mondoTwoBitParts = 7000
monDom1.downloadDir = monDom1
monDom1.lift = no
monDom1.perChromTables = no

    # Do the refseq's first, they are the quick ones
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial monDom1
    # var/build/logs/2004.12.08-19:09:59.monDom1.initalign.log
    # it said: "nothing to align" - i.e. there are no native refseq
    # mRNA's for this organism, add the line:
    #	monDom1.refseq.mrna.native.load = no
    # to the configuration file
    # We were having some trouble with eieio, running these on kksilo
    # this time.  This is best done on eieio.
    # perform the genbank mrna run, this does only native and xeno mRNA's
    ssh kksilo
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial monDom1
    # This run had non-linear behavior, in the early stages it
    # appeared that it was going to take 4 days of run time, then in
    # the last 7 hours it processed over 250,000 jobs:
    # Completed: 357938 of 357938 jobs
    # CPU time in finished jobs:  51924178s  865402.97m 14423.38h 600.97d 1.647 y
    # IO & Wait Time:              3461243s   57687.38m   961.46h  40.06d 0.110 y
    # Average job time:     155s  2.58m  0.04h 0.00d
    # Longest job:         3885s 64.75m  1.08h 0.04d
    # Submission to last job: 101926s 1698.77m 28.31h 1.18d
    # Due to various interruptions, the cluster job was completed
    # manually by going to the directory and pushing the jobs.  To
    # clean up after this, and finish the alignment steps:
    ssh kksilo
    cd /cluster/data/genbank
    nice bin/gbAlignStep -continue=finish -srcDb=genbank -type=mrna \
	-verbose=1 -initial monDom1
    # 2 hour finish time
    # Load the results from the above
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad monDom1
    # 2 minute load time
    # And finally, the long-lived EST alignments.
    # Several cluster days of run time.
    ssh kksilo
    cd /cluster/data/genbank/work
    mv initial.monDom1 initial.monDom1.genbank.mrna
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial monDom1
    # logFile: var/build/logs/2004.12.15-13:16:59.monDom1.initalign.log
    # It said there was nothing to align; there must be no native ESTs for
    # the Opossum, add this line to the genbank.conf file:
    #	monDom1.genbank.est.native.load = no
    # not useful to do xeno ESTs, they are off by default

# GC5BASE (DONE - 2004-12-08 - Hiram)
    ssh eieio
    mkdir /cluster/data/monDom1/bed/gc5Base
    cd /cluster/data/monDom1/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 monDom1 \
	/cluster/data/monDom1 | wigEncode stdin gc5Base.wig gc5Base.wib
    # takes about 4 hours, need to improve hgGcPercent and how it reads
    # five bases at a time from the sequence.
    ssh hgwdev
    cd /cluster/data/monDom1/bed/gc5Base
    mkdir /gbdb/monDom1/wib
    ln -s `pwd`/gc5Base.wib /gbdb/monDom1/wib
    hgLoadWiggle -pathPrefix=/gbdb/monDom1/wib monDom1 gc5Base gc5Base.wig
    # need to fix-up hgLoadWiggle so it will make a proper index on
    # these scaffold assemblies:
    hgsql monDom1 -e "drop index chrom on gc5Base;"
    hgsql monDom1 -e "create index bin on gc5Base (chrom(14), bin);"
    # this is fixed in hgLoadWiggle as of 2004-12-09

# QUALITY (DONE - 2004-12-14 - Hiram)
    ssh eieio
    mkdir /cluster/data/monDom1/bed/quality
    cd /cluster/data/monDom1/bed/quality
    # Create perl script to read the quality file and pipe to
    # wigEncode:
    ./qaToWigEncode.pl ../../broad.mit.edu/assembly.agp.qual \
	| wigEncode stdin quality.wig quality.wib
    # Converted stdin, upper limit 68.00, lower limit 0.00
    # Took 6 hours 40 minutes of processing time.
    # Resulting files:
    ls -og quality.wi?
    # -rw-rw-r--  1  377775898 Dec 13 21:40 quality.wig
    # -rw-rw-r--  1 3492108230 Dec 13 21:40 quality.wib
    ssh hgwdev
    cd /cluster/data/monDom1/bed/quality
    ln -s `pwd`/quality.wib /gbdb/monDom1/wib
    hgLoadWiggle -pathPrefix=/gbdb/monDom1/wib monDom1 quality quality.wig
    # fix the index (this was supposed to be fixed, hgLoadWiggle
    # needs to be looked at)
    hgsql monDom1 -e "drop index chrom on quality;"
    hgsql monDom1 -e "create index bin on quality (chrom(14), bin);"

# ALIGN TO HUMAN (DONE - 2004-12-12 - Hiram)
    # NAMING CONVENTION: the zb.hg17 directory will be the Opossum
    # aligned to the Human blastz result.
    # The bz.hg17 directory will be the swapped result, Human aligned
    # to Opossum
    ssh kk
    mkdir -p /cluster/data/monDom1/bed/zb.hg17
    cd /cluster/data/monDom1/bed/zb.hg17
    ln -s /cluster/data/monDom1/bed/zb.hg17 /cluster/data/hg17/bed/blastz.monDom1
    # Create DEF file for run of scaffolds vs. human chromosomes
    cat << '_EOF_' > DEF
# human vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/monDom1/bed/zb.hg17

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    cp -p /cluster/data/hg17/chrom.sizes S1.len
    cp -p /cluster/data/monDom1/chrom.sizes S2.len
    # script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, push check, para push, para check....
    # Completed: 112871 of 112871 jobs
    # CPU time in finished jobs:  42061387s  701023.12m 11683.72h 486.82d 1.334 y
    # IO & Wait Time:               578862s    9647.70m   160.80h   6.70d 0.018 y
    # Average job time:     378s  6.30m  0.10h 0.00d
    # Longest job:         3157s 52.62m  0.88h 0.04d
    # Submission to last job: 81243s 1354.05m 22.57h 0.94d

    # Second cluster run to convert the .out's to .lav's
    # You do NOT want to run this on the big cluster.  It brings
    # the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/monDom1/bed/zb.hg17
    # script copied over from /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
    # Completed: 341 of 341 jobs
    # CPU time in finished jobs:  3525s  58.75m  0.98h 0.04d 0.000 y
    # IO & Wait Time:            11315s 188.58m  3.14h 0.13d 0.000 y
    # Average job time:     44s  0.73m  0.01h 0.00d
    # Longest job:         181s  3.02m  0.05h 0.00d
    # Submission to last job: 1524s 25.40m 0.42h 0.02d

    # third run: lav -> axt (an I/O bound process, run only on fileserver)
    # Use the chunks.2bit so lavToAxt will work.  Then lift the resulting axt
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17
    mkdir axtChrom pslChrom
    cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
    chr=${L/lav\//}
    cd ${L}
    echo -n "${L} -> ${chr}.axt working ... "
    cat `ls -1 *.lav | sort -g` \
	| lavToAxt stdin \
	    /iscratch/i/hg17/bothMaskedNibs \
	    /iscratch/i/monDom1/chunks.2bit stdout \
	| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
	    /iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
    cd ../..
    echo -n "${chr}.axt -> ${chr}.psl working ..."
    axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
    echo "DONE"
done
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x lav2axt.sh
    ./lav2axt.sh

    # load this blast result just to see how it looks
    # It is not needed in the browser after the chains and nets are
    # made, drop it later.
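    # (for later reference, not run at this point: once the chains and nets
    # are loaded, the temporary table could be removed with something like
    #	hgsql hg17 -e 'drop table blastzMonDom1;' )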
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.hg17/pslChrom
    hgLoadPsl -table=blastzMonDom1 -noTNameIx hg17 chrM.psl
    bash	# if you are normally csh
    for I in `ls *.psl | grep -v chrM.psl`
do
    /cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
	-noTNameIx hg17 ${I}
    echo "done: ${I}"
done
    exit	# exit bash if you are normally csh
    # Let's see what featureBits has to say about this data
    featureBits hg17 blastzMonDom1
    # 486216407 bases of 2866216770 (16.964%) in intersection
    featureBits hg17 blastzMm5
    # 1052077141 bases of 2866216770 (36.706%) in intersection
    featureBits hg17 blastzRn3
    # 1013003285 bases of 2866216770 (35.343%) in intersection

# CHAIN HUMAN BLASTZ (DONE - 2004-12-12 - Hiram)
    # Run axtChain on file server, job is mostly I/O, cluster does not help
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw
    ls -1S /cluster/data/monDom1/bed/zb.hg17/axtChrom/*.axt > input.lst
    cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
    chr=`basename ${chrAxtFile}`
    axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
	-linearGap=../../opossumHumanTuned.gap \
	-minScore=5000 ${chrAxtFile} \
	/cluster/data/hg17/nib \
	/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
    # << this line keeps emacs coloring happy
    # Reuse gap penalties from chicken run.
    cat << '_EOF_' | sed 's/  */\t/g' > ../../opossumHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x doChain.sh
    ./doChain.sh

    # On the file server weed the chains for repeats
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    mkdir chain
    cd chainRaw
    bash	# if you are normally csh
    for i in *.chain
do
    chainAntiRepeat /cluster/data/hg17/nib \
	/cluster/data/monDom1/monDom1.2bit $i ../chain/$i
done
    exit	# exit bash if you are normally csh

    # now on the file server, sort chains.  The chainMergeSort and chainSplit
    # steps both take about 10 minutes
    ssh kksilo
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    chainMergeSort chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database.
    # This takes an hour
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain/chain
    for i in *.chain
do
    c=${i/.chain/}
    echo "loading ${c}"
    hgLoadChain hg17 ${c}_chainMonDom1 $i
done
    featureBits hg17 chainMonDom1
    # 2618859142 bases of 2866216770 (91.370%) in intersection
    featureBits hg17 chainMonDom1Link
    # 456069062 bases of 2866216770 (15.912%) in intersection

    # create a copy of these things for hgdownload (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain/chain
    tar cvzf \
	/usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/opossum.chain.tgz \
	./chr*.chain

# NET OPOSSUM TO HUMAN BLASTZ (DONE - 2004-12-13 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
	| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    # 11 minutes
    # Create axt files for the net as so:
    netSplit noClass.net noClass
    # 2 minutes
    cd noClass
    mkdir ../../axtNet
    bash	# if you are csh normally
    for i in *.net
do
    r=${i/.net/}
    netToAxt $i ../chain/${r}.chain /cluster/data/hg17/nib \
	/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
done
    # 18 minutes
    exit	# back to csh if you are csh normally
    cd ..
    rm -r noClass

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    netClass -noAr noClass.net hg17 monDom1 human.net
    # 41 minutes
    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    netFilter -minGap=10 human.net | hgLoadNet hg17 netMonDom1 stdin
    # 4 minutes
    # compare featureBits with some other assemblies
    featureBits hg17 netMonDom1
    # 2595812636 bases of 2866216770 (90.566%) in intersection
    featureBits hg17 netBosTau1
    # 2261085097 bases of 2866216770 (78.887%) in intersection
    featureBits hg17 netRn3
    # 2817656275 bases of 2866216770 (98.306%) in intersection

    # create a copy of these things for hgdownload (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    cp -p human.net \
	/usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/opossum.net
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
    gzip opossum.net

    # The axtNet files need to go too (2005-02-24 - Hiram)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/axtNet
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/axtNet
    cp -p /cluster/data/monDom1/bed/zb.hg17/axtNet/*.axt .
    gzip *

    # index had NULL cardinality; analyze to fix (2005-01-18 - Heather)
    hgsql hg17 -e "analyze table netMonDom1;"

# MAKE NET MAFS FOR MULTIZ (2004-12-24 kate)
    ssh kksilo
    cd /cluster/data/monDom1/bed/zb.hg17/axtNet
    mkdir -p ../mafNet
    cat *.axt | axtSort stdin stdout | axtToMaf -tSplit stdin \
	-tPrefix=hg17. /cluster/data/hg17/chrom.sizes \
	-qPrefix=monDom1. \
	/cluster/data/monDom1/chrom.sizes ../mafNet

# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
# Create HUMAN TO OPOSSUM chain, net
    ssh eieio
    cd /cluster/data/monDom1/bed
    mkdir -p bz.hg17/axtChain
    chainSwap zb.hg17/axtChain/all.chain bz.hg17/axtChain/all.chain
    # 4 minutes
    cd bz.hg17/axtChain
    chainPreNet all.chain ../../zb.hg17/S2.len ../../zb.hg17/S1.len stdout \
	| chainNet stdin -minSpace=1 ../../zb.hg17/S2.len \
	    ../../zb.hg17/S1.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    # 10 minutes
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
	/cluster/data/hg17/nib ../axtNet/all.axt
    # 23 minutes

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/bz.hg17/axtChain
    netClass -noAr noClass.net monDom1 hg17 monDom1.net
    # 16 minutes
    netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netHg17 stdin
    # 6 minutes
    # Load chains into database (takes about 4 hours).
    hgLoadChain -tIndex monDom1 chainHg17 all.chain
    # if you do a 'show index' SQL reports Cardinality NULL for the tName
    # indexes although they are actually working.  To make the
    # cardinality show up:
    hgsql monDom1 -e "analyze table chainHg17;"
    hgsql monDom1 -e "analyze table chainHg17Link;"
    # check featureBits (~ 30 minute commands)
    featureBits monDom1 chainHg17Link
    # 459086956 bases of 3492108230 (13.146%) in intersection
    featureBits monDom1 chainHg17
    # 3246158618 bases of 3492108230 (92.957%) in intersection
    featureBits monDom1 netHg17
    # 3223616544 bases of 3492108230 (92.311%) in intersection

    # create a copy for hgdownload (2005-01-13 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsHg17
    cd /cluster/data/monDom1/bed/bz.hg17/axtChain
    gzip --stdout all.chain > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsHg17/human.chain.gz
    gzip --stdout monDom1.net > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsHg17/human.net.gz

# BLASTZ HUMAN CLEAN UP (DONE - 2005-01-12 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    nice rm -rf noClass.net chain/chain.tab chain/link.tab chain.run1 \
	chainRaw out &
    nice gzip chain/* &
    nice gzip human.net &
    nice gzip all.chain &
    cd /cluster/data/monDom1/bed/zb.hg17
    nice rm -rf raw &
    nice gzip axtChrom/maf/* &
    nice gzip axtChrom/*.axt &
    nice gzip pslChrom/* lav/*/* &
    cd /cluster/data/monDom1/bed/bz.hg17/axtChain
    nice rm -f noClass.net link.tab chain.tab &
    nice gzip all.chain monDom1.net &
    cd ../axtNet
    nice gzip all.axt &

# LOAD CPGISLANDS (DONE - 2004-12-13 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/monDom1/bed/cpgIsland
    cd /cluster/data/monDom1/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/monDom1/bed/cpgIsland/
    ssh eieio
    cd /cluster/data/monDom1/bed/cpgIsland
    # AWK filter to transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line keeps emacs coloring happy
    rm -f cpgIsland.bed
    cat /dev/null > cpgIsland.bed
    bash	# if you are normally csh
    for scaffold in `awk '{print $1}' \
	/cluster/data/monDom1/chrom.sizes`
do
    cpgOutput=${scaffold}.cpg
    echo "${scaffold}"
    rm -f /tmp/${scaffold}.fa
    twoBitToFa /cluster/data/monDom1/monDom1.2bit:${scaffold} \
	/tmp/${scaffold}.fa
    ./cpglh.exe /tmp/${scaffold}.fa | awk -f filter.awk >> cpgIsland.bed
    rm -f /tmp/${scaffold}.fa
done
    exit	# exit bash if you are normally csh

    # load into database:
    ssh hgwdev
    cd /cluster/data/monDom1/bed/cpgIsland
    hgLoadBed monDom1 cpgIslandExt -tab -noBin \
	-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    # Loaded 27004 elements of size 10
    # compare to some other assemblies
    featureBits monDom1 cpgIslandExt
    # 18445119 bases of 3492108230 (0.528%) in intersection
    featureBits hg17 cpgIslandExt
    # 21245283 bases of 2866216770 (0.741%) in intersection
    featureBits panTro1 cpgIsland
    # 17540676 bases of 2733948177 (0.642%) in intersection
    featureBits rn3 cpgIsland
    # 9679881 bases of 2571104688 (0.376%) in intersection
    featureBits xenTro1 cpgIslandExt
    # 19279778 bases of 1381238994 (1.396%) in intersection
    featureBits fr1 cpgIsland
    # 25518433 bases of 315518167 (8.088%) in intersection

# ALIGN TO CHICKEN (DONE - 2004-12-14 - Hiram)
    # NAMING CONVENTION: the zb.galGal2 directory will be the Opossum
    # aligned to the Chicken blastz result.
    # The bz.galGal2 directory will be the swapped result, Chicken aligned
    # to Opossum
    ssh kk
    mkdir -p /cluster/data/monDom1/bed/zb.galGal2
    cd /cluster/data/monDom1/bed/zb.galGal2
    # Create DEF file for run of scaffolds vs. chicken chromosomes
    cat << '_EOF_' > DEF
# chicken vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/monDom1/bed/zb.galGal2

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    cp -p /cluster/data/galGal2/chrom.sizes S1.len
    cp -p /cluster/data/monDom1/chrom.sizes S2.len
    # script copied over from /cluster/data/galGal2/jkStuff/BlastZ_run0.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, push check, para push, para check....
    # Completed: 49981 of 49981 jobs
    # CPU time in finished jobs:  24745519s  412425.31m  6873.76h 286.41d 0.785 y
    # IO & Wait Time:               204758s    3412.64m    56.88h   2.37d 0.006 y
    # Average job time:     499s  8.32m  0.14h 0.01d
    # Longest job:         2857s 47.62m  0.79h 0.03d
    # Submission to last job: 85888s 1431.47m 23.86h 0.99d

    # Second cluster run to convert the .out's to .lav's
    # You do NOT want to run this on the big cluster.  It brings
    # the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/monDom1/bed/zb.galGal2
    # script copied over from /cluster/data/galGal2/jkStuff/BlastZ_run1.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
    # Completed: 151 of 151 jobs
    # CPU time in finished jobs:  1131s  18.85m  0.31h 0.01d 0.000 y
    # IO & Wait Time:             2130s  35.50m  0.59h 0.02d 0.000 y
    # Average job time:     22s  0.36m  0.01h 0.00d
    # Longest job:          95s  1.58m  0.03h 0.00d
    # Submission to last job: 490s 8.17m 0.14h 0.01d

    # third run: lav -> axt (an I/O bound process, run only on fileserver)
    # Use the chunks.2bit so lavToAxt will work.  Then lift the resulting axt
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2
    mkdir axtChrom pslChrom
    cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
    chr=${L/lav\//}
    cd ${L}
    echo -n "${L} -> ${chr}.axt working ... "
    cat `ls -1 *.lav | sort -g` \
	| lavToAxt stdin \
	    /iscratch/i/galGal2/nib \
	    /iscratch/i/monDom1/chunks.2bit stdout \
	| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
	    /iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
    cd ../..
    echo -n "${chr}.axt -> ${chr}.psl working ..."
    axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
    echo "DONE"
done
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x lav2axt.sh
    ./lav2axt.sh
    # 12 minutes

    # load this blast result just to see how it looks
    # It is not needed in the browser after the chains and nets are
    # made, drop it later.
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.galGal2/pslChrom
    hgLoadPsl -table=blastzMonDom1 -noTNameIx galGal2 chrM.psl
    bash	# if you are normally csh
    for I in `ls *.psl | grep -v chrM.psl`
do
    /cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
	-noTNameIx galGal2 ${I}
    echo "done: ${I}"
done
    exit	# exit bash if you are normally csh
    # 7 minutes
    # Let's see what featureBits has to say about this data
    featureBits galGal2 blastzMonDom1
    # 102241492 bases of 1054197620 (9.699%) in intersection
    featureBits galGal2 blastzBestHg16
    # 97272784 bases of 1054197620 (9.227%) in intersection

# CHAIN CHICKEN BLASTZ (DONE - 2004-12-12 - Hiram)
    # Run axtChain on file server, job is mostly I/O, cluster does not help
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw
    ls -1S /cluster/data/monDom1/bed/zb.galGal2/axtChrom/*.axt > input.lst
    cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
    chr=`basename ${chrAxtFile}`
    axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
	-linearGap=../../opossumChickenTuned.gap \
	-minScore=5000 ${chrAxtFile} \
	/cluster/data/galGal2/nib \
	/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
    # << this line keeps emacs coloring happy
    # Reuse gap penalties from chicken run.
    cat << '_EOF_' | sed 's/  */\t/g' > ../../opossumChickenTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x doChain.sh
    time ./doChain.sh
    # 42 minutes

    # On the file server weed the chains for repeats
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    mkdir chain
    cd run1/chainRaw
    bash	# if you are normally csh
    for i in *.chain
do
    chainAntiRepeat /cluster/data/galGal2/nib \
	/cluster/data/monDom1/monDom1.2bit $i ../../chain/$i
    echo "$i done"
done
    # 7 minutes
    exit	# exit bash if you are normally csh

    # now on the file server, sort chains.
    # The chainMergeSort and chainSplit steps each take less than a minute
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    chainMergeSort chain/*.chain > all.chain
    mv chain chainBeforeMerge
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database.  This takes an hour
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain/chain
    bash	# if you are csh normally
    for i in *.chain
do
    c=${i/.chain/}
    echo "loading ${c}"
    hgLoadChain galGal2 ${c}_chainMonDom1 $i
done
    # 4 minutes
    exit	# exit bash if you are csh normally
    # Compare to some other organisms
    featureBits galGal2 chainMonDom1
    # 782212657 bases of 1054197620 (74.200%) in intersection
    featureBits galGal2 chainHg17
    # 921410328 bases of 1054197620 (87.404%) in intersection
    featureBits galGal2 chainXenTro1
    # 691628520 bases of 1054197620 (65.607%) in intersection
    featureBits galGal2 chainMm5
    # 846905330 bases of 1054197620 (80.336%) in intersection

    # create a copy of these things for hgdownload (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain/chain
    tar cvzf \
	/usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1/opossum.chain.tgz \
	./chr*.chain

# NET OPOSSUM TO CHICKEN BLASTZ (DONE - 2004-12-13 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
	| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    # 2 minutes
    # Create axt files for the net as so:
    netSplit noClass.net noClass
    # 1 minute
    cd noClass
    mkdir ../../axtNet
    bash	# if you are csh normally
    for i in *.net
do
    r=${i/.net/}
    netToAxt $i ../chain/${r}.chain /cluster/data/galGal2/nib \
	/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
done
    # 6 minutes
    exit	# back to csh if you are csh normally
    cd ..
    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    netClass -noAr noClass.net galGal2 monDom1 chicken.net
    # 56 minutes
    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    netFilter -minGap=10 chicken.net | hgLoadNet galGal2 netMonDom1 stdin
    # 4 minutes
    # compare featureBits with some other assemblies
    featureBits galGal2 netMonDom1
    # 771045333 bases of 1054197620 (73.140%) in intersection
    featureBits galGal2 netHg17
    # 908340945 bases of 1054197620 (86.164%) in intersection
    featureBits galGal2 netMm5
    # 835277984 bases of 1054197620 (79.234%) in intersection
    featureBits galGal2 netXenTro1
    # 674438463 bases of 1054197620 (63.976%) in intersection

    # create a copy of these things for hgdownload (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    cp -p chicken.net \
	/usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1/opossum.net
    cd /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
    gzip opossum.net

# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
# Create CHICKEN TO OPOSSUM chain, net
    ssh eieio
    cd /cluster/data/monDom1/bed
    mkdir -p bz.galGal2/axtChain
    chainSwap zb.galGal2/axtChain/all.chain bz.galGal2/axtChain/all.chain
    # 1 minute
    cd bz.galGal2/axtChain
    chainPreNet all.chain ../../zb.galGal2/S2.len \
	../../zb.galGal2/S1.len stdout \
	| chainNet stdin -minSpace=1 ../../zb.galGal2/S2.len \
	    ../../zb.galGal2/S1.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    # 2 minutes
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
	/cluster/data/galGal2/nib ../axtNet/all.axt
    # 6 minutes

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
    netClass -noAr noClass.net monDom1 galGal2 monDom1.net
    # 31 minutes
    netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netGalGal2 stdin
    # 4 minutes
    # Load chains into database (takes about 4 hours).
    hgLoadChain -tIndex monDom1 chainGalGal2 all.chain
    # if you do a 'show index' SQL reports Cardinality NULL for the tName
    # indexes although they are actually working.
    # To make the cardinality show up:
    hgsql monDom1 -e "analyze table chainGalGal2;"
    hgsql monDom1 -e "analyze table chainGalGal2Link;"
    featureBits monDom1 chainGalGal2Link
    # 110933245 bases of 3492108230 (3.177%) in intersection
    featureBits monDom1 chainGalGal2
    # 2437546469 bases of 3492108230 (69.802%) in intersection

    # create a copy for hgdownload (2005-01-13 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2
    cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
    gzip --stdout all.chain > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2/chicken.chain.gz
    gzip --stdout monDom1.net > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2/chicken.net.gz

# BLASTZ CHICKEN CLEAN UP (DONE - 2005-01-12 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \
	noClass.net chainBeforeMerge noClass &
    nice gzip chain/* &
    nice gzip chicken.net &
    nice gzip all.chain &
    cd /cluster/data/monDom1/bed/zb.galGal2
    nice rm -rf raw &
    nice gzip axtNet/*.axt &
    nice gzip axtChrom/*.axt &
    nice gzip pslChrom/* lav/*/* &
    cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
    nice rm -f noClass.net link.tab chain.tab &
    nice gzip all.chain monDom1.net &
    cd ../axtNet
    nice gzip all.axt &

# ALIGN TO MOUSE (DONE - 2004-12-16 - Hiram)
    # NAMING CONVENTION: the zb.mm5 directory will be the Opossum
    # aligned to the Mouse blastz result.
    # The bz.mm5 directory will be the swapped result, Mouse aligned
    # to Opossum
    ssh kk
    mkdir -p /cluster/data/monDom1/bed/zb.mm5
    cd /cluster/data/monDom1/bed/zb.mm5
    # Create DEF file for run of scaffolds vs. mouse chromosomes
    cat << '_EOF_' > DEF
# mouse vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/monDom1/bed/zb.mm5

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    cp -p /cluster/data/mm5/chrom.sizes S1.len
    cp -p /cluster/data/monDom1/chrom.sizes S2.len
    # script copied over from /cluster/data/mm5/jkStuff/BlastZ_run0.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, push check, para push, para check....
    # Completed: 112871 of 112871 jobs
    # CPU time in finished jobs:  36169619s  602826.99m 10047.12h 418.63d 1.147 y
    # IO & Wait Time:               452242s    7537.36m   125.62h   5.23d 0.014 y
    # Average job time:     324s  5.41m  0.09h 0.00d
    # Longest job:         2755s 45.92m  0.77h 0.03d
    # Submission to last job: 153519s 2558.65m 42.64h 1.78d
    # extra delay in this run due to disk problems in the middle and
    # thus being interrupted

    # Second cluster run to convert the .out's to .lav's
    # You do NOT want to run this on the big cluster.  It brings
    # the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/monDom1/bed/zb.mm5
    # script copied over from /cluster/data/mm5/jkStuff/BlastZ_run1.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
    # Completed: 341 of 341 jobs
    # CPU time in finished jobs:  3131s  52.19m  0.87h 0.04d 0.000 y
    # IO & Wait Time:             9815s 163.58m  2.73h 0.11d 0.000 y
    # Average job time:     38s  0.63m  0.01h 0.00d
    # Longest job:         223s  3.72m  0.06h 0.00d
    # Submission to last job: 1267s 21.12m 0.35h 0.01d

    # third run: lav -> axt (an I/O bound process, run only on fileserver)
    # Use the chunks.2bit so lavToAxt will work.  Then lift the resulting axt
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5
    mkdir axtChrom pslChrom
    cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
    chr=${L/lav\//}
    cd ${L}
    echo -n "${L} -> ${chr}.axt working ... "
    cat `ls -1 *.lav | sort -g` \
	| lavToAxt stdin \
	    /scratch/mus/mm5/softNib \
	    /iscratch/i/monDom1/chunks.2bit stdout \
	| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
	    /iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
    cd ../..
    echo -n "${chr}.axt -> ${chr}.psl working ..."
    axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
    echo "DONE"
done
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x lav2axt.sh
    ./lav2axt.sh
    # 38 minutes

    # load this blast result just to see how it looks
    # It is not needed in the browser after the chains and nets are
    # made, drop it later.
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.mm5/pslChrom
    hgLoadPsl -table=blastzMonDom1 -noTNameIx mm5 chrM.psl
    bash	# if you are csh normally
    for I in `ls *.psl | grep -v chrM.psl`
do
    /cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
	-noTNameIx mm5 ${I}
    echo "done: ${I}"
done
    # 12 minutes
    exit	# back to csh if you are csh normally
    # Let's see what featureBits has to say about this data
    featureBits mm5 blastzMonDom1
    # 263272041 bases of 2615483787 (10.066%) in intersection

# CHAIN MOUSE BLASTZ (TBD)
    # Run axtChain on file server, job is mostly I/O, cluster does not help
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw
    ls -1S /cluster/data/monDom1/bed/zb.mm5/axtChrom/*.axt > input.lst
    cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
    chr=`basename ${chrAxtFile}`
    axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
	-linearGap=../../opossumMouseTuned.gap \
	-minScore=5000 ${chrAxtFile} \
	/scratch/mus/mm5/softNib \
	/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
    # << this line keeps emacs coloring happy
    # Reuse gap penalties from mouse run.
    cat << '_EOF_' | sed 's/  */\t/g' > ../../opossumMouseTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x doChain.sh
    time ./doChain.sh
    # 95 minutes

    # On the file server weed the chains for repeats
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain
    mkdir chain
    cd run1/chainRaw
    bash	# if you are normally csh
    for i in *.chain
do
    chainAntiRepeat /scratch/mus/mm5/softNib \
	/cluster/data/monDom1/monDom1.2bit $i ../../chain/$i
    echo "$i done"
done
    # 34 minutes
    exit	# exit bash if you are normally csh

    # now on the file server, sort chains.
# chainSplit steps each take less than a minute
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
chainMergeSort chain/*.chain > all.chain
# 3 minutes
mv chain chainBeforeMerge
chainSplit chain all.chain
# 3 minutes
rm run1/chain/*.chain
# Load chains into database.  This takes an hour
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.mm5/axtChain/chain
bash    # if you are csh normally
for i in *.chain
do
    c=${i/.chain/}
    echo "loading ${c}"
    hgLoadChain mm5 ${c}_chainMonDom1 $i
done
# 20 minutes
exit    # exit bash if you are csh normally
# Compare to some other organisms
featureBits mm5 chainMonDom1
# 2121448151 bases of 2615483787 (81.111%) in intersection
featureBits -countGaps mm5 chainRn3
# 2646682349 bases of 3164952073 (83.625%) in intersection
featureBits mm5 chainRn3
# 2646682349 bases of 2615483787 (101.193%) in intersection
featureBits mm5 chainHg17
# 2507720521 bases of 2615483787 (95.880%) in intersection
featureBits mm5 chainXenTro1
# 1078618413 bases of 2615483787 (41.240%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
cd /cluster/data/monDom1/bed/zb.mm5/axtChain/chain
tar cvzf \
    /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1/opossum.chain.tgz \
    ./chr*.chain
# NET OPOSSUM TO MOUSE BLASTZ (TBD)
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
# 6 minutes
# Create axt files for the net as so:
netSplit noClass.net noClass
# 1 minute
cd noClass
mkdir ../../axtNet
bash    # if you are csh normally
for i in *.net
do
    r=${i/.net/}
    netToAxt $i ../chain/${r}.chain /scratch/mus/mm5/softNib \
        /cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
done
# 10 minutes
exit    # back to csh if you are csh normally
cd ..
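# (optional sanity check, not part of the original procedure) before the
# noClass directory is removed it is worth confirming that every
# per-chromosome axt file actually contains alignments; an empty file
# usually means its chain went missing somewhere above:
bash    # if you are csh normally
for f in ../axtNet/*.axt
do
    echo -n "$f  "
    grep -c "^[0-9]" $f
done
exit    # back to csh if you are csh normally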
rm -r noClass
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
netClass -noAr noClass.net mm5 monDom1 mouse.net
# 41 minutes
# Load the nets into database
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet mm5 netMonDom1 stdin
# 3 minutes
# compare featureBits with some other assemblies
featureBits mm5 netMonDom1
# 2094316044 bases of 2615483787 (80.074%) in intersection
featureBits mm5 netRn3
# 2638255333 bases of 2615483787 (100.871%) in intersection
featureBits -countGaps mm5 netRn3
# 2638255333 bases of 3164952073 (83.358%) in intersection
featureBits mm5 netHg17
# 2504056038 bases of 2615483787 (95.740%) in intersection
featureBits mm5 netCanFam1
# 2456773441 bases of 2615483787 (93.932%) in intersection
featureBits mm5 netGalGal2
# 1958796258 bases of 2615483787 (74.892%) in intersection
featureBits mm5 netXenTro1
# 1042210258 bases of 2615483787 (39.848%) in intersection
featureBits mm5 netTetNig1
# 618111072 bases of 2615483787 (23.633%) in intersection
featureBits mm5 netDanRer2
# 560950601 bases of 2615483787 (21.447%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
cp -p mouse.net \
    /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1/opossum.net
cd /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
gzip opossum.net
# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
# Create MOUSE TO OPOSSUM chain, net
ssh eieio
cd /cluster/data/monDom1/bed
mkdir -p bz.mm5/axtChain
chainSwap zb.mm5/axtChain/all.chain bz.mm5/axtChain/all.chain
# 2 minutes
cd bz.mm5/axtChain
chainPreNet all.chain ../../zb.mm5/S2.len \
    ../../zb.mm5/S1.len stdout \
    | chainNet stdin -minSpace=1 ../../zb.mm5/S2.len \
        ../../zb.mm5/S1.len stdout /dev/null \
    | netSyntenic stdin noClass.net
# 6 minutes
mkdir ../axtNet
netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
    /scratch/mus/mm5/softNib ../axtNet/all.axt
# 9 minutes
# Classify net and load into database
ssh hgwdev
cd /cluster/data/monDom1/bed/bz.mm5/axtChain
netClass -noAr noClass.net monDom1 mm5 monDom1.net
# 13 minutes
netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netMm5 stdin
# 3 minutes
# Load chains into database (takes about 4 hours).
hgLoadChain -tIndex monDom1 chainMm5 all.chain
# if you do a 'show index' SQL reports Cardinality NULL for the tName
# indexes although they are actually working.
# To make the cardinality show up:
hgsql monDom1 -e "analyze table chainMm5;"
hgsql monDom1 -e "analyze table chainMm5Link;"
featureBits monDom1 chainMm5Link
# 249594220 bases of 3492108230 (7.147%) in intersection
featureBits monDom1 chainMm5
# 2913812625 bases of 3492108230 (83.440%) in intersection
featureBits monDom1 netMm5
# 2889580530 bases of 3492108230 (82.746%) in intersection
# create a copy of these for hgdownload (2005-01-13 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsMm5
cd /cluster/data/monDom1/bed/bz.mm5/axtChain
gzip --stdout all.chain > \
    /usr/local/apache/htdocs/goldenPath/monDom1/vsMm5/mouse.chain.gz
gzip --stdout monDom1.net > \
    /usr/local/apache/htdocs/goldenPath/monDom1/vsMm5/mouse.net.gz
# BLASTZ MOUSE CLEAN UP (DONE - 2005-01-12 - Hiram)
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \
    noClass.net chainBeforeMerge noClass &
nice gzip chain/* &
nice gzip mouse.net &
nice gzip all.chain &
cd /cluster/data/monDom1/bed/zb.mm5
nice rm -rf raw &
nice gzip axtNet/*.axt &
nice gzip axtChrom/*.axt &
nice gzip pslChrom/* lav/*/* &
cd /cluster/data/monDom1/bed/bz.mm5/axtChain
nice rm -f noClass.net link.tab chain.tab &
nice gzip all.chain monDom1.net &
cd ../axtNet
nice gzip all.axt &
# MAKE Human Proteins track (hg17 DONE 2004-12-15 braney)
ssh eieio
mkdir -p /cluster/data/monDom1/blastDb
cd /cluster/data/monDom1/blastDb
faSplit sequence ../broad.mit.edu/scaffolds.fa.unmasked 500 x
for i in x*
do
    formatdb -i $i -p F
done
rm *.log *.fa
ssh kkr1u00
mkdir -p /iscratch/i/monDom1/blastDb
cp /cluster/data/monDom1/blastDb/* /iscratch/i/monDom1/blastDb
(iSync) > sync.out
exit    # back to eieio
mkdir -p /cluster/data/monDom1/bed/tblastn.hg17KG
cd /cluster/data/monDom1/bed/tblastn.hg17KG
ls -1S /iscratch/i/monDom1/blastDb/*.nsq | sed "s/\.nsq//" > query.lst
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
# 42156/(350000/499) = 60.102411
mkdir -p /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa
split -l 65 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa/kg
ln -s /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
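# (explanatory sketch, not part of the original run) the calc expression
# above is just sizing the split: the cluster load is
# (number of kg chunk files) x (number of blast db pieces), and the aim is
# to keep that near 350,000 jobs.  With the counts reported above the same
# minimum lines-per-chunk figure falls out of a one-line awk:
awk 'BEGIN { kgLines=42156; dbPieces=499; maxJobs=350000;
    printf "%.2f kg psl lines per chunk minimum\n", kgLines/(maxJobs/dbPieces) }'
# 60.10 -> rounded up to 65 for split -l, giving ~649 chunks and
# 649 * 499 = 323851 cluster jobs, which matches the para report below.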
ls -1S kgfa/*.fa > kg.lst mkdir blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/monDom1/bed/tblastn.hg17KG para create blastSpec para push # Completed: 323851 of 323851 jobs # CPU time in finished jobs: 113015986s 1883599.76m 31393.33h 1308.06d 3.584 y # IO & Wait Time: 2732896s 45548.27m 759.14h 31.63d 0.087 y # Average job time: 357s 5.96m 0.10h 0.00d # Longest job: 4911s 81.85m 1.36h 0.06d # Submission to last job: 274144s 4569.07m 76.15h 3.17d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/monDom1/bed/tblastn.hg17KG para create chainSpec para push # Completed: 649 of 649 jobs # CPU time in finished jobs: 4726s 78.76m 1.31h 0.05d 0.000 y # IO & Wait Time: 76824s 1280.41m 21.34h 0.89d 0.002 y # Average job time: 126s 2.09m 0.03h 0.00d # Longest job: 312s 5.20m 0.09h 0.00d # Submission to last job: 7487s 124.78m 2.08h 0.09d exit # back to eieio cd /cluster/data/monDom1/bed/tblastn.hg17KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/monDom1/bed/tblastn.hg17KG/blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/monDom1/bed/tblastn.hg17KG hgLoadPsl monDom1 blastHg17KG.psl # back to eieio rm -rf blastOut # End tblastn # MAKE Human Known Gene mRNAs Mapped by BLATZ track (hg17 DONE 2005-1-14 JK) # Create working directory and extract known gene mrna from database ssh hgwdev cd /cluster/data/monDom1/bed mkdir blatz.hg17KG cd blatz.hg17KG pepPredToFa hg17 knownGeneMrna known.fa # Split known genes into pieces and load on /iscratch/i temp device ssh kkr1u00 cd /iscratch/i/hg17 mkdir knownRna cd knownRna faSplit sequence /cluster/data/monDom1/bed/blatz.hg17KG/known.fa 9 known iSync rm /cluster/data/monDom1/bed/blatz.hg17KG/known.fa # Do parasol job to align via blatz ssh kk cd /cluster/data/monDom1/bed/blatz.hg17KG mkdir run0 cd run0 mkdir psl awk '{print $1;}' /cluster/data/monDom1/chrom.sizes > genome.lst ls -1 /iscratch/i/hg17/knownRna/*.fa > mrna.lst cat << '_EOF_' > gsub #LOOP blatz -rna -minScore=3000 /scratch/monDom1/monDom1.2bit:$(file1) $(path2) -out=psl {check out line psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy # Note, I recommend -minScore=6000 next time. 
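# (illustrative only) each line of the spec that gensub2 writes below is a
# single blatz job against one scaffold; after the gensub2 step a quick
#   head -3 spec
# shows lines roughly of this shape (scaffold and file names will differ):
#   blatz -rna -minScore=3000 /scratch/monDom1/monDom1.2bit:scaffold_0 \
#       /iscratch/i/hg17/knownRna/known0.fa -out=psl {check out line psl/scaffold_0_known0.psl}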
gensub2 genome.lst mrna.lst gsub spec para create spec # Then do usual para try/para push/para time Completed: 174132 of 174132 jobs CPU time in finished jobs: 9399737s 156662.29m 2611.04h 108.79d 0.298 y IO & Wait Time: 2143618s 35726.96m 595.45h 24.81d 0.068 y Average job time: 66s 1.10m 0.02h 0.00d Longest job: 2437s 40.62m 0.68h 0.03d Submission to last job: 13892s 231.53m 3.86h 0.16d # Do sort and near-besting on file server ssh eieio cd /cluster/data/monDom1/bed/blatz.hg17KG/run0 catDir psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl cd .. pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.5 -nearTop=0.01 -minCover=0.255 sort -k 14 nearBest.psl > blatzHg17KG.psl # Clean up rm -r run0/psl # Load into database ssh hgwdev cd /cluster/data/monDom1/bed/blatz.hg17KG hgLoadPsl monDom1 blatzHg17KG.psl # ALIGN TO ZEBRAFISH (WORKING - 2004-12-17 - Hiram) # NAMING CONVENTION: the zb.danRer2 directory will be the Opossum # aligned to the Zebrafish blastz result. # The bz.danRer2 directory will the swapped result, Zebrafish aligned # to Opossum ssh kk mkdir -p /cluster/data/monDom1/bed/zb.danRer2 cd /cluster/data/monDom1/bed/zb.danRer2 # Create DEF file for run of scaffolds vs. zebrafish chromosomes cat << '_EOF_' > DEF # zebrafish vs. M.domestica export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Set up blastz parameters using parameters between (chicken and fish ?) # but not abridging repeats since can't do that with scaffolds, and # it's not very relevant at this evolutionary distance. # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: zebrafish SEQ1_DIR=/iscratch/i/danRer2/nib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum SEQ2_DIR=/iscratch/i/monDom1/chunks SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/monDom1/bed/zb.danRer2 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy cp -p /cluster/data/danRer2/chrom.sizes S1.len cp -p /cluster/data/monDom1/chrom.sizes S2.len # script copied over from /cluster/data/danRer2/jkStuff/BlastZ_run0.sh /cluster/data/monDom1/jkStuff/BlastZ_run0.sh cd run.0 para try, push check, para push, para check.... # Completed: 57263 of 57263 jobs # CPU time in finished jobs: 20015224s 333587.07m 5559.78h 231.66d 0.635 y # IO & Wait Time: 364057s 6067.61m 101.13h 4.21d 0.012 y # Average job time: 356s 5.93m 0.10h 0.00d # Longest job: 2681s 44.68m 0.74h 0.03d # Submission to last job: 117505s 1958.42m 32.64h 1.36d # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kki cd /cluster/data/monDom1/bed/zb.danRer2 # script copied over from /cluster/data/danRer2/jkStuff/BlastZ_run1.sh /cluster/data/monDom1/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
Completed: 173 of 173 jobs CPU time in finished jobs: 1228s 20.47m 0.34h 0.01d 0.000 y IO & Wait Time: 4828s 80.46m 1.34h 0.06d 0.000 y Average job time: 35s 0.58m 0.01h 0.00d Longest job: 49s 0.82m 0.01h 0.00d Submission to last job: 393s 6.55m 0.11h 0.00d # third run: lav -> axt (an I/O bound process, run only on fileserver) # Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt ssh eieio cd /cluster/data/monDom1/bed/zb.danRer2 mkdir axtChrom pslChrom cat << '_EOF_' > lav2axt.sh #!/bin/sh for L in lav/chr* do chr=${L/lav\//} cd ${L} echo -n "${L} -> ${chr}.axt working ... " cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin \ /iscratch/i/danRer2/nib \ /iscratch/i/monDom1/chunks.2bit stdout \ | axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \ /iscratch/i/monDom1/chunkedScaffolds.lft warn stdin cd ../.. echo -n "${chr}.axt -> ${chr}.psl working ..." axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl echo "DONE" done '_EOF_' # << this line keeps emacs coloring happy chmod +x lav2axt.sh ./lav2axt.sh # 6 minutes # load this blast result just to see how it looks # It is not needed in the browser after the chains and nets are # made, drop it later. ssh hgwdev cd /cluster/data/monDom1/bed/zb.danRer2/pslChrom hgLoadPsl -table=blastzMonDom1 -noTNameIx danRer2 chrM.psl bash # if you are csh normally for I in `ls *.psl | grep -v chrM.psl` do /cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \ -noTNameIx danRer2 ${I} echo "done: ${I}" done # 2 minutes exit # back to csh if you are csh normally # Let's see what featureBits has to say about this data featureBits danRer2 blastzMonDom1 # 56615212 bases of 1560497282 (3.628%) in intersection # CHAIN ZEBRAFISH BLASTZ (DONE - 2004-12-19 - Hiram)) # Run axtChain on file server, job is mostly I/O, cluster does not help ssh eieio cd /cluster/data/monDom1/bed/zb.danRer2 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chainRaw ls -1S /cluster/data/monDom1/bed/zb.danRer2/axtChrom/*.axt > input.lst cat << '_EOF_' > doChain.sh #!/bin/sh for chrAxtFile in `cat input.lst` do chr=`basename ${chrAxtFile}` axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../opossumZFishTuned.gap \ -minScore=5000 ${chrAxtFile} \ /iscratch/i/danRer2/nib \ /cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out done '_EOF_' # << this line keeps emacs coloring happy # Reuse gap penalties from mouse run. cat << '_EOF_' | sed 's/ */\t/g' > ../../opossumZFishTuned.gap tablesize 11 smallSize 111 position 1 2 3 11 111 2111 12111 32111 72111 152111 252111 qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000 '_EOF_' # << this line keeps emacs coloring happy chmod +x doChain.sh time ./doChain.sh # 57 minutes # On the file server weed the chains for repeats ssh eieio cd /cluster/data/monDom1/bed/zb.danRer2/axtChain mkdir chain cd run1/chainRaw bash # if you are normally csh for i in *.chain do chainAntiRepeat /iscratch/i/danRer2/nib \ /cluster/data/monDom1/monDom1.2bit $i ../../chain/$i echo "$i done" done # 4 minutes exit # exit bash if you are normally csh # now on the file server, sort chains. # chainSplit steps each take less than a minute ssh eieio cd /cluster/data/monDom1/bed/zb.danRer2/axtChain chainMergeSort chain/*.chain > all.chain # 1 minute mv chain chainBeforeMerge chainSplit chain all.chain # 1 minute rm run1/chain/*.chain # Load chains into database. 
This takes an hour ssh hgwdev cd /cluster/data/monDom1/bed/zb.danRer2/axtChain/chain bash # if you are csh normally for i in *.chain do c=${i/.chain/} echo "loading ${c}" hgLoadChain danRer2 ${c}_chainMonDom1 $i done # 3 minutes exit # exit bash if you are csh normally # Compare to some other organisms featureBits danRer2 chainMonDom1 # 267969653 bases of 1560497282 (17.172%) in intersection featureBits danRer2 chainHg17 # 510842993 bases of 1560497282 (32.736%) in intersection featureBits danRer2 chainMm5 # 507912858 bases of 1560497282 (32.548%) in intersection featureBits danRer2 chainFr1 # 309784483 bases of 1560497282 (19.852%) in intersection # create a copy of these things for hgdownload (2005-01-12 - Hiram) ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1 cd /cluster/data/monDom1/bed/zb.danRer2/axtChain/chain tar cvzf \ /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1/opossum.chain.tgz \ ./chr*.chain # NET OPOSSUM TO ZEBRAFISH BLASTZ (TBD) ssh eieio cd /cluster/data/monDom1/bed/zb.danRer2/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # 1 minute # Create axt files for the net as so: netSplit noClass.net noClass # 1 minute cd noClass mkdir ../../axtNet bash # if you are csh normally for i in *.net do r=${i/.net/} netToAxt $i ../chain/${r}.chain /iscratch/i/danRer2/nib \ /cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt done # 4 minutes exit # back to csh if you are csh normally cd .. rm -r noClass # Add classification info using db tables: ssh hgwdev cd /cluster/data/monDom1/bed/zb.danRer2/axtChain netClass -noAr noClass.net danRer2 monDom1 zfish.net # 39 minutes # Load the nets into database ssh hgwdev cd /cluster/data/monDom1/bed/zb.danRer2/axtChain netFilter -minGap=10 zfish.net | hgLoadNet danRer2 netMonDom1 stdin # 1 minute # compare featureBits with some other assemblies featureBits danRer2 netMonDom1 # 251706312 bases of 1560497282 (16.130%) in intersection featureBits danRer2 netRn3 XXX - TBD - 2004-12-19 20:15 # 2638255333 bases of 2615483787 (100.871%) in intersection featureBits -countgaps danRer2 netRn3 # 2638255333 bases of 3164952073 (83.358%) in intersection featureBits danRer2 netHg17 # 2504056038 bases of 2615483787 (95.740%) in intersection featureBits danRer2 netCanFam1 # 2456773441 bases of 2615483787 (93.932%) in intersection featureBits danRer2 netGalGal2 # 1958796258 bases of 2615483787 (74.892%) in intersection featureBits danRer2 netXenTro1 # 1042210258 bases of 2615483787 (39.848%) in intersection featureBits danRer2 netTetNig1 # 618111072 bases of 2615483787 (23.633%) in intersection featureBits danRer2 netDanRer2 # 560950601 bases of 2615483787 (21.447%) in intersection # create a copy of these things for hgdownload (2005-01-12 - Hiram) ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1 cd /cluster/data/monDom1/bed/zb.danRer2/axtChain cp -p zfish.net \ /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1/opossum.net cd /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1 gzip opossum.net # SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE # Create ZEBRAFISH TO OPOSSUM chain, net ssh eieio cd /cluster/data/monDom1/bed mkdir -p bz.danRer2/axtChain chainSwap zb.danRer2/axtChain/all.chain bz.danRer2/axtChain/all.chain # 1 minutes cd bz.danRer2/axtChain chainPreNet all.chain ../../zb.danRer2/S2.len \ ../../zb.danRer2/S1.len stdout \ | chainNet stdin -minSpace=1 ../../zb.danRer2/S2.len 
\ ../../zb.danRer2/S1.len stdout /dev/null \ | netSyntenic stdin noClass.net # 6 minutes mkdir ../axtNet netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \ /iscratch/i/danRer2/nib ../axtNet/all.axt # 5 minutes # Classify net and load into database ssh hgwdev cd /cluster/data/monDom1/bed/bz.danRer2/axtChain netClass -noAr noClass.net monDom1 danRer2 ZFish.net # 12 minutes netFilter -minGap=10 ZFish.net | hgLoadNet monDom1 netDanRer2 stdin # 1 minute # Load chains into database hgLoadChain -tIndex monDom1 chainDanRer2 all.chain # 10 minutes # if you do a 'show index' SQL reports Cardinality NULL for the tName # indexes although they are actually working. To make the # cardinality show up: hgsql monDom1 -e "analyze table chainDanRer2;" hgsql monDom1 -e "analyze table chainDanRer2Link;" featureBits monDom1 chainDanRer2Link # 66821827 bases of 3492108230 (1.914%) in intersection featureBits monDom1 chainDanRer2 # 397007345 bases of 3492108230 (11.369%) in intersection featureBits monDom1 chainGalGal2Link # 110933245 bases of 3492108230 (3.177%) in intersection featureBits monDom1 chainGalGal2 # 2437546469 bases of 3492108230 (69.802%) in intersection featureBits monDom1 chainMm5Link # create a copy of for hgdownload (2005-01-13 - Hiram) ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2 cd /cluster/data/monDom1/bed/bz.danRer2/axtChain gzip --stdout all.chain > \ /usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2/zebrafish.chain.gz gzip --stdout ZFish.net > \ /usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2/zebrafish.net.gz # BLASTZ ZEBRAFISH CLEAN UP (DONE - 2005-01-12 - Hiram) ssh eieio cd /cluster/data/monDom1/bed/zb.danRer2/axtChain nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \ noClass.net chainBeforeMerge noClass & nice gzip chain/* & nice gzip zfish.net & nice gzip all.chain & cd /cluster/data/monDom1/bed/zb.danRer2 nice rm -rf raw & nice gzip axtNet/*.axt & nice gzip axtChrom/*.axt & nice gzip pslChrom/* lav/*/* & cd /cluster/data/monDom1/bed/bz.danRer2/axtChain nice rm -f noClass.net link.tab chain.tab & nice gzip all.chain ZFish.net & cd ../axtNet nice gzip all.axt & # 5-WAY VAR_MULTIZ - ALIGNMENTS Hg17, Mm5, MonDom1, GalGal2, DanRer2 # DONE - 2005-01-04 - Hiram ssh eieio cd /cluster/data/monDom1/bed mkdir multiz bash # if you are csh normally # if all.axt has been gzipped, simply use all.axt.gz # It will work the same way for A in danRer2 galGal2 mm5 hg17 do axtSort bz.${A}/axtNet/all.axt stdout | axtToMaf -tSplit stdin \ -tPrefix=monDom1. /cluster/data/monDom1/chrom.sizes \ -qPrefix=${A}. /cluster/data/${A}/chrom.sizes \ multiz/${A} done # 6 minutes exit # back to csh if you are csh normally # Copy MAFs to Iservers for kluster run ssh kkr1u00 mkdir /iscratch/i/monDom1/multiz cd /iscratch/i/monDom1/multiz rsync -a /cluster/data/monDom1/bed/multiz/ . mkdir penn cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn cp -p /cluster/bin/penn/maf_project penn /cluster/bin/iSync # Progressive alignment up the tree w/o stager, # using multiz.v10 (var_multiz) # Method: align internal subtrees (using 0 flag to var_multiz) # Then, align these to human (using 1 flag to var_multiz) # NOTE: must use maf_project after each multiz run, in order # to order output. Single-cov guaranteed by use of net MAF's, # so it is not necessary to run single_cov2. ssh eieio cd /cluster/data/monDom1/bed/multiz # Not sure what this is used for. Appears to be from something # that is no longer used. Can find no reference to it in the # procedures. 
Below in the parameter estimation for phastCons # there is a tree specified: # ((((hg17,mm5),galGal2),monDom1),danRer2) cat << '_EOF_' > tree.nh (((hg17,mm5),galGal2),danRer2) '_EOF_' # << this line keeps emacs coloring happy # make output dir and run dir ssh kk9 cd /cluster/data/monDom1/bed/multiz mkdir -p maf mkdir -p run cd run # create scripts to run var_multiz on cluster cat > oneMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 set multi = /scratch/monDom1/multiz5way.$c set pairs = /iscratch/i/monDom1/multiz # special mode -- # with 1 arg, cleanup if ($#argv == 1) then rm -fr $multi exit endif # special mode -- # with 3 args, saves an alignment file if ($#argv == 3) then cp $multi/$2/$c.maf $3 exit endif set s1 = $2 set s2 = $3 set flag = $4 # locate input files -- in pairwise dir, or multiple dir set d1 = $multi set d2 = $multi if (-d $pairs/$s1) then set d1 = $pairs endif if (-d $pairs/$s2) then set d2 = $pairs endif set f1 = $d1/$s1/$c.maf set f2 = $d2/$s2/$c.maf # write to output dir set out = $multi/${s1}${s2} mkdir -p $out # check for empty input file if (-s $f1 && -s $f2) then echo "Aligning $f1 $f2 $flag" /iscratch/i/monDom1/multiz/penn/multiz.v10 $f1 $f2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \ $out/$c.tmp.maf echo "Ordering $c.maf" /iscratch/i/monDom1/multiz/penn/maf_project $out/$c.tmp.maf monDom1.$c > $out/$c.maf else if (-s $f1) then cp $f1 $out else if (-s $f2) then cp $f2 $out endif 'EOF' # << keep emacs coloring happy chmod +x oneMultiz.csh # Copy this script to iscratch ssh kkr1u00 cd /iscratch/i/monDom1/multiz/penn cp -p /cluster/data/monDom1/bed/multiz/run/oneMultiz.csh . /cluster/bin/iSync # back to run the job ssh kki cd /cluster/data/monDom1/bed/multiz/run cat > allMultiz.csh << 'EOF' #!/bin/csh -fe # multiple alignment steps: # multiz.v10 human.maf mouse.maf 1 # multiz.v10 human-mouse.maf chicken.maf 1 # multiz.v10 human-mouse-chicken.maf zfish.maf 1 set c = $1 /iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c hg17 mm5 1 /iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c galGal2 hg17mm5 1 /iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c danRer2 galGal2hg17mm5 1 # get final alignment file /iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c danRer2galGal2hg17mm5 /cluster/data/monDom1/bed/multiz/maf/$c.maf #cleanup /iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c 'EOF' # << keep emacs coloring happy chmod +x allMultiz.csh cat << 'EOF' > template #LOOP ./allMultiz.csh $(root1) {check out line+ /cluster/data/monDom1/bed/multiz/maf/$(root1).maf} #ENDLOOP 'EOF' # make a list of scaffolds that successfully had actual alignments # at least once somewhere. Not every scaffold has an alignment. cd /iscratch/i/monDom1/multiz for D in danRer2 galGal2 hg17 mm5 do cd $D ls | sed -e "s/.maf//" cd .. 
done | sort -u > /cluster/data/monDom1/bed/multiz/run/chrom.lst cd /cluster/data/monDom1/bed/multiz/run gensub2 chrom.lst single template jobList para create jobList para try; para check para push # Completed: 5517 of 5517 jobs # CPU time in finished jobs: 8337s 138.95m 2.32h 0.10d 0.000 y # IO & Wait Time: 15166s 252.77m 4.21h 0.18d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest job: 65s 1.08m 0.02h 0.00d # Submission to last job: 1495s 24.92m 0.42h 0.02d # combine results into a single file for loading ssh eieio cd /cluster/data/monDom1/bed/multiz catDir maf | mafFilter stdin -minScore=500 > multiz5way.maf # Load into database (DONE - 2005-01-04 - Hiram) ssh hgwdev cd /cluster/data/monDom1/bed/multiz mkdir -p /gbdb/monDom1/multiz5way ln -s /cluster/data/monDom1/bed/multiz/multiz5way.maf /gbdb/monDom1/multiz5way hgLoadMaf monDom1 multiz5way # Loaded 988922 mafs in 1 files from /gbdb/monDom1/multiz5way # 4 minutes to load # Create pairwise maf files (no longer split across scaffolds) ssh eieio cd /cluster/data/monDom1/bed/multiz mkdir -p pairwise catDir galGal2 > pairwise/galGal2.maf catDir hg17 > pairwise/hg17.maf catDir mm5 > pairwise/mm5.maf catDir danRer2 > pairwise/danRer2.maf # Load pairwise mafs into database (DONE - 2005-01-04 - Hiram) ssh hgwdev set h = /cluster/data/monDom1/bed/multiz/pairwise cd $h foreach i (*.maf) set o = $i:r set t = ${o}_netBlastz mkdir -p /gbdb/monDom1/multiz5way/$t ln -s $h/$i /gbdb/monDom1/multiz5way/$t hgLoadMaf monDom1 $t -pathPrefix=/gbdb/monDom1/multiz5way/$t end # CREATE CONSERVATION WIGGLE WITH PHASTCONS (WORKING - 2005-01-04 - Hiram) # Estimate phastCons parameters ssh eieio mkdir /cluster/data/monDom1/bed/multiz/cons cd /cluster/data/monDom1/bed/multiz/cons # Create a starting-tree.mod based on scaffold_13303. (the largest one) twoBitToFa ../../../monDom1.2bit -seq=scaffold_13303 scaffold_13303.fa /cluster/bin/phast/msa_split ../maf/scaffold_13303.maf --refseq scaffold_13303.fa --in-format MAF --windows 100000000,1000 --out-format SS --between-blocks 5000 --out-root s1 (((hg17,mm5),galGal2),danRer2) /cluster/bin/phast/phyloFit -i SS s1.*.ss --tree "((((hg17,mm5),galGal2),monDom1),danRer2)" --out-root starting-tree rm scaffold_13303.fa s1.*.ss # Create directory full of sequence in individual scaffold files on bluearc. ssh kksilo mkdir -p /cluster/bluearc/monDom1/scaffoldFa cd /cluster/bluearc/monDom1/scaffoldFa twoBitToFa /cluster/data/monDom1/monDom1.2bit stdout | faSplit byname stdin foo # Create big bad bloated SS files in bluearc (takes ~45 minutes) mkdir -p /cluster/bluearc/monDom1/cons/ss cd /cluster/bluearc/monDom1/cons/ss foreach i (`awk '{print $1}' /cluster/data/monDom1/chrom.sizes`) if (-e /cluster/data/monDom1/bed/multiz/maf/$i.maf) then echo msa_split $i /cluster/bin/phast/msa_split /cluster/data/monDom1/bed/multiz/maf/$i.maf \ --refseq /cluster/bluearc/monDom1/scaffoldFa/$i.fa \ --in-format MAF --windows 1000000,0 --between-blocks 5000 \ --out-format SS --out-root $i endif end # Create a random list of 50 1 mb regions ls -l | awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs # Set up parasol directory to calculate trees on these 50 regions ssh kki mkdir /cluster/bluearc/monDom1/cons/treeRun1 cd /cluster/bluearc/monDom1/cons/treeRun1 mkdir tree log # Create little script that calls phastCons with right arguments # I wonder if target-coverage here has to go along with the # readjusted target-coverage below as it is tuned up ? # In that case, this would be 0.50 here. 
cat > makeTree << '_EOF_'
/cluster/bin/phast/phastCons ../ss/$1.ss \
    /cluster/data/monDom1/bed/multiz/cons/starting-tree.mod \
    --gc 0.410 --nrates 1,1 --no-post-probs --ignore-missing \
    --expected-lengths 12 --target-coverage 0.35 \
    --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
chmod a+x makeTree
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree $(root1)
#ENDLOOP
'_EOF_'
# Make cluster job and run it
gensub2 ../randomSs single template jobList
para create jobList
para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 1526s 25.43m 0.42h 0.02d 0.000 y
# IO & Wait Time: 650s 10.84m 0.18h 0.01d 0.000 y
# Average job time: 44s 0.73m 0.01h 0.00d
# Longest job: 84s 1.40m 0.02h 0.00d
# Submission to last job: 168s 2.80m 0.05h 0.00d
# Now combine parameter estimates.  We can average the .mod files
# using phyloBoot.  This must be done separately for the conserved
# and nonconserved models
ls tree/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' --output-average ../ave.cons.mod > cons_summary.txt
ls tree/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' --output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/monDom1/bed/multiz/cons
ssh kk9
# Create cluster dir to do main phastCons run
mkdir /cluster/bluearc/monDom1/cons/consRun1
cd /cluster/bluearc/monDom1/cons/consRun1
mkdir ppRaw bed
# Create script to run phastCons with right parameters
cat > doPhast << '_EOF_'
mkdir -p ppRaw/$2
/cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
    --expected-lengths 12 --target-coverage 0.50 --quiet --seqname $2 \
    --idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \
    ppRaw/$2/$1.pp
'_EOF_'
chmod a+x doPhast
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast $(file1) $(root1)
#ENDLOOP
'_EOF_'
# Create parasol batch and run it
ls -1 ../ss | sed 's/.ss//' > in.lst
gensub2 in.lst single template jobList
para create jobList
para try/check/push/etc.
# Completed: 8171 of 8171 jobs
# CPU time in finished jobs: 24641s 410.68m 6.84h 0.29d 0.001 y
# IO & Wait Time: 81696s 1361.60m 22.69h 0.95d 0.003 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest job: 39s 0.65m 0.01h 0.00d
# Submission to last job: 1128s 18.80m 0.31h 0.01d
# combine predictions and transform scores to be in 0-1000 interval
catDir bed | awk \
    '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
    /cluster/bin/scripts/lodToBedScore /dev/stdin > \
    /cluster/data/monDom1/bed/multiz/mostConserved.bed
# Figure out how much is actually covered by the bed files as so:
awk '{print $3 - $2;}' /cluster/data/monDom1/bed/multiz/mostConserved.bed | addCols stdin
#   at 0.35: 99792852.00
#   at 0.55: 169027762
#   at 0.50: 140230712.00
#   at 0.35: 99792852/3492108230 = .028576
#   at 0.55: 169027762/3492108230 = .048402
#            with two points claiming to be negative at 0.00000, changed to 0
#   at 0.50: 140230712.00/3492108230 = .040156 - no unusual points noted
# If the results of this divided by the non-n genome size (1.5G) aren't
# around 4%, then do it again, adjusting the target-coverage phastCons
# parameter.  Beware of negative scores when too high.  The lodToBedScore
# will output an error on any negative scores.
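# (convenience one-liner, not part of the original run) the same covered
# total and its fraction of the 3492108230 base assembly can be had in one
# awk pass instead of dividing by hand:
awk '{ sum += $3 - $2 } END { printf "%d bases, %.6f of 3492108230\n", sum, sum/3492108230 }' \
    /cluster/data/monDom1/bed/multiz/mostConserved.bed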
# Load most conserved track into database ssh hgwdev cd /cluster/data/monDom1/bed/multiz hgLoadBed monDom1 mostConserved mostConserved.bed # Create merged posterier probability file and wiggle track data files ssh eieio cd /cluster/bluearc/monDom1/cons/consRun1 # interesting sort here on scaffold number and position. # first convert the names to have a consistent field separator # of . then do the sort on fields 6 and 7 (both numerically) # then convert those . field separators back to what they were find ./ppRaw -type f | \ sed -e "s#/scaffold_#.scaffold.#g; s#-#._.#" | \ sort -t\. -k6,6n -k7,7n | \ sed -e "s#.scaffold.#/scaffold_#g; s#\._\.#-#" | xargs cat | \ wigEncode stdin phastCons5.wig phastCons5.wib # 30 minutes for above cp -p phastCons5.wi? /cluster/data/monDom1/bed/multiz/cons # 1 minute copy # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/monDom1/bed/multiz/cons ln -s `pwd`/phastCons5.wib /gbdb/monDom1/wib/phastCons5.wib hgLoadWiggle monDom1 phastCons5 phastCons5.wig # 2 minute load # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/monDom1/bed/multiz/cons time hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=monDom1 phastCons5 > histogram.data 2>&1 # about 15 minutes scan all data cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 xff0000 xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Opossum Histogram phastCons5 track" set xlabel " phastCons5 score" set ylabel "p-Value" set y2label "Cumulative Probability Distribution" set y2range [0:1] set y2tics plot "histogram.data" using 2:5 title " pValue" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CPD" with lines '_EOF_' display histo.png & # Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/opossum_image.jpg' -O opossum_image.jpg # view that image in 'display' to determine crop edges, then: convert -crop 1712x1240+560+360 -quality 80 -sharpen 0 \ -normalize opossum_image.jpg mmd.jpg convert -geometry 300x200 -quality 80 mmd.jpg Monodelphis_domestica.jpg rm -f mmd.jpg opossum_image.jpg cp -p Monodelphis_domestica.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # MAKE DOWNLOADABLE SEQUENCE FILES (WORKING, 2005-01-06, Hiram) ssh kksilo cd /cluster/data/monDom1 #- Build the .zip files rm -rf zip mkdir zip # ! 
2005-02-08 - Should not be .zip files, should be .gz files #zip -j zip/softMask.fa.zip maskedScaffolds.fa & #zip -j zip/hardMask.fa.zip hardMaskedScaffolds.fa & #zip -j zip/novel.RM.lib.fa.zip jkStuff/monDom1.novel.RM.lib & #zip ../../zip/trf.bed.zip trfMask.bed & # RepeatMasker library gzip -c jkStuff/monDom1.novel.RM.lib > zip/novel.RM.lib.fa.gz touch -r jkStuff/monDom1.novel.RM.lib zip/novel.RM.lib.fa.gz # soft masked scaffold fasta gzip -c maskedScaffolds.fa > zip/softMask.fa.gz touch -r maskedScaffolds.fa zip/softMask.fa.gz # hard masked scaffold fasta gzip -c hardMaskedScaffolds.fa > zip/hardMask.fa.gz touch -r hardMaskedScaffolds.fa zip/hardMask.fa.gz # TRF output files gzip -c bed/simpleRepeat/trfMask.bed > zip/trf.bed.gz touch -r bed/simpleRepeat/trfMask.bed zip/trf.bed.gz # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=monDom1 -native GenBank mrna \ /cluster/data/monDom1/zip/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=monDom1 -xeno GenBank mrna \ /cluster/data/monDom1/zip/xenoMrna.fa & # THERE ARE NO native ESTs - skip this - get native GenBank ESTs # ./bin/i386/gbGetSeqs -db=monDom1 -native GenBank est \ # /cluster/data/monDom1/zip/est.fa & cd /cluster/data/monDom1/zip # zip GenBank native and xeno mRNAs and native ESTs, # after the above gbGetSeqs jobs are done. # zip -j mrna.fa.zip mrna.fa # zip -j xenoMrna.fa.zip xenoMrna.fa # zip -j est.fa.zip est.fa #rm -f mrna.fa xenoMrna.fa est.fa gzip mrna.fa xenoMrna.fa # RepeatMasker out files, we don't have this as traditionally delivered # since the process involved turning the data into the database # representation so they could be sorted and uniqued. So instead # of delivering the usual files, simply dump the database ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/bigZips cd /usr/local/apache/htdocs/goldenPath/monDom1/bigZips hgsqldump monDom1 rmsk -T . #zip -j rmsk.txt.zip rmsk.txt & #rm rmsk.txt gzip rmsk.txt & ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/monDom1 rsync -a --progress /cluster/data/monDom1/zip/ . for i in *.gz do /bin/echo -n "$i - " zcat $i | sum -r done # Check zip file integrity #for i in *.zip #do # unzip -t $i | tail -1 #done # No errors detected in compressed data of hardMask.fa.zip. # No errors detected in compressed data of mrna.fa.zip. # No errors detected in compressed data of novel.RM.lib.fa.zip. # No errors detected in compressed data of rmsk.txt.zip. # No errors detected in compressed data of softMask.fa.zip. # No errors detected in compressed data of trf.bed.zip. # No errors detected in compressed data of xenoMrna.fa.zip. # md5sum *.zip > md5sum.txt md5sum *.gz > md5sum.txt # update README.txt. # ACCESSIONS FOR CONTIGS (DONE 2005-03-16 kate) # Used for Assembly track details, and also ENCODE AGP's cd /cluster/data/monDom1/bed mkdir -p contigAcc cd contigAcc # extract WGS contig to accession mapping from Entrez Nucleotide summaries # To get the summary file, access the Genbank page for the project # by searching: # genus[ORGN] AND WGS[KYWD] # At the end of the page, click on the list of accessions for the contigs. # Select summary format, and send to file. 
# output to file .acc contigAccession opossum.acc > contigAcc.tab wc -l contigAcc.tab # 109065 contigAcc.tab grep contig /cluster/data/monDom1/broad.mit.edu/assembly.agp | wc -l # 109065 hgsql monDom1 < $HOME/kent/src/hg/lib/contigAcc.sql echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \ hgsql monDom1 hgsql monDom1 -e "SELECT COUNT(*) FROM contigAcc" # 109065 contigAcc.tab ######################################################################### # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram) # EXTRACT AXTs and MAFs FROM OPOSSUM (monDom1) NET FOR ZEBRAFISH (danRer2) # zb.danRer2 are the blastz alignments, chains and nets for # danRer2 as target and monDom1 as query. (DONE, 2005-05-13, hartera) # GZIP AXTs AND MAFs (DONE, 2005-05-15, hartera) ssh kkstore01 # create axts cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtChain gunzip zfish.net.gz gunzip chain/*.chain.gz netSplit zfish.net zfishNet mkdir -p ../axtNet cat > axtNet.csh << 'EOF' foreach f (zfishNet/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt zfishNet/$c.net chain/$c.chain \ /cluster/data/danRer2/nib \ /cluster/data/monDom1/monDom1.2bit ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end 'EOF' chmod +x axtNet.csh csh axtNet.csh >&! axtNet.log & tail -100f axtNet.log # sort axts before making mafs - must be sorted for multiz cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2 mv axtNet axtNet.unsorted mkdir axtNet foreach f (axtNet.unsorted/*.axt) set c = $f:t:r echo "Sorting $c" axtSort $f axtNet/$c.axt end # create maf ssh kkstore01 cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtNet mkdir ../mafNet cat > makeMaf.csh << 'EOF' foreach f (chr*.axt) set maf = $f:t:r.monDom1.maf echo translating $f to $maf axtToMaf $f \ /cluster/data/danRer2/chrom.sizes /cluster/data/monDom1/chrom.sizes \ ../mafNet/$maf -tPrefix=danRer2. -qPrefix=monDom1. end 'EOF' chmod +x makeMaf.csh csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtChain nice gzip zfish.net chain/*.chain & cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2 nice gzip axtNet/*.axt mafNet/*.maf & # LIFTOVER CHAINS TO MONDOM2 (WORKING 2006-02-07 kate) # NOTE: split monDom2 is doc'ed in makeMonDom2.doc ssh kkstore01 cd /cluster/data/monDom1 # split scaffolds and convert to 2bit, as done for cow mkdir split2bit cd split2bit faSplit sequence ../maskedScaffolds.fa 50 x. foreach f (*.fa) echo $f faToTwoBit $f $f:r.2bit rm $f end cd .. mkdir -p /san/sanvol1/scratch/monDom1 cp -r split2bit /san/sanvol1/scratch/monDom1 rm -fr split2bit cp /cluster/bluearc/monDom1/11.ooc /san/sanvol1/scratch/monDom1/ # run alignment ssh pk cd /cluster/data/monDom1 makeLoChain-align monDom1 /san/sanvol1/scratch/monDom1/split2bit \ monDom2 /san/sanvol1/scratch/monDom2/liftSplits/split \ /san/sanvol1/scratch/monDom1/11.ooc cd bed rm blat.monDom2 ln -s blat.monDom2.2006-02-07 blat.monDom2 cd blat.monDom2/run para try para check para push # 2500 jobs # All jobs crash when run via cluster, though a test # manual job on a cluster job ran successfully: # blat /san/sanvol1/scratch/monDom1/split2bit/x18.2bit /san/sanvol1/scratch/monDom2/liftSplits/split/x19.fa ../raw/x18_x19.psl -tileSize=11 -ooc=/san/sanvol1/scratch/monDom1/11.ooc -minScore=100 -minIdentity=98 -fastMap # Loaded 83749395 letters in 35 sequences # Searched 71987986 bases in 7199 sequences # the job took ~7 hours -- # this needs to be broken up into much smaller jobs.
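# (untested sketch of one way to break this up; the 500-piece split count
# and the split2bit.500 directory name are assumptions, not something that
# was run)  Re-split the monDom1 side into many more, smaller 2bit pieces
# and build a finer-grained jobList by hand, reusing the blat parameters
# from the manual test above:
ssh kkstore01
cd /cluster/data/monDom1
mkdir split2bit.500
cd split2bit.500
faSplit sequence ../maskedScaffolds.fa 500 x.
bash    # if you are csh normally
for f in *.fa
do
    faToTwoBit $f ${f%.fa}.2bit
    rm $f
done
exit    # back to csh if you are csh normally
cd ..
cp -r split2bit.500 /san/sanvol1/scratch/monDom1/
# regenerate the run directory job list against the finer target split
cd bed/blat.monDom2/run
ls -1 /san/sanvol1/scratch/monDom1/split2bit.500/*.2bit > target.lst
ls -1 /san/sanvol1/scratch/monDom2/liftSplits/split/*.fa > query.lst
cat << '_EOF_' > gsub
#LOOP
blat $(path1) $(path2) {check out line ../raw/$(root1)_$(root2).psl} -tileSize=11 -ooc=/san/sanvol1/scratch/monDom1/11.ooc -minScore=100 -minIdentity=98 -fastMap
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 target.lst query.lst gsub jobList
para create jobList
para try, check, push, etc ...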