# for emacs: -*- mode: sh; -*- # Drosophila simulans -- # # WUSTL's # http://www.genome.wustl.edu/??? # # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | geneid | genePred geneidPep | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ # DOWNLOAD SEQUENCE (DONE 4/11/05 angie) ssh kolossus mkdir /cluster/store8/droSim1 cd /cluster/data ln -s /cluster/store8/droSim1 droSim1 cd /cluster/data/droSim1 mkdir downloads cd downloads wget ftp://genome.wustl.edu/private/lhillier/old/simulans_050411.tar.gz tar xzvf simulans_050411.tar.gz foreach c (2L 2R 2h 3L 3R 3h 4 U X Xh Yh) mkdir /cluster/data/droSim1/$c mv chr$c{,_random}.{fa,agp} /cluster/data/droSim1/$c/ end # Some warnings from mv about chroms that are _random-only or _random-less, # no problem. # download mitochondrion sequence mkdir /cluster/data/droSim1/M cd /cluster/data/droSim1/M # go to http://www.ncbi.nih.gov/ and search Genome for # "simulans mitochondrion complete". That shows the gi number: # 45332829 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=45332829&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Drosophila simulans mitochondrion complete genome, and then replace the # header line with just ">chrM". cd .. # checkAgpAndFa prints out way too much info -- keep the end/stderr only: foreach c (?{,?}) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 endif end end faSize */chr*.fa #142420719 bases (15167212 N's 127253507 real 127253507 upper 0 lower) in 18 sequences in 18 files #Total size: mean 7912262.2 sd 9722195.0 min 14972 (chrM) max 27517382 (chr3R) median 2996586 #N count: mean 842622.9 sd 861219.0 #U count: mean 7069639.3 sd 9068950.6 #L count: mean 0.0 sd 0.0 # see what kind of gap types we have: awk '$5 == "N" {print $7;}' */chr*.agp | uniq | sort | uniq #contig #fragment # biggest gap size is 3000: awk '$5 == "N" {print $6;}' */chr*.agp | sort -nr | head -1 # rough hist of gap sizes from AGP: awk '$5 == "N" {print $6;}' */chr*.agp | textHistogram stdin -binSize=100 # fancy hist of gap size from FASTA: cat */chr*.fa | faGapSizes -niceSizes=10,50,100,200,300,700,1000,3000 stdin # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 4/11/05 angie) ssh kolossus cd /cluster/data/droSim1 foreach c (?{,?}) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 endif end end # splitFaIntoContigs makes new dirs for _randoms. Move their contents # back into the main chrom dirs and get rid of the _random dirs. foreach d (*_random) set base = `echo $d | sed -e 's/_random$//'` mkdir -p $base/lift mv $d/lift/oOut.lst $base/lift/rOut.lst mv $d/lift/ordered.lft $base/lift/random.lft mv $d/lift/ordered.lst $base/lift/random.lst rmdir $d/lift mv $d/* $base rmdir $d end # Make a "pseudo-contig" for processing chrM too: mkdir M/chrM_1 sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa mkdir M/lift echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst echo "chrM_1" > M/lift/ordered.lst set msize = `faSize M/chrM.fa | awk '{print $1;}'` echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft # MAKE JKSTUFF AND BED DIRECTORIES (DONE 4/11/05 angie) # This used to hold scripts -- better to keep them inline here so # they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/droSim1/jkStuff # This is where most tracks will be built: mkdir /cluster/data/droSim1/bed # CREATING DATABASE (DONE 4/11/05 angie) # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql #/dev/sdc1 1.8T 881G 779G 54% /var/lib/mysql hgsql -e 'create database droSim1' # CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/11/05 angie) ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \ | hgsql droSim1 # REPEATMASKER (DONE 4/11/05 angie) #- Split contigs into 500kb chunks, at gaps if possible: ssh kolossus cd /cluster/data/droSim1 foreach c (?{,?}) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/droSim1 cat << '_EOF_' > jkStuff/RMDrosophila #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/droSim1/$2 /bin/cp $2 /tmp/droSim1/$2/ cd /tmp/droSim1/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2 popd /bin/cp /tmp/droSim1/$2/$2.out ./ if (-e /tmp/droSim1/$2/$2.align) /bin/cp /tmp/droSim1/$2/$2.align ./ if (-e /tmp/droSim1/$2/$2.tbl) /bin/cp /tmp/droSim1/$2/$2.tbl ./ if (-e /tmp/droSim1/$2/$2.cat) /bin/cp /tmp/droSim1/$2/$2.cat ./ /bin/rm -fr /tmp/droSim1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/droSim1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/droSim1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMDrosophila mkdir RMRun cp /dev/null RMRun/RMJobs foreach c (?{,?}) foreach d ($c/chr${c}{,_random}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/droSim1/jkStuff/RMDrosophila \ /cluster/data/droSim1/$d $f \ '{'check out line+ /cluster/data/droSim1/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh kk cd /cluster/data/droSim1/RMRun para create RMJobs para try, para check, para check, para push, para check,... #Completed: 320 of 320 jobs #Average job time: 5548s 92.47m 1.54h 0.06d #Longest finished job: 6926s 115.43m 1.92h 0.08d #Submission to last job: 7331s 122.18m 2.04h 0.08d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kolossus cd /cluster/data/droSim1 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (?{,?}) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/droSim1 hgLoadOut droSim1 */chr*.fa.out # VERIFY REPEATMASKER RESULTS (DONE 4/11/05 angie) # Eyeball some repeat annotations in the browser, compare to lib seqs. # Run featureBits on droSim1 and on a comparable genome build, and compare: ssh hgwdev featureBits droSim1 rmsk #15094026 bases of 127256433 (11.861%) in intersection # compare to dm2: featureBits dm2 rmsk #16178239 bases of 131698467 (12.284%) in intersection # comparable... # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 4/11/05 angie) # Make nib/, unmasked until RepeatMasker and TRF steps are done. # Do this now so we can load up RepeatMasker and run featureBits; # can also load up other tables that don't depend on masking. # NOTE: next time, should make .2bit -- smoother to replace later on. ssh kolossus cd /cluster/data/droSim1 mkdir nib foreach c (?{,?}) foreach f ($c/chr${c}{,_random}.fa) if (-e $f) then echo "nibbing $f" faToNib $f nib/$f:t:r.nib endif end end # Make symbolic links from /gbdb/droSim1/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/droSim1/nib foreach f (/cluster/data/droSim1/nib/chr*.nib) ln -s $f /gbdb/droSim1/nib end # Load /gbdb/droSim1/nib paths into database and save size info. cd /cluster/data/droSim1 hgsql droSim1 < $HOME/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib droSim1 /gbdb/droSim1/nib */chr*.fa echo "select chrom,size from chromInfo" | hgsql -N droSim1 > chrom.sizes # take a look at chrom.sizes, should be 18 lines wc chrom.sizes # 18 36 314 chrom.sizes # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 4/11/05 angie) # Warning: genome and organism fields must correspond # with defaultDb values hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("droSim1", "Apr. 2005", "/gbdb/droSim1/nib", "D. simulans", \ "chr2L:827101-841600", 1, 52, "D. simulans", \ "Drosophila simulans", "/gbdb/droSim1/html/description.html", \ 0, 0, "WUSTL version 1.0");' hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO defaultDb (genome, name) values ("D. simulans", "droSim1");' hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO genomeClade (genome, clade, priority) \ values ("D. simulans", "insect", 15);' # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add droSim1 and do mkdir drosophila/droSim1 make update DBS=droSim1 # go public on genome-test mkdir /gbdb/droSim1/html cvs commit makefile cvs add drosophila/droSim1 cvs commit drosophila/droSim1 # make alpha in a clean tree's trackDb/ # Also add a first-cut trackDb/drosophila/droSim1/description.html, # check it in and "make alpha" in clean tree again. # MAKE LIFTALL.LFT (DONE 4/11/05 angie) # Redone 7/13/04: chrM/lift/ordered.lft was regenerated w/correct size. ssh kolossus cd /cluster/data/droSim1 cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # SIMPLE REPEATS (TRF) (DONE 4/11/05 angie) ssh kolossus mkdir /cluster/data/droSim1/bed/simpleRepeat cd /cluster/data/droSim1/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/droSim1/?{,?}/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/$MACHTYPE/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end csh -ef jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \ trf/*.bed > /dev/null # Load this into the database as so ssh hgwdev hgLoadBed droSim1 simpleRepeat \ /cluster/data/droSim1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits droSim1 simpleRepeat #2608862 bases of 127256433 (2.050%) in intersection featureBits dm2 simpleRepeat #1597029 bases of 131698467 (1.213%) in intersection # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 4/11/05 angie) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kolossus cd /cluster/data/droSim1/bed/simpleRepeat mkdir trfMask foreach f (trf/*.bed) echo -n "filtering $f... " awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords: mkdir trfMaskChrom foreach f (../../?{,?}/chr*.fa) set c = $f:t:r echo $c liftUp trfMaskChrom/$c.bed ../../jkStuff/liftAll.lft warn \ trfMask/${c}_[0-9]*.bed > /dev/null end # No simpleRepeats in chrM --> liftUp warning, but it's OK: #chrM #No lines lifted! # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 4/11/05 angie) ssh kolossus cd /cluster/data/droSim1 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end foreach c (?{,?}) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end # Make one big masked 2bit file, and make a link to it from /gbdb/droSim1/ faToTwoBit */chr*.fa droSim1.2bit twoBitInfo droSim1.2bit stdout \ | awk '{printf "%s\t%s\t/gbdb/droSim1/droSim1.2bit\n", $1, $2}' \ > chromInfo.tab ssh hgwdev cd /cluster/data/droSim1 ln -s /cluster/data/droSim1/droSim1.2bit /gbdb/droSim1/ # Clean up unmasked nib and update dbDb and chromInfo to point at .2bit: hgsql -h genome-testdb hgcentraltest \ -e 'update dbDb set nibPath = "/gbdb/droSim1" where name = "droSim1"' hgsql droSim1 -e 'drop table chromInfo' hgsql droSim1 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql droSim1 \ -e 'load data local infile "chromInfo.tab" into table chromInfo;' rm -r /gbdb/droSim1/nib rm -r /cluster/data/droSim1/nib/ # GOLD AND GAP TRACKS (DONE 4/11/05 angie) ssh hgwdev cd /cluster/data/droSim1 cp /dev/null chrom.lst foreach f (?{,?}/chr*.agp chrM) echo $f:t:r >> chrom.lst end hgGoldGapGl -noGl -chromLst=chrom.lst droSim1 /cluster/data/droSim1 . # featureBits fails if there's no chrM_gap, so make one: # echo "create table chrM_gap like chr1_gap" | hgsql droSim1 # oops, that won't work until v4.1, so do this for the time being: hgsql droSim1 -e 'create table chrM_gap select * from chr2L_gap where 0=1' # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 4/13/05 angie) ssh kolossus cd /cluster/data/droSim1 #- Build the .tar.gz files -- no genbank for now. cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip zipTmp mkdir zip mkdir zipTmp cd zipTmp ln -s ../?{,?}/chr*.agp . tar chvzf ../zip/chromAgp.tar.gz *.agp rm -f ./*; ln -s ../?{,?}/chr*.fa.out . tar chvzf ../zip/chromOut.tar.gz *.fa.out rm -f ./*; ln -s ../?{,?}/chr*.fa . tar chvzf ../zip/chromFa.tar.gz *.fa rm -f ./*; ln -s ../?{,?}/chr*.fa.masked . tar chvzf ../zip/chromFaMasked.tar.gz *.fa.masked cd ../bed/simpleRepeat tar cvzf ../../zip/chromTrf.tar.gz trfMaskChrom/chr*.bed cd ../.. rm -rf zipTmp '_EOF_' # << this line makes emacs coloring happy csh -efx ./jkStuff/zipAll.csh |& tee zipAll.log #- Look at zipAll.log to make sure all file lists look reasonable. #- Check .tar.gz file integrity: foreach f (zip/*.tar.gz) tar tvzf $f > $f.test tail -1 $f.test end wc -l zip/*.tar.gz.test #- Copy the compressed files to hgwdev:/usr/local/apache/... ssh hgwdev set gp = /usr/local/apache/htdocs/goldenPath/droSim1 mkdir -p $gp/bigZips ln -s /cluster/data/droSim1/zip/*.gz $gp/bigZips mkdir -p $gp/chromosomes foreach f ( /cluster/data/droSim1/*/chr*.fa ) echo $f nice gzip -c $f > $gp/chromosomes/$f:t.gz end cd $gp/bigZips nice md5sum *.gz > md5sum.txt cd $gp/chromosomes nice md5sum *.gz > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Can't make refGene upstream sequence files - no refSeq for simulans. # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 4/11/05 angie) ssh kkr1u00 # .2bit of chroms that have been repeat- and trf-masked: mkdir -p /iscratch/i/droSim1 cp -p /cluster/data/droSim1/droSim1.2bit /iscratch/i/droSim1/ iSync # PRODUCING GENSCAN PREDICTIONS (DONE 4/11/05 angie) # Run on small cluster -- genscan needs big mem. ssh hgwdev mkdir /cluster/data/droSim1/bed/genscan cd /cluster/data/droSim1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kolossus cd /cluster/data/droSim1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null genome.list foreach f ( `ls -1S /cluster/data/droSim1/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ssh kki cd /cluster/data/droSim1/bed/genscan gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... #Completed: 41 of 41 jobs #Average job time: 103s 1.71m 0.03h 0.00d #Longest finished job: 158s 2.63m 0.04h 0.00d #Submission to last job: 710s 11.83m 0.20h 0.01d # If there were crashes, we would diagnose with "para problems". # If a job had crashed due to genscan running out of memory, we would # re-run it manually with "-window=1200000" instead of "-window=2400000". # Convert these to chromosome level files as so: ssh kolossus cd /cluster/data/droSim1/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/droSim1/bed/genscan ldHgGene -gtf droSim1 genscan genscan.gtf hgPepPred droSim1 generic genscanPep genscan.pep hgLoadBed droSim1 genscanSubopt genscanSubopt.bed # MAKE 11.OOC FILE FOR BLAT (DONE 4/11/05 angie) # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is ~4.4% of human judging by gapless dm1 genome size from # featureBits -- we would use 45, but bump that up a bit to be more # conservative). ssh kolossus mkdir /cluster/bluearc/droSim1 blat /iscratch/i/droSim1/droSim1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/droSim1/11.ooc -repMatch=100 #Wrote 4617 overused 11-mers to /cluster/bluearc/droSim1/11.ooc ssh kkr1u00 cp -p /cluster/bluearc/droSim1/*.ooc /iscratch/i/droSim1/ iSync # AUTO UPDATE GENBANK MRNA RUN (DONE 4/13/05 angie) # get some idea of whether this species is represented in refSeq or # has a significant number of native mRNAs. version numbers will # change: ssh eieio cd /cluster/data/genbank/data/processed/refseq.10/full awk '{print $4 " " $5}' mrna.gbidx | sort | uniq -c | sort -nr cd /cluster/data/genbank/data/processed/genbank.146.0/full awk '$4 == "Drosophila" {print $4 " " $5}' mrna.gbidx \ | sort | uniq -c | sort -nr # 27 Drosophila simulans # Wow... with that few native mRNAs, it's questionable whether we # should even bother with a native mRNA track. But we're doing an # mrna run to get xenoMrna, so what the heck, 27 > limit for single # hgBlat query. grep 'Drosophila simulans' est.*.gbidx | wc -l #4105 ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # Edit etc/genbank.conf and add these lines: # droSim1 (D. simulans) droSim1.genome = /iscratch/i/droSim1/nib/chr*.nib droSim1.lift = /cluster/data/droSim1/jkStuff/liftAll.lft droSim1.refseq.mrna.native.load = no droSim1.refseq.mrna.xeno.load = yes droSim1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005 droSim1.genbank.mrna.xeno.load = yes droSim1.genbank.est.xeno.load = no droSim1.downloadDir = droSim1 cvs ci etc/genbank.conf # Since D. simulans is a new species for us, edit src/lib/gbGenome.c. # Pick some other browser species, & monkey-see monkey-do. # Edit src/align/gbBlat to add /iscratch/i/droSim1/11.ooc cvs diff |& grep -v Diffing make cvs ci -m "Added droSim1 (D. simulans)." # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, (xeno) RefSeq only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial droSim1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droSim1 featureBits droSim1 xenoRefGene #25294349 bases of 127256433 (19.877%) in intersection # Clean up: rm -rf work/initial.droSim1 # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial droSim1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droSim1 featureBits droSim1 mrna #17381 bases of 127256433 (0.014%) in intersection featureBits droSim1 xenoMrna #22557993 bases of 127256433 (17.726%) in intersection # Clean up: rm -rf work/initial.droSim1 ssh eieio # -initial for ESTs nice bin/gbAlignStep -srcDb=genbank -type=est -initial droSim1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 droSim1 & featureBits droSim1 intronEst #514738 bases of 127256433 (0.404%) in intersection featureBits droSim1 est #1193700 bases of 127256433 (0.938%) in intersection # Clean up: rm -rf work/initial.droSim1 # MAKE GCPERCENT (DONE 4/11/05 angie) ssh hgwdev mkdir /cluster/data/droSim1/bed/gcPercent cd /cluster/data/droSim1/bed/gcPercent # create and load gcPercent table hgsql droSim1 < ~/src/hg/lib/gcPercent.sql hgGcPercent droSim1 ../../nib # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 5/13/05 alisq) ssh hgwdev echo 'insert into blatServers values("droSim1", "blat9", "17780", 1, 0); \ insert into blatServers values("droSim1", "blat9", "17781", 0, 1);' \ | hgsql -h genome-testdb hgcentraltest # SWAP CHAINS FROM DM2, BUILD NETS ETC. (DONE 5/23/05 angie) mkdir /cluster/data/droSim1/bed/blastz.dm2.swap cd /cluster/data/droSim1/bed/blastz.dm2.swap doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droSim1/DEF \ >& do.log & tail -f do.log # Add {chain,net}Dm2 to trackDb.ra if necessary. ln -s blastz.dm2.swap /cluster/data/droSim1/bed/blastz.dm2 # MAKE Drosophila Proteins track (DONE braney 2005-04-20) ssh kolossus mkdir -p /cluster/data/droSim1/blastDb cd /cluster/data/droSim1/blastDb awk "{print \$2}" ../*/chr*/*.lft > subChr.lst for i in `cat subChr.lst`; do ln -s ../*/*/$i.fa; done for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log cd .. cat */chr*/*.lft > jkStuff/subChr.lft ssh kkr1u00 mkdir -p /iscratch/i/droSim1/blastDb cp /cluster/data/droSim1/blastDb/* /iscratch/i/droSim1/blastDb iSync > sync.out mkdir -p /cluster/data/droSim1/bed/tblastn.dm1FB cd /cluster/data/droSim1/bed/tblastn.dm1FB ls -1S /iscratch/i/droSim1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst exit # back to kolossus cd /cluster/data/droSim1/bed/tblastn.dm1FB mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\) # 18735/(150000/320) = 39.968000 split -l 40 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/dm1/bed/blat.dm1FB/protein.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droSim1/jkStuff/subChr.lft carry $f.3 liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/droSim1/jkStuff/liftAll.lft carry $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp exit 1 '_EOF_' chmod +x blastSome gensub2 bug.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/droSim1/bed/tblastn.dm1FB para create blastSpec para try, push # Completed: 150080 of 150080 jobs # CPU time in finished jobs: 2042525s 34042.08m 567.37h 23.64d 0.065 y # IO & Wait Time: 631624s 10527.07m 175.45h 7.31d 0.020 y # Average job time: 18s 0.30m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 239s 3.98m 0.07h 0.00d # Submission to last job: 3533s 58.88m 0.98h 0.04d ssh kki cd /cluster/data/droSim1/bed/tblastn.dm1FB cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec para try, push... # Completed: 469 of 469 jobs # CPU time in finished jobs: 3454s 57.57m 0.96h 0.04d 0.000 y # IO & Wait Time: 3742s 62.37m 1.04h 0.04d 0.000 y # Average job time: 15s 0.26m 0.00h 0.00d # Longest finished job: 329s 5.48m 0.09h 0.00d # Submission to last job: 649s 10.82m 0.18h 0.01d exit # back to kolossus cd /cluster/data/droSim1/bed/tblastn.dm1FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/droSim1/bed/tblastn.dm1FB/blastDm1FB.psl ssh hgwdev cd /cluster/data/droSim1/bed/tblastn.dm1FB hgLoadPsl droSim1 blastDm1FB.psl # End tblastn # BLASTZ/CHAIN/NET DROYAK1 (REDONE 6/26/05 angie) # Originally run w/default human-mouse params 4/19/05. mkdir /cluster/data/droSim1/bed/blastz.droYak1.2006-06-26 cd /cluster/data/droSim1/bed/blastz.droYak1.2006-06-26 cat << '_EOF_' > DEF # D.simulans vs. D.yakuba BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. simulans SEQ1_DIR=/iscratch/i/droSim1/droSim1.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/droSim1/chrom.sizes # QUERY - D. yakuba SEQ2_DIR=/iscratch/i/droYak1/nib SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droYak1/chrom.sizes BASE=/cluster/data/droSim1/bed/blastz.droYak1.2006-06-26 '_EOF_' # << this line keeps emacs coloring happy ~/kent/src/utils/doBlastzChainNet.pl DEF \ -blastzOutRoot /cluster/bluearc/droSim1droYak1 >& do.log & tail -f do.log ln -s blastz.droYak1.2006-06-26 /cluster/data/droSim1/bed/blastz.droYak1 # Add {chain,net}DroYak1 to trackDb.ra if necessary. featureBits -chrom=chr2R droSim1 chainDroYak1Link #16921386 bases of 18190314 (93.024%) in intersection featureBits -chrom=chr2R droSim1 chainDm2Link #17421444 bases of 18190314 (95.773%) in intersection featureBits -chrom=chr4 droSim1 chainDroYak1Link #634681 bases of 807953 (78.554%) in intersection featureBits -chrom=chr4 droSim1 chainDm2Link #696892 bases of 807953 (86.254%) in intersection # BLASTZ/CHAIN/NET DP2 (DONE 4/21/05 angie) mkdir /cluster/data/droSim1/bed/blastz.dp2.2005-04-20 cd /cluster/data/droSim1/bed/blastz.dp2.2005-04-20 cat << '_EOF_' > DEF # D.simulans vs. D.pseudoobscura BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. simulans SEQ1_DIR=/iscratch/i/droSim1/droSim1.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/droSim1/chrom.sizes # QUERY - D. pseudoobscura SEQ2_DIR=/iscratch/i/dp2/nib SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dp2/chrom.sizes BASE=/cluster/data/droSim1/bed/blastz.dp2.2005-04-20 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -bigClusterHub=kk9 \ -blastzOutRoot /cluster/bluearc/droSim1dp2 >& do.log & tail -f do.log rmdir /cluster/bluearc/droSim1dp2 ln -s blastz.dp2.2005-04-20 /cluster/data/droSim1/bed/blastz.dp2 # Add {chain,net}Dp2 to trackDb.ra if necessary. featureBits -chrom=chr2R droSim1 chainDp2Link #13268189 bases of 18190314 (72.941%) in intersection featureBits -chrom=chr4 droSim1 chainDp2Link #300051 bases of 807953 (37.137%) in intersection # BLASTZ/CHAIN/NET DP3 (DONE 4/21/05 angie) mkdir /cluster/data/droSim1/bed/blastz.dp3.2005-04-21 cd /cluster/data/droSim1/bed/blastz.dp3.2005-04-21 cat << '_EOF_' > DEF # D.simulans vs. D.pseudoobscura BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. simulans SEQ1_DIR=/iscratch/i/droSim1/droSim1.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/droSim1/chrom.sizes # QUERY - D. pseudoobscura SEQ2_DIR=/iscratch/i/dp3/nib SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dp3/chrom.sizes BASE=/cluster/data/droSim1/bed/blastz.dp3.2005-04-21 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /cluster/bluearc/droSim1dp3 >& do.log & tail -f do.log rmdir /cluster/bluearc/droSim1dp3 ln -s blastz.dp3.2005-04-21 /cluster/data/droSim1/bed/blastz.dp3 # Add {chain,net}Dp3 to trackDb.ra if necessary. featureBits -chrom=chr2R droSim1 chainDp3Link #13351459 bases of 18190314 (73.399%) in intersection featureBits -chrom=chr4 droSim1 chainDp3Link #314925 bases of 807953 (38.978%) in intersection # 4-FLY SIM-MEL-YAK-PSEUDO MULTIZ (DONE 4/22/05 angie) # Tree (4-way): # (((droSim1 dm2) droYak1) dp3) ssh kkstore01 mkdir /cluster/data/droSim1/bed/multiz4way.2005-04-21 ln -s /cluster/data/droSim1/bed/multiz4way.2005-04-21 \ /cluster/data/droSim1/bed/multiz4way cd /cluster/data/droSim1/bed/multiz4way # Setup: Copy pairwise MAF to /cluster/bluearc: mkdir /cluster/bluearc/flyMultiz4way foreach db (dm2 droYak1 dp3) cp -pR /cluster/data/droSim1/bed/blastz.$db/mafNet \ /cluster/bluearc/flyMultiz4way/$db end # Make output dir: mkdir maf # Create script to run multiz.v10 in the right order: cat << '_EOF_' > doMultiz.csh #!/bin/csh -fe set chr = $1 set tmp = /scratch/flyMultiz4way.$chr mkdir $tmp set REF = droSim1.$chr set MEL = /cluster/bluearc/flyMultiz4way/dm2/$chr.maf.gz set YAK = /cluster/bluearc/flyMultiz4way/droYak1/$chr.maf.gz set PSE = /cluster/bluearc/flyMultiz4way/dp3/$chr.maf.gz set DEST = /cluster/data/droSim1/bed/multiz4way/maf/$chr.maf set MZ10 = /cluster/bin/penn/multiz.v10 set PROJECT = /cluster/bin/penn/maf_project if ( -s $MEL && -s $YAK ) then echo "Aligning $MEL $YAK..." zcat $MEL > $tmp/$chr.Mel.maf zcat $YAK > $tmp/$chr.Yak.maf $MZ10 $tmp/$chr.Mel.maf $tmp/$chr.Yak.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.MelYak.maf else if ( -s $MEL ) then zcat $MEL > $tmp/$chr.MelYak.maf else if ( -s $YAK ) then zcat $YAK > $tmp/$chr.MelYak.maf endif if ( -s $PSE && -s $tmp/$chr.MelYak.maf ) then echo "Adding $PSE..." zcat $PSE > $tmp/$chr.Pse.maf $MZ10 $tmp/$chr.MelYak.maf $tmp/$chr.Pse.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.MelYakPse.maf mv $tmp/$chr.MelYakPse.maf $DEST else if ( -s $PSE ) then zcat $PSE > $DEST else if ( -s $tmp/$chr.MelYak.maf ) then cp $tmp/$chr.MelYak.maf $DEST endif rm $tmp/$chr.*.maf rmdir $tmp '_EOF_' # << keep emacs coloring happy chmod 755 doMultiz.csh awk '{print "./doMultiz.csh " $1;}' /cluster/data/droSim1/chrom.sizes \ > jobs.lst # Run on small cluster ssh kki cd /cluster/data/droSim1/bed/multiz4way para create jobs.lst para try, check, push, check, ... #Completed: 18 of 18 jobs #Average job time: 380s 6.34m 0.11h 0.00d #Longest finished job: 1735s 28.92m 0.48h 0.02d #Submission to last job: 1735s 28.92m 0.48h 0.02d # make /gbdb/ links to 4way maf files: ssh hgwdev mkdir -p /gbdb/droSim1/multiz4way/maf/multiz4way ln -s /cluster/data/droSim1/bed/multiz4way/maf/chr*.maf \ /gbdb/droSim1/multiz4way/maf/multiz4way/ # load into database cd /tmp hgLoadMaf -warn droSim1 multiz4way \ -pathPrefix=/gbdb/droSim1/multiz4way/maf/multiz4way # put 4way MAF out for download ssh kkstore01 cd /cluster/data/droSim1/bed/multiz4way mkdir mafDownload foreach f (maf/*.maf) nice gzip -c $f > mafDownload/$f:t.gz end cd mafDownload nice md5sum *.maf.gz > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/droSim1/multiz4way ln -s /cluster/data/droSim1/bed/multiz4way/mafDownload/{*.maf.gz,md5sum.txt} \ /usr/local/apache/htdocs/goldenPath/droSim1/multiz4way # make a README.txt # Cleanup rm -rf /cluster/bluearc/flyMultiz4way/ # PHASTCONS 4WAY WITH METHODS FROM PAPER (DONE 4/25/05 angie) # Same procedure as for latest dm1 4way -- using the param estimation # methods described in Adam's Genome Research paper. ssh kkstore01 mkdir -p /cluster/bluearc/droSim1/chrom cp -p /cluster/data/droSim1/?{,?}/chr*.fa /cluster/bluearc/droSim1/chrom/ # Split chrom fa & maf into smaller windows for phastCons: ssh kki mkdir /cluster/data/droSim1/bed/multiz4way/phastCons mkdir /cluster/data/droSim1/bed/multiz4way/phastCons/run.split cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.split set WINDOWS = /cluster/bluearc/droSim1/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/cluster/bluearc/droSim1/chrom set WINDOWS=/cluster/bluearc/droSim1/phastCons/WINDOWS set maf=$1 set c = $maf:t:r:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O droSim1,dm2,droYak1,dp3 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/droSim1/bed/multiz4way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para create jobList para try, check, push, check... #Completed: 18 of 18 jobs #Average job time: 15s 0.26m 0.00h 0.00d #Longest finished job: 49s 0.82m 0.01h 0.00d #Submission to last job: 49s 0.82m 0.01h 0.00d # Get genome-wide all-species average GC content # by running msa_view --aggregate --summary-only on the WINDOWS .SS files. # msa_view needs seekable single-ss file, so unpack each to /tmp and # compute overall average: # Note: for larger organisms, this trick can be used to take a random # sample of windows: # foreach f (`ls -1 /cluster/bluearc/droSim1/phastCons/WINDOWS/* \ # | randomLines stdin 50 stdout`) ssh kolossus cd /cluster/data/droSim1/bed/multiz4way/phastCons/ cp /dev/null gcs.txt foreach f (/cluster/bluearc/droSim1/phastCons/WINDOWS/*) zcat $f > /tmp/droSim1.sample.ss /cluster/bin/phast/msa_view \ --aggregate droSim1,dm2,droYak1,dp3 \ -i SS --summary-only /tmp/droSim1.sample.ss \ | awk '$1 == "[aggregate]" {printf "%0.4f\n", $3 + $4}' \ >> gcs.txt end ave gcs.txt #average 0.428524 # OK, so use 0.429 as the --gc parameter below. rm /tmp/droSim1.sample.ss ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use phyloFit to make an initial model: ssh kolossus cd /cluster/data/droSim1/bed/multiz4way/phastCons /cluster/bin/phast/msa_view ../maf/chr3R.maf \ --aggregate droSim1,dm2,droYak1,dp3 -i MAF -o SS \ > all.ss /cluster/bin/phast/phyloFit all.ss \ --tree "(((droSim1,dm2),droYak1),dp3)" \ -i SS --out-root starting-tree cat starting-tree.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #TRAINING_LNL: -74270041.694256 #BACKGROUND: 0.280086 0.220054 0.219835 0.280025 #RATE_MAT: # -0.934494 0.207662 0.469274 0.257559 # 0.264314 -1.083783 0.220083 0.599387 # 0.597889 0.220301 -1.081724 0.263534 # 0.257614 0.471019 0.206888 -0.935522 #TREE: (((droSim1:0.023894,dm2:0.030119):0.030350,droYak1:0.061782):0.147792,dp3:0.147792); # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. # Use the values of --target-coverage and --expected-lengths from the # last iteration of the dm2 8way run, 0.565 and 36. # phyloFit makes one .mod but consEntropy wants two... # Multiply each subst rate on the TREE line by 3.5 which is roughly the # ratio of noncons to cons in # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \ > starting-tree.noncons.mod /cluster/bin/phast/consEntropy --NH 9.7834 0.565 36 \ starting-tree*.mod #( Solving for new omega: 36.000000 37.779742 37.741115 ) #Transition parameters: gamma=0.565000, omega=36.000000, mu=0.027778, nu=0.036079 #Relative entropy: H=0.509558 bits/site #Required length: N=18.908610 sites #Total entropy: NH=9.635032 bits #Recommended expected length: omega=37.741115 sites (for NH=9.783400) # OK, stick with 0.565 but use --expected-lengths 37.75. ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.46 36.1 ave.{cons,noncons}.mod #Recommended expected length: omega=26.963371 sites (for NH=9.783400) # OK, --expected-lengths 27 # THIRD ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.40 27 ave.{cons,noncons}.mod #Recommended expected length: omega=21.874376 sites (for NH=9.783400) # ==> 21.9 # FOURTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.35 21.9 ave.{cons,noncons}.mod #Recommended expected length: omega=17.941014 sites (for NH=9.783400) # ==> 17.9 # FIFTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.357 18 ave.{cons,noncons}.mod #Recommended expected length: omega=18.561695 sites (for NH=9.783400) # ==> 18.6 # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. ssh kk9 mkdir /cluster/data/droSim1/bed/multiz4way/phastCons/run.estimate cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.estimate # FIRST ITERATION: Use ../starting-tree.mod: cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.noncons.mod --gc 0.429 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 37.75 --target-coverage 0.565 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.429 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 18.6 --target-coverage 0.357 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/cluster/bluearc/droSim1/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job para create jobList para try, check, push, check, ...# #Completed: 153 of 153 jobs #Average job time: 105s 1.76m 0.03h 0.00d #Longest finished job: 185s 3.08m 0.05h 0.00d #Submission to last job: 259s 4.32m 0.07h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore01 cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: (((droSim1:0.013566,dm2:0.015700):0.013732,droYak1:0.031231):0.077792,dp3:0.077792); #ave.noncons.mod:TREE: (((droSim1:0.061644,dm2:0.071348):0.064130,droYak1:0.145205):0.370123,dp3:0.370123); # SECOND ITERATION: #ave.cons.mod:TREE: (((droSim1:0.012146,dm2:0.014020):0.012179,droYak1:0.027991):0.068610,dp3:0.068610); #ave.noncons.mod:TREE: (((droSim1:0.058376,dm2:0.067404):0.060221,droYak1:0.137752):0.346037,dp3:0.346037); # THIRD ITERATION: #ave.cons.mod:TREE: (((droSim1:0.011226,dm2:0.012932):0.011189,droYak1:0.025859):0.062687,dp3:0.062687); #ave.noncons.mod:TREE: (((droSim1:0.056522,dm2:0.065153):0.058010,droYak1:0.133412):0.331867,dp3:0.331867); # FOURTH ITERATION: #ave.cons.mod:TREE: (((droSim1:0.010304,dm2:0.011845):0.010208,droYak1:0.023706):0.056800,dp3:0.056800); #ave.noncons.mod:TREE: (((droSim1:0.054861,dm2:0.063129):0.056035,droYak1:0.129452):0.318836,dp3:0.318836); # FIFTH ITERATION: #ave.cons.mod:TREE: (((droSim1:0.010464,dm2:0.012033):0.010379,droYak1:0.024080):0.057819,dp3:0.057819); #ave.noncons.mod:TREE: (((droSim1:0.055095,dm2:0.063418):0.056328,droYak1:0.130012):0.320744,dp3:0.320744); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Check the total entropy figure to see if we're way off. # We probably don't need to do this, since it has always been very close # if not the same as what we used above, but it only takes a second. # FIRST ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.565 37.75 ave.{cons,noncons}.mod #Recommended expected length: omega=37.562610 sites (for NH=9.783400) # OK, tweak --expected-lengths to 37.6 below (and for next iteration). # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.46 27 ave.{cons,noncons}.mod #Recommended expected length: omega=26.967450 sites (for NH=9.783400) # ==> keep at 27. # THIRD ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.40 21.9 ave.{cons,noncons}.mod #Recommended expected length: omega=21.904601 sites (for NH=9.783400) # ==> keep at 21.9. # FOURTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.35 17.9 ave.{cons,noncons}.mod #Recommended expected length: omega=18.018268 sites (for NH=9.783400) # ==> tweak to 18. # FIFTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.357 18.6 ave.{cons,noncons}.mod #Recommended expected length: omega=18.541147 sites (for NH=9.783400) # ==> tweak to 18.5. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. The we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh kk9 mkdir /cluster/data/droSim1/bed/multiz4way/phastCons/run.phast cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 18.5 --target-coverage 0.357 \ --quiet --seqname $chr --idpref $pref \ --viterbi /cluster/bluearc/droSim1/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /cluster/bluearc/droSim1/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /cluster/bluearc/droSim1/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /cluster/bluearc/droSim1/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/cluster/bluearc/droSim1/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 153 of 153 jobs #Average job time: 22s 0.37m 0.01h 0.00d #Longest finished job: 32s 0.53m 0.01h 0.00d #Submission to last job: 56s 0.93m 0.02h 0.00d # back on kkstore01: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/droSim1/bed/multiz4way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /cluster/bluearc/droSim1/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. # Make a table with just the D. mel. refGenes for measuring coverage # (turned out to be almost same as xenoRefGene anyway). hgsql droSim1 -e \ 'select id from organism where name like "Drosophila melanogaster%";' #| 352 | hgsql droSim1 -e \ 'create table dmRefGene select xenoRefGene.* \ from xenoRefGene,gbCdnaInfo \ where xenoRefGene.name = gbCdnaInfo.acc and gbCdnaInfo.organism = 352;' cd /cluster/data/droSim1/bed/multiz4way/phastCons featureBits -enrichment droSim1 dmRefGene:cds all.bed # FIRST ITERATION: too high; decrease --target-coverage, re-estimate. #xenoRefGene:cds 15.498%, all.bed 52.561%, both 12.532%, cover 80.86%, enrich 1.54x # # SECOND ITERATION: still too high: #xenoRefGene:cds 15.498%, all.bed 47.696%, both 11.771%, cover 75.95%, enrich 1.59x # THIRD ITERATION: still! #xenoRefGene:cds 15.498%, all.bed 44.505%, both 11.208%, cover 72.32%, enrich 1.62x #dmRefGene:cds 15.482%, all.bed 44.505%, both 11.194%, cover 72.30%, enrich 1.62x # FOURTH ITERATION: OK, bump back up just a bit. #dmRefGene:cds 15.482%, all.bed 41.363%, both 10.587%, cover 68.38%, enrich 1.65x # FIFTH ITERATION: Close enough. #dmRefGene:cds 15.482%, all.bed 41.860%, both 10.691%, cover 69.05%, enrich 1.65x # Having met the CDS coverage target, load up the results. hgLoadBed droSim1 phastConsElements4way all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/droSim1/bed/multiz4way/phastCons/run.wib cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.wib rm -rf /cluster/bluearc/droSim1/phastCons/wib mkdir -p /cluster/bluearc/droSim1/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /cluster/bluearc/droSim1/phastCons/wib zcat `ls -1 /cluster/bluearc/droSim1/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /cluster/bluearc/droSim1/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para create jobList para try, check, push, check, ... #Completed: 18 of 18 jobs #Average job time: 8s 0.12m 0.00h 0.00d #Longest finished job: 23s 0.38m 0.01h 0.00d #Submission to last job: 23s 0.38m 0.01h 0.00d # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/droSim1/bed/multiz4way/phastCons rm -rf wib POSTPROBS rsync -av /cluster/bluearc/droSim1/phastCons/wib . rsync -av /cluster/bluearc/droSim1/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/droSim1/multiz4way/wib cd /cluster/data/droSim1/bed/multiz4way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/droSim1/multiz4way/wib/ hgLoadWiggle droSim1 phastCons4way \ -pathPrefix=/gbdb/droSim1/multiz4way/wib wib/*.wig rm wiggle.tab # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/ds-top5000-4way \ "top 5000 conserved elements (4way)" droSim1 # and clean up bluearc. rm -r /cluster/bluearc/droSim1/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /cluster/bluearc/droSim1/chrom # RELOAD AGP (DONE 5/19/05 angie) # Many gap types that should have been labeled "contig" (type) were # labeled "fragment" in the original AGP. LaDeana sent corrected AGP # in an email attachment -- saved to # /cluster/data/droSim1/downloads/simulans_agp.tar.gz ssh kkstore01 cd /cluster/data/droSim1/downloads tar xvzf simulans_agp.tar.gz foreach f (*.agp) set chr = $f:r set c = `echo $chr | sed -e 's/^chr//; s/_random$//;'` mv ../$c/$f ../$c/$f.bak mv $f ../$c/$f end cd .. # check that line counts are the same, and that the files have no # differences if you ignore the gap type and bridged columns # (and extra whitespace @EOL.) foreach f (*/*.agp) set lc1 = `wc -l < $f` set lc2 = `wc -l < $f.bak` if ($lc1 != $lc2) then echo "LINE COUNT DIFFERENCE: $f $lc1 vs. $f.bak $lc2" else echo $f:t:r ok, $lc1 == $lc2 endif awk '$5 == "N" {$7 = $8 = "ignored";} {print;}' $f \ | perl -wpe 's/\s+$/\n/' > /tmp/1 awk '$5 == "N" {$7 = $8 = "ignored";} {print;}' $f.bak \ | perl -wpe 's/\s+$/\n/' > /tmp/2 set lcd = `diff /tmp/1 /tmp/2 | wc -l` if ($lcd) then echo "DIFFERENCE OTHER THAN GAP TYPE $f vs. $f.bak" else echo $f:t:r ok, no diff other than gap type endif end # run checkAgpAndFa foreach f (*/*.fa) echo -n "$f:t:r - " checkAgpAndFa $f:r.agp $f | tail -1 end # eyeball to make sure all AGP's have at least some contig gaps foreach f (*/*.agp) echo -n "$f:t:r contig gaps: " awk '$5 == "N" && $7 == "contig" {print $6;}' $f | sort -n | uniq -c end # eyeball distribution of fragment gap sizes too: awk '$5 == "N" && $7 == "fragment" {print $6;}' */*.agp | sort -n | uniq -c # Hmmm, 0 fragment gaps with size 1000, fewer than would expect by chance: # 8 997 # 1 998 # 4 999 # 3 1001 # 6 1002 # 5 1003 # super-paranoid: look for contig gaps in the middle of a run of # similarly-named contigs (implying same supercontig). foreach f (*/*.agp) echo $f perl -we ' \ while (<>) { \ @prevprevwords = @prevwords if (defined @prevwords); \ @prevwords = @words if (defined @words); \ chomp; @words = split; \ if (defined $prevwords[4] && $prevwords[4] eq "N" && \ $prevwords[6] eq "contig") { \ $lastsuper = $prevprevwords[5]; $lastsuper =~ s/\.\d+$//; \ $thissuper = $words[5]; $thissuper =~ s/\.\d+$//; \ if ($lastsuper eq $thissuper) { \ print "line $.: contig gap in $thissuper\n"; \ } \ } \ }' $f end #2R/chr2R.agp #line 3433: contig gap in W501_Contig9 #3L/chr3L.agp #line 3473: contig gap in W501_Contig4 #X/chrX.agp #line 4000: contig gap in W501_Contig112 #X/chrX_random.agp #line 3379: contig gap in W501_Contig763 # Got the go-ahead from LaDeana to edit those 4 cases to "fragment\tyes" # and she will do the same. Re-ran checks, everything looks good now. # Reload gap (and gold) tables from agp: ssh hgwdev cd /cluster/data/droSim1 hgGoldGapGl -noGl -chromLst=chrom.lst droSim1 /cluster/data/droSim1 . # MAKE Drosophila Proteins track (DONE 06-30-05 braney) ssh kk mkdir -p /cluster/data/droSim1/bed/tblastn.dm2FB cd /cluster/data/droSim1/bed/tblastn.dm2FB ls -1S /iscratch/i/droSim1/blastDb/*.nsq | sed "s/\.nsq//" > target.lst mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc target.lst | awk "{print \\\$1}"`\) #18929/(164630/320) = 36.793294 split -l 50 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir -p /cluster/bluearc/droSim1/bed/tblastn.dm2FB/blastOut ln -s /cluster/bluearc/droSim1/bed/tblastn.dm2FB/blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/droSim1/jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droSim1/jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 target.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/droSim1/bed/tblastn.dm2FB para create blastSpec para push # Completed: 121280 of 121280 jobs # CPU time in finished jobs: 879967s 14666.12m 244.44h 10.18d 0.028 y # IO & Wait Time: 463525s 7725.41m 128.76h 5.36d 0.015 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 84s 1.40m 0.02h 0.00d # Submission to last job: 16896s 281.60m 4.69h 0.20d ssh kki cd /cluster/data/droSim1/bed/tblastn.dm2FB tcsh cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec para push # Completed: 379 of 379 jobs # CPU time in finished jobs: 3593s 59.89m 1.00h 0.04d 0.000 y # IO & Wait Time: 9877s 164.61m 2.74h 0.11d 0.000 y # Average job time: 36s 0.59m 0.01h 0.00d # Longest finished job: 610s 10.17m 0.17h 0.01d # Submission to last job: 1585s 26.42m 0.44h 0.02d cd /cluster/data/droSim1/bed/tblastn.dm2FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/droSim1/bed/tblastn.dm2FB/blastDm2FB.psl cd .. ssh hgwdev cd /cluster/data/droSim1/bed/tblastn.dm2FB hgLoadPsl droSim1 blastDm2FB.psl exit # back to kksilo rm -rf blastOut # End tblastn # GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie) ssh hgwdev mkdir /cluster/data/droSim1/bed/geneid cd /cluster/data/droSim1/bed/geneid foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/D.simulans/golden_path_200504/geneidv1.2/$chr.gtf end ldHgGene -gtf -genePredExt droSim1 geneid *.gtf # BLASTZ/CHAIN/NET DROYAK2 (DONE 7/28/05 angie) mkdir /cluster/data/droSim1/bed/blastz.droYak2.2006-07-24 cd /cluster/data/droSim1/bed/blastz.droYak2.2006-07-24 cat << '_EOF_' > DEF # D.simulans vs. D.yakuba BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. simulans SEQ1_DIR=/san/sanvol1/scratch/droSim1/droSim1.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/droSim1/chrom.sizes # QUERY - D. yakuba SEQ2_DIR=/san/sanvol1/scratch/droYak2/droYak2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droYak2/chrom.sizes BASE=/cluster/data/droSim1/bed/blastz.droYak2.2006-07-24 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -blastzOutRoot /san/sanvol1/scratch/droSim1droYak2 >& do.log & tail -f do.log # The chain job for droSim1.chrU kept running out of mem, so I # made a separate .csh script that would chain chunks of chrU. # Some chains may be split because of this, but they're on chrU # anyway... ssh kkr6u00 cd /cluster/store8/droSim1/bed/blastz.droYak2.2006-07-24/axtChain/run para time > run.time cat run.time >>& ../../do.log cat > chain_chrU.csh <<'_EOF_' #!/bin/csh -efx foreach f (../../pslParts/$1*.psl.gz) zcat $f \ | axtChain -psl -verbose=0 -scoreScheme=/cluster/data/blastz/HoxD55.q -linearGap=loose stdin \ /san/sanvol1/scratch/droSim1/droSim1.2bit \ /san/sanvol1/scratch/droYak2/droYak2.2bit \ stdout \ | chainAntiRepeat /san/sanvol1/scratch/droSim1/droSim1.2bit \ /san/sanvol1/scratch/droYak2/droYak2.2bit \ stdin $f:t:r:r.chain end chainMergeSort $1*.chain > $2 '_EOF_' # << emacs ./chain_chrU.csh droSim1.2bit:chrU: chain/droSim1.2bit:chrU:.chain \ >& chain_chrU.log & tail -f chain_chrU.log cat chain_chrU.log >> ../../do.log cd ../.. doBlastzChainNet.pl DEF -continue chainMerge -workhorse=kkr6u00 \ >>& do.log & tail -f do.log ln -s blastz.droYak2.2006-07-24 /cluster/data/droSim1/bed/blastz.droYak2 featureBits -chrom=chr2R droSim1 chainDroYak2Link #16933107 bases of 18190314 (93.089%) in intersection featureBits -chrom=chr2R droSim1 chainDm2Link #17421444 bases of 18190314 (95.773%) in intersection featureBits -chrom=chr4 droSim1 chainDroYak2Link #635094 bases of 807953 (78.605%) in intersection featureBits -chrom=chr4 droSim1 chainDm2Link #696892 bases of 807953 (86.254%) in intersection # RECIPROCAL BEST DROSIM1-DROYAK2 (IN PROGRESS 8/1/06 angie) # Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago) ssh kolossus cd /cluster/data/droSim1/bed/blastz.droYak2/axtChain # Swap droSim1-best chains to be droYak2-referenced: chainStitchId droSim1.droYak2.over.chain.gz stdout \ | chainSwap stdin stdout \ | chainSort stdin droYak2.droSim1.tBest.chain # Net those on droYak2 to get droYak2-ref'd reciprocal best net: chainPreNet droYak2.droSim1.tBest.chain \ /cluster/data/{droYak2,droSim1}/chrom.sizes stdout \ | chainNet -minSpace=1 stdin /cluster/data/{droYak2,droSim1}/chrom.sizes \ stdout /dev/null \ | netSyntenic stdin droYak2.droSim1.rbest.net # Extract droYak2-ref'd reciprocal best chain: netChainSubset droYak2.droSim1.rbest.net droYak2.droSim1.tBest.chain stdout \ | chainStitchId stdin droYak2.droSim1.rbest.chain # Swap to get droSim1-ref'd reciprocal best chain: chainSwap droYak2.droSim1.rbest.chain stdout \ | chainSort stdin droSim1.droYak2.rbest.chain # Net those on droSim1 to get droSim1-ref'd reciprocal best net: chainPreNet droSim1.droYak2.rbest.chain \ /cluster/data/{droSim1,droYak2}/chrom.sizes stdout \ | chainNet -minSpace=1 -minScore=0 \ stdin /cluster/data/{droSim1,droYak2}/chrom.sizes stdout /dev/null \ | netSyntenic stdin droSim1.droYak2.rbest.net # Compress and clean up. rm droYak2.droSim1.tBest.chain gzip *.rbest.* md5sum *.rbest.*.gz > md5sum.rbest.txt # Test coverage of *.rbest.* -- should be equal. netToBed -maxGap=1 droYak2.droSim1.rbest.net.gz droYak2.droSim1.rbest.net.bed netToBed -maxGap=1 droSim1.droYak2.rbest.net.gz droSim1.droYak2.rbest.net.bed chainToPsl droYak2.droSim1.rbest.chain.gz \ /cluster/data/{droYak2,droSim1}/chrom.sizes \ /cluster/data/droYak2/droYak2.2bit /cluster/data/droSim1/droSim1.2bit \ droYak2.droSim1.rbest.chain.psl chainToPsl droSim1.droYak2.rbest.chain.gz \ /cluster/data/{droSim1,droYak2}/chrom.sizes \ /cluster/data/droSim1/droSim1.2bit /cluster/data/droYak2/droYak2.2bit \ droSim1.droYak2.rbest.chain.psl ssh hgwdev cd /cluster/data/droSim1/bed/blastz.droYak2/axtChain featureBits droYak2 droYak2.droSim1.rbest.net.bed #101989543 bases of 162681153 (62.693%) in intersection featureBits droYak2 droYak2.droSim1.rbest.chain.psl #101989543 bases of 162681153 (62.693%) in intersection featureBits droSim1 droSim1.droYak2.rbest.chain.psl #101989543 bases of 127256433 (80.145%) in intersection featureBits droSim1 droSim1.droYak2.rbest.net.bed #101989543 bases of 127256433 (80.145%) in intersection mkdir experiments mv *.bed *.psl experiments # Put out for download (hgwdev-only) ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/droSim1/vsDroYak2/reciprocalBest cd /usr/local/apache/htdocs/goldenPath/droSim1/vsDroYak2/reciprocalBest ln -s /cluster/data/droSim1/bed/blastz.droYak2/axtChain/*.rbest.*.gz . ln -s /cluster/data/droSim1/bed/blastz.droYak2/axtChain/md5sum.rbest.txt md5sum.txt # Make a README.txt. ########################################################################### # SWAP/CHAIN/NET DM3 (DONE 6/8/07 angie) ssh kkstore02 mkdir /cluster/data/droSim1/bed/blastz.dm3.swap cd /cluster/data/droSim1/bed/blastz.dm3.swap doBlastzChainNet.pl -swap /cluster/data/dm3/bed/blastz.droSim1/DEF >& do.log & tail -f do.log ln -s blastz.dm3.swap /cluster/data/droSim1/bed/blastz.dm3