# for emacs: -*- mode: sh; -*- # Drosophila melanogaster -- BDGP Release 5.0 (Apr. 2006) ######################################################################### # DOWNLOAD SEQUENCE (DONE 7/7/06 angie) ssh kkstore05 mkdir /cluster/store12/dm3 ln -s /cluster/store12/dm3 /cluster/data/dm3 mkdir /cluster/data/dm3/downloads cd /cluster/data/dm3/downloads wget ftp://ftp.fruitfly.org/pub/download/compressed/dmel_release5-2006-04-16.tgz tar xvzf dmel_release5-2006-04-16.tgz cd Dmel_Release5 faSize na* #168717020 bases (6368725 N's 162348295 real 162348295 upper 0 lower) in 14 sequences in 14 files #Total size: mean 12051215.7 sd 11751902.6 min 204112 (XHet) max 29004656 (ArmUextra) median 10049037 # Follow FlyBase's lead on the chromosome names, but still use our # "chr" prefix: mkdir ../renamedChromFa foreach f (na_*.dmel.RELEASE5) set chr = `echo $f:r:r | sed -r -e 's/^na_(arm)?/chr/;'` echo $chr if (`grep '^>' $f | wc -l` != 1) then echo $f does not have exactly one fasta record\! break endif sed -e 's/^>.*/>'$chr'/' $f > ../renamedChromFa/$chr.fa end cd ../renamedChromFa/ faSize * #168717020 bases (6368725 N's 162348295 real 162348295 upper 0 lower) in 14 sequences in 14 files #Total size: mean 12051215.7 sd 11751902.6 min 204112 (chrXHet) max 29004656 (chrUextra) median 10049037 ######################################################################### # MAKE GENOME DB (DONE 7/10/06 angie) ssh kkstore05 cd /cluster/data/dm3 cat > dm3.config.ra <& makeGenomeDb.log & tail -f makeGenomeDb.log ######################################################################### # REPEATMASKER (DONE 7/31/06 angie - REDONE 6/20/07) # 6/19/07 -- Debugged the coverage drop w/Robert Hubley. It was due to # version lag between the NCBI taxonomy label for the Drosophila genus # vs. the label used in the Library file. I fixed the local lib file, # and Robert said it will be fixed in the next released lib file. ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: ~/kent/src/hg/utils/automation/doRepeatMasker.pl dm3 -verbose 3 -debug # Run it for real and tail the log: ~/kent/src/hg/utils/automation/doRepeatMasker.pl dm3 -verbose 3 \ >& /cluster/data/dm3/bed/RepeatMasker.2007-06-20/do.log & tail -f /cluster/data/dm3/bed/RepeatMasker.2007-06-20/do.log # RepeatMasker and lib version from do.log: # May 17 2007 (open-3-1-8) version of RepeatMasker #grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl #CC RELEASE 20061006; * # Compare coverage to previous assembly: featureBits dm3 rmsk #44719009 bases of 162367812 (27.542%) in intersection featureBits dm2 rmsk #16178239 bases of 131698467 (12.284%) in intersection # Wow! What an increase... largely due to the new chrUextra: featureBits dm3 rmsk -chrom=chrUextra #20898714 bases of 25541756 (81.822%) in intersection featureBits dm3 rmsk -chrom=chr3R #1544878 bases of 27905053 (5.536%) in intersection featureBits dm2 rmsk -chrom=chr3R #1538239 bases of 27905053 (5.512%) in intersection ######################################################################### # SIMPLE REPEATS (TRF) (DONE 8/1/06 angie) ssh kolossus nice tcsh mkdir /cluster/data/dm3/bed/simpleRepeat cd /cluster/data/dm3/bed/simpleRepeat twoBitToFa ../../dm3.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp \ >& trf.log & tail -f trf.log # Only about 20 minutes, but dm3 is only 168Mbp... # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed splitFileByColumn trfMask.bed trfMaskChrom # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed dm3 simpleRepeat \ /cluster/data/dm3/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Compare coverage to previous assembly: featureBits dm3 simpleRepeat #6837416 bases of 162367812 (4.211%) in intersection featureBits dm2 simpleRepeat #1597029 bases of 131698467 (1.213%) in intersection # Looks like the increase is due to chrUextra: featureBits dm3 simpleRepeat -chrom=chrUextra #3871170 bases of 25541756 (15.156%) in intersection featureBits dm3 simpleRepeat -chrom=chr2L #225239 bases of 23011344 (0.979%) in intersection ######################################################################### # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 8/1/06 angie - REDONE 6/21/07) ssh kolossus cd /cluster/data/dm3 time twoBitMask dm3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dm3.2bit # This warning is OK -- the extra fields are not block coordinates. #Warning: BED file bed/simpleRepeat/trfMask.bed has 10 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks). #0.136u 0.272s 0:01.98 20.2% 0+0k 0+0io 3pf+0w # Link to it from /gbdb: ssh hgwdev ln -s /cluster/data/dm3/dm3.2bit /gbdb/dm3/dm3.2bit ######################################################################### # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 8/4/06 angie - REDONE 6/21/07) cd /cluster/data/dm3 makeDownloads.pl dm3 -verbose=2 >& jkStuff/downloads.log & tail -f jkStuff/downloads.log # Edit README.txt files when done: # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/dm3/goldenPath/database/README.txt # /cluster/data/dm3/goldenPath/bigZips/README.txt # /cluster/data/dm3/goldenPath/chromosomes/README.txt # 6/21/07 -- since I'm regenerating these after the GenBank tables are # in place... our bigZips/upstream* files are from FlyBase Genes -- # remove the refseq versions generated by the script and restore # the FlyBase upstream: ssh hgwdev rm /cluster/data/dm3/goldenPath/bigZips/upstream* rm /usr/local/apache/htdocs/goldenPath/dm3/bigZips/upstream* ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/up* \ /usr/local/apache/htdocs/goldenPath/dm3/bigZips/ cat /cluster/data/dm3/bed/flybase5.1/bigZips/md5sum.txt \ >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt # Edited out the old up*.gz sums. ######################################################################### # BLASTZ/CHAIN/NET DP3 (DONE 8/14/06 angie) mkdir /cluster/data/dm3/bed/blastz.dp3.2006-08-14 cd /cluster/data/dm3/bed/blastz.dp3.2006-08-14 cat << '_EOF_' > DEF # D. melanogaster vs. D. pseudoobscura BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm3/dm3.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. pseudoobscura SEQ2_DIR=/iscratch/i/dp3/nib SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dp3/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.dp3.2006-08-14 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /cluster/bluearc/dm3dp3 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L chainDp3Link #17523073 bases of 23011344 (76.150%) in intersection #dm2: 74.901% ln -s blastz.dp3.2006-08-14 /cluster/data/dm3/bed/blastz.dp3 ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE 8/18/06 angie - REDONE 6/19/07) # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is ~4.4% of human judging by gapless dm1 genome size from # featureBits -- we would use 45, but bump that up a bit to be more # conservative). ssh kolossus blat /cluster/data/dm3/dm3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/dm3/11.ooc -repMatch=100 #Wrote 7657 overused 11-mers to /cluster/bluearc/dm3/11.ooc # More than twice as many as dm2... probably due to chrUextra. # 6/19/07 -- That seems to be adversely affecting genbank alignments, # so bump up repMatch to get a .ooc count similar to dm2's 3031. # repMatch #overused # 100 7657 # 150 3709 # 165 3152 <-- close enough # 180 2688 # 200 2242 blat /cluster/data/dm3/dm3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/dm3/11.scaled.ooc -repMatch=165 #Wrote 3152 overused 11-mers to /cluster/bluearc/dm3/11.scaled.ooc # MarkD did a test run, results look good, so I'll replace the # original .ooc. Also, Mark requested moving the ooc to the same # dir as the genbank .2bit (iscratch). mv /cluster/bluearc/dm3/11.scaled.ooc /cluster/bluearc/dm3/11.ooc foreach i (1 2 3 4 6 7) rsync -av /cluster/bluearc/dm3/11.ooc kkr${i}u00:/iscratch/i/dm3/ end # Edited makeDb/genbank/etc/genbank.conf to specify new ooc location. ######################################################################### # GENBANK AUTO UPDATE (DONE 10/9/06 angie - REDONE 6/20/07 markd) # 6/20/07 - MarkD rebuilt with regenerated 11.ooc. # Make a liftAll.lft that specifies 5M chunks for genbank: ssh kkstore05 cd /cluster/data/dm3 ~/kent/src/utils/simplePartition.pl dm3.2bit 5000000 /tmp/dm3P cat /tmp/dm3P/*/*.lft > jkStuff/liftAll.lft rm -r /tmp/dm3P # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add dm3 just after dm2... # dm3 (D. melanogaster) dm3.serverGenome = /cluster/data/dm3/dm3.2bit dm3.clusterGenome = /iscratch/i/dm3/dm3.2bit dm3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} dm3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} dm3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} dm3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} dm3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} dm3.ooc = /iscratch/i/dm3/11.ooc dm3.lift = /cluster/data/dm3/jkStuff/liftAll.lft dm3.genbank.mrna.xeno.load = yes dm3.downloadDir = dm3 cvs ci -m "Added dm3." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial dm3 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad dm3 & # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add dm3 to: etc/align.dbs etc/hgwdev.dbs cvs commit make etc-update # 6/20/07 rebuild by MarkD (realign and reload native cDNAs using new # ooc file): ssh kkstore02 cd /cluster/data/genbank rm -f data/aligned/genbank.159.0/dm3/*/*native* rm -f data/aligned/refseq.23/dm3/*/*native* ./bin/gbAlignStep -initial -orgCat=native dm3 ./bin/gbDbLoadStep -drop -initialLoad dm3 ######################################################################### # BACEND PAIRS TRACK (DONE 8/18/06 angie) ssh kkstore05 # Plenty of room at the moment: df -h /cluster/data/dm3 # 4.5T 2.1T 2.4T 47% /cluster/store12 mkdir -p /cluster/data/dm3/bed/cloneend/ncbi cd /cluster/data/dm3/bed/cloneend/ncbi wget --timestamping \ ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/drosophila_melanogaster/\* # Wow... the directory exists, but contains no files. # Use the libs that Roger Hoskins pointed us to for dm2. mkdir /cluster/data/dm3/bed/bacends cd /cluster/data/dm3/bed/bacends wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM faSize * #40016446 bases (3778317 N's 36238129 real 33260718 upper 2977391 lower) in 41500 sequences in 2 files #Total size: mean 964.3 sd 202.7 min 1 (AL0AA030DD12A1) max 1373 (AM0AA015AE07BP1) median 1009 #N count: mean 91.0 sd 80.8 #U count: mean 801.5 sd 207.4 #L count: mean 71.7 sd 45.5 cat banque_Projet_A* > bacends.fa # Edit the part out of bacends.fa. # In dm2, we parsed bacEndPairs.txt out of sequence headers. # The sequences have not changed since then, so just copy over # bacEndPairs.txt. cp /cluster/data/dm2/bed/bacends/bacEndPairs.txt . # Fly is much smaller than human so we can get away with one job # per chrom, no splitting... ssh kki mkdir /cluster/bluearc/dm3/bacends cd /cluster/bluearc/dm3/bacends cp /cluster/data/dm3/bed/bacends/bacends.fa . mkdir out echo bacends.fa > bacends.lst awk '{print "/iscratch/i/dm3/dm3.2bit:" $1}' /cluster/data/dm3/chrom.sizes \ > dm3.lst cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/blat -noHead $(path1) $(path2) -ooc=/cluster/bluearc/dm3/11.ooc {check out exists out/$(file1).$(root2).psl} #ENDLOOP 'EOF' # << for emacs gensub2 dm3.lst bacends.lst gsub jobList para make jobList para time #Completed: 15 of 15 jobs #CPU time in finished jobs: 1721s 28.68m 0.48h 0.02d 0.000 y #IO & Wait Time: 23s 0.39m 0.01h 0.00d 0.000 y #Average job time: 116s 1.94m 0.03h 0.00d #Longest finished job: 744s 12.40m 0.21h 0.01d #Submission to last job: 744s 12.40m 0.21h 0.01d # Back on kolossus: cd /cluster/bluearc/dm3/bacends cat out/*.psl \ | pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ stdin bacEnds.unpaired.psl /dev/null # Roger's suggested 50kb-250kb range lost some (especially on the short # side) in dm2, so I ended up going with Terry's range for human, 35kb-350kb: # NOTE FOR NEXT TIME: Use Roger's range -- physical sizes don't exceed 250kb! pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \ -slopval=10000 -hardMax=500000 \ -slop -short -long -orphan -mismatch -verbose \ bacEnds.unpaired.psl /cluster/data/dm3/bed/bacends/bacEndPairs.txt \ all_bacends bacEnds wc -l bacEnds.* # 17 bacEnds.long # 96 bacEnds.mismatch # 6935 bacEnds.orphan # 11957 bacEnds.pairs # 28 bacEnds.short # 120 bacEnds.slop # 186593 bacEnds.unpaired.psl # Filter by score and sort by {chrom,chromStart}: awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed cat bacEnds.{slop,short,long,mismatch,orphan} \ | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed wc -l *.bed # 11938 bacEndPairs.bed # 4824 bacEndPairsBad.bed # Filter the alignments into those we'll need in the all_bacends table: extractPslLoad -noBin bacEnds.unpaired.psl \ bacEndPairs.bed bacEndPairsBad.bed \ | sorttbl tname tstart | headchg -del \ > bacEnds.load.psl mv * /cluster/data/dm3/bed/bacends/ # load into database ssh hgwdev cd /cluster/data/dm3/bed/bacends # load BAC end sequences mkdir -p /gbdb/dm3/bacends ln -s /cluster/data/dm3/bed/bacends/bacends.fa /gbdb/dm3/bacends/ hgLoadSeq dm3 /gbdb/dm3/bacends/bacends.fa hgLoadBed dm3 bacEndPairs bacEndPairs.bed \ -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed dm3 bacEndPairsBad bacEndPairsBad.bed \ -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql hgLoadPsl dm3 -table=all_bacends bacEnds.load.psl # Compares favorably to previous release? (good cov up, bad cov down): # Well, this is way up: featureBits dm3 all_bacends #32765057 bases of 162367812 (20.180%) in intersection featureBits dm2 all_bacends #22221379 bases of 131698467 (16.873%) in intersection # The percentage dropped but I think that's because of the increase in # assembly size (yep, once again blame it on chrUextra :). The absolute # count is up which seems good. featureBits dm3 bacEndPairs #15484492 bases of 162367812 (9.537%) in intersection featureBits dm2 bacEndPairs #14771250 bases of 131698467 (11.216%) in intersection # Clean up. rm bed.tab rm -r out rm -r /cluster/bluearc/dm3/bacends/ # end BACEND PAIRS TRACK ######################################################################### # GC5BASE (DONE 8/18/06 angie) # In the future this will be included in the makeGenomeDb.pl run. # Make gc5Base wiggle files. ssh kolossus mkdir -p /cluster/data/dm3/bed/gc5Base cd /cluster/data/dm3/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 dm3 \ /cluster/data/dm3/dm3.unmasked.2bit \ | wigEncode stdin gc5Base.{wig,wib} ssh hgwdev # Load gc5base mkdir -p /gbdb/dm3/wib rm -f /gbdb/dm3/wib/gc5Base.wib ln -s /cluster/data/dm3/bed/gc5Base.wib /gbdb/dm3/wib hgLoadWiggle -pathPrefix=/gbdb/dm3/wib \ dm3 gc5Base /cluster/data/dm3/bed/gc5Base/gc5Base.wig ######################################################################### # BLASTZ/CHAIN/NET DROSIM1 (DONE 8/18/06 angie) mkdir /cluster/data/dm3/bed/blastz.droSim1.2006-08-18 cd /cluster/data/dm3/bed/blastz.droSim1.2006-08-18 cat << '_EOF_' > DEF # D. melanogaster vs. D. simulans BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. simulans SEQ2_DIR=/san/sanvol1/scratch/droSim1/droSim1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droSim1/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droSim1.2006-08-18 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/dm3droSim1 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L chainDroSim1Link #21667700 bases of 23011344 (94.161%) in intersection # dm2: 91.756% rm -f /cluster/data/dm3/bed/blastz.droSim1 ln -s blastz.droSim1.2006-08-18 /cluster/data/dm3/bed/blastz.droSim1 ######################################################################### # BLASTZ/CHAIN/NET DROYAK2 (DONE 8/18/06 angie) mkdir /cluster/data/dm3/bed/blastz.droYak2.2006-08-18 cd /cluster/data/dm3/bed/blastz.droYak2.2006-08-18 cat << '_EOF_' > DEF # D. melanogaster vs. D. yakuba BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. yakuba SEQ2_DIR=/san/sanvol1/scratch/droYak2/droYak2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droYak2/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droYak2.2006-08-18 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/dm3droYak2 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L chainDroYak2Link #21965072 bases of 23011344 (95.453%) in intersection # dm2: 91.943% rm -f /cluster/data/dm3/bed/blastz.droYak2 ln -s blastz.droYak2.2006-08-18 /cluster/data/dm3/bed/blastz.droYak2 ######################################################################### # BLASTZ/CHAIN/NET DROSEC1 (DONE 8/21/06 angie) mkdir /cluster/data/dm3/bed/blastz.droSec1.2006-08-18 cd /cluster/data/dm3/bed/blastz.droSec1.2006-08-18 cat << '_EOF_' > DEF # D. melanogaster vs. D. sechellia BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. sechellia SEQ2_DIR=/san/sanvol1/scratch/droSec1/droSec1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droSec1/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droSec1.2006-08-18 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/dm3droSec1 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L chainDroSec1Link #21913800 bases of 23011344 (95.230%) in intersection # dm2: 92.860% rm -f /cluster/data/dm3/bed/blastz.droSec1 ln -s blastz.droSec1.2006-08-18 /cluster/data/dm3/bed/blastz.droSec1 ######################################################################### # BLASTZ/CHAIN/NET DROERE1 (DONE 8/22/06 angie) mkdir /cluster/data/dm3/bed/blastz.droEre1.2006-08-21 cd /cluster/data/dm3/bed/blastz.droEre1.2006-08-21 cat << '_EOF_' > DEF # D. melanogaster vs. D. sechellia BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. sechellia SEQ2_DIR=/san/sanvol1/scratch/droEre1/droEre1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droEre1/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droEre1.2006-08-21 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/dm3droEre1 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L chainDroEre1Link #21706597 bases of 23011344 (94.330%) in intersection # dm2: 91.103% rm -f /cluster/data/dm3/bed/blastz.droEre1 ln -s blastz.droEre1.2006-08-21 /cluster/data/dm3/bed/blastz.droEre1 # BLASTZ/CHAIN/NET DROANA2 (DONE 8/22/06 angie) mkdir /cluster/data/dm3/bed/blastz.droAna2.2006-08-22 cd /cluster/data/dm3/bed/blastz.droAna2.2006-08-22 cat << '_EOF_' > DEF # D. melanogaster vs. D. ananassae BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. ananassae SEQ2_DIR=/san/sanvol1/scratch/droAna2/droAna2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droAna2/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droAna2.2006-08-22 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -bigClusterHub pk -smallClusterHub pk \ -blastzOutRoot /san/sanvol1/scratch/dm3droAna2 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L chainDroAna2Link #19329381 bases of 23011344 (83.999%) in intersection #dm3: 81.385% rm -f /cluster/data/dm3/bed/blastz.droAna2 ln -s blastz.droAna2.2006-08-22 /cluster/data/dm3/bed/blastz.droAna2 ######################################################################### # LIFTOVER FLYBASE 4.2 ANNOTATIONS FROM DM1 (TEMPORARY) (DONE 8/18/06 angie) # Until FlyBase releases Release 5 annotations, liftOver'd 4.2 genes # will be better than nothing. ssh hgwdev mkdir /cluster/data/dm3/bed/flybase4.2.liftOver cd /cluster/data/dm3/bed/flybase4.2.liftOver hgsql dm2 -N -e 'select * from flyBaseGene' > dm2.flyBaseGene.tab hgsql dm2 -N -e 'select * from flyBaseNoncoding' > dm2.flyBaseNoncoding.tab liftOver -genePred dm2.flyBaseGene.tab \ /cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \ flyBaseLiftGene.tab flyBaseLiftGene.unmapped.tab liftOver -bedPlus=12 -hasBin dm2.flyBaseNoncoding.tab \ /cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \ flyBaseLiftNoncoding.tab flyBaseLiftNoncoding.unmapped.tab # Not quite perfect but not bad: wc -l *.tab # 19178 dm2.flyBaseGene.tab # 727 dm2.flyBaseNoncoding.tab # 19173 flyBaseLiftGene.tab # 10 flyBaseLiftGene.unmapped.tab # 727 flyBaseLiftNoncoding.tab # 0 flyBaseLiftNoncoding.unmapped.tab ldHgGene -predTab dm3 flyBaseLiftGene flyBaseLiftGene.tab hgLoadBed -hasBin dm3 flyBaseLiftNoncoding flyBaseLiftNoncoding.tab # Copy pep table from dm2: hgPepPred dm3 tab flyBaseLiftGenePep \ /cluster/data/dm2/bed/flybase4.2/flyBasePep.tab ########################################################################### # BLASTZ/CHAIN/NET DROSIM1 (DONE 11/14/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droSim1.2006-11-14 cd /cluster/data/dm3/bed/blastz.droSim1.2006-11-14 cat << '_EOF_' > DEF # D. melanogaster vs. D. simulans BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. simulans SEQ2_DIR=/san/sanvol1/scratch/droSim1/droSim1.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droSim1/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droSim1.2006-11-14 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droSim1 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroSim1Link #flyBaseLiftGene 22.721%, chainDroSim1Link 94.107%, both 22.094%, cover 97.24%, enrich 1.03x rm -f /cluster/data/dm3/bed/blastz.droSim1 ln -s blastz.droSim1.2006-11-14 /cluster/data/dm3/bed/blastz.droSim1 ########################################################################### # BLASTZ/CHAIN/NET DROSEC1 (DONE 11/29/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droSec1.2006-11-28 cd /cluster/data/dm3/bed/blastz.droSec1.2006-11-28 cat << '_EOF_' > DEF # D. melanogaster vs. D. sechellia BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. sechellia SEQ2_DIR=/san/sanvol1/scratch/droSec1/droSec1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/san/sanvol1/scratch/droSec1/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droSec1.2006-11-28 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droSec1 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroSec1Link #flyBaseLiftGene 22.721%, chainDroSec1Link 95.197%, both 22.491%, cover 98.99%, enrich 1.04x rm -f /cluster/data/dm3/bed/blastz.droSec1 ln -s blastz.droSec1.2006-11-28 /cluster/data/dm3/bed/blastz.droSec1 ########################################################################### # BLASTZ/CHAIN/NET DROYAK2 (DONE 11/15/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droYak2.2006-11-15 cd /cluster/data/dm3/bed/blastz.droYak2.2006-11-15 cat << '_EOF_' > DEF # D. melanogaster vs. D. yakuba BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. yakuba SEQ2_DIR=/san/sanvol1/scratch/droYak2/droYak2.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droYak2/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droYak2.2006-11-15 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droYak2 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroYak2Link #flyBaseLiftGene 22.721%, chainDroYak2Link 95.403%, both 22.489%, cover 98.98%, enrich 1.04x rm -f /cluster/data/dm3/bed/blastz.droYak2 ln -s blastz.droYak2.2006-11-15 /cluster/data/dm3/bed/blastz.droYak2 ########################################################################### # BLASTZ/CHAIN/NET DROPER1 (DONE 11/20/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droPer1.2006-11-15 cd /cluster/data/dm3/bed/blastz.droPer1.2006-11-15 cat << '_EOF_' > DEF # D. melanogaster vs. D. persimilis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. persimilis SEQ2_DIR=/san/sanvol1/scratch/droPer1/droPer1.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/san/sanvol1/scratch/droPer1/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droPer1.2006-11-15 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droPer1 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroPer1Link #flyBaseLiftGene 22.721%, chainDroPer1Link 76.570%, both 20.929%, cover 92.11%, enrich 1.20x rm -f /cluster/data/dm3/bed/blastz.droPer1 ln -s blastz.droPer1.2006-11-15 /cluster/data/dm3/bed/blastz.droPer1 ########################################################################### # BLASTZ/CHAIN/NET DROERE2 (DONE 11/30/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droEre2.2006-11-29 cd /cluster/data/dm3/bed/blastz.droEre2.2006-11-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. erecta BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. erecta SEQ2_DIR=/san/sanvol1/scratch/droEre2/droEre2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droEre2/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droEre2.2006-11-29 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -smallClusterHub pk \ -blastzOutRoot /cluster/bluearc/dm3droEre2 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroEre2Link #flyBaseLiftGene 22.721%, chainDroEre2Link 94.227%, both 22.472%, cover 98.90%, enrich 1.05x rm -f /cluster/data/dm3/bed/blastz.droEre2 ln -s blastz.droEre2.2006-11-29 /cluster/data/dm3/bed/blastz.droEre2 ########################################################################### # BLASTZ/CHAIN/NET DROGRI2 (DONE 12/1/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droGri2.2006-11-30 cd /cluster/data/dm3/bed/blastz.droGri2.2006-11-30 cat << '_EOF_' > DEF # D. melanogaster vs. D. grimshawi BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=50000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. grimshawi SEQ2_DIR=/san/sanvol1/scratch/droGri2/droGri2.2bit SEQ2_CHUNK=50000000 SEQ2_LAP=10000 SEQ2_LEN=/san/sanvol1/scratch/droGri2/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droGri2.2006-11-30 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -smallClusterHub pk \ -blastzOutRoot /cluster/bluearc/dm3droGri2 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroGri2Link #flyBaseLiftGene 22.721%, chainDroGri2Link 66.170%, both 20.274%, cover 89.23%, enrich 1.35x rm -f /cluster/data/dm3/bed/blastz.droGri2 ln -s blastz.droGri2.2006-11-30 /cluster/data/dm3/bed/blastz.droGri2 ########################################################################### # BLASTZ/CHAIN/NET DROWIL1 (DONE 12/4/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droWil1.2006-12-04 cd /cluster/data/dm3/bed/blastz.droWil1.2006-12-04 cat << '_EOF_' > DEF # D. melanogaster vs. D. willistoni BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. willistoni SEQ2_DIR=/san/sanvol1/scratch/droWil1/droWil1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/san/sanvol1/scratch/droWil1/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droWil1.2006-12-04 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -smallClusterHub pk \ -blastzOutRoot /cluster/bluearc/dm3droWil1 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroWil1Link #flyBaseLiftGene 22.721%, chainDroWil1Link 73.133%, both 20.583%, cover 90.59%, enrich 1.24x rm -f /cluster/data/dm3/bed/blastz.droWil1 ln -s blastz.droWil1.2006-12-04 /cluster/data/dm3/bed/blastz.droWil1 ########################################################################### # BLASTZ/CHAIN/NET DROANA3 (DONE 12/4/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droAna3.2006-12-04 cd /cluster/data/dm3/bed/blastz.droAna3.2006-12-04 cat << '_EOF_' > DEF # D. melanogaster vs. D. ananassae BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. ananassae SEQ2_DIR=/san/sanvol1/scratch/droAna3/droAna3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/san/sanvol1/scratch/droAna3/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droAna3.2006-12-04 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -smallClusterHub pk \ -blastzOutRoot /cluster/bluearc/dm3droAna3 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroAna3Link #flyBaseLiftGene 22.721%, chainDroAna3Link 83.841%, both 21.651%, cover 95.29%, enrich 1.14x rm -f /cluster/data/dm3/bed/blastz.droAna3 ln -s blastz.droAna3.2006-12-04 /cluster/data/dm3/bed/blastz.droAna3 ########################################################################### # BLASTZ/CHAIN/NET DROMOJ3 (DONE 12/4/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droMoj3.2006-12-04 cd /cluster/data/dm3/bed/blastz.droMoj3.2006-12-04 cat << '_EOF_' > DEF # D. melanogaster vs. D. mojavensis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. mojavensis SEQ2_DIR=/iscratch/i/droMoj3/droMoj3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droMoj3/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droMoj3.2006-12-04 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -blastzOutRoot /panasas/store/dm3droMoj3 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroMoj3Link #flyBaseLiftGene 22.721%, chainDroMoj3Link 65.253%, both 20.171%, cover 88.78%, enrich 1.36x rm -f /cluster/data/dm3/bed/blastz.droMoj3 ln -s blastz.droMoj3.2006-12-04 /cluster/data/dm3/bed/blastz.droMoj3 ########################################################################### # BLASTZ/CHAIN/NET DROVIR3 (DONE 12/4/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.droVir3.2006-12-04 cd /cluster/data/dm3/bed/blastz.droVir3.2006-12-04 cat << '_EOF_' > DEF # D. melanogaster vs. D. virilis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. virilis SEQ2_DIR=/iscratch/i/droVir3/droVir3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droVir3/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droVir3.2006-12-04 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -blastzOutRoot /cluster/bluearc/dm3droVir3 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroVir3Link #flyBaseLiftGene 22.721%, chainDroVir3Link 67.299%, both 20.340%, cover 89.52%, enrich 1.33x rm -f /cluster/data/dm3/bed/blastz.droVir3 ln -s blastz.droVir3.2006-12-04 /cluster/data/dm3/bed/blastz.droVir3 ########################################################################### # BLASTZ/CHAIN/NET DP4 (DONE 12/4/06 angie) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.dp4.2006-12-04 cd /cluster/data/dm3/bed/blastz.dp4.2006-12-04 cat << '_EOF_' > DEF # D. melanogaster vs. D. pseudoobscura BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. pseudoobscura SEQ2_DIR=/san/sanvol1/scratch/dp4/dp4.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dp4/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.dp4.2006-12-04 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore 3000 \ -blastzOutRoot /san/sanvol1/scratch/dm3dp4 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDp4Link #flyBaseLiftGene 22.721%, chainDp4Link 76.299%, both 20.966%, cover 92.28%, enrich 1.21x rm -f /cluster/data/dm3/bed/blastz.dp4 ln -s blastz.dp4.2006-12-04 /cluster/data/dm3/bed/blastz.dp4 ########################################################################### # BLASTZ/CHAIN/NET ANOGAM1 (DONE 12/5/06) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.anoGam1.2006-12-04 cd /cluster/data/dm3/bed/blastz.anoGam1.2006-12-04 cat <<_EOF_ > DEF # D. melanogaster vs. A. gambiae BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - A. gambiae SEQ2_DIR=/iscratch/i/anoGam1/nib SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/anoGam1/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.anoGam1.2006-12-04 _EOF_ doBlastzChainNet.pl DEF -chainMinScore 3000 \ -blastzOutRoot /cluster/bluearc/dm3anoGam1 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainAnoGam1Link #flyBaseLiftGene 22.721%, chainAnoGam1Link 19.536%, both 12.711%, cover 55.94%, enrich 2.86x rm -f /cluster/data/dm3/bed/blastz.anoGam1 ln -s blastz.anoGam1.2006-12-04 /cluster/data/dm3/bed/blastz.anoGam1 ########################################################################### # BLASTZ/CHAIN/NET APIMEL3 (DONE 7/5/07 except for loading of netApiMel3) ssh kkstore05 mkdir /cluster/data/dm3/bed/blastz.apiMel3.2006-12-11 cd /cluster/data/dm3/bed/blastz.apiMel3.2006-12-11 cat <<_EOF_ > DEF # D. melanogaster vs. A. mellifera BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/san/sanvol1/scratch/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - A. mellifera SEQ2_DIR=/san/sanvol1/scratch/apiMel3/apiMel3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/apiMel3/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.apiMel3.2006-12-11 _EOF_ doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -smallClusterHub pk \ -blastzOutRoot /cluster/bluearc/dm3apiMel3 >& do.log & tail -f do.log # Died in loading phase -- netClass failed because there is no apiMel3 db! featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainApiMel3Link #flyBaseLiftGene 22.721%, chainApiMel3Link 17.865%, both 8.790%, cover 38.68%, enrich 2.17x rm -f /cluster/data/dm3/bed/blastz.apiMel3 ln -s blastz.apiMel3.2006-12-11 /cluster/data/dm3/bed/blastz.apiMel3 # 7/5/07: continue to make download files. doBlastzChainNet.pl DEF -chainMinScore 3000 \ -bigClusterHub pk -smallClusterHub pk \ -blastzOutRoot /cluster/bluearc/dm3apiMel3 \ -continue download >>& do.log & tail -f do.log ########################################################################### # BLASTZ/CHAIN/NET TRICAS2 (DONE 12/11/06) ssh kkstore04 mkdir /cluster/data/dm3/bed/blastz.triCas2.2006-12-11 cd /cluster/data/dm3/bed/blastz.triCas2.2006-12-11 cat <<_EOF_ > DEF # D. melanogaster vs. T. castaneum BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm3/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - T. castaneum SEQ2_DIR=/iscratch/i/triCas2/triCas2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/iscratch/i/triCas2/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.triCas2.2006-12-11 _EOF_ doBlastzChainNet.pl DEF -chainMinScore 3000 \ -blastzOutRoot /cluster/bluearc/dm3triCas2 >& do.log & tail -f do.log featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainTriCas2Link #flyBaseLiftGene 22.721%, chainTriCas2Link 20.039%, both 10.463%, cover 46.05%, enrich 2.30x rm -f /cluster/data/dm3/bed/blastz.triCas2 ln -s blastz.triCas2.2006-12-11 /cluster/data/dm3/bed/blastz.triCas2 ########################################################################### # MULTIZ15WAY (DONE 12/11/06 angie) # (((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2) # Based on the tree from Teri Markow, it appears that the relationship between # D. yakuba and D. erecta is not correct. The tree should be the following: # ((((((((((dm3,(droSim1,droSec1)),droYak2),droEre2),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2) ssh kkstore04 mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11 cd /cluster/data/dm3/bed/multiz15way.2006-12-11 # copy MAF's to cluster-friendly server mkdir /san/sanvol1/scratch/dm3/mafNet foreach s (droSim1 droSec1 droYak2 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 anoGam1 apiMel3 triCas2) echo $s rsync -av /cluster/data/dm3/bed/blastz.$s/mafNet/* \ /san/sanvol1/scratch/dm3/mafNet/$s/ end echo '(((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2)' \ > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst ssh pk cd /cluster/data/dm3/bed/multiz15way.2006-12-11 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = dm3 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/dm3/bed/multiz15way.2006-12-11/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs awk '{print $1}' /cluster/data/dm3/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para make jobList para time #Completed: 15 of 15 jobs #CPU time in finished jobs: 44158s 735.97m 12.27h 0.51d 0.001 y #IO & Wait Time: 320s 5.33m 0.09h 0.00d 0.000 y #Average job time: 2965s 49.42m 0.82h 0.03d #Longest finished job: 9309s 155.15m 2.59h 0.11d #Submission to last job: 9309s 155.15m 2.59h 0.11d # ANNOTATE MULTIZ15WAY MAF AND LOAD TABLES (DONE 12/11/2006 angie) ssh kolossus mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/anno cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/dm3/bed/multiz15way.2006-12-11/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! -e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end echo date > jobs.csh # do smaller jobs first: foreach f (`ls -1rS ../../maf/*.maf`) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /cluster/data/dm3/dm3.2bit ../maf/`basename $f` >> jobs.csh end echo date >> jobs.csh csh -efx jobs.csh >&! jobs.log & tail -f jobs.log # 5min. # Load anno/maf ssh hgwdev cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf mkdir -p /gbdb/dm3/multiz15way/anno/maf ln -s /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf/*.maf \ /gbdb/dm3/multiz15way/anno/maf date nice hgLoadMaf -pathPrefix=/gbdb/dm3/multiz15way/anno/maf dm3 multiz15way date # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf cat *.maf \ | nice hgLoadMafSummary dm3 -minSize=30000 -mergeGap=1500 -maxSize=200000 \ -test multiz15waySummary stdin #Created 171317 summary blocks from 13563512 components and 1633505 mafs from stdin ssh hgwdev cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf sed -e 's/mafSummary/multiz15waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz15waySummary.sql time nice hgLoadSqlTab dm3 multiz15waySummary /tmp/multiz15waySummary.sql \ multiz15waySummary.tab #0.000u 0.002s 0:02.59 0.0% 0+0k 0+0io 2pf+0w rm *.tab ln -s multiz15way.2006-12-11 /cluster/data/dm3/bed/multiz15way # MULTIZ15WAY DOWNLOADABLES (ANNOTATED) (DONE 12/11/2006 angie -- UPSTREAM REDONE 5/18/08, 10/27/08) # 5/18/08: regenerating upstream* to fix bug (truncated refGene names to # NM_[0-9]{6} but there are some valid NM_[0-9]{9} IDs now). # 10/27/08: regenerating with FlyBase Genes and mafFrags -txStarts. # Annotated MAF is now documented, so use anno/maf for downloads. ssh hgwdev mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads cd /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads # upstream mafs cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits dm3 flyBaseGene:upstream:$i -fa=/dev/null -bed=stdout \ | sed -re 's/ [^\t]*\t/\t/;' > up.bed nice mafFrags -txStarts dm3 multiz15way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' # << emacs time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log #171.365u 94.950s 21:16.74 20.8% 0+0k 0+0io 0pf+0w ssh kolossus cd /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads cat > downloads.csh << 'EOF' date foreach f (../anno/maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt date 'EOF' # << emacs time csh -efx downloads.csh >&! downloads.log #459.706u 5.793s 8:23.71 92.4% 0+0k 0+0io 0pf+0w nice gzip up*.maf nice md5sum up*.maf.gz >> md5sum.txt # Make a README.txt . ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/dm3/multiz15way mkdir $dir ln -s /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads/*.{gz,txt} $dir # MULTIZ15WAY MAF FRAMES (DONE 12/11/2006 angie) ssh hgwdev mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/frames cd /cluster/data/dm3/bed/multiz15way.2006-12-11/frames # The following is adapted from MarkD's Makefile used for mm7... #------------------------------------------------------------------------ # get the genes for all genomes (when available) # currently we have nothing for: # droSec1 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 triCas2 # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: droSim1 droYak2 anoGam1 mkdir genes foreach queryDb (droSim1 droYak2 anoGam1) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ ${queryDb} > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \ stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using refGene for dm3 # genePreds; (must keep only the first 10 columns for knownGene) set gpFields = 'name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds' foreach queryDb (dm3) if ($queryDb == "dm3") then set geneTbl = refGene endif hgsql -N -e "select $gpFields from $geneTbl" ${queryDb} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # MarkD's 1-pass maf methodology ssh kolossus cd /cluster/data/dm3/bed/multiz15way/frames nice tcsh # easy way to get process niced (cat ../anno/maf/*.maf \ | time genePredToMafFrames dm3 stdin stdout \ dm3 genes/dm3.gp.gz \ droSim1 genes/droSim1.gp.gz droYak2 genes/droYak2.gp.gz \ anoGam1 genes/anoGam1.gp.gz \ | gzip -c > multiz15way.mafFrames.gz) >& frames.log ssh hgwdev cd /cluster/data/dm3/bed/multiz15way/frames hgLoadMafFrames dm3 multiz15wayFrames multiz15way.mafFrames.gz # PHASTCONS 15WAY (DONE 12/11/06 angie) # (((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2) # Estimation for dm2 phastCons w/same tree took a long time, and didn't # lead to any real change in params (thank god no iteration req'd). # So skip estimation and use the tree from dm2. ssh kkstore04 mkdir -p /san/sanvol1/scratch/dm3/chrom foreach chr (`awk '{print $1;}' /cluster/data/dm3/chrom.sizes`) twoBitToFa -seq=$chr /cluster/data/dm3/dm3.2bit \ /san/sanvol1/scratch/dm3/chrom/$chr.fa end # Split chrom fa into smaller windows for phastCons: ssh pk mkdir /cluster/data/dm3/bed/multiz15way/phastCons mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.split cd /cluster/data/dm3/bed/multiz15way/phastCons/run.split set WINDOWS = /san/sanvol1/scratch/dm3/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/san/sanvol1/scratch/dm3/chrom set WINDOWS=/san/sanvol1/scratch/dm3/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O dm3,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel3,triCas2 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm3/bed/multiz15way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para make jobList para time #Completed: 15 of 15 jobs #CPU time in finished jobs: 1728s 28.80m 0.48h 0.02d 0.000 y #IO & Wait Time: 1578s 26.30m 0.44h 0.02d 0.000 y #Average job time: 220s 3.67m 0.06h 0.00d #Longest finished job: 1946s 32.43m 0.54h 0.02d #Submission to last job: 1962s 32.70m 0.55h 0.02d ############### SKIP PARAMETER ESTIMATION ############# mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.estimate cd /cluster/data/dm3/bed/multiz15way/phastCons/run.estimate foreach mod (/cluster/data/dm2/bed/multiz15way/phastCons/run.estimate/ave*.mod) sed -e 's/dm2/dm3/g; s/apiMel2/apiMel3/g;' $mod > $mod:t end ############### COMPUTE SCORES AND ELEMENTS ############# ssh pk mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.phast cd /cluster/data/dm3/bed/multiz15way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 23.8 --target-coverage 0.393 \ --quiet --seqname $chr --idpref $pref \ --viterbi /san/sanvol1/scratch/dm3/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /san/sanvol1/scratch/dm3/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /san/sanvol1/scratch/dm3/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /san/sanvol1/scratch/dm3/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/san/sanvol1/scratch/dm3/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para make jobList para time #Completed: 178 of 178 jobs #CPU time in finished jobs: 1836s 30.60m 0.51h 0.02d 0.000 y #IO & Wait Time: 16855s 280.92m 4.68h 0.20d 0.001 y #Average job time: 105s 1.75m 0.03h 0.00d #Longest finished job: 544s 9.07m 0.15h 0.01d #Submission to last job: 579s 9.65m 0.16h 0.01d # back on kolossus: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/dm3/bed/multiz15way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /san/sanvol1/scratch/dm3/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. cd /cluster/data/dm3/bed/multiz15way/phastCons featureBits -enrichment dm3 flyBaseLiftGene:cds all.bed # 69.26... not exactly our 68.9% target, but close enough. #flyBaseLiftGene:cds 13.517%, all.bed 32.328%, both 9.361%, cover 69.26%, enrich 2.14x # Having met the CDS coverage target, load up the results. hgLoadBed dm3 phastConsElements15way all.bed # Create wiggle ssh pk mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.wib cd /cluster/data/dm3/bed/multiz15way/phastCons/run.wib rm -rf /san/sanvol1/scratch/dm3/phastCons/wib mkdir -p /san/sanvol1/scratch/dm3/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /san/sanvol1/scratch/dm3/phastCons/wib zcat `ls -1 /san/sanvol1/scratch/dm3/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /san/sanvol1/scratch/dm3/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para make jobList para time #Completed: 15 of 15 jobs #CPU time in finished jobs: 92s 1.53m 0.03h 0.00d 0.000 y #IO & Wait Time: 71s 1.19m 0.02h 0.00d 0.000 y #Average job time: 11s 0.18m 0.00h 0.00d #Longest finished job: 24s 0.40m 0.01h 0.00d #Submission to last job: 24s 0.40m 0.01h 0.00d # back on kkstore04, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from san/sanvol1 cd /cluster/data/dm3/bed/multiz15way/phastCons rm -rf wib POSTPROBS rsync -av /san/sanvol1/scratch/dm3/phastCons/wib . rsync -av /san/sanvol1/scratch/dm3/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/dm3/multiz15way/wib cd /cluster/data/dm3/bed/multiz15way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm3/multiz15way/wib/ hgLoadWiggle dm3 phastCons15way \ -pathPrefix=/gbdb/dm3/multiz15way/wib wib/*.wig rm wiggle.tab # and clean up san/sanvol1. rm -r /san/sanvol1/scratch/dm3/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /san/sanvol1/scratch/dm3/chrom # Offer raw scores for download since fly folks are likely to be interested: ssh kkstore04 cd /cluster/data/dm3/bed/multiz15way/phastCons/POSTPROBS mkdir ../postprobsDownload foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`) zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \ > ../postprobsDownload/$chr.pp.gz end cd ../postprobsDownload md5sum *.gz > md5sum.txt # Make a README.txt there too. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm3/phastCons15way cd /usr/local/apache/htdocs/goldenPath/dm3/phastCons15way ln -s /cluster/data/dm3/bed/multiz15way/phastCons/postprobsDownload/* . # make a tree picture using phyloGif. cd /cluster/data/dm3/bed/multiz15way cp tree-commas.nh tree-commas-orgNames.nh # Edit tree-commas-orgNames.nh to use org names. /usr/local/apache/cgi-bin/phyloGif -phyloGif_tree=tree-commas-orgNames.nh \ -phyloGif_width=260 -phyloGif_height=480 > dm3_15way.gif cp -p dm3_15way.gif ~/browser/images/phylo/dm3_15way.gif # Check in browser/images/phylo/dm3_15way.gif. In a clean ~/browser: cd ~/browser make alpha ######################################################################### # FLYBASE 5.1 ANNOTATIONS (DONE 4/6/07 angie) # OBSOLETED -- see FLYBASE 5.3 ANNOTATIONS BELOW ssh kkstore05 mkdir /cluster/data/dm3/bed/flybase5.1 cd /cluster/data/dm3/bed/flybase5.1 wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.1/gff/dmel-all-r5.1.gff.gz gunzip dmel-all-r5.1.gff.gz # What data sources are represented in this file? grep -v '^#' dmel-all-r5.1.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c # excerpt (many other sources, including blastx:... , sim4:... and # tblastx:...; also various other types for source "FlyBase" and "."): 67107 FlyBase CDS 65481 FlyBase exon 14601 FlyBase gene 19781 FlyBase mRNA 92 FlyBase miRNA 95 FlyBase ncRNA 51 FlyBase pseudogene 98 FlyBase rRNA 47 FlyBase snRNA 65 FlyBase snoRNA 314 FlyBase tRNA 6003 FlyBase transposable_element 37267 FlyBase transposable_element_insertion_site # Very similar to 4.3 except no more ". transcription_start_site". # What keywords are defined in the 9th field? # Oh dear, 4.3 had HTML-escaped special characters but 5.1 has # unescaped ; willy-nilly in there with the ; which separate var=val's. # | perl -wpe 's/&\w\w\w\w?;//g; s/=[^;]+;/\n/g; s/=.*$//;' \ grep -v '^#' dmel-all-r5.1.gff \ | awk '{print $9;}' \ | perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \ | sort | uniq -c 17604 Alias 375522 Dbxref 5523024 ID 5521150 Name 3415862 Parent 3962606 Target 18400 associated_with 15770 cyto_range 37956 derived_cyto_location 15600 gbunit 12101 programversion 11795 putative_ortholog_of 13174 to_species # Notable differences from 4.3: no more Synonym; a lot fewer Alias, # a *lot* more Dbxref; some keywords have disappeared but not ones # we use. # Copy over 4.3 script and modify to fit the latest Immortal Unchanging # Widely Adopted Standard GFF3. # Alias no longer holds the CG*/CR* ID's; now it's Dbxref, with a # prefix that must be stripped. # Two pseudogenes do not have CR*; must get those from gene line. # 'dmel_mitochondrion_genome' --> 'chrM' cp /cluster/data/dm2/bed/flybase4.3/extractGenes.pl . extractGenes.pl dmel-all-r5.1.gff mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf # Get predicted proteins (for main annotations only) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.1/fasta/dmel-all-translation-r5.1.fasta.gz # Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene: zcat dmel-all-translation-r5.1.fasta.gz \ | perl -we 'open(X, "flyBase2004Xref.tab") || die; \ while () { \ @w = split("\t"); \ $fbtr2cgr{$w[3]} = $w[0]; \ } \ while (<>) { \ if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \ $cgr = $fbtr2cgr{$1}; \ if ($cgr) { \ s/^>.*/>$cgr/; \ } else { die; } \ } \ print; \ } \ ' \ > flyBasePep.fa ssh hgwdev cd /cluster/data/dm3/bed/flybase5.1 # Protein-coding genes: ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf # 19781 groups 7 seqs 1 sources 2 feature types hgPepPred dm3 generic flyBasePep flyBasePep.fa # Noncoding genes: hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed #Loaded 762 elements of size 12 # Cross-referencing info for both coding and noncoding: hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \ flyBase2004Xref.tab # Make sure there are no joinerCheck errors at this point, because # if left here they can spread via doHgNearBlastp: runJoiner.csh dm3 flyBasePep # See if there are still large regions where refGene has genes but # flybase doesn't: featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout # Not so bad now -- some cases where sequence might be duplicated and # we make different calls about where the exons land, but no huge # missing regions of regular chroms (just no annotations on chr*Het). # add upstream* downloadable files ssh hgwdev mkdir /cluster/data/dm3/bed/flybase5.1/bigZips cd /cluster/data/dm3/bed/flybase5.1/bigZips foreach size (1000 2000 5000) echo upstream$size nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \ | nice gzip -c > upstream$size.fa.gz end nice md5sum *.gz > md5sum.txt ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/*.gz \ /usr/local/apache/htdocs/goldenPath/dm3/bigZips/ cat md5sum.txt \ >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt ######################################################################### # CHROMOSOME BANDS (DONE (5.1) 4/5/07 angie - REDONE (5.3) 9/18/07) # 10/21/08 (5.12): generated files, but no change since 5.3, so I did not # reload the tables. ssh hgwdev cd /cluster/data/dm3/bed/flybase5.3 grep chromosome_band dmel-all-r5.3.gff \ | perl -wpe 'chomp; @w = split; \ $w[3]--; $w[3] = 0 if ($w[3] < 0); \ $w[8] =~ s/^.*Name=band-(\w+)(;.*)?$/$1/; \ if ($w[8] !~ /^\d+[A-Z]\d+$/) { s/^.*$//; next; } \ if ($w[4] <= 0) { s/^.*$//; } else { \ s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\tn\/a\n/; }' \ > cytoBand.txt hgLoadSqlTab dm3 cytoBand ~/kent/src/hg/lib/cytoBand.sql cytoBand.txt # Remove the items that are off the end of their chroms: perl -wpe 's/^(\w+)\t(\d+)$/ \ delete from cytoBand where chrom="$1" and chromStart >= $2; \ update cytoBand set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \ ../../chrom.sizes \ | hgsql dm3 checkTableCoords -verbose=2 dm3 cytoBand # As of 5.3, the GFF now includes what we need to make cytoBandIdeo # directly: grep chromosome_band dmel-all-r5.3.gff \ | perl -wpe 'chomp; @w = split; \ $w[3]--; $w[3] = 0 if ($w[3] < 0); \ $w[8] =~ s/^.*Name=band-(\w+)(;.*)?$/$1/; \ if ($w[8] !~ /^\d+[A-Z]$/) { s/^.*$//; next; } \ if ($w[4] <= 0) { s/^.*$//; } else { \ s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\tn\/a\n/; }' \ > cytoBandIdeo.txt hgLoadSqlTab dm3 cytoBandIdeo ~/kent/src/hg/lib/cytoBandIdeo.sql \ cytoBandIdeo.txt # Remove the items that are off the end of their chroms: perl -wpe 's/^(\w+)\t(\d+)$/ \ delete from cytoBandIdeo where chrom="$1" and chromStart >= $2; \ update cytoBandIdeo set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \ ../../chrom.sizes \ | hgsql dm3 checkTableCoords -verbose=2 dm3 cytoBandIdeo ######################################################################### # FLYBASE ACODE CROSS-REFERENCING DATA (N/A 4/5/07 angie) # Hmmm, it seems that flybase no longer offser acode. No dice on # ftping from the old location # (ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt -- not # flybase.net either) # or googling site:flybase.net acode. # Looks like Chado XML is what is offered now. And an interrupted # fetch of it is hanging my FireFox. # When it recovers, I guess I could try downloading it properly # and seeing if it has the same info that hgFlyBase used to parse # out of acode. ftp://ftp.flybase.net/releases/FB2006_01/chado-xml/chado_genes.xml.gz # This might be smaller... ftp://ftp.flybase.net/releases/FB2006_01/dmel_r5.1/chado-xml/chado_dmel_genes.xml.gz ######################################################################### # MAP UNIPROT IDS TO CG*-R* IDS # FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07) # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations. # FlyBase 5.3 history: (DONE 9/20/07 angie) # FlyBase 5.12 history: (DONE 10/24/08 angie) # In dm2, I combined three sources: specific transcript mappings # from uniProt.description, less specific gene mappings from # uniProt.description, and fbUniProt mappings extracted from # FlyBase acode. We don't have flyBase acode anymore, so combine # just the first two (uniProt) sources. ssh hgwdev mkdir /cluster/data/dm3/bed/uniProtToFlyBase cd /cluster/data/dm3/bed/uniProtToFlyBase set taxon = `hgsql -N uniProt -e 'select id from taxon where binomial = "Drosophila melanogaster";'` # Grab UniProt descriptions of all D. mel. proteins so we can parse out # CG*-R* transcript IDs (and CG* gene IDs when -R* not specified). hgsql -N uniProt -e \ 'select description.* from description,accToTaxon where \ description.val not like "CG% (Fragment)." and \ accToTaxon.taxon = '$taxon' and accToTaxon.acc = description.acc' \ > uniProt.description.txt # Grab the complete list of CG*-R* transcript IDs: hgsql -N dm3 -e 'select name from flyBaseGene' | sort > transcriptIds.txt ssh kolossus cd /cluster/data/dm3/bed/uniProtToFlyBase # Wrote a perl script, gleanDescription.pl, to parse the various patterns # used to encode real mappings (not mappings to homologs or things that # look like CG*-P? IDs but aren't really). Update the dm2 copy to handle # the new CG\d+-R\w\w transcripts: cp /cluster/data/dm2/bed/uniProtToFlyBase/gleanDescription.pl . # Loosened a couple regexes in gleanDescription.pl for 5.12. ./gleanDescription.pl uniProt.description.txt | sort > uniProtMapping.txt # Look for duplicates (if we didn't trim the "(Fragment)." entries above, # there would be 82! But now we have only one. foreach cg (`cut -f 1 uniProtMapping.txt | uniq -c \ | awk '$1 != 1 {print $2;}' | sed -e 's/-R/-P/'`) echo $cg grep $cg uniProt.description.txt echo "" end #CG9339-PH #Q6Q4F0 CG9339-PH. #Q9VIH7 CG9339-PA, isoform A (CG9339-PB, isoform B) (CG9339-PH, isoform H) (LD10117p). # I looked up both Q6Q4F0 and Q9VIH7 on http://www.pir.uniprot.org/; # Q9VIH7 is a bit longer, so I'll arbitrarily go with that one and # delete the Q6Q4F0 mapping from uniProtMapping.txt. # Wrote a perl script, vagueDescription.pl, to parse out CG* without # isoform specs -- can use those as backups when we don't have a more # specific mapping for a transcript. (updated from dm2) cp /cluster/data/dm2/bed/uniProtToFlyBase/vagueDescription.pl . ./vagueDescription.pl uniProt.description.txt | sort \ > uniProtMappingVague.txt # acode is no longer available, but UniProt mappings can be gleaned from # FlyBase GFF3. perl -we 'while (<>) { chomp; @w = split; \ next unless ($w[8] && $w[1] eq "FlyBase" && $w[2] eq "gene" && \ $w[8] =~ /FlyBase_Annotation_IDs:(CG\d+)/); \ $cg = $1; \ while ($w[8] =~ s/^.*?UniProt\/[-\w]+:(\w+)//) { \ print "$cg\t$1\n"; \ } \ }' ../flybase5.12/dmel-all-r5.12.gff \ > flyBaseCgToUniProt.txt # Using dm2 perl script, combineMappings.pl, to add any additional mappings # available from acode to the UniProt mappings. Since the second arg # is vague-gene and third arg is vague-transcript, feed both vague-gene # files in as the second input: cat uniProtMappingVague.txt flyBaseCgToUniProt.txt \ | /cluster/data/dm2/bed/uniProtToFlyBase/combineMappings.pl transcriptIds.txt \ uniProtMapping.txt - /dev/null \ > flyBaseToUniProt.txt # Make a 2-column version for loading -- and add back in the transcripts for # which we couldn't find a UniProt acc, because they have to have something in # flyBaseToUniProt in order for hgNear's oneGeneQuery to work!: join -t ' ' -a 1 -e n/a -o 1.1,2.2 transcriptIds.txt flyBaseToUniProt.txt \ > flyBaseToUniProtLoad.txt # Load into a genericAlias-type table: ssh hgwdev cd /cluster/data/dm3/bed/uniProtToFlyBase sed -e 's/genericAlias/flyBaseToUniProt/' \ ~/kent/src/hg/lib/genericAlias.sql > flyBaseToUniProt.sql hgLoadSqlTab dm3 flyBaseToUniProt flyBaseToUniProt.sql \ flyBaseToUniProtLoad.txt ######################################################################### # HGNEAR TABLES # HGNEAR PROTEIN BLAST TABLES # FlyBase 5.1 history: (dm3.* DONE 4/6/07 - dm3.* REDONE 6/18/07, # *.dmBlastTab DONE 7/18/07) # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations. # FlyBase 5.3 history: (DONE 9/18/07) # FlyBase 5.12 history: (DONE 10/21/08) ssh hgwdev mkdir /cluster/data/dm3/bed/hgNearBlastp cd /cluster/data/dm3/bed/hgNearBlastp mkdir 081021 cd 081021 # Get the proteins used by the other hgNear organisms: pepPredToFa hg18 knownGenePep hg18.known.faa pepPredToFa mm9 knownGenePep mm9.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer3 ensPep danRer3.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce6 sangerPep ce6.sangerPep.faa pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa # Galt's synBlastp.csh filtering, which uses liftOver chains, # would be the next step if dm3 were reasonably closely related # (like mammal-mammal) to another Gene Sorter organism. But it is # pretty distant from all the others, so specify recipBest for all # query databases. cat << _EOF_ > config.ra # Latest fly vs. other Gene Sorter orgs: # human, mouse, rat, zebrafish, worm, yeast targetGenesetPrefix flyBase targetDb dm3 queryDbs hg18 mm9 rn4 danRer3 ce6 sacCer1 recipBest hg18 mm9 rn4 danRer3 ce6 sacCer1 dm3Fa /cluster/data/dm3/bed/hgNearBlastp/081021/dm3.flyBasePep.faa hg18Fa /cluster/data/dm3/bed/hgNearBlastp/081021/hg18.known.faa mm9Fa /cluster/data/dm3/bed/hgNearBlastp/081021/mm9.known.faa rn4Fa /cluster/data/dm3/bed/hgNearBlastp/081021/rn4.known.faa danRer3Fa /cluster/data/dm3/bed/hgNearBlastp/081021/danRer3.ensPep.faa ce6Fa /cluster/data/dm3/bed/hgNearBlastp/081021/ce6.sangerPep.faa sacCer1Fa /cluster/data/dm3/bed/hgNearBlastp/081021/sacCer1.sgdPep.faa buildDir /cluster/data/dm3/bed/hgNearBlastp/081021 scratchDir /san/sanvol1/scratch/dm3HgNearBlastp _EOF_ # Run with -noLoad so we can eyeball files, manually load dm3 tables now, # and later overload other databases' dmBlastTab tables. doHgNearBlastp.pl -noLoad config.ra >& do.log & tail -f do.log # Ran the load scripts for dm3 tables manually as suggested by the # output of doHgNearBlastp.pl: # *** -noLoad was specified -- you can run this script manually to load dm3 tables: run.dm3.dm3/loadPairwise.csh #Loading database with 904530 rows # *** -noLoad was specified -- you can run these scripts manually to load dm3 tables: run.dm3.hg18/loadPairwise.csh #Loading database with 5946 rows run.dm3.mm9/loadPairwise.csh #Loading database with 5906 rows run.dm3.rn4/loadPairwise.csh #Loading database with 3058 rows run.dm3.danRer3/loadPairwise.csh #Loading database with 5328 rows run.dm3.ce6/loadPairwise.csh #Loading database with 4665 rows run.dm3.sacCer1/loadPairwise.csh #Loading database with 2205 rows # 7/18/07,9/18/07,10/21/08: dm3 is on RR, so time to load*.dmBlastTab: # *** -noLoad was specified -- you can run these scripts manually to load dmBlastTab in query databases: run.hg18.dm3/loadPairwise.csh #Loading database with 5946 rows run.mm9.dm3/loadPairwise.csh #Loading database with 5906 rows run.rn4.dm3/loadPairwise.csh #Loading database with 3058 rows run.danRer3.dm3/loadPairwise.csh #Loading database with 5328 rows run.ce6.dm3/loadPairwise.csh #Loading database with 4665 rows run.sacCer1.dm3/loadPairwise.csh #Loading database with 2205 rows # CLUSTER GENES AND MAP TO OTHER DATA # FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07) # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations. # FlyBase 5.3 history: (DONE 9/20/07 angie) # FlyBase 5.12 history: (DONE 10/24/08 angie) ssh hgwdev hgClusterGenes dm3 flyBaseGene flyBaseIsoforms flyBaseCanonical \ -protAccQuery='select name,alias from flyBaseToUniProt' #Got 13784 clusters, from 21236 genes in 15 chromosomes # Create table that maps between flyBase genes and RefSeq hgMapToGene dm3 refGene flyBaseGene flyBaseToRefSeq # Create table that maps between known genes and Pfam domains hgMapViaSwissProt dm3 flyBaseToUniProt name alias Pfam flyBaseToPfam # Create a table that maps between flyBase genes and the # Stanford Microarray Project expression data. (see makeHgFixed.doc) hgsql dm3 -e 'drop table if exists flyBaseToCG; create table flyBaseToCG \ select name,SUBSTRING_INDEX(name,"-",1) as value from flyBaseGene' hgsql dm3 -e 'create index name on flyBaseToCG(name(12))' nice hgExpDistance dm3 hgFixed.arbFlyLifeMedianRatio dummyArg \ arbExpDistance -lookup=flyBaseToCG # MAKE A DESCRIPTION TABLE FOR HGNEAR KNOWNDETAILS # FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07) # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations. # FlyBase 5.3 history: (DONE 9/20/07 angie) # FlyBase 5.12 history: (DONE 10/24/08 angie) # hgNear's knownDetails column type expects a single table that links # transcript ID with uniProt description, so whip one up here: # (used by hgGene too though hgGene could join on the fly) ssh hgwdev hgsql dm3 -e 'drop table if exists flyBaseToDescription; \ create table flyBaseToDescription \ select flyBaseToUniProt.name,uniProt.description.val \ from flyBaseToUniProt,uniProt.description \ where flyBaseToUniProt.alias = uniProt.description.acc' hgsql dm3 -e 'create index name on flyBaseToDescription(name(12))' # FLYP2P # FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07) # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations. # FlyBase 5.3 history: (hgLoadNetDist REDONE 9/18/07 angie) # FlyBase 5.12 history: (hgLoadNetDist REDONE 10/21/08 angie) # See hg/near/makeNear.doc... mkdir /cluster/data/dm3/bed/p2p cd /cluster/data/dm3/bed/p2p # Watch out... hgLoadNetDist overwrites and removes flyP2P.tab! # so rename here: cp -p /cluster/data/dm1/p2p/flyP2P.tab flyP2P.raw.txt ssh -x kolossus hgNetDist /cluster/data/dm3/bed/p2p/{flyP2P.raw.txt,tmp.tab} hgLoadNetDist tmp.tab dm3 flyP2P \ -sqlRemap='select fbgn,name from flyBase2004Xref' #hgLoadNetDist 512 id-remapping misses, see missing.tab # That's OK, some genes were in dm1 but are not in dm3 (flybase 3 vs. 5) # MAKE ORGANISM-SPECIFIC HGNEARDATA AND HGGENEDATA FILES (DONE 4/6/07 angie) cd ~/kent/src/hg/near/hgNear/hgNearData # Made dm3 subdir of D_melanogaster with usual .ra files. # Similarly for hgGene/hgGeneData. # Make alpha on genome-test for hgNear, hgGene and trackDb DBS=dm3 . # ENABLE HGNEAR FOR DM3 IN HGCENTRALTEST (DONE 4/6/07 angie) hgsql -h genome-testdb hgcentraltest \ -e "update dbDb set hgNearOk = 1 where name = 'dm3';" # END OF HGNEAR STUFF ######################################################################### ######################################################################### # BDGP IN SITU IMAGE LINKS (DONE 4/7/06) # grabbed data again 10/24/08, no change. # HGGENE ssh hgwdev cd /cluster/data/dm3/bed/bdgpExprLink wget -O summary.txt.081024 \ 'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext' # remove header line. hgLoadSqlTab dm3 bdgpExprLink ~/kent/src/hg/lib/bdgpExprLink.sql \ summary.txt.070406 ######################################################################### # "FLYBASE 5.1" DHGP HETEROCHROMATIN ANNOTATIONS (DONE 6/18/07 angie) # OBSOLETED by load of FlyBase 5.3 genes 9/18/07. # These heterochromatin annotations have just been published in Science # http://www.sciencemag.org/cgi/content/full/316/5831/1586 # -- future updates will be from FlyBase, but for now the GFF is in the # paper's Supplemental Data! ssh kkstore05 mkdir /cluster/data/dm3/bed/flybase5.1/Het cd /cluster/data/dm3/bed/flybase5.1/Het wget ftp://ftp.dhgp.org/pub/DHGP/Science_2007_Supplemental_Data/Supplemental_File_B_GFF/R5_all_GFF_results.tar.gz foreach chr (2LHet 2RHet 3LHet 3RHet U Uextra XHet YHet) wget ftp://ftp.dhgp.org/pub/DHGP/Science_2007_Supplemental_Data/Supplemental_File_C_FASTA/${chr}_translation_dmel_RELEASE5-1.FASTA end tar xvzf R5_all_GFF_results.tar.gz # Of course they invented a completely new encoding of names and # properties into GFF 9th col and FASTA headers. So instead of # attempting to reuse the flybase-parsing script, just pick out # the rows we need and parse each row type... we will not get # enough info to complete flyBase2004Xref, but c'est la vie. # There are files for non-het chroms... I took a look at 2L_annotation # and its annotations seem to be relative to chr2L:22000975... adding # that offset, almost all of the ones I checked appear in our current # dm3 flyBaseGene. So I'll just use the Het files and ignore the others. foreach type (3_UTR 5_UTR CDS CDS_exon annotation exon transcript) echo $type awk '{print $3;}' {*Het,U*}_${type}_dmel* | sort | uniq -c end # Looks like the file-types and $3-types we need are these: # CDS_exon CDS_exon # annotation exon, gene, ncRNA, pseudogene, rRNA # I will exclude RR* (repeat region) # I'll build up a .gff type by type: cp /dev/null extract.gff awk '$9 !~ /^name=RR/ && $3 == "CDS_exon" {print;}' {*Het,U*}_CDS_exon*GFF \ | perl -wpe 's/^/chr/; s/CDS_exon/CDS/; \ s/name=\w+:\d+_(C\w\d+-R[A-Z]+).*/$1/ || die "$_";' \ >> extract.gff awk '$9 !~ /^genegrp=RR/ && $3 == "exon" {print;}' {*Het,U*}_annotation*GFF \ | perl -wpe 's/^/chr/; \ s/genegrp=\w+; transgrp=(C\w\d+-R[A-Z]+).*/$1/ || die "$_";' \ >> extract.gff # All of the rows break down into either CG* IDs (coding) or CR* (nonc): wc -l extract.gff #3245 extract.gff awk '$9 ~ /^CR/ {print;}' extract.gff | wc -l #261 awk '$9 ~ /^CG/ {print;}' extract.gff | wc -l #2984 expr 261 + 2984 #3245 # Break out coding and noncoding, and sort for readability: awk '$9 ~ /^CR/ {print;}' extract.gff \ | sort -k 9,9 -k 1,1 -k 4n,4n \ > flyBaseNoncodingHet.gff awk '$9 ~ /^CG/ {print;}' extract.gff \ | sort -k 9,9 -k 1,1 -k 4n,4n \ > flyBaseGeneHet.gff # gff -> genePred -> bed for noncoding ldHgGene dummy dummy flyBaseNoncodingHet.gff -nobin -out=stdout \ | genePredToBed \ | awk 'BEGIN {OFS="\t";} $7 == 0 && $8 == 0 {$7 = $2; $8 = $2;} {print;}' \ > flyBaseNoncodingHet.bed #Read 127 transcripts in 261 lines in 1 files # Now get what we can get for flyBase2004Xref: awk '$3 ~ /^(gene|pseudogene|.*RNA)$/ {print;}' {*Het,U*}_annotation*GFF \ | perl -pe 's/^\w+\tgadfly\t(pseudogene|\w+RNA)?(gene)?.*name=(\w+); symbol=([^;]*); (dbxref=FlyBase:(FBgn\d+))?.*$/$3\t$4\t\t\t$6\t\t\t$1/ \ || die "$_";\ s/^(\w+)\t\t/$1\t$1\t/;' \ | sort -k 1,1 \ > flyBase2004XrefHet.tab # That almost works but it needs C*-R* IDs and has only C*'s. # Join with the C*-R* IDs: awk '{print $9;}' fly*gff \ | sort -u \ | perl -wpe 's/^(\w+)/$1\t$1/;' \ > cgTocgr.txt join -j 1 -a 2 -t' ' -o '1.2 2.2 2.3 2.4 2.5 2.6 2.7 2.8' \ cgTocgr.txt flyBase2004XrefHet.tab \ > tmp.tab mv tmp.tab flyBase2004XrefHet.tab # Uh-oh, some of these are already in flyBase2004Xref, so we'll get # mysql errors (unique index) from trying to overload... and with # more of the fields filled in, so omit them from the new load. awk '{print $1;}' ../flyBase2004Xref.tab | sort > tmp1 awk '{print $1;}' flyBase2004XrefHet.tab | sort > tmp2 comm -1 -2 tmp1 tmp2 #CG41431-RA #CG41432-RA #CG41434-RA #CG41436-RA # Only 4 such -- manually removed them from flyBase2004XrefHet.tab. perl -wpe 's/^>(\w+)-P(\w+)/>$1-R$2/' {*Het,U*}_*.FASTA \ > flyBasePepHet.fa # Manually removed those 4 from the fa too. # Also removed this lone RR element from the fa: RR40941-RA # Now *additively* load the new data on top of the old: ssh hgwdev cd /cluster/data/dm3/bed/flybase5.1/Het # Protein-coding genes: ldHgGene -oldTable -nobin dm3 flyBaseGene flyBaseGeneHet.gff #Read 503 transcripts in 2984 lines in 1 files # 503 groups 7 seqs 1 sources 2 feature types cat ../flyBasePep.fa flyBasePepHet.fa \ | hgPepPred dm3 generic flyBasePep stdin # Noncoding genes: hgLoadBed -oldTable dm3 flyBaseNoncoding flyBaseNoncodingHet.bed #Loaded 127 elements of size 12 # Cross-referencing info for both coding and noncoding: hgLoadSqlTab -oldTable dm3 flyBase2004Xref \ ~/kent/src/hg/lib/flyBase2004Xref.sql flyBase2004XrefHet.tab # Make sure there are no joinerCheck errors at this point, because # if left here they can spread via doHgNearBlastp: runJoiner.csh dm3 flyBasePep # See if there are still large regions where refGene has genes but # flybase doesn't: featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout # better now that the Het's (except for Uextra) are included. # add upstream* downloadable files ssh hgwdev mkdir /cluster/data/dm3/bed/flybase5.1/bigZips cd /cluster/data/dm3/bed/flybase5.1/bigZips foreach size (1000 2000 5000) echo upstream$size nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \ | nice gzip -c > upstream$size.fa.gz end nice md5sum *.gz > md5sum.txt ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/*.gz \ /usr/local/apache/htdocs/goldenPath/dm3/bigZips/ cat md5sum.txt \ >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt # Edited out the old upstream* md5sums. ######################################################################### # TWEAK DEFAULT POSITION (DONE 6/18/07 angie) ssh hgwdev hgsql hgcentraltest -e 'update dbDb set defaultPos="chr2L:826001-851000" where name = "dm3";' ######################################################################### # ORegAnno - Open Regulatory Annotations # update Oct 26, 2007; July 7, 2008 # loaded by Belinda Giardine, in same manner as hg18 ORegAnno track ######################################################################### # FLYBASE 5.3 ANNOTATIONS (DONE 9/17/07 angie) # OBSOLETED by load of FlyBase 5.12 genes 10/21/08. ssh kkstore05 mkdir /cluster/data/dm3/bed/flybase5.3 cd /cluster/data/dm3/bed/flybase5.3 wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.3_FB2007_02/gff/dmel-all-r5.3.gff.gz gunzip dmel-all-r5.3.gff.gz # What data sources are represented in this file? grep -v '^#' dmel-all-r5.3.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c # excerpt (many other sources, including blastx:... , sim4:... and # tblastx:...; also various other types for source "FlyBase" and "."): 69531 FlyBase CDS 68426 FlyBase exon 15185 FlyBase gene 20738 FlyBase mRNA 90 FlyBase miRNA 105 FlyBase ncRNA 94 FlyBase pseudogene 164 FlyBase rRNA 47 FlyBase snRNA 249 FlyBase snoRNA 314 FlyBase tRNA 5385 FlyBase transposable_element 38724 FlyBase transposable_element_insertion_site # Same types as 5.1, good. # What keywords are defined in the 9th field? grep -v '^#' dmel-all-r5.3.gff \ | awk '{print $9;}' \ | perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \ | sort | uniq -c 17505 Alias 412018 Dbxref 6571566 ID 6530043 Name 4008974 Parent 3048194 Target 16921 cyto_range # Comparable to 5.1, good. cp ../flybase5.1/extractGenes.pl . # Mods to 5.1 script: # - so many new keywords, just say which ones we're ignoring instead # of die-ing if we don't recognize a new one. # - protein line: Parent --> Derives_from. # - some ncRNA's have CG-*, not CR-*, IDs: time ./extractGenes.pl dmel-all-r5.3.gff #Noncoding FBtr0113460 has CG ID (CG33294-RB) #Noncoding FBtr0113490 has CG ID (CG34647-RA) #Noncoding FBtr0113491 has CG ID (CG34648-RA) #Noncoding FBtr0113492 has CG ID (CG34649-RA) #Keywords that have been ignored in one or more line types: #CDS: ID #CDS: Name #exon: Dbxref #exon: ID #exon: Name #gene: Alias #gene: Ontology_term #gene: cyto_range #gene: gbunit #mRNA: Alias #mRNA: Name #mRNA: score #mRNA: score_text #noncoding: Alias #noncoding: score #noncoding: score_text #protein: Alias #protein: Dbxref #protein: Name #protein: derived_isoelectric_point #protein: derived_molecular_weight #71.391u 1.296s 1:12.75 99.9% 0+0k 0+0io 0pf+0w mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf # Get predicted proteins (for main annotations only) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.3_FB2007_02/fasta/dmel-all-translation-r5.3.fasta.gz # Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene: zcat dmel-all-translation-r5.3.fasta.gz \ | perl -we 'open(X, "flyBase2004Xref.tab") || die; \ while () { \ @w = split("\t"); \ $fbtr2cgr{$w[3]} = $w[0]; \ } \ while (<>) { \ if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \ $cgr = $fbtr2cgr{$1}; \ if ($cgr) { \ s/^>.*/>$cgr/; \ } else { die; } \ } \ print; \ } \ ' \ > flyBasePep.fa ssh hgwdev cd /cluster/data/dm3/bed/flybase5.3 # Protein-coding genes: ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf # 20738 groups 14 seqs 1 sources 2 feature types hgPepPred dm3 generic flyBasePep flyBasePep.fa # Noncoding genes: hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed #Loaded 1063 elements of size 12 # Cross-referencing info for both coding and noncoding: hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \ flyBase2004Xref.tab # Make sure there are no joinerCheck errors amongst those 3 tables # at this point, because if left here they can spread via doHgNearBlastp. # Since this is an update of those tables to a new FlyBase version, # and other hgNear tables already exist with the old version's set of # IDs, ignore errors for missing CG items in tables that will be # rebuilt in the hgNear process.: runJoiner.csh dm3 flyBasePep # See if there are still large regions where refGene has genes but # flybase doesn't: featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout #335940 bases of 162367812 (0.207%) in intersection # Spot-checked... some transcripts removed here and there, looks OK. # add upstream* downloadable files ssh hgwdev mkdir /cluster/data/dm3/bed/flybase5.3/bigZips cd /cluster/data/dm3/bed/flybase5.3/bigZips foreach size (1000 2000 5000) echo upstream$size nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \ | nice gzip -c > upstream$size.fa.gz end nice md5sum *.gz > md5sum.txt ln -s /cluster/data/dm3/bed/flybase5.3/bigZips/*.gz \ /usr/local/apache/htdocs/goldenPath/dm3/bigZips/ # When updating, edit out the old upstream* sums from md5sum.txt, then: cat md5sum.txt \ >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt ############################################################################# # N-SCAN GENES track ( markd) # create a composite track with exists ab-inito and new PASA N-SCAN predictions # download predictions cd /cluster/data/dm3/bed/nscan wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/dm3.gtf wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/dm3.prot.fa wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/readme.txt bzip2 dm3.* chmod a-w * ldHgGene -gtf -genePredExt dm3 nscanPasaGene dm3.gtf.bz2 hgPepPred dm3 generic nscanPasaPep dm3.prot.fa.bz2 rm *.tab # update trackDb; need a dm3-specific page to describe informants and PASA # make sure termRegex matches naming drosophila/dm3/nscanPasaGene.html drosophila/dm3/dm3/trackDb.ra ######################################################################### ########################################################################## # EVOFOLD (Done, 10.02.2007) Jakob Skou Pedersen # RNA secondary structure predictions lifted from dm2 and filtered ssh -C hgwdev mkdir -p /cluster/data/dm3/bed/evofold cd /cluster/data/dm3/bed/evofold echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql dm2 | sed -e 1d > foldsDm2.bed liftOver -minMatch=1.0 foldsDm2.bed /cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz tmp.bed unmapped.bed # remove elements which are wrong size after lifting awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsDm3.bed # structure filters # first, remove pairs that can't form in human cut -f 1-6 rawFoldsDm3.bed > tmp.bed # sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/ nice /cluster/home/sugnet/bin/i386/sequenceForBed -db=dm3 -bedIn=tmp.bed -fastaOut=tmp.fa cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \ | sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsDm3Seq.tab join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsDm3.bed foldsDm3Seq.tab | sed -e 's/ */\t/g' | sort -k1,1 \ | /cluster/home/jsp/scripts/tabFoldFilter.py > cleanFolds.tab join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsDm3.bed cleanFolds.tab | sed -e 's/ */\t/g' > tmp1.bed # second, remove poor predictions # scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules cat tmp1.bed | /cluster/home/jsp/scripts/bedRnassFilter.py --dangling --minAvrStemSize=3 | /cluster/home/jsp/scripts/bedRnassFilter.sh 1 3 \ | /cluster/home/jsp/scripts/roundListFloats.py -c9 > foldsDm3.bed # clean up rm tmp.bed tmp1.bed foldsDm2.bed foldsDm3Seq.tab rawFoldsDm3.bed tmp.fa cleanFolds.tab # comment: all the above filtering did not remove any # predictions... apparently dm2 and dm3 are quite similar. # upload hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql dm3 evofold foldsDm3.bed ############################################################################# # CONTRAST GENES (2007-10-02 markd) # recieved predictions from Sam Gross cd /cluster/data/dm3/bed/contrastGene/ wget http://www.stanford.edu/~ssgross/contrast.dm3.bed # this is a custom track, not a pure BED tail +2 contrast.dm3.bed | hgLoadBed -tab dm3 contrastGene stdin # verify # load track db (ra and contrastGene.html are global # request push of contrastGene ########################################################################### # GENE DISRUPTION PROJECT/PSCREEN (DONE 5/1/08 angie) mkdir /cluster/data/dm3/bed/pscreen cd /cluster/data/dm3/bed/pscreen # Saved Excel spreadsheet emailed by Karen Schulze kschulze at bcm dot # tmc dot edu as tab-separated text. Translate to our table format; dos2unix pscreen.txt tail +2 pscreen.txt \ | perl -we ' \ while (<>) { \ chomp; \ ($stock, $strain, $arm, $end, $ori, undef, undef, \ $gene1, undef, $gene2, undef, $gene3) = split("\t"); \ next if (! $end); \ $chr = "chr" . $arm; \ $strand = ($ori eq "Minus") ? "-" : "+"; \ $geneCount = 0; $geneIds = ""; \ foreach $gene ($gene1, $gene2, $gene3) { \ if ($gene) { $geneCount++; $geneIds .= "$gene,"; } \ } \ print join("\t", $chr, $end-1, $end, $strain, 0, $strand, \ $stock, $geneCount, $geneIds) . "\n"; \ }' \ | sort -k1,1 -k2n,2n \ > pscreen.bed hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pscreen.sql -notItemRgb \ -onServer -tab dm3 pscreen pscreen.bed #Loaded 7672 elements of size 9 rm bed.tab ######################################################################### # hgPal downloads (DONE braney 2008-09-30) ssh hgwdev screen bash rm -rf /cluster/data/dm3/bed/multiz15way/pal mkdir /cluster/data/dm3/bed/multiz15way/pal cd /cluster/data/dm3/bed/multiz15way/pal echo $db > order.lst echo "select settings from trackDb where tableName='$mz'" | \ hgsql $db | sed 's/\\n/\n/g'| grep speciesOrder | tr ' ' '\n' | \ tail -n +2 >> order.lst mz=multiz15way gp=refGene db=dm3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > refGene.jobs time sh -x refGene.jobs > refGene.job.log 2>&1 & sleep 1 tail -f refGene.job.log # real 35m20.802s # user 5m26.007s # sys 1m25.462s zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz15way gp=flyBaseGene db=dm3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 35m13.992s # user 5m18.930s # sys 1m27.205s zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ######################################################################### # LIFTOVER TO DM2 (DONE 6/11/08 angie) doSameSpeciesLiftOver.pl dm3 dm2 > & dm3ToDm2.log & tail -f dm3ToDm2.log mv dm3ToDm2.log /cluster/data/dm3/bed/blat.dm2.2008-06-11/ ######################################################################### ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd -- UNDONE 2008-10-27 markd) update genbank.conf: dm3.upstreamGeneTbl = refGene dm3.upstreamMaf = multiz15way /hive/data/genomes/dm3/bed/multiz15way/species.lst # Those lines have been removed because Ann, Angie and Mark decided to # use FlyBase Genes as the reference set instead of RefSeq. ######################################################################### # FLYBASE 5.12 ANNOTATIONS (DONE 10/21/08 angie) mkdir /hive/data/genomes/dm3/bed/flybase5.11 cd /hive/data/genomes/dm3/bed/flybase5.11 wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.12_FB2008_09/gff/dmel-all-r5.12.gff.gz gunzip dmel-all-r5.12.gff.gz # What data sources are represented in this file? grep -v '^#' dmel-all-r5.12.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c # excerpt (many other sources, including blastx:... , sim4:... and # tblastx:...; also various other types for source "FlyBase" and "."): 72096 FlyBase CDS 69371 FlyBase exon 21140 FlyBase five_prime_UTR 15140 FlyBase gene 21243 FlyBase mRNA 90 FlyBase miRNA 154 FlyBase ncRNA 95 FlyBase pseudogene 161 FlyBase rRNA 47 FlyBase snRNA 249 FlyBase snoRNA 314 FlyBase tRNA 15546 FlyBase three_prime_UTR 5413 FlyBase transposable_element 43178 FlyBase transposable_element_insertion_site # Same types as 5.1, good. # What keywords are defined in the 9th field? (again, ignoring a bunch) grep -v '^#' dmel-all-r5.12.gff \ | awk '{print $9;}' \ | perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \ | sort | uniq -c 79889 Alias 408477 Dbxref 21243 Derives_from 3079555 ID 7236214 Name 4460030 Parent 5376450 Target 1384 cyto_range # Comparable to 5.1, good. cp ../flybase5.3/extractGenes.pl . time ./extractGenes.pl dmel-all-r5.12.gff #Noncoding FBtr0070097 has CG ID (CG18275-RA) #Keywords that have been ignored in one or more line types: #CDS: ID #CDS: Name #exon: Dbxref #exon: ID #exon: Name #gene: Alias #gene: Ontology_term #gene: derived_computed_cyto #gene: fullname #gene: gbunit #mRNA: Alias #mRNA: Name #mRNA: score #mRNA: score_text #noncoding: Alias #noncoding: score #noncoding: score_text #protein: Alias #protein: Dbxref #protein: Name #protein: derived_isoelectric_point #protein: derived_molecular_weight #81.970u 2.490s 1:25.12 99.2% 0+0k 0+0io 0pf+0w mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf # Get predicted proteins (for main annotations only) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.12_FB2008_09/fasta/dmel-all-translation-r5.12.fasta.gz # Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene: zcat dmel-all-translation-r5.12.fasta.gz \ | perl -we 'open(X, "flyBase2004Xref.tab") || die; \ while () { \ @w = split("\t"); \ $fbtr2cgr{$w[3]} = $w[0]; \ } \ while (<>) { \ if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \ $cgr = $fbtr2cgr{$1}; \ if ($cgr) { \ s/^>.*/>$cgr/; \ } else { die; } \ } \ print; \ } \ ' \ > flyBasePep.fa # Load Protein-coding genes: ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf # 21243 groups 14 seqs 1 sources 2 feature types hgPepPred dm3 generic flyBasePep flyBasePep.fa # Noncoding genes: hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed #Loaded 1063 elements of size 12 # Cross-referencing info for both coding and noncoding: hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \ flyBase2004Xref.tab # Make sure there are no joinerCheck errors amongst those 3 tables # at this point, because if left here they can spread via doHgNearBlastp. # Since this is an update of those tables to a new FlyBase version, # and other hgNear tables already exist with the old version's set of # IDs, ignore errors for missing CG items in tables that will be # rebuilt in the hgNear process.: runJoiner.csh dm3 flyBasePep # See if there are still large regions where refGene has genes but # flybase doesn't: featureBits dm3 flyBaseGene -or flyBaseNoncoding -bed=tmp.bed featureBits dm3 refGene \!tmp.bed -minSize=1000 -bed=stdout #279626 bases of 162367812 (0.172%) in intersection # Spot-checked... some transcripts removed here and there, looks OK. # add upstream* downloadable files mkdir /hive/data/genomes/dm3/bed/flybase5.12/bigZips cd /hive/data/genomes/dm3/bed/flybase5.12/bigZips foreach size (1000 2000 5000) echo upstream$size nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \ | nice gzip -c > upstream$size.fa.gz end nice md5sum *.gz > md5sum.txt rm /usr/local/apache/htdocs/goldenPath/dm3/bigZips/upstream?000.fa.gz ln -s /hive/data/genomes/dm3/bed/flybase5.12/bigZips/*.gz \ /usr/local/apache/htdocs/goldenPath/dm3/bigZips/ # When updating, edit out the old upstream* sums from md5sum.txt, then: cat md5sum.txt \ >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt # Now update a bunch of tables: # arbExpDistance, *BlastTab, flyBase{Canonical,Isoforms}, flyBaseTo*, # and any other tables used in conjunction with flyBaseGene in hgGene # or hgNear. ######################################################################### # LIFTOVER BDTNP DATA FROM DM2 (DONE 1/29/09 angie - REDONE with more data 2/2/09, more 9/14/10) ssh hgwdev mkdir /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver cd /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver # Original files are varStep with unspecified step=1 -- translate # to bed3, liftOver, and translate back to varStep: foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/*.wig) echo $f:t:r grep -v ^track $f \ | perl -wpe 'chomp; @w = split(/[ \t=]/); \ if ($w[0] eq "variableStep") { $chr = $w[2]; $_ = ""; } \ elsif (defined $w[1]) { $_ = "$chr\t" . ($w[0]-1) . "\t$w[0]\t$w[1]\n"; }' \ | liftOver -bedPlus=4 stdin /hive/data/genomes/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \ stdout $f:t:r.unmapped \ | sort -k1,1 -k2n,2n \ | perl -wpe 'chomp; @w=split("\t"); next unless $w[3]; \ print "variableStep chrom=$w[0]\n" if (\!defined $prevChr || $prevChr ne $w[0]); \ $_ = "$w[2]\t$w[3]\n"; $prevChr = $w[0];' \ > $f:t:r.lo.wig end wc -l *.unmapped | grep -v '^ 0 ' # 2 bdtnpDl3Fdr25.unmapped # 2 total # This is the one unmapped item: cat bdtnpDl3Fdr25.unmapped ##Deleted in new #chrX 3521682 3521683 0.983636307483747 mkdir wigEncoded cd wigEncoded foreach f (../*Fdr*.lo.wig) wigEncode $f $f:t:r:r.{wig,wib} end # Same output as for dm2, good: #Converted ../bdtnpBcd1Fdr1.wig, upper limit 8.14, lower limit 0.80 #Converted ../bdtnpBcd1Fdr25.wig, upper limit 8.14, lower limit 0.80 #Converted ../bdtnpBcd2Fdr1.wig, upper limit 10.74, lower limit 0.68 #Converted ../bdtnpBcd2Fdr25.wig, upper limit 10.74, lower limit 0.68 #Converted ../bdtnpCad1Fdr1.wig, upper limit 16.20, lower limit 1.06 #Converted ../bdtnpCad1Fdr25.wig, upper limit 16.20, lower limit 0.48 #Converted ../bdtnpGt2Fdr1.wig, upper limit 22.22, lower limit 1.21 #Converted ../bdtnpGt2Fdr25.wig, upper limit 22.22, lower limit 0.49 #Converted ../bdtnpHb1Fdr1.wig, upper limit 14.38, lower limit 0.92 #Converted ../bdtnpHb1Fdr25.wig, upper limit 14.38, lower limit 0.86 #Converted ../bdtnpHb2Fdr1.wig, upper limit 24.25, lower limit 0.88 #Converted ../bdtnpHb2Fdr25.wig, upper limit 24.25, lower limit 0.78 #Converted ../bdtnpKni1Fdr1.wig, upper limit 8.08, lower limit 1.78 #Converted ../bdtnpKni1Fdr25.wig, upper limit 8.08, lower limit 1.01 #Converted ../bdtnpKni2Fdr1.wig, upper limit 8.44, lower limit 1.07 #Converted ../bdtnpKni2Fdr25.wig, upper limit 8.44, lower limit 0.76 #Converted ../bdtnpKr1Fdr1.wig, upper limit 14.47, lower limit 0.68 #Converted ../bdtnpKr1Fdr25.wig, upper limit 14.47, lower limit 0.68 #Converted ../bdtnpKr2Fdr1.wig, upper limit 12.92, lower limit 0.66 #Converted ../bdtnpKr2Fdr25.wig, upper limit 12.92, lower limit 0.66 #Converted ../bdtnpPolIIFdr1.wig, upper limit 30.86, lower limit 0.93 #Converted ../bdtnpPolIIFdr25.wig, upper limit 30.86, lower limit 0.70 #Converted ../bdtnpZ2Fdr1.wig, upper limit 11.49, lower limit 0.95 #Converted ../bdtnpZ2Fdr25.wig, upper limit 11.49, lower limit 0.79 #Converted ../bdtnpD1Fdr1.wig, upper limit 20.33, lower limit 0.51 #Converted ../bdtnpD1Fdr25.wig, upper limit 20.33, lower limit 0.51 #Converted ../bdtnpDa2Fdr1.wig, upper limit 26.09, lower limit 0.79 #Converted ../bdtnpDa2Fdr25.wig, upper limit 26.09, lower limit 0.49 #Converted ../bdtnpDl3Fdr1.wig, upper limit 16.97, lower limit 0.65 #Converted ../bdtnpDl3Fdr25.wig, upper limit 16.97, lower limit 0.62 #Converted ../bdtnpFtz3Fdr1.wig, upper limit 16.54, lower limit 1.07 #Converted ../bdtnpFtz3Fdr25.wig, upper limit 16.54, lower limit 0.84 #Converted ../bdtnpH1Fdr1.wig, upper limit 18.52, lower limit 0.68 #Converted ../bdtnpH1Fdr25.wig, upper limit 18.52, lower limit 0.68 #Converted ../bdtnpH2Fdr1.wig, upper limit 13.23, lower limit 0.68 #Converted ../bdtnpH2Fdr25.wig, upper limit 13.23, lower limit 0.65 #Converted ../bdtnpHkb1Fdr1.wig, upper limit 11.84, lower limit 1.03 #Converted ../bdtnpHkb1Fdr25.wig, upper limit 11.84, lower limit 0.35 #Converted ../bdtnpHkb2Fdr1.wig, upper limit 10.73, lower limit 0.87 #Converted ../bdtnpHkb2Fdr25.wig, upper limit 10.73, lower limit 0.33 #Converted ../bdtnpHkb3Fdr1.wig, upper limit 10.50, lower limit 0.90 #Converted ../bdtnpHkb3Fdr25.wig, upper limit 10.50, lower limit 0.37 #Converted ../bdtnpMad2Fdr1.wig, upper limit 12.58, lower limit 0.67 #Converted ../bdtnpMad2Fdr25.wig, upper limit 12.58, lower limit 0.67 #Converted ../bdtnpMed2Fdr1.wig, upper limit 12.58, lower limit 0.67 #Converted ../bdtnpMed2Fdr25.wig, upper limit 12.58, lower limit 0.67 #Converted ../bdtnpPrd1Fdr1.wig, upper limit 13.97, lower limit 0.78 #Converted ../bdtnpPrd1Fdr25.wig, upper limit 13.97, lower limit 0.78 #Converted ../bdtnpPrd2Fdr1.wig, upper limit 12.00, lower limit 0.84 #Converted ../bdtnpPrd2Fdr25.wig, upper limit 12.00, lower limit 0.84 #Converted ../bdtnpRun1Fdr1.wig, upper limit 10.02, lower limit 0.60 #Converted ../bdtnpRun1Fdr25.wig, upper limit 10.02, lower limit 0.60 #Converted ../bdtnpRun2Fdr1.wig, upper limit 7.58, lower limit 1.16 #Converted ../bdtnpRun2Fdr25.wig, upper limit 7.58, lower limit 0.58 #Converted ../bdtnpShn2Fdr1.wig, upper limit 12.62, lower limit 1.19 #Converted ../bdtnpShn2Fdr25.wig, upper limit 12.62, lower limit 0.81 #Converted ../bdtnpShn3Fdr1.wig, upper limit 10.28, lower limit 1.09 #Converted ../bdtnpShn3Fdr25.wig, upper limit 10.28, lower limit 0.95 #Converted ../bdtnpSlp11Fdr1.wig, upper limit 40.39, lower limit 1.06 #Converted ../bdtnpSlp11Fdr25.wig, upper limit 40.39, lower limit 0.62 #Converted ../bdtnpSna1Fdr1.wig, upper limit 8.41, lower limit 0.97 #Converted ../bdtnpSna1Fdr25.wig, upper limit 8.41, lower limit 0.77 #Converted ../bdtnpSna2Fdr1.wig, upper limit 14.30, lower limit 0.57 #Converted ../bdtnpSna2Fdr25.wig, upper limit 14.30, lower limit 0.57 #Converted ../bdtnpTFIIB1Fdr1.wig, upper limit 21.81, lower limit 0.59 #Converted ../bdtnpTFIIB1Fdr25.wig, upper limit 21.81, lower limit 0.59 #Converted ../bdtnpTll1Fdr1.wig, upper limit 16.66, lower limit 0.98 #Converted ../bdtnpTll1Fdr25.wig, upper limit 16.66, lower limit 0.57 #Converted ../bdtnpTwi1Fdr1.wig, upper limit 26.11, lower limit 0.77 #Converted ../bdtnpTwi1Fdr25.wig, upper limit 26.11, lower limit 0.68 #Converted ../bdtnpTwi2Fdr1.wig, upper limit 26.66, lower limit 0.79 #Converted ../bdtnpTwi2Fdr25.wig, upper limit 26.66, lower limit 0.55 mkdir /gbdb/dm3/bdtnp ln -s /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver/wigEncoded/*.wib /gbdb/dm3/bdtnp/ cd /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver/wigEncoded foreach f (*.wig) hgLoadWiggle -pathPrefix=/gbdb/dm3/bdtnp dm3 $f:r $f end rm wiggle.tab # New data 8/27, 8/30/10, 9/14/10 mkdir /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver/100827 cd /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver/100827 # DNase and ChIP/chip: foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/100819/*.bw) echo $f:t:r bigWigToBedGraph $f stdout \ | liftOver -bedPlus=4 -minMatch=0.5 stdin /hive/data/genomes/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \ stdout $f:t:r.unmapped \ | sort -k1,1 -k2n,2n \ | perl -wpe 'chomp; @w=split("\t"); next unless defined $w[3]; \ print "variableStep chrom=$w[0]\n" if (\!defined $prevChr || $prevChr ne $w[0]); \ $_ = "$w[2]\t$w[3]\n"; $prevChr = $w[0];' \ | wigToBigWig -clip stdin /hive/data/genomes/dm3/chrom.sizes $f:t:r.bw end foreach f (bdtnp*.bw) ln -s `pwd`/$f /gbdb/dm3/bdtnp/$f hgBbiDbLink dm3 $f:t:r /gbdb/dm3/bdtnp/$f end # Accessible regions: foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/100819/5%*.txt) set t = `echo "$f" | perl -wpe 's/^.*stage (\d+).txt/bdtnpDnaseAccS$1/'` echo $f --\> $t tail -n +3 "$f" \ | awk '{print $1, ($2-1), $3;}' \ | liftOver -bedPlus=3 -minMatch=0.5 stdin /hive/data/genomes/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \ stdout $t.unmapped \ | hgLoadBed dm3 $t stdin end # How many unmapped? With default minMatch, we only lost one ChIP/chip region, but # lost ~235k DNase datapoints. Lowering minMatch to 0.5 didn't really help (~232k): wc -l *.unmapped | egrep -v '^ +0 ' # 465424 bdtnpDnaseS10R8816.unmapped # 465424 bdtnpDnaseS10R8820.unmapped # 465424 bdtnpDnaseS11R9485.unmapped # 465424 bdtnpDnaseS11R9486.unmapped # 465424 bdtnpDnaseS14R9477.unmapped # 465424 bdtnpDnaseS14R9478.unmapped # 465424 bdtnpDnaseS5R9481.unmapped # 465424 bdtnpDnaseS5R9482.unmapped # 465424 bdtnpDnaseS9R9127.unmapped # 465424 bdtnpDnaseS9R9128.unmapped ######################################################################### # AFFYMETRIX TIMECOURSE (DONE 6/10/10 Fan) ssh hgwdev mkdir /hive/data/genomes/dm3/bed/affyDrosDev cd /hive/data/genomes/dm3/bed/affyDrosDev # First, build the "Affymetrix Drosophila Development Transfrags" track # get dm2 to dm3 lift over file zcat /gbdb/dm2/liftOver/dm2ToDm3.over.chain.gz > dm2ToDm3.over.chain # lift dm2 affyDrosDevTransfrags$t.bed to dm3 affyDrosDevTransfrags$t.bed foreach t (1 2 3 4 5 6 7 8 9 10 11 12) liftOver /hive/data/genomes/dm2/bed/affyDrosDev/affyDrosDevTransfrags$t.bed dm2ToDm3.over.chain dm3.affyDrosDevTransfrags$t.bed unMappedi$t.bed end #load tables into dm3 db foreach t (1 2 3 4 5 6 7 8 9 10 11 12) hgLoadBed dm3 affyDrosDevTransfrags$t dm3.affyDrosDevTransfrags$t.bed end # Copy over the affyDrosDevTransfrags section of # kent/src/hg/makeDb/trackDb/drosophila/dm2/trackDb.ra and insert it into # kent/src/hg/makeDb/trackDb/drosophila/dm3/trackDb.ra # Copy kent/src/hg/makeDb/trackDb/drosophila/dm2/affyDrosDevTransfrags.html to # kent/src/hg/makeDb/trackDb/drosophila/dm3/affyDrosDevTransfrags.html and # add a sentence in the Methods section to indicate it is based on dm2 data # lifted over to dm3. # make update DBS=dm3 # Now, build the "Affymetrix Drosophila Development Signal" track # make a dm2 subdirectory with associated subdirs of dm2 chroms to store some intermediate data foreach chr (`awk '{print $1;}' /hive/data/genomes/dm2/chrom.sizes`) mkdir -p dm2/$chr end # create a temp table signalTemp in dm3 with: cat << '_EOF_' > signalTemp.sql CREATE TABLE signalTemp ( pos int(10) unsigned NOT NULL default '0', val varchar(255) ) TYPE=MyISAM; '_EOF_' hgsql dm3 genBed4 hgsql dm3 -e 'delete from signalTemp' echo processing $1 $2 hgsql dm3 -e \ "load data local infile '/hive/data/genomes/dm2/bed/affyDrosDev/${1}/Dro_Total_AS_${2}_C01.sig.gr' into table signalTemp" hgsql dm3 -N -e "select '${1}', pos-1, pos, val from signalTemp " > dm2/$1/affyDrosDevSignal$2.bed '_EOF_' # create bed 4 files for dm2 affyDrosDevSignal data so that we can lift them to dm3 foreach chr (`awk '{print $1;}' /hive/data/genomes/dm2/chrom.sizes`) foreach t (1 2 3 4 5 6 7 8 9 10 11 12) genBed4 $chr $t end end # lift over the dm2 bed 4 files into dm3 coordinates foreach t (1 2 3 4 5 6 7 8 9 10 11 12) mkdir -p dm3/$t foreach chr (`awk '{print $1;}' /hive/data/genomes/dm2/chrom.sizes`) echo echo processing $t $chr ... liftOver dm2/$chr/affyDrosDevSignal$t.bed dm2ToDm3.over.chain dm3/$t/$t.dm2_$chr.affyDrosDevSignal.bed \ dm3/$t/unMapped.$t.dm2.$chr.bed end cat dm3/$t/$t.dm2_*.affyDrosDevSignal.bed|bedSort stdin dm3/$t.sorted.bed end # The lift overed data contains overlapping data points, when converted to wig files, # they will failure during subsequent wig file processing. So, we need to # remove overlapping data points in the bed files first. # create a script, genSortedUniq. This script removes the duplicate entries with identical chrom and chromStart cat << '_EOF_' > genSortedUniq cut -f1-3 $1.sorted.bed >j1 cut -f1,2 $1.sorted.bed |sed -e 's/\t/_/' >j2 cut -f4 $1.sorted.bed >j3 paste j1 j2 j3 >$1.tmp.bed hgsql dm3 -e 'delete from affyTTmp' hgsql dm3 -e "load data local infile '${1}.tmp.bed' into table affyTTmp" hgsql dm3 -N -e 'select name, chrom, chromStart, chromEnd, val from affyTTmp' > j.in rmDupCol1 j.in >j.tmp bedSort j.tmp $1.sorted.uniq.bed rm $1.tmp.bed '_EOF_' # Run genSortedUniq on each time interval cd dm3 foreach t (1 2 3 4 5 6 7 8 9 10 11 12) echo processing $t ../genSortedUniq $t end cd .. # encode wig files. wigEncode actually works with bed 4 file as input. foreach t (1 2 3 4 5 6 7 8 9 10 11 12) wigEncode dm3/$t.sorted.uniq.bed affyDrosDevSignal$t.wi{g,b} end #Converted dm3/1.sorted.uniq.bed, upper limit 1156.00, lower limit -860.00 #Converted dm3/2.sorted.uniq.bed, upper limit 1499.75, lower limit -580.50 #Converted dm3/3.sorted.uniq.bed, upper limit 1627.50, lower limit -586.00 #Converted dm3/4.sorted.uniq.bed, upper limit 1794.50, lower limit -718.50 #Converted dm3/5.sorted.uniq.bed, upper limit 2277.25, lower limit -913.00 #Converted dm3/6.sorted.uniq.bed, upper limit 2095.25, lower limit -864.00 #Converted dm3/7.sorted.uniq.bed, upper limit 1071.75, lower limit -560.50 #Converted dm3/8.sorted.uniq.bed, upper limit 1182.50, lower limit -1019.00 #Converted dm3/9.sorted.uniq.bed, upper limit 2028.75, lower limit -942.50 #Converted dm3/10.sorted.uniq.bed, upper limit 1212.00, lower limit -779.00 #Converted dm3/11.sorted.uniq.bed, upper limit 1456.75, lower limit -454.50 #Converted dm3/12.sorted.uniq.bed, upper limit 1203.50, lower limit -459.50 mkdir /gbdb/dm3/affyDrosDev ln -s /hive/data/genomes/dm3/bed/affyDrosDev/affyDrosDevSignal*.wib \ /gbdb/dm3/affyDrosDev # load tables foreach t (1 2 3 4 5 6 7 8 9 10 11 12) hgLoadWiggle dm3 affyDrosDevSignal$t \ -pathPrefix=/gbdb/dm3/affyDrosDev affyDrosDevSignal$t.wig end rm bed.tab wiggle.tab # Copy over the affyDrosDevSignal section of # kent/src/hg/makeDb/trackDb/drosophila/dm2/trackDb.ra and insert it into # kent/src/hg/makeDb/trackDb/drosophila/dm3/trackDb.ra # Copy kent/src/hg/makeDb/trackDb/drosophila/dm2/affyDrosDevSignal.html to # kent/src/hg/makeDb/trackDb/drosophila/dm3/affyDrosDevSignal.html and # add a sentence in the Methods section to indicate it is based on dm2 data # lifted over to dm3. ####################################################################################### # Update BlastTab tables for dm3 (Done, Fan, 8/6/2010) ssh hgwdev mkdir -p /hive/data/genomes/dm3/bed/hgNearBlastp cd /hive/data/genomes/dm3/bed/hgNearBlastp mkdir 100806 cd 100806 # Get the proteins used by the other hgNear organisms: pepPredToFa hg19 knownGenePep hg19.known.faa pepPredToFa mm9 knownGenePep mm9.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer6 ensPep danRer6.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce6 sangerPep ce6.sangerPep.faa pepPredToFa sacCer2 sgdPep sacCer2.sgdPep.faa cat << _EOF_ > config.ra # Latest fly vs. other Gene Sorter orgs: # human, mouse, rat, zebrafish, worm, yeast targetGenesetPrefix flyBase targetDb dm3 queryDbs hg19 mm9 rn4 danRer6 ce6 sacCer2 recipBest hg19 mm9 rn4 danRer6 ce6 sacCer2 dm3Fa /hive/data/genomes/dm3/bed/hgNearBlastp/100806/dm3.flyBasePep.faa hg19Fa /hive/data/genomes/dm3/bed/hgNearBlastp/100806/hg19.known.faa mm9Fa /hive/data/genomes/dm3/bed/hgNearBlastp/100806/mm9.known.faa rn4Fa /hive/data/genomes/dm3/bed/hgNearBlastp/100806/rn4.known.faa danRer6Fa /hive/data/genomes/dm3/bed/hgNearBlastp/100806/danRer6.ensPep.faa ce6Fa /hive/data/genomes/dm3/bed/hgNearBlastp/100806/ce6.sangerPep.faa sacCer2Fa /hive/data/genomes/dm3/bed/hgNearBlastp/100806/sacCer2.sgdPep.faa buildDir /hive/data/genomes/dm3/bed/hgNearBlastp/100806 scratchDir /hive/data/genomes/dm3/bed/hgNearBlastp/100806/tmp _EOF_ doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log ########################################################################### # BLASTZ/CHAIN/NET DROVIR3 (DONE 2012-03-01 - Chin) mkdir /cluster/data/dm3/bed/blastz.droVir2.2012-03-01 cd /cluster/data/dm3/bed/blastz.droVir2.2012-03-01 cat << '_EOF_' > DEF # D. melanogaster vs. D. virilis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/scratch/data/dm3/dm3.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm3/chrom.sizes # QUERY - D. virilis SEQ2_DIR=/scratch/data/droVir2/droVir2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droVir2/chrom.sizes BASE=/cluster/data/dm3/bed/blastz.droVir2.2012-03-01 '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noDbNameCheck -chainMinScore=3000 \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # real 11m58.080s tail do.log # *** All done ! Elapsed time: 11m58s # *** Make sure that goldenPath/dm3/vsDroVir2/README.txt is accurate. # *** Add {chain,net}DroVir2 tracks to trackDb.ra if necessary. featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroVir2Link # flyBaseLiftGene 22.721%, chainDroVir2Link 61.821%, both 20.304%, # cover 89.36%, enrich 1.45x cat fb.dm3.chainDroVir2Link.txt # 75854911 bases of 162367812 (46.718%) in intersection cd /cluster/data/dm3/bed rm -f /cluster/data/dm3/bed/blastz.droVir2 ln -s blastz.droVir2.2012-03-01 /cluster/data/dm3/bed/blastz.droVir2 # swap droVir2 mkdir /cluster/data/droVir2/bed/blastz.dm3.swap cd /cluster/data/droVir2/bed/blastz.dm3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/dm3/bed/blastz.droVir2/DEF \ -swap -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 > swap.log 2>&1 & # real 10m3.062s tail swap.log # *** All done ! Elapsed time: 10m3s # *** Make sure that goldenPath/droVir2/vsDm3/README.txt is accurate. # *** Add {chain,net}Dm3 tracks to trackDb.ra if necessary. cat fb.droVir2.chainDm3Link.txt # 83468526 bases of 189914823 (43.951%) in intersection ######################################################################### ##########################################################################pubStart # Publications track (DONE - 04-27-12 - Max) # article download and conversion is run every night on hgwdev: # 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh # the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/ # then converts them to text into /hive/data/outside/literature/{pmc,elsevier} # all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py # data processing was run manually like this export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/ # pmc cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc ssh swarm cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat steps:annot-tables exit pubBlat load # elsevier cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier ssh swarm cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat steps:annot-tables exit pubBlat load #--pubEnd ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE 2012-05-25 braney) # bash if not using bash shell already cd /cluster/data/dm3 mkdir /cluster/data/dm3/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst dm3.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst dm3.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 179 mkdir -p /cluster/data/dm3/bed/tblastn.hg18KG cd /cluster/data/dm3/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 179 query.lst # we want around 50000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(50000/`wc query.lst | awk '{print $1}'`\) # 36727/(50000/179) = 131.482660 mkdir -p kgfa split -l 132 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/dm3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm3/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh swarm cd /cluster/data/dm3/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 49941 of 49941 jobs # CPU time in finished jobs: 978939s 16315.65m 271.93h 11.33d 0.031 y # IO & Wait Time: 202756s 3379.27m 56.32h 2.35d 0.006 y # Average job time: 24s 0.39m 0.01h 0.00d # Longest finished job: 1078s 17.97m 0.30h 0.01d # Submission to last job: 9181s 153.02m 2.55h 0.11d ssh swarm cd /cluster/data/dm3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 279 of 279 jobs # CPU time in finished jobs: 1312s 21.87m 0.36h 0.02d 0.000 y # IO & Wait Time: 2781s 46.35m 0.77h 0.03d 0.000 y # Average job time: 15s 0.24m 0.00h 0.00d # Longest finished job: 42s 0.70m 0.01h 0.00d # Submission to last job: 109s 1.82m 0.03h 0.00d cd /cluster/data/dm3/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 24896 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/dm3/bed/tblastn.hg18KG hgLoadPsl dm3 blastHg18KG.psl # check coverage featureBits dm3 blastHg18KG # 5948260 bases of 162367812 (3.663%) in intersection featureBits dm2 blastHg18KG # 5946352 bases of 131698467 (4.515%) in intersection featureBits dm3 flyBaseGene blastHg18KG -enrichment # flyBaseGene 18.293%, blastHg18KG 3.663%, both 3.477%, cover 19.01%, enrich 5.19x rm -rf blastOut #end tblastn