# for emacs: -*- mode: sh; -*- # Drosophila Melanogaster -- # # euchromatin (2L, 2R, 3L, 3R, 4, X): # Berkeley Drosophila Genome Project (fruitfly.org) release 4 (Apr. 2004) # http://www.fruitfly.org/annot/release4.html # # heterochromatin (2h, 3h, 4h, Xh, Yh, U): # Drosophila Heterochromatin Genome Project (dhgp.org) release 3.2 # (submitted to GenBank June 2004) # http://www.dhgp.org/index_release_notes.html # # Gene annotations: # FlyBase (http://flybase.bio.indiana.edu/) last updated ??? # # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+-------------------------+ # | tableName | type | # +-------------+-------------------------+ # | flyBaseGene | genePred flyBasePep | # | refGene | genePred refPep refMrna | # | nscanGene | genePred | # | geneid | genePred geneidPep | # | genscan | genePred genscanPep | # | augustus | genePred | # +-------------+-------------------------+ # DOWNLOAD SEQUENCE (DONE 9/8/04 angie) ssh kksilo mkdir /cluster/store8/dm2 ln -s /cluster/store8/dm2 /cluster/data/dm2 cd /cluster/data/dm2 wget ftp://ftp.fruitfly.org/pub/download/compressed/na_euchromatin_genomic_dmel_RELEASE4.FASTA.gz zcat na_euchromatin_genomic_dmel_RELEASE4.FASTA.gz \ | faSplit byname stdin dummyArg # Follow FlyBase's lead on the chromosome names, but still use our # "chr" prefix: foreach c (2L 2R 3L 3R 4 X) mkdir $c sed -e 's/^>arm_/>chr/' arm_$c.fa > $c/chr$c.fa echo arm_$c.fa size: faSize arm_$c.fa echo $c/chr$c.fa size: faSize $c/chr$c.fa echo comparison: faCmp arm_$c.fa $c/chr$c.fa echo "" end # heterochromatin sold separately... ftp://ftp.dhgp.org/pub/DHGP still # has release 3.2 as its latest. But that's June 2004... wtf, grab it: wget ftp://ftp.dhgp.org/pub/DHGP/Release3.2/FASTA/super-scaffolds/heterochromatin_super-scaffolds-genomic_dmel_RELEASE3.2.FASTA.tar.gz tar xvzf heterochromatin_super-scaffolds-genomic_dmel_RELEASE3.2.FASTA.tar.gz foreach c (2h 3h 4h Xh Yh U) mkdir $c perl -wpe 's/^>(\w+).*/>chr$1/' $c.FASTA > $c/chr$c.fa echo $c.FASTA size: faSize $c.FASTA echo $c/chr$c.fa size: faSize $c/chr$c.fa echo comparison: faCmp $c.FASTA $c/chr$c.fa echo "" end # Carefully review output of those commands, then: rm *.fa *.FASTA # put away the big download files mkdir downloads mv *.gz downloads/ # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 9/9/04 angie) mkdir /cluster/data/dm2/M cd /cluster/data/dm2/M # go to http://www.ncbi.nih.gov/ and search Nucleotide for # "drosophila melanogaster mitochondrion genome". That shows the gi number: # 5835233 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5835233&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Drosophila melanogaster mitochondrion complete genome, and then replace the # header line with just ">chrM". # SPLIT CHROM FA INTO SMALLER CHUNKS BY GAPS (DONE 9/9/04 angie) ssh kksilo cd /cluster/data/dm2 foreach c (?{,?}) faSplit -minGapSize=100 -lift=$c/chr$c.lft \ gap $c/chr$c.fa 2000000 $c/chr${c}_ end foreach ctgFa (?{,?}/chr*_*.fa) set ctg = $ctgFa:r mkdir $ctg mv $ctgFa $ctg end # CREATING DATABASE (DONE 9/9/04 angie) ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql #/dev/sdc1 1.8T 535G 1.1T 33% /var/lib/mysql # Create the database. 
hgsql '' -e 'create database dm2' # EXTRACT GAP INFORMATION FROM FASTA, LOAD GAP TRACK (DONE 9/9/04 angie) ssh kksilo cd /cluster/data/dm2 # Do as Jim suggested back when we got dm1 sequence: # I think that we can probably just show all gaps as bridged # in the non-h chromosomes, and as unbridged in the h chromosomes # and leave it at that. # Extract gaps using scaffoldFaToAgp. It's really meant for a different # purpose, so clean up its output: remove the .lft and .agp, and remove # the last line of .gap (extra gap added at end). Also substitute in # the correct chrom name in .gap. foreach c (?{,?}) set chr = chr$c pushd $c mv $chr.lft $chr.lft.bak scaffoldFaToAgp -minGapSize=100 $chr.fa rm $chr.{lft,agp} mv $chr.lft.bak $chr.lft set chrSize = `faSize $chr.fa | awk '{print $1;}'` set origLines = `cat $chr.gap | wc -l` awk '($2 != '$chrSize'+1) {print;}' $chr.gap \ | sed -e "s/chrUn/$chr/" > $chr.gap2 set newLines = `cat $chr.gap2 | wc -l` if ($newLines == ($origLines - 1)) then mv $chr.gap2 $chr.gap else echo "Error: $chr/$chr.gap2 has wrong number of lines." endif popd end # Call the gaps unbridged in chrU and chr*h: foreach c (U ?h) set chr = chr$c sed -e 's/yes/no/' $c/$chr.gap > $c/$chr.gap2 mv $c/$chr.gap2 $c/$chr.gap end ssh hgwdev hgLoadGap dm2 /cluster/data/dm2 # MAKE JKSTUFF AND BED DIRECTORIES (DONE 9/9/04 angie) # This used to hold scripts -- better to keep them inline in the .doc # so they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/dm2/jkStuff # This is where most tracks will be built: mkdir /cluster/data/dm2/bed # MAKE LIFTALL.LFT (DONE 9/9/04 angie) ssh kksilo cd /cluster/data/dm2 cat ?{,?}/chr*.lft > jkStuff/liftAll.lft # RUN REPEAT MASKER (DONE 9/9/04 angie) # Note: drosophila library ("drosophila.lib") is dated May 27 '03. # Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make # RepeatMasker runs manageable on the cluster ==> results need lifting. # Split contigs into 500kb chunks: ssh kksilo cd /cluster/data/dm2 foreach d ( */chr*_?{,?} ) cd $d set contig = $d:t faSplit -minGapSize=100 -lift=$contig.lft -maxN=500000 \ gap $contig.fa 500000 ${contig}_ cd ../.. end #- Make the run directory and job list: cd /cluster/data/dm2 cat << '_EOF_' > jkStuff/RMDrosophila #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/dm2/$2 /bin/cp $2 /tmp/dm2/$2/ cd /tmp/dm2/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2 popd /bin/cp /tmp/dm2/$2/$2.out ./ if (-e /tmp/dm2/$2/$2.tbl) /bin/cp /tmp/dm2/$2/$2.tbl ./ if (-e /tmp/dm2/$2/$2.cat) /bin/cp /tmp/dm2/$2/$2.cat ./ /bin/rm -fr /tmp/dm2/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/dm2/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/dm2 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMDrosophila mkdir RMRun cp /dev/null RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/dm2/jkStuff/RMDrosophila \ /cluster/data/dm2/$d $f /cluster/data/dm2/$d \ '{'check out line+ /cluster/data/dm2/$d/$f.out'}' \ >> RMRun/RMJobs end end # do the run ssh kk9 cd /cluster/data/dm2/RMRun para create RMJobs para try, check, push, check,... 
#Completed: 288 of 288 jobs #Average job time: 3169s 52.82m 0.88h 0.04d #Longest job: 4752s 79.20m 1.32h 0.06d #Submission to last job: 13769s 229.48m 3.82h 0.16d # Lift up the split-contig .out's to contig-level .out's ssh kksilo cd /cluster/data/dm2 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end # Lift up the contig-level .out's to chr-level foreach c (?{,?}) cd $c if (-e chr$c.lft && ! -z chr$c.lft) then echo lifting $c /cluster/bin/i386/liftUp chr$c.fa.out chr$c.lft warn \ `awk '{print $2"/"$2".fa.out";}' chr$c.lft` > /dev/null else echo Can\'t find $c/chr$c.lft \! endif cd .. end # Load the .out files into the database with: ssh hgwdev hgLoadOut dm2 /cluster/data/dm2/?{,?}/*.fa.out # SIMPLE REPEATS (TRF) (DONE 9/9/04 angie) ssh kksilo mkdir /cluster/data/dm2/bed/simpleRepeat cd /cluster/data/dm2/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach f (/cluster/data/dm2/?{,?}/chr*_*/chr?{,?}_?{,?}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end tcsh jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l # When job is done do: liftUp simpleRepeat.bed /cluster/data/dm2/jkStuff/liftAll.lft warn \ trf/*.bed # Load this into the database as so ssh hgwdev hgLoadBed dm2 simpleRepeat \ /cluster/data/dm2/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 9/9/04 angie) # make a filtered version # of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/dm2/bed/simpleRepeat mkdir -p trfMask foreach f (trf/*.bed) echo "filtering $f" awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/dm2 mkdir bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.lft warn \ `awk '{print "bed/simpleRepeat/trfMask/"$2".bed";}' $c/chr$c.lft` end # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 9/9/04 angie) ssh kksilo cd /cluster/data/dm2 foreach c (?{,?}) echo repeat- and trf-masking chr$c.fa /cluster/home/kent/bin/i386/maskOutFa -soft \ $c/chr$c.fa $c/chr$c.fa.out $c/chr$c.fa /cluster/home/kent/bin/i386/maskOutFa -softAdd \ $c/chr$c.fa bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.fa end foreach c (?{,?}) echo repeat- and trf-masking contigs of chr$c foreach ctgFa ($c/chr*/chr${c}_?{,?}.fa) set trfMask=bed/simpleRepeat/trfMask/$ctgFa:t:r.bed /cluster/home/kent/bin/i386/maskOutFa -soft $ctgFa $ctgFa.out $ctgFa /cluster/home/kent/bin/i386/maskOutFa -softAdd $ctgFa $trfMask $ctgFa end end # STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 9/9/04 angie) # Translate to nib ssh kksilo cd /cluster/data/dm2 mkdir nib foreach c (?{,?}) faToNib -softMask $c/chr$c.fa nib/chr$c.nib end # Make symbolic links from /gbdb/dm2/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/dm2/nib foreach f (/cluster/data/dm2/nib/chr*.nib) ln -s $f /gbdb/dm2/nib end # Load /gbdb/dm2/nib paths into database and save size info. 
hgsql dm2 < ~/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib dm2 /gbdb/dm2/nib /cluster/data/dm2/?{,?}/chr?{,?}.fa echo "select chrom,size from chromInfo" | hgsql -N dm2 \ > /cluster/data/dm2/chrom.sizes # CREATING GRP TABLE FOR TRACK GROUPING (DONE 9/9/04 angie) # Copy all the data from the table "grp" # in the existing database dm1 to the new database ssh hgwdev hgsql dm2 -e "create table grp (PRIMARY KEY(NAME)) select * from dm1.grp" # MAKE GCPERCENT (DONE 2/2/04 angie) ssh hgwdev mkdir /cluster/data/dm2/bed/gcPercent cd /cluster/data/dm2/bed/gcPercent # create and load gcPercent table hgsql dm2 < ~/src/hg/lib/gcPercent.sql hgGcPercent dm2 ../../nib # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DROSOPHILA (DONE 9/9/04 angie) # Warning: must genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("dm2", "Apr. 2004", "/gbdb/dm2/nib", "D. melanogaster", \ "chr2L:825964-851061", 1, 55, "D. melanogaster", \ "Drosophila melanogaster", "/gbdb/dm2/html/description.html", \ 0, 0, "BDGP v. 4 / DHGP v. 3.2");' \ | hgsql -h genome-testdb hgcentraltest echo 'update defaultDb set name = "dm2" where genome = "D. melanogaster"' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add dm2 to DBS and do make update # go public on genome-test cvs commit makefile make alpha # Add trackDb directories and description.html mkdir drosophila/dm2 cvs add drosophila/dm2 # Write ~/kent/src/hg/makeDb/trackDb/drosophila/dm2/description.html # with a description of the assembly and some sample position queries. chmod a+r drosophila/dm2/description.html # Check it in and copy (via "make alpha" in trackDb/) to # /cluster/data/dm2/html/. cvs add drosophila/dm2/description.html cvs commit drosophila/dm2 mkdir -p /gbdb/dm2/html make alpha # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DROSOPHILA (DONE 2/?/05 galt/angie) ssh hgwdev # Get appropriate hostname and port numbers from cluster admins: echo 'insert into blatServers values("dm2", "blat14", 17794, 1, 0); \ insert into blatServers values("dm2", "blat14", 17795, 0, 1);' \ | hgsql -h genome-testdb hgcentraltest # PUT NIBS ON ISCRATCH (DONE 9/9/04 angie) ssh kkr1u00 mkdir /iscratch/i/dm2 cd /iscratch/i/dm2 cp -pR /cluster/data/dm2/nib . iSync # Added "contigs" (chunks) 9/16/04 mkdir maskedContigs cp -p /cluster/data/dm2/*/chr*_*/chr?{,?}_?{,?}.fa maskedContigs iSync # AUTO UPDATE GENBANK MRNA RUN (DONE 2/1/05 angie) # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in # edits, check them in if necessary: diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf # Edit etc/genbank.conf and add these lines: # dm2 (D. 
melanogaster) dm2.genome = /iscratch/i/dm2/nib/chr*.nib dm2.lift = /cluster/data/dm2/jkStuff/liftAll.lft dm2.genbank.mrna.xeno.load = yes dm2.genbank.est.xeno.load = no dm2.downloadDir = dm2 cvs ci etc/genbank.conf # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, refseq only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial dm2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -drop -initialLoad dm2 featureBits dm2 refGene #28230571 bases of 131698467 (21.436%) in intersection # Clean up: rm -rf work/initial.dm2 ssh eieio cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial dm2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -drop -initialLoad dm2 featureBits dm2 mrna #23752517 bases of 131698467 (18.036%) in intersection featureBits dm2 xenoMrna #5943805 bases of 131698467 (4.513%) in intersection # Clean up: rm -rf work/initial.dm2 ssh eieio # -initial for ESTs: nice bin/gbAlignStep -srcDb=genbank -type=est -initial dm2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep dm2 featureBits dm2 intronEst #12644283 bases of 131698467 (9.601%) in intersection featureBits dm2 est #30993609 bases of 131698467 (23.534%) in intersection # Clean up: rm -rf work/initial.dm2 # PRODUCING GENSCAN PREDICTIONS (DONE 9/10/04 angie) # Check out hg3rdParty/genscanlinux to get latest genscan: ssh hgwdev mkdir /cluster/data/dm2/bed/genscan cd /cluster/data/dm2/bed/genscan cvs co hg3rdParty/genscanlinux ssh kksilo cd /cluster/data/dm2/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Make hard-masked contigs foreach f (/cluster/data/dm2/?{,?}/chr*/chr?{,?}_?{,?}.fa) maskOutFa $f hard $f.masked end # Generate a list file, contigs.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f contigs.list touch contigs.list foreach f ( `ls -1S /cluster/data/dm2/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> contigs.list end cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 contigs.list single gsub jobList # Run on small cluster -- genscan needs big mem. ssh kki cd /cluster/data/dm2/bed/genscan para create jobList para try, check, push, check, ... #Completed: 81 of 81 jobs #Average job time: 48s 0.80m 0.01h 0.00d #Longest job: 212s 3.53m 0.06h 0.00d #Submission to last job: 589s 9.82m 0.16h 0.01d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". 
# Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/dm2/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/dm2/bed/genscan ldHgGene -gtf dm2 genscan genscan.gtf hgPepPred dm2 generic genscanPep genscan.pep hgLoadBed dm2 genscanSubopt genscanSubopt.bed featureBits dm2 genscan #24711981 bases of 131698467 (18.764%) in intersection # MYTOUCH FIX - Jen - 2006-01-24 sudo mytouch dm2 genscanPep 0412141800.00 # MAKE DOWNLOADABLE FILES (DONE 1/26/05 angie) ssh kksilo cd /cluster/data/dm2 mkdir zips zip -j zips/chromOut.zip ?{,?}/chr?{,?}.fa.out zip -j zips/chromFa.zip ?{,?}/chr?{,?}.fa foreach f (?{,?}/chr?{,?}.fa) maskOutFa $f hard $f.masked end zip -j zips/chromFaMasked.zip ?{,?}/chr?{,?}.fa.masked cd bed/simpleRepeat zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed cd ../.. # Make a starter mrna.zip -- it will get updated regularly on the RR. /cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank \ -db=dm2 -native genbank mrna zips/mrna.fa gzip zips/mrna.fa foreach f (zips/*.zip) echo $f unzip -t $f | tail -1 end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm2 cd /usr/local/apache/htdocs/goldenPath/dm2 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cp -p /cluster/data/dm2/zips/*.{zip,gz} bigZips cd bigZips md5sum *.{zip,gz} > md5sum.txt # BLASTZ D.PSEUDOOBSCURA (DONE 9/10/04 angie) # CHAIN PSEUDOOBSCURA BLASTZ (DONE 9/10/04 angie) # NET PSEUDOOBSCURA BLASTZ (DONE 9/11/04 angie) # GENERATE DP2 MAF FOR MULTIZ FROM NET (DONE 9/11/04 angie) # MAKE VSDP2 DOWNLOADABLES (DONE 2/15/05 angie) # -- dp2; Originally run with default blastz params; obsoleted by dp3 which # was later run with better params, below. # BLASTZ ANOPHELES (DONE 9/13/04 angie) # CHAIN ANOPHELES BLASTZ (DONE 9/13/04 angie) # NET ANOPHELES BLASTZ (DONE 9/13/04 angie) # MAKE VSANOGAM1 DOWNLOADABLES (DONE 1/28/05 angie) # GENERATE ANOGAM1 MAF FOR MULTIZ FROM NET (DONE 9/13/04 angie) # -- Originally run with "human-fugu" blastz params; replaced by a run with # better params, below. 
# MULTIZ MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/22/04 angie) # put the MAFs on bluearc ssh kksilo mkdir -p /cluster/bluearc/multiz.flymo/my cp /cluster/data/dm2/bed/blastz.droYak1.2004-09-10/mafNet/*.maf \ /cluster/bluearc/multiz.flymo/my mkdir -p /cluster/bluearc/multiz.flymo/mp cp /cluster/data/dm2/bed/blastz.dp2.2004-09-10/mafNet/*.maf \ /cluster/bluearc/multiz.flymo/mp mkdir -p /cluster/bluearc/multiz.flymo/ma cp /cluster/data/dm2/bed/blastz.anoGam1.2004-09-13/mafNet/*.maf \ /cluster/bluearc/multiz.flymo/ma ssh kki mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1 cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1 mkdir mypa # Use PSU's new var_multiz: cat << '_EOF_' > doMultiz #!/bin/csh -ef set path = (/cluster/bin/penn/var_multiz.2004.08.12 $path) set chr = $1:t:r if (-s $1 && -s $2) then set tmp = /scratch/$chr.tmp.maf var_multiz $1 $2 0 0 > $tmp maf_project $tmp dm2.$chr > /scratch/$chr.myp.maf rm $tmp else if (-s $1) then cp $1 /scratch/$chr.myp.maf else if (-s $2) then cp $2 /scratch/$chr.myp.maf endif if (-s /scratch/$chr.myp.maf && -s $3) then set tmp = /scratch/$chr.tmp.maf var_multiz /scratch/$chr.myp.maf $3 1 0 > $tmp maf_project $tmp dm2.$chr > $4 rm $tmp else if (-s /scratch/$chr.myp.maf) then cp /scratch/$chr.myp.maf $4 else if (-s $3) then cp $3 $4 endif rm /scratch/$chr.myp.maf '_EOF_' # << this line makes emacs coloring happy chmod a+x doMultiz cp /dev/null jobList foreach chr (`awk '{print $1;}' /cluster/data/dm2/chrom.sizes`) set f1 = /cluster/bluearc/multiz.flymo/my/$chr.maf set f2 = /cluster/bluearc/multiz.flymo/mp/$chr.maf set f3 = /cluster/bluearc/multiz.flymo/ma/$chr.maf echo "doMultiz $f1 $f2 $f3 mypa/$chr.maf" >> jobList end para create jobList para try, check, push, check #Completed: 13 of 13 jobs #Average job time: 246s 4.09m 0.07h 0.00d #Longest job: 811s 13.52m 0.23h 0.01d #Submission to last job: 811s 13.52m 0.23h 0.01d du -sh mypa #417M mypa # clean up bluearc rm -r /cluster/bluearc/multiz.flymo # setup external files for database reference ssh hgwdev mkdir /gbdb/dm2/mzDy1Dp2Ag1_phast ln -s /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/mypa/chr*.maf \ /gbdb/dm2/mzDy1Dp2Ag1_phast/ # load into database cd /tmp hgLoadMaf -warn dm2 mzDy1Dp2Ag1_phast cd /gbdb/dm2 mkdir d_pseudoobscura_mypa d_yakuba_mypa a_gambiae_mypa cd /tmp ln -s /cluster/data/dm2/bed/blastz.dp2.2004-09-10/mafNet/*.maf \ /gbdb/dm2/d_pseudoobscura_mypa hgLoadMaf -WARN dm2 d_pseudoobscura_mypa ln -s /cluster/data/dm2/bed/blastz.droYak1.2004-09-10/mafNet/*.maf \ /gbdb/dm2/d_yakuba_mypa hgLoadMaf -WARN dm2 d_yakuba_mypa ln -s /cluster/data/dm2/bed/blastz.anoGam1.2004-09-13/mafNet/*.maf \ /gbdb/dm2/a_gambiae_mypa hgLoadMaf -WARN dm2 a_gambiae_mypa # PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/22/04 angie) ssh kksilo # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces mkdir -p /cluster/bluearc/dm2/chrom cp -p /cluster/data/dm2/?{,?}/chr*.fa /cluster/bluearc/dm2/chrom/ ssh kki mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.split cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.split set WINDOWS = /cluster/bluearc/dm2/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/cluster/bluearc/dm2/chrom set WINDOWS=/cluster/bluearc/dm2/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $maf -i MAF 
-M ${FA_SRC}/$c.fa -O dm2,droYak1,dp2,anoGam1 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/mypa/*.maf) if (-s $file) then echo "doSplit.sh {check in line+ $file}" >> jobList endif end para create jobList para try, check, push, check... #Completed: 13 of 13 jobs #Average job time: 23s 0.38m 0.01h 0.00d #Longest job: 53s 0.88m 0.01h 0.00d #Submission to last job: 53s 0.88m 0.01h 0.00d cd .. # use the model previously estimated (see makeDm1.doc) as a starting model sed -e 's/dm1/dm2/g' /cluster/data/dm1/bed/phastCons4way/rev-dg.mod \ > starting-tree.mod # -- Because of the very long branch length to anoGam1 being pretty # much impossible to estimate from alignment data, edit that file to # reduce the anoGam1 branch length from 2.66 to 0.5. Otherwise # estimation process blows up. So our starting tree becomes #TREE: (((dm2:0.058,droYak1:0.074):0.133,dp2:0.200):0,anoGam1:0.5); # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.276938 0.223190 0.223142 0.276730 # add up the C and G: awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod #0.446 # Great, use 0.446 as the --gc parameter in phastCons below:. # Now set up cluster job to estimate model parameters. # Parameters will be estimated separately for each alignment fragment # then will be combined across fragments. # Use --gc from above, and --target-coverage computed as follows: # 1. Jim would like phastConsElements coverage to be 25% for fly. # 2. 83% of dm2 is covered by chainDroYak1Link, so use .25 / .83 = .30 # as an initial --target-coverage. # 3. If actual coverage is different from our target, come back to this # step, adjust --target-coverage and rerun up through phastConsElements. # -- Actually, Adam suggests starting with what proved to work for worm: # --target-coverage 0.4 and --expected-lengths 25 (not 12 as for mammal) # -- OK, that led to too-high coverage: # 51511266 bases of 131698467 (39.113%) in intersection # so next try 0.3, 30: # 49886398 bases of 131698467 (37.879%) in intersection # how about 0.3, 20? # 42067218 bases of 131698467 (31.942%) in intersection # 0.25, 20? # 38397215 bases of 131698467 (29.155%) in intersection # 0.25, 15? # 35039841 bases of 131698467 (26.606%) in intersection # -- that's close enough to the target. If the "bumpiness" of the # wiggle looks aesthetically pleasing then we're done. # -- OK, Adam would like to see more smoothing so that coding exons # stand out better, so try 0.20, 25: # 37574714 bases of 131698467 (28.531%) in intersection # More smooting would be nice for coding exons, so tried 0.20, 50, # but Adam didn't like that as much overall so stick with 0.20, 25. 
mkdir run.estimate cd run.estimate cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.435 --nrates 1,1 \ --no-post-probs --ignore-missing --expected-lengths 25 \ --target-coverage 0.20 --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/cluster/bluearc/dm2/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job ssh kk9 cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.estimate para create jobList para try, check, push, check, ... #Completed: 139 of 139 jobs #Average job time: 133s 2.22m 0.04h 0.00d #Longest job: 221s 3.68m 0.06h 0.00d #Submission to last job: 328s 5.47m 0.09h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kksilo cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod #ave.cons.mod:TREE: (((dm2:0.028707,droYak1:0.019740):0.039091,dp2:0.065825):0.086142,anoGam1:0.086142); #ave.noncons.mod:TREE: (((dm2:0.118202,droYak1:0.079147):0.162923,dp2:0.275544):0.360361,anoGam1:0.360361); # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. It's all downhill # from here. ssh kk9 mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.phast cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. 
'{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 25 --target-coverage 0.20 --quiet --seqname $chr \ --idpref $pref \ --viterbi /cluster/bluearc/dm2/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /cluster/bluearc/dm2/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /cluster/bluearc/dm2/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /cluster/bluearc/dm2/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/cluster/bluearc/dm2/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 139 of 139 jobs #Average job time: 23s 0.38m 0.01h 0.00d #Longest job: 49s 0.82m 0.01h 0.00d #Submission to last job: 52s 0.87m 0.01h 0.00d # back on kksilo: # combine predictions and transform scores to be in 0-1000 interval # do in a way that avoids limits on numbers of args cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /cluster/bluearc/dm2/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons featureBits dm2 all.bed #37574714 bases of 131698467 (28.531%) in intersection # OK, close enough. hgLoadBed dm2 phastConsElements all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.wib cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.wib rm -rf /cluster/bluearc/dm2/phastCons/wib mkdir -p /cluster/bluearc/dm2/phastCons/wib cat << 'EOF' > doWigAsciiToBinary #!/bin/csh -ef set chr = $1 zcat `ls -1 /cluster/bluearc/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigAsciiToBinary -chrom=$chr \ -wibFile=/cluster/bluearc/dm2/phastCons/wib/${chr}_phastCons stdin 'EOF' # << for emacs chmod a+x doWigAsciiToBinary rm -f jobList foreach chr (`ls -1 /cluster/bluearc/dm2/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigAsciiToBinary $chr >> jobList end para create jobList para try, check, push, check, ... #Completed: 13 of 13 jobs #Average job time: 14s 0.24m 0.00h 0.00d #Longest job: 36s 0.60m 0.01h 0.00d #Submission to last job: 36s 0.60m 0.01h 0.00d # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons rm -rf wib POSTPROBS rsync -av /cluster/bluearc/dm2/phastCons/wib . rsync -av /cluster/bluearc/dm2/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir -p /gbdb/dm2/wib/mzDy1Dp2Ag1_phast cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm2/wib/mzDy1Dp2Ag1_phast/ hgLoadWiggle dm2 mzDy1Dp2Ag1_phast_wig \ -pathPrefix=/gbdb/dm2/wib/mzDy1Dp2Ag1_phast wib/*.wig #NOT DONE -- still using dm1 for this: # make top-5000 list and launcher on Adam's home page: sort -k5,5nr raw.bed | head -5000 > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/dm-top5000-4way \ "top 5000 conserved elements (4way)" dm2 # and clean up bluearc. 
rm -r /cluster/bluearc/dm2/phastCons rm -r /cluster/bluearc/dm2/chrom # LIFTOVER BDGP 3.2 ANNOTATIONS FROM DM1 (TEMPORARY) (DONE 9/15/04 angie) ssh hgwdev mkdir /cluster/data/dm2/bed/bdgp3.2.liftOver cd /cluster/data/dm2/bed/bdgp3.2.liftOver hgsql dm1 -N -e 'select * from bdgpGene' > dm1.bdgpGene.tab hgsql dm1 -N -e 'select * from bdgpNonCoding' > dm1.bdgpNonCoding.tab ssh kksilo cd /cluster/data/dm2/bed/bdgp3.2.liftOver # lift files to try out: # blastz: # /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChain/dm1ToDm2.over.chain \ # blat on dm1 chroms vs. dm2 2Mb chunks: # /cluster/data/dm1/bed/blat.dm2.2004-09-16/dm1ToDm2.over.chain \ # blat on dm1 chroms vs. dm2 3kb chunks: # /cluster/data/dm1/bed/blat.dm2.2004-09-17/dm1ToDm2.over.chain \ liftOver -genePred dm1.bdgpGene.tab \ /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \ bdgpLiftGene.tab bdgpLiftGene.unmapped.tab liftOver -genePred dm1.bdgpNonCoding.tab \ /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \ bdgpLiftNonCoding.tab bdgpLiftNonCoding.unmapped.tab # Not perfect but not bad: wc -l bdgp*.tab # 18707 bdgpLiftGene.tab # 78 bdgpLiftGene.unmapped.tab # 2134 bdgpLiftNonCoding.tab # 62 bdgpLiftNonCoding.unmapped.tab ssh hgwdev cd /cluster/data/dm2/bed/bdgp3.2.liftOver ldHgGene -predTab dm2 bdgpLiftGene bdgpLiftGene.tab ldHgGene -predTab dm2 bdgpLiftNonCoding bdgpLiftNonCoding.tab # Copy over the gene info tables hgsql dm2 -e 'create table bdgpLiftGeneInfo (PRIMARY KEY(bdgpName(7)), INDEX(flyBaseId(11))) select * from dm1.bdgpGeneInfo' hgsql dm2 -e 'create table bdgpLiftNonCodingInfo (INDEX(bdgpName(16)), INDEX(flyBaseId(11))) select * from dm1.bdgpNonCodingInfo' # FLYBASE ANNOTATIONS (DONE 1/12/05 angie) # REPLACED 2/23/05 -- SEE "FLYBASE 4.1" BELOW ssh kksilo mkdir /cluster/data/dm2/bed/flybase cd /cluster/data/dm2/bed/flybase foreach c (2L 2R 3L 3R 4 X) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.0_20041119/gff/dmel-$c-r4.0.gff.gz end zcat *.gff.gz > flybase.gff3 # What data sources are represented in this file? grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c # excerpt (many other sources, including blastx:... , sim4:... and # tblastx:...; also various other types for source "."): 18747 . CDS 62629 . exon 13472 . gene 19301 . mRNA 70 . ncRNA 40 . pseudogene 96 . rRNA 28 . snRNA 28 . snoRNA 288 . tRNA 36921 . transcription_start_site 1571 . transposable_element 4680 . transposable_element_insertion_site # What keywords are defined in the 9th field? grep -v '^#' flybase.gff3 \ | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \ | sort | uniq -c # This incarnation of gff3 looks similar to the one encountered in # dp3, but still uses slightly different keywords and there are different # data sources of interest. 
So one-shot perl-script again: extractGenes.pl flybase.gff3 # Get predicted proteins (for main annotations only) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.0_20041119/fasta/dmel-all-translation-r4.0.fasta.gz zcat dmel-all-translation-r4.0.fasta.gz \ | perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' > flybasePep.fa ssh hgwdev cd /cluster/data/dm2/bed/flybase # Protein-coding genes: ldHgGene -gtf dm2 flyBaseGene flybase.gtf hgPepPred dm2 generic flyBasePep flybasePep.fa # Noncoding genes: hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed # Cross-referencing info for both coding and noncoding: hgsql dm2 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql hgsql dm2 -e 'load data local infile "flyBase2004Xref.tab" \ into table flyBase2004Xref' # add upstream* downloadable files (added 2/10/05) cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips foreach size (1000 2000 5000) echo upstream$size featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \ | gzip -c > upstream$size.fa.gz end md5sum upstream*.fa.gz >> md5sum.txt # BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 1/18/05 angie) # CHAIN PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie) # NET PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie) # GENERATE DP3 AXTNET AND MAF FOR MULTIZ (DONE 1/18/05 angie) # MAKE VSDP3 DOWNLOADABLES (DONE 1/18/05 angie) # -- Originally run with default blastz params; replaced by a run with # better params, below. # BLASTZ.V7 EVAL: BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 1/18/05 angie) ssh kk9 mkdir /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18 cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18 ln -s blastzv7.dp3.2005-01-18 /cluster/data/dm2/bed/blastzv7.dp3 # Note the full path for BLASTZ so we get blastz.v7 (which is not in # PSU's CVS, argh!): cat << '_EOF_' > DEF # D.melanogaster vs. D.pseudoobscura export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=/cluster/bin/penn/blastz.2004-12-27/blastz-source/blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib # unused: SEQ1_RMSK= SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - D. pseudoobscura SEQ2_DIR=/iscratch/i/dp3/nib # unused: SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm2/bed/blastzv7.dp3.2005-01-18 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # Create run dir, job list, and raw/ dir structure: mkdir run ~/kent/src/utils/blastz-make-joblist DEF \ /cluster/data/dm2/chrom.sizes /cluster/data/dp3/chrom.sizes \ > run/j csh -ef ./xdir.sh cd run # cluster run para create j para try, check, push, check, .... #Completed: 552 of 552 jobs #Average job time: 100s 1.66m 0.03h 0.00d #Longest job: 480s 8.00m 0.13h 0.01d #Submission to last job: 733s 12.22m 0.20h 0.01d # back on kksilo... mkdir /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1 cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1 /cluster/bin/scripts/blastz-make-out2lav ../DEF .. > j # small cluster run ssh kki cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1 para create j para try, check, push, check, .... 
#Completed: 23 of 23 jobs #Average job time: 8s 0.14m 0.00h 0.00d #Longest job: 29s 0.48m 0.01h 0.00d #Submission to last job: 60s 1.00m 0.02h 0.00d # Translate .lav to axt: ssh kksilo cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18 rm -r raw mkdir axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/data/dm2/nib /cluster/data/dp3/nib stdout \ | axtSort stdin ../../$out popd end # BLASTZ.V7 EVAL: CHAIN PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18 mkdir -p axtChain/run1 cd axtChain/run1 mkdir chain ls -1S /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/dm2/nib \ /iscratch/i/dp3/nib stdout \ | chainAntiRepeat /iscratch/i/dm2/nib /iscratch/i/dp3/nib \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 13 of 13 jobs #Average job time: 10s 0.17m 0.00h 0.00d #Longest job: 22s 0.37m 0.01h 0.00d #Submission to last job: 22s 0.37m 0.01h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm2 ${c}_chainBz7Dp3 $i end # Compare to regular (blastz.v6) chains: featureBits dm2 chainDp3Link #76557614 bases of 131698467 (58.131%) in intersection featureBits dm2 chainBz7Dp3Link #76340831 bases of 131698467 (57.966%) in intersection # Looks like v6 found more than v7 in general, but not too different. # Look at some cases where blastz.v7 found something but v6 didn't: featureBits dm2 chainBz7Dp3Link \!chainDp3Link -minSize=20 -bed=stdout #chrYh 212776 212811 chrYh.1 ... #chr3R 18027130 18027160 chr3R.59 ... #chr2h 300968 301032 chr2h.1 ... #34843 bases of 131698467 (0.026%) in intersection # BLATZ EVAL: D. PSEUDOOBSCURA (DP3) (IN PROGRESS 1/24/05 angie) ssh kk9 mkdir /cluster/data/dm2/bed/blatz.dp3.2004-01-19 cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19 mkdir chainRaw partitionSequence.pl 10000000 10000 /iscratch/i/dm2/nib \ /cluster/data/dm2/chrom.sizes > dm2.lst partitionSequence.pl 10000000 10000 /iscratch/i/dp3/nib \ /cluster/data/dp3/chrom.sizes > dp3.lst cat << '_EOF_' > gsub #LOOP blatz $(path1) $(path2) {check out line chainRaw/$(file1)_$(file2).chain} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 dm2.lst dp3.lst gsub spec para create spec para try, check, push, check, ... #Completed: 549 of 552 jobs #Crashed: 3 jobs #Average job time: 289s 4.82m 0.08h 0.00d #Longest job: 1664s 27.73m 0.46h 0.02d #Submission to last job: 6206s 103.43m 1.72h 0.07d # With default blatz params, 3 jobs crashed due to out-of-mem, # all others succeeded. 
Try again with thresholds more like # blastz human-mouse, i.e. -minGapless=3000? No, Jim says forget # the crashers for now (narrow down later) and just forge ahead # with the comparison to blastz (v6). ssh kksilo cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19 sed -e 's@/iscratch/i/d../nib/@@g; s/.nib//g;' chainRaw/* \ | chainMergeSort stdin > all.chain chainSplit chain all.chain # take a look at score distr's -- LOTS of low-scoring chains foreach f (chain/*.chain) grep ^chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Jim recommends -minScore=5000 for cross-species. Filter for now: mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm2 ${c}_chainBz1Dp3 $i end # Compare to regular (blastz.v6) chains: featureBits dm2 -enrichment flyBaseGene:CDS chainDp3Link #flyBaseGene:CDS 16.551%, chainDp3Link 58.131%, both 15.303%, cover 92.46%, enrich 1.59x featureBits dm2 -enrichment flyBaseGene:CDS chainBz1Dp3Link #flyBaseGene:CDS 16.551%, chainBz1Dp3Link 79.858%, both 16.179%, cover 97.76%, enrich 1.22x # Look at some examples where blatz covers some CDS that blastz doesn't: featureBits -chrom=chr3R dm2 flyBaseGene:CDS chainBz1Dp3Link \ \!chainDp3Link -minSize=30 -bed=/tmp/bz1Cds.bed # BLASTZ D.YAKUBA (DONE 1/26/05 angie) # CHAIN YAKUBA BLASTZ (DONE 1/26/05 angie) # NET YAKUBA BLASTZ (DONE 1/26/05 angie) # GENERATE DROYAK1 MAF FOR MULTIZ FROM NET (DONE 1/26/05 angie) # MAKE VSDROYAK1 DOWNLOADABLES (DONE 1/26/05 angie) # -- Originally run with default blastz params; replaced by a run with # better params, below. # BLASTZ D.ANANASSAE (DONE 1/28/05 angie) # CHAIN ANANASSAE BLASTZ (DONE 1/28/05 angie) # NET ANANASSAE BLASTZ (DONE 1/28/05 angie) # GENERATE DROANA1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie) # MAKE VSDROANA1 DOWNLOADABLES (DONE 1/28/05 angie) # -- Originally run with default blastz params; replaced by a run with # better params, below. # BLASTZ D.VIRILIS (DONE 1/28/05 angie) # CHAIN VIRILIS BLASTZ (DONE 1/28/05 angie) # NET VIRILIS BLASTZ (DONE 1/28/05 angie) # GENERATE DROVIR1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie) # MAKE VSDROVIR1 DOWNLOADABLES (DONE 1/28/05 angie) # -- Originally run with default blastz params; replaced by a run with # better params, below. # BLASTZ D.MOJAVENSIS (DONE 1/28/05 angie) # CHAIN MOJAVENSIS BLASTZ (DONE 1/28/05 angie) # NET MOJAVENSIS BLASTZ (DONE 1/28/05 angie) # GENERATE DROMOJ1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie) # MAKE VSDROMOJ1 DOWNLOADABLES (DONE 1/28/05 angie) # -- Originally run with default blastz params; replaced by a run with # better params, below. # BLASTZ A.MELLIFERA (DONE 1/28/05 angie) # CHAIN MELLIFERA BLASTZ (DONE 1/31/05 angie) # NET MELLIFERA BLASTZ (DONE 1/31/05 angie) # GENERATE APIMEL1 AXTNET AND MAF FOR MULTIZ (DONE 1/31/05 angie) # MAKE VSAPIMEL1 DOWNLOADABLES (DONE 1/31/05 angie) # -- Originally run on apiMel2 with "human-fugu" blastz params; outdated by # apiMel2 which was later run with better params, below. 
# BLASTZ A.MELLIFERA 2.0 (DONE 2/8/05 Andy) # CHAIN MELLIFERA 2.0 BLASTZ (DONE 2/9/05 Andy) # NET MELLIFERA 2.0 BLASTZ (DONE 2/9/05 Andy) # GENERATE APIMEL2 AXTNET AND MAF FOR MULTIZ (DONE 2/9/05 Andy) # MAKE VSAPIMEL2 DOWNLOADABLES (DONE 2/9/2005 Andy) # -- Originally run with "human-fugu" blastz params; replaced by a run with # better params, below. # MULTIZ.V10 8WAY (6 FLIES, MOSQUITO, HONEYBEE) (REDONE 3/7/05 angie # originally done 2/1/05 # REDONE 3/7/05 angie -- User found that yakuba was missing (due to # pairwise maf files having different .suffixes, and the script tolerating # missing files... yikes!!!). :( # REPLACED with 9WAY 5/23/05 # MULTIZ.V10 9WAY (7 FLIES, MOSQUITO, HONEYBEE) (DONE 5/24/05 angie) # Better, looser blastz params were used to redo alignments of all # other insects to dm2. Much better coverage on the pairwise inputs # now, so re-run multiz and phastCons to get a better Conservation # track... also using dp3 and apiMel2 instead of dp2 and apiMel1, # and adding droSim1. # Tree (9-way): # ((((((dm2 droYak1) droAna1) dp3) (droVir1 droMoj1)) anoGam1) apiMel2) ssh kkstore01 mkdir /cluster/data/dm2/bed/multiz9way.2005-05-23 ln -s /cluster/data/dm2/bed/multiz9way.2005-05-23 \ /cluster/data/dm2/bed/multiz9way cd /cluster/data/dm2/bed/multiz9way # Setup: Copy pairwise MAF to /santest/scratch: mkdir /santest/scratch/flyMultiz9way foreach db (droSim1 droYak1 droAna1 dp3 droVir1 droMoj1 anoGam1 apiMel2) cp -pR /cluster/data/dm2/bed/blastz.$db/mafNet \ /santest/scratch/flyMultiz9way/$db end ls -lLR /santest/scratch/flyMultiz9way # Make output dir: mkdir maf # Create script to run multiz.v10 in the right order: cat << '_EOF_' > doMultiz.csh #!/bin/csh -fe set chr = $1 set tmp = /scratch/flyMultiz9way.$chr mkdir $tmp set REF = dm2.$chr set SIM = /santest/scratch/flyMultiz9way/droSim1/$chr.maf.gz set YAK = /santest/scratch/flyMultiz9way/droYak1/$chr.maf.gz set ANA = /santest/scratch/flyMultiz9way/droAna1/$chr.maf.gz set PSE = /santest/scratch/flyMultiz9way/dp3/$chr.maf.gz set VIR = /santest/scratch/flyMultiz9way/droVir1/$chr.maf.gz set MOJ = /santest/scratch/flyMultiz9way/droMoj1/$chr.maf.gz set ANO = /santest/scratch/flyMultiz9way/anoGam1/$chr.maf.gz set API = /santest/scratch/flyMultiz9way/apiMel2/$chr.maf.gz set DEST = /cluster/data/dm2/bed/multiz9way/maf/$chr.maf set MZ10 = /cluster/bin/penn/multiz.v10 set PROJECT = /cluster/bin/penn/maf_project if ( -s $SIM && -s $YAK ) then echo "Aligning $SIM $YAK..." zcat $SIM > $tmp/$chr.Sim.maf zcat $YAK > $tmp/$chr.Yak.maf $MZ10 $tmp/$chr.Sim.maf $tmp/$chr.Yak.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYak.maf else if ( -s $SIM ) then touch missing.$chr.Sim zcat $SIM > $tmp/$chr.SimYak.maf else if ( -s $YAK ) then touch missing.$chr.Yak zcat $YAK > $tmp/$chr.SimYak.maf endif if ( -s $tmp/$chr.SimYak.maf && -s $ANA ) then echo "Aligning $tmp/$chr.SimYak.maf $ANA..." zcat $ANA > $tmp/$chr.Ana.maf $MZ10 $tmp/$chr.SimYak.maf $tmp/$chr.Ana.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAna.maf else if ( -s $tmp/$chr.SimYak.maf ) then touch missing.$chr.Ana cp $tmp/$chr.SimYak.maf $tmp/$chr.SimYakAna.maf else if ( -s $ANA ) then touch missing.$chr.SimYak zcat $ANA > $tmp/$chr.SimYakAna.maf endif if ( -s $PSE && -s $tmp/$chr.SimYakAna.maf ) then echo "Adding $PSE..." zcat $PSE > $tmp/$chr.Pse.maf $MZ10 $tmp/$chr.SimYakAna.maf $tmp/$chr.Pse.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." 
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPse.maf else if ( -s $PSE ) then touch missing.$chr.SimYakAna zcat $PSE > $tmp/$chr.SimYakAnaPse.maf else if ( -s $tmp/$chr.SimYakAna.maf ) then touch missing.$chr.Pse cp $tmp/$chr.SimYakAna.maf $tmp/$chr.SimYakAnaPse.maf endif # droVir1 and droMoj1 are a subtree -- run on just them, then fold in: if ( -s $VIR && -s $MOJ ) then echo "Aligning $VIR $MOJ..." zcat $VIR > $tmp/$chr.Vir.maf zcat $MOJ > $tmp/$chr.Moj.maf $MZ10 $tmp/$chr.Vir.maf $tmp/$chr.Moj.maf 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.VirMoj.maf else if ( -s $VIR ) then touch missing.$chr.Moj zcat $VIR > $tmp/$chr.VirMoj.maf else if ( -s $MOJ ) then touch missing.$chr.Vir zcat $MOJ > $tmp/$chr.VirMoj.maf endif if ( -s $tmp/$chr.VirMoj.maf && -s $tmp/$chr.SimYakAnaPse.maf ) then echo "Adding $tmp/$chr.VirMoj.maf..." $MZ10 $tmp/$chr.SimYakAnaPse.maf $tmp/$chr.VirMoj.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPseVirMoj.maf else if ( -s $tmp/$chr.VirMoj.maf ) then touch missing.$chr.SimYakAnaPse cp $tmp/$chr.VirMoj.maf $tmp/$chr.SimYakAnaPseVirMoj.maf else if ( -s $tmp/$chr.SimYakAnaPse.maf ) then touch missing.$chr.VirMoj cp $tmp/$chr.SimYakAnaPse.maf $tmp/$chr.SimYakAnaPseVirMoj.maf endif if ( -s $ANO && -s $tmp/$chr.SimYakAnaPseVirMoj.maf ) then echo "Adding $ANO..." zcat $ANO > $tmp/$chr.Ano.maf $MZ10 $tmp/$chr.SimYakAnaPseVirMoj.maf $tmp/$chr.Ano.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPseVirMojAno.maf else if ( -s $ANO ) then touch missing.$chr.SimYakAnaPseVirMoj zcat $ANO > $tmp/$chr.SimYakAnaPseVirMojAno.maf else if ( -s $tmp/$chr.SimYakAnaPseVirMoj.maf ) then touch missing.$chr.Ano cp $tmp/$chr.SimYakAnaPseVirMoj.maf $tmp/$chr.SimYakAnaPseVirMojAno.maf endif if ( -s $API && -s $tmp/$chr.SimYakAnaPseVirMojAno.maf ) then echo "Adding $API..." zcat $API > $tmp/$chr.Api.maf $MZ10 $tmp/$chr.SimYakAnaPseVirMojAno.maf $tmp/$chr.Api.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.final.maf cp $tmp/$chr.final.maf $DEST else if ( -s $API ) then touch missing.$chr.SimYakAnaPseVirMojAno zcat $API > $DEST else if ( -s $tmp/$chr.SimYakAnaPseVirMojAno.maf ) then touch missing.$chr.Api cp $tmp/$chr.SimYakAnaPseVirMojAno.maf $DEST endif rm $tmp/$chr.*.maf rmdir $tmp '_EOF_' # << keep emacs coloring happy chmod 755 doMultiz.csh awk '{print "./doMultiz.csh " $1;}' /cluster/data/dm2/chrom.sizes \ > jobs.lst # Run on small cluster ssh kki cd /cluster/data/dm2/bed/multiz9way para create jobs.lst para try, check, push, check, ... #Completed: 13 of 13 jobs #Average job time: 2000s 33.34m 0.56h 0.02d #Longest finished job: 5969s 99.48m 1.66h 0.07d #Submission to last job: 15605s 260.08m 4.33h 0.18d ls -1 missing* #ls: No match. 
# make /gbdb/ links to 9way maf files: ssh hgwdev mkdir -p /gbdb/dm2/multiz9way/maf/multiz9way ln -s /cluster/data/dm2/bed/multiz9way/maf/chr*.maf \ /gbdb/dm2/multiz9way/maf/multiz9way/ # load into database cd /tmp hgLoadMaf -warn dm2 multiz9way \ -pathPrefix=/gbdb/dm2/multiz9way/maf/multiz9way # load summary table to replace pairwise time cat /cluster/data/dm2/bed/multiz9way/maf/chr*.maf \ | nice hgLoadMafSummary dm2 multiz9waySummary stdin # put 9way MAF out for download ssh kkstore01 cd /cluster/data/dm2/bed/multiz9way mkdir mafDownload foreach f (maf/*.maf) nice gzip -c $f > mafDownload/$f:t.gz end cd mafDownload md5sum *.maf.gz > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm2/multiz9way ln -s /cluster/data/dm2/bed/multiz9way/mafDownload/{*.maf.gz,md5sum.txt} \ /usr/local/apache/htdocs/goldenPath/dm2/multiz9way # make a README.txt # Cleanup rm -rf /santest/scratch/flyMultiz9way/ # PHASTCONS 8WAY WITH METHODS FROM PAPER (DONE 3/8/05 angie) # originally done 2/2/05 # REDONE 3/8/05 angie -- after regenerating multiz alignments above. # Same procedure as for latest dm1 8way -- using the param estimation # methods described in Adam's Genome Research paper. # REPLACED with 9way 5/23/05 # PHASTCONS 9WAY WITH METHODS FROM PAPER (DONE 5/27/05 angie) ssh kkstore01 mkdir -p /santest/scratch/dm2/chrom cp -p /cluster/data/dm2/?{,?}/chr*.fa /santest/scratch/dm2/chrom/ # Split chrom fa into smaller windows for phastCons: ssh kki mkdir /cluster/data/dm2/bed/multiz9way/phastCons mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.split cd /cluster/data/dm2/bed/multiz9way/phastCons/run.split set WINDOWS = /santest/scratch/dm2/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/santest/scratch/dm2/chrom set WINDOWS=/santest/scratch/dm2/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O dm2,droSim1,droYak1,droAna1,dp3,droVir1,droMoj1,anoGam1,apiMel2 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm2/bed/multiz9way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para create jobList para try, check, push, check... #Completed: 13 of 13 jobs #Average job time: 44s 0.73m 0.01h 0.00d #Longest finished job: 117s 1.95m 0.03h 0.00d #Submission to last job: 117s 1.95m 0.03h 0.00d ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. 
# Use phyloFit to make an initial model: ssh kolossus cd /cluster/data/dm2/bed/multiz9way/phastCons /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \ --aggregate dm2,droSim1,droYak1,droAna1,dp3,droVir1,droMoj1,anoGam1,apiMel2 \ -i MAF -o SS > all.ss /cluster/bin/phast/phyloFit all.ss \ --tree "(((((((dm2,droSim1),droYak1),droAna1),dp3),(droVir1,droMoj1)),anoGam1),apiMel2)" \ -i SS --out-root starting-tree cat starting-tree.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #TRAINING_LNL: -393252389.008460 #BACKGROUND: 0.286960 0.213375 0.213302 0.286363 #RATE_MAT: # -0.918686 0.202545 0.451023 0.265118 # 0.272394 -1.108053 0.230681 0.604978 # 0.606771 0.230760 -1.109751 0.272220 # 0.265671 0.450782 0.202768 -0.919221 #TREE: (((((((dm2:0.031118,droSim1:0.035465):0.029649,droYak1:0.076216):0.089605,droAna1:0.210217):0.044178,dp3:0.215369):0.046188,(droVir1:0.128011,droMoj1:0.139931):0.195030):0.109752,anoGam1:0.342365):0.228550,apiMel2:0.228550); # also get GC content from model -- if similar enough, no need to extract it # separately above. awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod #0.426677 # OK, use .427 for --gc below. # Use the values of --target-coverage and --expected-lengths from the # last iteration of the 8way run, 0.573 and 36.8. # Multiply each subst rate on the TREE line by 3.5 which is roughly the # ratio of noncons to cons in # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \ > starting-tree.noncons.mod /cluster/bin/phast/consEntropy --NH 9.7834 0.573 36.8 \ starting-tree{,.noncons}.mod #( Solving for new omega: 36.800000 36.416935 36.415050 ) #Transition parameters: gamma=0.573000, omega=36.800000, mu=0.027174, nu=0.036465 #Relative entropy: H=1.931731 bits/site #Expected min. length: L_min=5.081081 sites #Expected max. length: L_max=5.398876 sites #Total entropy: L_min*H=9.815283 bits #Recommended expected length: omega=36.415050 sites (for L_min*H=9.783400) # OK, use --expected-lengths 36.4. ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.52 36.8 ave.{cons,noncons}.mod #Recommended expected length: omega=32.240755 sites (for L_min*H=9.783400) # OK, --expected-lengths 32.2 # THIRD ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.45 32.2 ave.{cons,noncons}.mod #Recommended expected length: omega=27.095993 sites (for L_min*H=9.783400) # ==> 27.1 # FOURTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.425 27.1 ave.{cons,noncons}.mod #Recommended expected length: omega=25.440532 sites (for L_min*H=9.783400) # ==> 25.4 # FIFTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.393 25.4 ave.{cons,noncons}.mod #Recommended expected length: omega=23.420199 sites (for L_min*H=9.783400) # ==> 23.4 # SIXTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.4 ave.{cons,noncons}.mod #Recommended expected length: omega=23.736472 sites (for L_min*H=9.783400) # ==> 23.7 # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. 
ssh kk9 mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate # FIRST ITERATION: Use ../starting-tree.mod: cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.427 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 36.4 --target-coverage 0.573 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.427 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 23.7 --target-coverage 0.398 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/santest/scratch/dm2/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job para create jobList para try, check, push, check, ...# #Completed: 139 of 139 jobs #Average job time: 4131s 68.85m 1.15h 0.05d #Longest finished job: 8787s 146.45m 2.44h 0.10d #Submission to last job: 8886s 148.10m 2.47h 0.10d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kolossus cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: (((((((dm2:0.019592,droSim1:0.027067):0.014875,droYak1:0.047389):0.041218,droAna1:0.116484):0.020166,dp3:0.125948):0.018479,(droVir1:0.073409,droMoj1:0.080389):0.106180):0.056105,anoGam1:0.235476):0.127958,apiMel2:0.127958); #ave.noncons.mod:TREE: (((((((dm2:0.059164,droSim1:0.077944):0.047044,droYak1:0.144159):0.135352,droAna1:0.373788):0.067147,dp3:0.405695):0.062350,(droVir1:0.234677,droMoj1:0.258111):0.355922):0.191534,anoGam1:0.773044):0.423894,apiMel2:0.423894); # SECOND ITERATION: #ave.cons.mod:TREE: (((((((dm2:0.019133,droSim1:0.026408):0.014525,droYak1:0.046282):0.040238,droAna1:0.113841):0.019632,dp3:0.123153):0.017914,(droVir1:0.071760,droMoj1:0.078554):0.104006):0.054630,anoGam1:0.230601):0.124952,apiMel2:0.124952); #ave.noncons.mod:TREE: (((((((dm2:0.058357,droSim1:0.076814):0.046405,droYak1:0.142227):0.133457,droAna1:0.368999):0.065975,dp3:0.400678):0.061001,(droVir1:0.231694,droMoj1:0.254750):0.352032):0.188199,anoGam1:0.764503):0.418063,apiMel2:0.418063); # THIRD ITERATION: #ave.cons.mod:TREE: (((((((dm2:0.018454,droSim1:0.025454):0.014002,droYak1:0.044650):0.038757,droAna1:0.109828):0.018811,dp3:0.118845):0.017034,(droVir1:0.069220,droMoj1:0.075740):0.100602):0.052271,anoGam1:0.222690):0.120267,apiMel2:0.120267); #ave.noncons.mod:TREE: (((((((dm2:0.057221,droSim1:0.075276):0.045493,droYak1:0.139568):0.130778,droAna1:0.362074):0.064250,dp3:0.393146):0.058938,(droVir1:0.227290,droMoj1:0.249820):0.346379):0.183000,anoGam1:0.750810):0.409058,apiMel2:0.409058); # FOURTH ITERATION: #ave.cons.mod:TREE: 
(((((((dm2:0.018200,droSim1:0.025097):0.013808,droYak1:0.044040):0.038208,droAna1:0.108335):0.018509,dp3:0.117244):0.016712,(droVir1:0.068273,droMoj1:0.074693):0.099332):0.051396,anoGam1:0.219717):0.118553,apiMel2:0.118553); #ave.noncons.mod:TREE: (((((((dm2:0.056840,droSim1:0.074761):0.045186,droYak1:0.138661):0.129841,droAna1:0.359716):0.063642,dp3:0.390643):0.058209,(droVir1:0.225781,droMoj1:0.248129):0.344408):0.181133,anoGam1:0.745987):0.406113,apiMel2:0.406113); # FIFTH ITERATION: #ave.cons.mod:TREE: (((((((dm2:0.017875,droSim1:0.024638):0.013564,droYak1:0.043255):0.037506,droAna1:0.106409):0.018128,dp3:0.115161):0.016304,(droVir1:0.067046,droMoj1:0.073340):0.097682):0.050261,anoGam1:0.215779):0.116296,apiMel2:0.116296); #ave.noncons.mod:TREE: (((((((dm2:0.056321,droSim1:0.074061):0.044780,droYak1:0.137445):0.128624,droAna1:0.356472):0.062873,dp3:0.387024):0.057266,(droVir1:0.223703,droMoj1:0.245817):0.341759):0.178650,anoGam1:0.739131):0.401733,apiMel2:0.401733); # SIXTH ITERATION: #ave.cons.mod:TREE: (((((((dm2:0.017924,droSim1:0.024709):0.013600,droYak1:0.043377):0.037613,droAna1:0.106707):0.018185,dp3:0.115488):0.016365,(droVir1:0.067237,droMoj1:0.073549):0.097937):0.050437,anoGam1:0.216405):0.116660,apiMel2:0.116660); #ave.noncons.mod:TREE: (((((((dm2:0.056410,droSim1:0.074182):0.044846,droYak1:0.137649):0.128814,droAna1:0.357039):0.062987,dp3:0.387708):0.057413,(droVir1:0.224066,droMoj1:0.246214):0.342186):0.179046,anoGam1:0.740329):0.402594,apiMel2:0.402594); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Check the total entropy figure to see if we're way off. # We probably don't need to do this, since it has always been very close # if not the same as what we used above, but it only takes a second. # FIRST ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.573 36.4 ave.{cons,noncons}.mod #Recommended expected length: omega=36.779882 sites (for L_min*H=9.783400) # OK, tweak --expected-lengths to 36.8 below (and for next iteration). # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.52 32.2 ave.{cons,noncons}.mod #Recommended expected length: omega=32.239216 sites (for L_min*H=9.783400) # ==> keep at 32.2. # THIRD ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.45 27.1 ave.{cons,noncons}.mod #Recommended expected length: omega=27.102528 sites (for L_min*H=9.783400) # ==> keep at 27.1. 
# FOURTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.425 25.4 ave.{cons,noncons}.mod #Recommended expected length: omega=25.445150 sites (for L_min*H=9.783400) # ==> keep at 25.4 # FIFTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.393 23.4 ave.{cons,noncons}.mod #Recommended expected length: omega=23.427025 sites (for L_min*H=9.783400) # ==> keep at 23.4 # SIXTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.7 ave.{cons,noncons}.mod #Recommended expected length: omega=23.735901 sites (for L_min*H=9.783400) # ==> keep at 23.7 # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. The we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh kk9 mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.phast cd /cluster/data/dm2/bed/multiz9way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 23.7 --target-coverage 0.398 \ --quiet --seqname $chr --idpref $pref \ --viterbi /santest/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /santest/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /santest/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /santest/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/santest/scratch/dm2/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 139 of 139 jobs #Average job time: 18s 0.31m 0.01h 0.00d #Longest finished job: 29s 0.48m 0.01h 0.00d #Submission to last job: 46s 0.77m 0.01h 0.00d # back on kolossus: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/dm2/bed/multiz9way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /santest/scratch/dm2/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. cd /cluster/data/dm2/bed/multiz9way/phastCons featureBits -enrichment dm2 flyBaseGene:cds all.bed # FIRST ITERATION: too high; decrease --target-coverage, re-estimate. #flyBaseGene:cds 16.567%, all.bed 42.995%, both 12.251%, cover 73.95%, enrich 1.72x # SECOND ITERATION: still too high. #flyBaseGene:cds 16.567%, all.bed 41.408%, both 12.030%, cover 72.62%, enrich 1.75x # THIRD ITERATION: still... #flyBaseGene:cds 16.567%, all.bed 39.143%, both 11.693%, cover 70.58%, enrich 1.80x # FOURTH ITERATION: dang, should have used a calculator. #flyBaseGene:cds 16.567%, all.bed 38.329%, both 11.563%, cover 69.80%, enrich 1.82x # FIFTH ITERATION: close... but overshot. Back off just a bit. #flyBaseGene:cds 16.567%, all.bed 37.277%, both 11.388%, cover 68.74%, enrich 1.84x # SIXTH ITERATION: done. #flyBaseGene:cds 16.567%, all.bed 37.445%, both 11.417%, cover 68.92%, enrich 1.84x # Having met the CDS coverage target, load up the results. 
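    # (Aside on the iterations above, before loading; purely a convenience.)
    # Extracting just the "cover" figure from the featureBits summary makes the
    # compare-to-68.9% step less error-prone.  |& is used in case the summary
    # goes to stderr (an assumption, not verified here):
    featureBits -enrichment dm2 flyBaseGene:cds all.bed \
      |& grep cover | sed -e 's/.*cover //; s/,.*//'
    #68.92%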
hgLoadBed dm2 phastConsElements9way all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.wib cd /cluster/data/dm2/bed/multiz9way/phastCons/run.wib rm -rf /santest/scratch/dm2/phastCons/wib mkdir -p /santest/scratch/dm2/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /santest/scratch/dm2/phastCons/wib zcat `ls -1 /santest/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /santest/scratch/dm2/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para create jobList para try, check, push, check, ... #Completed: 13 of 13 jobs #Average job time: 10s 0.16m 0.00h 0.00d #Longest finished job: 23s 0.38m 0.01h 0.00d #Submission to last job: 23s 0.38m 0.01h 0.00d # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from santest cd /cluster/data/dm2/bed/multiz9way/phastCons rm -rf wib POSTPROBS rsync -av /santest/scratch/dm2/phastCons/wib . rsync -av /santest/scratch/dm2/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/dm2/multiz9way/wib cd /cluster/data/dm2/bed/multiz9way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz9way/wib/ hgLoadWiggle dm2 phastCons9way \ -pathPrefix=/gbdb/dm2/multiz9way/wib wib/*.wig rm wiggle.tab # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/dm-top5000-9way \ "top 5000 conserved elements (9way)" dm2 # and clean up santest. rm -r /santest/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /santest/scratch/dm2/chrom # Offer raw scores for download since fly folks are likely to be interested: ssh kkstore01 cd /cluster/data/dm2/bed/multiz9way/phastCons/POSTPROBS mkdir ../postprobsDownload foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`) zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \ > ../postprobsDownload/$chr.pp.gz end cd ../postprobsDownload md5sum *.gz > md5sum.txt # Make a README.txt there too. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons9way cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons9way ln -s /cluster/data/dm2/bed/multiz9way/phastCons/postprobsDownload/* . # MAP FEEP PROBES (DONE 2/3/05 angie) ssh kolossus mkdir /cluster/data/dm2/bed/flyFeep cd /cluster/data/dm2/bed/flyFeep set feepDir = /projects/compbio/data/microarray/flyFEEP # Make per-chrom probe FASTA files: foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh) echo $c awk '$3 == "'$c'" {printf ">%s\n%s\n", $1, $7;}' \ $feepDir/probes3.1-Exons.1_apr11_exon \ $feepDir/probes3.1-Noncoding.1_apr11_tiling \ > chr${c}_exonNonExonProbes.fa awk '$1 != "ID" && $5 == "'$c'+" {printf ">%s\n%s\n", $1, $8;}' \ $feepDir/probes3.1-junctions.txt \ > chr${c}_junctionProbes.fa end # For exon/non-exon probes (36 consecutive bases), require 36 matching # bases (for some reason -minScore=36 doesn't filter anything out!). 
foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh) blat -noHead \ /cluster/data/dm2/nib/chr$c.nib chr${c}_exonNonExonProbes.fa stdout \ | grep ^36 | uniq \ > chr${c}_exonNonExonProbes.psl end # For splice junction probes (2 blocks of 18 consecutive bases), # use a small tileSize and -fine, and still require 36 matching bases: # This was slow, do as small cluster job next time, will still take hours. foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh) blat -noHead -tileSize=6 -fine \ /cluster/data/dm2/nib/chr$c.nib chr${c}_junctionProbes.fa stdout \ | grep ^36 | uniq \ > chr${c}_junctionProbes.psl end # Check on how many probes were aligned: foreach f (chr*exon*.fa chr*junc*.fa) set probes = `cat $f | wc -l` set probes = `expr $probes / 2` set aligned = `cat $f:r.psl | wc -l` set reallyAligned = `awk '{print $10;}' $f:r.psl | sort -u | wc -l ` set not = `expr $probes - $reallyAligned` echo $f:r"\t"$probes"\t"$aligned"\t"$reallyAligned"\t"\($not\) end # set count aligned reallyA not #chr2h_exonNonExonProbes 644 611 611 (33) #chr2L_exonNonExonProbes 26631 26749 26627 (4) #chr2R_exonNonExonProbes 26100 26111 26098 (2) #chr3h_exonNonExonProbes 833 833 833 (0) #chr3L_exonNonExonProbes 28756 28773 28749 (7) #chr3R_exonNonExonProbes 35060 35064 35055 (5) #chr4_exonNonExonProbes 1798 1797 1797 (1) #chrX_exonNonExonProbes 28912 28917 28912 (0) #chrXh_exonNonExonProbes 354 354 354 (0) #chrYh_exonNonExonProbes 97 97 97 (0) #chr2h_junctionProbes 3 3 3 (0) #chr2L_junctionProbes 5048 5015 5012 (36) #chr2R_junctionProbes 5697 5709 5652 (45) #chr3h_junctionProbes 1 1 1 (0) #chr3L_junctionProbes 6498 6437 6429 (69) #chr3R_junctionProbes 8747 8695 8684 (63) #chr4_junctionProbes 1056 1048 1048 (8) #chrXh_junctionProbes 50 50 50 (0) #chrX_junctionProbes 3688 3684 3680 (8) #chrYh_junctionProbes 0 0 0 (0) # Most probes successfully mapped, but a fair number of duplicates. # Dug into one of those (159937) and found that its probe sequence # halves were from a tandem repeat! (And didn't map to where they # were supposed to in dm1 either.) I suspect the probe sequences # might not be correct in the files I downloaded. Sent Chris Mason # an email about that. # If the probe dm1 locations are correct (I sure hope so), just not # some given probe sequences, I could extract dm1 sequence and map # that to dm2 instead of mapping the given probes as above. 
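    # (Hedged follow-up, not run as part of the build.)  To see which probes
    # are the multi-mappers discussed above, count alignments per probe name
    # (column 10 of headerless PSL) and list the ones that hit more than once:
    foreach f (chr*Probes.psl)
      echo $f
      awk '{print $10;}' $f | sort | uniq -c | awk '$1 > 1' | sort -rn | head
    end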
# Translate PSL to bed12 (this is too simple for PSL in general, but # since we require the entire query to map perfectly, it works here): awk '{printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t%d\t%s\t%s\n", \ $14, $16, $17, $10, $9, $16, $17, $18, $19, $20;}' \ chr*.psl \ > flyFeepProbesBed12.bed # Load probes ssh hgwdev cd /cluster/data/dm2/bed/flyFeep # Add scores to bed: hgMapMicroarray -bedIn flyFeep.bed hgFixed.flyFeepMedianRatio \ flyFeepProbesBed12.bed hgLoadBed dm2 flyFeep flyFeep.bed # (back on fileserver) Winnow probes by Bussemaker lab's lists: set dm1FeepDir = /cluster/data/dm1/bed/flyFeep $dm1FeepDir/winnowBed.pl $dm1FeepDir/probesExpressedAboveBackground.txt \ flyFeep.bed \ > flyFeepPEAB.bed $dm1FeepDir/winnowBed.pl $dm1FeepDir/probesAnovaDiffExpressed.txt \ flyFeep.bed \ > flyFeepAnova.bed # (back on hgwdev) Load winnowed sets: hgLoadBed dm2 flyFeepPEAB flyFeepPEAB.bed hgLoadBed dm2 flyFeepAnova flyFeepAnova.bed # FLYBASE 4.1 ANNOTATIONS (DONE 2/28/05 angie) ssh kksilo mkdir /cluster/data/dm2/bed/flybase4.1 cd /cluster/data/dm2/bed/flybase4.1 foreach c (2L 2R 3L 3R 4 X) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.1_20050207/gff/dmel-$c-r4.1.gff.gz end zcat *.gff.gz > flybase.gff3 # What data sources are represented in this file? grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c # excerpt (many other sources, including blastx:... , sim4:... and # tblastx:...; also various other types for source "."): 18941 . CDS 63033 . exon 14066 . gene 18941 . mRNA 144 . ncRNA 39 . pseudogene 96 . rRNA 29 . snRNA 28 . snoRNA 295 . tRNA 36921 . transcription_start_site 1571 . transposable_element 16404 . transposable_element_insertion_site # What keywords are defined in the 9th field? grep -v '^#' flybase.gff3 \ | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \ | sort | uniq -c # Once again, the previous round's parsing needed some updates to # handle the new data. extractGenes.pl flybase.gff3 #Oddball parentless ID=Sps2-exon-10336472..10336531 for exon # Get predicted proteins (for main annotations only) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.1_20050207/fasta/dmel-all-translation-r4.1.fasta.gz zcat dmel-all-translation-r4.1.fasta.gz \ | perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' > flybasePep.fa ssh hgwdev cd /cluster/data/dm2/bed/flybase4.1 # Protein-coding genes: ldHgGene -gtf dm2 flyBaseGene flybase.gtf hgPepPred dm2 generic flyBasePep flybasePep.fa # Fix typo caught by Galt & all.joiner: hgsql dm2 -e 'update flyBasePep set name = "CG6207-RC" where name = "GLCAT-P-PC"' # Noncoding genes: hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed # Cross-referencing info for both coding and noncoding: hgsql dm2 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql hgsql dm2 -e 'load data local infile "flyBase2004Xref.tab" \ into table flyBase2004Xref' # Some featureBits comparisons with refGene which is pretty much like # version 4.0 but apparently includes some noncoding genes too: featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout #chr3R 8221731 8222987 chr3R.1 #... 
#87577 bases of 131698467 (0.066%) in intersection # This shows only the dropped isoforms/duplicates (ignores noncoding): featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \ -bed=stdout #chr3R 15620310 15621493 chr3R.1 #chr2L 4698094 4700940 chr2L.1 #chrX 3638700 3639895 chrX.1 #49911 bases of 131698467 (0.038%) in intersection # add upstream* downloadable files cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips foreach size (1000 2000 5000) echo upstream$size nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \ | nice gzip -c > upstream$size.fa.gz end md5sum *.zip *.gz > md5sum.txt # FLYREG (DONE 2/23/05 angie) ssh kksilo mkdir /cluster/data/dm2/bed/flyreg cd /cluster/data/dm2/bed/flyreg wget http://www.gen.cam.ac.uk/casey/data/Bergman2004/v2.0/Footprint.GFF # This is not GTF; it should really be bed +. The contributor, # Casey Bergman, says that coords are 0-based half-open, so # translation will be even easier. grep -v '^#' Footprint.GFF \ | perl -wpe 'if (! s/^(\S+)\tBergman_data\tbinding_site\t(\d+)\t(\d+)\t.\t.\t.\tFactor \"([^\"]+)\"; Target \"([^\"]+)\"; PMID \"(\d+)\"; FPID \"(\d+)\"$/chr$1\t$2\t$3\t$4\t$5\t$6\t$7/) { die "Cant parse line $.:\n$_\n"; }' \ > flyreg2.bed ssh hgwdev hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/flyreg2.sql \ dm2 flyreg2 /cluster/data/dm2/bed/flyreg/flyreg2.bed # Add Dan Pollard's MEME motif data for the footprints: ssh kksilo cd /cluster/data/dm2/bed/flyreg wget http://rana.lbl.gov/~dan/matrices/bergman2004/footprint_matrices.txt /cluster/data/dm1/bed/flyreg/extractMatrices.pl \ footprint_matrices.txt > flyregMotif.tab ssh hgwdev cd /cluster/data/dm2/bed/flyreg sed -e 's/dnaMotif/flyregMotif/' $HOME/kent/src/hg/lib/dnaMotif.sql \ > flyregMotif.sql hgsql dm2 < flyregMotif.sql hgsql dm2 -e 'load data local infile "flyregMotif.tab" into table flyregMotif' # LIFTOVER DM1 FLYREG, COMPARE TO DM2 FLYREG (DONE 2/25/05 angie) # Casey Bergman is very eager to see alternate confirmation of his # dm1->dm2 mapping of flyreg, so run liftOver on it: ssh kksilo cd /cluster/data/dm2/bed/flyreg liftOver /cluster/data/dm1/bed/flyreg/flyreg.bed \ /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \ flyregLift.bed flyregLift.unmapped wc -l flyreg2.bed flyregLift.bed flyregLift.unmapped # 1362 flyreg2.bed # 1363 flyregLift.bed # 0 flyregLift.unmapped awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;}' flyreg2.bed | sort \ > /tmp/1 awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;}' flyregLift.bed | sort \ > /tmp/2 diff -c /tmp/[12] > newVsLift # Analyzed the differences: 2 duplicates in original (uniquified in v2), # 1 new item in v2, and 4 items incorrectly mapped by liftOver (fooled # by what looks like a slightly diverged local duplication) but probably # correctly mapped by Casey. # SELF ALIGNMENTS (DONE 2/24/05 angie) # Doing this largely as a test of doBlastzChainNet.pl... ssh kksilo mkdir /cluster/data/dm2/bed/blastz.dm2.2005-02-23 cd /cluster/data/dm2/bed/blastz.dm2.2005-02-23 cat << '_EOF_' > DEF # D. melanogaster vs. self # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. 
melanogaster SEQ2_DIR=/iscratch/i/dm2/nib SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dm2/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.dm2.2005-02-23 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2SelfOut >& do.log & tail -f do.log # The script died at the start of the cat step when kki couldn't see # /panasas (doh!). Asked cluster-admin to fix that. # When fixed, restarted with -continue, appending to do.log: rm -r run.cat doBlastzChainNet.pl -continue cat DEF \ -blastzOutRoot /panasas/store/dm2SelfOut >>& do.log & tail -f do.log rmdir /panasas/store/dm2SelfOut ln -s blastz.dm2.2005-02-23 /cluster/data/dm2/bed/blastz.dm2 # chainSelf is already in top-level trackDb.ra, so no need to add. # Add /usr/local/apache/htdocs/goldenPath/dm2/vsSelf/README.txt # BLASTZ/CHAIN/NET DROSIM1 (DONE 4/13/05 angie) # -- Originally run with default blastz params; replaced by a run with # better params, below. # BLASTZ/CHAIN/NET ANOGAM1 -- LOOSER PARAMS (DONE 5/19/05 angie) # Lower the L threshold from 6000 to 4000 (5-18) and then 3000 (5-19), # see if that increases coverage from this measurement from the previous # "human-fugu" params: # featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainAnoGam1Link #flyBaseGene 22.967%, chainAnoGam1Link 15.999%, both 12.567%, cover 54.72%, enrich 3.42x # 5/19, L=3000: too much... low-complexity-anchored alignments cluttering. # but interesting to know we can get this kind of coverage: #flyBaseGene 22.967%, chainAnoGam1Link 60.227%, both 18.542%, cover 80.73%, enrich 1.34x # So use results of this L=4000 run: ssh kkstore01 mkdir /cluster/data/dm2/bed/blastz.anoGam1.2005-05-18 cd /cluster/data/dm2/bed/blastz.anoGam1.2005-05-18 cat << '_EOF_' > DEF # D.melanogaster vs. A. gambiae BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - A. gambiae SEQ2_DIR=/iscratch/i/anoGam1/nib SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_LEN=/cluster/data/anoGam1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.anoGam1.2005-05-18 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/dm2anoGam1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainAnoGam1Link # 5/18, L=4000: #flyBaseGene 22.967%, chainAnoGam1Link 22.072%, both 14.077%, cover 61.29%, enrich 2.78x # Looks good in the browser -- top-level net chains extend farther # without too much extra crap like we saw with L=3000. rm /cluster/data/dm2/bed/blastz.anoGam1 ln -s blastz.anoGam1.2005-05-18 /cluster/data/dm2/bed/blastz.anoGam1 # BLASTZ/CHAIN/NET APIMEL2 -- LOOSER PARAMS (DONE 5/19/05 angie) # Since L=4000 noticeably helped mosquito, try it on the honeybee, # which had this coverage with L=6000: #flyBaseGene 22.967%, chainApiMel2Link 13.767%, both 7.802%, cover 33.97%, enrich 2.47x mkdir /cluster/data/dm2/bed/blastz.apiMel2.2005-05-19 cd /cluster/data/dm2/bed/blastz.apiMel2.2005-05-19 cat << '_EOF_' > DEF # D.melanogaster vs. A. mellifera BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - A. 
mellifer SEQ2_DIR=/iscratch/i/apiMel2/nib SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_LEN=/cluster/data/apiMel2/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.apiMel2.2005-05-19 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/dm2apiMel2 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainApiMel2Link #flyBaseGene 22.967%, chainApiMel2Link 36.701%, both 11.095%, cover 48.31%, enrich 1.32x # Wow, quite an increase in coverage! And drop in enrichment... some # of the new alignments are on the low-complexity side but not nearly # as bad as mosquito at L=3000. Never know what to expect from these # param experiments! :| Well, using L=4000 seems like a net gain so # go with it... rm /cluster/data/dm2/bed/blastz.apiMel2 ln -s blastz.apiMel2.2005-05-19 /cluster/data/dm2/bed/blastz.apiMel2 # BLASTZ/CHAIN/NET DROMOJ1 -- LOOSER PARAMS (DONE 5/19/05 angie) # Here's what L=6000 got us: #flyBaseGene 22.967%, chainDroMoj1Link 59.129%, both 20.159%, cover 87.77%, enrich 1.48x # OK, let's see what droMoj1 looks like at L=4000... mkdir /cluster/data/dm2/bed/blastz.droMoj1.2005-05-19 cd /cluster/data/dm2/bed/blastz.droMoj1.2005-05-19 cat << '_EOF_' > DEF # D. melanogaster vs. D. mojavensis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. mojavensis SEQ2_DIR=/iscratch/i/droMoj1/droMoj1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droMoj1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droMoj1.2005-05-19 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/dm2droMoj1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj1Link #flyBaseGene 22.967%, chainDroMoj1Link 65.901%, both 20.698%, cover 90.12%, enrich 1.37x # Wow, actually looks pretty good in the browser. Finds more non-top- # level alignments than L=6000, but not a huge number, and most look # reasonable in the detailed view. rm -f /cluster/data/dm2/bed/blastz.droMoj1 ln -s blastz.droMoj1.2005-05-19 /cluster/data/dm2/bed/blastz.droMoj1 # BLASTZ/CHAIN/NET DROVIR1 -- LOOSER PARAMS (DONE 5/20/05 angie) # coverage with default blastz params: #flyBaseGene 22.967%, chainDroVir1Link 47.957%, both 18.585%, cover 80.92%, enrich 1.69x # OK, let's see what droVir1 looks like at L=4000... mkdir /cluster/data/dm2/bed/blastz.droVir1.2005-05-19 cd /cluster/data/dm2/bed/blastz.droVir1.2005-05-19 cat << '_EOF_' > DEF # D. melanogaster vs. D. virilis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. 
virilis SEQ2_DIR=/santest/scratch/droVir1/droVir1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/santest/scratch/droVir1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droVir1.2005-05-19 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/dm2droVir1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir1Link #flyBaseGene 22.967%, chainDroVir1Link 66.657%, both 20.876%, cover 90.90%, enrich 1.36x rm -f /cluster/data/dm2/bed/blastz.droVir1 ln -s blastz.droVir1.2005-05-19 /cluster/data/dm2/bed/blastz.droVir1 # BLASTZ/CHAIN/NET DP3 -- LOOSER PARAMS (DONE 5/20/05 angie) # coverage with default blastz params was not too bad but let's see if # we can do better: #flyBaseGene 22.967%, chainDp3Link 67.232%, both 20.112%, cover 87.57%, enrich 1.30x mkdir /cluster/data/dm2/bed/blastz.dp3.2005-05-20 cd /cluster/data/dm2/bed/blastz.dp3.2005-05-20 cat << '_EOF_' > DEF # D. melanogaster vs. D. pseudoobscura BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. pseudoobscura SEQ2_DIR=/iscratch/i/dp3/nib SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dp3/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.dp3.2005-05-20 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/dm2dp3 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDp3Link #flyBaseGene 22.967%, chainDp3Link 74.901%, both 21.335%, cover 92.89%, enrich 1.24x # Again, reasonable extension of coverage without too much crap. rm -f /cluster/data/dm2/bed/blastz.dp3 ln -s blastz.dp3.2005-05-20 /cluster/data/dm2/bed/blastz.dp3 # BLASTZ/CHAIN/NET DROANA1 -- LOOSER PARAMS (DONE 5/21/05 angie) # coverage with default blastz params: #flyBaseGene 22.967%, chainDroAna1Link 73.508%, both 20.981%, cover 91.35%, enrich 1.24x # OK, let's see what droAna1 looks like at L=4000... mkdir /cluster/data/dm2/bed/blastz.droAna1.2005-05-20 cd /cluster/data/dm2/bed/blastz.droAna1.2005-05-20 cat << '_EOF_' > DEF # D. melanogaster vs. D. ananassae BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. ananassae SEQ2_DIR=/santest/scratch/droAna1/droAna1.2bit SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LEN=/santest/scratch/droAna1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droAna1.2005-05-20 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/dm2droAna1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna1Link #flyBaseGene 22.967%, chainDroAna1Link 80.519%, both 21.949%, cover 95.57%, enrich 1.19x rm -f /cluster/data/dm2/bed/blastz.droAna1 ln -s blastz.droAna1.2005-05-20 /cluster/data/dm2/bed/blastz.droAna1 # BLASTZ/CHAIN/NET DROYAK1 -- LOOSER PARAMS (DONE 5/21/05 angie) # coverage with default blastz params: #flyBaseGene 22.967%, chainDroYak1Link 90.253%, both 22.586%, cover 98.34%, enrich 1.09x # OK, let's see what droYak1 looks like at L=4000... mkdir /cluster/data/dm2/bed/blastz.droYak1.2005-05-20 cd /cluster/data/dm2/bed/blastz.droYak1.2005-05-20 cat << '_EOF_' > DEF # D. melanogaster vs. D. 
yakuba BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. yakuba SEQ2_DIR=/iscratch/i/droYak1/nib SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droYak1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droYak1.2005-05-20 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/dm2droYak1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroYak1Link #flyBaseGene 22.967%, chainDroYak1Link 91.896%, both 22.736%, cover 98.99%, enrich 1.08x rm -f /cluster/data/dm2/bed/blastz.droYak1 ln -s blastz.droYak1.2005-05-20 /cluster/data/dm2/bed/blastz.droYak1 # BLASTZ/CHAIN/NET DROSIM1 -- LOOSER PARAMS (DONE 5/21/05 angie) # coverage with default blastz params: #flyBaseGene 22.967%, chainDroSim1Link 90.656%, both 22.139%, cover 96.40%, enrich 1.06x # OK, let's see what droSim1 looks like at L=4000... mkdir /cluster/data/dm2/bed/blastz.droSim1.2005-05-21 cd /cluster/data/dm2/bed/blastz.droSim1.2005-05-21 cat << '_EOF_' > DEF # D. melanogaster vs. D. simulans BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. simulans SEQ2_DIR=/iscratch/i/droSim1/droSim1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droSim1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droSim1.2005-05-21 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/dm2droSim1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroSim1Link #flyBaseGene 22.967%, chainDroSim1Link 91.756%, both 22.420%, cover 97.62%, enrich 1.06x rm -f /cluster/data/dm2/bed/blastz.droSim1 ln -s blastz.droSim1.2005-05-21 /cluster/data/dm2/bed/blastz.droSim1 # AUGUSTUS GENE PREDICTIONS (Done 6/1/2005 Andy) ssh hgwdev cd /cluster/data/dm2/bed mkdir augustus cd augustus/ cat > cleanAugustus.awk << _EOF_ # Add the chrom name to the gene and transcript IDs. { ch = \$1 while (match(\$0, /"g[0-9\.]+"/)) { before = substr(\$0, 1, RSTART) name = substr(\$0, RSTART+1, RLENGTH) after = substr(\$0, RSTART + RLENGTH + 1) \$0 = before "" ch "." name "" after } print } _EOF_ wget http://augustus.gobics.de/predictions/dm2/dm2.allchr.augustus.gtf.gz zcat dm2.allchr.augustus.gtf.gz | awk -f cleanAugustus.awk | gzip > dm2.allchr.augustus.clean.gtf.gz ldHgGene -gtf dm2 augustus dm2.allchr.augustus.clean.gtf.gz rm dm2.allchr.augustus.gtf.gz # BLAT FlyBase predicted DM proteins against DM (DONE 2005-06-27 braney) ssh hgwdev cd /cluster/data/dm2/bed mkdir blat.dm2FB cd blat.dm2FB pepPredToFa dm2 flyBasePep dm2FB.fa hgPepPred dm2 generic blastFBPep01 dm2FB.fa ssh kk cd /cluster/data/dm2/bed/blat.dm2FB cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /iscratch/i/dm2/nib/*.nib > bug.lst mkdir fbfas cd fbfas faSplit sequence ../dm2FB.fa 5010 fb cd .. 
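    # (Optional check before building fb.lst below.)  Confirm faSplit didn't
    # drop any protein records in the split; the two counts should match:
    grep -c '^>' dm2FB.fa
    cat fbfas/*.fa | grep -c '^>'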
ls -1S fbfas/*.fa > fb.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 bug.lst fb.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../bug.lst`) mkdir `basename $i .nib` end cd .. para create blatSpec para push # Completed: 63193 of 63193 jobs # CPU time in finished jobs: 712543s 11875.72m 197.93h 8.25d 0.023 y # IO & Wait Time: 172878s 2881.30m 48.02h 2.00d 0.005 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest finished job: 2172s 36.20m 0.60h 0.03d # Submission to last job: 3192s 53.20m 0.89h 0.04d ssh eieio cd /cluster/data/dm2/bed/blat.dm2FB pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null sort -rn cov90.psl | pslUniq stdin dm2FB.psl pslxToFa dm2FB.psl dm2FB_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft fbName dm2 dm2FB.psl blastFBRef01 ssh hgwdev cd /cluster/data/dm2/bed/blat.dm2FB hgsql dm2 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastFBRef01" | hgsql dm2 echo "load data local infile 'blastFBRef01' into table blastFBRef01" | hgsql dm2 # MAKE Human Proteins track (DONE 06-30-05 braney) ssh kk mkdir -p /cluster/data/dm2/blastDb cd /cluster/data/dm2/blastDb for i in ../*/*/*_[0-9]*_[0-9]*.fa; do ln -s $i .; done for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/dm2/blastDb cp /cluster/data/dm2/blastDb/* /iscratch/i/dm2/blastDb iSync 2>&1 > sync.out mkdir -p /cluster/data/dm2/bed/tblastn.hg17KG cd /cluster/data/dm2/bed/tblastn.hg17KG ls -1S /iscratch/i/dm2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc target.lst | awk "{print \\\$1}"`\) # 37365/(150000/288) = 71.740800 split -l 72 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. 
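    # (Quick arithmetic check before building kg.lst below.)  37365 proteins
    # split 72 per chunk gives 519 chunk files, and 519 chunks x 288 database
    # pieces predicts the 149472 blast jobs reported below:
    ls -1 kgfa/*.fa | wc -l
    #519
    wc -l < target.lst
    #288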
ls -1S kgfa/*.fa > kg.lst rm -rf /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm2/jkStuff/subChr.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dm2/jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 target.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/dm2/bed/tblastn.hg17KG para create blastSpec para push # Completed: 149472 of 149472 jobs # CPU time in finished jobs: 5837692s 97294.87m 1621.58h 67.57d 0.185 y # IO & Wait Time: 1404308s 23405.13m 390.09h 16.25d 0.045 y # Average job time: 48s 0.81m 0.01h 0.00d # Longest finished job: 1095s 18.25m 0.30h 0.01d # Submission to last job: 50465s 841.08m 14.02h 0.58d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' ssh kki cd /cluster/data/dm2/bed/tblastn.hg17KG tcsh cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec para push # Completed: 519 of 519 jobs # CPU time in finished jobs: 3835s 63.91m 1.07h 0.04d 0.000 y # IO & Wait Time: 5794s 96.57m 1.61h 0.07d 0.000 y # Average job time: 19s 0.31m 0.01h 0.00d # Longest finished job: 114s 1.90m 0.03h 0.00d # Submission to last job: 824s 13.73m 0.23h 0.01d ssh kkstore01 cd /cluster/data/dm2/bed/tblastn.hg17KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/dm2/bed/tblastn.hg17KG/blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/dm2/bed/tblastn.hg17KG hgLoadPsl dm2 blastHg17KG.psl exit # back to kksilo rm -rf blastOut # End tblastn # BLASTZ/CHAIN/NET TRICAS1 (DONE 7/11/05 Andy) ssh kkstore01 mkdir /cluster/data/dm2/bed/blastz.triCas1.2005-07-10 cd /cluster/data/dm2/bed/blastz.triCas1.2005-07-10 cat << "_EOF_" > DEF # D. melanogaster vs. T. castaneum BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. 
virilis SEQ2_DIR=/panasas/store/triCas1/triCas1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/panasas/store/triCas1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.triCas1.2005-07-10 _EOF_ # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2triCas1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir1Link #flyBaseGene 22.967%, chainDroVir1Link 66.657%, both 20.876%, cover 90.90%, enrich 1.36x rm -f /cluster/data/dm2/bed/blastz.triCas1 ln -s blastz.triCas1.2005-05-19 /cluster/data/dm2/bed/blastz.triCas1 # GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie) ssh hgwdev mkdir /cluster/data/dm2/bed/geneid cd /cluster/data/dm2/bed/geneid foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/D.melanogaster/golden_path_200404/geneidv1.2/$chr.gtf end ldHgGene -gtf -genePredExt dm2 geneid *.gtf # BLASTZ/CHAIN/NET DROERE1 (DONE 8/8/05 angie) mkdir /cluster/data/dm2/bed/blastz.droEre1.2005-08-08 cd /cluster/data/dm2/bed/blastz.droEre1.2005-08-08 cat << '_EOF_' > DEF # D. melanogaster vs. D. erecta BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. erecta SEQ2_DIR=/iscratch/i/droEre1/droEre1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droEre1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droEre1.2005-08-08 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droEre1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroEre1Link #flyBaseGene 22.967%, chainDroEre1Link 91.103%, both 22.725%, cover 98.95%, enrich 1.09x rm -f /cluster/data/dm2/bed/blastz.droEre1 ln -s blastz.droEre1.2005-08-08 /cluster/data/dm2/bed/blastz.droEre1 # BLASTZ/CHAIN/NET DROANA2 (DONE 8/8/05 angie) mkdir /cluster/data/dm2/bed/blastz.droAna2.2005-08-08 cd /cluster/data/dm2/bed/blastz.droAna2.2005-08-08 cat << '_EOF_' > DEF # D. melanogaster vs. D. ananassae BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. ananassae SEQ2_DIR=/iscratch/i/droAna2/droAna2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droAna2/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droAna2.2005-08-08 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droAna2 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna2Link #flyBaseGene 22.967%, chainDroAna2Link 81.385%, both 22.050%, cover 96.01%, enrich 1.18x rm -f /cluster/data/dm2/bed/blastz.droAna2 ln -s blastz.droAna2.2005-08-08 /cluster/data/dm2/bed/blastz.droAna2 # BLASTZ/CHAIN/NET DROMOJ2 (DONE 8/9/05 angie) mkdir /cluster/data/dm2/bed/blastz.droMoj2.2005-08-08 cd /cluster/data/dm2/bed/blastz.droMoj2.2005-08-08 cat << '_EOF_' > DEF # D. melanogaster vs. D. mojavensis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. 
mojavensis SEQ2_DIR=/iscratch/i/droMoj2/droMoj2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droMoj2/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droMoj2.2005-08-08 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droMoj2 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj2Link #flyBaseGene 22.967%, chainDroMoj2Link 66.524%, both 20.799%, cover 90.56%, enrich 1.36x rm -f /cluster/data/dm2/bed/blastz.droMoj2 ln -s blastz.droMoj2.2005-08-08 /cluster/data/dm2/bed/blastz.droMoj2 # BLASTZ/CHAIN/NET DROGRI1 (DONE 8/9/05 angie) mkdir /cluster/data/dm2/bed/blastz.droGri1.2005-08-08 cd /cluster/data/dm2/bed/blastz.droGri1.2005-08-08 cat << '_EOF_' > DEF # D. melanogaster vs. D. grimshawi BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. grimshawi SEQ2_DIR=/iscratch/i/droGri1/droGri1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droGri1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droGri1.2005-08-08 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droGri1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroGri1Link #flyBaseGene 22.967%, chainDroGri1Link 68.520%, both 20.822%, cover 90.66%, enrich 1.32x rm -f /cluster/data/dm2/bed/blastz.droGri1 ln -s blastz.droGri1.2005-08-08 /cluster/data/dm2/bed/blastz.droGri1 # BLASTZ/CHAIN/NET DROVIR2 (DONE 8/12/05 angie) mkdir /cluster/data/dm2/bed/blastz.droVir2.2005-08-11 cd /cluster/data/dm2/bed/blastz.droVir2.2005-08-11 cat << '_EOF_' > DEF # D. melanogaster vs. D. virilis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. virilis SEQ2_DIR=/iscratch/i/droVir2/droVir2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droVir2/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droVir2.2005-08-11 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droVir2 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir2Link #flyBaseGene 22.967%, chainDroVir2Link 67.025%, both 20.901%, cover 91.01%, enrich 1.36x rm -f /cluster/data/dm2/bed/blastz.droVir2 ln -s blastz.droVir2.2005-08-11 /cluster/data/dm2/bed/blastz.droVir2 # RE-RUN NETTOAXT, AXTTOMAF IN PREP FOR MULTIZ (DONE 8/19/05 angie) # Kate checked in a fix to netToAxt, to prevent overlapping blocks # which were causing problems for (display of) multiple alignments. # Before running multiz, regenerate the axtNet/ and mafNet/ pairwise # inputs (axtNet downloads should probably be repushed after this). # The /cluster/data/dm2/bed/blastz.*/axtChain/netChains.csh files # contain the original axtNet and mafNet commands, adapted here: ssh kolossus cd /tmp cat << '_EOF_' > /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh #!/bin/csh -efx foreach oDb (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \ anoGam1 apiMel2 triCas1) set oSeq = /iscratch/i/$oDb/$oDb.2bit if (! 
-e $oSeq) then set oSeq = /iscratch/i/$oDb/nib endif cd /cluster/data/dm2/bed/blastz.$oDb/axtChain netSplit dm2.$oDb.net.gz net chainSplit chain dm2.$oDb.all.chain.gz cd .. mv axtNet axtNet.bak mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /iscratch/i/dm2/nib $oSeq stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.dm2.$oDb.net.axt.gz end mv mafNet mafNet.bak mkdir mafNet foreach f (axtNet/*.dm2.$oDb.net.axt.gz) axtToMaf -tPrefix=dm2. -qPrefix=$oDb. $f \ /cluster/data/dm2/chrom.sizes /cluster/data/$oDb/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end end '_EOF_' # << for emacs chmod 775 /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh >& re.log echo $status #0 # Made sure everything ran smoothly. Spot-checked some .bak's vs. # new to make sure they're of comparable size -- newer can be slightly # smaller due to lack of dup's, or slightly larger due to more headers. # Clean up: foreach oDb (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \ anoGam1 apiMel2 triCas1) rm -r /cluster/data/dm2/bed/blastz.$oDb/axtChain/{chain,net} rm -r /cluster/data/dm2/bed/blastz.$oDb/{axt,maf}Net.bak end # MULTIZ.V10 12WAY (7 FLIES, MOSQUITO, HONEYBEE) (DONE 8/20/05 angie) # Since 9way, adding D. erecta, D. grimshawi, T. castaneum; # dro{Ana,Moj,Vir}1 --> dro{Ana,Moj,Vir}2. # Tree (12-way): # ((((((((dm2 droSim1) (droYak1 droEre1)) droAna2) dp3) ((droVir2 droMoj2) droGri1)) anoGam1) apiMel2) triCas1) ssh kkstore01 mkdir /cluster/data/dm2/bed/multiz12way.2005-08-20 ln -s /cluster/data/dm2/bed/multiz12way.2005-08-20 \ /cluster/data/dm2/bed/multiz12way cd /cluster/data/dm2/bed/multiz12way # Setup: Copy pairwise MAF to /san/sanvol1/scratch: mkdir /san/sanvol1/scratch/flyMultiz12way foreach db (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \ anoGam1 apiMel2 triCas1) echo $db cp -pR /cluster/data/dm2/bed/blastz.$db/mafNet \ /san/sanvol1/scratch/flyMultiz12way/$db end ls -lLR /san/sanvol1/scratch/flyMultiz12way # Make output dir: mkdir maf # Create scripts to run multiz.v10 in the right order: # (Thanks to Andy for the idea to use a subroutine for individual # multiz invocations) cat << '_EOF_' > /cluster/bin/scripts/runMultizV10.csh #!/bin/csh -fex # Run multiz on two inputs. 
set closerMaf = $1 set fartherMaf = $2 set v = $3 set ref = $4 set tmpDir = $5 set outMaf = $6 if ($outMaf == "") then echo "usage: $0 closerMaf fartherMaf v ref tmpDir outMaf" exit 1 endif set MZ10 = /cluster/bin/penn/multiz.v10 set PROJECT = /cluster/bin/penn/maf_project if (-e $closerMaf && $closerMaf:e == "gz") then gunzip -c $closerMaf > $tmpDir/closer.maf set closerMaf = $tmpDir/closer.maf endif if (-e $fartherMaf && $fartherMaf:e == "gz") then gunzip -c $fartherMaf > $tmpDir/farther.maf set fartherMaf = $tmpDir/farther.maf endif set closerSubbed = `echo $closerMaf | sed -e 's@/@_@g;'` set fartherSubbed = `echo $fartherMaf | sed -e 's@/@_@g;'` if ( -s $closerMaf && -s $fartherMaf ) then $MZ10 $fartherMaf $closerMaf $v > $tmpDir/tmp.maf $PROJECT $tmpDir/tmp.maf $ref > $outMaf else if ( -s $closerMaf ) then touch missing.$fartherSubbed cp $closerMaf $outMaf else if ( -s $fartherMaf ) then touch missing.$closerSubbed cp $fartherMaf $outMaf else touch missing.$closerSubbed touch missing.$fartherSubbed endif '_EOF_' # << for emacs cat << '_EOF_' > doMultizAll.csh #!/bin/csh -fex set chr = $1 set tmpDir = /scratch/flyMultiz12way.$chr mkdir $tmpDir set mafScratch = /san/sanvol1/scratch/flyMultiz12way # Really should write a perl script to take a tree like this and generate # commands like the ones below: # ((((((((dm2 droSim1) (droYak1 droEre1)) droAna2) dp3) ((droVir2 droMoj2) droGri1)) anoGam1) apiMel2) triCas1) /cluster/bin/scripts/runMultizV10.csh \ $mafScratch/droYak1/$chr.maf.gz \ $mafScratch/droEre1/$chr.maf.gz \ 0 dm2.$chr $tmpDir $tmpDir/$chr.YakEre.maf /cluster/bin/scripts/runMultizV10.csh \ $mafScratch/droSim1/$chr.maf.gz \ $tmpDir/$chr.YakEre.maf \ 1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEre.maf /cluster/bin/scripts/runMultizV10.csh \ $tmpDir/$chr.SimYakEre.maf \ $mafScratch/droAna2/$chr.maf.gz \ 1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAna.maf /cluster/bin/scripts/runMultizV10.csh \ $tmpDir/$chr.SimYakEreAna.maf \ $mafScratch/dp3/$chr.maf.gz \ 1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPse.maf /cluster/bin/scripts/runMultizV10.csh \ $mafScratch/droVir2/$chr.maf.gz \ $mafScratch/droMoj2/$chr.maf.gz \ 0 dm2.$chr $tmpDir $tmpDir/$chr.VirMoj.maf /cluster/bin/scripts/runMultizV10.csh \ $tmpDir/$chr.SimYakEreAnaPse.maf \ $tmpDir/$chr.VirMoj.maf \ 1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMoj.maf /cluster/bin/scripts/runMultizV10.csh \ $tmpDir/$chr.SimYakEreAnaPseVirMoj.maf \ $mafScratch/droGri1/$chr.maf.gz \ 1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGri.maf /cluster/bin/scripts/runMultizV10.csh \ $tmpDir/$chr.SimYakEreAnaPseVirMojGri.maf \ $mafScratch/anoGam1/$chr.maf.gz \ 1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGriAno.maf /cluster/bin/scripts/runMultizV10.csh \ $tmpDir/$chr.SimYakEreAnaPseVirMojGriAno.maf \ $mafScratch/apiMel2/$chr.maf.gz \ 1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGriAnoApi.maf /cluster/bin/scripts/runMultizV10.csh \ $tmpDir/$chr.SimYakEreAnaPseVirMojGriAnoApi.maf \ $mafScratch/triCas1/$chr.maf.gz \ 1 dm2.$chr $tmpDir maf/$chr.maf rm -f $tmpDir/*.maf rmdir $tmpDir '_EOF_' # << for emacs chmod 775 /cluster/bin/scripts/runMultizV10.csh chmod 775 doMultizAll.csh awk '{print "./doMultizAll.csh " $1;}' /cluster/data/dm2/chrom.sizes \ > jobs.lst # Run on small cluster ssh kki cd /cluster/data/dm2/bed/multiz12way para make jobs.lst para time #Completed: 13 of 13 jobs #Average job time: 124s 2.07m 0.03h 0.00d #Longest finished job: 538s 8.97m 0.15h 0.01d #Submission to last job: 15659s 260.98m 4.35h 0.18d ls -1 missing* #ls: No 
match. # make /gbdb/ links to 12way maf files: ssh hgwdev mkdir -p /gbdb/dm2/multiz12way/maf/multiz12way ln -s /cluster/data/dm2/bed/multiz12way/maf/chr*.maf \ /gbdb/dm2/multiz12way/maf/multiz12way/ # load into database cd /tmp hgLoadMaf -warn dm2 multiz12way \ -pathPrefix=/gbdb/dm2/multiz12way/maf/multiz12way # load summary table to replace pairwise cat /cluster/data/dm2/bed/multiz12way/maf/chr*.maf \ | nice hgLoadMafSummary dm2 multiz12waySummary stdin # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql dm2 -e "alter table multiz9waySummary drop index chrom_2" hgsql dm2 -e "alter table multiz9waySummary drop index chrom_3" # put 12way MAF out for download ssh kolossus cd /cluster/data/dm2/bed/multiz12way mkdir mafDownload foreach f (maf/*.maf) nice gzip -c $f > mafDownload/$f:t.gz end cd mafDownload md5sum *.maf.gz > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm2/multiz12way ln -s /cluster/data/dm2/bed/multiz12way/mafDownload/{*.maf.gz,md5sum.txt} \ /usr/local/apache/htdocs/goldenPath/dm2/multiz12waya # make a README.txt # Cleanup rm -rf /san/sanvol1/scratch/flyMultiz12way/ # PHASTCONS 12WAY WITH METHODS FROM PAPER (DONE 9/20/05 angie) # ((((((((dm2,droSim1),(droYak1,droEre1)),droAna2),dp3),((droVir2,droMoj2),droGri1)),anoGam1),apiMel2),triCas1) ssh kkstore01 mkdir -p /san/sanvol1/scratch/dm2/chrom cp -p /cluster/data/dm2/?{,?}/chr*.fa /san/sanvol1/scratch/dm2/chrom/ # Split chrom fa into smaller windows for phastCons: ssh pk mkdir /cluster/data/dm2/bed/multiz12way/phastCons mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.split cd /cluster/data/dm2/bed/multiz12way/phastCons/run.split set WINDOWS = /san/sanvol1/scratch/dm2/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/san/sanvol1/scratch/dm2/chrom set WINDOWS=/san/sanvol1/scratch/dm2/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O dm2,droSim1,droYak1,droEre1,droAna2,dp3,droVir2,droMoj2,droGri1,anoGam1,apiMel2,triCas1 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm2/bed/multiz12way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para make jobList para time #Completed: 13 of 13 jobs #Average job time: 81s 1.35m 0.02h 0.00d #Longest finished job: 219s 3.65m 0.06h 0.00d #Submission to last job: 219s 3.65m 0.06h 0.00d ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. 
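    # (Optional sanity check before the msa_view aggregation below; it
    # complements the missing.* check after the multiz run.)  Count alignment
    # rows per assembly in the combined mafs; a zero for any species would mean
    # a pairwise input got dropped somewhere.  Assumes the usual "s <db>.<chrom>"
    # layout of MAF s-lines:
    foreach db (dm2 droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
        anoGam1 apiMel2 triCas1)
      echo $db
      cat /cluster/data/dm2/bed/multiz12way/maf/chr*.maf | grep -c "^s $db\."
    end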
# Use phyloFit to make an initial model: ssh kolossus cd /cluster/data/dm2/bed/multiz12way/phastCons /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \ --aggregate dm2,droSim1,droYak1,droEre1,droAna2,dp3,droVir2,droMoj2,droGri1,anoGam1,apiMel2,triCas1 \ -i MAF -o SS > all.ss /cluster/bin/phast/phyloFit all.ss \ --tree "((((((((dm2,droSim1),(droYak1,droEre1)),droAna2),dp3),((droVir2,droMoj2),droGri1)),anoGam1),apiMel2),triCas1)" \ -i SS --out-root starting-tree cat starting-tree.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #TRAINING_LNL: -496988087.624743 #BACKGROUND: 0.288383 0.211930 0.211868 0.287819 #RATE_MAT: # -0.922499 0.207559 0.437636 0.277304 # 0.282436 -1.104124 0.227654 0.594033 # 0.595687 0.227721 -1.105784 0.282377 # 0.277848 0.437405 0.207862 -0.923114 #TREE: ((((((((dm2:0.028572,droSim1:0.042955):0.010136,(droYak1:0.058491,droEre1:0.056179):0.048245):0.060386,droAna2:0.259454):0.024698,dp3:0.263678):0.012004,((droVir2:0.126666,droMoj2:0.147640):0.138076,droGri1:0.248877):0.126753):0.094211,anoGam1:0.382011):0.052522,apiMel2:0.456711):0.211776,triCas1:0.211776); # also get GC content from model -- if similar enough, no need to extract it # separately above. awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod #0.423798 # OK, use .424 for --gc below. # Use the values of --target-coverage and --expected-lengths from the # last iteration of the 9way run, 0.398 and 23.7. # Multiply each subst rate on the TREE line by 3.5 which is roughly the # ratio of noncons to cons in # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \ > starting-tree.noncons.mod /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.7 \ starting-tree{,.noncons}.mod #( Solving for new omega: 23.700000 24.343724 24.335125 ) #Transition parameters: gamma=0.398000, omega=23.700000, mu=0.042194, nu=0.027896 #Relative entropy: H=2.651656 bits/site #Expected min. length: L_min=3.660302 sites #Expected max. length: L_max=3.808284 sites #Phylogenetic information threshold: PIT=L_min*H=9.705862 bits #Recommended expected length: omega=24.335125 sites (for L_min*H=9.783400) # OK, use --expected-lengths 24.3. ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.393 24.1 ave.{cons,noncons}.mod #Recommended expected length: omega=23.801397 sites (for L_min*H=9.783400) # OK, --expected-lengths 23.8 # THIRD ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.45 32.2 ave.{cons,noncons}.mod #Recommended expected length: omega=27.095993 sites (for L_min*H=9.783400) # ==> 27.1 # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. 
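    # (Hypothetical convenience, not used for this build.)  The two doEstimate.sh
    # variants below differ only in the input model and the --target-coverage /
    # --expected-lengths pair; a wrapper along these lines would avoid editing
    # the script between iterations.  The script name and argument order are
    # assumptions; the phastCons options are exactly those used below.
    cat << '_EOF_' > doEstimateAuto.sh
#!/bin/csh -ef
# args: in.ss.gz logFile outRoot targetCoverage expectedLengths
set mod = ave.noncons.mod
if (! -e $mod) set mod = ../starting-tree.mod
zcat $1 \
| /cluster/bin/phast/phastCons - $mod --gc 0.424 --nrates 1,1 \
  --no-post-probs --ignore-missing \
  --expected-lengths $5 --target-coverage $4 \
  --quiet --log $2 --estimate-trees $3
'_EOF_'
    # << for emacs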
ssh pk mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate # FIRST ITERATION: Use ../starting-tree.mod: cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.424 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 24.3 --target-coverage 0.398 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.424 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 23.8 --target-coverage 0.393 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end para make jobList para time #Completed: 139 of 139 jobs #Average job time: 6022s 100.37m 1.67h 0.07d #Longest finished job: 14397s 239.95m 4.00h 0.17d #Submission to last job: 14419s 240.32m 4.01h 0.17d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kolossus cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: ((((((((dm2:0.015283,droSim1:0.028475):0.004064,(droYak1:0.031904,droEre1:0.033703):0.024654):0.021822,droAna2:0.128599):0.008507,dp3:0.135654):0.003923,((droVir2:0.064832,droMoj2:0.074814):0.064856,droGri1:0.130122):0.056057):0.035045,anoGam1:0.225562):0.032118,apiMel2:0.207067):0.100280,triCas1:0.100280); #ave.noncons.mod:TREE: ((((((((dm2:0.049188,droSim1:0.086476):0.013855,(droYak1:0.101928,droEre1:0.105461):0.080386):0.076467,droAna2:0.435801):0.030536,dp3:0.463395):0.014225,((droVir2:0.218767,droMoj2:0.254568):0.225537,droGri1:0.443544):0.200405):0.126301,anoGam1:0.786660):0.117326,apiMel2:0.720501):0.348402,triCas1:0.348402); # SECOND ITERATION: #ave.cons.mod:TREE: ((((((((dm2:0.015261,droSim1:0.028419):0.004049,(droYak1:0.031866,droEre1:0.033655):0.024630):0.021816,droAna2:0.128628):0.008503,dp3:0.135728):0.003923,((droVir2:0.064840,droMoj2:0.074832):0.064900,droGri1:0.130188):0.056115):0.035117,anoGam1:0.225895):0.032170,apiMel2:0.207375):0.100397,triCas1:0.100397); #ave.noncons.mod:TREE: ((((((((dm2:0.049200,droSim1:0.086486):0.013823,(droYak1:0.101982,droEre1:0.105514):0.080420):0.076507,droAna2:0.436376):0.030537,dp3:0.464200):0.014233,((droVir2:0.219076,droMoj2:0.254936):0.225862,droGri1:0.444226):0.200721):0.126658,anoGam1:0.788468):0.117561,apiMel2:0.722456):0.349173,triCas1:0.349173); # THIRD ITERATION: cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. 
Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # FIRST ITERATION ONLY: # Check the total entropy figure to see if we're way off. # This takes an hour for 12way (exponential in #species) and has never # produced a different answer from the input after the first iteration, # so do this for the first iteration only: /cluster/bin/phast/consEntropy --NH 9.7834 0.398 24.1 ave.{cons,noncons}.mod #Recommended expected length: omega=24.095425 sites (for L_min*H=9.783400) # ==> use 24.1 instead of 24.3 below. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. The we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh pk mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.phast cd /cluster/data/dm2/bed/multiz12way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 23.8 --target-coverage 0.393 \ --quiet --seqname $chr --idpref $pref \ --viterbi /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para make jobList para time #Completed: 139 of 139 jobs #Average job time: 15s 0.25m 0.00h 0.00d #Longest finished job: 19s 0.32m 0.01h 0.00d #Submission to last job: 45s 0.75m 0.01h 0.00d # back on kolossus: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/dm2/bed/multiz12way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. cd /cluster/data/dm2/bed/multiz12way/phastCons featureBits -enrichment dm2 flyBaseGene:cds all.bed # FIRST ITERATION: just a bit too high, reduce target-coverage a bit: #flyBaseGene:cds 16.567%, all.bed 35.522%, both 11.442%, cover 69.07%, enrich 1.94x # SECOND ITERATION: OK, this is the first time I've seen a reduction in # target-coverage result in an increase in CDS coverage. Dang. Well, # I'll take it as a sign that we're close enough (esp. since each iteration # take a half-day now). #flyBaseGene:cds 16.567%, all.bed 35.514%, both 11.446%, cover 69.09%, enrich 1.95x # Having met the CDS coverage target, load up the results. 
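# (Quick pre-load sanity check -- sketch only, not part of the original run:
# count the elements and confirm the lodToBedScore-transformed scores landed
# in the 0-1000 range expected for BED scores.)
wc -l all.bed
awk 'NR==1 {min=$5; max=$5} $5<min {min=$5} $5>max {max=$5} END {print "score range:", min, "-", max}' all.bed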
hgLoadBed dm2 phastConsElements12way all.bed # Create wiggle ssh pk mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.wib cd /cluster/data/dm2/bed/multiz12way/phastCons/run.wib rm -rf /san/sanvol1/scratch/dm2/phastCons/wib mkdir -p /san/sanvol1/scratch/dm2/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /san/sanvol1/scratch/dm2/phastCons/wib zcat `ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para make jobList para time #Completed: 13 of 13 jobs #Average job time: 10s 0.16m 0.00h 0.00d #Longest finished job: 24s 0.40m 0.01h 0.00d #Submission to last job: 99s 1.65m 0.03h 0.00d # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from san/sanvol1 cd /cluster/data/dm2/bed/multiz12way/phastCons rm -rf wib POSTPROBS rsync -av /san/sanvol1/scratch/dm2/phastCons/wib . rsync -av /san/sanvol1/scratch/dm2/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/dm2/multiz12way/wib cd /cluster/data/dm2/bed/multiz12way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz12way/wib/ hgLoadWiggle dm2 phastCons12way \ -pathPrefix=/gbdb/dm2/multiz12way/wib wib/*.wig rm wiggle.tab # and clean up san/sanvol1. rm -r /san/sanvol1/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /san/sanvol1/scratch/dm2/chrom # Offer raw scores for download since fly folks are likely to be interested: ssh kkstore01 cd /cluster/data/dm2/bed/multiz12way/phastCons/POSTPROBS mkdir ../postprobsDownload foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`) zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \ > ../postprobsDownload/$chr.pp.gz end cd ../postprobsDownload md5sum *.gz > md5sum.txt # Make a README.txt there too. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons12way cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons12way ln -s /cluster/data/dm2/bed/multiz12way/phastCons/postprobsDownload/* . # MAKE 11.OOC FILE FOR BLAT (DONE 9/19/05 angie) # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is ~4.4% of human judging by gapless dm1 genome size from # featureBits -- we would use 45, but bump that up a bit to be more # conservative). ssh kolossus blat /cluster/data/dm2/dm2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/dm2/11.ooc -repMatch=100 #Wrote 3031 overused 11-mers to /cluster/bluearc/dm2/11.ooc # BAC END PAIRS (DONE 9/20/05 angie) # Excerpt from an email from Roger Hoskins, 9/17/05: # > There are two end-sequenced BAC libraries. The RPCI-98 BAC library, # > produced by Pieter and Kazu, is called "BDGP" on the Genoscope # > website; individual clones are named BACR01A01 - BACR48H12 (96-well # > format; R stands for EcoRI). The DrosBAC library, produced in France # > for the European Drosophila Genome Project, is called "EDGP" on the # > Genoscope website; individual clones are named BACN01A01 - BACN47H12 # > and BACH48A01-BACH61H12 (N stands for NdeII; H stands for HinDIII). # > # > It's not necessary to query GenBank for these data. 
The BAC end # > sequences are available from the Genoscope web site, Resources page: # > http://www.genoscope.cns.fr/externe/English/Outils/ # > For RPCI-98: # > http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL # > For DrosBAC (EDGP): # > http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM # > # > Note that if you decide to do this, it would be important to have a # > fairly high stringency imposed on the sequence alignments for mapping. # > BACs only go to one location in the genome; paired ends must point # > toward each other and be separated by an interval of say 50kb to 250 ssh kolossus # fileserver is unhappy... so work on bluearc for now, copy over later: mkdir -p /cluster/bluearc/dm2/bed/bacends cd /cluster/bluearc/dm2/bed/bacends wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM faSize * #40016446 bases (3778317 N's 36238129 real 33260718 upper 2977391 lower) in 41500 sequences in 2 files #Total size: mean 964.3 sd 202.7 min 1 (AL0AA030DD12A1) max 1373 (AM0AA015AE07BP1) median 1009 #N count: mean 91.0 sd 80.8 #U count: mean 801.5 sd 207.4 #L count: mean 71.7 sd 45.5 cat banque_Projet_A* > bacends.fa # Edit the part out of bacends.fa. # Fly is much smaller than human so we can get away with one job # per chrom, no splitting... ssh kki cd /cluster/bluearc/dm2/bed/bacends mkdir out echo bacends.fa > bacends.lst ls -1S /iscratch/i/dm2/nib/* > dm2.lst cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/blat -noHead $(path1) $(path2) -ooc=/cluster/bluearc/dm2/11.ooc {check out exists out/$(root1).$(root2).psl} #ENDLOOP 'EOF' # << for emacs gensub2 dm2.lst bacends.lst gsub jobList para make jobList para time #Completed: 13 of 13 jobs #Average job time: 79s 1.32m 0.02h 0.00d #Longest finished job: 360s 6.00m 0.10h 0.00d #Submission to last job: 360s 6.00m 0.10h 0.00d # Back on kolossus: cat out/*.psl \ | pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ stdin bacEnds.unpaired.psl /dev/null #..Processed 318892 alignments # Translate the particular style of FASTA headers into bacEndPairs.txt: perl -we 'while (<>) { \ if (/^>(\w+)\s+BAC=(\w+)\s+END=(\w+)/) { \ if (! defined $bacs{$2}) { \ $bacs{$2}->[0] = ""; $bacs{$2}->[1] = ""; \ } \ if ($3 eq "T7") { \ $bacs{$2}->[0] .= "$1,"; \ } else { \ $bacs{$2}->[1] .= "$1,"; \ } \ } elsif (/^>/) { \ die "parse:\n$_\n"; \ } \ } \ foreach $bac (keys %bacs) { \ $t7 = $bacs{$bac}->[0]; $t7 = "," if ($t7 eq ""); \ $other = $bacs{$bac}->[1]; $other = "," if ($other eq ""); \ print "$t7\t$other\t$bac\n"; \ }' bacends.fa \ > bacEndPairs.txt # Roger's suggested 50kb-250kb range lost some (especially on the short # side), so I ended up going with Terry's range for human, 35kb-350kb: pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \ -slopval=10000 -hardMax=500000 \ -slop -short -long -orphan -mismatch -verbose \ bacEnds.unpaired.psl bacEndPairs.txt all_bacends bacEnds wc -l bacEnds.* # 21 bacEnds.long # 180 bacEnds.mismatch # 7037 bacEnds.orphan # 11417 bacEnds.pairs # 37 bacEnds.short # 126 bacEnds.slop # 118041 bacEnds.unpaired.psl # (With Roger's sugg. 
# 50kb-250kb, there were 575 short, 53 long)
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
    | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
wc -l *.bed
# 11410 bacEndPairs.bed
# 4842 bacEndPairsBad.bed
# Filter the alignments into those we'll need in the all_bacends table:
extractPslLoad -noBin bacEnds.unpaired.psl \
    bacEndPairs.bed bacEndPairsBad.bed \
    | sorttbl tname tstart | headchg -del \
    > bacEnds.load.psl
cd ..
mv /cluster/bluearc/dm2/bed/bacends /cluster/data/dm2/bed/bacends
# load into database
ssh hgwdev
cd /cluster/data/dm2/bed/bacends
hgLoadBed dm2 bacEndPairs bacEndPairs.bed \
    -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed dm2 bacEndPairsBad bacEndPairsBad.bed \
    -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
hgLoadPsl dm2 -table=all_bacends bacEnds.load.psl
# load BAC end sequences
mkdir -p /gbdb/dm2/bacends
ln -s /cluster/data/dm2/bed/bacends/bacends.fa /gbdb/dm2/bacends/
hgLoadSeq dm2 /gbdb/dm2/bacends/bacends.fa

##########################################################################
# NSCAN track - (2005-09-29 markd) loaded proteins 2005-10-13
cd /cluster/data/dm2/bed/nscan/
# obtained NSCAN-EST predictions from michael brent's group at WUSTL
wget http://genome.cse.wustl.edu/predictions/Drosophila/Dm2_09_19_05/Dm2_09_19_05.tar.gz
tar -zxf Dm2_09_19_05.tar.gz
# change protein fasta file to have transcript id in header
foreach f (chr_ptx/*.ptx)
  awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
ldHgGene -gtf -genePredExt dm2 nscanGene chr_gtf/chr*.gtf
hgPepPred dm2 generic nscanPep chr_ptx/chr*.fix
rm -rf chr_* *.tab
# update trackDb; need a dm2-specific page to describe informants
mouse/dm2/nscanGene.html
mouse/dm2/trackDb.ra

# BLASTZ/CHAIN/NET DROYAK2 (DONE 11/23/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droYak2.2005-11-23
cd /cluster/data/dm2/bed/blastz.droYak2.2005-11-23
cat << '_EOF_' > DEF
# D. melanogaster vs. D. yakuba
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. yakuba
SEQ2_DIR=/iscratch/i/droYak2/droYak2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droYak2.2005-11-23
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
    -blastzOutRoot /cluster/bluearc/dm2droYak2 >& do.log & tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroYak2Link
#flyBaseGene 22.967%, chainDroYak2Link 91.943%, both 22.758%, cover 99.09%, enrich 1.08x
rm -f /cluster/data/dm2/bed/blastz.droYak2
ln -s blastz.droYak2.2005-11-23 /cluster/data/dm2/bed/blastz.droYak2

# BLASTZ/CHAIN/NET DROPER1 (DONE 11/29/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droPer1.2005-11-28
cd /cluster/data/dm2/bed/blastz.droPer1.2005-11-28
cat << '_EOF_' > DEF
# D. melanogaster vs. D. persimilis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D.
persimilis SEQ2_DIR=/iscratch/i/droPer1/droPer1.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droPer1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droPer1.2005-11-28 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droPer1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroPer1Link #flyBaseGene 22.967%, chainDroPer1Link 74.744%, both 21.320%, cover 92.83%, enrich 1.24x rm -f /cluster/data/dm2/bed/blastz.droPer1 ln -s blastz.droPer1.2005-11-28 /cluster/data/dm2/bed/blastz.droPer1 # BLASTZ/CHAIN/NET DROSEC1 (DONE 12/5/05 angie) mkdir /cluster/data/dm2/bed/blastz.droSec1.2005-11-30 cd /cluster/data/dm2/bed/blastz.droSec1.2005-11-30 cat << '_EOF_' > DEF # D. melanogaster vs. D. sechellia BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. sechellia SEQ2_DIR=/iscratch/i/droSec1/droSec1.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droSec1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droSec1.2005-11-30 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droSec1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroSec1Link #flyBaseGene 22.967%, chainDroSec1Link 92.860%, both 22.759%, cover 99.09%, enrich 1.07x rm -f /cluster/data/dm2/bed/blastz.droSec1 ln -s blastz.droSec1.2005-11-30 /cluster/data/dm2/bed/blastz.droSec1 # FLYBASE 4.2 ANNOTATIONS (DONE (except for contacting flybase-help) 1/13/06 angie) # RELOADED flyBaseNoncoding and flyBase2004Xref 5/30/06 after changing # extractGenes to catch some new RNA keywords after user reported loss of # many mir's. (They were marked "ncRNA" in 4.1 but "miRNA" in 4.2.) # Now extractGenes catches [a-z]+RNA. # RELOADED flyBaseGene 7/12/06 after user reported a problem with split # start codons -- cause by error in extractGenes.pl that caused us not to # notice a single-base overlap between CDS range and exon range. ssh kkstore01 mkdir /cluster/data/dm2/bed/flybase4.2 cd /cluster/data/dm2/bed/flybase4.2 foreach c (2L 2R 3L 3R 4 X) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.2_20050909/gff/dmel-$c-r4.2.1.gff.gz end zcat *.gff.gz > flybase.gff3 # What data sources are represented in this file? grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c # excerpt (many other sources, including blastx:... , sim4:... and # tblastx:...; also various other types for source "."): 19178 . CDS 63583 . exon 14380 . gene 19178 . mRNA 66 . miRNA 113 . ncRNA 49 . pseudogene 96 . rRNA 46 . snRNA 63 . snoRNA 294 . tRNA 36921 . transcription_start_site 6006 . transposable_element 33268 . transposable_element_insertion_site # What keywords are defined in the 9th field? grep -v '^#' flybase.gff3 \ | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \ | sort | uniq -c 117483 Dbxref 3111982 ID 1428395 Name 847565 Parent 20326 cyto_range 46928 dbxref_2nd 15705 gbunit 6 species 92077 synonym 10032 synonym_2nd # Wow... for the first time, it looks like the same flavor of GFF3 # has been used as last time! cp ../flybase4.1/extractGenes.pl . 
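# (Before re-running the copied script on the new release -- a throwaway
# sketch, not part of the original build: confirm that the gene/mRNA/CDS/exon
# records in this GFF3 flavor all carry an ID or Parent attribute, since the
# keyword survey above is what the extraction relies on.)
awk -F'\t' '$3 ~ /^(gene|mRNA|CDS|exon)$/ {tot++; if ($9 ~ /(^|;)(ID|Parent)=/) ok++} END {print ok+0, "of", tot+0, "records have ID or Parent"}' flybase.gff3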
extractGenes.pl flybase.gff3 # Get predicted proteins (for main annotations only) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.2_20050909/fasta/dmel-all-translation-r4.2.1.fasta perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' dmel-all-translation-r4.2.1.fasta \ > flybasePep.fa ssh hgwdev cd /cluster/data/dm2/bed/flybase4.2 # Protein-coding genes: ldHgGene -gtf dm2 flyBaseGene flybase.gtf hgPepPred dm2 generic flyBasePep flybasePep.fa # Noncoding genes: hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed # Cross-referencing info for both coding and noncoding: hgLoadSqlTab dm2 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \ flyBase2004Xref.tab # Some featureBits comparisons with refGene which shoudl be pretty much # like version 4.0 with a few noncoding genes tossed in... but this is # a bigger difference than we expect: featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout #516631 bases of 131698467 (0.392%) in intersection # Hmmm, flyBaseGene has nothing on the last 175,000bp of chrX!: # chrX:22,039,591-22,224,390 # nor the first 40,000bp of chrX: # chrX:1-102,664 # nor the last 270,000bp of chr3L: # chr3L:23,273,167-23,771,897 # nor the first 500,000bp of chr2R: # chr2R:1-600,000 # Also quite a few on the chr*h but I won't look at all those. #TODO: contact flybase-help about the lack of annotations in those ranges... # This shows only the dropped isoforms/duplicates (ignores noncoding): featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \ -bed=stdout #chr3R 15620310 15621493 chr3R.1 #chr2L 4698094 4700940 chr2L.1 #chrX 3638700 3639895 chrX.1 #474282 bases of 131698467 (0.360%) in intersection # add upstream* downloadable files cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips foreach size (1000 2000 5000) echo upstream$size nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \ | nice gzip -c > upstream$size.fa.gz end nice md5sum *.zip *.gz > md5sum.txt # 2/6/06: fill in missing symbol values with accession so hgNear # has something to draw a link on... hgsql dm2 -e 'update flyBase2004Xref set symbol = name where symbol = ""' # 7/12/06: reload flyBaseGene after correcting extractGenes.pl. ldHgGene -gtf dm2 flyBaseGene flybase.gtf # FLYBASE ACODE CROSS-REFERENCING DATA (DONE 1/27/06 angie) # 10/11/06 -- reloaded after updating flyBaseGene, but no change. ssh hgwdev mkdir /cluster/data/dm2/bed/flyBase cd /cluster/data/dm2/bed/flyBase wget -O genes.txt.061011 \ ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt # Had to edit genes.txt.060126 to get around these two lines that didn't # start with acode symbols: #line 1609033 #Spiracle sheaths dark. #line 3664838 (3664839 of original) #Ommatidia are no hgFlyBase dm2 genes.txt.061011 -geneTable=flyBaseGene #Processed 57068 records in 4067314 lines # MAP UNIPROT IDS TO CG*-R* IDS (DONE 1/27/06 angie - REDONE 10/11/06) # The UniProt.description table actually contains the most specific # mapping of UniProt IDs to CG*-R* transcript IDs. Use that when # available, and otherwise use the more generic mapping to gene IDs # that we extracted from the acode into fbUniProt. ssh hgwdev mkdir /cluster/data/dm2/bed/uniProtToFlyBase cd /cluster/data/dm2/bed/uniProtToFlyBase set taxon = `hgsql -N uniProt -e 'select id from taxon where binomial = "Drosophila melanogaster";'` # Grab UniProt descriptions of all D. mel. proteins so we can parse out # CG*-R* transcript IDs (and CG* gene IDs when -R* not specified). 
hgsql -N uniProt -e \ 'select description.* from description,accToTaxon where accToTaxon.taxon = '$taxon' and accToTaxon.acc = description.acc' \ > uniProt.description.txt # fbUniProt maps FBgn* IDs to multiple UniProt IDs; use flyBase2004Xref # to map FBgn* to CG* (actually CG*-R* which implies more precision than # there really is). hgsql -N dm2 -e \ 'select name,uniProtId from flyBase2004Xref,fbUniProt where fbgn=geneId' \ | sort > flyBaseMapping.txt # Grab the complete list of CG*-R* transcript IDs: hgsql -N dm2 -e 'select name from flyBaseGene' | sort > transcriptIds.txt ssh kolossus cd /cluster/data/dm2/bed/uniProtToFlyBase # Wrote a perl script, gleanDescription.pl, to parse the various patterns # used to encode real mappings (not mappings to homologs or things that # look like CG*-P? IDs but aren't really). chmod a+x gleanDescription.pl ./gleanDescription.pl uniProt.description.txt | sort > uniProtMapping.txt # Look for duplicates: awk '{print $1;}' uniProtMapping.txt | uniq -c | awk '$1 != 1 {print;}' # 2 CG17131-RA # 5 CG32637-RA grep CG17131-PA uniProt.description.txt #Q8MS37 RE15579p (CG17131-PA). #Q7PLU3 CG17131-PA.3. # I looked up both Q8MS37 and Q7PLU3 on http://www.pir.uniprot.org/ # and both do mention CG17131-PA... Q8MS37 has been updated more recently # so I'll just delete the Q7PLU3 mapping. grep CG32637-PA uniProt.description.txt #Q5K354 CG32637-PA protein (Fragment). #Q5K356 CG32637-PA protein (Fragment). #Q5K357 CG32637-PA protein (Fragment). #Q5K361 CG32637-PA protein (Fragment). #Q8IR71 CG32637-PA. # Here it's more clear-cut -- delete the fragments. egrep -v '(Q7PLU3|Q5K354|Q5K356|Q5K357|Q5K361)' uniProtMapping.txt \ > tmp; mv tmp uniProtMapping.txt # Wrote a perl script, vagueDescription.pl, to parse out CG* without # isoform specs -- can use those as backups when we don't have a more # specific mapping for a transcript. chmod a+x vagueDescription.pl ./vagueDescription.pl uniProt.description.txt | sort \ > uniProtMappingVague.txt # Write a perl script, combineMappings.pl, to add any additional mappings # available from acode to the UniProt mappings. chmod a+x combineMappings.pl ./combineMappings.pl transcriptIds.txt \ uniProtMapping.txt uniProtMappingVague.txt flyBaseMapping.txt \ > flyBaseToUniProt.txt # Make a 2-column version for loading: awk '{print $1 "\t" $2;}' flyBaseToUniProt.txt > flyBaseToUniProtLoad.txt # Load into a genericAlias-type table (this ignores the third column). ssh hgwdev cd /cluster/data/dm2/bed/uniProtToFlyBase sed -e 's/genericAlias/flyBaseToUniProt/' \ ~/kent/src/hg/lib/genericAlias.sql > flyBaseToUniProt.sql hgLoadSqlTab dm2 flyBaseToUniProt flyBaseToUniProt.sql \ flyBaseToUniProtLoad.txt ########################################################################## # HGNEAR TABLES # HGNEAR PROTEIN BLAST TABLES (DONE 1/31/06 angie - REDONE 4/6/07) # mmBlastTab redone 5/1/06 -- makeMm8.doc run used outdated proteins # hgBlastTab redone 5/9/06 -- makeHg18.doc run used outdated proteins # Redone 4/3/07 with flybase4.3; mitochondrial proteins yanked 4/6/07 # because they aren't in flyBase2004Xref and were causing joinerCheck errors. 
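# (Hindsight sketch, not part of the original run: a quick look for
# mitochondrial entries in flyBasePep before kicking off the cluster blastp
# would have flagged the "mt:" proteins up front.  Assumes the standard
# generic pepPred schema, i.e. a "name" column.)
hgsql -N dm2 -e 'select count(*) from flyBasePep where name like "mt:%"'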
ssh hgwdev mkdir /cluster/data/dm2/bed/hgNearBlastp cd /cluster/data/dm2/bed/hgNearBlastp mkdir 070403 cd 070403 # Get the proteins used by the other hgNear organisms: pepPredToFa hg18 knownGenePep hg18.known.faa pepPredToFa mm8 knownGenePep mm8.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer3 ensPep danRer3.ensPep.faa pepPredToFa dm2 flyBasePep dm2.flyBasePep.faa pepPredToFa ce2 sangerPep ce2.sangerPep.faa pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa # Galt's synBlastp.csh filtering, which uses liftOver chains, # would be the next step if dm2 were reasonably closely related # (like mammal-mammal) to another Gene Sorter organism. But it is # pretty distant from all the others, so specify recipBest for all # query databases. cat << _EOF_ > config.ra # Latest fly vs. other Gene Sorter orgs: # human, mouse, rat, zebrafish, worm, yeast targetGenesetPrefix flyBase targetDb dm2 queryDbs hg18 mm8 rn4 danRer3 ce2 sacCer1 recipBest hg18 mm8 rn4 danRer3 ce2 sacCer1 dm2Fa /cluster/data/dm2/bed/hgNearBlastp/070403/dm2.flyBasePep.faa hg18Fa /cluster/data/dm2/bed/hgNearBlastp/070403/hg18.known.faa mm8Fa /cluster/data/dm2/bed/hgNearBlastp/070403/mm8.known.faa rn4Fa /cluster/data/dm2/bed/hgNearBlastp/070403/rn4.known.faa danRer3Fa /cluster/data/dm2/bed/hgNearBlastp/070403/danRer3.ensPep.faa ce2Fa /cluster/data/dm2/bed/hgNearBlastp/070403/ce2.sangerPep.faa sacCer1Fa /cluster/data/dm2/bed/hgNearBlastp/070403/sacCer1.sgdPep.faa buildDir /cluster/data/dm2/bed/hgNearBlastp/070403 scratchDir /san/sanvol1/scratch/dm2HgNearBlastp _EOF_ # Run with -noLoad so we can eyeball files, manually load dm2 tables now, # and later overload other databases' dmBlastTab tables. ~/kent/src/hg/utils/automation/doHgNearBlastp.pl -noLoad -workhorse kkr4u00 config.ra >& do.log & tail -f do.log # Ran these manually: # *** -noLoad was specified -- you can run this script manually to load dm2 tables: run.dm2.dm2/loadPairwise.csh #Loading database with 1171077 rows # *** -noLoad was specified -- you can run these scripts manually to load dm2 tables: run.dm2.hg18/loadPairwise.csh #Loading database with 5833 rows run.dm2.mm8/loadPairwise.csh #Loading database with 5557 rows run.dm2.rn4/loadPairwise.csh #Loading database with 3015 rows run.dm2.danRer3/loadPairwise.csh #Loading database with 5253 rows run.dm2.ce2/loadPairwise.csh #Loading database with 4537 rows run.dm2.sacCer1/loadPairwise.csh #Loading database with 2172 rows # 4/6/07: remove mitochondrial genes from *BlastTab because they # are missing from flyBase2004Xref and caused joinerCheck errors. # Would have been better to have joinerCheck'd flyBasePep before # running doHgNearBlastp. hgsql dm2 -e 'delete from flyBaseBlastTab where target like "mt:%"' foreach table (hgBlastTab mmBlastTab rnBlastTab \ drBlastTab ceBlastTab scBlastTab) echo $table hgsql dm2 -e 'delete from '$table' where query like "mt:%"' end # 4/6/07: dm2 FlyBase/hgNear/hgGene update is in the pushQ, so I think # it's a fine time to update dmBlastTab in other databases with the # latest, so hgwdev is all consistent. When that push is done, will # need another push request for *.dmBlastTab loaded here. # *** -noLoad was specified -- you can run these scripts manually to load dmBlastTab in query databases: run.hg18.dm2/loadPairwise.csh run.mm8.dm2/loadPairwise.csh run.rn4.dm2/loadPairwise.csh run.danRer3.dm2/loadPairwise.csh run.ce2.dm2/loadPairwise.csh run.sacCer1.dm2/loadPairwise.csh # Yank the "mt:%" targets from those to avoid joinerCheck errors. 
foreach db (hg18 mm8 rn4 danRer3 ce2 sacCer1) hgsql $db -e 'delete from dmBlastTab where target like "mt:%"' end # CLUSTER GENES AND MAP TO OTHER DATA (DONE 2/6/06 angie - REDONE 4/3/07) # flyBaseToRefSeq regenerated 5/9/06 ssh hgwdev # First we have to add a proteinId column to flyBaseGene. hgClusterGenes dm2 flyBaseGene flyBaseIsoforms flyBaseCanonical \ -protAccQuery='select name,alias from flyBaseToUniProt' #Got 13566 clusters, from 19452 genes in 13 chromosomes # Create table that maps between flyBase genes and RefSeq # Re-ran 5/9/06 to resolve some joinerCheck errors about retired refSeqs: hgMapToGene dm2 refGene flyBaseGene flyBaseToRefSeq # Create table that maps between known genes and Pfam domains hgMapViaSwissProt dm2 flyBaseToUniProt name alias Pfam flyBaseToPfam # Create a table that maps between flyBase genes and the # Stanford Microarray Project expression data. (see makeHgFixed.doc) hgsql dm2 -e 'drop table if exists flyBaseToCG; create table flyBaseToCG \ select name,SUBSTRING_INDEX(name,"-",1) as value from flyBaseGene' hgsql dm2 -e 'create index name on flyBaseToCG(name(12))' hgExpDistance dm2 hgFixed.arbFlyLifeMedianRatio dummyArg \ arbExpDistance -lookup=flyBaseToCG #TODO: add the yale expr data too (feep) # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. # MAKE A DESCRIPTION TABLE FOR HGNEAR KNOWNDETAILS (DONE 2/6/06 - REDONE 4/3/07) # hgNear's knownDetails column type expects a single table that links # transcript ID with uniProt description, so whip one up here: # (used by hgGene too though hgGene could join on the fly) ssh hgwdev hgsql dm2 -e 'drop table if exists flyBaseToDescription; \ create table flyBaseToDescription \ select flyBaseToUniProt.name,uniProt.description.val \ from flyBaseToUniProt,uniProt.description \ where flyBaseToUniProt.alias = uniProt.description.acc' hgsql dm2 -e 'create index name on flyBaseToDescription(name(12))' # FLYP2P (DONE 2/6/06 angie) # See hg/near/makeNear.doc... # MAKE ORGANISM-SPECIFIC HGNEARDATA AND HGGENEDATA FILES (DONE 2/7/06 angie) cd ~/kent/src/hg/near/hgNear/hgNearData # Made dm1 and dm2 subdirs of D_melanogaster with separate sets of # .ra files... see cvs revision history. # Similarly for hgGene/hgGeneData. # ENABLE HGNEAR FOR DM2 IN HGCENTRALTEST (DONE 2/7/06 angie) hgsql -h genome-testdb hgcentraltest \ -e "update dbDb set hgNearOk = 1 where name = 'dm2';" # REDO DM2-HG18 PAIRWISE BLAST (DONE 2/24/06 angie) # Oops, hgBlastTab got overwritten with dm2 4.1 genes by the # hg18 build process. Tweak back to 4.2: ssh hgwdev mkdir /cluster/data/dm2/bed/hgNearBlastp cd /cluster/data/dm2/bed/hgNearBlastp cat << _EOF_ > config.ra.hg18redo # Redo vs. 
hg18 with 4.2 genes: targetGenesetPrefix flyBase targetDb dm2 queryDbs hg18 dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa hg18Fa /cluster/data/hg18/bed/blastp/known.faa buildDir /cluster/data/dm2/bed/hgNearBlastp scratchDir /san/sanvol1/scratch/dm2HgNearBlastp _EOF_ doHgNearBlastp.pl -noSelf config.ra.hg18redo >& do.log & tail -f do.log # END OF HGNEAR STUFF ############################################################################# # RECIPROCAL BEST DM2-DROSIM1 (DONE 2/28/06 angie - REDONE 6/19/06) # Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago) ssh kolossus cd /cluster/data/dm2/bed/blastz.droSim1/axtChain # Swap dm2-best chains to be droSim1-referenced: chainStitchId dm2.droSim1.over.chain.gz stdout \ | chainSwap stdin stdout \ | chainSort stdin droSim1.dm2.tBest.chain # Net those on droSim1 to get droSim1-ref'd reciprocal best net: chainPreNet droSim1.dm2.tBest.chain \ /cluster/data/{droSim1,dm2}/chrom.sizes stdout \ | chainNet -minSpace=1 stdin /cluster/data/{droSim1,dm2}/chrom.sizes \ stdout /dev/null \ | netSyntenic stdin droSim1.dm2.rbest.net # Extract droSim1-ref'd reciprocal best chain: netChainSubset droSim1.dm2.rbest.net droSim1.dm2.tBest.chain stdout \ | chainStitchId stdin stdout \ | chainSort stdin droSim1.dm2.rbest.chain # Swap to get dm2-ref'd reciprocal best chain: chainSwap droSim1.dm2.rbest.chain stdout \ | chainSort stdin dm2.droSim1.rbest.chain # Net those on dm2 to get dm2-ref'd reciprocal best net: chainPreNet dm2.droSim1.rbest.chain \ /cluster/data/{dm2,droSim1}/chrom.sizes stdout \ | chainNet -minSpace=1 -minScore=0 \ stdin /cluster/data/{dm2,droSim1}/chrom.sizes stdout /dev/null \ | netSyntenic stdin dm2.droSim1.rbest.net # Compress and clean up. rm droSim1.dm2.tBest.chain gzip *.rbest.* md5sum *.rbest.*.gz > md5sum.txt # Test coverage of *.rbest.* -- should be equal. netToBed -maxGap=1 droSim1.dm2.rbest.net.gz droSim1.dm2.rbest.net.bed netToBed -maxGap=1 dm2.droSim1.rbest.net.gz dm2.droSim1.rbest.net.bed chainToPsl droSim1.dm2.rbest.chain.gz \ /cluster/data/{droSim1,dm2}/chrom.sizes \ /cluster/data/droSim1/droSim1.2bit /cluster/data/dm2/dm2.2bit \ droSim1.dm2.rbest.chain.psl chainToPsl dm2.droSim1.rbest.chain.gz \ /cluster/data/{dm2,droSim1}/chrom.sizes \ /cluster/data/dm2/dm2.2bit /cluster/data/droSim1/droSim1.2bit \ dm2.droSim1.rbest.chain.psl ssh hgwdev cd /cluster/data/dm2/bed/blastz.droSim1/axtChain featureBits droSim1 droSim1.dm2.rbest.net.bed #104047585 bases of 127256433 (81.762%) in intersection featureBits droSim1 droSim1.dm2.rbest.chain.psl #104047585 bases of 127256433 (81.762%) in intersection featureBits dm2 dm2.droSim1.rbest.chain.psl #104047585 bases of 131698467 (79.004%) in intersection featureBits dm2 dm2.droSim1.rbest.net.bed #104047585 bases of 131698467 (79.004%) in intersection mkdir experiments mv *.bed *.psl experiments # Put out for download (hgwdev-only) ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest cd /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest rm -f *.gz md5sum.txt ln -s /cluster/data/dm2/bed/blastz.droSim1/axtChain/*.rbest.*.gz . ln -s /cluster/data/dm2/bed/blastz.droSim1/axtChain/md5sum.txt . # Make a README.txt. 
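# (Instead of eyeballing the four featureBits numbers above, a minimal
# sketch that sums the interval lengths of the two net beds now parked in
# experiments/; the two totals should be identical, assuming netToBed
# emitted non-overlapping blocks:)
foreach f (/cluster/data/dm2/bed/blastz.droSim1/axtChain/experiments/*.rbest.net.bed)
  awk -v f=$f '{sum += $3-$2} END {print f, sum}' $f
end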
# RECIPROCAL BEST DM2-DROYAK2 (DONE 6/20/06 angie) # Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago) ssh kolossus cd /cluster/data/dm2/bed/blastz.droYak2/axtChain # Swap dm2-best chains to be droYak2-referenced: chainStitchId dm2.droYak2.over.chain.gz stdout \ | chainSwap stdin stdout \ | chainSort stdin droYak2.dm2.tBest.chain # Net those on droYak2 to get droYak2-ref'd reciprocal best net: chainPreNet droYak2.dm2.tBest.chain \ /cluster/data/{droYak2,dm2}/chrom.sizes stdout \ | chainNet -minSpace=1 stdin /cluster/data/{droYak2,dm2}/chrom.sizes \ stdout /dev/null \ | netSyntenic stdin droYak2.dm2.rbest.net # Extract droYak2-ref'd reciprocal best chain: netChainSubset droYak2.dm2.rbest.net droYak2.dm2.tBest.chain stdout \ | chainStitchId stdin droYak2.dm2.rbest.chain # Swap to get dm2-ref'd reciprocal best chain: chainSwap droYak2.dm2.rbest.chain stdout \ | chainSort stdin dm2.droYak2.rbest.chain # Net those on dm2 to get dm2-ref'd reciprocal best net: chainPreNet dm2.droYak2.rbest.chain \ /cluster/data/{dm2,droYak2}/chrom.sizes stdout \ | chainNet -minSpace=1 -minScore=0 \ stdin /cluster/data/{dm2,droYak2}/chrom.sizes stdout /dev/null \ | netSyntenic stdin dm2.droYak2.rbest.net # Compress and clean up. rm droYak2.dm2.tBest.chain gzip *.rbest.* md5sum *.rbest.*.gz > md5sum.rbest.txt # Test coverage of *.rbest.* -- should be equal. netToBed -maxGap=1 droYak2.dm2.rbest.net.gz droYak2.dm2.rbest.net.bed netToBed -maxGap=1 dm2.droYak2.rbest.net.gz dm2.droYak2.rbest.net.bed chainToPsl droYak2.dm2.rbest.chain.gz \ /cluster/data/{droYak2,dm2}/chrom.sizes \ /cluster/data/droYak2/droYak2.2bit /cluster/data/dm2/dm2.2bit \ droYak2.dm2.rbest.chain.psl chainToPsl dm2.droYak2.rbest.chain.gz \ /cluster/data/{dm2,droYak2}/chrom.sizes \ /cluster/data/dm2/dm2.2bit /cluster/data/droYak2/droYak2.2bit \ dm2.droYak2.rbest.chain.psl ssh hgwdev cd /cluster/data/dm2/bed/blastz.droYak2/axtChain featureBits droYak2 droYak2.dm2.rbest.net.bed #105927235 bases of 162681153 (65.113%) in intersection featureBits droYak2 droYak2.dm2.rbest.chain.psl #105927235 bases of 162681153 (65.113%) in intersection featureBits dm2 dm2.droYak2.rbest.chain.psl #105927235 bases of 131698467 (80.432%) in intersection featureBits dm2 dm2.droYak2.rbest.net.bed #105927235 bases of 131698467 (80.432%) in intersection mkdir experiments mv *.bed *.psl experiments # Put out for download (hgwdev-only) ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest cd /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest rm -f *.gz md5sum.txt ln -s /cluster/data/dm2/bed/blastz.droYak2/axtChain/*.rbest.*.gz . ln -s /cluster/data/dm2/bed/blastz.droYak2/axtChain/md5sum.rbest.txt md5sum.txt # Make a README.txt. 
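# (Last sketch-level check before pointing users at the downloads: make sure
# none of the symlinks just created in the two reciprocalBest directories
# are dangling.)
find /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest \
    /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest \
    -type l -not -exec test -e {} \; -print
# (no output means every link resolves)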
######################################################################## ### microRNA targets tracks (DONE - 2006-03-29 - 2006-04-26 - Hiram) ### from: http://pictar.bio.nyu.edu/ Rajewsky Lab ### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu ### Yi-Lu Wang ylw205@nyu.edu ### dg@thp.Uni-Koeln.DE ssh hgwdev mkdir /cluster/data/dm2/bed/picTar cd /cluster/data/dm2/bed/picTar wget --timestamping \ 'http://pictar.bio.nyu.edu/ucsc/new_melanogaster_S1_bed' \ -O newMelanogasterS1.bed wget --timestamping \ 'http://pictar.bio.nyu.edu/ucsc/new_melanogaster_S3_bed' \ -O newMelanogasterS3.bed grep -v "^track" newMelanogasterS1.bed \ | hgLoadBed -strict dm2 picTarMiRNAS1 stdin # Loaded 24103 elements of size 9 grep -v "^track" newMelanogasterS3.bed \ | hgLoadBed -strict dm2 picTarMiRNAS3 stdin # Loaded 11480 elements of size 9 nice -n +19 featureBits dm2 picTarMiRNAS1 # 59881 bases of 131698467 (0.045%) in intersection nice -n +19 featureBits dm2 picTarMiRNAS3 # 26231 bases of 131698467 (0.020%) in intersection # FLYBASE IN SITU IMAGES / EXPRESSION (DONE 5/3/06 angie - REDONE 4/7/07) # FlyBase has downloadable in situ images for BACs: # ftp://flybase.net/flybase/images/bac_insitu_pic/* # and FBti's: # ftp://flybase.net/flybase/images/in-situ-images/insitus.zip # but what Jim is interested in is the insitus for expression data. # Don't see that in the ftp listing, but they do make it easy to link in. ssh hgwdev cd /cluster/data/dm2/bed/flyBase wget -O summary.txt.070406 \ 'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext' # remove header line. hgLoadSqlTab dm2 bdgpExprLink ~/kent/src/hg/lib/bdgpExprLink.sql \ summary.txt.070406 ########################################################################### # LIFTOVER CHAINS DM2 -> DM3 (DONE 8/1/06 angie) # Started 8/1 -- This creates the directory and scripts: ~/kent/src/utils/doSameSpeciesLiftOver.pl dm2 dm3 -debug # Run it for real and log it: cd /cluster/data/dm2/bed/blat.dm3.2006-08-01 ~/kent/src/utils/doSameSpeciesLiftOver.pl dm2 dm3 \ >& do.log & tail -f do.log ########################################################################### # BLASTZ/CHAIN/NET DROERE2 (DONE 9/27/06 angie) ssh kkstore04 mkdir /cluster/data/dm2/bed/blastz.droEre2.2006-09-27 cd /cluster/data/dm2/bed/blastz.droEre2.2006-09-27 cat << '_EOF_' > DEF # D. melanogaster vs. D. erecta BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. erecta SEQ2_DIR=/iscratch/i/droEre2/droEre2.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droEre2/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droEre2.2006-09-27 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droEre2 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroEre2Link #flyBaseGene 23.333%, chainDroEre2Link 90.665%, both 23.063%, cover 98.84%, enrich 1.09x rm -f /cluster/data/dm2/bed/blastz.droEre2 ln -s blastz.droEre2.2006-09-27 /cluster/data/dm2/bed/blastz.droEre2 ########################################################################### # BLASTZ/CHAIN/NET DROGRI2 (DONE 9/27/06 angie) ssh kkstore04 mkdir /cluster/data/dm2/bed/blastz.droGri2.2006-09-27 cd /cluster/data/dm2/bed/blastz.droGri2.2006-09-27 cat << '_EOF_' > DEF # D. melanogaster vs. D. 
grimshawi BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. grimshawi SEQ2_DIR=/iscratch/i/droGri2/droGri2.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droGri2/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droGri2.2006-09-27 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droGri2 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroGri2Link #flyBaseGene 23.333%, chainDroGri2Link 63.808%, both 20.793%, cover 89.12%, enrich 1.40x rm -f /cluster/data/dm2/bed/blastz.droGri2 ln -s blastz.droGri2.2006-09-27 /cluster/data/dm2/bed/blastz.droGri2 ########################################################################### # BLASTZ/CHAIN/NET DROWIL1 (DONE 9/27/06 angie) ssh kkstore04 mkdir /cluster/data/dm2/bed/blastz.droWil1.2006-09-27 cd /cluster/data/dm2/bed/blastz.droWil1.2006-09-27 cat << '_EOF_' > DEF # D. melanogaster vs. D. willistoni BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. willistoni SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droWil1/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droWil1.2006-09-27 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /cluster/bluearc/dm2droWil1 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroWil1Link #flyBaseGene 23.333%, chainDroWil1Link 67.722%, both 21.106%, cover 90.45%, enrich 1.34x rm -f /cluster/data/dm2/bed/blastz.droWil1 ln -s blastz.droWil1.2006-09-27 /cluster/data/dm2/bed/blastz.droWil1 ########################################################################### # BLASTZ/CHAIN/NET DROANA3 (DONE 9/27/06 angie) ssh kkstore04 mkdir /cluster/data/dm2/bed/blastz.droAna3.2006-09-27 cd /cluster/data/dm2/bed/blastz.droAna3.2006-09-27 cat << '_EOF_' > DEF # D. melanogaster vs. D. ananassae BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. ananassae SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droAna3/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droAna3.2006-09-27 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droAna3 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna3Link #flyBaseGene 23.333%, chainDroAna3Link 79.307%, both 22.211%, cover 95.19%, enrich 1.20x rm -f /cluster/data/dm2/bed/blastz.droAna3 ln -s blastz.droAna3.2006-09-27 /cluster/data/dm2/bed/blastz.droAna3 ########################################################################### # BLASTZ/CHAIN/NET DROMOJ3 (DONE 9/27/06 angie) ssh kkstore04 mkdir /cluster/data/dm2/bed/blastz.droMoj3.2006-09-27 cd /cluster/data/dm2/bed/blastz.droMoj3.2006-09-27 cat << '_EOF_' > DEF # D. melanogaster vs. D. mojavensis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. 
melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. mojavensis SEQ2_DIR=/iscratch/i/droMoj3/droMoj3.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droMoj3/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droMoj3.2006-09-27 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droMoj3 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj3Link #flyBaseGene 23.333%, chainDroMoj3Link 60.956%, both 20.676%, cover 88.61%, enrich 1.45x rm -f /cluster/data/dm2/bed/blastz.droMoj3 ln -s blastz.droMoj3.2006-09-27 /cluster/data/dm2/bed/blastz.droMoj3 ########################################################################### # BLASTZ/CHAIN/NET DROVIR3 (DONE 9/27/06 angie) ssh kkstore04 mkdir /cluster/data/dm2/bed/blastz.droVir3.2006-09-27 cd /cluster/data/dm2/bed/blastz.droVir3.2006-09-27 cat << '_EOF_' > DEF # D. melanogaster vs. D. virilis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. virilis SEQ2_DIR=/iscratch/i/droVir3/droVir3.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/droVir3/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.droVir3.2006-09-27 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm2droVir3 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir3Link #flyBaseGene 23.333%, chainDroVir3Link 62.998%, both 20.854%, cover 89.38%, enrich 1.42x rm -f /cluster/data/dm2/bed/blastz.droVir3 ln -s blastz.droVir3.2006-09-27 /cluster/data/dm2/bed/blastz.droVir3 ########################################################################### # BLASTZ/CHAIN/NET DP4 (DONE 9/27/06 angie) ssh kkstore04 mkdir /cluster/data/dm2/bed/blastz.dp4.2006-09-27 cd /cluster/data/dm2/bed/blastz.dp4.2006-09-27 cat << '_EOF_' > DEF # D. melanogaster vs. D. pseudoobscura BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - D. pseudoobscura SEQ2_DIR=/iscratch/i/dp4/dp4.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dp4/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.dp4.2006-09-27 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /cluster/bluearc/dm2dp4 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDp4Link #flyBaseGene 23.333%, chainDp4Link 73.891%, both 21.518%, cover 92.22%, enrich 1.25x rm -f /cluster/data/dm2/bed/blastz.dp4 ln -s blastz.dp4.2006-09-27 /cluster/data/dm2/bed/blastz.dp4 ########################################################################### # BLASTZ/CHAIN/NET TRICAS2 (DONE 9/28/06) ssh kkstore04 mkdir /cluster/data/dm2/bed/blastz.triCas2.2006-09-28 cd /cluster/data/dm2/bed/blastz.triCas2.2006-09-28 cat <<_EOF_ > DEF # D. melanogaster vs. T. castaneum BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm2/chrom.sizes # QUERY - T. 
castaneum SEQ2_DIR=/iscratch/i/triCas2/triCas2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/iscratch/i/triCas2/chrom.sizes BASE=/cluster/data/dm2/bed/blastz.triCas2.2006-09-28 _EOF_ doBlastzChainNet.pl DEF \ -blastzOutRoot /cluster/bluearc/dm2triCas2 >& do.log & tail -f do.log featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainTriCas2Link #flyBaseGene 23.333%, chainTriCas2Link 16.438%, both 10.740%, cover 46.03%, enrich 2.80x rm -f /cluster/data/dm2/bed/blastz.triCas2 ln -s blastz.triCas2.2006-09-28 /cluster/data/dm2/bed/blastz.triCas2 ########################################################################### # MULTIZ15WAY (DONE 9/28/06 angie) # (((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2) ssh kkstore04 mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28 cd /cluster/data/dm2/bed/multiz15way.2006-09-28 # copy MAF's to cluster-friendly server mkdir /san/sanvol1/scratch/dm2/mafNet foreach s (droSim1 droSec1 droYak2 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 anoGam1 apiMel2 triCas2) echo $s rsync -av /cluster/data/dm2/bed/blastz.$s/mafNet/* \ /san/sanvol1/scratch/dm2/mafNet/$s/ end echo '(((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)' \ > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst ssh pk cd /cluster/data/dm2/bed/multiz15way.2006-09-28 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = dm2 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/dm2/bed/multiz15way.2006-09-28/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs awk '{print $1}' /cluster/data/dm2/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para make jobList para time #Completed: 13 of 13 jobs #CPU time in finished jobs: 37955s 632.58m 10.54h 0.44d 0.001 y #IO & Wait Time: 315s 5.25m 0.09h 0.00d 0.000 y #Average job time: 2944s 49.06m 0.82h 0.03d #Longest finished job: 9401s 156.68m 2.61h 0.11d #Submission to last job: 9429s 157.15m 2.62h 0.11d # ANNOTATE MULTIZ15WAY MAF AND LOAD TABLES (DONE 9/28/2006 angie) ssh kolossus mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/anno cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/dm2/bed/multiz15way.2006-09-28/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! 
-e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end echo date > jobs.csh # do smaller jobs first: foreach f (`ls -1rS ../../maf/*.maf`) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /cluster/data/dm2/dm2.2bit ../maf/`basename $f` >> jobs.csh end echo date >> jobs.csh csh -efx jobs.csh >&! jobs.log & tail -f jobs.log # 4min. # Load anno/maf ssh hgwdev cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf mkdir -p /gbdb/dm2/multiz15way/anno/maf ln -s /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf/*.maf \ /gbdb/dm2/multiz15way/anno/maf date nice hgLoadMaf -pathPrefix=/gbdb/dm2/multiz15way/anno/maf dm2 multiz15way date # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf cat *.maf \ | nice hgLoadMafSummary dm2 -minSize=30000 -mergeGap=1500 -maxSize=200000 \ -test multiz15waySummary stdin #Created 112880 summary blocks from 10610488 components and 1151094 mafs from stdin ssh hgwdev cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf sed -e 's/mafSummary/multiz15waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz15waySummary.sql time nice hgLoadSqlTab dm2 multiz15waySummary /tmp/multiz15waySummary.sql \ multiz15waySummary.tab #0.000u 0.001s 0:01.59 0.0% 0+0k 0+0io 0pf+0w rm *.tab ln -s multiz15way.2006-09-28 /cluster/data/dm2/bed/multiz15way # MULTIZ15WAY DOWNLOADABLES (UNANNOTATED) (DONE 10/3/2006 angie) # Annotated MAF is now documented, so use anno/maf for downloads. ssh hgwdev mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads cd /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads # upstream mafs cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits dm2 refGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags dm2 multiz15way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' # << emacs time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log #86.335u 42.090s 6:47.44 31.5% 0+0k 0+0io 0pf+0w ssh kolossus cd /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads cat > downloads.csh << 'EOF' date foreach f (../anno/maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt date 'EOF' # << emacs time csh -efx downloads.csh >&! downloads.log #325.699u 4.199s 5:34.17 98.7% 0+0k 0+0io 1pf+0w nice gzip up*.maf nice md5sum up*.maf.gz >> md5sum.txt # Make a README.txt . ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/dm2/multiz15way mkdir $dir ln -s /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads/*.{gz,txt} $dir # MULTIZ15WAY MAF FRAMES (DONE 9/29/2006 angie) # multiz15wayFrames regenerated 11/28/06. ssh hgwdev mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/frames cd /cluster/data/dm2/bed/multiz15way.2006-09-28/frames # The following is adapted from MarkD's Makefile used for mm7... #------------------------------------------------------------------------ # get the genes for all genomes (when available) # currently we have nothing for: # droSec1 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 triCas2 # mRNAs with CDS. 
single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: droSim1 droYak2 anoGam1 apiMel2 mkdir genes foreach queryDb (droSim1 droYak2 anoGam1 apiMel2) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ ${queryDb} > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \ stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using refGene for dm2 # genePreds; (must keep only the first 10 columns for knownGene) foreach queryDb (dm2) if ($queryDb == "dm2") then set geneTbl = refGene endif hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # MarkD's 1-pass maf methodology # This section was re-run *with* dm2 genes (oops, thanks Brooke) 11/28/06. ssh kolossus cd /cluster/data/dm2/bed/multiz15way/frames nice tcsh # easy way to get process niced (cat ../anno/maf/*.maf \ | time genePredToMafFrames dm2 stdin stdout \ dm2 genes/dm2.gp.gz \ droSim1 genes/droSim1.gp.gz droYak2 genes/droYak2.gp.gz \ anoGam1 genes/anoGam1.gp.gz apiMel2 genes/apiMel2.gp.gz \ | gzip -c > multiz15way.mafFrames.gz) >& frames.log & ssh hgwdev cd /cluster/data/dm2/bed/multiz15way/frames hgLoadMafFrames dm2 multiz15wayFrames multiz15way.mafFrames.gz # PHASTCONS 15WAY WITH METHODS FROM PAPER (DONE 10/3/06 angie) # (((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2) # NOTE FROM QA (5/7/08): Overlapping data points were found in dm3 phastCons # data by user Michael Hiller: # http://www.soe.ucsc.edu/pipermail/genome/2008-April/016241.html # The overlapping data points were caused by a bug in phastCons that Adam has # since fixed. The bug is present in dm2 phastCons data, but as it is not the # most recent assembly for this organism, we are going to leave it as-is. An # example region containing the overlap is: chr3R:6,999,990-7,000,010 . 
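# (Sketch of the kind of check QA describes above -- not run as part of the
# build.  Point it at a per-chrom posterior-probability wig, e.g. a
# chr3R.pp.gz from the postprobs downloads (the filename below is
# illustrative only); it assumes the fixedStep, step=1 wiggle format
# phastCons writes and flags any block that starts inside the previous one:)
zcat chr3R.pp.gz \
    | awk '/^fixedStep/ {split($3,a,"="); s=a[2]+0; if (s<nf) print "overlapping block starts at", s; pos=s-1; next} {pos++; nf=pos+1}'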
ssh kkstore04 mkdir -p /san/sanvol1/scratch/dm2/chrom cp -p /cluster/data/dm2/?{,?}/chr*.fa /san/sanvol1/scratch/dm2/chrom/ # Split chrom fa into smaller windows for phastCons: ssh pk mkdir /cluster/data/dm2/bed/multiz15way/phastCons mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.split cd /cluster/data/dm2/bed/multiz15way/phastCons/run.split set WINDOWS = /san/sanvol1/scratch/dm2/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/san/sanvol1/scratch/dm2/chrom set WINDOWS=/san/sanvol1/scratch/dm2/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O dm2,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel2,triCas2 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm2/bed/multiz15way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para make jobList para time #Completed: 13 of 13 jobs #CPU time in finished jobs: 1075s 17.92m 0.30h 0.01d 0.000 y #IO & Wait Time: 101s 1.68m 0.03h 0.00d 0.000 y #Average job time: 90s 1.51m 0.03h 0.00d #Longest finished job: 259s 4.32m 0.07h 0.00d #Submission to last job: 270s 4.50m 0.07h 0.00d ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. # Use phyloFit to make an initial model: ssh kolossus cd /cluster/data/dm2/bed/multiz15way/phastCons /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \ --aggregate dm2,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel2,triCas2 \ -i MAF -o SS > all.ss /cluster/bin/phast/phyloFit all.ss \ --tree "(((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)" \ -i SS --out-root starting-tree cat starting-tree.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #TRAINING_LNL: -533323268.267133 #BACKGROUND: 0.285852 0.214455 0.214368 0.285325 #RATE_MAT: # -0.924927 0.204278 0.451988 0.268661 # 0.272287 -1.098668 0.225404 0.600978 # 0.602709 0.225494 -1.100568 0.272364 # 0.269158 0.451705 0.204631 -0.925493 #TREE: (((((((((dm2:0.032478,(droSim1:0.017650,droSec1:0.015740):0.017736):0.026088,(droYak2:0.058116,droEre2:0.055952):0.031922):0.084953,droAna3:0.218988):0.051563,(dp4:0.013624,droPer1:0.015374):0.210705):0.046101,droWil1:0.292357):0.019351,((droVir3:0.109131,droMoj3:0.142623):0.047595,droGri2:0.154583):0.189999):0.113599,anoGam1:0.357817):0.092848,apiMel2:0.386309):0.169225,triCas2:0.169225); # also get GC content from model -- if similar enough, no need to extract it # separately above. awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod #0.428823 # OK, use .429 for --gc below. 
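# (Cross-check, a sketch only and not part of the original run: compute a GC
#  fraction straight from the chromosome fasta used for the model.  Assumes
#  faCount's usual "#seq len A C G T N cpg" columns and its trailing "total"
#  row; excluding N's keeps gap runs from deflating the figure.)
faCount /san/sanvol1/scratch/dm2/chrom/chr{2L,3R,4,X}.fa \
  | awk '$1 == "total" {printf "GC fraction: %.4f\n", ($4+$5)/($2-$7);}'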
# Use the values of --target-coverage and --expected-lengths from the # last iteration of the 9way run, 0.398 and 23.7. # Multiply each subst rate on the TREE line by 3.5 which is roughly the # ratio of noncons to cons in # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \ > starting-tree.noncons.mod /cluster/bin/phast/consEntropy --NH 9.7834 0.393 23.8 \ starting-tree{,.noncons}.mod # Uh-oh.... out of memory. OK! Well, proceed with params from before. # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. ssh pk mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate cd /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate # FIRST ITERATION: Use ../starting-tree.mod: cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.429 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 23.8 --target-coverage 0.393 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.429 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 23.8 --target-coverage 0.393 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end para make jobList para time #Completed: 139 of 139 jobs #CPU time in finished jobs: 1771638s 29527.30m 492.12h 20.51d 0.056 y #IO & Wait Time: 1154s 19.23m 0.32h 0.01d 0.000 y #Average job time: 12754s 212.56m 3.54h 0.15d #Longest finished job: 18435s 307.25m 5.12h 0.21d #Submission to last job: 26561s 442.68m 7.38h 0.31d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kolossus cd /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: (((((((((dm2:0.017975,(droSim1:0.012862,droSec1:0.012586):0.011123):0.012087,(droYak2:0.031478,droEre2:0.032714):0.015973):0.033568,droAna3:0.107755):0.019847,(dp4:0.008814,droPer1:0.009578):0.104435):0.016369,droWil1:0.150471):0.005457,((droVir3:0.056362,droMoj3:0.072079):0.020331,droGri2:0.083731):0.092042):0.049021,anoGam1:0.218175):0.046658,apiMel2:0.187335):0.106621,triCas2:0.106621); #ave.noncons.mod:TREE: (((((((((dm2:0.059235,(droSim1:0.039799,droSec1:0.039025):0.035598):0.041349,(droYak2:0.104304,droEre2:0.106715):0.054017):0.119983,droAna3:0.377089):0.071962,(dp4:0.028079,droPer1:0.031175):0.369228):0.060569,droWil1:0.536488):0.019784,((droVir3:0.195542,droMoj3:0.254453):0.073077,droGri2:0.290773):0.336972):0.180643,anoGam1:0.783559):0.174661,apiMel2:0.670401):0.382291,triCas2:0.382291); # SECOND ITERATION: cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. 
# The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # No consEntropy check because it runs out of mem on an 8G-mem small # cluster node and has not completed in a day on 16G-mem kolossus. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. Then we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh pk mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.phast cd /cluster/data/dm2/bed/multiz15way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 23.8 --target-coverage 0.393 \ --quiet --seqname $chr --idpref $pref \ --viterbi /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para make jobList para time #Completed: 139 of 139 jobs #CPU time in finished jobs: 1388s 23.14m 0.39h 0.02d 0.000 y #IO & Wait Time: 402s 6.70m 0.11h 0.00d 0.000 y #Average job time: 13s 0.21m 0.00h 0.00d #Longest finished job: 17s 0.28m 0.00h 0.00d #Submission to last job: 170s 2.83m 0.05h 0.00d # back on kolossus: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/dm2/bed/multiz15way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. cd /cluster/data/dm2/bed/multiz15way/phastCons featureBits -enrichment dm2 flyBaseGene:cds all.bed # FIRST ITERATION: 68.9'll do! #flyBaseGene:cds 16.649%, all.bed 37.421%, both 11.471%, cover 68.90%, enrich 1.84x # Having met the CDS coverage target, load up the results.
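# (Optional sanity check, a sketch: lodToBedScore should have left the bed
#  scores in column 5 within the 0-1000 range that the browser expects.)
awk 'NR==1 {min=$5; max=$5} {if ($5<min) min=$5; if ($5>max) max=$5} END {print "score range:", min, max}' all.bed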
hgLoadBed dm2 phastConsElements15way all.bed # Create wiggle ssh pk mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.wib cd /cluster/data/dm2/bed/multiz15way/phastCons/run.wib rm -rf /san/sanvol1/scratch/dm2/phastCons/wib mkdir -p /san/sanvol1/scratch/dm2/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /san/sanvol1/scratch/dm2/phastCons/wib zcat `ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para make jobList para time #Completed: 13 of 13 jobs #CPU time in finished jobs: 71s 1.18m 0.02h 0.00d 0.000 y #IO & Wait Time: 28s 0.47m 0.01h 0.00d 0.000 y #Average job time: 8s 0.13m 0.00h 0.00d #Longest finished job: 17s 0.28m 0.00h 0.00d #Submission to last job: 101s 1.68m 0.03h 0.00d # back on kkstore04, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from san/sanvol1 cd /cluster/data/dm2/bed/multiz15way/phastCons rm -rf wib POSTPROBS rsync -av /san/sanvol1/scratch/dm2/phastCons/wib . rsync -av /san/sanvol1/scratch/dm2/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/dm2/multiz15way/wib cd /cluster/data/dm2/bed/multiz15way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz15way/wib/ hgLoadWiggle dm2 phastCons15way \ -pathPrefix=/gbdb/dm2/multiz15way/wib wib/*.wig rm wiggle.tab # and clean up san/sanvol1. rm -r /san/sanvol1/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /san/sanvol1/scratch/dm2/chrom # Offer raw scores for download since fly folks are likely to be interested: ssh kkstore04 cd /cluster/data/dm2/bed/multiz15way/phastCons/POSTPROBS mkdir ../postprobsDownload foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`) zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \ > ../postprobsDownload/$chr.pp.gz end cd ../postprobsDownload md5sum *.gz > md5sum.txt # Make a README.txt there too. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons15way cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons15way ln -s /cluster/data/dm2/bed/multiz15way/phastCons/postprobsDownload/* . # make a tree picture using phyloGif. cd /cluster/data/dm2/bed/multiz15way /usr/local/apache/cgi-bin/phyloGif -phyloGif_tree=tree-commas-orgNames.nh \ -phyloGif_width=260 -phyloGif_height=480 > dm2_15way.gif cp -p dm2_15way.gif ~/browser/images/phylo/dm2_15way.gif # Check in browser/images/phylo/dm2_15way.gif. In a clean ~/browser: cd ~/browser make alpha ######################################################################### # ORegAnno Open Regulatory Annotation Belinda Giardine April 2007 # # Update data via a reload # trackDb is at top level # parser for dump from Obi Griffith is at PSU. 
# cp data to UCSC and load hgsql dm2 truncate table oreganno; truncate table oregannoAttr; truncate table oregannoLink; #load the tables hgsql hgFixed load data local infile "oregannoAttr.dm2.txt" into table oregannoAttr; load data local infile "oregannoLink.dm2.txt" into table oregannoLink; grep "^chr" oreganno.dm2.txt | sort -k1,1 -k2,2n > oreganno.bed hgLoadBed dm2 oreganno oreganno.bed -noSort -oldTable -tab rm oreganno.bed ######################################################################### # FLYBASE 4.3 ANNOTATIONS (DONE (except for contacting flybase-help) 10/11/06 angie - UPDATED 4/6/07) ssh kkstore04 mkdir /cluster/data/dm2/bed/flybase4.3 cd /cluster/data/dm2/bed/flybase4.3 foreach c (2L 2R 3L 3R 4 X) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.3_20060303/gff/$c.gff end cat *.gff > flybase.gff3 # What data sources are represented in this file? grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c # excerpt (many other sources, including blastx:... , sim4:... and # tblastx:...; also various other types for source "FlyBase" and "."): 85778 FlyBase CDS 64452 FlyBase exon 14449 FlyBase gene 19376 FlyBase mRNA 66 FlyBase miRNA 116 FlyBase ncRNA 51 FlyBase pseudogene 96 FlyBase rRNA 46 FlyBase snRNA 63 FlyBase snoRNA 292 FlyBase tRNA 36921 . transcription_start_site 6003 FlyBase transposable_element 35734 FlyBase transposable_element_insertion_site # Notable differences since 4.2: "FlyBase" instead of "." for most of # those, and now there seems to be more than one CDS per exon while it # used to be ~1 CDS per gene. Hmmmm.... # What keywords are defined in the 9th field? Watch out for stray ;'s # from html-escaped special characters... grep -v '^#' flybase.gff3 \ | awk '{print $9;}' \ | perl -wpe 's/&\w\w\w\w?;//g; s/=[^;]+;/\n/g; s/=.*$//;' \ | sort | uniq -c 4250095 74483 Alias 31137 Dbxref 2795760 ID 1432800 Name 765 Ontology_term 515484 Parent 1046375 Synonym 17901 associated_with 15773 cyto_range 36666 derived_cyto_location 96162 evidence 15657 gbunit 12101 programversion 11837 putative_ortholog_of 11790 to_name 121126 to_species # Notable differences: synonym -> Synonym; no more synonym_2nd or # dbxref_2nd (or species); some new keywords but I think we can ignore. # Copy over 4.2 script and modify to fit the latest Immortal Unchanging # Widely Adopted Standard GFF3. cp ../flybase4.2/extractGenes.pl . extractGenes.pl flybase.gff3 # Get predicted proteins (for main annotations only) wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.3_20060303/fasta/dmel-all-translation-r4.3.fasta # save some space. nice gzip ?{,?}.gff dmel-all-transl* # Translate the fancy headers (some of which already start with CG-P's # that match the FBtr's -- but not all!) into CG-R's to match flyBaseGene: zcat dmel-all-translation-r4.3.fasta.gz \ | perl -we 'open(X, "flyBase2004Xref.tab") || die; \ while (<X>) { \ @w = split("\t"); \ $fbtr2cgr{$w[3]} = $w[0]; \ } \ while (<>) { \ if (/^>.*parent=(FBtr\d+)/) { \ $cgr = $fbtr2cgr{$1}; \ s/^>.*/>$cgr/ if ($cgr); \ } \ print; \ } \ ' \ > flybasePep.fa ssh hgwdev cd /cluster/data/dm2/bed/flybase4.3 # Protein-coding genes: ldHgGene -gtf dm2 flyBaseGene flyBase*.gtf hgPepPred dm2 generic flyBasePep flybasePep.fa # Noncoding genes: hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed # Cross-referencing info for both coding and noncoding: hgLoadSqlTab dm2 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \ flyBase2004Xref.tab # -- had to remove a couple duplicate lines in flyBase2004Xref.tab.
# CG4110-RA and CG13904-RA had multiple mappings, both with consecutive # FBtr's, so I tossed out the earlier FBtr for both. # Do this next time around, to catch joinerCheck errors before # running doHgNearBlastp with these proteins: runJoiner.csh dm2 flyBasePep # Some featureBits comparisons with refGene which should be pretty much # like version 4.0 with a few noncoding genes tossed in... but this is # a bigger difference than we expect: featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout #604659 bases of 131698467 (0.459%) in intersection # Hmmm, flyBaseGene has nothing on the last 165,000bp of chrX!: # chrX:22,058,584-22,224,390 # Here's another 176,000bp gap in chrX: # chrX:21,836,109-22,012,685 # nor the last 496,000bp of chr3L: # chr3L:23,275,448-23,771,897 # nor the first 527,000bp of chr2R: # chr2R:1-527,716 # No files for chr2h, chr3h, chr4h, chrXh, chrYh, chrU, so those are # entirely devoid of FlyBase Genes... yikes. #TODO: contact flybase-help about the lack of annotations in those ranges... # This shows only the dropped isoforms/duplicates (ignores noncoding): featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \ -bed=stdout #chr3R 15620310 15621493 chr3R.1 #chr2L 4698094 4700940 chr2L.1 #chrX 3638700 3639895 chrX.1 #474282 bases of 131698467 (0.360%) in intersection # add upstream* downloadable files cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips foreach size (1000 2000 5000) echo upstream$size nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \ | nice gzip -c > upstream$size.fa.gz end nice md5sum *.zip *.gz > md5sum.txt # Now go redo all the sections above for tables related to flyBase*. # 4/6/07: remove mitochondrial genes from flyBasePep because they # are missing from flyBase2004Xref and caused joinerCheck errors.
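# (Just a count first, a sketch -- see how many mt: entries are about to be
#  dropped; the actual delete follows.)
hgsql dm2 -N -e 'select count(*) from flyBasePep where name like "mt:%"'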
hgsql dm2 -e 'delete from flyBasePep where name like "mt:%"' ######################################################################### # AFFYMETRIX TIMECOURSE (DONE 12/21/06 angie) ssh kkstore04 mkdir /cluster/data/dm2/bed/affyDrosDev cd /cluster/data/dm2/bed/affyDrosDev # Download files from Affy: wget -r --level=1 --directory-prefix=transfrags --no-directories \ http://transcriptome.affymetrix.com/download/publication/dros_dev/transfrags foreach chr (`awk '{print $1;}' ../../chrom.sizes`) mkdir $chr pushd $chr foreach t (1 2 3 4 5 6 7 8 9 10 11 12) wget --timestamping http://transcriptome.affymetrix.com/download/publication/dros_dev/graphs/$chr/Dro_Total_AS_${t}_C01.sig.gr end popd end # Transfrags -- Add chromosome to make bed3 (start is 0-based per Sujit): foreach t (1 2 3 4 5 6 7 8 9 10 11 12) cp /dev/null affyDrosDevTransfrags$t.bed foreach chr (`awk '{print $1;}' ../../chrom.sizes`) if ($chr == "chrM") continue awk '{print "'$chr'\t" $1 "\t" $2;}' \ transfrags/$chr.Dro_Total_AS_${t}_C01 >> affyDrosDevTransfrags$t.bed end end # Transcription level -- Add wig format variableStep lines and # agglomerate into per-timepoint .wig files: foreach t (1 2 3 4 5 6 7 8 9 10 11 12) cp /dev/null affyDrosDevSignal$t.varStep.wig foreach chr (`awk '{print $1;}' ../../chrom.sizes`) if ($chr == "chrM") continue echo 'variableStep chrom='$chr >> affyDrosDevSignal$t.varStep.wig cat $chr/Dro_Total_AS_${t}_C01.sig.gr >> affyDrosDevSignal$t.varStep.wig end wigEncode affyDrosDevSignal$t.varStep.wig affyDrosDevSignal$t.wi{g,b} end #Converted affyDrosDevSignal1.varStep.wig, upper limit 1156.00, lower limit -860.00 #Converted affyDrosDevSignal2.varStep.wig, upper limit 1499.75, lower limit -580.50 #Converted affyDrosDevSignal3.varStep.wig, upper limit 1627.50, lower limit -586.00 #Converted affyDrosDevSignal4.varStep.wig, upper limit 1794.50, lower limit -718.50 #Converted affyDrosDevSignal5.varStep.wig, upper limit 2277.25, lower limit -913.00 #Converted affyDrosDevSignal6.varStep.wig, upper limit 2095.25, lower limit -864.00 #Converted affyDrosDevSignal7.varStep.wig, upper limit 1071.75, lower limit -560.50 #Converted affyDrosDevSignal8.varStep.wig, upper limit 1182.50, lower limit -1019.00 #Converted affyDrosDevSignal9.varStep.wig, upper limit 2028.75, lower limit -942.50 #Converted affyDrosDevSignal10.varStep.wig, upper limit 1212.00, lower limit -779.00 #Converted affyDrosDevSignal11.varStep.wig, upper limit 1456.75, lower limit -454.50 #Converted affyDrosDevSignal12.varStep.wig, upper limit 1203.50, lower limit -459.50 # --> trackDb setting: type wig -1019.0 2277.25 # Load tables (--> composite tracks) ssh hgwdev cd /cluster/data/dm2/bed/affyDrosDev mkdir /gbdb/dm2/affyDrosDev ln -s /cluster/data/dm2/bed/affyDrosDev/affyDrosDevSignal*.wib \ /gbdb/dm2/affyDrosDev foreach t (1 2 3 4 5 6 7 8 9 10 11 12) hgLoadBed dm2 affyDrosDevTransfrags$t affyDrosDevTransfrags$t.bed hgLoadWiggle dm2 affyDrosDevSignal$t \ -pathPrefix=/gbdb/dm2/affyDrosDev affyDrosDevSignal$t.wig end rm bed.tab wiggle.tab ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE, 2007-01-26, braney) ssh kkstore04 bash # if not using bash shell already # use blastDb database made for hg17 protein track mkdir -p /san/sanvol1/scratch/dm2/blastDb; cd /cluster/data/dm2/blastDb for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/dm2/blastDb; done mkdir /cluster/data/dm2/bed/tblastn.hg18KG cd /cluster/data/dm2/bed/tblastn.hg18KG echo /san/sanvol1/scratch/dm2/blastDb/*.nsq | xargs ls -S | sed 
"s/\.nsq//" > query.lst wc -l query.lst # 288 query.lst # we want around 250000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(250000/288) = 42.309504 mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/dm2/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm2/jkStuff/subChr.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dm2/jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # then run the Blast cluster jobs ssh pk cd /cluster/data/dm2/bed/tblastn.hg18KG para create blastSpec para try, check, push, check etc. para time # Completed: 176544 of 176544 jobs # CPU time in finished jobs: 1614468s 26907.81m 448.46h 18.69d 0.051 y # IO & Wait Time: 589357s 9822.61m 163.71h 6.82d 0.019 y # Average job time: 12s 0.21m 0.00h 0.00d # Longest finished job: 73s 1.22m 0.02h 0.00d # Submission to last job: 58680s 978.00m 16.30h 0.68d ssh kkstore03 cd /cluster/data/dm2/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS \ /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/dm2/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 613 of 613 jobs # CPU time in finished jobs: 6560s 109.33m 1.82h 0.08d 0.000 y # IO & Wait Time: 31904s 531.74m 8.86h 0.37d 0.001 y # Average job time: 63s 1.05m 0.02h 0.00d # Longest finished job: 176s 2.93m 0.05h 0.00d # Submission to last job: 1338s 22.30m 0.37h 0.02d ssh kkstore03 cd /cluster/data/dm2/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? 
do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/dm2/bed/tblastn.hg18KG/blastHg18KG.psl pslCheck /cluster/data/dm2/bed/tblastn.hg18KG/blastHg18KG.psl # this is ok. # load table ssh hgwdev cd /cluster/data/dm2/bed/tblastn.hg18KG hgLoadPsl dm2 blastHg18KG.psl # check coverage featureBits dm2 blastHg18KG # 5946352 bases of 131698467 (4.515%) in intersection featureBits dm2 refGene:cds blastHg18KG -enrichment # refGene:cds 17.097%, blastHg17KG 4.427%, both 4.246%, cover 24.83%, enrich 5.61x ssh kkstore04 rm -rf /cluster/data/dm2/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut #end human proteins ################################################################# ######################################################################### # CHROMOSOME BANDS (DONE 6/19/07 angie) ssh hgwdev cd /cluster/data/dm2/bed/flybase4.3 grep chromosome_band flybase.gff3 \ | perl -wpe 'chomp; @w = split; \ $w[3]--; $w[3] = 0 if ($w[3] < 0); \ $w[8] =~ s/^.*ID=band-(\w+);$/$1/; \ if ($w[4] <= 0) { s/^.*$//; } else { \ s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\t\n/; }' \ > cytoBand.txt # Did a bit of manual cleanup... there are some rows that have negative # starts that should clearly have negative ends, but for some reason # their end coords have been substituted with 10, 20, 30, 40 -- # I just deleted those 12 rows. hgLoadSqlTab dm2 cytoBand ~/kent/src/hg/lib/cytoBand.sql cytoBand.txt # Drop or modify the items that are off the end of their chroms: awk '{print "delete from dm2.cytoBand where chrom = \""$1"\" and chromStart >= "$2"; update dm2.cytoBand set chromEnd = "$2" where chrom = \""$1"\" and chromEnd > "$2";";}' \ /cluster/data/dm2/chrom.sizes \ | hgsql dm2 checkTableCoords -verbose=2 dm2 cytoBand # Normally we would just create cytoBandIdeo by select * from cytoBand -- # but that gives too high of a resolution for any bands to be visible! # So make a boiled-down cytoBandIdeo that just has {number, letter} # instead of {number, letter, number}. perl -we 'while (<>) { \ chomp; @w=split(" "); \ if ($w[3] =~ /(\d+[A-Z]+)\d+/) { \ $b = $1; \ } else { die "doh!" } \ if (! defined $lastB) { \ $start = $w[1]; \ } elsif ($lastB ne $b) { \ print "$chrom\t$start\t$end\t$lastB\t\n"; \ $start = $w[1]; \ } \ ($chrom, $end, $lastB) = ($w[0], $w[2], $b); \ } \ print "$chrom\t$start\t$end\t$lastB\t\n";' cytoBand.txt \ > cytoBandIdeo.txt hgLoadSqlTab dm2 cytoBandIdeo ~/kent/src/hg/lib/cytoBandIdeo.sql \ cytoBandIdeo.txt # Drop or modify the items that are off the end of their chroms: awk '{print "delete from dm2.cytoBandIdeo where chrom = \""$1"\" and chromStart >= "$2"; update dm2.cytoBandIdeo set chromEnd = "$2" where chrom = \""$1"\" and chromEnd > "$2";";}' \ /cluster/data/dm2/chrom.sizes \ | hgsql dm2 checkTableCoords -verbose=2 dm2 cytoBandIdeo ######################################################################### # EvoFold track (done, 10.02.2007, Jakob Skou Pedersen) # # EvoFold predictions were made based on the drosophila subset (12 # species) of the 15-way multiz insect alignment. The data generation # process will change in the future and is therefore not described in # detail here. Instead we start with the final set of predictions # supplied by me. 
ssh hgwdev mkdir -p /cluster/data/dm2/bed/evofold cd /cluster/data/dm2/bed/evofold cp /cluster/home/jsp/projects/flies/finalDataSets/finalPredAll.bed foldsDm2.bed # The foldsDm2.bed file is a 9-column bed file: columns 1-6 provide # standard bed information, column 7 is element length, column 8 is # the RNA secondary structure in parentheses format, and column 9 # is a comma-separated list of position-specific confidence scores # (floats). hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql dm2 evofold foldsDm2.bed ######################################################################### # BDTNP CHIP-CHIP (DONE 4/9/08 angie) # New data added 2/2/09 and 8/25/10, see next sections. # Berkeley Drosophila Transcription Network Project's ChIP-Chip data, # from Mark Biggin (MDBiggin at lbl dot gov). ssh kkstore04 mkdir /cluster/data/dm2/bed/bdtnpChipper cd /cluster/data/dm2/bed/bdtnpChipper wget http://rana.lbl.gov/~smacarth/ChIPChip/Public_Release/wiggle/all_public_080409_1FDR.wig.bz2 wget http://rana.lbl.gov/~smacarth/ChIPChip/Public_Release/wiggle/all_public_080409_25FDR.wig.bz2 # The files contain concatenated custom tracks; split those out into # one .wig file per track, with UCSC table names: bzcat all_public* | perl -we \ 'my $fh; \ while (<>) { \ if (/^track .* name="([\w-]+)"/) { \ close($fh) if ($fh); \ my $name = $1; \ $name =~ s/([a-zI]+)_(H14)?(\d+)?_\d+_675bp_smoothed_scores-sym-(\d+)/bdtnp\u$1$3Fdr$4/ \ || die "parse:\n$name\n\t"; \ open($fh, ">$name.wig") || die "Cant open >$name.wig: $!\n"; \ } else { \ die "data before track line? at input line $.:\n$_" if (! $fh); \ print $fh $_; \ } \ }' # These two warnings are for the polII filenames, no problem: #Use of uninitialized value in concatenation (.) or string at -e line 6, <> line 714856. #Use of uninitialized value in concatenation (.) or string at -e line 6, <> line 3415135.
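# (Quick check, a sketch and not part of the original run: make sure the
#  split produced no empty per-track .wig files before encoding them below.)
foreach f (bdtnp*Fdr*.wig)
  if (-z $f) echo "empty: $f"
end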
mkdir wigEncoded cd wigEncoded foreach f (../*Fdr*.wig) wigEncode $f $f:t:r.{wig,wib} end #Converted ../bdtnpBcd1Fdr1.wig, upper limit 8.14, lower limit 0.80 #Converted ../bdtnpBcd1Fdr25.wig, upper limit 8.14, lower limit 0.80 #Converted ../bdtnpBcd2Fdr1.wig, upper limit 10.74, lower limit 0.68 #Converted ../bdtnpBcd2Fdr25.wig, upper limit 10.74, lower limit 0.68 #Converted ../bdtnpCad1Fdr1.wig, upper limit 16.20, lower limit 1.06 #Converted ../bdtnpCad1Fdr25.wig, upper limit 16.20, lower limit 0.48 #Converted ../bdtnpGt2Fdr1.wig, upper limit 22.22, lower limit 1.21 #Converted ../bdtnpGt2Fdr25.wig, upper limit 22.22, lower limit 0.49 #Converted ../bdtnpHb1Fdr1.wig, upper limit 14.38, lower limit 0.92 #Converted ../bdtnpHb1Fdr25.wig, upper limit 14.38, lower limit 0.86 #Converted ../bdtnpHb2Fdr1.wig, upper limit 24.25, lower limit 0.88 #Converted ../bdtnpHb2Fdr25.wig, upper limit 24.25, lower limit 0.78 #Converted ../bdtnpKni1Fdr1.wig, upper limit 8.08, lower limit 1.78 #Converted ../bdtnpKni1Fdr25.wig, upper limit 8.08, lower limit 1.01 #Converted ../bdtnpKni2Fdr1.wig, upper limit 8.44, lower limit 1.07 #Converted ../bdtnpKni2Fdr25.wig, upper limit 8.44, lower limit 0.76 #Converted ../bdtnpKr1Fdr1.wig, upper limit 14.47, lower limit 0.68 #Converted ../bdtnpKr1Fdr25.wig, upper limit 14.47, lower limit 0.68 #Converted ../bdtnpKr2Fdr1.wig, upper limit 12.92, lower limit 0.66 #Converted ../bdtnpKr2Fdr25.wig, upper limit 12.92, lower limit 0.66 #Converted ../bdtnpPolIIFdr1.wig, upper limit 30.86, lower limit 0.93 #Converted ../bdtnpPolIIFdr25.wig, upper limit 30.86, lower limit 0.70 #Converted ../bdtnpZ2Fdr1.wig, upper limit 11.49, lower limit 0.95 #Converted ../bdtnpZ2Fdr25.wig, upper limit 11.49, lower limit 0.79 ssh hgwdev mkdir /gbdb/dm2/bdtnp ln -s /cluster/data/dm2/bed/bdtnpChipper/wigEncoded/*.wib /gbdb/dm2/bdtnp/ cd /cluster/data/dm2/bed/bdtnpChipper/wigEncoded foreach f (*.wig) hgLoadWiggle -pathPrefix=/gbdb/dm2/bdtnp dm2 $f:r $f end rm wiggle.tab # Print out template for 24 subtrack defs (longLabel will need editing): set pri = 1 foreach f (*.wig) echo " track $f:r" echo " subTrack bdtnpChipper" set fac = `echo $f:r | perl -wpe 's/^bdtnp([A-Za-z]+).*/\l$1/ || die;'` set FAC = `echo $fac | perl -wpe 's/^(\w+)$/\U$1/;'` set ab = `echo $f:r | perl -wpe 's/^bdtnp[A-Za-z]+(\d)?F.*/$1/ || die;'` set fdr = `echo $f:r | perl -wpe 's/.*Fdr(\d+)$/$1/ || die;'` echo " subGroups fdr=$fdr factor=$FAC" echo " shortLabel $fac AB $ab FDR $fdr%" echo " longLabel ($fac) antibody $ab, stage 4-5 embryos, False Discovery Rate (FDR) $fdr%" if ($fdr == 1) then echo " color 50,50,200" else echo " color 25,150,25" endif echo " priority $pri" echo "" set pri = `expr $pri + 1` end ######################################################################### # BDTNP CHIP-CHIP (DONE 2/2/09 angie) ssh hgwdev cd /hive/data/genomes/dm2/bed/bdtnpChipper wget http://rana.lbl.gov/~simi/wiggle/BDTNP_all_new_020209_1FDR.wig.bz2 wget http://rana.lbl.gov/~simi/wiggle/BDTNP_all_new_020209_25FDR.wig.bz2 # The files contain concatenated custom tracks; split those out into # one .wig file per track, with UCSC table names: bzcat BDTNP_all_new* | perl -we \ 'my $fh; \ while (<>) { \ if (/^track .* name="([\w-\.]+)"/) { \ close($fh) if ($fh); \ my $name = $1; \ $name =~ s/_FQ_/_1_/; $name =~ s/_BQ_/_2_/; \ $name =~ s/([a-zA-Z0-9]+)_(H14)?(\d+)?_\d+_(FactorResults\.)?675bp_smoothed_scores-sym-(\d+)/bdtnp\u$1$3Fdr$5/ \ || die "parse:\n$name\n\t"; \ print "$name\n"; \ open($fh, ">$name.wig") || die "Cant open >$name.wig: $!\n"; \ } 
elsif (/^track /) { \ die "missed a track line:\n$_\n\t"; \ } else { \ die "data before track line? at input line $.:\n$_" if (! $fh); \ print $fh $_; \ } \ }' cd wigEncoded foreach f (../*Fdr*.wig) if (! -e $f:t:r.wig) then wigEncode $f $f:t:r.{wig,wib} endif end #Converted ../bdtnpD1Fdr1.wig, upper limit 20.33, lower limit 0.51 #Converted ../bdtnpD1Fdr25.wig, upper limit 20.33, lower limit 0.51 #Converted ../bdtnpDa2Fdr1.wig, upper limit 26.09, lower limit 0.79 #Converted ../bdtnpDa2Fdr25.wig, upper limit 26.09, lower limit 0.49 #Converted ../bdtnpDl3Fdr1.wig, upper limit 16.97, lower limit 0.65 #Converted ../bdtnpDl3Fdr25.wig, upper limit 16.97, lower limit 0.62 #Converted ../bdtnpFtz3Fdr1.wig, upper limit 16.54, lower limit 1.07 #Converted ../bdtnpFtz3Fdr25.wig, upper limit 16.54, lower limit 0.84 #Converted ../bdtnpH1Fdr1.wig, upper limit 18.52, lower limit 0.68 #Converted ../bdtnpH1Fdr25.wig, upper limit 18.52, lower limit 0.68 #Converted ../bdtnpH2Fdr1.wig, upper limit 13.23, lower limit 0.68 #Converted ../bdtnpH2Fdr25.wig, upper limit 13.23, lower limit 0.65 #Converted ../bdtnpHkb1Fdr1.wig, upper limit 11.84, lower limit 1.03 #Converted ../bdtnpHkb1Fdr25.wig, upper limit 11.84, lower limit 0.35 #Converted ../bdtnpHkb2Fdr1.wig, upper limit 10.73, lower limit 0.87 #Converted ../bdtnpHkb2Fdr25.wig, upper limit 10.73, lower limit 0.33 #Converted ../bdtnpHkb3Fdr1.wig, upper limit 10.50, lower limit 0.90 #Converted ../bdtnpHkb3Fdr25.wig, upper limit 10.50, lower limit 0.37 #Converted ../bdtnpMad2Fdr1.wig, upper limit 12.58, lower limit 0.67 #Converted ../bdtnpMad2Fdr25.wig, upper limit 12.58, lower limit 0.67 #Converted ../bdtnpMed2Fdr1.wig, upper limit 12.58, lower limit 0.67 #Converted ../bdtnpMed2Fdr25.wig, upper limit 12.58, lower limit 0.67 #Converted ../bdtnpPrd1Fdr1.wig, upper limit 13.97, lower limit 0.78 #Converted ../bdtnpPrd1Fdr25.wig, upper limit 13.97, lower limit 0.78 #Converted ../bdtnpPrd2Fdr1.wig, upper limit 12.00, lower limit 0.84 #Converted ../bdtnpPrd2Fdr25.wig, upper limit 12.00, lower limit 0.84 #Converted ../bdtnpRun1Fdr1.wig, upper limit 10.02, lower limit 0.60 #Converted ../bdtnpRun1Fdr25.wig, upper limit 10.02, lower limit 0.60 #Converted ../bdtnpRun2Fdr1.wig, upper limit 7.58, lower limit 1.16 #Converted ../bdtnpRun2Fdr25.wig, upper limit 7.58, lower limit 0.58 #Converted ../bdtnpShn2Fdr1.wig, upper limit 12.62, lower limit 1.19 #Converted ../bdtnpShn2Fdr25.wig, upper limit 12.62, lower limit 0.81 #Converted ../bdtnpShn3Fdr1.wig, upper limit 10.28, lower limit 1.09 #Converted ../bdtnpShn3Fdr25.wig, upper limit 10.28, lower limit 0.95 #Converted ../bdtnpSlp11Fdr1.wig, upper limit 40.39, lower limit 1.06 #Converted ../bdtnpSlp11Fdr25.wig, upper limit 40.39, lower limit 0.62 #Converted ../bdtnpSna1Fdr1.wig, upper limit 8.41, lower limit 0.97 #Converted ../bdtnpSna1Fdr25.wig, upper limit 8.41, lower limit 0.77 #Converted ../bdtnpSna2Fdr1.wig, upper limit 14.30, lower limit 0.57 #Converted ../bdtnpSna2Fdr25.wig, upper limit 14.30, lower limit 0.57 #Converted ../bdtnpTFIIB1Fdr1.wig, upper limit 21.81, lower limit 0.59 #Converted ../bdtnpTFIIB1Fdr25.wig, upper limit 21.81, lower limit 0.59 #Converted ../bdtnpTll1Fdr1.wig, upper limit 16.66, lower limit 0.98 #Converted ../bdtnpTll1Fdr25.wig, upper limit 16.66, lower limit 0.57 #Converted ../bdtnpTwi1Fdr1.wig, upper limit 26.11, lower limit 0.77 #Converted ../bdtnpTwi1Fdr25.wig, upper limit 26.11, lower limit 0.68 #Converted ../bdtnpTwi2Fdr1.wig, upper limit 26.66, lower limit 0.79 #Converted ../bdtnpTwi2Fdr25.wig, 
upper limit 26.66, lower limit 0.55 foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/wigEncoded/*.wib) if (! -l /gbdb/dm2/bdtnp/$f:t) then ln -s $f /gbdb/dm2/bdtnp/ hgLoadWiggle -pathPrefix=/gbdb/dm2/bdtnp dm2 $f:t:r $f:t:r.wig endif end rm wiggle.tab # Print out template for new subtrack defs (longLabel will need editing): set pri = 25 foreach f (`ls -l *.wig | grep Feb | awk '{print $9;}'`) echo " track $f:r" echo " subTrack bdtnpChipper off" set fac = `echo $f:r | perl -wpe 's/^bdtnp([A-Za-z]+).*/\l$1/ || die;'` set FAC = `echo $fac | perl -wpe 's/^(\w+)$/\U$1/;'` set ab = `echo $f:r | perl -wpe 's/^bdtnp[A-Za-z]+(\d+)?Fdr.*/$1/ || die;'` set fdr = `echo $f:r | perl -wpe 's/.*Fdr(\d+)$/$1/ || die;'` echo " subGroups fdr=$fdr factor=$FAC" echo " shortLabel $fac AB $ab FDR $fdr%" echo " longLabel ($fac) antibody $ab, stage 4-5 embryos, False Discovery Rate (FDR) $fdr%" if ($fdr == 1) then echo " color 50,50,200" else echo " color 25,150,25" endif echo " priority $pri" echo "" set pri = `expr $pri + 1` end ######################################################################### # BDTNP CHIP-CHIP UPDATE (DONE 9/14/10 angie) mkdir /hive/data/genomes/dm2/bed/bdtnpChipper/100819 cd /hive/data/genomes/dm2/bed/bdtnpChipper/100819 # Unpacked chip-chip and DNase accessible regions files emailed by # Mark Biggins Aug. 9th, 13th, 16th and Sep. 13, 2010. # Download some additional DNase accessibility files: alias wg 'wget --timestamping --no-check-certificate --user=biggin --password=xxxx' # NOTE: these files were replaced 8/30, see below... wg https://data.htseq.org/biggin/wig/s5_rep9481.wig wg https://data.htseq.org/biggin/wig/s5_rep9482.wig wg https://data.htseq.org/biggin/wig/s9_rep9127.wig wg https://data.htseq.org/biggin/wig/s9_rep9128.wig wg https://data.htseq.org/biggin/wig/s10_rep8816.wig wg https://data.htseq.org/biggin/wig/s10_rep8820.wig wg https://data.htseq.org/biggin/wig/s11_rep9485.wig wg https://data.htseq.org/biggin/wig/s11_rep9486.wig wg https://data.htseq.org/biggin/wig/s14_rep9477.wig wg https://data.htseq.org/biggin/wig/s14_rep9478.wig # Accessibility levels: foreach f (s*_rep*.wig) set t = `echo $f | perl -wpe 's/^s(\d+)_rep(\d+).wig$/bdtnpDnaseS{$1}R$2/'` echo $f --\> $t tail -n +1 $f \ | sed -e 's/step=65/step=20/' \ | wigToBigWig -clip stdin /hive/data/genomes/dm2/chrom.sizes $t.bw end # All files have a few items off the end of some chroms: #line 2243431 of stdin: chromosome chr2h has 1694122 bases, but item ends at 1694124 #line 3432024 of stdin: chromosome chr3L has 23771897 bases, but item ends at 23771904 #line 4975059 of stdin: chromosome chr3h has 2955737 bases, but item ends at 2955744 #line 5039139 of stdin: chromosome chr4 has 1281640 bases, but item ends at 1281644 foreach f (bdtnpDnase*.bw) ln -s `pwd`/$f /gbdb/dm2/bdtnp/$f hgBbiDbLink dm2 $f:r /gbdb/dm2/bdtnp/$f end # Accessible regions (updated stage 9 9/14/10 with corrected data): foreach f (5%*.txt) mac2unix "$f" set t = `echo "$f" | perl -wpe 's/^.*stage (\d+).txt/bdtnpDnaseAccS$1/'` echo $f --\> $t tail -n +3 "$f" \ | awk '{print $1, ($2-1), $3;}' \ | hgLoadBed dm2 $t stdin end #5% FDR regions stage 10.txt --> bdtnpDnaseAccS10 #Loaded 23766 elements of size 3 #5% FDR regions stage 11.txt --> bdtnpDnaseAccS11 #Loaded 19743 elements of size 3 #5% FDR regions stage 14.txt --> bdtnpDnaseAccS14 #Loaded 23065 elements of size 3 #5% FDR regions stage 5.txt --> bdtnpDnaseAccS5 #Loaded 22694 elements of size 3 #5% FDR regions stage 9.txt --> bdtnpDnaseAccS9 #Loaded 16217 elements of size 3 # New ChIP/chip data: 
foreach f ({hb,med}*.wig) set t = `echo $f | perl -wpe 'm/^([a-z]+)_(\d+)(S\d+).*-(\d+).wig$/ || die; \ ($fac, $ab, $s, $fdr) = ($1, $2, $3, $4); \ $fac = ucfirst($fac); \ $_ = "bdtnp$fac$ab${s}Fdr$fdr"'` echo $f --\> $t wigToBigWig $f /hive/data/genomes/dm2/chrom.sizes $t.bw end foreach f (bdtnp[HM]*.bw) echo $f:t ln -s `pwd`/$f /gbdb/dm2/bdtnp/$f hgBbiDbLink dm2 $f:r /gbdb/dm2/bdtnp/$f end # 8/30/10: new DNase data (normalized; password in email 8/27): alias wg 'wget --timestamping --no-check-certificate --user=encode --password=xxxx' wg http://www.uwencode.org/~sthomas/data/biggin/wiggle.tar.gz tar xvzf wiggle.tar.gz foreach f (wiggle/p5_S*_DS*.wig) set t = `echo $f | perl -wpe 's/^wiggle\/p5_(S\d+)_DS(\d+).wig$/bdtnpDnase{$1}R$2/'` echo $f --\> $t wigToBigWig -clip $f /hive/data/genomes/dm2/chrom.sizes $t.bw end # again, all files have a few items that extend past end of chrom, NBD: #line 2243431 of wiggle/p5_S9_DS9128.wig: chromosome chr2h has 1694122 bases, but item ends at 1694124 #line 3432024 of wiggle/p5_S9_DS9128.wig: chromosome chr3L has 23771897 bases, but item ends at 23771904 #line 4975059 of wiggle/p5_S9_DS9128.wig: chromosome chr3h has 2955737 bases, but item ends at 2955744 #line 5039139 of wiggle/p5_S9_DS9128.wig: chromosome chr4 has 1281640 bases, but item ends at 1281644 # No need to re-run hgBbiDbLink -- filenames are same. # Get new max for trackDb bigWig specs: foreach f ( bdtnpDnaseS*.bw ) echo $f bigWigInfo $f | egrep '^(min|max)' echo "" end ######################################################################### ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: dm2.upstreamGeneTbl = refGene dm2.upstreamMaf = multiz15way /hive/data/genomes/dm2/bed/multiz15way/species.lst
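# (Sketch of a pre-flight check before the automated upstream-file run picks
#  up those settings -- just confirm that the species list and gene table it
#  references are really there.)
ls -l /hive/data/genomes/dm2/bed/multiz15way/species.lst
hgsql dm2 -e 'show tables like "refGene"'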