# for emacs: -*- mode: sh; -*- # Caenorhabditis elegans # Washington University School of Medicine GSC and Sanger Institute WS180 # We don't have the resources to build a complete browser on this -- # make ce4 <-> ce5 <-> ce6 liftOver files and compGeno tracks for modENCODE, # and a skeletal browser for checking the compGeno tracks. # $Id: ce5.txt,v 1.3 2009/01/26 20:27:39 angie Exp $ ######################################################################### # DOWNLOAD SEQUENCE (DONE 10/10/07 angie) ssh kkstore06 mkdir /cluster/store3/worm/ce5 ln -s /cluster/store3/worm/ce5 /cluster/data/ce5 cd /cluster/data/ce5 mkdir downloads cd downloads wget --timestamping -m -np -nd \ "ftp://ftp.sanger.ac.uk/pub/wormbase/WS180/CHROMOSOMES/" # All we really want is the CHR*.agp and CHR*.fa.gz -- clean up a # bit & fetch more selectively next time. rm chrom* *_masked.dna.gz *.gff.gz CHROMOSOME_MtDNA.CSHL.gff ######################################################################### # NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE 10/10/07 angie) ssh kkstore06 cd /cluster/data/ce5/downloads # Fix fasta names: zcat CHR*.dna.gz \ | sed -e '/^$/ d; s/^>CHROMOSOME_MtDNA/>chrM/; s/^>CHROMOSOME_/>chr/;' \ | gzip -c > UCSC.fa.gz faSize -detailed UCSC.fa.gz #chrI 15072421 #chrII 15279316 #chrIII 13783681 #chrIV 17493785 #chrM 13794 #chrV 20919568 #chrX 17718851 # Make sure we get the same sizes from this command: zcat CHR*.dna.gz | sed -e '/^$/ d;' | faSize -detailed stdin # Fix AGP names: sed -e 's/^/chr/' CHR*.agp \ > UCSC.agp # And add a fake mitochondrial AGP entry for the sake of downstream # tools (I made sure the GenBank sequence is identical to given): echo "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp ######################################################################### # CHECK AGP VS. FA, MAKE UNMASKED 2BIT (DONE 10/10/07 angie) ssh kkstore06 cd /cluster/data/ce5 cat << '_EOF_' > ce5.config.ra # Config parameters for makeGenomeDb.pl: db ce5 clade worm genomeCladePriority 10 scientificName Caenorhabditis elegans commonName C. elegans assemblyDate Aug. 2007 assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS180 orderKey 827 mitoAcc none fastaFiles /cluster/data/ce5/downloads/UCSC.fa.gz agpFiles /cluster/data/ce5/downloads/UCSC.agp # qualFiles /dev/null dbDbSpeciesDir worm '_EOF_' # << emacs mkdir jkStuff nice makeGenomeDb.pl ce5.config.ra -stop agp \ >& jkStuff/makeGenomeDb.log & tail -f jkStuff/makeGenomeDb.log # Note: file date on ce5.unmasked.2bit is 10/11 -- original was moved # aside to ce5.unmasked.lowercase.2bit, and then ce5.unmasked.2bit was # regenerated to test a slight tweak to makeGenomeDb.pl: use -noMask # with faToTwoBit when making .unmasked, in case we get lower-cased # sequence as in this case... which looks like all-masked to many progs. ######################################################################### # REPEATMASKER (DONE (through mask) 10/10/07, DONE 1/14/09 angie) ssh kkstore06 cd /cluster/data/ce5 doRepeatMasker.pl ce5 -debug cd bed/RepeatMasker.2007-10-10 doRepeatMasker.pl ce5 -stop mask >& do.log & tail -f do.log # 1/14/09: Load the table (need it for netClass) cd /hive/data/genomes/ce5/bed/RepeatMasker.2007-10-10 doRepeatMasker.pl ce5 -continue install -buildDir `pwd` \ -workhorse=kolossus -fileServer=kolossus \ >>& do.log & tail -f do.log ######################################################################### # SIMPLE REPEATS (DONE (through filter) 10/11/07, DONE 1/14/09 angie) ssh kkstore06 cd /cluster/data/ce5 doSimpleRepeat.pl ce5 -debug -verbose=3 cd bed/simpleRepeat.2007-10-11 doSimpleRepeat.pl ce5 -verbose=3 -stop filter >& do.log & tail -f do.log # Took less than 1/2 hour (small sequence, just did a single run). # Use the suggested commands at the end of do.log below... # 1/14/09: Load the table (need it for netClass) cd /hive/data/genomes/ce5/bed/simpleRepeat.2007-10-11 doSimpleRepeat.pl ce5 -verbose=3 -continue load -buildDir `pwd` \ -workhorse=kolossus -fileServer=kolossus \ >>& do.log & tail -f do.log ######################################################################### # MASK SEQUENCE WITH RM+TRF (DONE 10/11/07 angie) # Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed, # now it's time to combine the masking into the final ce5.2bit, # following the instructions at the end of doSimpleRepeat's output. ssh kolossus cd /cluster/data/ce5 twoBitMask ce5.rmsk.2bit -add bed/simpleRepeat.2007-10-11/trfMask.bed \ ce5.2bit # Ignore warning about extra BED columns. ######################################################################### # INSTALL 2BIT AND 11.OOC FOR BLAT/LIFTOVER (DONE 10/11/07 angie) # REDONE 12/15/08 --> jkStuff/11.ooc because /cluster/bluearc is no more. # Use -repMatch=100 (based on size -- for human we use 1024, and # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome # size from featureBits. So we would use 34, but that yields a very # high number of tiles to ignore, especially for a small more compact # genome. Bump that up a bit to be more conservative. ssh kkstore06 cd /cluster/data/ce5 rsync -av ce5.2bit /cluster/bluearc/ce5/ blat ce5.2bit /dev/null /dev/null -makeOoc=/cluster/bluearc/ce5/11.ooc \ -repMatch=100 #Wrote 8502 overused 11-mers to /cluster/bluearc/ce5/11.ooc ######################################################################### # CE5->CE4 LIFTOVER (DONE 10/11/07 angie) ssh kkstore06 cd /cluster/data/ce5 doSameSpeciesLiftOver.pl ce5 ce4 -debug cd bed/blat.ce4.2007-10-11 doSameSpeciesLiftOver.pl ce5 ce4 -verbose 3 >& do.log & tail -f do.log ######################################################################### # CE5->CE6 LIFTOVER (DONE 12/15/08 angie) cd /cluster/data/ce5 doSameSpeciesLiftOver.pl ce5 ce6 -debug cd bed/blat.ce6.2008-12-15 doSameSpeciesLiftOver.pl ce5 ce6 >& do.log & tail -f do.log ######################################################################### # MAKE ACTIVE=0 ENTRY IN DBDB (DONE 10/11/07 angie) ssh kkstore06 cd /cluster/data/ce5 makeGenomeDb.pl ce5.config.ra -debug -continue dbDb -stop dbDb # Edit dbDbInsert.sql so active=0 (and fix the -debug defaultPos # that refers to chr1 --> chrI), then execute as in makeGenomeDb's # #DEBUG# output: ssh -x hgwdev hgsql -h genome-testdb -A -N hgcentraltest \ < /cluster/data/ce5/dbDbInsert.sql ######################################################################### # MAKE SKELETAL BROWSER DB (DONE 1/13/09 angie) cd /hive/data/genomes/ce5 makeGenomeDb.pl ce5.config.ra -continue db \ >>& jkStuff/makeGenomeDb.log & tail -f jkStuff/makeGenomeDb.log # Followed instructions at end for editing and adding ce5 to trackDb/... # Used descriptions from ce6. ######################################################################### # BLASTZ caePb2 (DONE 1/14/09 angie) # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return screen mkdir /hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13 cd /hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13 cat << '_EOF_' > DEF # ce5 vs caePb2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans ce5 SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. PB2801 (brenneri) caePb2 SEQ2_DIR=/hive/data/genomes/caePb2/caePb2.2bit SEQ2_LEN=/hive/data/genomes/caePb2/chrom.sizes SEQ2_CTGDIR=/hive/data/staging/data/caePb2/caePb2.supercontigs.2bit SEQ2_CTGLEN=/hive/data/staging/data/caePb2/caePb2.supercontigs.sizes SEQ2_LIFT=/hive/data/staging/data/caePb2/caePb2.supercontigs.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13 TMPDIR=/scratch/tmp '_EOF_' # << emacs # Would use memk for smallClusterHub, but it can't see /hive doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \ -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \ -qRepeats=windowmaskerSdust \ >& do.log & tail -f do.log ######################################################################### # BLASTZ caePb2 (DONE 1/14/09 angie) # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return screen mkdir /hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13 cd /hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13 cat << '_EOF_' > DEF # ce5 vs caePb2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans ce5 SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. PB2801 (brenneri) caePb2 SEQ2_DIR=/hive/data/genomes/caePb2/caePb2.2bit SEQ2_LEN=/hive/data/genomes/caePb2/chrom.sizes SEQ2_CTGDIR=/hive/data/staging/data/caePb2/caePb2.supercontigs.2bit SEQ2_CTGLEN=/hive/data/staging/data/caePb2/caePb2.supercontigs.sizes SEQ2_LIFT=/hive/data/staging/data/caePb2/caePb2.supercontigs.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13 TMPDIR=/scratch/tmp '_EOF_' # << emacs # Would use memk for smallClusterHub, but it can't see /hive doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \ -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \ -qRepeats=windowmaskerSdust \ >& do.log & tail -f do.log ln -s blastzCaePb2.2009-01-13 /hive/data/genomes/ce5/bed/blastz.caePb2 ######################################################################### # BLASTZ caeRem3 (DONE 1/14/09 angie) # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return screen mkdir /hive/data/genomes/ce5/bed/blastzCaeRem3.2009-01-14 cd /hive/data/genomes/ce5/bed/blastzCaeRem3.2009-01-14 cat << '_EOF_' > DEF # ce5 vs caeRem3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans ce5 SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. remanei caeRem3 SEQ2_DIR=/hive/data/genomes/caeRem3/caeRem3.2bit SEQ2_LEN=/hive/data/genomes/caeRem3/chrom.sizes SEQ2_CTGDIR=/hive/data/staging/data/caeRem3/caeRem3.supercontigs.2bit SEQ2_CTGLEN=/hive/data/staging/data/caeRem3/caeRem3.supercontigs.sizes SEQ2_LIFT=/hive/data/staging/data/caeRem3/caeRem3.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce5/bed/blastzCaeRem3.2009-01-14 TMPDIR=/scratch/tmp '_EOF_' # << emacs # Would use memk for smallClusterHub, but it can't see /hive doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \ -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \ >& do.log & tail -f do.log ln -s blastzCaeRem3.2009-01-14 /hive/data/genomes/ce5/bed/blastz.caeRem3 ######################################################################### # BLASTZ cb3 (DONE 1/14/09 angie) # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return screen mkdir /hive/data/genomes/ce5/bed/blastzCb3.2009-01-14 cd /hive/data/genomes/ce5/bed/blastzCb3.2009-01-14 cat << '_EOF_' > DEF # ce5 vs cb3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans ce5 SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae cb3 SEQ2_DIR=/hive/data/genomes/cb3/cb3.rmskTrf.2bit SEQ2_LEN=/hive/data/genomes/cb3/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce5/bed/blastzCb3.2009-01-14 TMPDIR=/scratch/tmp '_EOF_' # << emacs # Would use memk for smallClusterHub, but it can't see /hive doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \ -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \ >& do.log & tail -f do.log ln -s blastzCb3.2009-01-14 /hive/data/genomes/ce5/bed/blastz.cb3 ######################################################################### # BLASTZ priPac1 (DONE 1/14/09 angie) # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return screen mkdir /hive/data/genomes/ce5/bed/blastzPriPac1.2009-01-14 cd /hive/data/genomes/ce5/bed/blastzPriPac1.2009-01-14 cat << '_EOF_' > DEF # ce5 vs priPac1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans ce5 SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: P. pacificus priPac1 SEQ2_DIR=/hive/data/genomes/priPac1/priPac1.TrfWM.2bit SEQ2_LEN=/hive/data/genomes/priPac1/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/priPac1/trfWM.maskedContigs/priPac1.TrfWM.supers.2bit SEQ2_CTGLEN=/hive/data/genomes/priPac1/trfWM.maskedContigs/priPac1.TrfWM.supers.sizes SEQ2_LIFT=/hive/data/genomes/priPac1/trfWM.maskedContigs/super.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce5/bed/blastzPriPac1.2009-01-14 TMPDIR=/scratch/tmp '_EOF_' # << emacs # Would use memk for smallClusterHub, but it can't see /hive doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \ -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \ >& do.log & tail -f do.log ln -s blastzPriPac1.2009-01-14 /hive/data/genomes/ce5/bed/blastz.priPac1 ######################################################################### # BLASTZ caeJap1 (DONE 1/14/09 angie) # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return screen mkdir /hive/data/genomes/ce5/bed/blastzCaeJap1.2009-01-14 cd /hive/data/genomes/ce5/bed/blastzCaeJap1.2009-01-14 cat << '_EOF_' > DEF # ce5 vs caeJap1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans ce5 SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. remanei caeJap1 SEQ2_DIR=/hive/data/genomes/caeJap1/caeJap1.2bit SEQ2_LEN=/hive/data/genomes/caeJap1/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/caeJap1/maskedContigs/caeJap1.TrfWM.supers.2bit SEQ2_CTGLEN=/hive/data/genomes/caeJap1/maskedContigs/caeJap1.TrfWM.supers.sizes SEQ2_LIFT=/hive/data/genomes/caeJap1/maskedContigs/supers.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce5/bed/blastzCaeJap1.2009-01-14 TMPDIR=/scratch/tmp '_EOF_' # << emacs # Would use memk for smallClusterHub, but it can't see /hive doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \ -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \ -qRepeats=windowmaskerSdust \ >& do.log & tail -f do.log ln -s blastzCaeJap1.2009-01-14 /hive/data/genomes/ce5/bed/blastz.caeJap1 ############################################################################ # MULTIZ6WAY (DONE 1/15/09 angie) mkdir /hive/data/genomes/ce5/bed/multiz6way cd /hive/data/genomes/ce5/bed/multiz6way # Tree copied from ce6.txt, see comments there on its derivation: cat << '_EOF_' > nematodes.nh ((((((((((briggsae_cb3:0.005,remanei_caeRem3:0.003):0.004, brenneri_caePb2:0.013):0.002,elegans_ce5:0.003):0.001, japonica_caeJap1:0.023):0.024,ps1010:0.027):0.014, df5070:0.056):0.009, plicata:0.102):0.099,sb341:0.060):0.038, (df5074:0.351,df5055:0.090):0.221):0.2,pristiochus_priPac1:0.8) '_EOF_' # << emacs # filter that for the six species in use here /cluster/bin/phast/x86_64/tree_doctor \ --prune-all-but briggsae_cb3,remanei_caeRem3,brenneri_caePb2,elegans_ce5,japonica_caeJap1,pristiochus_priPac1 \ -R elegans_ce5 \ nematodes.nh > 6way.nh #(((((briggsae_cb3:0.005000,remanei_caeRem3:0.003000):0.004000,brenneri_caePb2:0.013000):0.002000,elegans_ce5:0.003000):0.001000,japonica_caeJap1:0.023000):0.384000,pristiochus_priPac1:0.800000); # rearrange to get Ce5 on top for the construction of the tree image # in the 6-way description page. This results in something like: cat < 6way_for_phyloGif.nh (((C._elegans_ce5:0.003000, (C._brenneri_caePb2:0.013000, (C._remanei_caeRem3:0.003000,C._briggsae_cb3:0.005000):0.004000) :0.002000):0.001000, C._japonica_caeJap1:0.023000):0.384000, P._pristiochus_priPac1:0.800000); EOF /cluster/bin/phast/x86_64/all_dists 6way.nh > 6way.distances.txt grep -i ce5 6way.distances.txt | sort -k3,3n # Use this output for reference, and use the calculated # distances in the table below to order the organisms and check # the button order on the browser. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainCe5Link chain linearGap # distance on ce5 on other minScore # 1 0.0120 - remanei_caeRem3 41.593% n/a 1000 loose # 2 0.0140 - briggsae_cb3 42.154% n/a 1000 loose # 3 0.0180 - brenneri_caePb2 40.553% n/a 1000 loose # 4 0.0270 - japonica_caeJap1 26.639% n/a 1000 loose # 5 1.1880 - pristiochus_priPac1 6.113% n/a 1000 loose # Coverage is about 0.015% lower than for ce6 for each of those, but only # a few bases in the genome assembly actually changed ce5 -> ce6. # Maybe it's a lastz vs. blastz thing?? cd /hive/data/genomes/ce5/bed/multiz6way # bash shell syntax here ... export H=/hive/data/genomes/ce5/bed mkdir mafLinks for G in caeRem3 cb3 caePb2 priPac1 caeJap1 do mkdir -p mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory ${H}/blastz.${G}/mafNet" fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # these are x86_64 binaries mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # the autoMultiz cluster run ssh swarm cd /hive/data/genomes/ce5/bed/multiz6way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 6way.nh > tree-commas.nh sed 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir maf run cd run # NOTE: you need to set the db properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = ce5 set c = $1 set maf = $2 set binDir = /hive/data/genomes/$db/bed/multiz6way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /hive/data/genomes/$db/bed/multiz6way/mafLinks rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /hive/data/genomes/ce5/bed/multiz6way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs awk '{print $1}' /hive/data/genomes/ce5/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList para push -maxPush=1 para check ... push ... etc ... #Completed: 7 of 7 jobs #CPU time in finished jobs: 1693s 28.21m 0.47h 0.02d 0.000 y #IO & Wait Time: 106s 1.77m 0.03h 0.00d 0.000 y #Average job time: 257s 4.28m 0.07h 0.00d #Longest finished job: 374s 6.23m 0.10h 0.00d #Submission to last job: 395s 6.58m 0.11h 0.00d # back on non-para-hub machine: cd /hive/data/genomes/ce5/bed/multiz6way time catDir maf > multiz6way.maf #real 0m12.721s #-rw-rw-r-- 1 angie protein 342023168 Jan 15 19:23 multiz6way.maf # before getting to the annotation, load this up so we can get # a first view of the track. This will be replaced with the annotated # mafs ssh hgwdev cd /hive/data/genomes/ce5/bed/multiz6way mkdir /gbdb/ce5/multiz6way ln -s /hive/data/genomes/ce5/bed/multiz6way/multiz6way.maf /gbdb/ce5/multiz6way # this load creates a large file, do that on local disk: cd /scratch/tmp time hgLoadMaf ce5 multiz6way #Loaded 364189 mafs in 1 files from /gbdb/ce5/multiz6way #7.676u 0.420s 0:10.35 78.1% 0+0k 0+0io 0pf+0w time hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 ce5 multiz6waySummary /gbdb/ce5/multiz6way/multiz6way.maf #Created 124469 summary blocks from 1042942 components and 364189 mafs from /gbdb/ce5/multiz6way/multiz6way.maf #21.210u 2.196s 0:24.49 95.5% 0+0k 0+0io 0pf+0w # remove the temporary .tab files: rm multiz6*.tab ############################################################################ # LIFTOVER SANGER GENES FROM CE6 (DONE 1/15/09 angie) ssh hgwdev mkdir /hive/data/genomes/ce5/bed/sangerGene.liftOver cd /hive/data/genomes/ce5/bed/sangerGene.liftOver hgsql ce6 -NBe 'select * from sangerGene' \ | cut -f 1-10 \ | liftOver -genePred stdin -minBlocks=0.75 \ /hive/data/genomes/ce6/bed/liftOver/ce6ToCe5.over.chain.gz \ sangerGeneLO.gp unmapped.txt ldHgGene ce5 sangerGeneLO -predTab sangerGeneLO.gp #30296 gene predictions ############################################################################ # PHASTCONS6WAY (DONE 1/16/09 angie) # make per-chrom fasta: cd /hive/data/genomes/ce5 for C in `cut -f1 chrom.sizes | sed -e s/chr//` do echo "twoBitToFa -seq=chr${C} ce5.2bit ${C}/chr${C}.fa" twoBitToFa -seq=chr${C} ce5.2bit ${C}/chr${C}.fa done mkdir /hive/data/genomes/ce5/bed/multiz6way/phastCons cd /hive/data/genomes/ce5/bed/multiz6way/phastCons # Create SS files mkdir ss cd ss time for C in `cut -f1 /hive/data/genomes/ce5/chrom.sizes` do if [ -s /hive/data/genomes/ce5/bed/multiz6way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/$MACHTYPE/msa_split \ /hive/data/genomes/ce5/bed/multiz6way/maf/${C}.maf \ --refseq /hive/data/genomes/ce5/${chrN}/${C}.fa \ --in-format MAF --windows 4000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done #real 1m22.701s # Using the ave.cons.mod and ave.noncons.mod from # /hive/data/genomes/ce6/bed/multiz6way/cons because ce5 and ce6 # are almost identical. See ce6.txt for derivation. sed -e 's/ce6/ce5/g' \ /hive/data/genomes/ce6/bed/multiz6way/cons/ave.cons.mod > ave.cons.mod sed -e 's/ce6/ce5/g' \ /hive/data/genomes/ce6/bed/multiz6way/cons/ave.noncons.mod > ave.noncons.mod # cons: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.308500 0.191500 0.191500 0.308500 RATE_MAT: -0.875904 0.175256 0.458101 0.242547 0.282331 -1.200805 0.180928 0.737546 0.737985 0.180928 -1.200354 0.281440 0.242547 0.457829 0.174703 -0.875079 TREE: (((((cb3:0.087550,caeRem3:0.073515):0.020860,caePb2:0.090766):0.029808,ce5:0.094770):0.038388,caeJap1:0.109779):0.126548,priPac1:0.126548); # noncons: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.308500 0.191500 0.191500 0.308500 RATE_MAT: -0.875904 0.175256 0.458101 0.242547 0.282331 -1.200805 0.180928 0.737546 0.737985 0.180928 -1.200354 0.281440 0.242547 0.457829 0.174703 -0.875079 TREE: (((((cb3:0.328951,caeRem3:0.273909):0.078262,caePb2:0.339349):0.111927,ce5:0.355503):0.144928,caeJap1:0.413391):0.478831,priPac1:0.478831); ssh swarm # Create cluster dir to do main phastCons run mkdir /hive/data/genomes/ce5/bed/multiz6way/phastCons/run cd /hive/data/genomes/ce5/bed/multiz6way/phastCons/run mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-lengths 15 --target-coverage 0.55 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << emacs chmod +x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. #Completed: 29 of 29 jobs #CPU time in finished jobs: 308s 5.13m 0.09h 0.00d 0.000 y #IO & Wait Time: 129s 2.15m 0.04h 0.00d 0.000 y #Average job time: 15s 0.25m 0.00h 0.00d #Longest finished job: 25s 0.42m 0.01h 0.00d #Submission to last job: 666s 11.10m 0.18h 0.01d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /hive/data/genomes/ce5/bed/multiz6way/phastCons/run # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" # if that looks right (== in chrom and numerical order), then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > ../phastConsElements6way.bed # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: faSize /hive/data/genomes/ce5/*/chr*.fa #100281416 bases (0 N's 100281416 real 86678741 upper 13602675 lower) in 7 sequences in 7 files #Total size: mean 14325916.6 sd 6728307.7 min 13794 (chrM) max 20919568 (chrV) median 15279316 #%13.56 masked total, %13.56 masked real # The 100281416 comes from the non-n genome as counted above. cd .. awk '{sum+=$3-$2} \ END{printf "%% %.2f = 100.0*%d/100281416\n",100.0*sum/100281416,sum}' \ phastConsElements6way.bed #% 29.44 = 100.0*29519094/100281416 # --expected-length 15 --target-coverage 0.55 # Aiming for %70 coverage in the following featureBits measurement on CDS, # but as in ce6, will accept 61%: featureBits ce5 -enrichment sangerGeneLO:cds phastConsElements6way.bed #sangerGeneLO:cds 25.324%, phastConsElements6way.bed 29.436%, both 15.448%, cover 61.00%, enrich 2.07x # Load most conserved track into database ssh hgwdev cd /hive/data/genomes/ce5/bed/multiz6way/phastCons # the copy was already done above # cp -p /hive/data/genomes/ce5/bed/multiz6way/phastCons/run/phastConsElements6way.bed . time hgLoadBed ce5 phastConsElements6way phastConsElements6way.bed #Loaded 592415 elements of size 5 #2.121u 0.172s 0:13.55 16.9% 0+0k 0+0io 0pf+0w # should measure the same as above featureBits ce5 -enrichment sangerGeneLO:cds phastConsElements6way #sangerGeneLO:cds 25.324%, phastConsElements6way 29.436%, both 15.448%, cover 61.00%, enrich 2.07x # Create merged posterier probability file and wiggle track data files ssh kolossus cd /hive/data/genomes/ce5/bed/multiz6way/phastCons/run # prepare compressed copy of ascii data values for downloads cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p ../phastCons6wayScores for D in ppRaw/chr* do C=${D/ppRaw\/} out=../phastCons6wayScores/${C}.scores.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << emacs chmod +x gzipAscii.sh time nice ./gzipAscii.sh #68.633u 0.954s 1:12.04 96.5% 0+0k 0+0io 0pf+0w # use those file to encode the data for the track: cd .. zcat phastCons6wayScores/chr*.data.gz \ | wigEncode stdin phastCons6way.wig phastCons6way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 #-rw-rw-r-- 1 angie protein 56581969 Jan 16 15:00 phastCons6way.wib #-rw-rw-r-- 1 angie protein 8810835 Jan 16 15:00 phastCons6way.wig ssh hgwdev # Make link for download of score files: ln -s /hive/data/genomes/ce5/bed/multiz6way/phastCons/phastCons6wayScores \ /usr/local/apache/htdocs/goldenPath/ce5/ # Load gbdb and database with wiggle. cd /hive/data/genomes/ce5/bed/multiz6way/phastCons ln -s `pwd`/phastCons6way.wib /gbdb/ce5/wib/phastCons6way.wib hgLoadWiggle ce5 phastCons6way phastCons6way.wig # Create histogram to see relative and cumulative frequency of scores: hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce5 phastCons6way >& histogram.data cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Worm Ce5 Histogram phastCons6way track" set xlabel " phastCons6way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << emacs display histo.png & ############################################################################ # PHASTCONS6WAY PARAM RE-ESTIMATION EXPERIMENT (DONE 1/23/09 angie) # I wanted to see whether following consEntropy's advice for expected-lengths, # and increasing target-coverage to get closer to 68.8%, would make for good # results, but it didn't. Including this here just for the record -- original # phastCons6way from above is not changed. # Start with ce6 model and params, but use consEntropy's results # to tune the parameters: mkdir /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned sed -e 's/ce6/ce5/g' \ /hive/data/genomes/ce6/bed/multiz6way/cons/ave.cons.mod > ce6.ave.cons.mod sed -e 's/ce6/ce5/g' \ /hive/data/genomes/ce6/bed/multiz6way/cons/ave.noncons.mod > ce6.ave.noncons.mod /cluster/bin/phast/${MACHTYPE}/consEntropy --NH 9.78 0.55 15 ave.{,non}cons.mod #( Solving for new omega: 15.000000 76.253180 42.469155 35.389619 34.810297 34.805819 ) #Transition parameters: gamma=0.550000, omega=15.000000, mu=0.066667, nu=0.081481 #Relative entropy: H=1.172307 bits/site #Expected min. length: L_min=6.108564 sites #Expected max. length: L_max=5.516260 sites #Phylogenetic information threshold: PIT=L_min*H=7.161112 bits #Recommended expected length: omega=34.805819 sites (for L_min*H=9.780000) # Adam commented in dm1.txt that entropy would decrease with a large # increase in coverage. We need to boost the coverage from 61% to # closer to the target of 68.8% -- not large but not tiny. Also, # the computed PT is 7.16, already lower than the target of 9.78 # (those targets help to make the results more comparable across # species groups). So try increasing both target-coverage and # expected-length from ce6's: /cluster/bin/phast/${MACHTYPE}/consEntropy --NH 9.78 0.75 40 ave.{,non}cons.mod #( Solving for new omega: 40.000000 67.304149 61.371424 61.145983 ) #Transition parameters: gamma=0.750000, omega=40.000000, mu=0.025000, nu=0.075000 #Relative entropy: H=1.172307 bits/site #Expected min. length: L_min=7.137874 sites #Expected max. length: L_max=7.010810 sites #Phylogenetic information threshold: PIT=L_min*H=8.367780 bits #Recommended expected length: omega=61.145983 sites (for L_min*H=9.780000) # Well, with the increased coverage, it is recommending a *very* long # expected-length... but the model figures into this, too. Use those # params to re-estimate the model. # FIRST ITERATION: use ce6's ave.*.mod and the params tweaked according # to consEntropy's output above. ssh swarm mkdir /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.estimate1 cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.estimate1 cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef /cluster/bin/phast/phastCons $1 ../ave.noncons.mod --gc 0.383 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 40 --target-coverage 0.75 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (../../phastCons/ss/chr*/*.ss) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end para create jobList para push -maxPush=1, check, shove etc. para time #Completed: 29 of 29 jobs #CPU time in finished jobs: 2728s 45.47m 0.76h 0.03d 0.000 y #IO & Wait Time: 159s 2.64m 0.04h 0.00d 0.000 y #Average job time: 100s 1.66m 0.03h 0.00d #Longest finished job: 155s 2.58m 0.04h 0.00d #Submission to last job: 190s 3.17m 0.05h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kolossus cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.estimate1 ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: (((((cb3:0.089855,caeRem3:0.089789):0.023180,caePb2:0.095488):0.030832,ce5:0.100172):0.042051,caeJap1:0.123745):0.132125,priPac1:0.132125); #ave.noncons.mod:TREE: (((((cb3:0.310133,caeRem3:0.305886):0.079966,caePb2:0.328879):0.106520,ce5:0.345476):0.145426,caeJap1:0.426104):0.455636,priPac1:0.455636); /cluster/bin/phast/${MACHTYPE}/consEntropy --NH 9.78 0.75 40 ave.{,non}cons.mod #( Solving for new omega: 40.000000 68.550455 62.213959 61.964188 ) #Transition parameters: gamma=0.750000, omega=40.000000, mu=0.025000, nu=0.075000 #Relative entropy: H=1.053667 bits/site #Expected min. length: L_min=7.887543 sites #Expected max. length: L_max=7.970539 sites #Phylogenetic information threshold: PIT=L_min*H=8.310843 bits #Recommended expected length: omega=61.964188 sites (for L_min*H=9.780000) # Hmmm... recommended expected-length went up a tad and PIT decreased # a smidge, so if anything those measures seemed to worsen with # re-estimation. Well, let's see what happens to coverage if we run with # the new model and 0.75 40 anyway. cp ave.{,non}cons.mod .. ssh swarm mkdir /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.scores cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.scores mkdir ppRaw bed cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../../phastCons/ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-lengths 40 --target-coverage 0.75 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << emacs chmod +x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << emacs # Create parasol batch and run it ls -1 ../../phastCons/ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try, check, shove, time #Completed: 29 of 29 jobs #CPU time in finished jobs: 298s 4.96m 0.08h 0.00d 0.000 y #IO & Wait Time: 90s 1.50m 0.03h 0.00d 0.000 y #Average job time: 13s 0.22m 0.00h 0.00d #Longest finished job: 19s 0.32m 0.01h 0.00d #Submission to last job: 115s 1.92m 0.03h 0.00d # combine predictions and transform scores to be in 0-1000 interval cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.scores # The sed's and the sort get the file names in chrom,start order find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > ../phastConsElements6way.bed # As Hiram found when trying even 0.60 as target-cov, there are many # warnings about negative scores. cd .. # Aiming for %70 coverage in the following featureBits measurement on CDS, # but as in ce6, will accept 61%: featureBits ce5 -enrichment sangerGeneLO:cds phastConsElements6way.bed #sangerGeneLO:cds 25.324%, phastConsElements6way.bed 34.253%, both 16.522%, cover 65.24%, enrich 1.90x # OK, 65.2% is closer to 68.8% than 61%. # What is the actual distribution of conserved element sizes like? awk '{print $3 - $2;}' phastConsElements6way.bed | sort -n > sizes.txt # Average length: awk '{TOTAL += $1; COUNT++; } END {printf "%d / %d = %.2f\n", TOTAL, COUNT, TOTAL/COUNT; }' \ sizes.txt #34349436 / 423298 = 81.15 # Median: tail -n +`expr 423298 / 2` sizes.txt | head -2 #51 #51 textHistogram sizes.txt -binSize=10 #Maximum value 3811.000000 # slowly decaying exponential peaking in the 10-19 bin. # Repeated those measurements in ../phastCons -- avg. 49.8, median 31, # bigger peak at 10-19 and faster decay in the histogram. # So expected-lengths definitely has an effect there... actual lengths # are bigger than expected, and really don't want to be as low as 15. # hg18 17way avg=66.3, median=38. dm3 15way avg=43.05, median=26. # ce2 phastConsElementsPaper (undoc'd): avg=269, median=175 -- but that # might have been with only a 2-way alignment w/briggsae. # Load up wiggle and elements to visually compare with original run: cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.scores cat << '_EOF_' > catScores.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p ../phastCons6wayScoresT1 for D in ppRaw/chr* do C=${D/ppRaw\/} out=../phastCons6wayScoresT1/${C}.scores.gz find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat done '_EOF_' # << emacs chmod +x catScores.sh ./catScores.sh \ | wigEncode stdin phastCons6wayT1.{wig,wib} mv phastCons6wayT1.{wig,wib} .. #Converted stdin, upper limit 1.00, lower limit 0.00 ssh hgwdev # Load gbdb and database with wiggle. cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned ln -s `pwd`/phastCons6wayT1.wib /gbdb/ce5/wib/ hgLoadWiggle ce5 phastCons6wayT1 phastCons6wayT1.wig hgLoadBed ce5 phastConsElements6wayT1 phastConsElements6way.bed # They look overinflated -- I see elements extending into gaps, # amplified score-humps over gaps (don't know why those are there at all). # Bottom line -- stick with the ce6 model and params from the original # run above! ############################################################################ # ANNOTATE MULTIZ6WAY MAF AND RELOAD TABLE (DONE 1/16/09 angie) ssh swarm mkdir /hive/data/genomes/ce5/bed/multiz6way/anno cd /hive/data/genomes/ce5/bed/multiz6way/anno # these actually already all exist from previous multiple alignments # if there are done you will see an ls of them # or else the twoBitInfo command will be echoed, run it if you want for DB in `cat ../species.lst` do CDIR="/hive/data/genomes/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done mkdir maf run cd run rm -f sizes nBeds for DB in `sed -e "s/ce5 //" ../../species.lst` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set dir = /hive/data/genomes/ce5/bed/multiz6way set c = $1 cat $dir/maf/${c}.maf | nice \ mafAddIRows -nBeds=nBeds stdin /hive/data/genomes/ce5/ce5.2bit $2 '_EOF_' # << emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs cut -f1 /hive/data/genomes/ce5/chrom.sizes > chrom.list gensub2 chrom.list single template jobList #Completed: 7 of 7 jobs #CPU time in finished jobs: 31s 0.52m 0.01h 0.00d 0.000 y #IO & Wait Time: 35s 0.58m 0.01h 0.00d 0.000 y #Average job time: 9s 0.16m 0.00h 0.00d #Longest finished job: 12s 0.20m 0.00h 0.00d #Submission to last job: 19s 0.32m 0.01h 0.00d # Load anno/maf ssh hgwdev cd /hive/data/genomes/ce5/bed/multiz6way/anno/maf mkdir -p /gbdb/ce5/multiz6way/anno/maf ln -s /hive/data/genomes/ce5/bed/multiz6way/anno/maf/*.maf \ /gbdb/ce5/multiz6way/anno/maf time /cluster/bin/x86_64/hgLoadMaf \ -pathPrefix=/gbdb/ce5/multiz6way/anno/maf ce5 multiz6way #Loaded 418471 mafs in 7 files from /gbdb/ce5/multiz6way/anno/maf #7.204u 0.610s 0:25.32 30.8% 0+0k 0+0io 4pf+0w # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /hive/data/genomes/ce5/bed/multiz6way/anno/maf cat *.maf \ | hgLoadMafSummary ce5 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz6waySummary stdin #Created 45472 summary blocks from 1042942 components and 418471 mafs from stdin #-rw-rw-r-- 1 angie protein 2097539 Jan 16 15:39 multiz6waySummary.tab ssh hgwdev cd /hive/data/genomes/ce5/bed/multiz6way/anno/maf hgLoadSqlTab ce5 multiz6waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz6waySummary.tab # Create per-chrom individual maf files for downloads ssh kolossus cd /hive/data/genomes/ce5/bed/multiz6way mkdir mafDownloads time for M in anno/maf/chr*.maf do B=`basename $M` nice cp -p ${M} mafDownloads/${B} nice gzip mafDownloads/${B} echo ${B} done done ssh hgwdev cd /hive/data/genomes/ce5/bed/multiz6way # There isn't any refGene table, using sangerGeneLO instead #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits -verbose=2 ce5 \ sangerGeneLO:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+//' | sort -k1,1 -k2,2n \ | mafFrags ce5 multiz6way stdin stdout -orgs=species.lst \ | gzip -c > mafDownloads/sangerGeneLO.upstream${S}.maf.gz echo "done sangerGeneLO.upstream${S}.maf.gz" done cd mafDownloads md5sum *.gz > md5sum.txt # deliver to downloads mkdir /usr/local/apache/htdocs/goldenPath/ce5/multiz6way ln -s /hive/data/genomes/ce5/bed/multiz6way/mafDownloads/* \ /usr/local/apache/htdocs/goldenPath/ce5/multiz6way/ ####################################################################### # MULTIZ6WAY MAF FRAMES (DONE 1/16/09 angie) ssh hgwdev mkdir /hive/data/genomes/ce5/bed/multiz6way/frames cd /hive/data/genomes/ce5/bed/multiz6way/frames mkdir genes #------------------------------------------------------------------------ # get the genes for all genomes # using sangerGeneLO for ce5 # using blastCe6SG for cb3, caePb2, caeRem3, caeJap1 priPac1 # (Note the Ce6, not Ce5! But the genes are the same as in sangerGeneLO) for qDB in ce5 cb3 caePb2 caeRem3 caeJap1 priPac1 do if [ $qDB = "cb3" -o $qDB = "caePb2" -o $qDB = "caeRem3" -o $qDB = "caeJap1" -o $qDB = "priPac1" ]; then geneTbl=blastCe6SG echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ > /scratch/tmp/$qDB.tmp.psl mrnaToGene -allCds /scratch/tmp/$qDB.tmp.psl stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz rm -f /scratch/tmp/$qDB.tmp.psl mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz elif [ $qDB = "ce5" ]; then geneTbl=sangerGeneLO echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz fi done ssh kolossus cd /hive/data/genomes/ce5/bed/multiz6way/frames cat ../maf/*.maf \ | genePredToMafFrames ce5 stdin stdout \ ce5 genes/ce5.gp.gz cb3 genes/cb3.gp.gz \ caePb2 genes/caePb2.gp.gz caeRem3 genes/caeRem3.gp.gz \ caeJap1 genes/caeJap1.gp.gz priPac1 genes/priPac1.gp.gz \ | gzip > multiz6way.mafFrames.gz ssh hgwdev cd /hive/data/genomes/ce5/bed/multiz6way/frames hgLoadMafFrames ce5 multiz6wayFrames multiz6way.mafFrames.gz ######################################################################### #########################################################################