# for emacs: -*- mode: sh; -*- # Saccharomyces cerevisiae # Saccharomyces Genome Database, Stanford ####################################################################### # Download data (DONE - 2011-08-24 - Hiram) mkdir -p /hive/data/genomes/sacCer3/genbank cd /hive/data/genomes/sacCer3/genbank wget --timestamping -r --cut-dirs=7 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/fungi/Saccharomyces_cerevisiae/SacCer_Apr2011/*" mkdir ucsc ls assembled_chromosomes/AGP | sed -e "s#.comp.agp.gz##" | while read F do name=`zcat assembled_chromosomes/AGP/${F}.comp.agp.gz | grep -v "^#" \ | awk -F'\t' '{print $1}'` echo $name zcat assembled_chromosomes/AGP/${F}.comp.agp.gz \ | sed -e "s/^${name}/${F}/" > ucsc/${F}.agp done ls assembled_chromosomes/AGP | sed -e "s#.comp.agp.gz##" | while read F do echo ">${F}" > ucsc/${F}.fa zcat assembled_chromosomes/FASTA/${F}.fa.gz | grep -v "^>" >> ucsc/${F}.fa gzip ucsc/${F}.fa ls -ld ucsc/${F}.fa.gz done faSize ucsc/*.fa.gz # 12071326 bases (0 N's 12071326 real 12071326 upper 0 lower) in 16 # sequences in 16 files faSize assembled_chromosomes/FASTA/*.fa.gz # 12071326 bases (0 N's 12071326 real 12071326 upper 0 lower) in 16 # sequences in 16 files ####################################################################### # Initial genome build (DONE - 2011-08-24 - Hiram) cd /hive/data/genomes/sacCer3 cat << '_EOF_' > sacCer3.config.ra # Config parameters for makeGenomeDb.pl: db sacCer3 scientificName Saccharomyces cerevisiae commonName S. cerevisiae assemblyDate Apr. 
2011 assemblyShortLabel SacCer_Apr2011 assemblyLabel Saccharomyces cerevisiae S288c assembly from Saccharomyces Genome Database (GCA_000146055.2) orderKey 998 mitoAcc NC_001224 # override the check for max size of 25,000 mitoSize 90000 fastaFiles /hive/data/genomes/sacCer3/genbank/ucsc/*.fa.gz agpFiles /hive/data/genomes/sacCer3/genbank/ucsc/*.agp # qualFiles /dev/null dbDbSpeciesDir sacCer taxId 559292 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp sacCer3.config.ra > agp.log 2>&1 # real 0m14.802s tail -3 agp.log # if ( All AGP and FASTA entries agree - both files are valid != All AGP and FASTA entries agree - both files are valid ) then # *** All done! (through the 'agp' step) time makeGenomeDb.pl -continue=db sacCer3.config.ra > db.log 2>&1 # real 0m14.088s # no need for repeat masking ln -s sacCer3.unmasked.2bit sacCer3.2bit ln -s `pwd`/sacCer3.2bit /gbdb/sacCer3/sacCer3.2bit ####################################################################### # sacCer3 - S. cerevisiae - Ensembl Genes version 63 (DONE - 2011-08-24 - hiram) ssh swarm # there is now no 2-micron sequence, remove it from the Ensembl GTF file cd /hive/data/genomes/sacCer3 cat << '_EOF_' > sacCer3.ensGene.ra # required db variable db sacCer3 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. 
nameTranslation "s/^VIII/chrVIII/; s/^VII/chrVII/; s/^VI/chrVI/; s/^V/chrV/; s/^XIII/chrXIII/; s/^XII/chrXII/; s/^XIV/chrXIV/; s/^XI/chrXI/; s/^XVI/chrXVI/; s/^XV/chrXV/; s/^X/chrX/; s/^III/chrIII/; s/^IV/chrIV/; s/^II/chrII/; s/^IX/chrIX/; s/^I/chrI/; s/^MT/chrM/; s/2-micron/2micron/" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=63 sacCer3.ensGene.ra ssh hgwdev cd /hive/data/genomes/sacCer3/bed/ensGene.63 featureBits sacCer3 ensGene # 8912929 bases of 12157105 (73.315%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2011-08-24 - Hiram) # repMatch = 1024 * sizeof(sacCer3)/sizeof(hg18) # 4.32 = 1024 * (12162995/2881515245) # use 10 to be very conservative ssh hgwdev cd /hive/data/genomes/sacCer3 blat sacCer3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/sacCer3.11.ooc -repMatch=10 # Wrote 3137 overused 11-mers to 11.ooc mkdir /hive/data/staging/data/sacCer3 # copy this to scratch data cp -p jkStuff/sacCer3.11.ooc chrom.sizes sacCer3.2bit \ /hive/data/staging/data/sacCer3 ######################################################################### # CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE - 2011-08-29 - Hiram) mkdir /hive/data/genomes/sacCer3/bed/sgdAnnotations cd /hive/data/genomes/sacCer3/bed/sgdAnnotations wget --timestamping \ http://downloads.yeastgenome.org/chromosomal_feature/saccharomyces_cerevisiae.gff # trim the delivered S.cerevisiae.gff file to get rid of the FASTA section # and fixup the chrM and 2-micron chrom names: awk ' BEGIN { keepGoing = 1 } { if (match($1, "^##FASTA")) { keepGoing = 0; } if (keepGoing) print; } ' saccharomyces_cerevisiae.gff \ | sed -e "/^2-micron/d; s/^chrmt/chrM/" > S.cerevisiae.gff # wrote this perl script to specifically parse this particular instance # of the sgd gff file. 
# constructs the files: leftOvers.gff, notes.txt, otherFeatures.bed, # and descriptions.txt $HOME/kent/src/hg/makeDb/outside/yeast/hgSgdGff3/sgdGffToGtf.pl \ S.cerevisiae.gff > sacCer3.sgdGene.gtf 2> sacCer3.gtf.stderr cat sacCer3.sgdGene.gtf | ./chrName.sh | sed -e "s/_CDS//" \ > sacCer3.sgdGene.roman.gtf # NOTE: if you reload this table, a command below: # hgProtIdToGenePred sacCer3 sgdGene sgdToSwissProt name value # needs to be run to add a protein column to the sgdGene table ldHgGene -gtf sacCer3 sgdGene sacCer3.sgdGene.roman.gtf # Read 7063 transcripts in 7450 lines in 1 files # 7063 groups 17 seqs 1 sources 3 feature types # 6692 gene predictions # for table cleaning below, get the names for a reference: hgsql -N -e "select name from sgdGene;" sacCer3 | sort -u > name.sgdGene.txt # some of the otherFeatures have start == end, use option: cat otherFeatures.bed | ./chrName.sh > otherFeatures.roman.bed hgLoadBed -allowStartEqualEnd sacCer3 sgdOther otherFeatures.roman.bed \ -tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql # Loaded 2476 elements of size 7 # XXX errors here # chromStart == chromEnd (31228) (zero-length item) # Use -allowStartEqualEnd if that is legit (e.g. for insertion point). 
# process the peptides into a format that hgSgdPep can handle # actually, a few more lines here could perform the hgSgdPep function cat << '_EOF_' > pepPred.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "zcat ../../yeastgenome.org/orf_trans_all.fasta.gz|") or die "can not zcat ../../yeastgenome.org/orf_trans_all.fasta.gz"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; my @a = split('\s+', $line); my $name = $a[0]; $name =~ s/^>//; my $orf = $a[1]; printf ">ORFP:%s\t%s\n", $name, $orf; } else { printf "%s", $line; } } close (FH); '_EOF_' # << happy emacs chmod +x pepPred.pl ./pepPred.pl > sgdPep.fsa hgSgdPep sgdPep.fsa sgdPep.fa stdout | sort > symbol.raw.txt # the ^I here needs to be two keystrokes: ctrl-v I join -t "^I" name.sgdGene.txt symbol.raw.txt > symbol.txt wc -l name.sgdGene.txt symbol.raw.txt symbol.txt # 6692 name.sgdGene.txt # 6717 symbol.raw.txt # 6692 symbol.txt faToTab -type=protein sgdPep.fa stdout | sort > sgdPep.tab # the ^I here needs to be two keystrokes: ctrl-v I join -t"^I" name.sgdGene.txt sgdPep.tab > sgdPep.toLoad.tab wc -l name.sgdGene.txt sgdPep.tab sgdPep.toLoad.tab # 6692 name.sgdGene.txt # 6717 sgdPep.tab # 6692 sgdPep.toLoad.tab hgPepPred sacCer3 tab sgdPep sgdPep.toLoad.tab hgsql sacCer3 -e 'create table sgdToName ( \ name varchar(10) not null, \ value varchar(10) not null, \ PRIMARY KEY(name), \ INDEX (value));' hgsql sacCer3 -e 'load data local infile "symbol.txt" \ into table sgdToName;' # the ^I here needs to be two keystrokes: ctrl-v I join -t"^I" name.sgdGene.txt descriptions.txt > descriptions.toLoad.txt wc -l name.sgdGene.txt descriptions.txt descriptions.toLoad.txt # 6692 name.sgdGene.txt # 6712 descriptions.txt # 6691 descriptions.toLoad.txt hgsql sacCer3 < $HOME/kent/src/hg/lib/sgdDescription.sql hgsql sacCer3 -e 'load data local infile "descriptions.toLoad.txt" \ into table sgdDescription;' # check out the longest name: cut -f1 notes.txt | sort -u | awk '{print length($0),$0}' | sort -nr | head # 
11 YPRWdelta21 # it isn't larger than 16, no adjustment needed to sgdOtherDescription.sql # filter out items that are not in sgdOther hgsql -N -e "select name from sgdOther;" sacCer3 \ | sort -u > sacCer3.sgdOther.txt # the ^I here needs to be two keystrokes: ctrl-v I join -t"^I" sacCer3.sgdOther.txt notes.txt > notes.toLoad.txt cut -f1 notes.toLoad.txt notes.txt | sort | uniq -c \ | awk '$1 < 2' > notes.notLoaded.txt hgsql sacCer3 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql hgsql sacCer3 -e 'load data local infile "notes.toLoad.txt" \ into table sgdOtherDescription;' ############################################################################ # ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE - 2011-08-29 - Hiram) cd /hive/data/genomes/sacCer3/bed/sgdAnnotations wget --timestamping \ http://downloads.yeastgenome.org/chromosomal_feature/dbxref.tab grep "UniProt" dbxref.tab \ | awk '{printf("%s\t%s\n", $5, $1);}' > sgdToSwissProt.txt hgsql sacCer3 -e 'create table sgdToSwissProt ( \ name varchar(10) not null, \ value varchar(10) not null, \ PRIMARY KEY(name), \ INDEX (value));' hgsql sacCer3 -e 'load data local infile "sgdToSwissProt.txt" \ into table sgdToSwissProt;' hgProtIdToGenePred sacCer3 sgdGene sgdToSwissProt name value # verify all names are valid cut -f1 sgdToSwissProt.txt | sort -u > name.sgdToSwissProt.txt wc -l name.sgdToSwissProt.txt name.sgdGene.txt # 5897 name.sgdToSwissProt.txt # 6692 name.sgdGene.txt comm -12 name.sgdToSwissProt.txt name.sgdGene.txt | wc -l # 5879 # remove the 18 from sgdToSwissProt that are not in sgdGene: comm -23 name.sgdToSwissProt.txt name.sgdGene.txt | wc -l # 18 comm -23 name.sgdToSwissProt.txt name.sgdGene.txt | while read N do hgsql -e "delete from sgdToSwissProt where name=\"${N}\";" sacCer3 done # interesting to note that sgdTOSwissProt has one accession not # in sgdGene.name: KHS1 (BK 2009-07-14) ############################################################################ # AUTO UPDATE GENBANK RUN (Done - 
2011-08-24,29 - Hiram) # align with latest genbank process. cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add sacCer3 just before sacCer1 # sacCer3 S. cerevisiae sacCer3.serverGenome = /hive/data/genomes/sacCer3/sacCer3.2bit sacCer3.clusterGenome = /scratch/data/sacCer3/sacCer3.2bit sacCer3.ooc = no sacCer3.maxIntron = 5000 sacCer3.lift = no sacCer3.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} sacCer3.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} sacCer3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} sacCer3.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} sacCer3.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} #sacCer3.perChromTables = no # doesn't work in the browser sacCer3.genbank.mrna.xeno.load = no sacCer3.refseq.mrna.native.load = no sacCer3.downloadDir = sacCer3 git commit -m "adding sacCer3" genbank.conf git pull git push # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial sacCer3 & # logFile: var/build/logs/2011.08.24-16:16:56.sacCer3.initalign.log # the system was slow due to hive loads: # 26h 17m == 1577m # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sacCer3 # logFile: var/dbload/hgwdev/logs/2011.08.29-14:16:10.dbload.log # real 10m7.094s # enable daily alignment and update of hgwdev (DONE - 2011-08-29 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull cvsup # add sacCer3 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added sacCer3 - S. 
cerevisiae" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # HUMAN (hg18) PROTEINS TRACK (DONE 2011-08-31 braney ) # bash if not using bash shell already cd /cluster/data/sacCer3 mkdir /cluster/data/sacCer3/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst sacCer3.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 8 pieces of 8 written rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst sacCer3.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 14 mkdir -p /cluster/data/sacCer3/bed/tblastn.hg18KG cd /cluster/data/sacCer3/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 14 query.lst # we want around 1500 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(150000/`wc query.lst | awk '{print $1}'`\) # 36727/(1500/14) = 300.427853 mkdir -p kgfa split -l 300 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst wc kg.lst # 123 123 1599 kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/sacCer3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/sacCer3/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome exit ssh swarm cd /cluster/data/sacCer3/bed/tblastn.hg18KG gensub2 query.lst kg.lst blastGsub blastSpec para create blastSpec # para try, check, push, check etc. para time # Completed: 1722 of 1722 jobs # CPU time in finished jobs: 66699s 1111.64m 18.53h 0.77d 0.002 y # IO & Wait Time: 9697s 161.62m 2.69h 0.11d 0.000 y # Average job time: 44s 0.74m 0.01h 0.00d # Longest finished job: 157s 2.62m 0.04h 0.00d # Submission to last job: 160s 2.67m 0.04h 0.00d ssh swarm cd /cluster/data/sacCer3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? 
> chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 123 of 123 jobs # CPU time in finished jobs: 5s 0.09m 0.00h 0.00d 0.000 y # IO & Wait Time: 354s 5.90m 0.10h 0.00d 0.000 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 5s 0.08m 0.00h 0.00d # Submission to last job: 8s 0.13m 0.00h 0.00d cd /cluster/data/sacCer3/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 5956 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/sacCer3/bed/tblastn.hg18KG hgLoadPsl sacCer3 blastHg18KG.psl # check coverage featureBits sacCer3 blastHg18KG # 1364662 bases of 12157105 (11.225%) in intersection featureBits sacCer3 blastHg18KG ensGene -enrichment # blastHg18KG 11.225%, ensGene 73.315%, both 11.155%, cover 99.38%, enrich # 1.36x rm -rf blastOut #end tblastn ################ # CREATE SGD-BASED CLONE TRACK (DONE - 2011-08-31 - Hiram) mkdir /hive/data/genomes/sacCer3/bed/sgdClone cd /hive/data/genomes/sacCer3/bed/sgdClone # since sacCer1, these coordinates have become 0-relative ? 
wget --timestamping \ ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/clone.tab awk -F '\t' '{printf("%d\t%d\t%d\t%s\t%s\n", $3, $4, $5, $2, $1);}' \ clone.tab | sed -e \ "s/^8/chrVIII/; s/^7/chrVII/; s/^6/chrVI/; s/^5/chrV/; s/^13/chrXIII/; s/^12/chrXII/; s/^14/chrXIV/; s/^11/chrXI/; s/^16/chrXVI/; s/^15/chrXV/; s/^10/chrX/; s/^3/chrIII/; s/^4/chrIV/; s/^2/chrII/; s/^9/chrIX/; s/^1/chrI/;" | sort -k1,1 -k2,2n \ > sgdClone.bed hgLoadBed sacCer3 sgdClone sgdClone.bed -tab \ -sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql ############################################################################ ## simpleRepeats (DONE - 2011-08-31 - Hiram) mkdir /hive/data/genomes/sacCer3/bed/simpleRepeat cd /hive/data/genomes/sacCer3/bed/simpleRepeat doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \ -workhorse=hgwdev sacCer3 > do.log 2>&1 # about 1 minute 20 seconds featureBits sacCer3 simpleRepeat # 110704 bases of 12157105 (0.911%) in intersection ######################################################################### ## Regulatory Code (DONE - 2011-08-31 - Hiram) # liftOver from sacCer1 mkdir /hive/data/genomes/sacCer3/bed/transRegCode cd /hive/data/genomes/sacCer3/bed/transRegCode # lifting sacCer1 data to this assembly hgsql -N -e "select chrom,chromStart,chromEnd,name,score,chipEvidence,consSpecies from transRegCode" sacCer1 \ | liftOver -bedPlus=5 -tab stdin \ /usr/local/apache/htdocs-hgdownload/goldenPath/sacCer1/liftOver/sacCer1ToSacCer3.over.chain.gz \ transRegCode.lifted.bed transRegCode.unMapped.bed hgsql -N -e "select chrom,chromStart,chromEnd,name,tfCount,tfList,bindVals from transRegCodeProbe;" sacCer1 \ | liftOver -bedPlus=4 -tab stdin \ /usr/local/apache/htdocs-hgdownload/goldenPath/sacCer1/liftOver/sacCer1ToSacCer3.over.chain.gz \ transRegCodeProbe.lifted.bed transRegCodeProbe.unMapped.bed hgLoadBed sacCer3 transRegCode transRegCode.lifted.bed \ -sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql # Loaded 206558 elements of size 7 
hgLoadBed sacCer3 transRegCodeProbe transRegCodeProbe.lifted.bed \ -sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab # Loaded 6178 elements of size 7 hgsql sacCer3 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql hgsql sacCer3 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql hgsql sacCer3 < $HOME/kent/src/hg/lib/growthCondition.sql hgsql -N -e "select * from transRegCodeCondition;" sacCer1 \ | hgsql sacCer3 -e \ 'load data local infile "/dev/stdin" into table transRegCodeCondition' hgsql -N -e "select * from transRegCodeMotif;" sacCer1 \ | hgsql sacCer3 -e \ 'load data local infile "/dev/stdin" into table transRegCodeMotif' hgsql -N -e "select * from growthCondition;" sacCer1 \ | hgsql sacCer3 -e \ 'load data local infile "/dev/stdin" into table growthCondition' # fixed some typos and other tweaks (added hyphens), # including making some subscripts in growthCondition table. # (also had to change transRegCodeCondition.growthCondition field to match) # propagated to all 3 sacCer* dbs - 2011-09-08 b0b and steve # eliminate some names from Attr that are not in primary table # as identified by joinerCheck: # Error: 15 of 36501 elements of sacCer3.oregannoAttr.id are not in key oreganno.id line 2871 of all.joiner # Example miss: OREG0029365 sacCer3.oregannoLink.id - hits 21886 of 21895 # Error: 9 of 21895 elements of sacCer3.oregannoLink.id are not in key oreganno.id line 2872 of all.joiner # Example miss: OREG0029365 hgsql -N -e "select id from oregannoAttr;" sacCer3 \ | sort -u > oregannoAttr.id.txt hgsql -N -e "select id from oreganno;" sacCer3 \ | sort -u > oreganno.id.txt comm -13 oreganno.id.txt oregannoAttr.id.txt # OREG0029365 # OREG0029396 # OREG0030456 hgsql -e 'delete from oregannoAttr where id="OREG0029365";' sacCer3 hgsql -e 'delete from oregannoAttr where id="OREG0029396";' sacCer3 hgsql -e 'delete from oregannoAttr where id="OREG0030456";' sacCer3 hgsql -N -e "select id from oregannoLink;" sacCer3 \ | sort -u > oregannoLink.id.txt comm -13 
oreganno.id.txt oregannoLink.id.txt # OREG0029365 # OREG0029396 # OREG0030456 hgsql -e 'delete from oregannoLink where id="OREG0029365";' sacCer3 hgsql -e 'delete from oregannoLink where id="OREG0029396";' sacCer3 hgsql -e 'delete from oregannoLink where id="OREG0030456";' sacCer3 # ######################################################################### # Oreganno track - (DONE - 2011-08-31 - Hiram) # liftOver from sacCer1 database hgsql -N -e \ "select chrom,chromStart,chromEnd,id,strand,name from oreganno;" sacCer1 \ | liftOver -bedPlus=4 -tab stdin \ /usr/local/apache/htdocs-hgdownload/goldenPath/sacCer1/liftOver/sacCer1ToSacCer3.over.chain.gz \ oreganno.lifted.bed oreganno.unMapped.bed hgsql sacCer3 < $HOME/kent/src/hg/lib/oreganno.sql hgLoadBed -oldTable sacCer3 oreganno oreganno.lifted.bed -tab # Loaded 7299 elements of size 6 # and load non-positional tracks from sacCer1: hgsql -N -e "select * from oregannoAttr;" sacCer1 \ | hgLoadSqlTab -oldTable sacCer3 oregannoAttr \ ~/humPhen/kent/src/hg/lib/oreganno.sql stdin hgsql -N -e "select * from oregannoLink;" sacCer1 \ | hgLoadSqlTab -oldTable sacCer3 oregannoLink \ ~/humPhen/kent/src/hg/lib/oreganno.sql stdin ######################################################################### # Regulatory Module - (DONE - 2011-09-06 - Hiram) mkdir /hive/data/genomes/sacCer3/bed/regModule cd /hive/data/genomes/sacCer3/bed/regModule # liftOver data from sacCer1 hgsql -N -e \ "select chrom,chromStart,chromEnd,name,score,strand from esRegUpstreamRegion;" \ sacCer1 | liftOver -bedPlus=6 -tab stdin \ /usr/local/apache/htdocs-hgdownload/goldenPath/sacCer1/liftOver/sacCer1ToSacCer3.over.chain.gz \ esRegUpstreamRegion.lifted.bed esRegUpstreamRegion.unMapped.bed hgsql -N -e \ "select chrom,chromStart,chromEnd,name,score,strand,gene from esRegGeneToMotif;" \ sacCer1 | liftOver -bedPlus=6 -tab stdin \ /usr/local/apache/htdocs-hgdownload/goldenPath/sacCer1/liftOver/sacCer1ToSacCer3.over.chain.gz \ esRegGeneToMotif.lifted.bed 
esRegGeneToMotif.unMapped.bed # I do not see instructions in sacCer1 to create these tables, # so, dump their schemas: hgsqldump --all -c -d --tab=. sacCer1 esRegUpstreamRegion esRegGeneToMotif # and data for these other two: chmod 777 . hgsqldump --all -c --tab=. sacCer1 esRegGeneToModule esRegMotif chmod 775 . # XXX there is some kind of problem in these generated SQL create # statement files. This straight use of the file works OK: hgsql sacCer3 < esRegGeneToMotif.sql # But in hgLoadBed, it will not work, some kind of odd syntax error # so, edit them down to just the create statement and run them: hgLoadBed sacCer3 esRegGeneToMotif -sqlTable=t_esRegGeneToMotif.sql -tab \ esRegGeneToMotif.lifted.bed # Loaded 3993 elements of size 7 hgLoadBed sacCer3 esRegUpstreamRegion -sqlTable=t_esRegUpstreamRegion.sql \ -tab esRegUpstreamRegion.lifted.bed # Loaded 1670 elements of size 6 hgsql sacCer3 < esRegMotif.sql hgsql sacCer3 -e \ 'load data local infile "esRegMotif.txt" into table esRegMotif;' hgsql sacCer3 < esRegGeneToModule.sql hgsql sacCer3 -e \ 'load data local infile "esRegGeneToModule.txt" into table esRegGeneToModule;' ######################################################################### # Multiple alignment (WORKING - 2011-09-01 - Hiram) mkdir /hive/data/genomes/sacCer3/bed/strains cd /hive/data/genomes/sacCer3/bed/strains # found a number of yeast genomes, but they are not convenient to # download. This is a hybrid wget/manual procedure # bash business here YEASTTOP="http://downloads.yeastgenome.org/sequence/strains" wget --continue --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=inf --no-parent \ --no-host-directories --cut-dirs=1 ${YEASTTOP}/ grep dir.png strains/index.html | sed -e 's/.*a href="//; s#/".*##' > dir.list for D in `cat dir.list` do wget --continue --timestamping --force-directories --directory-prefix=. 
\ --dont-remove-listing --recursive --level=inf --no-parent \ --no-host-directories --cut-dirs=2 ${YEASTTOP}/${D}/ done for D in `cat dir.list` do grep dir.png ${D}/index.html | sed -e 's/.*a href="//; s#/".*##' \ > ${D}/dir.list for SD in `cat ${D}/dir.list` do wget --continue --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=inf --no-parent \ --no-host-directories --cut-dirs=2 ${YEASTTOP}/${D}/${SD}/ egrep "text.gif|text.png" ${D}/${SD}/index.html \ | sed -e 's/.*a href="//; s#".*##' \ > ${D}/file.list for F in `cat ${D}/file.list` do wget ${YEASTTOP}/${D}/${SD}/${F} -O ./${D}/${SD}/${F} done done done # extract the actual fasta out of those downloads mkdir -p fasta # the EC1118 and W303 do not follow the same pattern as the others for D in `cat dir.list | egrep -v "EC1118|W303"` do for SD in `cat ${D}/dir.list` do for F in `cat ${D}/file.list | egrep -v "README|gff|_cds|_pep"` do # echo ./${D}/${SD}/${F} echo -n "${D}: " head -1 ./${D}/${SD}/${F} | sed -e 's#>gi|[0-9]*|gb|#>#; s#| S.*##' rm -f fasta/${D}.fa cat ./${D}/${SD}/${F} | sed -e 's#>gi|[0-9]*|gb|#>#; s#| S.*##' \ > fasta/${D}.fa rm -f fasta/${D}.fa.gz gzip fasta/${D}.fa done done done # run EC1118 and W303 separately for D in `cat dir.list | egrep "EC1118|W303"` do for SD in `cat ${D}/dir.list` do for F in `cat ${D}/file.list | egrep -v "README|gff|_cds|_pep"` do # echo ./${D}/${SD}/${F} echo -n "${D}: " head -1 ./${D}/${SD}/${F} \ | sed -e 's#>gi|[0-9]*|emb|#>#; s#| S.*##; s# .*##' rm -f fasta/${D}.fa cat ./${D}/${SD}/${F} \ | sed -e 's#>gi|[0-9]*|emb|#>#; s#| S.*##; s# .*##' \ > fasta/${D}.fa rm -f fasta/${D}.fa.gz gzip fasta/${D}.fa done done done # convert to 2bit files mkdir -p 2bit for F in fasta/*.fa.gz do B=`basename ${F} | sed -e "s/.fa.gz//"` echo $F $B zcat "${F}" | faToTwoBit -noMask stdin 2bit/${B}.2bit done # also use the sequences from previous multiple alignment: for S in sacBay sacCas sacKlu sacKud sacMik sacPar do rm -f fasta/${S}.fa 
cat ../../../sacCer1/bed/otherYeast/align/${S} > fasta/${S}.fa rm -f fasta/${S}.fa.gz gzip fasta/${S}.fa zcat fasta/${S}.fa.gz | faToTwoBit -noMask stdin 2bit/${S}.2bit echo $S done # construct sizes files mkdir -p sizes for F in 2bit/*.2bit do B=`basename ${F} | sed -e "s/.2bit//"` echo $F $B twoBitInfo $F stdout | sort -k2,2nr > sizes/${B}.size done # construct sacCer3 chromosome fasta mkdir -p sacCer3 twoBitInfo ../../sacCer3.2bit stdout | cut -f1 | while read C do twoBitToFa ../../sacCer3.2bit:${C} sacCer3/${C}.fa echo "${C} done" done # run the lastz alignments mkdir align ssh swarm cd /hive/data/genomes/sacCer3/bed/strains/align cat << '_EOF_' > template #LOOP lastz.csh $(path1) $(path2) $(root1).$(root2) {check out exists+ psl/$(root1).$(root2).psl.gz} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > lastz.csh #!/bin/csh -ef /cluster/bin/penn/$MACHTYPE/lastz $1 $2 > lav/$3.lav /cluster/bin/$MACHTYPE/lavToPsl lav/$3.lav stdout | gzip > psl/$3.psl.gz '_EOF_' # << happy emacs chmod +x lastz.csh ls ../fasta/* > other.list ls ../sacCer3/* > sacCer.list gensub2 sacCer.list other.list template jobList mkdir axtAll axtBest mafIn mafRawOut mafOUt lav psl para create jobList para try ... check ... push ... 
etc # Completed: 544 of 544 jobs # CPU time in finished jobs: 57645s 960.74m 16.01h 0.67d 0.002 y # IO & Wait Time: 5203s 86.72m 1.45h 0.06d 0.000 y # Average job time: 116s 1.93m 0.03h 0.00d # Longest finished job: 328s 5.47m 0.09h 0.00d # Submission to last job: 468s 7.80m 0.13h 0.01d mkdir -p axtChain/run cd axtChain/run ls ../../../fasta | sed -e "s/.fa//" > species.list ls ../../../sacCer3 | sed -e "s/.fa//" > chrom.list cat << '_EOF_' > chain.csh #!/bin/csh -ef set S = $1 set C = $2 set IN = ../../psl/${C}.${S}.psl.gz set OUT = ${S}/${C}.chain mkdir -p ${S} zcat ${IN} \ | axtChain -psl -verbose=0 -minScore=1000 -linearGap=medium stdin \ /hive/data/genomes/sacCer3/sacCer3.2bit \ -faQ ../../../fasta/${S}.fa stdout \ | chainAntiRepeat /hive/data/genomes/sacCer3/sacCer3.2bit \ ../../../2bit/${S}.2bit stdin ${OUT} '_EOF_' # << happy emacs chmod +x chain.csh cat << '_EOF_' > template #LOOP chain.csh $(path1) $(path2) {check out line+ $(root1)/$(root2).chain} #ENDLOOP '_EOF_' # << happy emacs gensub2 species.list chrom.list template jobList para create jobList para try ... check ... push ... 
etc # Completed: 544 of 544 jobs # CPU time in finished jobs: 639s 10.65m 0.18h 0.01d 0.000 y # IO & Wait Time: 2789s 46.48m 0.77h 0.03d 0.000 y # Average job time: 6s 0.11m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 64s 1.07m 0.02h 0.00d # merge the individual results cd /hive/data/genomes/sacCer3/bed/strains/align/axtChain foreach S (`cat run/species.list`) echo $S find ./run/${S} -name "*.chain" \ | chainMergeSort -inputList=stdin | gzip -c > sacCer3.${S}.all.chain.gz end # split them again to get consistent chain ID numbering foreach S (`cat run/species.list`) echo $S chainSplit chain.${S} sacCer3.${S}.all.chain.gz end # netting the chains cat << '_EOF_' > netChains.csh #!/bin/csh -ef set S0 = "../../../../chrom.sizes" foreach S (`cat run/species.list`) echo $S chainPreNet sacCer3.${S}.all.chain.gz ${S0} ../sizes/${S} stdout \ | chainNet stdin -minSpace=1 ${S0} ../sizes/${S} stdout /dev/null \ | netSyntenic stdin noClass.${S}.net end '_EOF_' # << happy emacs chmod +x netChains.csh time ./netChains.csh # real 0m14.122s # construct MAF files cat << '_EOF_' > mkMafs.csh #!/bin/csh -ef set TOP = `pwd` set S0 = "/hive/data/genomes/sacCer3/chrom.sizes" foreach S (`cat run/species.list`) cd ${TOP} # Make axtNet for download: one .axt per sacCer3 seq. netSplit noClass.${S}.net net.${S} cd .. rm -fr axtNet.${S} mkdir axtNet.${S} foreach f (axtChain/net.${S}/*.net) netToAxt $f axtChain/chain.${S}/$f:t:r.chain \ /hive/data/genomes/sacCer3/sacCer3.2bit ../2bit/${S}.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet.${S}/$f:t:r.sacCer3.${S}.net.axt.gz end # Make mafNet for multiz: one .maf per sacCer3 seq. rm -fr mafNet.${S} mkdir mafNet.${S} foreach f (axtNet.${S}/*.sacCer3.${S}.net.axt.gz) axtToMaf -tPrefix=sacCer3. -qPrefix=${S}. 
$f \ ${S0} ../sizes/${S} stdout \ | gzip -c > mafNet.${S}/$f:t:r:r:r:r:r.maf.gz end end '_EOF_' # << happy emacs chmod +x mkMafs.csh time ./mkMafs.csh # real 7m16.117s cd /hive/data/genomes/sacCer3/bed/strains/align # convert maf files to PSL: mkdir -p mafPsl for S in `cat axtChain/run/species.list` do echo mafToPsl sacCer3 ${S} mafNet.${S}/'chr*.maf.gz' mafPsl/${S}.psl rm -f mafPsl/${S}.psl zcat mafNet.${S}/chr*.maf.gz \ | mafToPsl sacCer3 ${S} stdin mafPsl/${S}.psl done # load those PSL files, it isn't clear why these alignments # need to be swapped, but they do need that cat axtChain/run/species.list | grep -v RM11-1a | while read S do pslSwap mafPsl/${S}.psl stdout \ | hgLoadPsl sacCer3 -table=pslMafNet${S} stdin done pslSwap mafPsl/RM11-1a.psl stdout \ | hgLoadPsl sacCer3 -table=pslMafNetRM111a stdin cat axtChain/run/species.list | while read S do pslSwap mafPsl/${S}.psl stdout \ | pslCheck -db=sacCer3 stdin done # note that all are OK, errors can be corrected with: pslRecalcMatch # see also: hg19Patch5.txt hgsql -N -e "show tables;" sacCer3 | grep pslMafNet | while read T do echo -n "# ${T}: " | sed -e "s/pslMafNet//" featureBits sacCer3 "${T}" done # AWRI1631: 11651200 bases of 12157105 (95.839%) in intersection # AWRI796: 11999010 bases of 12157105 (98.700%) in intersection # CBS7960: 11687063 bases of 12157105 (96.134%) in intersection # CLIB215: 11432987 bases of 12157105 (94.044%) in intersection # CLIB324: 10840222 bases of 12157105 (89.168%) in intersection # CLIB382: 9370716 bases of 12157105 (77.080%) in intersection # EC1118: 11981729 bases of 12157105 (98.557%) in intersection # FL100: 11086015 bases of 12157105 (91.190%) in intersection # FostersB: 11874757 bases of 12157105 (97.678%) in intersection # FostersO: 11924459 bases of 12157105 (98.086%) in intersection # JAY291: 12031398 bases of 12157105 (98.966%) in intersection # LalvinQA23: 12000205 bases of 12157105 (98.709%) in intersection # M22: 10511165 bases of 12157105 (86.461%) in 
intersection # PW5: 11935295 bases of 12157105 (98.175%) in intersection # RM111a: 11962069 bases of 12157105 (98.396%) in intersection # Sigma1278b: 12005373 bases of 12157105 (98.752%) in intersection # T7: 12081072 bases of 12157105 (99.375%) in intersection # T73: 11606894 bases of 12157105 (95.474%) in intersection # UC5: 11949157 bases of 12157105 (98.289%) in intersection # VL3: 12015619 bases of 12157105 (98.836%) in intersection # Vin13: 12005359 bases of 12157105 (98.752%) in intersection # W303: 12042586 bases of 12157105 (99.058%) in intersection # Y10: 10156801 bases of 12157105 (83.546%) in intersection # YJM269: 11719109 bases of 12157105 (96.397%) in intersection # YJM789: 11957276 bases of 12157105 (98.356%) in intersection # YPS163: 10812681 bases of 12157105 (88.941%) in intersection # sacBay: 10762610 bases of 12157105 (88.529%) in intersection # sacCas: 7202451 bases of 12157105 (59.245%) in intersection # sacKlu: 6404405 bases of 12157105 (52.680%) in intersection # sacKud: 10830333 bases of 12157105 (89.086%) in intersection # sacMik: 11254494 bases of 12157105 (92.575%) in intersection # sacPar: 11712000 bases of 12157105 (96.339%) in intersection # Add the sequences to sacCer3 for alignment display cd /hive/data/genomes/sacCer3/bed/strains/fasta mkdir /gbdb/sacCer3/strains ln -s `pwd`/*.fa /gbdb/sacCer3/strains/ rm /gbdb/sacCer3/strains/RM11-1a.fa ln -s `pwd`/RM11-1a.fa /gbdb/sacCer3/strains/RM111a.fa cat ../align/axtChain/run/species.list | grep -v RM11-1a | while read S do hgLoadSeq -drop -seqTbl=seq${S} -extFileTbl=ext${S} sacCer3 \ /gbdb/sacCer3/strains/${S}.fa done hgLoadSeq -drop -seqTbl=seqRM111a -extFileTbl=extRM111a sacCer3 \ /gbdb/sacCer3/strains/RM111a.fa ######################################################################### ## 7-Way Multiz (DONE - 2011-09-01 - Hiram) mkdir /hive/data/genomes/sacCer3/bed/multiz7way cd /hive/data/genomes/sacCer3/bed/multiz7way # using the phylogenetic tree as mentioned at: # 
http://www.genetics.wustl.edu/saccharomycesgenomes/yeast_phylogeny.html # and from Jim's note: genomes/sacCer1/bed/otherYeast/README # ((((((sacCer3 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu) echo "((((((sacCer3 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)" \ > tree.nh echo sacCer3 sacPar sacMik sacKud sacBay sacCas sacKlu > species.list for S in sacPar sacMik sacKud sacBay sacCas sacKlu do mkdir -p mafLinks/${S} cd mafLinks/${S} ln -s ../../../strains/align/mafNet.${S}/*.maf.gz . cd ../.. done find -L ./mafLinks -type f | xargs -L 1 basename \ | sed -e "s/.maf.gz//" | sort -u > maf.list mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = sacCer3 set c = $1 set result = $2 set run = `pwd` set tmp = $run/tmp/$db/multiz.$c set pairs = /hive/data/genomes/sacCer3/bed/multiz7way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../tree.nh ../species.list $tmp pushd $tmp foreach s (`sed -e "s/ $db//" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out else if (-e $in) then ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db /bin/rmdir --ignore-fail-on-non-empty $run/tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../maf.list single template jobList para create jobList para try ... check ... push ... etc... 
# Completed: 17 of 17 jobs # CPU time in finished jobs: 1146s 19.10m 0.32h 0.01d 0.000 y # IO & Wait Time: 173s 2.89m 0.05h 0.00d 0.000 y # Average job time: 78s 1.29m 0.02h 0.00d # Longest finished job: 163s 2.72m 0.05h 0.00d # Submission to last job: 223s 3.72m 0.06h 0.00d # load MAF tables ssh hgwdev mkdir -p /gbdb/sacCer3/multiz7way/maf cd /hive/data/genomes/sacCer3/bed/multiz7way/maf ln -s `pwd`/*.maf /gbdb/sacCer3/multiz7way/maf # allow the temporary multiz7way.tab file to be created in tmp cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/sacCer3/multiz7way/maf sacCer3 multiz7way # real 0m3.824s # Loaded 49795 mafs in 17 files from /gbdb/sacCer3/multiz7way/maf # load summary table time nice -n +19 cat /gbdb/sacCer3/multiz7way/maf/*.maf \ | hgLoadMafSummary sacCer3 -mergeGap=1500 multiz7waySummary stdin # real 0m1.693s # Created 3048 summary blocks from 92163 components and 49795 mafs ######################################################################### ## phastCons conservation (DONE - 2009-02-11 - Hiram) # Create SS files for each chromosome: mkdir /hive/data/genomes/sacCer3/bed/multiz7way/phastCons cd /hive/data/genomes/sacCer3/bed/multiz7way/phastCons mkdir SS # split up alignments; no need to use cluster here export PHASTBIN=/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin mkdir -p SS for C in `(cd ../maf; ls *.maf | sed -e "s/.maf//")` do echo $C ${PHASTBIN}/msa_view \ ../maf/${C}.maf -M ../../strains/sacCer3/${C}.fa -i MAF \ -O sacCer3,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \ -o SS > SS/${C}.ss done # produce rough starting model ${PHASTBIN}/msa_view \ -i SS --aggregate sacCer3,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \ -o SS -z SS/*.ss > all.ss ${PHASTBIN}/phyloFit \ -i SS all.ss \ --tree "((((((sacCer3,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" \ -o starting-tree mv starting-tree.mod starting-tree.0.mod # (cheat the branch lengths up a bit, since this represents an # average of conserved and nonconserved sites; we want 
to # initialize for nonconserved) ${PHASTBIN}/tree_doctor --scale 2 starting-tree.0.mod > starting-tree.mod # also get average GC content of aligned regions ${PHASTBIN}/msa_view -i SS all.ss --summary-only descrip. A C G T G+C length all_gaps some_gaps all.ss 0.3056 0.1950 0.1948 0.3047 0.3898 1021817 0 2026636 # save some I/O and space gzip SS/*.ss # now set up cluster job to estimate model parameters. See # other make{hg18|mm9}.doc for add'l details ssh swarm cd /hive/data/genomes/sacCer3/bed/multiz7way/phastCons cat << '_EOF_' > doEstimate.sh #!/bin/sh zcat SS/$1.ss.gz | \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phastCons \ - starting-tree.mod --gc 0.3898 --nrates 1,1 --no-post-probs \ --ignore-missing --expected-lengths 75 --target-coverage 0.5 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << happy emacs chmod +x doEstimate.sh rm -fr LOG TREES mkdir LOG TREES cat << '_EOF_' > template #LOOP doEstimate.sh $(root1) {check out line+ LOG/$(root1).log} TREES/$(root1) #ENDLOOP '_EOF_' # << happy emacs (cd SS; ls *.ss.gz) | sed -e "s/.ss.gz//" > chr.list gensub2 chr.list single template jobList para create jobList para push para time # Completed: 17 of 17 jobs # CPU time in finished jobs: 1558s 25.97m 0.43h 0.02d 0.000 y # IO & Wait Time: 172s 2.87m 0.05h 0.00d 0.000 y # Average job time: 102s 1.70m 0.03h 0.00d # Longest finished job: 184s 3.07m 0.05h 0.00d # Submission to last job: 195s 3.25m 0.05h 0.00d # Now combine parameter estimates. # XXX - these phyloBoot processes did not complete after several weeks # of run time. 
# Combine the per-chromosome parameter estimates into averaged models.
# (Per the XXX note above, these phyloBoot runs never completed, which is
# why doPhastCons.csh below still uses starting-tree.mod with a fixed --rho.)
ls TREES/*.cons.mod > cons.txt
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloBoot \
    --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
ls TREES/*.noncons.mod > noncons.txt
time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloBoot \
    --read-mods '*noncons.txt' --output-average ave.noncons.mod \
	> noncons_summary.txt

# Now set up cluster job for computing conservation scores and
# predicted elements
cat << '_EOF_' > doPhastCons.csh
#!/bin/csh -fe
/bin/mkdir -p POSTPROBS ELEMENTS
set chr = $1
set out = $2
set tmpFile = /scratch/tmp/phastCons.$chr
/bin/rm -f $tmpFile
# FIX: a "|" was missing between zcat and phastCons; phastCons reads the
# alignment from stdin (the "-" argument), so the pipe is required -- compare
# the working doEstimate.sh above.
/bin/zcat SS/$chr.ss.gz \
| /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phastCons - \
    starting-tree.mod --rho 0.3 --expected-length 45 --target-coverage 0.3 \
    --quiet --seqname $chr --idpref $chr \
    --most-conserved ELEMENTS/$chr.bed --score > $tmpFile
/bin/gzip -c $tmpFile > POSTPROBS/$chr.pp.gz
/bin/rm $tmpFile
'_EOF_'
# << happy emacs
chmod +x doPhastCons.csh
cat << '_EOF_' > run.template
#LOOP
doPhastCons.csh $(root1) {check out exists+ POSTPROBS/$(root1).pp.gz} {check out line+ ELEMENTS/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 chr.list single run.template run.jobList
rm -fr POSTPROBS ELEMENTS
para resetCounts
para freeBatch
para create run.jobList
para push
para time
# Completed: 17 of 17 jobs
# CPU time in finished jobs: 47s 0.79m 0.01h 0.00d 0.000 y
# IO & Wait Time: 55s 0.91m 0.02h 0.00d 0.000 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest finished job: 9s 0.15m 0.00h 0.00d
# Submission to last job: 231s 3.85m 0.06h 0.00d

# set up phastConsElements track
cat ELEMENTS/*.bed | sort -k1,1 -k2,2n | \
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
	/cluster/bin/scripts/lodToBedScore /dev/stdin \
	    > phastConsElements7way.bed
hgLoadBed sacCer3 phastConsElements7way phastConsElements7way.bed
# Loaded 52725 elements of size 5
featureBits sacCer3 phastConsElements7way
# 7358572 bases of 12157105 (60.529%) in intersection
# previously this was:
featureBits sacCer2 phastConsElements7way
# 7762587 bases of 12162995 (63.821%) in intersection
featureBits sacCer1 phastConsElements
# 7517116 bases of 12156302 (61.837%) in intersection

mkdir ../downloads
mkdir ../downloads/phastCons7way
for C in `cat chr.list`
do
    cp -p POSTPROBS/${C}.pp.gz \
	../downloads/phastCons7way/sacCer3.${C}.wigFixed.gz
done
# check integrity of data with wigToBigWig
time (zcat ../downloads/phastCons7way/*.wigFixed.gz \
    | wigToBigWig -verbose=2 stdin /hive/data/genomes/sacCer3/chrom.sizes \
	phastCons7way.bw) > bigWig.log 2>&1 &
tail bigWig.log
# pid=21422: VmPeak: 176612 kB
#	real 0m8.972s
bigWigInfo phastCons7way.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 22,672,094
# primaryIndexSize: 398,036
# zoomLevels: 9
# chromCount: 17
# basesCovered: 12,078,291
# mean: 0.601440
# min: 0.000000
# max: 1.000000
# std: 0.429826

# encode those files into wiggle data
zcat ../downloads/phastCons7way/*.wigFixed.gz \
    | wigEncode stdin phastCons7way.wig phastCons7way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
wigTableStats.sh sacCer3 phastCons7way
# db.table min max mean count sumData stdDev viewLimits
# sacCer3.phastCons7way 0 1 0.60144 12078291 7.26437e+06 0.429826 viewLimits=0:1
# previously:
wigTableStats.sh sacCer2 phastCons7way
# db.table min max mean count sumData stdDev viewLimits
# sacCer2.phastCons7way 0 1 0.631032 12082524 7.62446e+06 0.425509 viewLimits=0:1

# Load gbdb and database with wiggle.
cd /hive/data/genomes/sacCer3/bed/multiz7way/phastCons ln -s `pwd`/phastCons7way.wib /gbdb/sacCer3/multiz7way/phastCons7way.wib hgLoadWiggle -pathPrefix=/gbdb/sacCer3/multiz7way sacCer3 \ phastCons7way phastCons7way.wig # Create histogram to get an overview of all the data hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=sacCer3 phastCons7way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Yeast sacCer3 Histogram phastCons7way track" set xlabel " phastCons7way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # To create the tree diagram for the details page, use this tree # definition in http://genome.ucsc.edu/cgi-bin/phyloGif ((((((S._cerevisiae,S._paradoxus),S._mikatae),S._kudriavzevii),S._bayanus),S._castelli),S._kluyveri) ######################################################################### ## Annotate the sacCer3 7-way sequence with genes mkdir /hive/data/genomes/sacCer3/bed/multiz7way/anno cd /hive/data/genomes/sacCer3/bed/multiz7way/anno mkdir genes # using sgdGene hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from sgdGene" sacCer3 \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/sacCer3.sgdGene.gz (cat ../maf/*.maf | genePredToMafFrames sacCer3 stdin stdout \ sacCer3 genes/sacCer3.sgdGene.gz \ | gzip > multiz7way.mafFrames.gz) > frames.log 2>&1 zcat multiz7way.mafFrames.gz \ | sort -k1,1 -k2,2n | hgLoadMafFrames sacCer3 multiz7wayFrames stdin 
############################################################################
# creating upstream mafs (DONE - 2009-06-26 - Hiram)
ssh hgwdev
cd /hive/data/genomes/sacCer3/goldenPath/multiz7way
# run this bash script
cat << '_EOF_' > mkUpstream.sh
DB=sacCer3
GENE=sgdGene
NWAY=multiz7way
export DB GENE
for S in 1000 2000 5000
do
    echo "making upstream${S}.${GENE}.maf"
    echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout"
    featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | mafFrags ${DB} ${NWAY} stdin stdout \
            -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list \
        | gzip -c > upstream${S}.${GENE}.maf.gz
    echo "done upstream${S}.${GENE}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x mkUpstream.sh
./mkUpstream.sh

#########################################################################
# blastTab construction (DONE - 2011-09-02 - Hiram)
mkdir /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab
# prepare all protein fasta sequence files and configuration specifications
export runDir="/hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab"
mkdir -p ${runDir}
cd ${runDir}
# get all the protein sets to be used here
pepPredToFa danRer7 ensPep danRer7.ensembl.faa
pepPredToFa sacCer3 sgdPep sacCer3.sgd.faa
pepPredToFa dm3 flyBasePep dm3.flyBase.faa
pepPredToFa ce6 sangerPep ce6.sanger.faa
for D in hg19 mm9 rn4
do
    pepPredToFa $D knownGenePep $D.known.faa
done
# known gene tables
# (the .ra config files are line-based, hence the embedded newlines in the
# echo strings below)
for D in hg19 mm9 rn4
do
    mkdir -p $D
    echo "targetGenesetPrefix known
targetDb $D" > ${D}/${D}.config.ra
    echo "queryDbs sacCer3" \
        | sed -e "s/ ${D}//" >> ${D}/${D}.config.ra
    echo "${D}Fa ${runDir}/${D}.known.faa
sacCer3Fa ${runDir}/sacCer3.sgd.faa
buildDir ${runDir}/${D}
scratchDir ${runDir}/${D}/tmp" >> ${D}/${D}.config.ra
done
# sgd gene tables
for D in danRer7
do
    mkdir -p $D
    echo "targetGenesetPrefix dr
targetDb $D
queryDbs sacCer3
danRer7Fa ${runDir}/danRer7.ensembl.faa
sacCer3Fa ${runDir}/sacCer3.sgd.faa
buildDir ${runDir}/${D}
scratchDir ${runDir}/${D}/tmp" > ${D}/${D}.config.ra
done
# flyBase table
for D in dm3
do
    mkdir -p $D
    echo "targetGenesetPrefix flyBase
targetDb $D
queryDbs sacCer3
dm3Fa ${runDir}/dm3.flyBase.faa
sacCer3Fa ${runDir}/sacCer3.sgd.faa
buildDir ${runDir}/${D}
scratchDir ${runDir}/${D}/tmp" > ${D}/${D}.config.ra
done
# sanger table
for D in ce6
do
    mkdir -p $D
    echo "targetGenesetPrefix sanger
targetDb $D
queryDbs sacCer3
ce6Fa ${runDir}/ce6.sanger.faa
sacCer3Fa ${runDir}/sacCer3.sgd.faa
buildDir ${runDir}/${D}
scratchDir ${runDir}/${D}/tmp" > ${D}/${D}.config.ra
done
# run them all
for D in ce6 danRer7 dm3 hg19 mm9 rn4
do
    cd ${D}
    rm -fr *.csh do.log run.${D}.sacCer3 tmp
    ls -ld *
    echo $D
    doHgNearBlastp.pl -noSelf -noLoad -workhorse=hgwdev \
        -clusterHub=swarm ${D}.config.ra > do.log 2>&1 &
    cd ..
done
ps -f
echo waiting ...
wait

# There is one to run for sacCer3 itself:
mkdir /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/sacCer3
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/sacCer3
cat << '_EOF_' > sacCer3.config.ra
targetGenesetPrefix sgd
targetDb sacCer3
queryDbs danRer7
danRer7Fa /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/danRer7.ensembl.faa
sacCer3Fa /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/sacCer3.sgd.faa
buildDir /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/sacCer3
scratchDir /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/sacCer3/tmp
'_EOF_'
# << happy emacs
# leave out the -noSelf for this:
doHgNearBlastp.pl -noLoad -workhorse=hgwdev \
    -clusterHub=swarm sacCer3.config.ra > do.log 2>&1
cd run.sacCer3.sacCer3
./loadPairwise.csh

# filter out names that do not exist in sgdGene
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab
hgsql -N -e "select name from sgdGene;" sacCer3 \
    | sort -u > sacCer3.sgdGene.name.txt
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/ce6/run.sacCer3.ce6
cat out/*.tab | sort > all.sorted.tab
join ../../sacCer3.sgdGene.name.txt all.sorted.tab > ceBlastTab.trimmed.tab
hgLoadBlastTab sacCer3 ceBlastTab -maxPer=1 ceBlastTab.trimmed.tab
# Loading database with 3605 rows
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/dm3/run.sacCer3.dm3
cat out/*.tab | sort > all.sorted.tab
join ../../sacCer3.sgdGene.name.txt all.sorted.tab > dmBlastTab.trimmed.tab
hgLoadBlastTab sacCer3 dmBlastTab -maxPer=1 dmBlastTab.trimmed.tab
# Loading database with 3675 rows
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/hg19/run.sacCer3.hg19
cat out/*.tab | sort > all.sorted.tab
join ../../sacCer3.sgdGene.name.txt all.sorted.tab > hgBlastTab.trimmed.tab
hgLoadBlastTab sacCer3 hgBlastTab -maxPer=1 hgBlastTab.trimmed.tab
# Loading database with 3759 rows
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/danRer7/run.sacCer3.danRer7
cat out/*.tab | sort > all.sorted.tab
join ../../sacCer3.sgdGene.name.txt all.sorted.tab > drBlastTab.trimmed.tab
hgLoadBlastTab sacCer3 drBlastTab -maxPer=1 drBlastTab.trimmed.tab
# Loading database with 3893 rows
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/mm9/run.sacCer3.mm9
cat out/*.tab | sort > all.sorted.tab
join ../../sacCer3.sgdGene.name.txt all.sorted.tab > mmBlastTab.trimmed.tab
hgLoadBlastTab sacCer3 mmBlastTab -maxPer=1 mmBlastTab.trimmed.tab
# Loading database with 3773 rows
cd /hive/data/genomes/sacCer3/bed/sgdAnnotations/blastTab/rn4/run.sacCer3.rn4
cat out/*.tab | sort > all.sorted.tab
join ../../sacCer3.sgdGene.name.txt all.sorted.tab > rnBlastTab.trimmed.tab
hgLoadBlastTab sacCer3 rnBlastTab -maxPer=1 rnBlastTab.trimmed.tab
# Loading database with 2981 rows

#########################################################################
# construct downloads files (DONE - 2011-09-02 - Hiram)
cd /hive/data/genomes/sacCer3
# there are no repeat masker files:
makeDownloads.pl -ignoreRepeatMasker -dbHost=hgwdev \
    -workhorse=hgwdev sacCer3 > downloads.log 2>&1
# edit the README.txt files to set
project URL and check # the text ########################################################################### # ready for first pushQ entry (DONE - 2011-09-02 - Hiram) mkdir /hive/data/genomes/sacCer3/pushQ cd /hive/data/genomes/sacCer3/pushQ makePushQSql.pl sacCer3 > sacCer3.sql 2> stderr.out # some errors are legitimate and OK: head stderr.out WARNING: hgwdev does not have /gbdb/sacCer3/wib/gc5Base.wib WARNING: hgwdev does not have /gbdb/sacCer3/wib/quality.wib WARNING: hgwdev does not have /gbdb/sacCer3/bbi/quality.bw WARNING: sacCer3 does not have seq WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of supporting and genbank tables) which tracks to assign these tables to: growthCondition scp -p sacCer3.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < sacCer3.sql # and make this the default genome for Saccharomyces cerevisiae hgsql -e \ 'update defaultDb set name="sacCer3" where genome="S. cerevisiae";' \ hgcentraltest ########################################################################### # creating tables for Gene Sorter (DONE - 2011-09-02 - Hiram) mkdir /hive/data/genomes/sacCer3/bed/hgNear cd /hive/data/genomes/sacCer3/bed/hgNear hgClusterGenes sacCer3 sgdGene sgdIsoforms sgdCanonical # Got 6534 clusters, from 6692 genes in 17 chromosomes # used to make a sgdToSgd table, but: # Removed sgdToSgd table because it wasn't being used anywhere. # Katrina 7/14/09 # Make expression similarity table. 
hgExpDistance sacCer3 hgFixed.yeastChoCellCycle \
    hgFixed.yeastChoCellCycleExps choExpDistance
# Have 6259 elements in hgFixed.yeastChoCellCycle
# Got 6259 unique elements in hgFixed.yeastChoCellCycle
# Made choExpDistance.tab
# Turn on protein and gene sorter
hgsql -e 'update dbDb set hgNearOk=1 where name="sacCer3";' hgcentraltest

#########################################################################
# BLATSERVERS ENTRY (DONE - 2011-09-06 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("sacCer3", "blatx", "17840", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("sacCer3", "blatx", "17841", "0", "1");' \
	    hgcentraltest
# test it with some sequence

############################################################################
# uwFootprints: (WORKING - 2011-09-07 - hiram)
# lifted sacCer1 tag count data and set to Nobel lab
# William Stafford Noble noble at gs.washington.edu
# Xiaoyu Chen xchen at cs.washington.edu
XXX - working on this
# lift counts to sacCer3 and give back to UW
#	(FIX: comment previously said "sacSer2"; the commands below lift
#	sacCer1 coordinates to sacCer3)
# FIX: directory was written "uwFootprintsxi" (stray characters); the same
# directory is spelled uwFootprints in the liftOver commands below
cd /hive/data/genomes/sacCer1/bed/uwFootprints
wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.bed
liftOver yeast.dnaseI.tagCounts.bed /hive/data/genomes/sacCer1/bed/blat.sacCer3.2009-02-04/sacCer1ToSacCer3.over.chain.gz /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer3.bed /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer3.nolift.bed
mkdir /hive/data/genomes/sacCer3/bed/uwFootprints
cd /hive/data/genomes/sacCer3/bed/uwFootprints
# get back wig
wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.sacCer3.lifted.wig
# lift other BEDs
liftOver /hive/data/genomes/sacCer1/bed/uwFootprints/yeast.mappability.bed /hive/data/genomes/sacCer1/bed/blat.sacCer3.2009-02-04/sacCer1ToSacCer3.over.chain.gz yeast.mappability.bed yeast.mappability.nolift.bed
# drop yeast.footprints.bed and truncate name to three decimal places
tawk 'NR>1{print $1,$2,$3,sprintf("%0.3f",$4)}' /hive/data/genomes/sacCer1/bed/uwFootprints/yeast.footprints.bed | liftOver stdin /hive/data/genomes/sacCer1/bed/blat.sacCer3.2009-02-04/sacCer1ToSacCer3.over.chain.gz yeast.footprints.bed yeast.footprints.nolift.bed
wigEncode yeast.dnaseI.tagCounts.sacCer3.lifted.wig uwFootprintsTagCounts.wig uwFootprintsTagCounts.wib
# Converted yeast.dnaseI.tagCounts.sacCer3.lifted.wig, upper limit 13798.00, lower limit 1.00
ln -sf /hive/data/genomes/sacCer3/bed/uwFootprints/uwFootprintsTagCounts.wib /gbdb/sacCer3/wib
hgLoadWiggle -tmpDir=/data/tmp sacCer3 uwFootprintsTagCounts uwFootprintsTagCounts.wig
hgLoadBed -tab -tmpDir=/data/tmp sacCer3 uwFootprintsMappability yeast.mappability.bed
hgLoadBed -tmpDir=/data/tmp sacCer3 uwFootprintsPrints yeast.footprints.bed

############################################################################
# add iRow annotations to the maf files:
mkdir /hive/data/genomes/sacCer3/bed/strains/align/anno
cd /hive/data/genomes/sacCer3/bed/strains/align/anno
twoBitInfo -nBed ../../../../sacCer3.2bit sacCer3.N.bed
for F in ../../2bit/*.2bit
do
    B=`basename $F`
    S=${B/.2bit/}
    echo $S $F
    twoBitInfo -nBed $F ${S}.N.bed
done
# cluster-job template line ($(path1)/$(file1) are gensub2 placeholders).
# FIX: the target 2bit previously pointed at anoCar2 -- a copy-paste
# leftover from another assembly's doc; this is the sacCer3 build:
mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/sacCer3/sacCer3.2bit result/$(file1)

############################################################################
# LIFTOVER TO SacCer2 (DONE - 2011-11-08 - Chin )
ssh hgwdev	# FIX: was "sh hgwdev", a typo for ssh
cd /hive/data/genomes/sacCer3
ln -s jkStuff/sacCer3.11.ooc 11.ooc
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug sacCer3 sacCer2
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	sacCer3 sacCer2 > doLiftOverToSacCer2.log 2>&1 &
#	real 3m21.410s
pushd /usr/local/apache/htdocs-hgdownload/goldenPath/sacCer3/liftOver/
md5sum sacCer3ToSacCer2.over.chain.gz >> md5sum.txt
popd

############################################################################
# LIFTOVER TO SacCer1 (DONE - 2011-11-08 - Chin )
ssh hgwdev	# FIX: was "sh hgwdev", a typo for ssh
cd /hive/data/genomes/sacCer3
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	sacCer3 sacCer1 > doLiftOverToSacCer1.log 2>&1 &
#	real 3m21.141s
pushd /usr/local/apache/htdocs-hgdownload/goldenPath/sacCer3/liftOver/
md5sum sacCer3ToSacCer1.over.chain.gz >> md5sum.txt
popd
############################################################################