# for emacs: -*- mode: sh; -*- # Saccharomyces cerevisiae # Saccharomyces Genome Database, Stanford # $Id: sacCer2.txt,v 1.17 2010/02/05 06:54:06 kuhn Exp $ ####################################################################### # Download data (DONE - 2009-01-30 - Hiram) mkdir -p /hive/data/genomes/sacCer2/download cd /hive/data/genomes/sacCer2/download TOP=/hive/data/genomes/sacCer2/download for D in gene_registry literature_curation oracle_schema protein_info \ protein_info/hypothetical_peptides do mkdir -p ${D} cd ${D} wget -l 1 --timestamping -np -nd --cut-dirs=1 -r -X "archive" \ "http://downloads.yeastgenome.org/${D}/" rm -f index.* robots.txt cd ${TOP} done mkdir sgd.chromosomes for C in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt do wget --timestamping \ "http://downloads.yeastgenome.org/sequence/genomic_sequence/chromosomes/fasta/ch r${C}.fsa" -O sgd.chromosomes/chr${C}.fsa done # convert chrom names to something reasonable, they are now roman nums mkdir -p chromosomes # bash scripting follows: runOne() { N=$1 C=$2 echo ">chr${C}" > chromosomes/chr${C}.fa grep -v "^>" sgd.chromosomes/chr${N}.fsa >> chromosomes/chr${C}.fa echo "done chr${N} -> ${C}" } runOne 01 I runOne 02 II runOne 03 III runOne 04 IV runOne 05 V runOne 06 VI runOne 07 VII runOne 08 VIII runOne 09 IX runOne 10 X runOne 11 XI runOne 12 XII runOne 13 XIII runOne 14 XIV runOne 15 XV runOne 16 XVI runOne mt M # the same sequence again, with the following runOne() # will product an AGP file: runOne() { N=$1 C=$2 CTG=`head -1 sgd.chromosomes/chr${N}.fsa | sed -e 's/>ref|//; s/|.*//'` size=`faSize sgd.chromosomes/chr${N}.fsa | head -1 | awk '{print $1}'` echo -e "chr${C}\t1\t$size\t$SEQ\tF\t$CTG\t1\t$size\t+" SEQ=`echo $SEQ | awk '{print $1+1}'` } # plus this echo statement for 2micron: echo -e "2micron\t1\t6318\t${SEQ}\tF\tNC_001398\t1\t6318\t+" # trim the SGD gff file and get UCSC chrom names awk ' BEGIN { keepGoing = 1 } { if (match($1, "^##FASTA")) { keepGoing = 0; } if (keepGoing) print; } ' chromosomal_feature/saccharomyces_cerevisiae.gff \ | sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff # this program needed to be fixed for this newer gff file format checkSgdSync -verbose=4 -gffRowCount=9 -faExtn=fa \ -gffFile=S.cerevisiae.gff \ download > sgd.feature.scan.txt 2>&1 # this used to have only a couple, now there are many non-coding # annotations # good 6702, bad 371, total 7073 ######################################################################### cd /hive/data/genomes/sacCer2 cat << '_EOF_' > sacCer2.config.ra # Config parameters for makeGenomeDb.pl: db sacCer2 scientificName Saccharomyces cerevisiae commonName S. cerevisiae assemblyDate June 2008 assemblyLabel SGD June 2008 orderKey 998 mitoAcc NC_001224 # override the check for max size of 25,000 mitoSize 90000 fastaFiles /hive/data/genomes/sacCer2/download/chromosomes/*.fa agpFiles /hive/data/genomes/sacCer2/download/sacCer2.agp # qualFiles /dev/null dbDbSpeciesDir sacCer taxId 4932 '_EOF_' # << happy emacs makeGenomeDb.pl sacCer2.config.ra > makeGenomeDb.log 2>&1 # take the trackDb stuff and check it in # no need for masked sequence: ln -s sacCer2.unmasked.2bit sacCer2.2bit ln -s `pwd`/sacCer2.2bit /gbdb/sacCer2 ######################################################################### # sacCer2 - S. cerevisiae - Ensembl Genes version 52 (DONE - 2009-02-04 - hiram) ssh swarm cd /hive/data/genomes/sacCer2 cat << '_EOF_' > sacCer2.ensGene.ra # required db variable db sacCer2 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^VIII/chrVIII/; s/^VII/chrVII/; s/^VI/chrVI/; s/^V/chrV/; s/^XIII/chrXIII/; s/^XII/chrXII/; s/^XIV/chrXIV/; s/^XI/chrXI/; s/^XVI/chrXVI/; s/^XV/chrXV/; s/^X/chrX/; s/^III/chrIII/; s/^IV/chrIV/; s/^II/chrII/; s/^IX/chrIX/; s/^I/chrI/; s/^MT/chrM/; s/2-micron/2micron/" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=52 sacCer2.ensGene.ra ssh hgwdev cd /hive/data/genomes/sacCer2/bed/ensGene.52 featureBits sacCer2 ensGene # 8912793 bases of 12162995 (73.278%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2009-02-04 - Hiram) # repMatch = 1024 * sizeof(sacCer2)/sizeof(hg18) # 4.32 = 1024 * (12162995/2881515245) # use 10 to be very conservative ssh hgwdev cd /hive/data/genomes/sacCer2 blat sacCer2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/11.ooc \ -repMatch=10 # Wrote 3137 overused 11-mers to 11.ooc # copy this to scratch data cp -p jkStuff/11.ooc /hive/data/staging/data/sacCer2/sacCer2.11.ooc ######################################################################### # work with the SGD annotations (WORKING - 2009-02-04 - Hiram) mkdir /hive/data/genomes/sacCer2/bed/sgdAnnotations cd /hive/data/genomes/sacCer2/bed/sgdAnnotations # get rid of the FASTA section from their gff file awk ' BEGIN { keepGoing = 1 } { if (match($1, "^##FASTA")) { keepGoing = 0; } if (keepGoing) print; } ' ../../download/chromosomal_feature/saccharomyces_cerevisiae.gff \ | sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff ######################################################################### # CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE - 2009-02-10 - Hiram) mkdir /hive/data/genomes/sacCer2/bed/sgdAnnotations cd /hive/data/genomes/sacCer2/bed/sgdAnnotations # trim the delivered S.cerevisiae.gff file to get rid of the FASTA section # and fixup the chrM and 2-micron chrom names: awk ' BEGIN { keepGoing = 1 } { if (match($1, "^##FASTA")) { keepGoing = 0; } if (keepGoing) print; } ' ../../download/chromosomal_feature/saccharomyces_cerevisiae.gff \ | sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff # wrote this perl script to specifically parse this particular instance # of the sgd gff file. $HOME/kent/src/hg/makeDb/outside/yeast/hgSgdGff3/sgdGffToGtf.pl \ S.cerevisiae.gff > sacCer2.sgdGene.gtf 2> sacCer2.gtf.stderr # This also writes the files: # -rw-rw-r-- 1 640177 Feb 6 15:01 leftOvers.gff # -rw-rw-r-- 1 259712 Feb 6 15:01 otherFeatures.bed # -rw-rw-r-- 1 254996 Feb 6 15:01 notes.txt # -rw-rw-r-- 1 1214875 Feb 6 15:01 descriptions.txt ldHgGene -gtf sacCer2 sgdGene sacCer2.sgdGene.gtf hgLoadBed sacCer2 sgdOther otherFeatures.bed \ -tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql # this perl script will fix up the fasta header lines for # the chr*.peptides.fsa files to run into hgSgdPep cat << '_EOF_' > filter.pl #!/usr/bin/env perl use strict; use warnings; open (FH,">symbol.txt") or die "can not write to symbol.txt"; my $inAnnotation = 0; while (my $line=<>) { if ($line =~ m/^>Annotated\|/) { $inAnnotation = 1; my (@words) = split('\s+', $line); die "cannot find four fields in\n'$line'" if (scalar(@words) < 4); my $name = $words[3]; $name =~ s/;.*//; $name =~ s#/.*##; printf ">ORFP:%s\n", $name; printf FH "%s\t", $name; if ("${name};" ne $words[3]) { my $otherName = $words[3]; $otherName =~ s#.*/##; $otherName =~ s/;//; printf FH "%s", $otherName; # printf FH " %s", $words[2]; } else { printf FH "%s", $name; } # for (my $i = 4; $i < scalar(@words); ++$i) { # printf FH " %s", $words[$i]; # } print FH "\n"; } elsif ($line =~ m/^>/) { $inAnnotation = 0; } else { if ($inAnnotation) { print $line; } } } close (FH); '_EOF_' # << happy emacs chmod +x filter.pl zcat \ ../../download/protein_info/hypothetical_peptides/chr*.peptides.20040928.fsa.gz\ | ./filter.pl | hgSgdPep stdin sgdPep.fa symbol.txt hgPepPred sacCer2 generic sgdPep sgdPep.fa hgsql sacCer2 -e 'create table sgdToName ( \ name varchar(10) not null, \ value varchar(10) not null, \ PRIMARY KEY(name), \ INDEX (value));' hgsql sacCer2 -e 'load data local infile "symbol.txt" \ into table sgdToName;' hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdDescription.sql hgsql sacCer2 -e 'load data local infile "descriptions.txt" \ into table sgdDescription;' hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql hgsql sacCer2 -e 'load data local infile "notes.txt" \ into table sgdOtherDescription;' ## Clean up some stray names: cd /hive/data/genomes/sacCer2/bed/sgdAnnotations hgsql -N -e "select name from sgdGene;" sacCer2 \ | sort -u > sacCer2.sgdGene.name.txt hgsql -N -e "select name from sgdPep;" sacCer2 \ | sort -u > sacCer2.sgdPep.name.txt comm -23 sacCer2.sgdPep.name.txt sacCer2.sgdGene.name.txt | while read N do hgsql -e "delete from sgdPep where name=\"$N\";" sacCer2 done hgsql -N -e "select name from sgdDescription;" sacCer2 \ | sort -u > sacCer2.sgdDescription.name.txt comm -23 sacCer2.sgdDescription.name.txt sacCer2.sgdGene.name.txt \ | while read N do hgsql -e "delete from sgdDescription where name=\"${N}\";" sacCer2 done ############################################################################ # catch up to other tables in sacCer1 - (DONE - 2009-02-24 - Hiram) # can simply transfer these tables across from sacCer1: hgsqldump --all -c --tab=. sacCer1 sgdAbundance sgdLocalization sgdToPfam # hgLoadSqlTab doesn't like the comment characters: grep -v "^--" sgdToPfam.sql \ | hgLoadSqlTab sacCer2 sgdToPfam stdin sgdToPfam.txt grep -v "^--" sgdAbundance.sql \ | hgLoadSqlTab sacCer2 sgdAbundance stdin sgdAbundance.txt grep -v "^--" sgdLocalization.sql \ | hgLoadSqlTab sacCer2 sgdLocalization stdin sgdLocalization.txt hgsqldump --all -c --tab=. sacCer1 yeastP2P grep -v "^--" yeastP2P.sql \ | hgLoadSqlTab sacCer2 yeastP2P stdin yeastP2P.txt ############################################################################ # ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE - 2009-02-10 - Hiram) cd /hive/data/sacCer2/bed/sgdAnnotation grep "Swiss-Prot" ../../download/chromosomal_feature/dbxref.tab \ | awk '{printf("%s\t%s\n", $5, $1);}' > sgdToSwissProt.txt hgsql sacCer2 -e 'create table sgdToSwissProt ( \ name varchar(10) not null, \ value varchar(10) not null, \ PRIMARY KEY(name), \ INDEX (value));' hgsql sacCer2 -e 'load data local infile "sgdToSwissProt.txt" \ into table sgdToSwissProt;' hgProtIdToGenePred sacCer2 sgdGene sgdToSwissProt name value # interesting to note that sgdTOSwissProt has one accession not # in sgdGene.name: KHS1 (BK 2009-07-14) ############################################################################ # CREATE SGD-BASED CLONE TRACK (DONE - 2009-02-10 - Hiram) mkdir /hive/data/genomes/sacCer2/bed/sgdClone cd /hive/data/genomes/sacCer2/bed/sgdClone # since sacCer1, these coordinates have become 0-relative ? awk -F '\t' '{printf("%d\t%d\t%d\t%s\t%s\n", $3, $4, $5, $2, $1);}' \ ../../download/chromosomal_feature/clone.tab \ | sed -e \ "s/^8/chrVIII/; s/^7/chrVII/; s/^6/chrVI/; s/^5/chrV/; s/^13/chrXIII/; s/^12/chrXII/; s/^14/chrXIV/; s/^11/chrXI/; s/^16/chrXVI/; s/^15/chrXV/; s/^10/chrX/; s/^3/chrIII/; s/^4/chrIV/; s/^2/chrII/; s/^9/chrIX/; s/^1/chrI/;" > sgdClone.bed hgLoadBed sacCer2 sgdClone sgdClone.bed -tab \ -sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql ############################################################################ # AUTO UPDATE GENBANK RUN (Done - 2009-02-10 - Hiram) # align with latest genbank process. cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add sacCer2 just before sacCer1 # sacCer2 S. cerevisiae sacCer2.serverGenome = /hive/data/genomes/sacCer2/sacCer2.2bit sacCer2.clusterGenome = /scratch/data/sacCer2/sacCer2.2bit sacCer2.ooc = no sacCer2.maxIntron = 5000 sacCer2.lift = no sacCer2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} sacCer2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} sacCer2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} sacCer2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} sacCer2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} #sacCer2.perChromTables = no # doesn't work in the browser sacCer2.genbank.mrna.xeno.load = no sacCer2.refseq.mrna.native.load = no sacCer2.downloadDir = sacCer2 cvs ci -m "Added sacCer2." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial sacCer2 & # logFile: var/build/logs/2009.02.10-11:27:42.sacCer2.initalign.log # real 173m1.570s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sacCer2 # logFile: var/dbload/hgwdev/logs/2009.02.10-14:25:48.dbload.log # real 18m21.436s # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add sacCer2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added sacCer2 - S. cerevisiae" \ etc/align.dbs etc/hgwdev.dbs make etc-update ############################################################################ # DOING MULTIPLE ALIGNMENT WITH OTHER YEAST (DONE - 2009-02-10 - Hiram) # the sequences are the same set from 2003 that were used in sacCer1 # so, just link to them and rerun the alignment # in the directory /cluster/data/sacCer1/bed/otherYeast/align mkdir /hive/data/genomes/sacCer2/bed/otherYeast cd /hive/data/genomes/sacCer2/bed/otherYeast for D in sacBay sacCas sacKlu sacKud sacMik sacPar do ln -s ../../../sacCer1/bed/otherYeast/${D} . done mkdir align cd align ln -s ../../../../sacCer1/bed/otherYeast/align/sac??? . # helpful to have 2bit files for chainAntiRepeat for G in sac??? do faToTwoBit ${G} ${G}.2bit done # Create directory full of size info mkdir sizes for F in sac??? do faSize $F -detailed > sizes/$F done # Create some working directories mkdir lav axtAll axtBest mafIn mafRawOut mafOut # Create little shell script to do blastz alignments cat << '_EOF_' > lastz.csh #!/bin/csh -ef /cluster/bin/penn/$MACHTYPE/lastz $1 $2 > $3 '_EOF_' # << happy emacs chmod +x lastz.csh # create sacCer2 individual chr.fa files: mkdir sacCer2 twoBitInfo ../../../sacCer2.2bit stdout | cut -f1 | while read C do twoBitToFa ../../../sacCer2.2bit:${C} sacCer2/${C}.fa echo "${C} done" done - # Create job spec to do all blastz alignments ls -1 sacCer2/*.fa > sacCer.list ls -1 sac??? > other.list cat << '_EOF_' > template #LOOP lastz.csh $(path1) $(path2) {check out line+ lav/$(root1).$(root2)} #ENDLOOP '_EOF_' # << happy emacs ssh swarm cd /hive/data/genomes/sacCer2/bed/otherYeast/align gensub2 sacCer.list other.list template jobList # Convert from lav to psl for L in lav/* do B=`basename $L` echo $B lavToPsl $L stdout | gzip > psl/$B.psl.gz done # chaining mkdir -p axtChain/run cd axtChain/run ls -d ../../sac??? | xargs -L 1 basename > species.list cut -f1 ../../../../../chrom.sizes > chrom.list cat << '_EOF_' > chain.csh #!/bin/csh -ef set S = $1 set C = $2 set IN = ../../psl/${C}.${S}.psl.gz set OUT = ${S}/${C}.chain mkdir -p ${S} zcat ${IN} \ | axtChain -psl -verbose=0 -minScore=1000 -linearGap=medium stdin \ /hive/data/genomes/sacCer2/sacCer2.2bit \ -faQ ../../${S} stdout \ | chainAntiRepeat /hive/data/genomes/sacCer2/sacCer2.2bit \ ../../${S}.2bit stdin ${OUT} '_EOF_' # << happy emacs chmod +x chain.csh cat << '_EOF_' > template #LOOP chain.csh $(path1) $(path2) {check out line+ $(root1)/$(root2).chain} #ENDLOOP '_EOF_' # << happy emacs gensub2 species.list chrom.list template jobList para create jobList para push para time # Completed: 108 of 108 jobs # CPU time in finished jobs: 73s 1.21m 0.02h 0.00d 0.000 y # IO & Wait Time: 1100s 18.34m 0.31h 0.01d 0.000 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest finished job: 12s 0.20m 0.00h 0.00d # Submission to last job: 38s 0.63m 0.01h 0.00d cd .. # merge the individual results foreach S (`cat run/species.list`) echo $S find ./run/${S} -name "*.chain" \ | chainMergeSort -inputList=stdin | gzip -c > sacCer2.${S}.all.chain.gz end # split them again to get consistent numbering foreach S (`cat run/species.list`) echo $S chainSplit chain.${S} sacCer2.${S}.all.chain.gz end # netting the chains cat << '_EOF_' > netChains.csh #!/bin/csh -ef set S0 = "../../../../chrom.sizes" foreach S (`cat run/species.list`) echo $S chainPreNet sacCer2.${S}.all.chain.gz ${S0} ../sizes/${S} stdout \ | chainNet stdin -minSpace=1 ${S0} ../sizes/${S} stdout /dev/null \ | netSyntenic stdin noClass.${S}.net end '_EOF_' # << happy emacs chmod +x netChains.csh ./netChains.csh cat << '_EOF_' > mkMafs.csh #!/bin/csh -ef set TOP = `pwd` set S0 = "/hive/data/genomes/sacCer2/chrom.sizes" foreach S (`cat run/species.list`) # Make axtNet for download: one .axt per sacCer2 seq. netSplit noClass.${S}.net net.${S} cd .. rm -fr axtNet.${S} mkdir axtNet.${S} foreach f (axtChain/net.${S}/*.net) netToAxt $f axtChain/chain.${S}/$f:t:r.chain \ /hive/data/genomes/sacCer2/sacCer2.2bit ${S}.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet.${S}/$f:t:r.sacCer2.${S}.net.axt.gz end # Make mafNet for multiz: one .maf per sacCer2 seq. rm -fr mkdir mafNet.${S} mkdir mafNet.${S} foreach f (axtNet.${S}/*.sacCer2.${S}.net.axt.gz) axtToMaf -tPrefix=sacCer2. -qPrefix=${S}. $f \ ${S0} sizes/${S} stdout \ | gzip -c > mafNet.${S}/$f:t:r:r:r:r:r.maf.gz end cd ${TOP} end '_EOF_' # << happy emacs chmod +x mkMafs.csh ./mkMafs.csh ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-10 braney ) # bash if not using bash shell already cd /cluster/data/sacCer2 mkdir /cluster/data/sacCer2/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst sacCer2.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst sacCer2.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 14 mkdir -p /cluster/data/sacCer2/bed/tblastn.hg18KG cd /cluster/data/sacCer2/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 14 query.lst # we want around 50000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(50000/`wc query.lst | awk '{print $1}'`\) # 36727/(50000/14) = 10.283560 # chose 100 because 10 is too small mkdir -p kgfa split -l 100 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst wc kg.lst # 368 368 4784 kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/sacCer2/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/sacCer2/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome exit ssh swarm cd /cluster/data/sacCer2/bed/tblastn.hg18KG gensub2 query.lst kg.lst blastGsub blastSpec para create blastSpec # para try, check, push, check etc. para time # Completed: 5152 of 5152 jobs # CPU time in finished jobs: 71064s 1184.40m 19.74h 0.82d 0.002 y # IO & Wait Time: 16915s 281.92m 4.70h 0.20d 0.001 y # Average job time: 17s 0.28m 0.00h 0.00d # Longest finished job: 69s 1.15m 0.02h 0.00d # Submission to last job: 156s 2.60m 0.04h 0.00d ssh swarm cd /cluster/data/sacCer2/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 368 of 368 jobs # CPU time in finished jobs: 5s 0.09m 0.00h 0.00d 0.000 y # IO & Wait Time: 1487s 24.78m 0.41h 0.02d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 37s 0.62m 0.01h 0.00d cd /cluster/data/sacCer2/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 5050 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/sacCer2/bed/tblastn.hg18KG hgLoadPsl sacCer2 blastHg18KG.psl # check coverage featureBits sacCer2 blastHg18KG # 1261725 bases of 12162995 (10.373%) in intersection featureBits sacCer2 blastHg18KG sgdGene -enrichment # blastHg18KG 10.373%, sgdGene 72.883%, both 10.320%, cover 99.49%, enrich 1.37x rm -rf blastOut #end tblastn ########################################################################### ## 7-Way Multiz (DONE - 2009-02-11 - Hiram) mkdir /hive/data/genomes/sacCer2/bed/multiz7way cd /hive/data/genomes/sacCer2/bed/multiz7way # using the phylogenetic tree as mentioned at: # http://www.genetics.wustl.edu/saccharomycesgenomes/yeast_phylogeny.html # and from Jim's note: genomes/sacCer1/bed/otherYeast/README # ((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu) echo "((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)" \ > tree.nh echo sacCer2 sacPar sacMik sacKud sacBay sacCas sacKlu > species.list mkdir mafLinks for S in sacPar sacMik sacKud sacBay sacCas sacKlu do mkdir mafLinks/${S} cd mafLinks/${S} ln -s ../../../otherYeast/align/mafNet.${S}/*.maf.gz . cd ../.. done find -L ./mafLinks -type f | xargs -L 1 basename \ | sed -e "s/.maf.gz//" | sort -u > maf.list mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = sacCer2 set c = $1 set result = $2 set run = `pwd` set tmp = $run/tmp/$db/multiz.$c set pairs = /hive/data/genomes/sacCer2/bed/multiz7way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../tree.nh ../species.list $tmp pushd $tmp foreach s (`sed -e "s/ $db//" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out else if (-e $in) then ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db /bin/rmdir --ignore-fail-on-non-empty $run/tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../maf.list single template jobList para create jobList # Completed: 18 of 18 jobs # CPU time in finished jobs: 710s 11.83m 0.20h 0.01d 0.000 y # IO & Wait Time: 127s 2.12m 0.04h 0.00d 0.000 y # Average job time: 47s 0.78m 0.01h 0.00d # Longest finished job: 105s 1.75m 0.03h 0.00d # Submission to last job: 144s 2.40m 0.04h 0.00d # Estimated complete: 0s 0.00m 0.00h 0.00d # load MAF tables ssh hgwdev mkdir -p /gbdb/sacCer2/multiz7way/maf cd /hive/data/genomes/sacCer2/bed/multiz7way/maf ln -s `pwd`/*.maf /gbdb/sacCer2/multiz7way/maf # allow the temporary multiz7way.tab file to be created in tmp cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/sacCer2/multiz7way/maf sacCer2 multiz7way # real 0m4.903s # Loaded 38795 mafs in 18 files from /gbdb/sacCer2/multiz7way/maf # load summary table time nice -n +19 cat /gbdb/sacCer2/multiz7way/maf/*.maf \ | hgLoadMafSummary sacCer2 -mergeGap=1500 multiz7waySummary stdin # real 0m5.536 # Created 3037 summary blocks from 73806 components and 38795 mafs ######################################################################### ## phastCons conservation (DONE - 2009-02-11 - Hiram) # Create SS files for each chromosome: mkdir /hive/data/genomes/sacCer2/bed/multiz7way/phastCons cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons mkdir SS # split up alignments; no need to use cluster here MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut FA=/cluster/data/sacCer1 CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M" mkdir -p SS for C in `(cd ../maf; ls *.maf | sed -e "s/.maf//")` do echo $C /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \ ../maf/${C}.maf -M ../../otherYeast/align/sacCer2/${C}.fa -i MAF \ -O sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \ -o SS > SS/${C}.ss done # produce rough starting model /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \ -i SS --aggregate sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \ -o SS -z SS/*.ss > all.ss /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloFit \ -i SS all.ss \ --tree "((((((sacCer2,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" \ -o starting-tree # (cheat the branch lengths up a bit, since this represents an # average of conserved and nonconserved sites; we want to # initialize for nonconserved) /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/tree_doctor \ --scale 2 starting-tree.mod > tmp.mod mv tmp.mod starting-tree.mod # also get average GC content of aligned regions /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \ -i SS all.ss --summary-only #descrip. A C G T G+C length all_gaps some_gaps #all.ss 0.3056 0.1950 0.1948 0.3047 0.3898 13130139 0 2122279 # save some I/O and space gzip SS/*.ss # now set up cluster job to estimate model parameters. See # other make{hg18|mm9}.doc for add'l details cat << '_EOF_' > doEstimate.sh #!/bin/sh zcat SS/$1.ss.gz | \ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons \ - starting-tree.mod --gc 0.3898 --nrates 1,1 --no-post-probs \ --ignore-missing --expected-lengths 75 --target-coverage 0.5 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << happy emacs chmod +x doEstimate.sh rm -fr LOG TREES mkdir LOG TREES cat << '_EOF_' > template #LOOP doEstimate.sh $(root1) {check out line+ LOG/$(root1).log} TREES/$(root1) #ENDLOOP '_EOF_' # << happy emacs (cd SS; ls *.ss.gz) | sed -e "s/.ss.gz//" > chr.list gensub2 chr.list single template jobList para create jobList para push para time # Completed: 18 of 18 jobs # CPU time in finished jobs: 8152s 135.86m 2.26h 0.09d 0.000 y # IO & Wait Time: 323s 5.39m 0.09h 0.00d 0.000 y # Average job time: 471s 7.85m 0.13h 0.01d # Longest finished job: 889s 14.82m 0.25h 0.01d # Submission to last job: 895s 14.92m 0.25h 0.01d # Now combine parameter estimates. ls TREES/*.cons.mod > cons.txt /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \ --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt ls TREES/*.noncons.mod > noncons.txt /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \ --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt # Now set up cluster job for computing consevation scores and # predicted elements cat << '_EOF_' > doPhastCons.sh #!/bin/sh mkdir -p POSTPROBS ELEMENTS chr=$1 out=$2 tmpFile=/scratch/tmp/phastCons.$chr rm -f $tmpFile zcat SS/$chr.ss.gz \ | /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons - \ ave.cons.mod,ave.noncons.mod --expected-lengths 75 \ --target-coverage 0.5 --quiet --seqname $chr --idpref $chr \ --viterbi ELEMENTS/$chr.bed --score --require-informative 0 > $tmpFile gzip -c $tmpFile > POSTPROBS/$chr.pp.gz rm $tmpFile '_EOF_' # << happy emacs chmod u+x doPhastCons.sh cat << '_EOF_' > run.template #LOOP doPhastCons.sh $(root1) {check out exists+ POSTPROBS/$(root1).pp.gz} {check out line+ ELEMENTS/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 chr.list single run.template run.jobList rm -fr POSTPROBS ELEMENTS para create run.jobList para push para time # Completed: 18 of 18 jobs # CPU time in finished jobs: 78s 1.30m 0.02h 0.00d 0.000 y # IO & Wait Time: 658s 10.97m 0.18h 0.01d 0.000 y # Average job time: 41s 0.68m 0.01h 0.00d # Longest finished job: 53s 0.88m 0.01h 0.00d # Submission to last job: 57s 0.95m 0.02h 0.00d # set up phastConsElements track cat ELEMENTS/*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin \ > phastConsElements7way.bed hgLoadBed sacCer2 phastConsElements7way phastConsElements7way.bed featureBits sacCer2 phastConsElements7way # 7762587 bases of 12162995 (63.821%) in intersection # previously this was: featureBits sacCer1 phastConsElements # 7517116 bases of 12156302 (61.837%) in intersection mkdir ../downloads mkdir ../downloads/phastCons7way for C in `cat chr.list` do cp -p POSTPROBS/${C}.pp.gz \ ../downloads/phastCons7way/sacCer2.${C}.wigFixed.gz done # encode those files into wiggle data zcat ../downloads/phastCons7way/*.wigFixed.gz \ | wigEncode stdin phastCons7way.wig phastCons7way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # Load gbdb and database with wiggle. cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons ln -s `pwd`/phastCons7way.wib /gbdb/sacCer2/multiz7way/phastCons7way.wib hgLoadWiggle -pathPrefix=/gbdb/sacCer2/multiz7way sacCer2 \ phastCons7way phastCons7way.wig # Create histogram to get an overview of all the data hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=sacCer2 phastCons7way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Yeast sacCer2 Histogram phastCons7way track" set xlabel " phastCons7way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # To create the tree diagram for the details page, use this tree # definition in http://genome.ucsc.edu/cgi-bin/phyloGif ((((((S._cerevisiae,S._paradoxus),S._mikatae),S._kudriavzevii),S._bayanus),S._castelli),S._kluyveri) ######################################################################### ## Annotate the sacCer2 7-way sequence with genes mkdir /hive/data/genomes/sacCer2/bed/multiz7way/anno cd /hive/data/genomes/sacCer2/bed/multiz7way/anno mkdir genes # using sgdGene hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from sgdGene" sacCer2 \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/sacCer2.sgdGene.gz (cat ../maf/*.maf | genePredToMafFrames sacCer2 stdin stdout \ sacCer2 genes/sacCer2.sgdGene.gz \ | gzip > multiz7way.mafFrames.gz) > frames.log 2>&1 zcat multiz7way.mafFrames.gz \ | sort -k1,1 -k2,2n | hgLoadMafFrames sacCer2 multiz7wayFrames stdin ############################################################################ # creating upstream mafs (DONE - 2009-06-26 - Hiram) ssh hgwdev cd /hive/data/genomes/sacCer2/goldenPath/multiz7way # run this bash script cat << '_EOF_' > mkUpstream.sh DB=sacCer2 GENE=sgdGene NWAY=multiz7way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.${GENE}.maf" echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout" featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | mafFrags ${DB} ${NWAY} stdin stdout \ -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.${GENE}.maf.gz echo "done upstream${S}.${GENE}.maf.gz" done '_EOF_' # << happy emacs chmod +x mkUpstream.sh ./mkUpstream.sh ######################################################################### ## simpleRepeats (DONE - 2009-02-12 - Hiram) mkdir /hive/data/genomes/sacCer2/bed/simpleRepeat cd /hive/data/genomes/sacCer2/bed/simpleRepeat doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \ -workhorse=hgwdev sacCer2 > do.log 2>&1 ######################################################################### ## Regulatory Code (DONE - 2009-02-17 - Hiram) # liftOver from sacCer1 mkdir /hive/data/genomes/sacCer2/bed/transRegCode cd /hive/data/genomes/sacCer2/bed/transRegCode # lifting sacCer1 data to this assembly hgsql -N -e "select chrom,chromStart,chromEnd,name,score,chipEvidence,consSpecies from transRegCode" sacCer1 \ | liftOver -bedPlus=5 -tab stdin \ /usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \ transRegCode.lifted.bed transRegCode.unMapped.bed hgsql -N -e "select chrom,chromStart,chromEnd,name,tfCount,tfList,bindVals fromtransRegCodeProbe;" sacCer1 \ | liftOver -bedPlus=4 -tab stdin \ /usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \ transRegCodeProbe.lifted.bed transRegCodeProbe.unMapped.bed hgLoadBed sacCer2 transRegCode transRegCode.lifted.bed \ -sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql # Loaded 206672 elements of size 7 hgLoadBed sacCer2 transRegCodeProbe transRegCodeProbe.lifted.bed \ -sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab # Loaded 6178 elements of size 7 hgsql sacCer2 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql hgsql sacCer2 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql hgsql sacCer2 < $HOME/kent/src/hg/lib/growthCondition.sql hgsql -N -e "select * from transRegCodeCondition;" sacCer1 \ | hgsql sacCer2 -e \ 'load data local infile "/dev/stdin" into table transRegCodeCondition' hgsql -N -e "select * from transRegCodeMotif;" sacCer1 \ | hgsql sacCer2 -e \ 'load data local infile "/dev/stdin" into table transRegCodeMotif' hgsql -N -e "select * from growthCondition;" sacCer1 \ | hgsql sacCer2 -e \ 'load data local infile "/dev/stdin" into table growthCondition' ######################################################################### # Oreganno track - (DONE - 2009-02-17 - Hiram) # liftOver from sacCer1 database hgsql -N -e \ "select chrom,chromStart,chromEnd,id,strand,name from oreganno;" sacCer1 \ | liftOver -bedPlus=4 -tab stdin \ /usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \ oreganno.lifted.bed oreganno.unMapped.bed hgsql sacCer2 < $HOME/kent/src/hg/lib/oreganno.sql hgLoadBed -oldTable sacCer2 oreganno oreganno.lifted.bed -tab # Loaded 7302 elements of size 6 # and load non-positional tracks from sacCer1: hgsql -N -e "select * from oregannoAttr;" sacCer1 \ | hgLoadSqlTab -oldTable sacCer2 oregannoAttr \ ~/humPhen/kent/src/hg/lib/oreganno.sql stdin hgsql -N -e "select * from oregannoLink;" sacCer1 \ | hgLoadSqlTab -oldTable sacCer2 oregannoLink \ ~/humPhen/kent/src/hg/lib/oreganno.sql stdin ######################################################################### # Regulatory Module - (DONE - 2009-02-17 - Hiram) mkdir /hive/data/genomes/sacCer2/bed/regModule cd /hive/data/genomes/sacCer2/bed/regModule # liftOver data from sacCer1 hgsql -N -e \ "select chrom,chromStart,chromEnd,name,score,strand from esRegUpstreamRegion;" \ sacCer1 | liftOver -bedPlus=6 -tab stdin \ /usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \ esRegUpstreamRegion.lifted.bed esRegUpstreamRegion.unMapped.bed hgsql -N -e \ "select chrom,chromStart,chromEnd,name,score,strand,gene from esRegGeneToMotif;" \ sacCer1 | liftOver -bedPlus=6 -tab stdin \ /usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \ esRegGeneToMotif.lifted.bed esRegGeneToMotif.unMapped.bed # I do not see instructions in sacCer1 to create these tables, # so, dump their schemas: hgsqldump --all -c -d --tab=. sacCer1 esRegUpstreamRegion esRegGeneToMotif # and data for these other two: hgsqldump --all -c --tab=. sacCer1 esRegGeneToModule esRegMotif hgLoadBed sacCer2 esRegGeneToMotif -sqlTable=esRegGeneToMotif.sql -tab \ esRegGeneToMotif.lifted.bed # Loaded 4002 elements of size 7 hgLoadBed sacCer2 esRegUpstreamRegion -sqlTable=esRegUpstreamRegion.sql \ -tab esRegUpstreamRegion.lifted.bed # Loaded 1670 elements of size 6 hgsql sacCer2 < esRegMotif.sql hgsql sacCer2 -e \ 'load data local infile "esRegMotif.txt" into table esRegMotif;' hgsql sacCer2 < esRegGeneToModule.sql hgsql sacCer2 -e \ 'load data local infile "esRegGeneToModule.txt" into table esRegGeneToModule;' ######################################################################### # creating tables for Gene Sorter (DONE - 2009-02-17 - Hiram) mkdir /hive/data/genomes/sacCer2/bed/hgNear cd /hive/data/genomes/sacCer2/bed/hgNear hgClusterGenes sacCer2 sgdGene sgdIsoforms sgdCanonical # Got 6550 clusters, from 6717 genes in 18 chromosomes # Make self mapping table for expression. hgsql -N -e 'select name from sgdGene;' sacCer2 \ | awk '{printf("%s\t%s\n", $1, $1);}' > sgdToSgd.tab hgsql sacCer2 -e 'create table sgdToSgd ( \ name varchar(10) not null, \ value varchar(10) not null, \ PRIMARY KEY(name), \ UNIQUE (value));' hgsql sacCer2 \ -e 'load data local infile "sgdToSgd.tab" into table sgdToSgd' # Removed sgdToSgd table because it wasn't being used anywhere. # Katrina 7/14/09 # Make expression similarity table. hgExpDistance sacCer2 hgFixed.yeastChoCellCycle \ hgFixed.yeastChoCellCycleExps choExpDistance # Have 6259 elements in hgFixed.yeastChoCellCycle # Got 6259 unique elements in hgFixed.yeastChoCellCycle # Made choExpDistance.tab ######################################################################### # running the blastP operation to the other genomes for the gene sorter # (DONE - 2009-02-18 - Hiram) mkdir /hive/data/genomes/sacCer2/bed/hgNearBlastp cd /hive/data/genomes/sacCer2/bed/hgNearBlastp mkdir tmp 090218 pepPredToFa sacCer2 sgdPep 090218/sgdPep.faa pepPredToFa hg18 knownGenePep 090218/hg18.known.faa pepPredToFa mm9 knownGenePep 090218/mm9.known.faa pepPredToFa rn4 knownGenePep 090218/rn4.known.faa pepPredToFa danRer5 ensPep 090218/danRer5.ensPep.faa pepPredToFa dm3 flyBasePep 090218/dm3.flyBasePep.faa pepPredToFa ce6 sangerPep 090218/ce6.sangerPep.faa # sanity check, number of lines in each faa file cd 090218 cat << '_EOF_' > config.ra # Latest Yeast vs. other Gene Sorter orgs: # human, mouse, rat, zebrafish, fly, worm targetGenesetPrefix known targetDb sacCer2 queryDbs hg18 mm9 rn4 danRer5 dm3 ce6 sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa hg18Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/hg18.known.faa mm9Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/mm9.known.faa rn4Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/rn4.known.faa danRer5Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/danRer5.ensPep.faa dm3Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/dm3.flyBasePep.faa ce6Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/ce6.sangerPep.faa buildDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218 scratchDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/tmp '_EOF_' # << happy emacs # takes about an hour time nice -n +19 $HOME/kent/src/hg/utils/automation/doHgNearBlastp.pl \ config.ra > do.log 2>&1 & # real 21m32.343s # one name seems to have snuck in here: cd /hive/data/genomes/sacCer2/bed/hgNearBlastp hgsql -N -e "select query from mmBlastTab;" sacCer2 \ | sort -u > sacCer2.mmBlastTab.query.txt hgsql -N -e "select name from sgdGene;" sacCer2 \ | sort -u > sacCer2.sgdGene.name.txt # the single one is: comm -23 sacCer2.mmBlastTab.query.txt sacCer2.sgdGene.name.txt # YDL038C # it was the same in all of them: hgsql -e "delete from mmBlastTab where query=\"YDL038C\";" sacCer2 hgsql -e "delete from drBlastTab where query=\"YDL038C\";" sacCer2 hgsql -e "delete from dmBlastTab where query=\"YDL038C\";" sacCer2 hgsql -e "delete from ceBlastTab where query=\"YDL038C\";" sacCer2 hgsql -N -e "select query from knownBlastTab;" sacCer2 \ | sort -u > sacCer2.knownBlastTab.query.txt comm -23 sacCer2.knownBlastTab.query.txt sacCer2.sgdGene.name.txt \ | while read N do hgsql -e "delete from knownBlastTab where query=\"${N}\";" sacCer2 done # knownBlastTab does not run any columns in yeast GS. RENAME TABLE knownBlastTab TO sgdBlastTab; # kuhn and katrina 06-18-2009 ######################################################################### # creating download files and pushQ (DONE - 2009-02-24 - Hiram) cd /hive/data/genoems/sacCer2 # there aren't any repeats on 2micron touch bed/simpleRepeat/trfMaskChrom/2micron.bed # and, there are no RM files: makeDownloads.pl -ignoreRepeatMasker sacCer2 # edit the README files in: # ./goldenPath/bigZips/README.txt # ./goldenPath/database/README.txt # ./goldenPath/liftOver/README.txt # ./goldenPath/chromosomes/README.txt mkdir pushQ makePushQSql.pl sacCer2 > sacCer2.pushQ.sql # one warning: # sacCer2 does not have seq # it could not identify the following tables: # 2micron_est # 2micron_gap # 2micron_gold # 2micron_intronEst # 2micron_mrna # growthCondition # sgdToPfam # yeastP2P scp -p sacCer2.pushQ.sql hiram@hgwbeta:/tmp ssh hgwbeta hgsql qapushq < sacCer2.pushQ.sql ######################################################################### # BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("sacCer2", "blat10", "17792", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("sacCer2", "blat10", "17793", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # uwFootprints: (2009-04-04 markd) # lifted sacCer1 tag count data and set to Nobel lab # William Stafford Noble # Xiaoyu Chen # lift counts to sacSer2 and give back to UW cd /hive/data/genomes/sacCer1/bed/uwFootprintsxi wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.bed liftOver yeast.dnaseI.tagCounts.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.bed /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.nolift.bed mkdir /hive/data/genomes/sacCer2/bed/uwFootprints cd /hive/data/genomes/sacCer2/bed/uwFootprints # get back wig wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.sacCer2.lifted.wig # lift other BEDs liftOver /hive/data/genomes/sacCer1/bed/uwFootprints/yeast.mappability.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz yeast.mappability.bed yeast.mappability.nolift.bed # drop yeast.footprints.bed and truncate name to three decimal places tawk 'NR>1{print $1,$2,$3,sprintf("%0.3f",$4)}' /hive/data/genomes/sacCer1/bed/uwFootprints/yeast.footprints.bed | liftOver stdin /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz yeast.footprints.bed yeast.footprints.nolift.bed wigEncode yeast.dnaseI.tagCounts.sacCer2.lifted.wig uwFootprintsTagCounts.wig uwFootprintsTagCounts.wib # Converted yeast.dnaseI.tagCounts.sacCer2.lifted.wig, upper limit 13798.00, lower limit 1.00 ln -sf /hive/data/genomes/sacCer2/bed/uwFootprints/uwFootprintsTagCounts.wib /gbdb/sacCer2/wib hgLoadWiggle -tmpDir=/data/tmp sacCer2 uwFootprintsTagCounts uwFootprintsTagCounts.wig hgLoadBed -tab -tmpDir=/data/tmp sacCer2 uwFootprintsMappability yeast.mappability.bed hgLoadBed -tmpDir=/data/tmp sacCer2 uwFootprintsPrints yeast.footprints.bed ############################################################################ # fixup sgdToName table (DONE - 2009-07-09 - Hiram) # this table is missing a name correspondence for some of # the gene names in sgdGene.name # to fixup, any names in sgdGene.name that are not in sgdToName, # simply add those names and reference themselves mkdir /hive/data/genomes/sacCer2/bed/fixSgdToName cd /hive/data/genomes/sacCer2/bed/fixSgdToName hgsql -N -e "select name from sgdGene;" sacCer2 | sort -u > sgdGene.name hgsql -N -e "select name from sgdToName;" sacCer2 > sgdToName.tab # convert the two columns of names to a single list of unique names cat sgdToName.tab | tr '[\t]' '[\n]' | sort -u > all.sgdToName.name comm -12 sgdGene.name all.sgdToName.name > commonToBoth comm -23 sgdGene.name all.sgdToName.name > uniqueToSgdGene comm -13 sgdGene.name all.sgdToName.name > uniqueToSgdToName awk '{printf "%s\t%s\n", $1, $1}' uniqueToSgdGene > addSgdGeneNames.tab # count before load hgsql -e "select count(*) from sgdToName;" sacCer2 # 6254 # adding names: wc -l addSgdGeneNames.tab # 472 # 6254 + 472 = 6726 hgsql sacCer2 \ -e 'load data local infile "addSgdGeneNames.tab" into table sgdToName;' hgsql -e "select count(*) from sgdToName;" sacCer2 # 6726 # Removed extraneous records from sgdToName (DONE - 2009-07-10 - Katrina) #sgdGene has 6717 records (all of which have a distinct value in the #name field) and sgdToName has 6726 (all of which have a distinct value #in the name field).I searched for these 9 gene names in the SGD web #site (yeastgenome.org). Most are alias' of another gene or something #that was originally characterized as a separate gene but later found #to be a part of another gene. mysql> delete from sgdToName where name= "YBL039W-A"; mysql> delete from sgdToName where name= "YBL101W-A"; mysql> delete from sgdToName where name= "YBL101W-C"; mysql> delete from sgdToName where name= "YDL038C"; mysql> delete from sgdToName where name= "YDR524W-A"; mysql> delete from sgdToName where name= "YGR272C"; mysql> delete from sgdToName where name= "YLR157W-A"; mysql> delete from sgdToName where name= "YLR157W-C"; mysql> delete from sgdToName where name= "YNL097C-A"; ############################################################################ # for some unknown reason, indexes missing on chrM_gold and chrM_gap # maybe because they have only one and zero items ? # (DONE - 2009-07-21 - Hiram) hgsql -e "alter table chrM_gold add index chromStart (chromStart);" sacCer2 hgsql -e "alter table chrM_gold add index bin (bin);" sacCer2 hgsql -e "alter table chrM_gold add index frag (frag(9));" sacCer2 hgsql -e "alter table chrM_gap add index chromStart (chromStart);" sacCer2 hgsql -e "alter table chrM_gap add index bin (bin);" sacCer2 ############################################################################ # LIFTOVER TO SacCer1 (DONE - 2010-01-14 - Hiram ) mkdir /hive/data/genomes/sacCer2/bed/blat.sacCer1.2010-01-14 cd /hive/data/genomes/sacCer2/bed/blat.sacCer1.2010-01-14 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug sacCer2 sacCer1 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ sacCer2 sacCer1 > do.log 2>&1 # real 3m42.700s ############################################################################# # Update BlastTab tables of sacCer2 (DONE, 2010-08-06, Fan) cd /hive/data/genomes/sacCer2/bed/hgNearBlastp mkdir 100806 cd 100806 pepPredToFa sacCer2 sgdPep sgdPep.faa pepPredToFa hg19 knownGenePep hg19.known.faa pepPredToFa mm9 knownGenePep mm9.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer6 ensPep danRer6.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce6 sangerPep ce6.sangerPep.faa # sanity check, number of lines in each faa file cat << '_EOF_' > config.ra # Latest Yeast vs. other Gene Sorter orgs: # human, mouse, rat, zebrafish, fly, worm targetGenesetPrefix sgd targetDb sacCer2 queryDbs hg19 mm9 rn4 danRer6 dm3 ce6 sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/sgdPep.faa hg19Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/hg19.known.faa mm9Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/mm9.known.faa rn4Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/rn4.known.faa danRer6Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/danRer6.ensPep.faa dm3Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/dm3.flyBasePep.faa ce6Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/ce6.sangerPep.faa buildDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806 scratchDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/tmp '_EOF_' doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log ######################################################################### # LIFTOVER TO SacCer3 (DONE - 2009-01-04 - Hiram ) mkdir /hive/data/genomes/sacCer2/bed/blat.sacCer3.2011-08-31 cd /hive/data/genomes/sacCer2/bed/blat.sacCer3.2011-08-31 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug sacCer2 sacCer3 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ sacCer2 sacCer3 > do.log 2>&1 # real 3m30.878s ############################################################################# ##########################################################################pubStart # Publications track (DONE - 04-27-12 - Max) # article download and conversion is run every night on hgwdev: # 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh # the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/ # then converts them to text into /hive/data/outside/literature/{pmc,elsevier} # all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py # data processing was run manually like this export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/ # pmc cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc ssh swarm cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat steps:annot-tables exit pubBlat load # elsevier cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier ssh swarm cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat steps:annot-tables exit pubBlat load #--pubEnd