# for emacs: -*- mode: sh; -*-

# Saccharomyces cerevisiae
# Saccharomyces Genome Database, Stanford

#  $Id: sacCer2.txt,v 1.17 2010/02/05 06:54:06 kuhn Exp $

#######################################################################
# Download data  (DONE - 2009-01-30 - Hiram)
    mkdir -p /hive/data/genomes/sacCer2/download
    cd /hive/data/genomes/sacCer2/download
    TOP=/hive/data/genomes/sacCer2/download
    for D in gene_registry literature_curation oracle_schema protein_info \
	protein_info/hypothetical_peptides
do
    mkdir -p ${D}
    cd ${D}
    wget -l 1 --timestamping -np -nd --cut-dirs=1 -r -X "archive" \
	"http://downloads.yeastgenome.org/${D}/"
    rm -f index.* robots.txt
    cd ${TOP}
done

    mkdir sgd.chromosomes
    for C in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt
do
    wget --timestamping \
"http://downloads.yeastgenome.org/sequence/genomic_sequence/chromosomes/fasta/ch
r${C}.fsa" -O sgd.chromosomes/chr${C}.fsa
done

    #	convert chrom names to something reasonable, they are now roman nums
    mkdir -p chromosomes
    # bash scripting follows:
runOne() {
N=$1
C=$2
    echo ">chr${C}" > chromosomes/chr${C}.fa
    grep -v "^>" sgd.chromosomes/chr${N}.fsa >> chromosomes/chr${C}.fa
    echo "done chr${N} -> ${C}"
}

runOne 01 I
runOne 02 II
runOne 03 III
runOne 04 IV
runOne 05 V
runOne 06 VI
runOne 07 VII
runOne 08 VIII
runOne 09 IX
runOne 10 X
runOne 11 XI
runOne 12 XII
runOne 13 XIII
runOne 14 XIV
runOne 15 XV
runOne 16 XVI
runOne mt M
    #	the same sequence again, with the following runOne()
    #	will product an AGP file:
runOne() {
    N=$1
    C=$2
    CTG=`head -1 sgd.chromosomes/chr${N}.fsa | sed -e 's/>ref|//; s/|.*//'`
    size=`faSize  sgd.chromosomes/chr${N}.fsa | head -1 | awk '{print $1}'`
    echo -e "chr${C}\t1\t$size\t$SEQ\tF\t$CTG\t1\t$size\t+"
    SEQ=`echo $SEQ | awk '{print $1+1}'`
}
    #	plus this echo statement for 2micron:
    echo -e "2micron\t1\t6318\t${SEQ}\tF\tNC_001398\t1\t6318\t+"

    #	trim the SGD gff file and get UCSC chrom names
    awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' chromosomal_feature/saccharomyces_cerevisiae.gff \
        | sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff

    #	this program needed to be fixed for this newer gff file format
    checkSgdSync -verbose=4 -gffRowCount=9 -faExtn=fa \
	-gffFile=S.cerevisiae.gff \
	download > sgd.feature.scan.txt 2>&1
    #	this used to have only a couple, now there are many non-coding
    #	annotations
    #    good 6702, bad 371, total 7073

#########################################################################

    cd /hive/data/genomes/sacCer2
    cat << '_EOF_' > sacCer2.config.ra
# Config parameters for makeGenomeDb.pl:
db sacCer2
scientificName Saccharomyces cerevisiae
commonName S. cerevisiae
assemblyDate June 2008
assemblyLabel SGD June 2008
orderKey 998
mitoAcc NC_001224
#       override the check for max size of 25,000
mitoSize        90000
fastaFiles /hive/data/genomes/sacCer2/download/chromosomes/*.fa
agpFiles /hive/data/genomes/sacCer2/download/sacCer2.agp
# qualFiles /dev/null
dbDbSpeciesDir sacCer
taxId   4932
'_EOF_'
    # << happy emacs
    makeGenomeDb.pl sacCer2.config.ra > makeGenomeDb.log 2>&1
    #	take the trackDb stuff and check it in
    #	no need for masked sequence:
    ln -s sacCer2.unmasked.2bit sacCer2.2bit
    ln -s `pwd`/sacCer2.2bit /gbdb/sacCer2


#########################################################################
# sacCer2 - S. cerevisiae - Ensembl Genes version 52 (DONE - 2009-02-04 - hiram)
    ssh swarm
    cd /hive/data/genomes/sacCer2
    cat << '_EOF_' > sacCer2.ensGene.ra
# required db variable
db sacCer2
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^VIII/chrVIII/; s/^VII/chrVII/; s/^VI/chrVI/; s/^V/chrV/; s/^XIII/chrXIII/; s/^XII/chrXII/; s/^XIV/chrXIV/; s/^XI/chrXI/; s/^XVI/chrXVI/; s/^XV/chrXV/; s/^X/chrX/; s/^III/chrIII/; s/^IV/chrIV/; s/^II/chrII/; s/^IX/chrIX/; s/^I/chrI/; s/^MT/chrM/; s/2-micron/2micron/"
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=52 sacCer2.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/sacCer2/bed/ensGene.52
    featureBits sacCer2 ensGene
    # 8912793 bases of 12162995 (73.278%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-02-04 - Hiram)
    #	repMatch = 1024 * sizeof(sacCer2)/sizeof(hg18)
    #	4.32 = 1024 * (12162995/2881515245)
    #	use 10 to be very conservative
    ssh hgwdev
    cd /hive/data/genomes/sacCer2
    blat sacCer2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/11.ooc \
	-repMatch=10
    #	Wrote 3137 overused 11-mers to 11.ooc
    #	copy this to scratch data
    cp -p jkStuff/11.ooc /hive/data/staging/data/sacCer2/sacCer2.11.ooc

#########################################################################
# work with the SGD annotations (WORKING - 2009-02-04 - Hiram)
    mkdir /hive/data/genomes/sacCer2/bed/sgdAnnotations
    cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
    #	get rid of the FASTA section from their gff file
    awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' ../../download/chromosomal_feature/saccharomyces_cerevisiae.gff \
        | sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff

#########################################################################
# CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE - 2009-02-10 - Hiram)
    mkdir /hive/data/genomes/sacCer2/bed/sgdAnnotations
    cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
    #	trim the delivered S.cerevisiae.gff file to get rid of the FASTA section
    #	and fixup the chrM and 2-micron chrom names:
    awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' ../../download/chromosomal_feature/saccharomyces_cerevisiae.gff \
        | sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff

    #	wrote this perl script to specifically parse this particular instance
    #	of the sgd gff file.
    $HOME/kent/src/hg/makeDb/outside/yeast/hgSgdGff3/sgdGffToGtf.pl \
        S.cerevisiae.gff > sacCer2.sgdGene.gtf 2> sacCer2.gtf.stderr
    #	This also writes the files:
    #	-rw-rw-r-- 1  640177 Feb  6 15:01 leftOvers.gff
    #	-rw-rw-r-- 1  259712 Feb  6 15:01 otherFeatures.bed
    #	-rw-rw-r-- 1  254996 Feb  6 15:01 notes.txt
    #	-rw-rw-r-- 1 1214875 Feb  6 15:01 descriptions.txt

    ldHgGene -gtf sacCer2 sgdGene sacCer2.sgdGene.gtf
    hgLoadBed sacCer2 sgdOther otherFeatures.bed \
        -tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql
    #	this perl script will fix up the fasta header lines for
    #	the chr*.peptides.fsa files to run into hgSgdPep
    cat << '_EOF_' > filter.pl
#!/usr/bin/env perl

use strict;
use warnings;

open (FH,">symbol.txt") or die "can not write to symbol.txt";

my $inAnnotation = 0;

while (my $line=<>) {
    if ($line =~ m/^>Annotated\|/) {
        $inAnnotation = 1;
        my (@words) = split('\s+', $line);
        die "cannot find four fields in\n'$line'" if (scalar(@words) < 4);
        my $name = $words[3];
        $name =~ s/;.*//;
        $name =~ s#/.*##;
        printf ">ORFP:%s\n", $name;
        printf FH "%s\t", $name;
        if ("${name};" ne $words[3]) {
            my $otherName = $words[3];
            $otherName =~ s#.*/##;
            $otherName =~ s/;//;
            printf FH "%s", $otherName;
#           printf FH " %s", $words[2];
        } else {
            printf FH "%s", $name;
        }
#       for (my $i = 4; $i < scalar(@words); ++$i) {
#           printf FH " %s", $words[$i];
#       }
        print FH "\n";
    } elsif ($line =~ m/^>/) {
        $inAnnotation = 0;
    } else {
        if ($inAnnotation) { print $line; }
    }
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x filter.pl
    zcat \
../../download/protein_info/hypothetical_peptides/chr*.peptides.20040928.fsa.gz\
    | ./filter.pl | hgSgdPep  stdin sgdPep.fa symbol.txt
    hgPepPred sacCer2 generic sgdPep sgdPep.fa
    hgsql sacCer2 -e 'create table sgdToName ( \
          name varchar(10) not null, \
	  value varchar(10) not null, \
	  PRIMARY KEY(name), \
	  INDEX (value));'
    hgsql sacCer2 -e 'load data local infile "symbol.txt" \
          into table sgdToName;'
    hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdDescription.sql
    hgsql sacCer2 -e 'load data local infile "descriptions.txt" \
          into table sgdDescription;'
    hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
    hgsql sacCer2 -e 'load data local infile "notes.txt" \
          into table sgdOtherDescription;'

    ## Clean up some stray names:
    cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
    hgsql -N -e "select name from sgdGene;" sacCer2 \
	| sort -u > sacCer2.sgdGene.name.txt
    hgsql -N -e "select name from sgdPep;" sacCer2 \
	| sort -u > sacCer2.sgdPep.name.txt
    comm -23 sacCer2.sgdPep.name.txt sacCer2.sgdGene.name.txt | while read N
do
    hgsql -e "delete from sgdPep where name=\"$N\";" sacCer2
done

    hgsql -N -e "select name from sgdDescription;" sacCer2 \
        | sort -u > sacCer2.sgdDescription.name.txt
    comm -23 sacCer2.sgdDescription.name.txt sacCer2.sgdGene.name.txt \
	| while read N
do
    hgsql -e "delete from sgdDescription where name=\"${N}\";" sacCer2
done

############################################################################
# catch up to other tables in sacCer1 - (DONE - 2009-02-24 - Hiram)
    #	can simply transfer these tables across from sacCer1:
    hgsqldump --all -c --tab=. sacCer1 sgdAbundance sgdLocalization sgdToPfam

    #	hgLoadSqlTab doesn't like the comment characters:
    grep -v "^--" sgdToPfam.sql \
	| hgLoadSqlTab sacCer2 sgdToPfam stdin sgdToPfam.txt
    grep -v "^--" sgdAbundance.sql \
	| hgLoadSqlTab sacCer2 sgdAbundance stdin sgdAbundance.txt
    grep -v "^--" sgdLocalization.sql \
	| hgLoadSqlTab sacCer2 sgdLocalization stdin sgdLocalization.txt

    hgsqldump --all -c --tab=. sacCer1 yeastP2P
    grep -v "^--" yeastP2P.sql \
	| hgLoadSqlTab sacCer2 yeastP2P stdin yeastP2P.txt

############################################################################
# ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE - 2009-02-10 - Hiram)
    cd /hive/data/sacCer2/bed/sgdAnnotation
    grep "Swiss-Prot" ../../download/chromosomal_feature/dbxref.tab \
	| awk '{printf("%s\t%s\n", $5, $1);}' > sgdToSwissProt.txt
    hgsql sacCer2 -e 'create table sgdToSwissProt ( \
          name varchar(10) not null, \
	  value varchar(10) not null, \
	  PRIMARY KEY(name), \
	  INDEX (value));'
    hgsql sacCer2 -e 'load data local infile "sgdToSwissProt.txt" \
          into table sgdToSwissProt;'
    hgProtIdToGenePred sacCer2 sgdGene sgdToSwissProt name value

# interesting to note that sgdTOSwissProt has one accession not
# in sgdGene.name:  KHS1  (BK 2009-07-14)

############################################################################
# CREATE SGD-BASED CLONE TRACK (DONE - 2009-02-10 - Hiram)
    mkdir /hive/data/genomes/sacCer2/bed/sgdClone
    cd /hive/data/genomes/sacCer2/bed/sgdClone
    # since sacCer1, these coordinates have become 0-relative ?
    awk -F '\t' '{printf("%d\t%d\t%d\t%s\t%s\n", $3, $4, $5, $2, $1);}' \
    	../../download/chromosomal_feature/clone.tab \
	| sed -e \
"s/^8/chrVIII/; s/^7/chrVII/; s/^6/chrVI/; s/^5/chrV/;
s/^13/chrXIII/; s/^12/chrXII/; s/^14/chrXIV/; s/^11/chrXI/;
s/^16/chrXVI/; s/^15/chrXV/; s/^10/chrX/; s/^3/chrIII/;
s/^4/chrIV/; s/^2/chrII/; s/^9/chrIX/; s/^1/chrI/;" > sgdClone.bed
    hgLoadBed sacCer2 sgdClone  sgdClone.bed -tab \
        -sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql

############################################################################
# AUTO UPDATE GENBANK RUN  (Done - 2009-02-10 - Hiram)
    # align with latest genbank process.
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add sacCer2 just before sacCer1

# sacCer2 S. cerevisiae
sacCer2.serverGenome = /hive/data/genomes/sacCer2/sacCer2.2bit
sacCer2.clusterGenome = /scratch/data/sacCer2/sacCer2.2bit
sacCer2.ooc = no
sacCer2.maxIntron = 5000
sacCer2.lift = no
sacCer2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
sacCer2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
sacCer2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
sacCer2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
sacCer2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
#sacCer2.perChromTables = no  # doesn't work in the browser
sacCer2.genbank.mrna.xeno.load = no
sacCer2.refseq.mrna.native.load = no
sacCer2.downloadDir = sacCer2

    cvs ci -m "Added sacCer2." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial sacCer2 &
    #	logFile: var/build/logs/2009.02.10-11:27:42.sacCer2.initalign.log
    #	real    173m1.570s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sacCer2
    #	logFile: var/dbload/hgwdev/logs/2009.02.10-14:25:48.dbload.log
    #	real    18m21.436s

    # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add sacCer2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added sacCer2 - S. cerevisiae" \
	etc/align.dbs etc/hgwdev.dbs
    make etc-update

############################################################################
# DOING MULTIPLE ALIGNMENT WITH OTHER YEAST  (DONE - 2009-02-10 - Hiram)
#	the sequences are the same set from 2003 that were used in sacCer1
#	so, just link to them and rerun the alignment
# in the directory /cluster/data/sacCer1/bed/otherYeast/align
    mkdir /hive/data/genomes/sacCer2/bed/otherYeast
    cd /hive/data/genomes/sacCer2/bed/otherYeast
    for D in sacBay sacCas sacKlu sacKud sacMik sacPar
do
    ln -s ../../../sacCer1/bed/otherYeast/${D} .
done
    mkdir align
    cd align
    ln -s ../../../../sacCer1/bed/otherYeast/align/sac??? .
    #	helpful to have 2bit files for chainAntiRepeat
    for G in sac???
do
    faToTwoBit ${G} ${G}.2bit
done


    # Create directory full of size info
    mkdir sizes
    for F in sac???
    do
	faSize $F -detailed > sizes/$F
    done

    # Create some working directories
    mkdir lav axtAll axtBest mafIn mafRawOut mafOut

    # Create little shell script to do blastz alignments
    cat << '_EOF_' > lastz.csh
#!/bin/csh -ef
/cluster/bin/penn/$MACHTYPE/lastz $1 $2 > $3
'_EOF_'
    # << happy emacs
    chmod +x lastz.csh

    # create sacCer2 individual chr.fa files:
    mkdir sacCer2
    twoBitInfo ../../../sacCer2.2bit stdout | cut -f1 | while read C
do
    twoBitToFa ../../../sacCer2.2bit:${C} sacCer2/${C}.fa
    echo "${C} done"
done
-
    # Create job spec to do all blastz alignments
    ls -1 sacCer2/*.fa > sacCer.list
    ls -1 sac??? > other.list
    cat << '_EOF_' > template
#LOOP
lastz.csh $(path1) $(path2) {check out line+ lav/$(root1).$(root2)}
#ENDLOOP
'_EOF_'
    # << happy emacs
    ssh swarm
    cd /hive/data/genomes/sacCer2/bed/otherYeast/align
    gensub2 sacCer.list other.list template jobList

    # Convert from lav to psl
    for L in lav/*
do
    B=`basename $L`
    echo $B
    lavToPsl $L stdout | gzip > psl/$B.psl.gz
done

    #	chaining
    mkdir -p axtChain/run
    cd axtChain/run
    ls -d ../../sac??? | xargs -L 1 basename > species.list
    cut -f1 ../../../../../chrom.sizes > chrom.list
    cat << '_EOF_' > chain.csh
#!/bin/csh -ef
set S = $1
set C = $2
set IN = ../../psl/${C}.${S}.psl.gz
set OUT = ${S}/${C}.chain
mkdir -p ${S}
zcat ${IN} \
| axtChain -psl -verbose=0  -minScore=1000 -linearGap=medium stdin \
    /hive/data/genomes/sacCer2/sacCer2.2bit \
    -faQ ../../${S} stdout \
| chainAntiRepeat /hive/data/genomes/sacCer2/sacCer2.2bit \
    ../../${S}.2bit stdin ${OUT}
'_EOF_'
    # << happy emacs
    chmod +x chain.csh

    cat << '_EOF_' > template
#LOOP
chain.csh $(path1) $(path2) {check out line+ $(root1)/$(root2).chain}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 species.list chrom.list template jobList
    para create jobList
    para push
    para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:         73s       1.21m     0.02h    0.00d  0.000 y
# IO & Wait Time:                  1100s      18.34m     0.31h    0.01d  0.000 y
# Average job time:                  11s       0.18m     0.00h    0.00d
# Longest finished job:              12s       0.20m     0.00h    0.00d
# Submission to last job:            38s       0.63m     0.01h    0.00d

    cd ..
    #	merge the individual results
    foreach S (`cat run/species.list`)
echo $S
find ./run/${S} -name "*.chain" \
    | chainMergeSort -inputList=stdin | gzip -c > sacCer2.${S}.all.chain.gz
end
    #	split them again to get consistent numbering
    foreach S (`cat run/species.list`)
echo $S
chainSplit chain.${S} sacCer2.${S}.all.chain.gz
end

    #	netting the chains
    cat << '_EOF_' > netChains.csh
#!/bin/csh -ef

set S0 = "../../../../chrom.sizes"

foreach S (`cat run/species.list`)
    echo $S
    chainPreNet sacCer2.${S}.all.chain.gz ${S0} ../sizes/${S} stdout \
    | chainNet stdin -minSpace=1 ${S0} ../sizes/${S} stdout /dev/null \
        | netSyntenic stdin noClass.${S}.net
end
'_EOF_'
    # << happy emacs
    chmod +x netChains.csh
    ./netChains.csh

    cat << '_EOF_' > mkMafs.csh
#!/bin/csh -ef

set TOP = `pwd`
set S0 = "/hive/data/genomes/sacCer2/chrom.sizes"

foreach S (`cat run/species.list`)
    # Make axtNet for download: one .axt per sacCer2 seq.
    netSplit noClass.${S}.net net.${S}
    cd ..
    rm -fr axtNet.${S}
    mkdir axtNet.${S}
    foreach f (axtChain/net.${S}/*.net)
    netToAxt $f axtChain/chain.${S}/$f:t:r.chain \
      /hive/data/genomes/sacCer2/sacCer2.2bit ${S}.2bit stdout \
      | axtSort stdin stdout \
      | gzip -c > axtNet.${S}/$f:t:r.sacCer2.${S}.net.axt.gz
    end

    # Make mafNet for multiz: one .maf per sacCer2 seq.
    rm -fr mkdir mafNet.${S}
    mkdir mafNet.${S}
    foreach f (axtNet.${S}/*.sacCer2.${S}.net.axt.gz)
      axtToMaf -tPrefix=sacCer2. -qPrefix=${S}. $f \
            ${S0} sizes/${S} stdout \
      | gzip -c > mafNet.${S}/$f:t:r:r:r:r:r.maf.gz
    end
    cd ${TOP}
end
'_EOF_'
    # << happy emacs
    chmod +x mkMafs.csh
    ./mkMafs.csh

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-10 braney )
    # bash  if not using bash shell already

    cd /cluster/data/sacCer2
    mkdir /cluster/data/sacCer2/blastDb

    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  sacCer2.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  sacCer2.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y 
    rm temp.fa less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/hive/data/outside/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 14

    mkdir -p /cluster/data/sacCer2/bed/tblastn.hg18KG
    cd /cluster/data/sacCer2/bed/tblastn.hg18KG
    echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 14 query.lst

   # we want around 50000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(50000/`wc query.lst | awk '{print $1}'`\)

# 36727/(50000/14) = 10.283560

# chose 100 because 10 is too small
   mkdir -p kgfa
   split -l 100 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   wc kg.lst
# 368  368 4784 kg.lst

   mkdir -p blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/sacCer2/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/sacCer2/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then                  
            mv $3.tmp $3     
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0               
    fi                      
fi                         
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    exit 
    
    ssh swarm
    cd /cluster/data/sacCer2/bed/tblastn.hg18KG
    gensub2 query.lst kg.lst blastGsub blastSpec
    para create blastSpec
#    para try, check, push, check etc.

    para time
# Completed: 5152 of 5152 jobs
# CPU time in finished jobs:      71064s    1184.40m    19.74h    0.82d  0.002 y
# IO & Wait Time:                 16915s     281.92m     4.70h    0.20d  0.001 y
# Average job time:                  17s       0.28m     0.00h    0.00d
# Longest finished job:              69s       1.15m     0.02h    0.00d
# Submission to last job:           156s       2.60m     0.04h    0.00d

    ssh swarm
    cd /cluster/data/sacCer2/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS ../blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    para create chainSpec
    para try, check, push, check etc.

# Completed: 368 of 368 jobs
# CPU time in finished jobs:          5s       0.09m     0.00h    0.00d  0.000 y
# IO & Wait Time:                  1487s      24.78m     0.41h    0.02d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              10s       0.17m     0.00h    0.00d
# Submission to last job:            37s       0.62m     0.01h    0.00d

    cd /cluster/data/sacCer2/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
# checked: 5050 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /cluster/data/sacCer2/bed/tblastn.hg18KG
    hgLoadPsl sacCer2 blastHg18KG.psl

    # check coverage
    featureBits sacCer2 blastHg18KG 
# 1261725 bases of 12162995 (10.373%) in intersection

    featureBits sacCer2 blastHg18KG sgdGene  -enrichment
# blastHg18KG 10.373%, sgdGene 72.883%, both 10.320%, cover 99.49%, enrich 1.37x

    rm -rf blastOut
#end tblastn
###########################################################################
## 7-Way Multiz (DONE - 2009-02-11 - Hiram)
    mkdir /hive/data/genomes/sacCer2/bed/multiz7way
    cd /hive/data/genomes/sacCer2/bed/multiz7way

    #	using the phylogenetic tree as mentioned at:
    #	http://www.genetics.wustl.edu/saccharomycesgenomes/yeast_phylogeny.html
    #	and from Jim's note: genomes/sacCer1/bed/otherYeast/README 
    # ((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)
    echo "((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)" \
	> tree.nh
    echo sacCer2 sacPar sacMik sacKud sacBay sacCas sacKlu > species.list
    mkdir mafLinks
    for S in sacPar sacMik sacKud sacBay sacCas sacKlu
do
    mkdir mafLinks/${S}
    cd mafLinks/${S}
    ln -s ../../../otherYeast/align/mafNet.${S}/*.maf.gz .
    cd ../..
done
    find -L ./mafLinks -type f | xargs -L 1 basename \
	| sed -e "s/.maf.gz//" | sort -u > maf.list

    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
    cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
    cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = sacCer2
set c = $1
set result = $2
set run = `pwd`
set tmp = $run/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/sacCer2/bed/multiz7way/mafLinks
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../tree.nh ../species.list $tmp
pushd $tmp
foreach s (`sed -e "s/ $db//" species.list`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
    else if (-e $in) then
        ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
/bin/rmdir --ignore-fail-on-non-empty $run/tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs

    gensub2 ../maf.list single template jobList
    para create jobList
# Completed: 18 of 18 jobs
# CPU time in finished jobs:        710s      11.83m     0.20h    0.01d  0.000 y
# IO & Wait Time:                   127s       2.12m     0.04h    0.00d  0.000 y
# Average job time:                  47s       0.78m     0.01h    0.00d
# Longest finished job:             105s       1.75m     0.03h    0.00d
# Submission to last job:           144s       2.40m     0.04h    0.00d
# Estimated complete:                 0s       0.00m     0.00h    0.00d

    # load MAF tables
    ssh hgwdev
    mkdir -p /gbdb/sacCer2/multiz7way/maf
    cd /hive/data/genomes/sacCer2/bed/multiz7way/maf
    ln -s `pwd`/*.maf /gbdb/sacCer2/multiz7way/maf

    # allow the temporary multiz7way.tab file to be created in tmp
    cd /data/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/sacCer2/multiz7way/maf sacCer2 multiz7way
    #	real    0m4.903s
    #	Loaded 38795 mafs in 18 files from /gbdb/sacCer2/multiz7way/maf
    # load summary table
    time nice -n +19 cat /gbdb/sacCer2/multiz7way/maf/*.maf \
	| hgLoadMafSummary sacCer2 -mergeGap=1500 multiz7waySummary stdin
    #	real    0m5.536
    #	Created 3037 summary blocks from 73806 components and 38795 mafs

#########################################################################
## phastCons conservation (DONE - 2009-02-11 - Hiram)
    #	Create SS files for each chromosome:
    mkdir /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
    cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
    mkdir SS
    
    # split up alignments; no need to use cluster here
    MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut
    FA=/cluster/data/sacCer1
    CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M"
    mkdir -p SS
    for C in `(cd ../maf; ls *.maf | sed -e "s/.maf//")`
do
    echo $C
    /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
	../maf/${C}.maf -M ../../otherYeast/align/sacCer2/${C}.fa -i MAF \
	-O sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \
	-o SS > SS/${C}.ss
done

    # produce rough starting model
    /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
	-i SS --aggregate sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \
	-o SS -z SS/*.ss > all.ss

    /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloFit \
	-i SS all.ss \
	--tree "((((((sacCer2,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" \
	    -o starting-tree

    # (cheat the branch lengths up a bit, since this represents an
    # average of conserved and nonconserved sites; we want to
    # initialize for nonconserved)
    /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/tree_doctor \
	--scale 2 starting-tree.mod > tmp.mod
    mv tmp.mod starting-tree.mod 

    # also get average GC content of aligned regions
    /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
	-i SS all.ss --summary-only
#descrip.                      A          C          G          T        G+C  length   all_gaps  some_gaps
#all.ss                   0.3056     0.1950     0.1948     0.3047     0.3898   13130139          0    2122279

    # save some I/O and space
    gzip SS/*.ss
    
    # now set up cluster job to estimate model parameters.  See
    # other make{hg18|mm9}.doc for add'l details

    cat << '_EOF_' > doEstimate.sh
#!/bin/sh
zcat SS/$1.ss.gz | \
	/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons \
	- starting-tree.mod --gc 0.3898 --nrates 1,1 --no-post-probs \
	--ignore-missing --expected-lengths 75 --target-coverage 0.5 \
	--quiet --log $2 --estimate-trees $3
'_EOF_'
    # << happy emacs
    chmod +x doEstimate.sh

    rm -fr LOG TREES
    mkdir  LOG TREES
    cat << '_EOF_' > template
#LOOP
doEstimate.sh $(root1) {check out line+ LOG/$(root1).log} TREES/$(root1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    (cd SS; ls *.ss.gz) | sed -e "s/.ss.gz//" > chr.list
    gensub2 chr.list single template jobList
    para create jobList
    para push
    para time
# Completed: 18 of 18 jobs
# CPU time in finished jobs:       8152s     135.86m     2.26h    0.09d  0.000 y
# IO & Wait Time:                   323s       5.39m     0.09h    0.00d  0.000 y
# Average job time:                 471s       7.85m     0.13h    0.01d
# Longest finished job:             889s      14.82m     0.25h    0.01d
# Submission to last job:           895s      14.92m     0.25h    0.01d

    # Now combine parameter estimates.  
    ls TREES/*.cons.mod > cons.txt
    /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \
	--read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
    ls TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \
	--read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt

    # Now set up cluster job for computing consevation scores and
    # predicted elements
    cat << '_EOF_' > doPhastCons.sh
#!/bin/sh

mkdir -p POSTPROBS ELEMENTS
chr=$1
out=$2
tmpFile=/scratch/tmp/phastCons.$chr
rm -f $tmpFile
zcat SS/$chr.ss.gz \
    | /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons - \
	ave.cons.mod,ave.noncons.mod --expected-lengths 75 \
	--target-coverage 0.5 --quiet --seqname $chr --idpref $chr \
	--viterbi ELEMENTS/$chr.bed --score --require-informative 0 > $tmpFile
gzip -c $tmpFile > POSTPROBS/$chr.pp.gz
rm $tmpFile
'_EOF_'
    # << happy emacs
    chmod u+x doPhastCons.sh

    cat << '_EOF_' > run.template
#LOOP
doPhastCons.sh $(root1) {check out exists+ POSTPROBS/$(root1).pp.gz} {check out line+ ELEMENTS/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 chr.list single run.template run.jobList
    rm -fr POSTPROBS ELEMENTS
    para create run.jobList
    para push
    para time
# Completed: 18 of 18 jobs
# CPU time in finished jobs:         78s       1.30m     0.02h    0.00d  0.000 y
# IO & Wait Time:                   658s      10.97m     0.18h    0.01d  0.000 y
# Average job time:                  41s       0.68m     0.01h    0.00d
# Longest finished job:              53s       0.88m     0.01h    0.00d
# Submission to last job:            57s       0.95m     0.02h    0.00d

    # set up phastConsElements track
    cat ELEMENTS/*.bed | sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin \
		> phastConsElements7way.bed

    hgLoadBed sacCer2 phastConsElements7way phastConsElements7way.bed
    featureBits sacCer2 phastConsElements7way
# 7762587 bases of 12162995 (63.821%) in intersection
    #	previously this was:
    featureBits sacCer1 phastConsElements
# 7517116 bases of 12156302 (61.837%) in intersection

    mkdir ../downloads
    mkdir ../downloads/phastCons7way
    for C in `cat chr.list`
do
    cp -p POSTPROBS/${C}.pp.gz \
	../downloads/phastCons7way/sacCer2.${C}.wigFixed.gz
done

    #	encode those files into wiggle data
    zcat ../downloads/phastCons7way/*.wigFixed.gz \
	| wigEncode stdin phastCons7way.wig phastCons7way.wib
    #	Converted stdin, upper limit 1.00, lower limit 0.00

    # Load gbdb and database with wiggle.
    cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
    ln -s `pwd`/phastCons7way.wib /gbdb/sacCer2/multiz7way/phastCons7way.wib
    hgLoadWiggle -pathPrefix=/gbdb/sacCer2/multiz7way sacCer2 \
	phastCons7way phastCons7way.wig

    #  Create histogram to get an overview of all the data
    hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=sacCer2 phastCons7way > histogram.data 2>&1

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Yeast sacCer2 Histogram phastCons7way track"
set xlabel " phastCons7way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    #	To create the tree diagram for the details page, use this tree
    #	definition in http://genome.ucsc.edu/cgi-bin/phyloGif

((((((S._cerevisiae,S._paradoxus),S._mikatae),S._kudriavzevii),S._bayanus),S._castelli),S._kluyveri)

#########################################################################
## Annotate the sacCer2 7-way sequence with genes
    mkdir /hive/data/genomes/sacCer2/bed/multiz7way/anno
    cd /hive/data/genomes/sacCer2/bed/multiz7way/anno
    mkdir genes
    # using sgdGene
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from sgdGene" sacCer2 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > genes/sacCer2.sgdGene.gz

    (cat ../maf/*.maf | genePredToMafFrames sacCer2 stdin stdout \
sacCer2 genes/sacCer2.sgdGene.gz \
    | gzip > multiz7way.mafFrames.gz) > frames.log 2>&1

    zcat multiz7way.mafFrames.gz \
	| sort -k1,1 -k2,2n | hgLoadMafFrames sacCer2 multiz7wayFrames stdin

############################################################################
# creating upstream mafs (DONE - 2009-06-26 - Hiram)
    ssh hgwdev
    cd /hive/data/genomes/sacCer2/goldenPath/multiz7way
    # run this bash script
    cat << '_EOF_' > mkUpstream.sh
DB=sacCer2
GENE=sgdGene
NWAY=multiz7way
export DB GENE

for S in 1000 2000 5000
do
    echo "making upstream${S}.${GENE}.maf"
    echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout"
    featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | mafFrags ${DB} ${NWAY} stdin stdout \
                -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list \
        | gzip -c > upstream${S}.${GENE}.maf.gz
    echo "done upstream${S}.${GENE}.maf.gz"
done
'_EOF_'
    # << happy emacs
    chmod +x mkUpstream.sh
    ./mkUpstream.sh

#########################################################################
## simpleRepeats (DONE - 2009-02-12 - Hiram)
    mkdir /hive/data/genomes/sacCer2/bed/simpleRepeat
    cd /hive/data/genomes/sacCer2/bed/simpleRepeat
    doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \
	-workhorse=hgwdev sacCer2 > do.log 2>&1

#########################################################################
## Regulatory Code (DONE - 2009-02-17 - Hiram)
    #	liftOver from sacCer1
    mkdir /hive/data/genomes/sacCer2/bed/transRegCode
    cd /hive/data/genomes/sacCer2/bed/transRegCode
    #	lifting sacCer1 data to this assembly
    hgsql -N -e "select chrom,chromStart,chromEnd,name,score,chipEvidence,consSpecies from transRegCode" sacCer1 \
    | liftOver -bedPlus=5 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
	transRegCode.lifted.bed transRegCode.unMapped.bed

    hgsql -N -e "select chrom,chromStart,chromEnd,name,tfCount,tfList,bindVals fromtransRegCodeProbe;" sacCer1 \
    | liftOver -bedPlus=4 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
        transRegCodeProbe.lifted.bed transRegCodeProbe.unMapped.bed


    hgLoadBed sacCer2 transRegCode transRegCode.lifted.bed \
        -sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql
    #	Loaded 206672 elements of size 7
    hgLoadBed sacCer2 transRegCodeProbe transRegCodeProbe.lifted.bed \
        -sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab
    #	Loaded 6178 elements of size 7

    hgsql sacCer2 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql
    hgsql sacCer2 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql
    hgsql sacCer2 < $HOME/kent/src/hg/lib/growthCondition.sql

    hgsql -N -e "select * from transRegCodeCondition;" sacCer1 \
    | hgsql sacCer2 -e \
'load data local infile "/dev/stdin" into table transRegCodeCondition'

    hgsql -N -e "select * from transRegCodeMotif;" sacCer1 \
    | hgsql sacCer2 -e \
'load data local infile "/dev/stdin" into table transRegCodeMotif'

    hgsql -N -e "select * from growthCondition;" sacCer1 \
    | hgsql sacCer2 -e \
'load data local infile "/dev/stdin" into table growthCondition'

#########################################################################
# Oreganno track - (DONE - 2009-02-17 - Hiram)
    #	liftOver from sacCer1 database
    hgsql -N -e \
"select chrom,chromStart,chromEnd,id,strand,name from oreganno;" sacCer1 \
    | liftOver -bedPlus=4 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
        oreganno.lifted.bed oreganno.unMapped.bed

    hgsql sacCer2 < $HOME/kent/src/hg/lib/oreganno.sql

    hgLoadBed -oldTable sacCer2 oreganno oreganno.lifted.bed -tab
    #	Loaded 7302 elements of size 6

    #	and load non-positional tracks from sacCer1:
    hgsql -N -e "select * from oregannoAttr;" sacCer1 \
	| hgLoadSqlTab -oldTable sacCer2 oregannoAttr \
	    ~/humPhen/kent/src/hg/lib/oreganno.sql stdin
    hgsql -N -e "select * from oregannoLink;" sacCer1 \
	| hgLoadSqlTab -oldTable sacCer2 oregannoLink \
	    ~/humPhen/kent/src/hg/lib/oreganno.sql stdin

#########################################################################
# Regulatory Module - (DONE - 2009-02-17 - Hiram)
    mkdir /hive/data/genomes/sacCer2/bed/regModule
    cd /hive/data/genomes/sacCer2/bed/regModule
    #	liftOver data from sacCer1
    hgsql -N -e \
"select chrom,chromStart,chromEnd,name,score,strand from esRegUpstreamRegion;" \
	sacCer1 | liftOver -bedPlus=6 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
        esRegUpstreamRegion.lifted.bed esRegUpstreamRegion.unMapped.bed

    hgsql -N -e \
"select chrom,chromStart,chromEnd,name,score,strand,gene from esRegGeneToMotif;" \
	sacCer1 | liftOver -bedPlus=6 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
        esRegGeneToMotif.lifted.bed esRegGeneToMotif.unMapped.bed

    #	I do not see instructions in sacCer1 to create these tables,
    #	so, dump their schemas:
    hgsqldump --all -c -d --tab=. sacCer1 esRegUpstreamRegion esRegGeneToMotif
    #	and data for these other two:
    hgsqldump --all -c --tab=. sacCer1 esRegGeneToModule esRegMotif

    hgLoadBed sacCer2 esRegGeneToMotif -sqlTable=esRegGeneToMotif.sql -tab \
	esRegGeneToMotif.lifted.bed
    #	Loaded 4002 elements of size 7
    hgLoadBed sacCer2 esRegUpstreamRegion -sqlTable=esRegUpstreamRegion.sql \
	-tab esRegUpstreamRegion.lifted.bed
    #	Loaded 1670 elements of size 6
    hgsql sacCer2 < esRegMotif.sql
    hgsql sacCer2 -e \
'load data local infile "esRegMotif.txt" into table esRegMotif;'
    hgsql sacCer2 < esRegGeneToModule.sql
    hgsql sacCer2 -e \
'load data local infile "esRegGeneToModule.txt" into table esRegGeneToModule;'

#########################################################################
# creating tables for Gene Sorter (DONE - 2009-02-17 - Hiram)
    mkdir /hive/data/genomes/sacCer2/bed/hgNear
    cd /hive/data/genomes/sacCer2/bed/hgNear
    hgClusterGenes sacCer2 sgdGene sgdIsoforms sgdCanonical
    #	Got 6550 clusters, from 6717 genes in 18 chromosomes

    # Make self mapping table for expression. 
    hgsql -N -e 'select name from sgdGene;' sacCer2 \
         | awk '{printf("%s\t%s\n", $1, $1);}' > sgdToSgd.tab
    hgsql sacCer2 -e 'create table sgdToSgd ( \
          name varchar(10) not null, \
	  value varchar(10) not null, \
	  PRIMARY KEY(name), \
	  UNIQUE (value));'
    hgsql sacCer2 \
	-e 'load data local infile "sgdToSgd.tab" into table sgdToSgd'
    # Removed sgdToSgd table because it wasn't being used anywhere.
    # Katrina 7/14/09


    # Make expression similarity table. 
    hgExpDistance sacCer2 hgFixed.yeastChoCellCycle \
	hgFixed.yeastChoCellCycleExps choExpDistance 
    #	Have 6259 elements in hgFixed.yeastChoCellCycle
    #	Got 6259 unique elements in hgFixed.yeastChoCellCycle
    #	Made choExpDistance.tab

#########################################################################
# running the blastP operation to the other genomes for the gene sorter
#	(DONE - 2009-02-18 - Hiram)
    mkdir /hive/data/genomes/sacCer2/bed/hgNearBlastp
    cd /hive/data/genomes/sacCer2/bed/hgNearBlastp
    mkdir tmp 090218
    pepPredToFa sacCer2 sgdPep 090218/sgdPep.faa

    pepPredToFa hg18 knownGenePep 090218/hg18.known.faa
    pepPredToFa mm9 knownGenePep 090218/mm9.known.faa
    pepPredToFa rn4 knownGenePep 090218/rn4.known.faa
    pepPredToFa danRer5 ensPep 090218/danRer5.ensPep.faa
    pepPredToFa dm3 flyBasePep 090218/dm3.flyBasePep.faa
    pepPredToFa ce6 sangerPep 090218/ce6.sangerPep.faa

    # sanity check, number of lines in each faa file

    cd 090218
    cat << '_EOF_' > config.ra
# Latest Yeast vs. other Gene Sorter orgs:
# human, mouse, rat, zebrafish, fly, worm

targetGenesetPrefix known
targetDb sacCer2
queryDbs hg18 mm9 rn4 danRer5 dm3 ce6

sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa
hg18Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/hg18.known.faa
mm9Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/mm9.known.faa
rn4Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/rn4.known.faa
danRer5Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/danRer5.ensPep.faa
dm3Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/dm3.flyBasePep.faa
ce6Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/ce6.sangerPep.faa

buildDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218
scratchDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/tmp
'_EOF_'
    # << happy emacs
    #	takes about an hour
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doHgNearBlastp.pl \
	config.ra > do.log 2>&1 &
    #	real    21m32.343s

    #	one name seems to have snuck in here:
    cd /hive/data/genomes/sacCer2/bed/hgNearBlastp
    hgsql -N -e "select query from mmBlastTab;" sacCer2 \
	| sort -u > sacCer2.mmBlastTab.query.txt
    hgsql -N -e "select name from sgdGene;" sacCer2 \
	| sort -u > sacCer2.sgdGene.name.txt
    #	the single one is:
    comm -23 sacCer2.mmBlastTab.query.txt sacCer2.sgdGene.name.txt
    #	YDL038C
    #	it was the same in all of them:
    hgsql -e "delete from mmBlastTab where query=\"YDL038C\";" sacCer2
    hgsql -e "delete from drBlastTab where query=\"YDL038C\";" sacCer2
    hgsql -e "delete from dmBlastTab where query=\"YDL038C\";" sacCer2
    hgsql -e "delete from ceBlastTab where query=\"YDL038C\";" sacCer2

    hgsql -N -e "select query from knownBlastTab;" sacCer2 \
	| sort -u > sacCer2.knownBlastTab.query.txt
    comm -23 sacCer2.knownBlastTab.query.txt sacCer2.sgdGene.name.txt \
	| while read N
do
    hgsql -e "delete from knownBlastTab where query=\"${N}\";" sacCer2
done

# knownBlastTab does not run any columns in yeast GS.  
 RENAME TABLE knownBlastTab TO sgdBlastTab;
#  kuhn and katrina 06-18-2009

#########################################################################
#  creating download files and pushQ (DONE - 2009-02-24 - Hiram)
    cd /hive/data/genoems/sacCer2
    #	there aren't any repeats on 2micron
    touch bed/simpleRepeat/trfMaskChrom/2micron.bed
    #	and, there are no RM files:
    makeDownloads.pl -ignoreRepeatMasker sacCer2
    #	edit the README files in:
    #	./goldenPath/bigZips/README.txt
    #	./goldenPath/database/README.txt
    #	./goldenPath/liftOver/README.txt
    #	./goldenPath/chromosomes/README.txt

    mkdir pushQ
    makePushQSql.pl sacCer2 > sacCer2.pushQ.sql
    #	one warning:
    #	sacCer2 does not have seq
    #	it could not identify the following tables:
    #	2micron_est
    #	2micron_gap
    #	2micron_gold
    #	2micron_intronEst
    #	2micron_mrna
    #	growthCondition
    #	sgdToPfam
    #	yeastP2P
    scp -p sacCer2.pushQ.sql hiram@hgwbeta:/tmp

    ssh hgwbeta
    hgsql qapushq < sacCer2.pushQ.sql

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("sacCer2", "blat10", "17792", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("sacCer2", "blat10", "17793", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# uwFootprints: (2009-04-04 markd)
# lifted sacCer1 tag count data and set to Nobel lab
#   William Stafford Noble <noble@gs.washington.edu>
#   Xiaoyu Chen <xchen@cs.washington.edu>

    # lift counts to sacSer2 and give back to UW
    cd /hive/data/genomes/sacCer1/bed/uwFootprintsxi
    wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.bed
    liftOver yeast.dnaseI.tagCounts.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.bed  /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.nolift.bed
    
    mkdir /hive/data/genomes/sacCer2/bed/uwFootprints
    cd /hive/data/genomes/sacCer2/bed/uwFootprints

    # get back wig
    wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.sacCer2.lifted.wig

    # lift other BEDs
    liftOver /hive/data/genomes/sacCer1/bed/uwFootprints/yeast.mappability.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz yeast.mappability.bed yeast.mappability.nolift.bed
    # drop yeast.footprints.bed and truncate name to three decimal places
    tawk 'NR>1{print $1,$2,$3,sprintf("%0.3f",$4)}'  /hive/data/genomes/sacCer1/bed/uwFootprints/yeast.footprints.bed | liftOver stdin /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz yeast.footprints.bed yeast.footprints.nolift.bed

    wigEncode yeast.dnaseI.tagCounts.sacCer2.lifted.wig uwFootprintsTagCounts.wig uwFootprintsTagCounts.wib
    # Converted yeast.dnaseI.tagCounts.sacCer2.lifted.wig, upper limit 13798.00, lower limit 1.00

    ln -sf /hive/data/genomes/sacCer2/bed/uwFootprints/uwFootprintsTagCounts.wib /gbdb/sacCer2/wib
    hgLoadWiggle -tmpDir=/data/tmp sacCer2 uwFootprintsTagCounts uwFootprintsTagCounts.wig 

    hgLoadBed -tab -tmpDir=/data/tmp sacCer2 uwFootprintsMappability yeast.mappability.bed
    hgLoadBed -tmpDir=/data/tmp sacCer2 uwFootprintsPrints yeast.footprints.bed

############################################################################
# fixup sgdToName table (DONE - 2009-07-09 - Hiram)
    # this table is missing a name correspondence for some of
    #	the gene names in sgdGene.name
    #	to fixup, any names in sgdGene.name that are not in sgdToName,
    #	simply add those names and reference themselves
    mkdir /hive/data/genomes/sacCer2/bed/fixSgdToName
    cd /hive/data/genomes/sacCer2/bed/fixSgdToName
    hgsql -N -e "select name from sgdGene;" sacCer2 | sort -u > sgdGene.name
    hgsql -N -e "select name from sgdToName;" sacCer2 > sgdToName.tab
    #	convert the two columns of names to a single list of unique names
    cat sgdToName.tab | tr '[\t]' '[\n]' | sort -u > all.sgdToName.name
    comm -12 sgdGene.name all.sgdToName.name > commonToBoth
    comm -23 sgdGene.name all.sgdToName.name > uniqueToSgdGene
    comm -13 sgdGene.name all.sgdToName.name > uniqueToSgdToName
    awk '{printf "%s\t%s\n", $1, $1}' uniqueToSgdGene > addSgdGeneNames.tab
    #	count before load
    hgsql -e "select count(*) from sgdToName;" sacCer2
    #	6254
    # adding names:
    wc -l addSgdGeneNames.tab
    #	472
    #	6254 + 472 = 6726
    hgsql sacCer2 \
	-e 'load data local infile "addSgdGeneNames.tab" into table sgdToName;'
    hgsql -e "select count(*) from sgdToName;" sacCer2
    #	6726

# Removed extraneous records from sgdToName (DONE - 2009-07-10 - Katrina)
	#sgdGene has 6717 records (all of which have a distinct value in the
	#name field) and sgdToName has 6726 (all of which have a distinct value
	#in the name field).I searched for these 9 gene names in the SGD web
	#site (yeastgenome.org). Most are alias' of another gene or something
	#that was originally characterized as a separate gene but later found
	#to be a part of another gene.
	mysql> delete from sgdToName where name= "YBL039W-A";
	mysql> delete from sgdToName where name= "YBL101W-A";
	mysql> delete from sgdToName where name= "YBL101W-C";
	mysql> delete from sgdToName where name= "YDL038C";
	mysql> delete from sgdToName where name= "YDR524W-A";
	mysql> delete from sgdToName where name= "YGR272C";
	mysql> delete from sgdToName where name= "YLR157W-A";
	mysql> delete from sgdToName where name= "YLR157W-C";
	mysql> delete from sgdToName where name= "YNL097C-A";

############################################################################
# for some unknown reason, indexes missing on chrM_gold and chrM_gap
#	maybe because they have only one and zero items ?
#		(DONE - 2009-07-21 - Hiram)
    hgsql -e "alter table chrM_gold add index chromStart (chromStart);" sacCer2
    hgsql -e "alter table chrM_gold add index bin (bin);" sacCer2
    hgsql -e "alter table chrM_gold add index frag (frag(9));" sacCer2
    hgsql -e "alter table chrM_gap add index chromStart (chromStart);" sacCer2
    hgsql -e "alter table chrM_gap add index bin (bin);" sacCer2

############################################################################
# LIFTOVER TO SacCer1 (DONE - 2010-01-14 - Hiram )
    mkdir /hive/data/genomes/sacCer2/bed/blat.sacCer1.2010-01-14
    cd /hive/data/genomes/sacCer2/bed/blat.sacCer1.2010-01-14
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug sacCer2 sacCer1
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
	 sacCer2 sacCer1 > do.log 2>&1
    #	real    3m42.700s

#############################################################################
# Update BlastTab tables of sacCer2	(DONE, 2010-08-06, Fan)
    cd /hive/data/genomes/sacCer2/bed/hgNearBlastp
    mkdir 100806
    cd    100806

    pepPredToFa sacCer2 sgdPep sgdPep.faa

    pepPredToFa hg19 knownGenePep hg19.known.faa
    pepPredToFa mm9 knownGenePep mm9.known.faa
    pepPredToFa rn4 knownGenePep rn4.known.faa
    pepPredToFa danRer6 ensPep danRer6.ensPep.faa
    pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
    pepPredToFa ce6 sangerPep ce6.sangerPep.faa

    # sanity check, number of lines in each faa file

    cat << '_EOF_' > config.ra
# Latest Yeast vs. other Gene Sorter orgs:
# human, mouse, rat, zebrafish, fly, worm

targetGenesetPrefix sgd
targetDb sacCer2
queryDbs hg19 mm9 rn4 danRer6 dm3 ce6

sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/sgdPep.faa
hg19Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/hg19.known.faa
mm9Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/mm9.known.faa
rn4Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/rn4.known.faa
danRer6Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/danRer6.ensPep.faa
dm3Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/dm3.flyBasePep.faa
ce6Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/ce6.sangerPep.faa

buildDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806
scratchDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/100806/tmp
'_EOF_'

doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log

#########################################################################
# LIFTOVER TO SacCer3 (DONE - 2009-01-04 - Hiram )
    mkdir /hive/data/genomes/sacCer2/bed/blat.sacCer3.2011-08-31
    cd /hive/data/genomes/sacCer2/bed/blat.sacCer3.2011-08-31
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug sacCer2 sacCer3
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	 sacCer2 sacCer3 > do.log 2>&1
    #	real    3m30.878s

#############################################################################

##########################################################################pubStart
# Publications track (DONE - 04-27-12 - Max)

# article download and conversion is run every night on hgwdev:
# 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh
# the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/
# then converts them to text into /hive/data/outside/literature/{pmc,elsevier}

# all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py

# data processing was run manually like this
export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/
# pmc
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc
ssh swarm 
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat steps:annot-tables
exit
pubBlat load

# elsevier
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier
ssh swarm 
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat steps:annot-tables
exit
pubBlat load
#--pubEnd