# for emacs: -*- mode: sh; -*- # Caenorhabditis elegans # Washington University School of Medicine GSC and Sanger Institute WS170 # $Id: ce4.txt,v 1.34 2008/06/24 23:00:53 hiram Exp $ ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2007-02-23 - Hiram) ssh kkstore02 mkdir /cluster/store5/worm/ce4 ln -s /cluster/store5/worm/ce4 /cluster/data/ce4 cd /cluster/data/ce4 mkdir WS170 cd WS170 wget --timestamping -m -np -nd \ "ftp://ftp.sanger.ac.uk/pub/wormbase/WS170/CHROMOSOMES/" ######################################################################### ## Translate Sanger/Wormbase chrom names into UCSC names ## (DONE - 2007-02-23 - Hiram) ## bash syntax, this sequence captured in toUCSC_Fa.sh WSxxx=WS170 export WSxxx if [ -d downloads ]; then mv downloads downloads.0 rm -fr downloads.0 & fi mkdir downloads mkdir downloads/fasta mkdir downloads/agp for C in I II III IV V X do echo -n "${C} " zcat ${WSxxx}/CHROMOSOME_${C}.dna.gz | tr '[a-z]' '[A-Z]' | \ sed -e "s/CHROMOSOME_/chr/" > downloads/fasta/chr${C}.fa sed -e "s/^/chr/" ${WSxxx}/CHROMOSOME_${C}.agp > downloads/agp/chr${C}.agp done C=M echo -n "${C} " zcat ${WSxxx}/CHROMOSOME_MtDNA.dna.gz | tr '[a-z]' '[A-Z]' | \ sed -e "s/CHROMOSOME_MTDNA/chrM/" > downloads/fasta/chr${C}.fa echo "chrM 1 13794 1 F X54252.1 1 13794 +" | sed -e "s/ */\t/g" > \ downloads/agp/chr${C}.agp gzip downloads/fasta/chr*.fa gzip downloads/agp/chr*.agp ######################################################################### ## Create config.ra file, and run makeGenomeDb.pl (DONE - 2007-02-23 - Hiram) ssh kkstore02 cd /cluster/data/ce4 cat << '_EOF_' > ce4.config.ra # Config parameters for makeGenomeDb.pl: db ce4 clade worm genomeCladePriority 10 scientificName Caenorhabditis elegans commonName C. elegans assemblyDate Jan. 
2007 assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS170 orderKey 858 mitoAcc none fastaFiles /cluster/data/ce4/downloads/fasta/chr*.fa.gz agpFiles /cluster/data/ce4/downloads/agp/chr*.agp.gz # qualFiles /dev/null dbDbSpeciesDir worm '_EOF_' # << happy emacs nice -n +19 makeGenomeDb.pl ce4.config.ra > makeGenomeDb.out 2>&1 ########################################################################### ## RepeatMasker (DONE - 2007-02-23 - Hiram) ssh kkstore02 cd /cluster/data/ce4 time nice -n +19 doRepeatMasker.pl ce4 > doRepeatMasker.out 2>&1 & mv doRepeatMasker.out bed/RepeatMasker.2007-02-23 # real 48m37.395s twoBitToFa ce4.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 100281244 bases (0 N's 100281244 real 87184653 upper 13096591 lower) # in 7 sequences in 1 files # %13.06 masked total, %13.06 masked real ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram) # Use -repMatch=40 (based on size -- for human we use 1024, and # C. elegans size is ~3.4% of human judging by gapless ce4 vs. hg18 # genome sizes from featureBits. ssh kolossus cd /cluster/data/ce4 blat ce4.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=36 # Wrote 43729 overused 11-mers to jkStuff/11.ooc cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/ce4 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-03-26,27 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add ce4 just before ce3 # ce4 (C. 
elegans) ce4.serverGenome = /cluster/data/ce4/ce4.2bit ce4.clusterGenome = /san/sanvol1/scratch/worms/ce4/ce4.2bit ce4.ooc = /san/sanvol1/scratch/worms/ce4/11.ooc ce4.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} ce4.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} ce4.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} ce4.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} ce4.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} ce4.maxIntron = 100000 ce4.lift = no ce4.genbank.mrna.xeno.load = no ce4.downloadDir = ce4 cvs ci -m "Added ce4." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh kkstore02 cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial ce4 & # logFile: var/build/logs/2007.03.27-15:39:56.ce4.initalign.log # real 61m33.930s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ce4 # logFile: var/dbload/hgwdev/logs/2007.03.27-16:43:00.dbload.log # real 14m41.601s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add ce4 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added ce4 - C. 
elegans WS170" \ etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### ## Create goldenPath files for hgdownload (DONE - 2007-04-05 - Hiram) ssh kkstore02 mkdir /cluster/data/ce4/goldenPath cd /cluster/data/ce4/goldenPath mkdir bigZips chromosomes database for C in I II III IV V X M do maskOutFa ${C}/chr${C}.fa hard ${C}/chr${C}.fa.masked done tar cvzf goldenPath/bigZips/chromFaMasked.tar.gz */chr*.fa.masked cd chromosomes for C in I II III IV V X M do twoBitToFa -seq=chr${C} ../../ce4.2bit stdout | gzip -c > chr${C}.fa.gz echo chr${C} done done # check the sequence faSize *.fa.gz # 100281244 bases (0 N's 100281244 real 87008909 # upper 13272335 lower) in 7 sequences in 7 files # %13.24 masked total, %13.24 masked real twoBitToFa ../../ce4.2bit stdout | faSize stdin # 100281244 bases (0 N's 100281244 real 87008909 # upper 13272335 lower) in 7 sequences in 1 files # %13.24 masked total, %13.24 masked real cd ../bigZips splitFileByColumn -ending=agp ../../ce4.agp . tar cvzf ./chromAgp.tar.gz ./chr*.agp rm chr*.agp for C in I II III IV V X M do zcat ../chromosomes/chr${C}.fa.gz > chr${C}.fa echo chr${C} done done tar cvzf ./chromFa.tar.gz chr*.fa rm chr*.fa cp -p ../../*/chr*.fa.out . tar cvzf ./chromOut.tar.gz chr*.fa.out rm chr*.fa.out splitFileByColumn -ending=bed ../../bed/simpleRepeat/trfMask.bed . 
tar cvzf ./chromTrf.tar.gz chr*.bed rm chr*.bed ssh hgwdev cd /cluster/data/genbank # get GenBank native mRNAs cd /cluster/data/genbank ./bin/x86_64/gbGetSeqs -db=ce4 -native GenBank mrna \ /cluster/data/ce4/goldenPath/bigZips/mrna.fa # get GenBank xeno mRNAs ./bin/x86_64/gbGetSeqs -db=ce4 -xeno GenBank mrna \ /cluster/data/ce4/goldenPath/bigZips/xenoMrna.fa.gz # get native GenBank ESTs ./bin/x86_64/gbGetSeqs -db=ce4 -native GenBank est \ /cluster/data/ce4/goldenPath/bigZips/est.fa # get native RefSeq mRNAs ./bin/x86_64/gbGetSeqs -db=ce4 -native RefSeq mrna \ /cluster/data/ce4/goldenPath/bigZips/refMrna.fa ssh kkstore02 cd /cluster/data/ce4/goldenPath/bigZips gzip mrna.fa est.fa refMrna.fa md5sum *.gz > md5sum.txt ########################################################################### ## Simple Repeats (DONE - 2007-02-24 - Hiram) ## Documented 31 May 2007 ssh kolossus mkdir /cluster/data/ce4/bed/simpleRepeat cd /cluster/data/ce4/bed/simpleRepeat twoBitToFa ../../ce4.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ssh hgwdev cd /cluster/data/ce4/bed/simpleRepeat nice -n +19 hgLoadBed ce4 simpleRepeat \ /cluster/data/ce4/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits ce4 simpleRepeat > fb.ce4.simpleRepeat.txt 2>&1 # 3801183 bases of 100281244 (3.791%) in intersection ############################################################################ ## Add TRF to RMSK (DONE - 2007-03-04 - Hiram) ssh kkstore02 cd /cluster/data/ce4 cat bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed ce4.rmsk.2bit stdin ce4.rmskTrf.2bit twoBitToFa ce4.rmskTrf.2bit stdout | faSize stdin \ > faSize.ce4.rmskTrf.txt # 100281244 bases (0 N's 100281244 real 87008909 upper 13272335 lower) # in 7 sequences in 1 files # %13.24 masked total, %13.24 masked real 
######################################################################### ######################################################################### ### The sequences below were an experiment beginning early 2006 # FETCH acedb database for construction of Sanger Links table ## OBSOLETE procedure attempted on WS150 # DONE - 2006-01-09 - Hiram ssh kkstore02 mkdir /cluster/store5/worm/ce4/bed/acedb cd /cluster/store5/worm/ce4/bed/acedb wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=3 --no-parent \ --no-host-directories --cut-dirs=4 \ ftp://ftp.sanger.ac.uk/pub/wormbase/WS150/ # Then, to install this acedb database: # started 2006-01-09 09:48 chmod +x INSTALL ./INSTALL ######################################################################### # Create sangerLinks data and table ## OBSOLETE procedure attempted on WS150 ssh kkstore02 mkdir /cluster/store5/worm/ce4/bed/sangerLinks cd /cluster/store5/worm/ce4/bed/sangerLinks /cluster/bin/acedb/tace /cluster/store5/worm/ce4/bed/acedb # At prompt, can type ? for help, type the following queries # wbGeneClass.txt - associates 3 letter first part of worm gene # name with a brief description. acedb> AQL -o wbGeneClass.txt select l,l->Description from l in class Gene_Class # it says: # // 1578 Active Objects # wbLongDescr.txt - associates worm gene name with a several # sentence description. 
acedb> AQL -o wbLongDescr.txt select l,l->Gene_info[Provisional_description] from l in class Gene # it says: # // 5640 Active Objects # ctrl-D to exit the acedb monitor # remove header lines and "" from those two files: grep -v "^Format" wbGeneClass.txt | \ sed -e 's/Gene_class://; s/Text://; s/"//g;' \ > wbGeneClass.new grep -v "^Format" wbLongDescr.txt | \ sed -e 's/Gene://; s/Text://; s/"//g;' \ > wbLongDescr.new mv wbGeneClass.new wbGeneClass.txt mv wbLongDescr.new wbLongDescr.txt # Check how we compare to previous build, there will probably be a # few more with each new build: wc wbGeneClass.txt wbLongDescr.txt # 1625 7178 52423 wbGeneClass.txt # 51111 445663 3390455 wbLongDescr.txt wc /cluster/data/ce3/bed/sangerLinks/wbGeneClass.txt \ /cluster/data/ce3/bed/sangerLinks/wbLongDescr.txt # 1515 6589 47870 /cluster/data/ce3/bed/sangerLinks/wbGeneClass.txt # 50417 421524 3220249 /cluster/data/ce3/bed/sangerLinks/wbLongDescr.txt tar xvzf ../acedb/wormpep150.tar.gz zcat ../acedb/geneIDs.WS150.gz > geneIDs.WS150.txt # perl script is from previous build, this same directory cp -p /cluster/data/ce3/bed/sangerLinks/mkSangerLinks.pl . ./mkSangerLinks.pl wormpep150/wormpep.table150 geneIDs.WS150.txt \ wbLongDescr.txt wbGeneClass.txt > sangerLinks.txt 2> dbg.out # should be relatively few names without descriptions: grep "no descr" dbg.out | wc # 63 441 3477 ############################################################################### ## SANGER GENE TRACK 2nd attempt (DONE - 2007-04-27 - Hiram) ## There is a tremendous amount of extraneous notations in the gff ## files. 
Filter them down to a manageable set, change the chrom ## names, eliminate duplicates, select only what will work in ## ldHgGene ssh kkstore02 mkdir /cluster/data/ce4/bed/ws170Genes cd /cluster/data/ce4/bed/ws170Genes for C in I II III IV V X do echo -n "${C} " zcat ../../WS170/CHROMOSOME_${C}.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \ s/CHROMOSOME_M/chrM/g;" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \ -e 's/CDS "//' -e 's/"//' \ > chr${C}.gff done C=M echo -n "${C} " zcat ../../WS170/CHROMOSOME_MtDNA.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \ s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \ -e 's/CDS "//' -e 's/"//' \ > chr${C}.gff for C in I II III IV V X M do echo "chr${C}.gff -> filtered.chr${C}.gff" grep -v "^#" chr${C}.gff | awk -F'\t' ' BEGIN { IGNORECASE=1 } { if (match($2,"curated|Coding_transcript")) { if (match($3,"intron|coding_exon|cds|three_prime_UTR|five_prime_UTR")) { gsub("Transcript ","",$9) gsub(" .*","",$9) gsub("three_prime_UTR","3utr",$3) gsub("five_prime_UTR","5utr",$3) for (i = 1; i < 9; ++i) { printf "%s\t", $i } printf "%s\n", $9 } } } ' | sort -u | sort -k4n > filtered.chr${C}.gff done ssh hgwdev cd /cluster/data/ce4/bed/ws170Genes time nice -n +19 ldHgGene ce4 sangerGene filtered.*.gff # Read 52047 transcripts in 980039 lines in 7 files # 52047 groups 7 seqs 3 sources 5 feature types # 30497 gene predictions # real 0m2.569s ## this didn't work with the bin column, so alter that table hgsql -e "alter table sangerGene drop column bin;" ce4 ## done 2007-07-10 ## found when trying to click through on sequence links, mRNA or ## protein when on a gene details page ## This requires the following to also be re-run 
cd /cluster/data/ce4/bed/sangerGene # Add proteinID field to sangerGene table, used by Gene Sorter hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce4 # To add index on this column hgsql -e 'alter table sangerGene add index(proteinID);' ce4 # Use SwissProt IDs in sangerLinks table to fill proteinID # column in the sangerGene table hgsql -e 'update sangerGene as g, sangerLinks as l \ set g.proteinID = l.protName where g.name = l.orfName;' ce4 # check there are rows with the protName field filled hgsql -N -e "select proteinId from sangerGene" ce4 | sort | uniq -c | wc -l # 22126 hgLoadSqlTab ce4 orfToGene ~/kent/src/hg/lib/orfToGene.sql orfToGene.tab hgClusterGenes ce4 sangerGene sangerIsoforms sangerCanonical # Got 19965 clusters, from 30497 genes in 7 chromosomes hgLoadSqlTab ce4 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \ sangerLinks.tab hgPepPred ce4 generic sangerPep wormpep170/wormpep170 hgMapToGene ce4 refGene sangerGene sangerToRefSeq cd /cluster/data/ce4/bed/blastp/self/run/out hgLoadBlastTab ce4 sangerBlastTab *.tab # Scanning through 986 files # Loading database with 1214556 rows # Running joinerCheck to see what is sane: joinerCheck -identifier=wormBaseId -database=ce4 -times -keys all.joiner # Error: 999 of 27783 elements of ce4.orfToGene.name are not in key # sangerGene.name # indicates the non-coding genes in orfToGene are missing # from sangerGene. 
So, removing those from orfToGene: hgsql -N -e "select name from orfToGene;" ce4 | sort > orfToGene.name hgsql -N -e "select name from sangerGene;" ce4 | sort > sangerGene.name comm -23 orfToGene.name sangerGene.name > rm.orfToGene.name wc -l rm.orfToGene.name hgsql -N -e "select count(*) from orfToGene;" ce4 # 27783 for G in `cat rm.orfToGene.name` do hgsql -e 'delete from orfToGene where name="'"${G}"'";' ce4 echo $G done hgsql -N -e "select count(*) from orfToGene;" ce4 # 26874 ## create a gene name to WBGene ID reference table ## to be used to construct URL mkdir /cluster/data/ce4/bed/WBGeneID cd /cluster/data/ce4/bed/WBGeneID # There was one duplicate entry in the file, the sort -u eliminates it sed -e "s#http://wormbase.org/db/gene/gene?name=##; s#;class=Gene##" \ ../../downloads/tarFile/url.txt | sort -u > sangerGeneToWBGeneID.tab hgLoadSqlTab ce4 sangerGeneToWBGeneID \ ~/kent/src/hg/lib/sangerGeneToWBGeneID.sql sangerGeneToWBGeneID.tab ## later, it was discovered that this list needed to also be updated ## with the one-dot names, where only the two-dot names were present. ## all.joiner rule added, but it still is missing gene names. 
############################################################################### ## RNA sangerGenes (DONE - 2007-04-27 - Hiram) ssh kkstore02 cd /cluster/data/ce4/bed/ws170Genes for C in I II III IV V X M do echo "chr${C}.gff -> rna.chr${C}.gff" grep -v "^#" chr${C}.gff | awk -F'\t' ' BEGIN { IGNORECASE=1 } { if (match($2,"^miRNA|^ncRNA|^rRNA|^scRNA|^snRNA|^snlRNA|^snoRNA|^tRNA|^tRNAscan-SE-1.23")) { if (match($3,"exon")) { gsub("Transcript ","",$9) gsub(" .*","",$9) for (i = 1; i < 9; ++i) { printf "%s\t", $i } printf "%s\n", $9 } } } ' | sort -u | sort -k4n > rna.chr${C}.gff done ssh hgwdev cd /cluster/data/ce4/bed/ws170Genes time nice -n +19 ldHgGene ce4 sangerRnaGene rna.*.gff # Read 999 transcripts in 1030 lines in 7 files # 999 groups 7 seqs 9 sources 1 feature types # 999 gene predictions time nice -n +19 ldHgGene ce4 sangerPseudoGene pseudoGene.*.gff # Read 1154 transcripts in 1154 lines in 7 files # 1154 groups 6 seqs 1 sources 1 feature types # 1154 gene predictions ############################################################################### ## SANGER GENE TRACK 1st attempt - was reloaded (DONE - 2007-03-28 - Hiram) mkdir /cluster/data/ce4/bed/sangerGene cd /cluster/data/ce4/bed/sangerGene # this tar file package was prepared at WormBase at Jim's request # to ease our loading task here. From Anthony Rogers, ar2@sanger.ac.uk # URL in email to Jim 09 Feb 2007: # ftp://ftp.sanger.ac.uk/pub2/wormbase/WS170/wormbase_UCSC_WS170.tar.gz # Was unpacked into /cluster/data/ce4/downloads/tarFile ln -s /cluster/data/ce4/downloads/tarFile/*.gff . time nice -n +19 ldHgGene ce4 sangerGene *.gff # Read 28281 transcripts in 351460 lines in 6 files # 28281 groups 6 seqs 8 sources 2 feature types # 28281 gene predictions ## Create orfToGene table. 
They had one duplicate line in their file: sort -u /cluster/data/ce4/downloads/tarFile/gene_name.txt \ | sed -e "s/ */\t/" > orfToGene.tab ## plus, they didn't include the self with self references, which are ## needed in the gene sorter so it can find its names. hgLoadSqlTab ce4 orfToGene ~/kent/src/hg/lib/orfToGene.sql orfToGene.tab ## fixed this table in /cluster/data/ce4/bed/fixupOrfToGene ## by taking any name in the second column that was not mentioned in ## the first column, and making it refer to itself. hgLoadSqlTab ce4 orfToGene ~/kent/src/hg/lib/orfToGene.sql \ fixedOrfToGene.tab ln -s /cluster/data/ce4/downloads/tarFile/paragraph.txt ./paragraph.tab ln -s /cluster/data/ce4/downloads/tarFile/uniprot.txt ./uniprot.tab cat << '_EOF_' > mkSangerLinks.pl #!/usr/bin/env perl use strict; use warnings; my %geneToOrf; # key is uc(gene name), value is orf my %orfToGene; # key is uc(orf), value is gene name my $orfCount = 0; open (FH, "<orfToGene.tab") or die "can not read orfToGene.tab"; while (my $line = <FH>) { chomp $line; my ($orf, $gene) = split('\t',$line); die "gene name zero length" if (length($gene) < 1); die "orf name zero length" if (length($orf) < 1); $geneToOrf{uc($gene)} = $orf; $orfToGene{uc($orf)} = $gene; ++$orfCount; } close (FH); printf STDERR "read $orfCount orf designations from orfToGene.tab\n"; open (LS,">sangerLinks.tab") or die "can not write to sangerLinks.tab"; my $geneCount = 0; open (FH, "sort -u paragraph.tab|") or die "can not read paragraph.tab"; while (my $line = <FH>) { chomp $line; my ($orf, $descr) = split('\t',$line); die "gene name zero length" if (length($orf) < 1); die "can not find orfToGene $orf" if (!exists($orfToGene{uc($orf)})); my $orfGene = $orfToGene{uc($orf)}; die "orfGene name zero length" if (length($orfGene) < 1); printf LS "%s\t%s\t%s\n", $orf, $orfGene, $descr; ++$geneCount; } close (FH); printf STDERR "read $geneCount gene descriptions from paragraph.tab\n"; close (LS); '_EOF_' # << happy emacs chmod +x mkSangerLinks.pl ./mkSangerLinks.pl # read 27783 orf designations from orfToGene.tab 
# read 27783 gene descriptions from paragraph.tab hgLoadSqlTab ce4 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \ sangerLinks.tab # Add proteinID field to sangerGene table, used by Gene Sorter hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce4 # To add index on this column hgsql -e 'alter table sangerGene add index(proteinID);' ce4 # Use SwissProt IDs in sangerLinks table to fill proteinID # column in the sangerGene table hgsql -e 'update sangerGene as g, sangerLinks as l \ set g.proteinID = l.protName where g.name = l.orfName;' ce4 # check there are rows with the protName field filled hgsql -N -e "select proteinId from sangerGene" ce4 | sort | uniq -c | wc -l # 22126 wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/wormbase/WS170/wormpep170.tar.gz" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/wormbase/WS170/wormrna170.tar.gz" tar xvzf wormrna170.tar.gz mv README README.wormrna170 tar xvzf wormpep170.tar.gz # creates files in directory wormpep170: # -rw-rw-r-- 1 18238530 Jan 4 07:15 wp.fasta170 # -rw-rw-r-- 1 949522 Jan 4 07:15 wormpep.history170 # -rw-rw-r-- 1 1547 Jan 4 07:15 wormpep.diff170 # -rw-rw-r-- 1 31502715 Jan 9 07:19 wormpep.dna170 # -rw-rw-r-- 1 1510654 Jan 9 07:21 wormpep.table170 # -rw-rw-r-- 1 12658135 Jan 9 07:21 wormpep170 # -rw-rw-r-- 1 508392 Jan 9 07:21 wormpep.accession170 hgPepPred ce4 generic sangerPep wormpep170/wormpep170 ############################################################################### ######## MAKING GENE SORTER TABLES #### (DONE - 2007-04-02 - Hiram) ### re-done 2007-04-27 with new sangerGene table # These are instructions for building the # Gene Sorter browser. Don't start these until # there is a sangerGene table with a proteinID column containing Swiss-Prot # protein IDs, and also Kim lab expression data is required (in hgFixed). # Cluster together various alt-splicing isoforms. 
# Creates the sangerIsoforms and sangerCanonical tables ssh hgwdev hgClusterGenes ce4 sangerGene sangerIsoforms sangerCanonical # Got 20090 clusters, from 30391 genes in 7 chromosomes featureBits ce4 sangerCanonical # 56953095 bases of 100281244 (56.793%) in intersection featureBits ce2 sangerCanonical # 54156601 bases of 100291761 (53.999%) in intersection featureBits ce1 sangerCanonical # 53654286 bases of 100277784 (53.506%) in intersection # Create Self blastp table - sangerBlastTab # Reload blastp data into sangerBlastTab and drop knownBlastTab # - table given the wrong name (DONE, 2004-05-28, hartera) # Get sangerPep peptide sequences fasta file from sangerPep dir # and create a blast database out of them. mkdir -p /cluster/data/ce4/bed/blastp cd /cluster/data/ce4/bed/blastp cat << '_EOF_' > pepTabToFa.pl #!/usr/bin/env perl use strict; use warnings; open (FH,"<../sangerGene/sangerPep.tab") or die "can not open ../sangerGene/sangerPep.tab"; while (my $line = <FH>) { chomp $line; my ($name,$seq) = split('\s+',$line); printf ">%s\n%s\n", $name, $seq; } close (FH); '_EOF_' # << happy emacs chmod +x ./pepTabToFa.pl ./pepTabToFa.pl > wormPep170.faa /cluster/bluearc/blast229/formatdb -i wormPep170.faa \ -t wormPep170 -n wormPep170 # Copy database over to kluster storage mkdir /san/sanvol1/scratch/worms/ce4/blastp rsync -a ./ /san/sanvol1/scratch/worms/ce4/blastp/ cd /cluster/data/ce4/bed/blastp mkdir split faSplit sequence wormPep170.faa 1000 split/wp # Make parasol run directory ssh pk mkdir -p /cluster/data/ce4/bed/blastp/self cd /cluster/data/ce4/bed/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome.csh #!/bin/csh -ef setenv BLASTMAT /cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall -p blastp \ -d /san/sanvol1/scratch/worms/ce4/blastp/wormPep170 \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << happy emacs chmod a+x blastSome.csh # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome.csh {check in line+ $(path1)} 
{check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << Create parasol batch ls ../../split/*.fa > split.lst gensub2 split.lst single gsub jobList para create jobList para try.... etc ... # Completed: 986 of 986 jobs # CPU time in finished jobs: 17327s 288.79m 4.81h 0.20d 0.001 y # IO & Wait Time: 3136s 52.26m 0.87h 0.04d 0.000 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest finished job: 69s 1.15m 0.02h 0.00d # Submission to last job: 2155s 35.92m 0.60h 0.02d # Load into database. ssh hgwdev cd /cluster/data/ce4/bed/blastp/self/run/out hgLoadBlastTab ce4 sangerBlastTab *.tab # Scanning through 986 files # Loading database with 1214556 rows # CREATE EXPRESSION DISTANCE TABLES FOR # KIM LAB EXPRESSION DATA hgExpDistance ce4 hgFixed.kimWormLifeMedianRatio \ hgFixed.kimWormLifeMedianExps kimExpDistance # Have 19134 elements in hgFixed.kimWormLifeMedianRatio # Got 19134 unique elements in hgFixed.kimWormLifeMedianRatio # CREATE TABLE TO MAP BETWEEN SANGERGENES AND REFSEQ GENES # sangerToRefSeq (DONE, 2007-04-02, hiram) hgMapToGene ce4 refGene sangerGene sangerToRefSeq # SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera) # set hgNearOk to 1 on hgcentraltest to make Ce4 Gene Sorter viewable hgsql -e 'update dbDb set hgNearOk = 1 where name = "ce4";' \ hgcentraltest ########################################################################## ## Blastz SELF (DONE - 2007-03-29 - Hiram ssh kkstore02 mkdir /cluster/data/ce4/bed/blastz.ce4.2007-03-29 cd /cluster/data/ce4/bed ln -s blastz.ce4.2007-03-29 blastzSelf cd blastz.ce4.2007-03-29 cat << '_EOF_' > DEF # ce4 vs ce4 - Self alignment BLASTZ_H=2000 BLASTZ_M=200 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.rmskTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: elegans Ce4 SEQ2_DIR=/san/sanvol1/scratch/worms/ce4/ce4.rmskTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 
BASE=/cluster/data/ce4/bed/blastz.ce4.2007-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/ce4Ce4 > do.log 2>&1 & # real 37m22.066s ssh hgwdev cd /cluster/data/ce4/bed/blastz.ce4.2007-03-29 nice -n +19 featureBits ce4 chainSelfLink > fb.ce4.chainSelfLink.txt 2>&1 # 31186593 bases of 100281244 (31.099%) in intersection ########################################################################## ## BlastZ caeRem2 (DONE - 2007-03-29 - 2007-04-02 - Hiram) ssh kkstore02 mkdir /cluster/data/ce4/bed/blastz.caeRem2.2007-03-29 cd /cluster/data/ce4/bed/blastz.caeRem2.2007-03-29 cat << '_EOF_' > DEF # ce4 vs caeRem2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.rmskTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: briggsae caeRem2, 9,660 contigs, longest 5,925,111 SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/caeRem2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.sizes SEQ2_LIFT=/san/sanvol1/scratch/worms/caeRem2/caeRem2.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/ce4/bed/blastz.caeRem2.2007-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/ce4CaeRem2 > do.log 2>&1 & # real 44m41.608s cat fb.ce4.chainCaeRem2Link.txt # 45160539 bases of 100281244 (45.034%) in intersection ln -s blastz.caeRem2.2007-03-29 blastz.caeRem2 ## swap to caeRem2 - also in caeRem2.txt mkdir /cluster/data/caeRem2/bed/blastz.ce4.swap cd /cluster/data/caeRem2/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ 
/cluster/data/ce4/bed/blastz.caeRem2.2007-03-29/DEF \ -swap > swap.log 2>&1 & # real 3m52.831s cat fb.caeRem2.chainCe4Link.txt # 52254587 bases of 146898439 (35.572%) in intersection ########################################################################## ## BlastZ cb3 (DONE - 2007-03-29 - 2007-04-02 - Hiram) ssh kkstore02 mkdir /cluster/data/ce4/bed/blastz.cb3.2007-03-29 cd /cluster/data/ce4/bed/blastz.cb3.2007-03-29 cat << '_EOF_' > DEF # ce4 vs cb3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.rmskTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: briggsae Cb3 SEQ2_DIR=/san/sanvol1/scratch/worms/cb3/cb3.rmskTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/cb3/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 BASE=/cluster/data/ce4/bed/blastz.cb3.2007-02-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/ce4Ce3 > do.log 2>&1 & # real 16m5.178s cat fb.ce4.chainCb3Link.txt # 42491022 bases of 100281244 (42.372%) in intersection ln -s blastz.cb3.2007-02-25 blastz.cb3 ## swap to cb3 - also in cb3.txt mkdir /cluster/data/cb3/bed/blastz.ce4.swap cd /cluster/data/cb3/bed/blastz.ce4.swap time ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/ce4/bed/blastz.cb3.2007-02-25/DEF \ -verbose=2 -bigClusterHub=pk -swap > swap.log 2>&1 & # real 3m18.164s cat fb.cb3.chainCe4Link.txt # 43181535 bases of 108433446 (39.823%) in intersection ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-04-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ce4", "blat14", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ce4", "blat14", "17787", "0", "1");' \ hgcentraltest # test it with 
some sequence ############################################################################ ## Reset default position to the unc-52 gene complex (DONE - 2004-04-04 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrII:14645680-14667144" where name="ce4";' hgcentraltest ############################################################################ ## BLASTZ caePb1 (DONE - 2007-03-04 - Hiram) ssh kkstore01 mkdir /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04 cd /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04 cat << '_EOF_' > DEF # ce4 vs caePb1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.unmasked.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. PB2801 caePb1 SEQ2_DIR=/san/sanvol1/scratch/worms/caePb1/caePb1.unmasked.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/caePb1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl \ DEF -verbose=2 -qRepeats=windowmaskerSdust \ -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/ce4CaePb1 > do.log 2>&1 & ## it is having trouble with the chaining # real 69m6.967s # recovering from Baskin power failure, continuing time nice -n +19 doBlastzChainNet.pl \ DEF -verbose=2 -qRepeats=windowmaskerSdust -continue=chainMerge \ -bigClusterHub=pk -blastzOutRoot /cluster/bluearc/ce4CaePb1 \ > chainMerge.log 2>&1 & cat fb.ce4.chainCaePb1Link.txt # 52594760 bases of 100281244 (52.447%) in intersection # swap, this is also in caePb1.txt mkdir /cluster/data/caePb1/bed/blastz.ce4.swap cd /cluster/data/caePb1/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust \ /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04/DEF \ -bigClusterHub=pk -swap > swap.log 2>&1 & # real 22m40.205s cat fb.caePb1.chainCe4Link.txt # 74758640 bases of 175247318 (42.659%) in 
intersection ############################################################################ ## BLASTZ priPac1 (DONE - 2007-03-04 - Hiram) ssh kkstore01 mkdir /cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04 cd /cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04 cat << '_EOF_' > DEF # ce4 vs priPac1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.unmasked.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: Pristionchus pacificus priPac1 SEQ2_DIR=/san/sanvol1/scratch/worms/priPac1/priPac1.unmasked.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/priPac1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 DEF \ -bigClusterHub=pk -blastzOutRoot /cluster/bluearc/ce4PriPac1 \ > do.log 2>&1 & # real 110m47.620s cat fb.ce4.chainPriPac1Link.txt # 8671813 bases of 100281244 (8.647%) in intersection ln -s blastz.priPac1.um.2007-04-04 blastz.priPac1 # swap, this is also in priPac1.txt ssh kkstore02 mkdir /cluster/data/priPac1/bed/blastz.ce4.swap cd /cluster/data/priPac1/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04/DEF \ -bigClusterHub=pk -swap > swap.log 2>&1 & cat fb.priPac1.chainCe4Link.txt # 9730314 bases of 145948246 (6.667%) in intersection ########################################################################## ## 5-Way multiple alignment (DONE - 2007-04-20,05-01 - Hiram) mkdir /cluster/data/ce4/bed/multiz5way cd /cluster/data/ce4/bed/multiz5way # An additional nematode tree: # http://nematol.unh.edu/tree/tree_des.php # Constructing a tree from the document: # http://www.wormbook.org/chapters/www_phylogrhabditids/phylorhab.html # purely artifical for the last one: :0.2,pristiochus_priPac1:0.8 # since it is not on the graph cat << '_EOF_' > nematodes.nh 
((((((((((briggsae_cb3:0.005,remanei_caeRem2:0.003):0.004, cb5161:0.013):0.002,elegans_ce4:0.003):0.001, japonica:0.023):0.024,ps1010:0.027):0.014,df5070:0.056):0.009, plicata:0.102):0.099,sb341:0.060):0.038, (df5074:0.351,df5055:0.090):0.221):0.2,pristiochus_priPac1:0.8) '_EOF_' # << happy emacs # I don't know if this is anywhere near accurate, but I'm going # to call cb5161 our PB2801_caePb1 (it turns out to be true) # Given what comes out of the chainLink measurments, this is not # correct, but it is close ... /cluster/bin/phast/x86_64/tree_doctor \ --prune-all-but briggsae_cb3,remanei_caeRem2,cb5161,elegans_ce4,pristiochus_priPac1 \ --rename "cb5161 -> PB2801_caePb1" nematodes.nh > 5way.nh /cluster/bin/phast/x86_64/all_dists 5way.nh > 5way.distances.txt grep -i ce4 5way.distances.txt | sort -k3,3n # Use this output for reference, and use the calculated # distances in the table below to order the organisms and check # the button order on the browser. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainCe4Link chain linearGap # distance on Ce4 on other minScore # 1 0.0120 - remanei_caeRem2 (% 45.034) (% 35.572) 1000 loose # 2 0.0140 - briggsae_cb3 (% 42.372) (% 39.823) 1000 loose # 3 0.0180 - PB2801_caePb1 (% 52.447) (% 42.659) 1000 loose # 4 1.1880 - pristiochus_priPac1 (% 8.647) (% 6.667) 1000 loose cd /cluster/data/ce4/bed/multiz5way # bash shell syntax here ... export H=/cluster/data/ce4/bed mkdir mafLinks for G in caeRem2 cb3 caePb1 priPac1 do mkdir mafLinks/$G if [ ! 
-d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore02 mkdir /san/sanvol1/scratch/worms/ce4/multiz5way cd /san/sanvol1/scratch/worms/ce4/multiz5way time rsync -a --copy-links --progress --stats \ /cluster/data/ce4/bed/multiz5way/mafLinks/ . # a few seconds to copy 109 Mb # these are x86_64 binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/ce4/bed/multiz5way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 5way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir maf run cd run # NOTE: you need to set the db properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = ce4 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/worms/$db/multiz5way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/worms/$db/multiz5way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/data/ce4/bed/multiz5way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << 
happy emacs awk '{print $1}' /cluster/data/ce4/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para try # 7 jobs in batch # Completed: 7 of 7 jobs # CPU time in finished jobs: 1916s 31.93m 0.53h 0.02d 0.000 y # IO & Wait Time: 31s 0.52m 0.01h 0.00d 0.000 y # Average job time: 278s 4.64m 0.08h 0.00d # Longest finished job: 400s 6.67m 0.11h 0.00d # Submission to last job: 404s 6.73m 0.11h 0.00d ssh kkstore02 cd /cluster/data/ce4/bed/multiz5way time nice -n +19 catDir maf > multiz5way.maf # a few seconds to produce: # -rw-rw-r-- 1 312800151 Apr 21 21:53 multiz5way.maf # before getting to the annotation, load this up so we can get # a first view of the track. This will be replaced with the annotated # mafs ssh hgwdev cd /cluster/data/ce4/bed/multiz5way mkdir /gbdb/ce4/multiz5way ln -s /cluster/data/ce4/bed/multiz5way/multiz5way.maf /gbdb/ce4/multiz5way time nice -n +19 hgLoadMaf ce4 multiz5way # a few seconds to load: # Loaded 375859 mafs in 1 files from /gbdb/ce4/multiz5way time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 ce4 multiz5waySummary multiz5way.maf # a few seconds to load: # Created 108194 summary blocks from 855104 components and 375859 mafs # from multiz5way.maf ############################################################################ # ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2007-05-03 - Hiram) ssh kolossus # need the nBed stuff on these 5 genomes, if this hasn't been # done before: cd /cluster/data for X in ce4 cb3 caePb1 caeRem2 priPac1 do twoBitInfo -nBed /cluster/data/${X}/${X}.{2bit,N.bed} done mkdir /cluster/data/ce4/bed/multiz5way/anno cd /cluster/data/ce4/bed/multiz5way/anno mkdir maf run cd run rm -f sizes nBeds ln -s /cluster/data/ce4/chrom.sizes ce4.len # This were done before during the 7-way # twoBitInfo -nBed /cluster/data/ce4/ce4.sdTrf.{2bit,N.bed} ln -s /cluster/data/ce4/ce4.N.bed ce4.bed for DB in \ `cat /cluster/data/ce4/bed/multiz5way/species.lst | sed -e "s/ ce4//"` do ln -s 
/cluster/data/${DB}/chrom.sizes ${DB}.len ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds echo ${DB}.len >> sizes echo $DB done echo '#!/bin/csh -ef' > jobs.csh echo date >> jobs.csh # do smaller jobs first so you can see some progress immediately: for F in `ls -1rS ../../maf/*.maf` do echo mafAddIRows -nBeds=nBeds $F \ /cluster/data/ce4/ce4.2bit ../maf/`basename $F` >> jobs.csh echo "echo $F" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh time nice -n +19 ./jobs.csh > jobs.log 2>&1 & # to watch progress; tail -f jobs.log # real 0m43.777s # Load anno/maf ssh hgwdev cd /cluster/data/ce4/bed/multiz5way/anno/maf mkdir -p /gbdb/ce4/multiz5way/anno/maf ln -s /cluster/data/ce4/bed/multiz5way/anno/maf/*.maf \ /gbdb/ce4/multiz5way/anno/maf time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/ce4/multiz5way/anno/maf ce4 multiz5way # Loaded 451902 mafs in 7 files from /gbdb/ce4/multiz5way/anno/maf # real 0m11.804s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/ce4/bed/multiz5way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary ce4 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz5waySummary stdin # Created 36099 summary blocks from 855104 components # and 451902 mafs from stdin # real 0m13.892s # -rw-rw-r-- 1 1657980 May 3 16:02 multiz5waySummary.tab ssh hgwdev cd /cluster/data/ce4/bed/multiz5way/anno/maf time nice -n +19 hgLoadSqlTab ce4 multiz5waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz5waySummary.tab # real 0m4.525 # Create per-chrom individual maf files for downloads ssh kkstore02 cd /cluster/data/ce4/bed/multiz5way mkdir mafDownloads time for M in anno/maf/chr*.maf do B=`basename $M` nice -n +19 cp -p ${M} mafDownloads/${B} nice -n +19 gzip mafDownloads/${B} echo ${B} done done ssh hgwdev cd /cluster/data/ce4/bed/multiz5way # redone 2007-12-21 to fix difficulty in mafFrags when ce4 was not the # first species listed # upstream 
mafs # There isn't any refGene table, trying sangerGene instead #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 ce4 \ sangerGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ce4 multiz5way \ stdin stdout -orgs=species.lst \ | gzip -c > mafDownloads/sangerGene.upstream${S}.maf.gz echo "done sangerGene.upstream${S}.maf.gz" done cd mafDownloads md5sum *.gz > md5sum.txt # deliver to downloads ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/ce4/multiz5way cd /usr/local/apache/htdocs/goldenPath/ce4/multiz5way ln -s /cluster/data/ce4/bed/multiz5way/mafDownloads/* . ####################################################################### # MULTIZ5WAY MAF FRAMES (DONE - 2007-05-03 - Hiram) ssh hgwdev mkdir /cluster/data/ce4/bed/multiz5way/frames cd /cluster/data/ce4/bed/multiz5way/frames mkdir genes # The following is adapted from the oryLat1 5-way sequence #------------------------------------------------------------------------ # get the genes for all genomes # using sangerGene for ce4 # using blastCe4SG for cb3, caePb1, caeRem2, priPac1 for qDB in ce4 cb3 caePb1 caeRem2 priPac1 do if [ $qDB = "cb3" -o $qDB = "caePb1" -o $qDB = "caeRem2" -o $qDB = "priPac1" ]; then geneTbl=blastCe4SG echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ > /scratch/tmp/$qDB.tmp.psl mrnaToGene -allCds /scratch/tmp/$qDB.tmp.psl stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz rm -f /scratch/tmp/$qDB.tmp.psl mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz elif [ $qDB = "ce4" ]; then geneTbl=sangerGene echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz 
genes/$qDB.gp.gz fi done ssh kkstore02 cd /cluster/data/ce4/bed/multiz5way/frames cat ../maf/*.maf | nice -n +19 genePredToMafFrames ce4 \ stdin stdout ce4 genes/ce4.gp.gz cb3 \ genes/cb3.gp.gz caePb1 genes/caePb1.gp.gz caeRem2 \ genes/caeRem2.gp.gz priPac1 genes/priPac1.gp.gz \ | gzip > multiz5way.mafFrames.gz ssh hgwdev cd /cluster/data/ce4/bed/multiz5way/frames nice -n +19 hgLoadMafFrames ce4 multiz5wayFrames multiz5way.mafFrames.gz ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-20) ssh kkstore02 # bash if not using bash shell already mkdir /cluster/data/ce4/blastDb cd /cluster/data/ce4 twoBitToFa ce4.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/ce4/blastDb cd /cluster/data/ce4/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/ce4/blastDb done mkdir -p /cluster/data/ce4/bed/tblastn.hg18KG cd /cluster/data/ce4/bed/tblastn.hg18KG echo /san/sanvol1/scratch/ce4/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 104 query.lst # we want around 50000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(50000/104) = 76.392160 mkdir -p /cluster/bluearc/ce4/bed/tblastn.hg18KG/kgfa split -l 76 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/ce4/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/ce4/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/ce4/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/ce4/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/ce4/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. 
# Completed: 50336 of 50336 jobs # CPU time in finished jobs: 955647s 15927.45m 265.46h 11.06d 0.030 y # IO & Wait Time: 199548s 3325.80m 55.43h 2.31d 0.006 y # Average job time: 23s 0.38m 0.01h 0.00d # Longest finished job: 59s 0.98m 0.02h 0.00d # Submission to last job: 3074s 51.23m 0.85h 0.04d ssh kkstore02 cd /cluster/data/ce4/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/ce4/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 484 of 484 jobs # CPU time in finished jobs: 485s 8.08m 0.13h 0.01d 0.000 y # IO & Wait Time: 1739s 28.99m 0.48h 0.02d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest finished job: 16s 0.27m 0.00h 0.00d # Submission to last job: 102s 1.70m 0.03h 0.00d ssh kkstore02 cd /cluster/data/ce4/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/ce4/bed/tblastn.hg18KG/blastHg18KG.psl cd .. 
pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/ce4/bed/tblastn.hg18KG hgLoadPsl ce4 blastHg18KG.psl # check coverage featureBits ce4 blastHg18KG # 3424422 bases of 100281244 (3.415%) in intersection featureBits ce4 sangerGene:cds blastHg18KG -enrichment # sangerGene:cds 27.876%, blastHg18KG 3.415%, both 3.231%, cover 11.59%, enrich 3.39x ssh kkstore02 rm -rf /cluster/data/ce4/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut #end tblastn ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS - 5-WAY # (DONE - 2007-04-23 - Hiram) # Estimate phastCons parameters ssh kkstore02 # We need the fasta sequence for this business cd /cluster/data/ce4 for C in `awk '{print $1}' chrom.sizes | sed -e s/chr//` do echo "twoBitToFa -seq=chr${C} ce4.2bit ${C}/chr${C}.fa" twoBitToFa -seq=chr${C} ce4.2bit ${C}/chr${C}.fa done mkdir /cluster/data/ce4/bed/multiz5way/cons cd /cluster/data/ce4/bed/multiz5way/cons # Create a starting-tree.mod based on chrV (the largest one) # Set -windows large enough so it only makes one piece faSize ../../../V/chrV.fa # 20919398 bases (0 N's 20919398 real 18027882 upper # 2891516 lower) in 1 sequences in 1 files time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \ --refseq ../../../V/chrV.fa --in-format MAF \ --windows 30000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # 10 minutes # the tree in use cat ../tree-commas.nh # ((((cb3,caeRem2),caePb1),ce4),priPac1) # this s1.*.ss should match only one item time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "((((cb3,caeRem2),caePb1),ce4),priPac1)" \ --out-root starting-tree # real 0m4.926s rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.377 # This 0.377 is used in the --gc argument below # Create SS files on san filesystem (takes ~ 2h 20m) # Increasing their size this 
time from 1,000,000 to 4,000,000 to # slow down the phastCons pk jobs ssh kkstore02 mkdir -p /san/sanvol1/scratch/worms/ce4/cons/ss cd /san/sanvol1/scratch/worms/ce4/cons/ss time for C in `awk '{print $1}' /cluster/data/ce4/chrom.sizes` do if [ -s /cluster/data/ce4/bed/multiz5way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/ce4/bed/multiz5way/maf/${C}.maf \ --refseq /cluster/data/ce4/${chrN}/${C}.fa \ --in-format MAF --windows 4000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done # real 1m11.734s # Create a random list of 50 1 mb regions (do not use the _randoms) cd /san/sanvol1/scratch/worms/ce4/cons/ss ls -1l chr*/chr*.ss | grep -v random | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Only 13 lines in stdin when the --windows was 10000000 # Only 29 lines in stdin when the --windows was 4000000 # Set up parasol directory to calculate trees on these regions ssh pk mkdir /san/sanvol1/scratch/worms/ce4/cons/treeRun4 cd /san/sanvol1/scratch/worms/ce4/cons/treeRun4 mkdir tree log # Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments # expected-lengths and target-coverage taken from ce2 history # for treeRun4 --expected-lengths 15 --target-coverage 0.60 # and using ave.noncons.mod from treeRun3 # for treeRun3 --expected-lengths 15 --target-coverage 0.55 # and using ave.noncons.mod from treeRun2 in place of # /cluster/data/ce4/bed/multiz5way/cons/starting-tree.mod # for treeRun2 --expected-lengths 15 --target-coverage 0.65 # for treeRun1 --expected-lengths 25 --target-coverage 0.45 # mistakenly: --gc 0.277 cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /san/sanvol1/scratch/worms/ce4/cons/treeRun3/ave.noncons.mod \ --gc 0.377 --nrates 1,1 --no-post-probs --ignore-missing \ 
--expected-lengths 15 --target-coverage 0.60 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 28 of 28 jobs # CPU time in finished jobs: 2264s 37.73m 0.63h 0.03d 0.000 y # IO & Wait Time: 95s 1.59m 0.03h 0.00d 0.000 y # Average job time: 84s 1.40m 0.02h 0.00d # Longest finished job: 156s 2.60m 0.04h 0.00d # Submission to last job: 157s 2.62m 0.04h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons/treeRun4 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt 2>&1 ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/ce4/bed/multiz5way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .60 15 \ ave.cons.mod ave.noncons.mod # for treeRun4 --expected-lengths 15 --target-coverage 0.60 # (Solving for new omega: 15.000000 141.732548 61.619960 42.817579 40.070343 # 39.988928 ) # Transition parameters: gamma=0.600000, omega=15.000000, mu=0.066667, # nu=0.100000 # Relative entropy: H=0.977026 bits/site # Expected min. length: L_min=6.777394 sites # Expected max. 
length: L_max=6.565768 sites # Phylogenetic information threshold: PIT=L_min*H=6.621688 bits Recommended expected length: omega=39.988928 sites (for L_min*H=9.780000) # for treeRun3 --expected-lengths 15 --target-coverage 0.55 # (Solving for new omega: 15.000000 77.051147 42.798715 35.637374 35.053109 # 35.048618 ) # Transition parameters: gamma=0.550000, omega=15.000000, # mu=0.066667, nu=0.081481 # Relative entropy: H=0.923015 bits/site # Expected min. length: L_min=7.718135 sites # Expected max. length: L_max=7.015398 sites # Phylogenetic information threshold: PIT=L_min*H=7.123957 bits # Recommended expected length: omega=35.048618 sites (for L_min*H=9.780000) # for treeRun2 --expected-lengths 15 --target-coverage 0.65 # ( Solving for new omega: 15.000000 466.852221 137.016323 64.163096 # 47.673547 45.689998 45.653095 ) # Transition parameters: gamma=0.650000, omega=15.000000, # mu=0.066667, nu=0.123810 # Relative entropy: H=1.050500 bits/site # Expected min. length: L_min=5.807815 sites # Expected max. length: L_max=6.063484 sites # Phylogenetic information threshold: PIT=L_min*H=6.101107 bits # Recommended expected length: omega=45.653095 sites (for L_min*H=9.780000) # for treeRun1 --expected-lengths 25 --target-coverage 0.45 # mistakenly: --gc 0.277 /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .45 25 \ ave.cons.mod ave.noncons.mod # ( Solving for new omega: 25.000000 26.470691 26.427531 ) # Transition parameters: gamma=0.450000, omega=25.000000, # mu=0.040000, nu=0.032727 # Relative entropy: H=0.700176 bits/site # Expected min. length: L_min=13.739278 sites # Expected max. length: L_max=11.506136 sites # Phylogenetic information threshold: PIT=L_min*H=9.619918 bits # Recommended expected length: omega=26.427531 sites (for L_min*H=9.780000) # Transition parameters: gamma=0.450000, omega=25.000000, mu=0.040000, nu=0.032727 # Relative entropy: H=0.700176 bits/site # Expected min. length: L_min=13.739278 sites # Expected max. 
length: L_max=11.506136 sites # Phylogenetic information threshold: PIT=L_min*H=9.619918 bits ssh pk # After some experimentation, going to go with consRun3 at this time # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/worms/ce4/cons/consRun3 cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 # using the ave.cons.mod and ave.noncons.mod from treeRun3 above: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.311500 0.188500 0.188500 0.311500 RATE_MAT: -0.871104 0.170916 0.455562 0.244626 0.282442 -1.213584 0.179324 0.751817 0.752825 0.179324 -1.214011 0.281862 0.244626 0.454952 0.170565 -0.870143 TREE: ((((cb3:0.089669,caeRem2:0.075007):0.023378,caePb1:0.092376):0.038979,ce4:0.082842):0.118524,priPac1:0.118524); ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.311500 0.188500 0.188500 0.311500 RATE_MAT: -0.871104 0.170916 0.455562 0.244626 0.282442 -1.213584 0.179324 0.751817 0.752825 0.179324 -1.214011 0.281862 0.244626 0.454952 0.170565 -0.870143 TREE: ((((cb3:0.333096,caeRem2:0.276036):0.086677,caePb1:0.341433):0.145785,ce4:0.306896):0.447539,priPac1:0.447539); mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 15 --target-coverage 0.55 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix # 
Create gsub file cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive # CPU time in finished jobs: 383s 6.38m 0.11h 0.00d 0.000 y # IO & Wait Time: 89s 1.49m 0.02h 0.00d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 22s 0.37m 0.01h 0.00d # Submission to last job: 233s 3.88m 0.06h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" # if that looks right (== in chrom and numerical order), then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > phastConsElements5way.bed # way too many negative # scores with --expected-length 15 --target-coverage 0.60 # OK with--expected-length 15 --target-coverage 0.55 # way too many negative # scores with --expected-length 15 --target-coverage 0.65 cp -p phastConsElements5way.bed /cluster/data/ce4/bed/multiz5way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore02 cd /cluster/data/ce4 faSize */chr*.fa # 100281244 bases (0 N's 100281244 real 87008909 upper # 13272335 lower) in 7 sequences in 7 files # %13.24 
masked total, %13.24 masked real cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 # The 100281244 comes from the non-n genome as counted above. awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/100281244\n",100.0*sum/100281244,sum}' \ phastConsElements5way.bed # after fixing the model arguments: # % 32.37 = 100.0*32459183/100281244 # --expected-length 15 --target-coverage 0.55 # % 2.36 = 100.0*2370557/100281244 # --expected-length 25 --target-coverage 0.45 # % 1.36 = 100.0*1366061/100281244 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The lodToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce4 \ -enrichment sangerGene:cds phastConsElements5way.bed # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements5way.bed 2.364%, both 0.668%, # cover 2.64%, enrich 1.12x # --expected-length 25 --target-coverage 0.45 # sangerGene:cds 25.335%, phastConsElements5way.bed 1.362%, both 0.410%, # cover 1.62%, enrich 1.19x # Load most conserved track into database ssh hgwdev cd /cluster/data/ce4/bed/multiz5way # the copy was already done above # cp -p /san/sanvol1/scratch/worms/ce4/cons/consRun3/phastConsElements5way.bed . 
time nice -n +19 hgLoadBed ce4 phastConsElements5way \ phastConsElements5way.bed # Loaded 548755 elements of size 5 # should measure the same as above time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \ phastConsElements5way # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements5way 2.364%, both 0.668%, # cover 2.64%, enrich 1.12x # --expected-length 25 --target-coverage 0.45 # sangerGene:cds 25.335%, phastConsElements5way 1.362%, both 0.410%, # cover 1.62%, enrich 1.19x # Create merged posterior probability file and wiggle track data files ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode -noOverlap stdin \ phastCons5way.wig phastCons5way.wib # Converted stdin, upper limit 0.98, lower limit 0.00 # real 0m32.137s # -rw-rw-r-- 1 61949779 Apr 30 13:29 phastCons5way.wib # -rw-rw-r-- 1 11279833 Apr 30 13:29 phastCons5way.wig nice -n +19 cp -p phastCons5way.wi? 
/cluster/data/ce4/bed/multiz5way/ # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons5wayScores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons5wayScores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 1m14.178s # copy them for downloads ssh kkstore02 mkdir /cluster/data/ce4/bed/multiz5way/phastCons5wayScores cd /cluster/data/ce4/bed/multiz5way/phastCons5wayScores cp -p /san/sanvol1/scratch/worms/ce4/cons/consRun3/phastCons5wayScores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/ce4 ln -s /cluster/data/ce4/bed/multiz5way/phastCons5wayScores . # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/ce4/bed/multiz5way ln -s `pwd`/phastCons5way.wib /gbdb/ce4/wib/phastCons5way.wib nice -n +19 hgLoadWiggle ce4 phastCons5way phastCons5way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/ce4/bed/multiz5way nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce4 phastCons5way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Worm Ce4 Histogram phastCons5way track" set xlabel " phastCons5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ########################################################################## #### Blat sangerGene proteins to determine exons # (DONE - 2007-04-27 - hiram) ssh hgwdev cd /cluster/data/ce4/bed mkdir blat.ce4SG.2007-04-27 ln -s blat.ce4SG.2007-04-27 blat.ce4SG cd blat.ce4SG pepPredToFa ce4 sangerPep sangerPep.fa # The kluster run # created san nib directory from goldenPath chromosomes fa files ssh pk cd /cluster/data/ce4/bed/blat.ce4SG cat << '_EOF_' > blatSome #!/bin/csh -fe blat -t=dnax -q=prot -out=pslx /san/sanvol1/scratch/worms/ce4/nib/$1.nib sgfa/$2.fa $3 '_EOF_' # << happy emacs chmod +x blatSome ls -1S /san/sanvol1/scratch/worms/ce4/nib > ce4.lst mkdir sgfa cd sgfa faSplit sequence ../sangerPep.fa 3000 sg ls -1S *.fa > ../sg.lst cd .. 
cat << '_EOF_' > template #LOOP blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 ce4.lst sg.lst template jobList mkdir psl cd psl sed -e "s/.nib//" ../ce4.lst | xargs mkdir cd .. para create jobList para try ... check ... push ... etc # Completed: 20335 of 20335 jobs # CPU time in finished jobs: 64312s 1071.87m 17.86h 0.74d 0.002 y # IO & Wait Time: 53465s 891.08m 14.85h 0.62d 0.002 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest finished job: 278s 4.63m 0.08h 0.00d # Submission to last job: 788s 13.13m 0.22h 0.01d ssh kkstore02 cd /cluster/data/ce4/bed/blat.ce4SG.2007-04-27 pslSort dirs raw.psl /tmp psl/* # -rw-rw-r-- 1 70076854 Apr 27 16:19 raw.psl pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null # Processed 156107 alignments # -rw-rw-r-- 1 25783288 Apr 27 16:20 cooked.psl pslUniq cooked.psl ce4SG.psl # -rw-rw-r-- 1 41321225 Mar 24 11:14 ce4SG.psl cut -f 10 ce4SG.psl > sgName.lst faSomeRecords sangerPep.fa sgName.lst ce4SG.fa faSize ce4SG.fa # 10183094 bases (8184565 N's 1998529 real 1998529 upper 0 lower) in # 23192 sequences in 1 files faSize sangerPep.fa # 10186194 bases (8187162 N's 1999032 real 1999032 upper 0 lower) in # 23224 sequences in 1 files # You may need to build this pslxToFa - it is not in the standard build pslxToFa ce4SG.psl ce4SG_ex.fa -liftTarget=genome.lft \ -liftQuery=protein.lft # -rw-rw-r-- 1 5128274 Apr 27 16:29 protein.lft # -rw-rw-r-- 1 6545612 Apr 27 16:29 genome.lft # -rw-rw-r-- 1 12409406 Apr 27 16:29 ce4SG_ex.fa wc -l *.psl *.lft *.fa sgName.lst # 23192 ce4SG.psl # 25361 cooked.psl # 156112 raw.psl # 146926 genome.lft # 146926 protein.lft # 238132 ce4SG.fa # 293852 ce4SG_ex.fa # 238243 sangerPep.fa # 23192 sgName.lst # 1291936 total # back on hgwdev ssh hgwdev cd /cluster/data/ce4/bed/blat.ce4SG ## This doesn't work here, it complains about a missing kgXref table # may need to make this command in src/hg/oneShot/kgName kgName ce4 ce4SG.psl 
blastSGRef01
    # After about an hour, it exited with this message:
    # sqlFreeConnection called on cache (ce4) that doesn't contain
    # the given connection
    # This may be a lurking error in this program, because the
    # resulting file seems to have the correct number of lines:

    # Need a reference table loaded, perl script makes up relationships
    cat << '_EOF_' >> mkRef.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

sub usage() {
    print STDERR "usage: ./mkRef.pl blastSGRef01.tab\n";
    print STDERR "reads ../sangerGene/genePred.tab and ",
        "../sangerGene/orfToGene.tab\n";
    print STDERR "writes references to the given input file, ",
        "e.g. blastSGRef01.tab\n";
}

if (1 != $argc) { usage; exit 255; }

my $outFile = shift;

my %orfToGene;  # key is all lowercase orf name, value is gene name
my $inFile="../sangerGene/orfToGene.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($orf, $gene) = split('\s+',$line);
    $orfToGene{lc($orf)} = $gene
}
close (FH);

my %links;  # key is all lowercase orf name, value is SwissProt name
$inFile="../sangerGene/sangerLinks.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($orf, $SP, $comments) = split('\s+',$line);
    $links{lc($orf)} = $SP
}
close (FH);

my %ce4Position;  # key is all lowercase orf name, value is ce4 position
$inFile="../sangerGene/genePred.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($bin, $orf, $chr, $strand, $start, $end, $other) = split('\s+',$line);
    $ce4Position{lc($orf)} = sprintf("%s:%d-%d", $chr, $start, $end);
}
close (FH);

open (OF,">$outFile") or die "can not write to $outFile";
$inFile="sgName.lst";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $orf=<FH>) {
    chomp $orf;
    if (!exists($orfToGene{lc($orf)})) { $orfToGene{lc($orf)} = ""; }
    if (!exists($links{lc($orf)})) { $links{lc($orf)} = ""; }
    if (!exists($ce4Position{lc($orf)})) { $ce4Position{lc($orf)} = ""; }
    printf OF "%s\t%s\t%s\t%s\t%s\n", $orf, $orfToGene{lc($orf)},
        $ce4Position{lc($orf)}, $links{lc($orf)}, "";
}
close (FH);
close (OF);
'_EOF_'
    # << happy emacs
    chmod +x ./mkRef.pl
    ./mkRef.pl blastSGRef01.tab
    hgLoadSqlTab ce4 blastSGRef01 ~/kent/src/hg/lib/blastRef.sql \
        blastSGRef01.tab
    wc -l sgName.lst blastSGRef01.tab ce4SG.psl
    #   23192 sgName.lst
    #   23192 blastSGRef01.tab
    #   23192 ce4SG.psl
    # And load the protein sequences
    hgPepPred ce4 generic blastSGPep01 ce4SG.fa
    # end blat proteins

##########################################################################
## 4-Way multiple alignment (DONE - 2007-05-01 - Hiram)
    mkdir /cluster/data/ce4/bed/multiz4way
    cd /cluster/data/ce4/bed/multiz4way
    # Taking the tuning tree ave.noncons.mod from the 5-Way experiment
    tail -1 /cluster/data/ce4/bed/multiz5way/cons/ave.noncons.mod
# TREE: ((((cb3:0.333096,caeRem2:0.276036):0.086677,caePb1:0.341433):0.145785,
# ce4:0.306896):0.447539,priPac1:0.447539);
    cat << '_EOF_' > nematodes.nh
((((briggsae_cb3:0.333096,remanei_caeRem2:0.276036):0.086677,
brenneri_caePb1:0.341433):0.145785,elegans_ce4:0.306896):0.447539,
priPac1:0.447539)
'_EOF_'
    # << happy emacs
    /cluster/bin/phast/x86_64/tree_doctor \
        --prune-all-but briggsae_cb3,remanei_caeRem2,brenneri_caePb1,elegans_ce4 \
        nematodes.nh > 4way.nh
    /cluster/bin/phast/x86_64/all_dists 4way.nh > 4way.distances.txt
    grep -i ce4 4way.distances.txt | sort -k3,3n
    # Use this output for reference, and use the calculated
    # distances in the table below to order the organisms and check
    # the button order on the browser.
    # And if you can fill in the table below entirely, you have
    # succeeded in finishing all the alignments required.
# # featureBits chainLink measures # chainCe4Link chain linearGap # distance on Ce4 on other minScore # 1 0.7941 - brenneri_caePb1 (% 52.447) (% 42.659) 1000 loose # 2 0.8154 - remanei_caeRem2 (% 45.034) (% 35.572) 1000 loose # 3 0.8725 - briggsae_cb3 (% 42.372) (% 39.823) 1000 loose cd /cluster/data/ce4/bed/multiz4way # bash shell syntax here ... export H=/cluster/data/ce4/bed mkdir mafLinks for G in caeRem2 cb3 caePb1 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore02 mkdir /san/sanvol1/scratch/worms/ce4/multiz4way cd /san/sanvol1/scratch/worms/ce4/multiz4way time rsync -a --copy-links --progress --stats \ /cluster/data/ce4/bed/multiz4way/mafLinks/ . # a few seconds to copy 99 Mb # these are x86_64 binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/ce4/bed/multiz4way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 4way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir maf run cd run # NOTE: you need to set the db properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = ce4 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/worms/$db/multiz4way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/worms/$db/multiz4way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e 
$in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/data/ce4/bed/multiz4way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/ce4/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList para try # 7 jobs in batch # Completed: 7 of 7 jobs # CPU time in finished jobs: 1708s 28.47m 0.47h 0.02d 0.000 y # IO & Wait Time: 34s 0.57m 0.01h 0.00d 0.000 y # Average job time: 249s 4.15m 0.07h 0.00d # Longest finished job: 361s 6.02m 0.10h 0.00d # Submission to last job: 464s 7.73m 0.13h 0.01d ssh kkstore02 cd /cluster/data/ce4/bed/multiz4way nice -n +19 catDir maf > multiz4way.maf # -rw-rw-r-- 1 285201701 May 1 13:54 multiz4way.maf # before getting to the annotation, load this up so we can get # a first view of the track. 
This will be replaced with the annotated # mafs ssh hgwdev cd /cluster/data/ce4/bed/multiz4way mkdir /gbdb/ce4/multiz4way ln -s /cluster/data/ce4/bed/multiz4way/multiz4way.maf /gbdb/ce4/multiz4way nice -n +19 hgLoadMaf ce4 multiz4way # Loaded 314717 mafs in 1 files from /gbdb/ce4/multiz4way nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 ce4 multiz4waySummary multiz4way.maf # Created 88583 summary blocks from 631555 components and 314717 mafs # from multiz4way.maf ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS 4-WAY # (DONE - 2007-05-01 - Hiram) # Estimate phastCons parameters ssh kkstore02 mkdir /cluster/data/ce4/bed/multiz4way/cons cd /cluster/data/ce4/bed/multiz4way/cons # Create a starting-tree.mod based on chrV (the largest one) # Set -windows large enough so it only makes one piece faSize ../../../V/chrV.fa # 20919398 bases (0 N's 20919398 real 18027882 upper # 2891516 lower) in 1 sequences in 1 files nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \ --refseq ../../../V/chrV.fa --in-format MAF \ --windows 30000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # the tree in use cat ../tree-commas.nh # (((cb3,caeRem2),caePb1),ce4) # this s1.*.ss should match only one item nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "(((cb3,caeRem2),caePb1),ce4)" \ --out-root starting-tree rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.375 # This 0.375 is used in the --gc argument below # Create SS files on san filesystem (takes ~ 2h 20m) # Increasing their size this time from 1,000,000 to 4,000,000 to # slow down the phastCons pk jobs ssh kkstore02 mkdir -p /san/sanvol1/scratch/worms/ce4/cons.4way/ss cd /san/sanvol1/scratch/worms/ce4/cons.4way/ss time for C in `awk '{print $1}' /cluster/data/ce4/chrom.sizes` do if [ -s 
/cluster/data/ce4/bed/multiz4way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/ce4/bed/multiz4way/maf/${C}.maf \ --refseq /cluster/data/ce4/${chrN}/${C}.fa \ --in-format MAF --windows 4000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done & # real 1m1.269s # Create a random list of 50 1 mb regions cd /san/sanvol1/scratch/worms/ce4/cons.4way/ss ls -1l chr*/chr*.ss | awk '$5 > 4000000 {print $9;}' \ | randomLines stdin 50 ../randomSs.list # Only 28 lines in stdin # Set up parasol directory to calculate trees on these regions ssh pk mkdir /san/sanvol1/scratch/worms/ce4/cons.4way/treeRun1 cd /san/sanvol1/scratch/worms/ce4/cons.4way/treeRun1 # Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments # expected-lengths and target-coverage taken from ce2 history # for treeRun1 --expected-lengths 15 --target-coverage 0.60 # and using # /cluster/data/ce4/bed/multiz4way/cons.4way/starting-tree.mod # for treeRun2 --expected-lengths 15 --target-coverage 0.55 # and using # /cluster/data/ce4/bed/multiz4way/cons.4way/starting-tree.mod cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/ce4/bed/multiz4way/cons.4way/starting-tree.mod \ --gc 0.375 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-lengths 15 --target-coverage 0.55 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it mkdir tree log gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 28 of 28 jobs # CPU time in finished jobs: 3337s 55.62m 0.93h 0.04d 0.000 y # IO & Wait Time: 77s 1.28m 0.02h 0.00d 
0.000 y # Average job time: 122s 2.03m 0.03h 0.00d # Longest finished job: 193s 3.22m 0.05h 0.00d # Submission to last job: 193s 3.22m 0.05h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons.4way/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt 2>&1 ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/ce4/bed/multiz4way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .55 15 \ ave.cons.mod ave.noncons.mod # for treeRun2 --expected-lengths 15 --target-coverage 0.55 # (Solving for new omega: 15.000000 78.545736 43.417367 36.104783 # 35.511720 35.507213 ) # Transition parameters: gamma=0.550000, omega=15.000000, mu=0.066667, # nu=0.081481 # Relative entropy: H=0.656639 bits/site # Expected min. length: L_min=10.742794 sites # Expected max. length: L_max=9.457029 sites # Phylogenetic information threshold: PIT=L_min*H=7.054137 bits # Recommended expected length: omega=35.507213 sites (for L_min*H=9.780000) # for treeRun1 --expected-lengths 15 --target-coverage 0.60 # (Solving for new omega: 15.000000 143.489247 62.468129 43.654450 40.993458 # 40.920278 ) # Transition parameters: gamma=0.600000, omega=15.000000, mu=0.066667, # nu=0.100000 # Relative entropy: H=0.697543 bits/site # Expected min. length: L_min=9.302913 sites # Expected max. 
length: L_max=8.908666 sites # Phylogenetic information threshold: PIT=L_min*H=6.489180 bits # Recommended expected length: omega=40.920278 sites (for L_min*H=9.780000) ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/worms/ce4/cons/consRun1 cd /san/sanvol1/scratch/worms/ce4/cons/consRun1 # using the ave.cons.mod from treeRun2 above: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.312500 0.187500 0.187500 0.312500 RATE_MAT: -0.870936 0.168820 0.457329 0.244786 0.281367 -1.216280 0.173422 0.761491 0.762216 0.173422 -1.215832 0.280194 0.244786 0.456895 0.168116 -0.869797 TREE: (((cb3:0.088481,caeRem2:0.073770):0.023475,caePb1:0.091021):0.059375,ce4:0.059375); mkdir ppRaw bed # root1 == chrom name, file1 == ss file name without .ss suffix cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 15 --target-coverage 0.55 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. 
# These jobs are very fast and very I/O intensive # Completed: 29 of 29 jobs # CPU time in finished jobs: 375s 6.25m 0.10h 0.00d 0.000 y # IO & Wait Time: 88s 1.46m 0.02h 0.00d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 21s 0.35m 0.01h 0.00d # Submission to last job: 32s 0.53m 0.01h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/worms/ce4/cons/consRun1 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" # if that looks right (== in chrom and numerical order), then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > phastConsElements4way.bed # way too many negative # scores with --expected-length 15 --target-coverage 0.60 # OK with--expected-length 15 --target-coverage 0.55 cp -p phastConsElements4way.bed /cluster/data/ce4/bed/multiz4way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore02 cd /cluster/data/ce4 faSize */chr*.fa # 100281244 bases (0 N's 100281244 real 87008909 upper # 13272335 lower) in 7 sequences in 7 files # %13.24 masked total, %13.24 masked real cd /san/sanvol1/scratch/worms/ce4/cons/consRun2 # The 100281244 comes from the non-n genome as counted above. 
awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/100281244\n",100.0*sum/100281244,sum}' \ phastConsElements4way.bed # after fixing the two-model argument in the phastCons run # --expected-length 15 --target-coverage 0.55 # % 31.95 = 100.0*32036617/100281244 # --expected-length 15 --target-coverage 0.55 # % 2.46 = 100.0*2466194/100281244 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce4 \ -enrichment sangerGene:cds phastConsElements4way.bed # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements4way.bed 31.947%, both # 15.830%, cover 62.60%, enrich 1.96x # the broken single model phastCons run: # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements4way.bed 2.459%, both 0.872%, # cover 3.45%, enrich 1.40x # Load most conserved track into database ssh hgwdev cd /cluster/data/ce4/bed/multiz4way # the copy was already done above # cp -p /san/sanvol1/scratch/worms/ce4/cons/consRun2/phastConsElements4way.bed . 
time nice -n +19 hgLoadBed ce4 phastConsElements4way \ phastConsElements4way.bed # Loaded 535009 elements of size 5 # should measure the same as above time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \ phastConsElements4way # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements4way 31.947%, both 15.830%, # cover 62.60%, enrich 1.96x # the broken single model phastCons run: # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements4way 2.459%, both 0.872%, # cover 3.45%, enrich 1.40x # Create merged posterier probability file and wiggle track data files ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons.4way/consRun2 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode -noOverlap stdin \ phastCons4way.wig phastCons4way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 0m30.016s # -rw-rw-r-- 1 61537017 May 3 15:41 phastCons4way.wib # -rw-rw-r-- 1 11279406 May 3 15:41 phastCons4way.wig nice -n +19 cp -p phastCons4way.wi? 
/cluster/data/ce4/bed/multiz4way/ # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/worms/ce4/cons.4way/consRun2 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons4wayScores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons4wayScores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 1m14.178s # copy them for downloads ssh kkstore02 mkdir /cluster/data/ce4/bed/multiz4way/phastCons4wayScores cd /cluster/data/ce4/bed/multiz4way/phastCons4wayScores cp -p \ /san/sanvol1/scratch/worms/ce4/cons.4way/consRun2/phastCons4wayScores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/ce4 ln -s /cluster/data/ce4/bed/multiz4way/phastCons4wayScores . # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/ce4/bed/multiz4way ln -s `pwd`/phastCons4way.wib /gbdb/ce4/wib/phastCons4way.wib nice -n +19 hgLoadWiggle ce4 phastCons4way phastCons4way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/ce4/bed/multiz4way nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce4 phastCons4way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Worm Ce4 Histogram phastCons4way track" set xlabel " phastCons4way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS just with brenneri # (DONE - 2007-05-03 - Hiram) # Estimate phastCons parameters ssh kkstore02 mkdir /cluster/data/ce4/bed/multiz2way cd /cluster/data/ce4/bed/multiz2way /cluster/bin/phast/x86_64/tree_doctor \ --prune-all-but cb5161,elegans_ce4 \ --rename "cb5161 -> brenneri_caePb1" \ ../multiz5way/nematodes.nh > 2way.nh # the tree in use sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 2way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir maf cp -p ../blastz.caePb1/mafNet/*.maf.gz maf # second attempt with blastz.cb3 cd maf gunzip *.gz mkdir /cluster/data/ce4/bed/multiz2way/cons cd /cluster/data/ce4/bed/multiz2way/cons # Create a starting-tree.mod based on chrV (the largest one) # Set -windows large enough so it only 
makes one piece faSize ../../../V/chrV.fa # 20919398 bases (0 N's 20919398 real 18027882 upper # 2891516 lower) in 1 sequences in 1 files nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \ --refseq ../../../V/chrV.fa --in-format MAF \ --windows 30000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 cat ../tree-commas.nh # first time with (caePb1,ce4) # second time with (ce4,cb3) # this s1.*.ss should match only one item nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "(ce4,cb3)" \ --out-root starting-tree # first time with --tree "(caePb1,ce4)" \ rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.365 # This 0.365 is used in the --gc argument below # Create SS files on san filesystem (takes ~ 2h 20m) # Increasing their size this time from 1,000,000 to 4,000,000 to # slow down the phastCons pk jobs ssh kkstore02 mkdir -p /san/sanvol1/scratch/worms/ce4/cons2way/ss cd /san/sanvol1/scratch/worms/ce4/cons2way/ss time for C in `awk '{print $1}' /cluster/data/ce4/chrom.sizes` do if [ -s /cluster/data/ce4/bed/multiz2way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/ce4/bed/multiz2way/maf/${C}.maf \ --refseq /cluster/data/ce4/${chrN}/${C}.fa \ --in-format MAF --windows 4000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done & # real 0m38.178s # Create a random list of 50 1 mb regions cd /san/sanvol1/scratch/worms/ce4/cons2way/ss ls -1l chr*/chr*.ss | awk '$5 > 4000000 {print $9;}' \ | randomLines stdin 50 ../randomSs.list # Only 28 lines in stdin # Set up parasol directory to calculate trees on these regions ssh pk mkdir /san/sanvol1/scratch/worms/ce4/cons2way/treeRun1 cd /san/sanvol1/scratch/worms/ce4/cons2way/treeRun1 # Create template file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # 
Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments # expected-lengths and target-coverage taken from ce2 history # for treeRun1 --expected-lengths 25 --target-coverage 0.45 \ # /cluster/data/ce4/bed/multiz2way/cons/starting-tree.mod cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/ce4/bed/multiz2way/cons/starting-tree.mod \ --gc 0.365 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-lengths 25 --target-coverage 0.45 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod +x makeTree.csh # Make cluster job and run it mkdir tree log gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 27 of 27 jobs # CPU time in finished jobs: 1887s 31.45m 0.52h 0.02d 0.000 y # IO & Wait Time: 319s 5.31m 0.09h 0.00d 0.000 y # Average job time: 82s 1.36m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 333s 5.55m 0.09h 0.00d # Submission to last job: 3474s 57.90m 0.96h 0.04d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons2way/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt 2>&1 ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. 
cp -p ave.*.mod /cluster/data/ce4/bed/multiz2way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .45 25 \ ave.cons.mod ave.noncons.mod # for treeRun1 --expected-lengths 25 --target-coverage 0.45 cb3 # (Solving for new omega: 25.000000 22.541982 22.386815 22.386143 ) # Transition parameters: gamma=0.450000, omega=25.000000, mu=0.040000, nu=0.032727 # Relative entropy: H=0.186326 bits/site # Expected min. length: L_min=53.981235 sites # Expected max. length: L_max=39.938782 sites # Phylogenetic information threshold: PIT=L_min*H=10.058085 bits # Recommended expected length: omega=22.386143 sites (for L_min*H=9.780000) # for treeRun1 --expected-lengths 25 --target-coverage 0.45 caePb1 # (Solving for new omega: 25.000000 22.727583 22.596785 22.596318 ) # Transition parameters: gamma=0.450000, omega=25.000000, mu=0.040000, nu=0.032727 # Relative entropy: H=0.192979 bits/site # Expected min. length: L_min=52.008938 sites # Expected max. 
length: L_max=38.464113 sites # Phylogenetic information threshold: PIT=L_min*H=10.036610 bits # Recommended expected length: omega=22.596318 sites (for L_min*H=9.780000) ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 # using the ave.cons.mod,ave.noncons.mod from treeRun2 above: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.317500 0.182500 0.182500 0.317500 RATE_MAT: -0.861737 0.169946 0.448797 0.242994 0.295659 -1.241318 0.166727 0.778932 0.780784 0.166727 -1.242392 0.294881 0.242994 0.447733 0.169498 -0.860225 TREE: (ce4:0.088042,cb3:0.088042); ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.317500 0.182500 0.182500 0.317500 RATE_MAT: -0.861737 0.169946 0.448797 0.242994 0.295659 -1.241318 0.166727 0.778932 0.780784 0.166727 -1.242392 0.294881 0.242994 0.447733 0.169498 -0.860225 TREE: (ce4:0.281698,cb3:0.281698); mkdir ppRaw bed # root1 == chrom name, file1 == ss file name without .ss suffix cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 25 --target-coverage 0.45 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 
's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive # Completed: 29 of 29 jobs # CPU time in finished jobs: 312s 5.20m 0.09h 0.00d 0.000 y # IO & Wait Time: 78s 1.30m 0.02h 0.00d 0.000 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 19s 0.32m 0.01h 0.00d # Submission to last job: 38s 0.63m 0.01h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" # if that looks right (== in chrom and numerical order), then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > phastConsElements2way.bed # OK with--expected-length 25 --target-coverage 0.45 cp -p phastConsElements2way.bed /cluster/data/ce4/bed/multiz2way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore02 cd /cluster/data/ce4 faSize */chr*.fa # 100281244 bases (0 N's 100281244 real 87008909 upper # 13272335 lower) in 7 sequences in 7 files # %13.24 masked total, %13.24 masked real cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 # The 100281244 comes from the non-n genome as counted above. 
awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/100281244\n",100.0*sum/100281244,sum}' \ phastConsElements2way.bed # --expected-length 15 --target-coverage 0.55 # % 0.54 = 100.0*537580/100281244 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce4 \ -enrichment sangerGene:cds phastConsElements2way.bed # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements2way.bed 2.459%, both 0.872%, # cover 3.45%, enrich 1.40x # Load most conserved track into database ssh hgwdev cd /cluster/data/ce4/bed/multiz2way # the copy was already done above # cp -p /san/sanvol1/scratch/worms/ce4/cons2way/consRun2/phastConsElements2way.bed . time nice -n +19 hgLoadBed ce4 phastConsElements2way \ phastConsElements2way.bed # Loaded 51538 elements of size 5 # should measure the same as above time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \ phastConsElements2way # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements2way 2.459%, both 0.872%, # cover 3.45%, enrich 1.40x # Create merged posterier probability file and wiggle track data files ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode -noOverlap stdin \ phastCons2way.wig phastCons2way.wib # Converted stdin, upper limit 0.93, lower limit 0.00 # real 0m26.060s # -rw-rw-r-- 1 49349613 May 3 11:02 phastCons2way.wib # -rw-rw-r-- 1 11327787 May 3 11:02 phastCons2way.wig nice -n +19 cp -p phastCons2way.wi? 
/cluster/data/ce4/bed/multiz2way/ # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun2 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons2wayScores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons2wayScores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 1m14.178s # copy them for downloads ssh kkstore02 # this directory is actually a symlink from store9 to store8 to # avoid the data full problem on store9 mkdir /cluster/data/ce4/bed/multiz2way/phastCons2wayScores cd /cluster/data/ce4/bed/multiz2way/phastCons2wayScores cp -p /san/sanvol1/scratch/worms/ce4/cons2way/consRun2/phastCons2wayScores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/ce4 ln -s /cluster/data/ce4/bed/multiz2way/phastCons2wayScores . # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/ce4/bed/multiz2way ln -s `pwd`/phastCons2way.wib /gbdb/ce4/wib/phastCons2way.wib nice -n +19 hgLoadWiggle ce4 phastCons2way phastCons2way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/ce4/bed/multiz2way nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce4 phastCons2way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Worm Ce4 Histogram phastCons2way track" set xlabel " phastCons2way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################ # ADJUST KIMEXPDISTANCE NAMES TO MATCH SANGERCANONICAL (DONE 7/12/07 angie) # kimExpDistance has all [A-Z]+[0-9]+\.[0-9]+ names, but now sangerCanonical # has some of the new [A-Z]+[0-9]+\.[0-9]+\.[0-9]+ names, so when hgNear # looks for a canonical name in kimExpDistance, it often can't find it. # Canonical also has [A-Z]+[0-9]+\.[0-9]+[a-z] # and [A-Z]+[0-9]+\.[0-9]+[a-z]\.[0-9]+ # kimExpDistance has 4 [A-Z]+[0-9]+\.[0-9]+[A-Z] # Update the names in kimExpDistance to match sangerCanonical. 
ssh hgwdev
    # Look at just the names and how they differ between the two sources:
    hgsql ce4 -N -e 'select transcript from sangerCanonical' \
    | sort > /tmp/canonicalNames
    hgsql ce4 -N -e 'select * from kimExpDistance' \
      > /tmp/kimExpDistance
    awk -F"\t" '{print $1;}' /tmp/kimExpDistance \
    | uniq | sort -u > /tmp/kimExpNames
    sdiff /tmp/canonicalNames /tmp/kimExpNames | less
    comm -23 /tmp/canonicalNames /tmp/kimExpNames | wc -l
#7529
    comm -13 /tmp/canonicalNames /tmp/kimExpNames | wc -l
#6698
    # Pretty significant non-overlap, but I think much of it can be
    # resolved by slight tweaks to kimExpDistance names so that they
    # match the latest, more-specifically-versioned canonical transcripts.
    # Also, kimExpDistance has many names with no match in canonical,
    # which means many thousands of unnecessary rows bloating the table.
    cat > fixExpDistance.pl <<'_EOF_'
#!/bin/env perl
use warnings;
use strict;
my @kimExpNames;
my @kimSpecial;
open(F, "/tmp/kimExpNames") || die;
while (<F>) {
  chomp;
  push @kimExpNames, $_;
  push @kimSpecial, $_ if ($_ =~ /\dA$/);
}
close(F);
my %kim2canon;
open(F, "/tmp/canonicalNames") || die;
while (<F>) {
  chomp;
  my $kim = $_;
  my @special = grep m/^$kim$/i, @kimSpecial;
  if (scalar(@special)) {
    $kim2canon{$special[0]} = $kim;
  } else {
    my $simple = $kim;
    $simple =~ s/^(\w+\.\d+).*$/$1/;
    $kim2canon{$simple} = $kim;
  }
}
close(F);
while (<>) {
  chomp;
  my ($query, $target, $distance) = split('\t');
  my $canonicalQ = $kim2canon{$query};
  my $canonicalT = $kim2canon{$target};
  if (defined $canonicalQ && defined $canonicalT) {
    print "$canonicalQ\t$canonicalT\t$distance\n";
  }
}
'_EOF_'
    # << emacs
    chmod a+x fixExpDistance.pl
    ./fixExpDistance.pl /tmp/kimExpDistance > kimExpDistanceFixedNames.tab
    # Look the file over, grep for some IDs in /tmp/canonicalNames.
    # Load into the db as kimExpDistanceTest.  It will take a while
    # because it's still 12M rows (down from 19M).
time nice hgLoadSqlTab ce4 kimExpDistanceTest \ ~/kent/src/hg/lib/distance.sql kimExpDistanceFixedNames.tab #0.000u 0.000s 2:28.94 0.0% 0+0k 0+0io 0pf+0w # Test hgNear with kimExpDistanceTest instead of kimExpDistance. # Don't delete the original table just yet, but do clean it up soon # because it's quite large. hgsql ce4 -e 'rename table kimExpDistance to kimExpDistanceOrig;' hgsql ce4 -e 'rename table kimExpDistanceTest to kimExpDistance;' runJoiner.csh ce4 kimExpDistance # Test hgNear again with kimExpDistance. ############################################################################ ## fixup the sangerPep names to match sangerCanonical ## this is a dead end, it doesn't actually fix anything ssh hgwdev mkdir /cluster/data/ce4/bed/fixupSangerPep cd /cluster/data/ce4/bed/fixupSangerPep ## current sangerPep names cut -f1 ../sangerGene/sangerPep.tab | sort -u > sangerPep.names.original.txt ## and canonical names hgsql -N -e "select transcript from sangerCanonical;" ce4 \ | sort -u > sangerCanonical.transcript.txt # line counts wc -l *.txt # 19965 sangerCanonical.transcript.txt # 23224 sangerPep.names.original.txt ## unique to canonical comm -23 sangerCanonical.transcript.txt sangerPep.names.original.txt | wc -l # 1630 # add all canonical names to peptides by duplicating the peptides ~/kent/src/hg/utils/wormStuff/oneToTwo.pl newPeptides.tab 2> warnings.txt # verify this is a new superset of peptides cut -f1 newPeptides.tab | sort -u > sangerPep.newNames.txt comm -12 sangerPep.names.original.txt sangerPep.newNames.txt | wc -l # 23224 - same number as was in sangerPep.names.original.txt # new names added: comm -13 sangerPep.names.original.txt sangerPep.newNames.txt | wc -l # 1593 # the missing peptides are for canonical non-coding genes comm -23 sangerCanonical.transcript.txt sangerPep.newNames.txt | wc -l # 37 ############################################################################ ## fixup sangerCanonical names to remove the second dot name ssh hgwdev cd 
/cluster/data/ce4/bed/fixupSangerCanonical
    hgsql -e "select chrom,txStart,txEnd,name from sangerGene;" ce4 \
        | sort -k1,1 -k2,2n > sangerGene.names.txt
    hgsql -N -e "select * from sangerCanonical;" ce4 > sangerCanonical.tab
    cat << '_EOF_' > fixup.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %genePositions;
open (FH,"<sangerGene.names.txt") or die "can not read sangerGene.names.txt";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $name) = split('\s+',$line);
    die "duplicate name in sangerGene $name"
        if (exists($genePositions{$name}));
    $genePositions{$name} = sprintf "%s\t%d\t%d", $chr, $start, $end;
}
close (FH);

my %names;
open (FH,"<sangerCanonical.tab") or die "can not read sangerCanonical.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $score, $name, $prot) = split('\s+',$line);
    my ($a, $b, $c, $d) = split('\.',$name);
    die "found four part name at $line" if (defined($d));
    my $twoPartName = $name;
    if (defined($c)) {
        $twoPartName = sprintf "%s.%s", $a, $b;
    }
    die "ERROR: two part name clash at $line"
        if (exists($names{$twoPartName}));
    $names{$twoPartName} = $line;
    if (!defined($prot)) {
        print STDERR "# no prot: $line\n";
        my $prot = "";
    }
    die "can not find position for $twoPartName"
        if (!exists($genePositions{$twoPartName}));
    my $pos = $genePositions{$twoPartName};
    my ($existChr, $realStart, $realEnd) = split('\s+', $pos);
    die "not on same chrom ? $twoPartName Name $chr and $existChr"
        if ($existChr ne $chr);
    my $startDiff = $start - $realStart;
    $startDiff = - $startDiff if ($startDiff < 0);
    my $endDiff = $end - $realEnd;
    $endDiff = - $endDiff if ($endDiff < 0);
    die "start more than 100000 off $twoPartName $pos" if ($startDiff > 100000);
    die "end more than 20000 off $twoPartName $pos" if ($endDiff > 20000);
    printf "%s\t%d\t%d\t%d\t%s\t%s\n", $chr, $realStart, $realEnd,
        $score, $twoPartName, $prot;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./fixup.pl
    ./fixup.pl > fixed.tab
    hgsql -e 'delete from sangerCanonical where chrom like "chr%";' ce4
    hgsql -e \
        'load data local infile "fixed.tab" into table sangerCanonical;' ce4
    ## reset indexes
    hgsql -e "analyze table sangerCanonical;" ce4

############################################################################
## fixup sangerLinks table (DONE - 2007-07-25 - Hiram)
## it was missing descriptions for the one dot names where only two dot
## names existed
    mkdir /cluster/data/ce4/bed/fixupSangerLinks
    cd /cluster/data/ce4/bed/fixupSangerLinks
    ln -s ../sangerGene/sangerLinks.tab .
cat << '_EOF_' > slFixup.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %newNeeded;
my %existing;
my $newCount = 0;
open (FH,"<sangerLinks.tab") or die "can not read sangerLinks.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($name, $orf, $desc) = split('\t',$line);
    my ($a, $b, $c, $d) = split('\.', $name);
    die "ERROR: found three dot name at $line" if (defined($d));
    if (defined($c)) {
        my $oneDotName = sprintf "%s.%s", $a, $b;
        if (exists($newNeeded{$oneDotName})) {
            my $newString = sprintf "%s\t%s", $orf, $desc;
            die "ERROR: new one dot name, different descr $name"
                if ($newNeeded{$oneDotName} ne $newString)
        } else {
            $newNeeded{$oneDotName} = sprintf "%s\t%s", $orf, $desc;
            ++$newCount;
        }
    }
    die "ERROR: duplicate name $name at $line" if (exists($existing{$name}));
    $existing{$name} = sprintf "%s\t%s", $orf, $desc;
}
close (FH);
foreach my $key (keys %newNeeded) {
    die "ERROR: duplicate new name $key for $newNeeded{$key}"
        if (exists($existing{$key}));
    printf "%s\t%s\n", $key, $newNeeded{$key};
}
foreach my $key (keys %existing) {
    die "ERROR: duplicate existing name $key for $existing{$key}"
        if (exists($newNeeded{$key}));
    printf "%s\t%s\n", $key, $existing{$key};
}
'_EOF_'
    # << happy emacs
    chmod +x ./slFixup.pl
    ./slFixup.pl > fixedSangerLinks.tab
    hgLoadSqlTab ce4 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
        fixedSangerLinks.tab

#############################################################################
## creating xxBlastTab tables (WORKING - 2007-07-31 - Hiram)
    # Make sure there are no joinerCheck errors at this point, because
    # if left here they can spread via doHgNearBlastp:
    ssh hgwdev
    runJoiner.csh ce4 sangerPep
# ce4.orfToGene.name - hits 27812 of 27812 ok
# ce4.sangerBlastTab.query - hits 1214556 of 1214556 ok
# ce4.sangerBlastTab.target - hits 1214556 of 1214556 ok
# ce4.sangerCanonical.transcript - hits 19965 of 19965 ok
# ce4.sangerIsoforms.transcript - hits 30497 of 30497 ok
# ce4.sangerLinks.orfName - hits 29778 of 30777 ok
# ce4.sangerPep.name - hits 23224 of 23224 ok
# ce4.sangerToRefSeq.name - hits 30336 of 30336 ok
    ssh hgwdev
    mkdir /cluster/data/ce4/bed/hgNearBlastp
    cd
/cluster/data/ce4/bed/hgNearBlastp mkdir 070731 cd 070731 # Get the proteins used by the other hgNear organisms: pepPredToFa hg18 knownGenePep hg18.known.faa pepPredToFa mm8 knownGenePep mm8.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer4 ensPep danRer4.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce4 sangerPep ce4.sangerPep.faa pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa # sanity check, number of lines in each faa file wc -l *.faa # 238243 ce4.sangerPep.faa # 435532 danRer4.ensPep.faa # 258316 dm3.flyBasePep.faa # 526563 hg18.known.faa # 315528 mm8.known.faa # 86220 rn4.known.faa # 64956 sacCer1.sgdPep.faa # sanity check, count the number of protein sequences in each for F in *.faa do echo -n "${F} " grep "^>" ${F} | wc -l done # ce4.sangerPep.faa 23224 # danRer4.ensPep.faa 36048 # dm3.flyBasePep.faa 20279 # hg18.known.faa 45480 # mm8.known.faa 30716 # rn4.known.faa 8160 # sacCer1.sgdPep.faa 5766 cd /cluster/data/ce4/bed/hgNearBlastp cat << '_EOF_' > config.ra # Latest worm vs. 
other Gene Sorter orgs:
# human, mouse, rat, zebrafish, fly, yeast
targetGenesetPrefix sanger
targetDb ce4
queryDbs hg18 mm8 rn4 danRer4 dm3 sacCer1
recipBest hg18 mm8 rn4 danRer4 dm3 sacCer1
ce4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/ce4.sangerPep.faa
hg18Fa /cluster/data/ce4/bed/hgNearBlastp/070731/hg18.known.faa
mm8Fa /cluster/data/ce4/bed/hgNearBlastp/070731/mm8.known.faa
rn4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/rn4.known.faa
danRer4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/danRer4.ensPep.faa
dm3Fa /cluster/data/ce4/bed/hgNearBlastp/070731/dm3.flyBasePep.faa
sacCer1Fa /cluster/data/ce4/bed/hgNearBlastp/070731/sacCer1.sgdPep.faa
buildDir /cluster/data/ce4/bed/hgNearBlastp/070731
scratchDir /san/sanvol1/scratch/ce4HgNearBlastp
'_EOF_'
    # << happy emacs

    # Run this in a screen since it is going to take awhile
    # Run with -noLoad so we can eyeball files, manually load dm3 tables now,
    # and later overload other databases' dmBlastTab tables.
    doHgNearBlastp.pl -noLoad config.ra > do.log 2>&1 &
    # watch progress in do.log

#########################################################################
# CE4->CE5 LIFTOVER (DONE 10/11/07 angie)
    ssh kkstore02
    cd /cluster/data/ce4
    doSameSpeciesLiftOver.pl ce4 ce5 -debug
    cd bed/blat.ce5.2007-10-11
    doSameSpeciesLiftOver.pl ce4 ce5 -verbose 3 >& do.log &
    tail -f do.log

#########################################################################
# CE4->CE6 LIFTOVER (DONE - 2008-06-24 - Hiram)
    ssh kkstore02
    cd /cluster/data/ce4
    cp -p /cluster/data/ce6/ce6.2bit /cluster/bluearc/ce6
    doSameSpeciesLiftOver.pl -bigClusterHub=kk -workhorse=hgwdev ce4 ce6 -debug
    cd bed/blat.ce6.2008-06-24
    doSameSpeciesLiftOver.pl -bigClusterHub=kk -workhorse=hgwdev ce4 ce6 \
        -verbose=3 > do.log 2>&1 &
    # this takes about 10 minutes
    tail -f do.log
    rm -f /cluster/bluearc/ce6/ce6.2bit

#########################################################################
# Nucleosome tracks (DONE - 2008-04-08 - Hiram)
    ssh kkstore02
    mkdir
/cluster/data/ce4/bed/monoNucleosomes cd /cluster/data/ce4/bed/monoNucleosomes # download everything into stanford sub-directory mkdir stanford cd stanford # track data was delivered via the following URL: # http://stellar.stanford.edu/C_elegans_mononucleosomes/index.html # Uncollapsed Track files by chromosome for C in I II III IV V X M do wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/mono_nucleosomes_chr${C}_forward.bed.gz wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/mono_nucleosomes_chr${C}_reverse.bed.gz done # Collapsed nucleosome track files by chromosome: wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M.nucl.for.bed.gz wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M.nucl.rev.bed.gz # Collapsed control track files by chromosome: wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M.cont.paired.for.bed.gz wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M.cont.paired.rev.bed.gz # Positioning stringency wig track wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M_positioning_stringency.wig.gz # encode that zcat chrI-M_positioning_stringency.wig.gz \ | wigEncode stdin stringency.wig stringency.wib # Adjusted nucleosome coverage wig track: wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/adjusted_coverage_chrI-M.wig.gz wigEncode adjusted_coverage_chrI-M.wig.gz nucleosomeAdjustedCoverage.wig \ nucleosomeAdjustedCoverage.wib # Track files for the whole genome wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/mono_nucleosomes_all_forward.bed.gz wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/mono_nucleosomes_all_reverse.bed.gz # time for all of the above: # real 2m20.465s # resulting files: # -rw-rw-r-- 1 386656573 Feb 29 12:08 mono_nucleosomes_all_forward.bed.gz # 
-rw-rw-r-- 1 382914445 Feb 29 12:08 mono_nucleosomes_all_reverse.bed.gz # -rw-rw-r-- 1 166189150 Mar 17 15:30 chrI-M.cont.paired.for.bed.gz # -rw-rw-r-- 1 130560514 Mar 17 15:30 chrI-M.nucl.for.bed.gz # -rw-rw-r-- 1 135184112 Mar 17 15:30 chrI-M.nucl.rev.bed.gz # -rw-rw-r-- 1 381477404 Mar 19 15:10 chrI-M_positioning_stringency.wig.gz # -rw-rw-r-- 1 470000654 Mar 24 14:55 adjusted_coverage_chrI-M.wig.gz # -rw-rw-r-- 1 175019114 Apr 3 11:37 chrI-M.cont.paired.rev.bed.gz # shorten the names which indicate counts: cd .. # Collapsed control track files by chromosome: zcat stanford/chrI-M.cont.paired.rev.bed.gz | grep "^chr" | awk ' { gsub("count=","",$4) printf "%s\t%d\t%d\tn = %d\t%d\t-\t%d\t%d\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$4,$7,$8,$9,$10,$11,$12 } ' | gzip > chrI-M.rev.paired.bed.gz zcat chrI-M.rev.paired.bed.gz | sort -k1,1 -k2,2n \ | bedItemOverlapCount ce4 stdin \ | wigEncode stdin nucleosomeControlAntisenseCoverage.wig \ nucleosomeControlAntisenseCoverage.wib zcat stanford/chrI-M.cont.paired.for.bed.gz | grep "^chr" | awk ' { gsub("count=","n = ",$4) printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12 } ' | gzip > chrI-M.for.paired.bed.gz zcat chrI-M.for.paired.bed.gz | sort -k1,1 -k2,2n \ | bedItemOverlapCount ce4 stdin \ | wigEncode stdin nucleosomeControlSenseCoverage.wig \ nucleosomeControlSenseCoverage.wib # Collapsed nucleosome track files by chromosome: zcat stanford/chrI-M.nucl.for.bed.gz | grep "^chr" | awk ' { gsub("count=","n = ",$4) printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12 } ' | gzip > chrI-M.for.clean.bed.gz zcat stanford/chrI-M.nucl.rev.bed.gz | grep "^chr" | awk ' { gsub("count=","n = ",$4) printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12 } ' | gzip > chrI-M.rev.clean.bed.gz ########################################################### # loading track data ssh hgwdev cd 
/cluster/data/ce4/bed/monoNucleosomes # Collapsed control track files by chromosome: LOAD="hgLoadBed -tab -noNameIx ce4" zcat stanford/chrI-M.for.paired.bed.gz stanford/chrI-M.rev.paired.bed.gz \ | ${LOAD} nucleosomeControl stdin # Collapsed nucleosome track files by chromosome: zcat chrI-M.for.clean.bed.gz \ | ${LOAD} nucleosomeFragmentsSense stdin zcat chrI-M.rev.clean.bed.gz \ | ${LOAD} nucleosomeFragmentsAntisense stdin # these three tracks grouped into the composite: "nucleosome" # labels: # composite container: Nucleosome predictions from SOLidD core alignments # Control - : mononucleosome control # fragments + : mononucleosomal fragments, sense strand reads # fragments - : mononucleosomal fragments, antisense strand reads # Positioning stringency wig track hgLoadWiggle ce4 nucleosomeStringency stringency.wig # NSome Stringency # Nucleosome Positioning Stringency from SOLiD core alignments # Adjusted nucleosome coverage wig track: hgLoadWiggle ce4 nucleosomeAdjustedCoverage \ nucleosomeAdjustedCoverage.wig # Adj NSome Covrg # Adjusted nucleosome coverage (25bp) # MNase Coverage # Micrococcal nuclease control coverage hgLoadWiggle ce4 nucleosomeControlSenseCoverage \ nucleosomeControlSenseCoverage.wig # Cntl MNase +Covg # Micrococcal nuclease control coverage, sense strand reads hgLoadWiggle ce4 nucleosomeControlAntisenseCoverage \ nucleosomeControlAntisenseCoverage.wig # Cntl MNase -Covg # Micrococcal nuclease control coverage, antisense strand reads # these two sets of files were "cleaned" of references to chrK-12 # and chr positions off the end of chromosomes with the following script: for I in forward reverse do cat ../../../chrom.sizes | while read S do C=`echo ${S} | awk '{print $1}'` SZ=`echo ${S} | awk '{print $2}'` echo "checking $C for $SZ" zcat mono_nucl*_${C}_${I}.bed.gz | grep -w "^${C}" \ | grep -v "chrK-12" | awk '$3 <= '"${SZ}"'' \ | sort -k1,1 -k2,2n | gzip > clean/${C}.${I}.bed.gz done done zcat stanford/clean/chr*.forward.bed.gz \ | 
bedItemOverlapCount ce4 stdin \
        | wigEncode stdin monoNucleosomesSense.wig monoNucleosomesSense.wib
    zcat stanford/mono_nucleosomes_chr*_reverse.bed.gz | grep "^chr" \
        | bedItemOverlapCount ce4 stdin \
        | wigEncode stdin monoNucleosomesAntiSense.wig monoNucleosomesAntiSense.wib
    # NSome Coverage
    # Coverage of nucleosome predictions from SOLiD core alignments
    # Control
    # mononucleosome control
    hgLoadWiggle ce4 monoNucleosomesSense monoNucleosomesSense.wig
    # NSome Core +Covg
    # Coverage of mononucleosomal fragments, sense strand reads
    hgLoadWiggle ce4 monoNucleosomesAntiSense monoNucleosomesAntiSense.wig
    # NSome Core -Covg
    # Coverage of mononucleosomal fragments, antisense strand reads

#########################################################################
# 25mers repeat track (DONE - 2008-04-16 - Hiram)
    # data from Anton Valouev "valouev at stanford.edu"
    # to accompany the nucleosomes business
    mkdir /cluster/data/ce4/bed/25mersRepeats
    cd /cluster/data/ce4/bed/25mersRepeats
    wget --timestamping \
http://stellar.stanford.edu/C_elegans_mononucleosomes/all_chr_repeat.bed.gz
    # add a better score field and a better name
    zcat all_chr_repeat.bed.gz | grep ^chr | awk '
{
printf "%s\t%d\t%d\t%d\t1\t%s\t%d\t%d\t%s\n", $1, $2, $3, NR, $6, $7, $8, $9
}
' > withScore.bed
    hgLoadBed ce4 25mersRepeats withScore.bed
    # create a description page for it.

###########################################################################
# LOAD Transcriptome data (DONE - 2008-04-15 - Hiram)
    # data from Christian Iseli 'Christian.Iseli at licr.org'
    ssh hgwdev
    mkdir /cluster/data/ce4/bed/transcriptome
    cd /cluster/data/ce4/bed/transcriptome
    wget --timestamping ftp://ftp.licr.org/pub/databases/trome/ce4/WTr.gtf.gz
    wget --timestamping ftp://ftp.licr.org/pub/databases/trome/ce4/txg.tar.gz
    gtfToGenePred -genePredExt WTr.gtf.gz WTr.gp
    hgLoadGenePred ce4 transcriptome -genePredExt WTr.gp
    tar xvzf txg.tar.gz
    # Do a little data cleanup and transformation and
    # load splice graphs into database.
sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql \ > sibTxGraph.sql cat txg/*.txg | txgToAgx stdin stdout \ | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql ce4 sibTxGraph stdin # Loaded 20553 elements of size 18 # Create sibAltEvents track for analysed alt-splices. cat txg/*.txg \ | txgAnalyze stdin /cluster/data/ce4/ce4.2bit stdout \ | awk '$2 >= 0' | sort | uniq > sibAltEvents.bed hgLoadBed ce4 sibAltEvents sibAltEvents.bed # Loaded 18447 elements of size 6 ######################################################################### # LIFTOVER TO ce10 (DONE - 2011-05-24 - Hiram ) mkdir /hive/data/genomes/ce4/bed/blat.ce10.2011-05-24 cd /hive/data/genomes/ce4/bed/blat.ce10.2011-05-24 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce4/jkStuff/11.ooc -debug ce4 ce10 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce4/jkStuff/11.ooc ce4 ce10 > do.log 2>&1 # about 4 minutes # verify it works on genome-test #############################################################################