# for emacs: -*- mode: sh; -*-


# This file describes how to make the browser database for the
# worm C. elegans

###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2005-04-29, hiram)
    ssh eieio
    mkdir /cluster/store5/worm/ce3
    cd /cluster/store5/worm/ce3
    mkdir WS140
    cd WS140
for F in composition.all masked totals
do
    FTP="ftp://ftp.wormbase.org/pub/wormbase/elegans/WS140/CHROMOSOMES/"
    wget "${FTP}/${F}" -O ${F}
done

    #	while fetching chrM, rename it from CHROMOSOME_MtDNA to
    #	CHROMOSOME_M to match the pattern of the others
    FTP="ftp://ftp.wormbase.org/pub/wormbase/elegans/WS140/CHROMOSOMES/"
    wget "${FTP}/CHROMOSOME_MtDNA.dna.gz" -O /dev/stdout | zcat | \
	sed -e "s/MtDNA/M/" | gzip > chrM.fa.gz
    wget "${FTP}/CHROMOSOME_MtDNA.gff.gz" -O /dev/stdout | zcat | \
	sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" | gzip > chrM.gff.gz

    #	create agp file for chrM
    echo "M 1 13794 1 F X54252.1 1 13794 +" | sed -e "s/  */\t/g" > \
	chrM.agp

for C in I II III IV V X
do
    FTP="ftp://ftp.wormbase.org/pub/wormbase/elegans/WS140/CHROMOSOMES/"
    wget --timestamping "${FTP}/CHROMOSOME_${C}.dna.gz" -O chr${C}.fa.gz
    wget --timestamping "${FTP}/CHROMOSOME_${C}.gff.gz" -O chr${C}.gff.gz
    wget --timestamping "${FTP}/CHROMOSOME_${C}.agp" -O chr${C}.agp
    echo "done: chr${C}"
done

    faCount *.fa.gz
# #seq                len        A        C        G         T N       cpg
# CHROMOSOME_I   15080552  4838583  2697189  2693560   4851220 0       503702
# CHROMOSOME_II  15279311  4878196  2769211  2762192   4869712 0       492145
# CHROMOSOME_III 13783317  4444530  2449080  2466259   4423448 0       459652
# CHROMOSOME_IV  17493785  5711040  3034769  3017007   5730969 0       522372
# CHROMOSOME_V   20922231  6749806  3711722  3700960   6759743 0       638852
# CHROMOSOME_X   17718850  5746417  3119281  3118287   5734865 0       514715
# chrM              13794     4335     1225     2055     6179  0       110
# total         100291840 32372907 17782477 17760320 323761360        3131548

    cd /cluster/store5/worm/ce3
    # translate to unzipped .fa, all upper case, and
    # rename the agp files so hgGoldGap can find them
    mkdir sangerFa

for C in I II III IV V X M
do
    echo -n "${C} "
    zcat WS140/chr${C}.fa.gz | tr '[a-z]' '[A-Z]' | \
	    sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${C}.fa
    mkdir -p sangerFa/${C}
    ln -s ${C} sangerFa/chr${C}
    sed -e "s/^/chr/" `pwd`/WS140/chr${C}.agp > sangerFa/${C}/chr${C}.agp
done

#    ln -s `pwd`/WS140/chr${C}.agp sangerFa/${C}/chr${C}.agp

    #	verify faCount still has the same amount of sequence as listed above
    cd sangerFa
    faCount *.fa
    #	you should see the same numbers
    #	the assembly has no N's:
    grep N *.fa
    #	shows nothing

    #	set cluster data symlink for future reference:
    ssh hgwdev
    ln -s /cluster/store5/worm/ce3 /cluster/data/ce3

###########################################################################
# PREPARE Split contigs into 100,000 bp chunks for cluster runs
# (DONE, 2004-04-29, Hiram)
    # next machine
    ssh eieio
    cd /cluster/data/ce3
    rm -fr ./split
    mkdir split
for C in I II III IV V X M
do
    mkdir split/${C}
    faSplit size sangerFa/chr${C}.fa 100000 split/${C}/c -lift=split/chr${C}.lft
done
151 pieces of 151 written
153 pieces of 153 written
138 pieces of 138 written
175 pieces of 175 written
210 pieces of 210 written
178 pieces of 178 written
1 pieces of 1 written

   cat split/*.lft > liftAll.lft
   # copy them to /iscratch/i for cluster rsync
   # next machine
   ssh kkr1u00
   cd /cluster/data/ce3/split
for C in I II III IV V X M
do
    echo -n "${C} "
    mkdir -p /iscratch/i/worms/Celegans3/unmaskedSplit/${C}
    cp -p ${C}/c*.fa /iscratch/i/worms/Celegans3/unmaskedSplit/${C}
done
    #	this iSync takes forever these days because iscratch is so large
    /cluster/bin/iSync
    #	instead, a simple rsync of just this business will get it done
    #	rapidly
for i in 2 3 4 5 6 7 8
do
    rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \
	kkr${i}u00:/iscratch/i/worms/
done

############################################################################
# Run RepeatMasker on the chromosomes (DONE, 2005-04-29 - Hiram)
    # next machine
    ssh kk
    cd /cluster/data/ce3
    mkdir jkStuff
    # make run directory and job list, create the script to use 
    # for the RepeatMasker run
    cat << '_EOF_' > jkStuff/RMWorm
#!/bin/csh -fe
#
#       This is a slight rearrangement of the
#       RMChicken script used in makeGalGal2.doc
#       The results here need to go to a different location
#       $1 == chrom name: I II III IV V X M
#       $2 == directory where split contig .fa is found
#       $3 == name of contig .fa file
cd $1
pushd .
cd $2
/bin/mkdir -p /tmp/ce3/$3/$1
/bin/cp $3 /tmp/ce3/$3/$1
cd /tmp/ce3/$3/$1
/cluster/bluearc/RepeatMasker050112/RepeatMasker -alignments -s -species elegans $3
popd
/bin/cp /tmp/ce3/$3/$1/$3.out ./
if( -e /tmp/ce3/$3/$1/$3.align ) /bin/cp /tmp/ce3/$3/$1/$3.align ./
if (-e /tmp/ce3/$3/$1/$3.tbl) /bin/cp /tmp/ce3/$3/$1/$3.tbl ./
if (-e /tmp/ce3/$3/$1/$3.cat) /bin/cp /tmp/ce3/$3/$1/$3.cat ./
/bin/rm -r /tmp/ce3/$3/$1
/bin/rmdir --ignore-fail-on-non-empty /tmp/ce3/$3
/bin/rmdir --ignore-fail-on-non-empty /tmp/ce3
'_EOF_'
    # emacs happy
    chmod +x jkStuff/RMWorm
    # create job list
    mkdir RMRun
    rm -f RMRun/jobList
for C in I II III IV V X M
do
    mkdir /cluster/data/ce3/RMRun/${C}
    for T in /iscratch/i/worms/Celegans3/unmaskedSplit/$C/c*.fa
    do
	D=`dirname $T`
	F=`basename $T`
	echo /cluster/data/ce3/jkStuff/RMWorm ${C} ${D} ${F} \
'{'check out line+ /cluster/data/ce3/RMRun/$C/${F}.out'}'
    done >> RMRun/jobList
done
    # Do the run
    cd /cluster/data/ce3/RMRun
    para create jobList
    para try, para check, para check, para push, para check, ...
XXXX
    #  STARTED 2005-04-29 14:28
# Completed: 1006 of 1006 jobs
# CPU time in finished jobs:    1129254s   18820.90m   313.68h   13.07d  0.036 y
# IO & Wait Time:                  4664s      77.73m     1.30h    0.05d  0.000 y
# Average job time:                1127s      18.79m     0.31h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1290s      21.50m     0.36h    0.01d
# Submission to last job:          3575s      59.58m     0.99h    0.04d

    # when they are finished, liftUp and load the .out files into the database:
    # next machine
    ssh eieio
    cd /cluster/data/ce3/RMRun
for C in I II III IV V X M
do
    liftUp chr${C}.fa.out /cluster/data/ce3/split/chr${C}.lft warn ${C}/*.fa.out
done

##########################################################################
# CREATE DATABASE
    ssh eieio
    cd /cluster/data/ce3
    faToTwoBit sangerFa/chr*.fa ce3.2bit
    twoBitInfo ce3.2bit stdout | sort -rn +1 > chrom.sizes
    grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst
    twoBitInfo ce3.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/ce3/ce3.2bit\n", $1,$2}' > chromInfo.tab

    ssh hgwdev
    cd /cluster/data/ce3
    hgsql -e "create database ce3;" mysql
    #	Make sure we have enough room (eventually ~ 70Gb) for mysql tables:
    df -h | grep mysql
    #	/dev/sda1             472G  230G  218G  52% /var/lib/mysql2
    #	/dev/sdc1             1.8T  904G  756G  55% /var/lib/mysql

    # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2005-03-10 - Hiram)
    #   Use any of the newest databases to ensure that the organization
    #   of the grp table is up to date
    ssh hgwdev
    cd /cluster/data/ce3
    hgsql ce3 -e "create table grp (PRIMARY KEY(NAME)) select * from mm6.grp"
    hgsql ce3 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql ce3 -e 'load data local infile "chromInfo.tab" into table chromInfo;'

    # Enter ce3 into dbDb and defaultDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("ce3", "March 2005", "/gbdb/ce3", "C. elegans", \
        "chrII:14647078-14667002", 1, 59, "C. elegans", \
        "Caenorhabditis elegans", "/gbdb/ce3/html/description.html", 0, 0, \
        "WormBase v. WS140");' -h localhost hgcentraltest
    #	do this defaultDb entry later after there is something to see
    #	on this browser.
    hgsql -e 'INSERT INTO defaultDb (name, genome) VALUES("ce3", "Mouse")' \
        -h localhost hgcentraltest
    # start a new entry in the trackDb hierarchy
    ssh hgwdev
    # start a new entry in the trackDb hierarchy
    cd $HOME/kent/src/hg/makeDb/trackDb/worm
    mkdir ce3
    cvs add ce3
    cd ce3
    cp ../ce3/description.html .
    vi description.html - fixup text for this assembly
    cvs add description.html
    cvs commit
    cd ../..
    vi makefile - add ce3 to the list
    mkdir /cluster/data/ce3/html
    mkdir /gbdb/ce3
    ln -s /cluster/data/ce3/html /gbdb/ce3/html
    ln -s /cluster/data/ce3/ce3.2bit /gbdb/ce3/ce3.2bit
    cp -p worm/ce3/description.html /gbdb/ce3/html
    make DBS=ce3 ZOO_DBS=""

    cd /cluster/data/ce3/RMRun
    hgLoadOut ce3 chr*.fa.out
# bad rep range [1023, 233] line 2579 of chrII.fa.out 
# bad rep range [2914, 2908] line 2791 of chrIII.fa.out 
#	This result produces: 13238621 bases, 101,554 items, %13.200098

#######################################################################
#  an experiment to see if more repeat masked elements can be found
#	do a second repeat masker run, with a different split to help
#	cover some of the edges that the previous split may have made
#	that cause RM to not find items next to those edges.
#
#	The result of this experiment is that splitting at the 100,000
#	boundary hardly made much of a difference compared to a
#	completely unsplit result.  So, we will go with the traditional
#	split at 100,000 and leave it at that.
#######################################################################

###########################################################################
# PREPARE Split contigs into 103,000 bp chunks for a second RM run
#	This isn't perfect, there are a couple of breaks that are
#	exactly the same as before (seven) but all others are at least
#	1000 different
# (DONE, 2004-04-29, Hiram)
    # next machine
    ssh eieio
    cd /cluster/data/ce3
    rm -fr ./split103
    mkdir split103
for C in I II III IV V X M
do
    mkdir split103/${C}
    faSplit size sangerFa/chr${C}.fa 103000 split103/${C}/c -lift=split103/chr${C}.lft
done
147 pieces of 147 written
149 pieces of 149 written
134 pieces of 134 written
170 pieces of 170 written
204 pieces of 204 written
173 pieces of 173 written
1 pieces of 1 written

   cat split103/*.lft > liftAll103.lft
   # copy them to /iscratch/i for cluster rsync
   # next machine
   ssh kkr1u00
   cd /cluster/data/ce3/split103
for C in I II III IV V X M
do
    echo -n "${C} "
    mkdir -p /iscratch/i/worms/Celegans3/split103/${C}
    cp -p ${C}/c*.fa /iscratch/i/worms/Celegans3/split103/${C}
done
    #	this iSync takes forever these days because iscratch is so large
    /cluster/bin/iSync
    #	instead, a simple rsync of just this business will get it done
    #	rapidly
for i in 2 3 4 5 6 7 8
do
    rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \
	kkr${i}u00:/iscratch/i/worms/
done

for C in I II III IV V X M
do
    mkdir /cluster/data/ce3/RMFull/${C}
    for T in /iscratch/i/worms/Celegans3/unmaskedFa/chr${C}.fa
    do
	D=`dirname $T`
	F=`basename $T`
	echo /cluster/data/ce3/jkStuff/RMWorm ${C} ${D} ${F} \
'{'check out line+ /cluster/data/ce3/RMFull/$C/${F}.out'}'
    done >> RMFull/jobList
done
    # Do the run
############################################################################
# Second RepeatMasker on different split size
    # next machine
    ssh kk
    cd /cluster/data/ce3
    # create job list
    mkdir RMRun103
    rm -f RMRun103/jobList
for C in I II III IV V X M
do
    mkdir /cluster/data/ce3/RMRun103/${C}
    for T in /iscratch/i/worms/Celegans3/split103/$C/c*.fa
    do
	D=`dirname $T`
	F=`basename $T`
	echo /cluster/data/ce3/jkStuff/RMWorm ${C} ${D} ${F} \
'{'check out line+ /cluster/data/ce3/RMRun103/$C/${F}.out'}'
    done >> RMRun103/jobList
done
    # Do the run
    cd /cluster/data/ce3/RMRun103
    para create jobList
    para try, para check, para check, para push, para check, ...
    #  STARTED 2005-04-29 15:43
# Completed: 978 of 978 jobs
# CPU time in finished jobs:    1129788s   18829.81m   313.83h   13.08d  0.036 y
# IO & Wait Time:                  5530s      92.16m     1.54h    0.06d  0.000 y
# Average job time:                1161s      19.35m     0.32h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1313s      21.88m     0.36h    0.02d
# Submission to last job:          2737s      45.62m     0.76h    0.03d

    # when they are finished, liftUp and load the .out files into the database:
    # next machine
    ssh eieio
    cd /cluster/data/ce3/RMRun103
for C in I II III IV V X M
do
 liftUp chr${C}.fa.out /cluster/data/ce3/split103/chr${C}.lft warn ${C}/*.fa.out
done

    # next machine
    ssh hgwdev
    cd /cluster/data/ce3/RMRun103
    hgLoadOut ce3 chr*.fa.out
bad rep range [1023, 233] line 2595 of chrII.fa.out 
bad rep range [2914, 2908] line 2797 of chrIII.fa.out
#	This result produces: 13236863 bases, 101,529 items, %13.198345

#######################################################################
#	The same RM run was done a third time on whole un-split chrom
#	files.  This run took 35 hours on the Iservers kluster.
#	Loading these full chrom results:
#	Not split:              13,238,684 bases, 101,565 items, %13.200161
#	Split at 100,000 bases: 13,238,621 bases, 101,554 items, %13.200098
#	Split at 103,000 bases: 13,236,863 bases, 101,529 items, %13.198345
#	Even the full chroms had problems with coordinates:
# bad rep range [1023, 233] line 2586 of chrII.fa.out 
# bad rep range [2914, 2908] line 2792 of chrIII.fa.out 
#	Combining the two split results:
#	                        13,283,266 bases, 103,117 items, %13.244613
#######################################################################
#	Most interestingly, the hgLoadOut would *not* load the files
#	unless they had been lifted.  It said they were not valid .out
#	files !  So, a fake split at 30,000,000 to create some lft
#	files:  (30,000,000 is too large to cause anything to split)
for C in I II III IV V X M
do
    mkdir splitFull/${C}
    faSplit size sangerFa/chr${C}.fa 30000000 splitFull/${C}/c -lift=splitFull/chr${C}.lft
done
#
#######################################################################
# SIMPLE REPEAT [TRF] TRACK  (DONE - 2005-05-02 - Hiram)
    ssh kki
    mkdir -p /cluster/data/ce3/bed/simpleRepeat
    cd /cluster/data/ce3/bed/simpleRepeat
    mkdir trf
    ls -1S /iscratch/i/worms/Celegans3/unmaskedFa/chr*.fa > genome.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)}  /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'
    #	happy emacs
                                                                                
    gensub2 genome.lst single gsub jobList
    para create jobList
    para try    # there are only 7, so this runs them all
# Completed: 7 of 7 jobs
# CPU time in finished jobs:       1751s      29.18m     0.49h    0.02d  0.000 y
# IO & Wait Time:                    28s       0.47m     0.01h    0.00d  0.000 y
# Average job time:                 254s       4.24m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             545s       9.08m     0.15h    0.01d
# Submission to last job:           545s       9.08m     0.15h    0.01d

    #  When cluster run is done, combine into one:
    cat trf/*.bed > simpleRepeat.bed

    # Load into the database:
    # next machine
    ssh hgwdev
    cd /cluster/data/ce3/bed/simpleRepeat
    hgLoadBed ce3 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    #	Loaded 28637 elements of size 16

#######################################################################
# PROCESS SIMPLE REPEATS AND RMSK INTO MASK (DONE,  2005-05-02 - Hiram)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd /cluster/data/ce3/bed/simpleRepeat
    mkdir -p trfMask
for F in trf/*.bed
do
    T=${F#trf/}
    echo "${F} > trfMask/${T}"
    awk '{if ($5 <= 12) print;}' ${F} > trfMask/${T}
done

#  create Soft and Hard masks from RepeatMaster and TRF outputs:
#  and rebuild the 2bit file using the soft masking in the fa.
#	Might need the nibs for something, so make those too.
    # next machine
    ssh eieio
    cd /cluster/data/ce3
    mkdir softMask
    mkdir nib
    cd /cluster/data/ce3
for C in I II III IV V X M
do
    echo -n "masking chr${C} "
    maskOutFa sangerFa/chr${C}.fa RMRun/chr${C}.fa.out \
                softMask/chr${C}.fa -soft
    maskOutFa softMask/chr${C}.fa \
                bed/simpleRepeat/trfMask/chr${C}.bed \
                softMask/chr${C}.fa -softAdd
    faToNib -softMask softMask/chr${C}.fa nib/chr${C}.nib
done
# masking chrI Writing 15080552 bases in 7540284 bytes
# masking chrII Writing 15279311 bases in 7639664 bytes
# masking chrIII Writing 13783317 bases in 6891667 bytes
# masking chrIV Writing 17493785 bases in 8746901 bytes
# masking chrV Writing 20922231 bases in 10461124 bytes
# masking chrX Writing 17718850 bases in 8859433 bytes
# masking chrM Writing 13794 bases in 6905 bytes

    #	re-create the 2bit file
    rm -f ce3.2bit
    faToTwoBit softMask/chr*.fa ce3.2bit

    # create hard masks 
    mkdir hardMask
for C in I II III IV V X M
do
    echo "masking chr${C}"
    /cluster/bin/i386/maskOutFa softMask/chr${C}.fa hard \
                hardMask/chr${C}.fa
done

    #	copy to iscratch for cluster runs
    ssh kkr1u00
    cd /cluster/data/ce3/softMask
    mkdir -p /iscratch/i/worms/Celegans3/bothMasksFa
    mkdir -p /iscratch/i/worms/Celegans3/nib
    cp -p *.fa /iscratch/i/worms/Celegans3/bothMasksFa
    cd /cluster/data/ce3/nib
    cp -p c*.nib /iscratch/i/worms/Celegans3/nib
    cd /iscratch/i/worms/Celegans3
    cp -p /cluster/data/ce3/ce3.2bit .
    #	no longer need these items
    rm -fr unmaskedSplit split103 unmaskedFa
for i in 2 3 4 5 6 7 8
do
    rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \
	kkr${i}u00:/iscratch/i/worms/
done
#######################################################################
#  GOLD AND GAP TABLES (DONE - 2005-05-02 - Hiram)
    ssh hgwdev
    cd /cluster/data/ce3
    awk '{print $1}' chrom.sizes | sed -e "s/chr//" > chrom.lst
    grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst
    grep -v random chrom.sizes | cut -f1 > chrom.lst
    hgGoldGapGl -noGl -chromLst=chrom.lst ce3 /cluster/data/ce3 sangerFa
    #
    #	There are no gaps in this assembly, the gap tables are all empty
    #

#######################################################################
# GC5BASE (DONE - 2004-12-08 - Hiram)
    ssh eieio
    mkdir /cluster/data/ce3/bed/gc5Base
    cd /cluster/data/ce3/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 ce3 \
        /cluster/data/ce3 | wigEncode stdin gc5Base.wig gc5Base.wib

    ssh hgwdev
    cd /cluster/data/ce3/bed/gc5Base
    mkdir /gbdb/ce3/wib
    ln -s `pwd`/gc5Base.wib /gbdb/ce3/wib
    hgLoadWiggle ce3 gc5Base gc5Base.wig

###########################################################################
# CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear)
# FETCH acedb database for construction of Sanger Links table
#	DONE - 2006-01-06 - 2006-01-09 - Hiram
    ssh kkstore02
    mkdir /cluster/data/ce3/bed/acedb
    cd /cluster/data/ce3/bed/acedb
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=4 --no-parent \
	--no-host-directories --cut-dirs=4 \
	ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS140/
    #	Then, to install this acedb database:
    chmod +x INSTALL
    ./INSTALL
    #	It asks a question about disk space, you should have at least
    #	15 Mb free, answer the question with: yes
    #	if that is true
    #	Start up the AceDb monitor

    mkdir /cluster/data/ce3/bed/sangerLinks
    cd /cluster/data/ce3/bed/sangerLinks
    #	fetch from our version of uniProt, name relationships
    #	acc == uniProt ID
    #	extAcc1 == WormBase Gene number
    #	extAcc2 == orfName or Gene Class name
    #   extAcc3 == haven't seen anything in this column yet
    hgsql -N -e \
"select acc,extAcc1,extAcc2,extAcc3 from extDbRef where extDb=19;" uniProt \
        > uniProt.Celegans.txt
    tar xvzf ../acedb/wormpep140.tar.gz wormpep140/wormpep140
    zcat ../acedb/geneIDs.WS140.gz > geneIDs.WS140.txt

    /cluster/bin/acedb/tace /cluster/data/ce3/bed/acedb
    # At prompt, can type ? for help, type the following queries
#    wbGeneClass.txt - associates 3 letter first part of worm gene
#             name with a brief description.  
    acedb> AQL -o wbGeneClass.txt select l,l->Description  from l in class Gene_Class
    #	it says:
    #	// 1471 Active Objects

#    wbLongDescr.txt - associates worm gene name with a several
#             sentence description. 
    acedb> AQL -o wbLongDescr.txt select l,l->Gene_info[Provisional_description] from l in class Gene
    #	it says:
    #	// 5296 Active Objects
    #	ctrl-D to exit the acedb monitor

    #	remove header lines and "" from those two files:
    grep -v "^Format" wbGeneClass.txt | \
	sed -e 's/Gene_class://; s/Text://; s/"//g;' \
	    > wbGeneClass.new
    grep -v "^Format" wbLongDescr.txt | \
	sed -e 's/Gene://; s/Text://; s/"//g;' \
	    > wbLongDescr.new
    mv wbGeneClass.new wbGeneClass.txt
    mv wbLongDescr.new wbLongDescr.txt

    #	Check how we compare to previous build, there will probably be a
    #	few more with each new build:  (I don't know why the bigger one
    #	is so much larger this time)
    wc wbGeneClass.txt wbLongDescr.txt
    #	 1515    6589   47870 wbGeneClass.txt
    #	50417  421524 3220249 wbLongDescr.txt
    wc /cluster/data/ce2/bed/steinHelp/wb*.txt
    #	 1393    5982   43094 /cluster/data/ce2/bed/steinHelp/wbGeneClass.txt
    #	18400  197407 1378516 /cluster/data/ce2/bed/steinHelp/wbConcise.txt

    #	There is a copy of this script in the source tree:
    #	/kent/src/hg/near/hgWormLinks/
    #	There is also an hgWormLinks program there, with slightly
    #	different inputs and outputs.
    ./mkSangerLinks.pl wormpep140/wormpep.table140 geneIDs.WS140.txt \
	wbLongDescr.txt wbGeneClass.txt uniProt.Celegans.txt \
	    > sangerLinks.txt 2> dbg.out
    #	should be relatively few names without descriptions:
    grep "no descr" dbg.out | wc
    #	     1214    8498   67382

    ssh hgwdev
    cd /cluster/data/ce3/bed/sangerLinks
    hgsql ce3 < $HOME/kent/src/hg/lib/sangerLinks.sql
    hgsql ce3 -e \
	'load data local infile "sangerLinks.txt" into table sangerLinks;'
    #	the index for protName doesn't seem to get populated
    hgsql -e "analyze table sangerLinks;" ce3
    #	should now see numbers (happen to be identical) for each of the
    #	indices:
    hgsql -e "show index from sangerLinks;" ce3


    hgsql ce3 < /cluster/home/hiram/kent/src/hg/lib/orfToGene.sql
    hgsql ce3 -e 'load data local infile "orfToGene.tab" into table orfToGene;'


#######################################################################
# MAKE SANGER GENE (WORM BASE GENES) TRACK (WORKING, 2005-05-02, Hiram)
   ssh eieio
   mkdir -p /cluster/data/ce3/bed/sangerGene
   cd /cluster/data/ce3/bed/sangerGene
    # the perl trims these files down to a reasonable size.  As they are
    #   they cause ldHgGene to hangup due to memory size.
    # The sed removes CDS before id e.g. from CDS "Y74C9A.2" and the
    # extra quote ", and finally the comments at the ends of the lines
    # which start with `blank semi-colon blank'
for F in I II III IV V X M
do
    echo -n "${F} "
    zcat ../../WS140/chr${F}.gff.gz | \
    sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
	-e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
	-e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
	-e "s/CHROMOSOME_M/chrM/g" \
	-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
	    perl -ne '@a=split; print if($a[1] =~ m/curated|DNA|RNA/i &&
$a[2] =~ m/intron|exon|cds|sequence|transcri/i);' | \
	sed -e "s/CDS \"//; s/\"//; s/ ; .*//;" > chr${F}.gff
done
    #  check file sizes, should be reasonable
    ls -ogrt
# -rw-rw-r--    1  3524403 May  2 12:47 chrI.gff
# -rw-rw-r--    1  3732465 May  2 12:48 chrII.gff
# -rw-rw-r--    1  3295374 May  2 12:49 chrIII.gff
# -rw-rw-r--    1  3635497 May  2 12:51 chrIV.gff
# -rw-rw-r--    1  4754926 May  2 12:53 chrV.gff
# -rw-rw-r--    1  3896221 May  2 12:55 chrX.gff
# -rw-rw-r--    1     3881 May  2 12:55 chrM.gff

    # now load database with those transformed gff files
    # next machine
    ssh hgwdev
    cd /cluster/data/ce3/bed/sangerGene
    # 2004-05-10, hartera, Reload sangerGene table using extended GenePred
    # format. 2004-05-11, hartera. Extended format frame information does not
    # look correct. Reload without the extended fields.
    ldHgGene ce3 sangerGene *.gff
# Read 23277 transcripts in 431929 lines in 7 files
#  23277 groups 7 seqs 8 sources 10 feature types
#23277 gene predictions
XXXXX Stopped here 2005-05-02 14:30
    
    # Add proteinID field to sangerGene table, used by Gene Sorter
    ssh hgwdev
    cd /cluster/data/ce3/bed/sangerGene
    hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce3
    # To add index on this column
    hgsql -e 'alter table sangerGene add index(proteinID);' ce3
    # Add Swiss-Prot protein IDs to this column
    # There are 23277 entries in the sangerGene table:
    hgsql -N -e "select count(*) from sangerGene;" ce3
    #	23277
    #	And 22086 names in common between sangerGene and sangerLinks:
    hgsql -N -e 'select count(*) from sangerGene as g,sangerLinks as l \
           where g.name = l.orfName;' ce3
    # 22086
    # get names from sangerGene and sangerLinks tables
    hgsql -N -e "select name from sangerGene;" ce3 | sort > name.sangerGene
    hgsql -N -e "select orfName from sangerLinks;" ce3 | sort > orfName.sangerLinks
    # get list of names in sangerGene not in sangerLinks
    comm -23 name.sangerGene orfName.sangerLinks > geneNames.notin.sangerLinks

    # split this file into 5 pieces, 400 lines each:
    split -l 300 geneNames.notin.sangerLinks
    #	take each of those lists and paste those identifiers into
    #	the wormbase batch_genes query,
    #	http://ws140.wormbase.org/db/searches/batch_genes
    #	requesting the Swiss Prot ID
    #	You can cut and paste by viewing the 400 line file
    #	in your WEB browser as a text document.
    #	only a couple of them are identified as having SP IDs:
    #	cut and paste the results from the "Plain Text" output format
F15E11.13       Q9N565       SWALL:Q9N565_CAEEL

Y19D10A.13       SWALL:Q9UAU0_CAEEL
Y19D10A.15       SWALL:Q9UAT7_CAEEL
Y19D10A.9       SWALL:O44596_CAEEL
Y45G12C.7       SWALL:Q95X13_CAEEL              
Y45G12C.9       SWALL:Q9N4Y6_CAEEL
Y49C4A.2                     
;       SWALL:Q8MYQ4_CAEEL               F25B5.4
Y49C4A.3                     
Y49C4A.3                     
;       SWALL:Q8MYQ4_CAEEL 
Y6B3B.12       Q5ZEQ2
ZK131.10       P09588

# CREATE SANGERPEP TABLE (DONE, 2005-08-15, hiram)
#	Must be done after sangerGene table load to have time stamps
#	correct for joinerCheck
    mkdir -p /cluster/data/ce3/bed/sangerPep
    cd /cluster/data/ce3/bed/sangerPep
    # Download peptide sequences from the Sanger Centre ftp site:
    wget -o ce3.fetch.log -r -l1 --no-directories --timestamping \
        ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep140/wormpep140
    # Load into database
    hgPepPred ce3 generic sangerPep wormpep140
    # the sangerPep table is used by the sangerGene track
    gzip wormpep140
    ssh kkr1u00
    cd /iscratch/i/worms/ce3
    zcat /cluster/data/ce3/bed/sangerPep/wormpep140.gz > wormpep140
    mkdir splitPep
    faSplit sequence wormpep140 1000 splitPep/wp
    #	rsync
    for i in 2 3 4 5 6 7 8
do
    rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \
	kkr${i}u00:/iscratch/i/worms/
done


# CREATE ORFTOGENE TABLE (Used by hgGene and hgNear) (NOT DONE)
   mkdir /cluster/data/ce3/bed/orfToGene
   cd !$
   # gene_names.txt for WS120 was created and provided by todd.harris@cshl.edu
   # ORF e.g. Y110A7A.10 gene e.g. aap-1
   awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt
   # reformat this for use in creating Sanger Links with hgWormLinks
   awk 'NF == 2' orfToGene.txt > orfToGene.txt2
   # use perl script to move ORFs with the same gene name onto separate lines
   # and remove lines where there is a gene name with an alternate name
   # in parentheses. this output file is used for Sanger Links
   /cluster/data/ce3/jkStuff/formatorfToGene.pl < orfToGene.txt2 > orfToGene.tab2   
   hgCeOrfToGene ce3 gene_names.txt sangerGene orfToGene

#########################################################################
# BLASTZ HUMAN Cb1 (WORKING - 2005-05-02 Hiram)
#	going to try and do this with ce3 chroms vs the cb1 contigs, and
#	then lift to chrUn
    ssh eieio
    mkdir /cluster/data/ce3/bed/blastzCb1.2005_05_02
    cd /cluster/data/ce3/bed
    ln -s blastzCb1.2005_05_02 blastz.cb1
    cd blastzCb1.2005_05_02

    cat << '_EOF_' > DEF
# briggsae vs. elegans
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: elegans Ce3
SEQ1_DIR=/iscratch/i/worms/Celegans3/ce3.2bit
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: briggsae cb1
SEQ2_DIR=/iscratch/i/worms/Cbriggsae/cb1.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/ce3/bed/blastzCb1.2005_05_02

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/ce3/chrom.sizes ./S1.len
    sort -rn +1 /cluster/data/cb1/chrom.sizes > S2.len
    #	establish a screen to control this job
    screen
    cd /cluster/data/ce3/bed/blastzCb1.2005_05_02
    time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \
	blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh eieio
    screen -d -r
    #	STARTED - 2005-03-17 21:25

    #	swap results to place ce3 alignments onto cb1
    ssh eieio
    cd /cluster/data/ce3/bed/blastzCb1.2005_05_02
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
	swap.run.out 2>&1 &

# BLAT SERVER SETUP (DONE - 2005-06-02 - Hiram)

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES("ce3", "blat15", 17784, 1, 0);' \
	-h localhost hgcentraltest
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES("ce3", "blat15", 17785, 0, 1);' \
	-h localhost hgcentraltest

##########################################################################
#	Convenient rsync for kkr1u00 to copy the worm stuff quickly
    ssh kkr1u00
    for i in 2 3 4 5 6 7 8
do
    rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \
	kkr${i}u00:/iscratch/i/worms/
done


##########################################################################
# BLASTZ C. elegans ce3 (DONE - 2005-08-10 - Hiram)
    ssh kk
    mkdir /cluster/data/ce3/bed/blastzCb2.2005_08_10
    cd /cluster/data/ce3/bed/blastzCb2.2005_08_10
    cat << '_EOF_' > DEF
# Ce3 vs Cb2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Cb3
SEQ1_DIR=/iscratch/i/worms/ce3/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cb2 - full chroms only, no randoms
SEQ2_DIR=/iscratch/i/worms/cb2/chromNib
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/ce3/bed/blastzCb2.2005_08_10

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp -p /cluster/data/ce3/chrom.sizes ./S1.len
    nibSize /iscratch/i/worms/cb2/chromNib/*.nib | \
	awk '{printf "%s\t%s\n", $2, $3}' | sort -rn +1 > S2.len

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -stop chainMerge \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	real    85m16.486s
    #	user    0m1.070s
    #	sys     0m0.200s

    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kk
    screen -d -r
    #	STARTED - 2005-08-10 16:22
    #	FINISHED - 2005-08-10 17:45
#	blastz kluster run
# Completed: 182 of 182 jobs
# CPU time in finished jobs:     109157s    1819.29m    30.32h    1.26d  0.003 y
# IO & Wait Time:                  1038s      17.30m     0.29h    0.01d  0.000 y
# Average job time:                 605s      10.09m     0.17h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            4811s      80.18m     1.34h    0.06d
# Submission to last job:          4813s      80.22m     1.34h    0.06d

#	doCatRun
# Completed: 14 of 14 jobs
# CPU time in finished jobs:         18s       0.29m     0.00h    0.00d  0.000 y
# IO & Wait Time:                    54s       0.91m     0.02h    0.00d  0.000 y
# Average job time:                   5s       0.09m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              10s       0.17m     0.00h    0.00d
# Submission to last job:            10s       0.17m     0.00h    0.00d

#	doChainRun
# Completed: 7 of 7 jobs
# CPU time in finished jobs:        125s       2.09m     0.03h    0.00d  0.000 y
# IO & Wait Time:                    41s       0.68m     0.01h    0.00d  0.000 y
# Average job time:                  24s       0.40m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              42s       0.70m     0.01h    0.00d
# Submission to last job:            42s       0.70m     0.01h    0.00d

    #	And then, a run with the randomContigs
    
    mkdir /cluster/data/ce3/bed/blastzCb2.2005_08_10/randomContigs
    cd /cluster/data/ce3/bed/blastzCb2.2005_08_10/randomContigs
    cat << '_EOF_' > DEF
# Ce3 vs Cb2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Cb3
SEQ1_DIR=/iscratch/i/worms/ce3/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cb2 - full chroms only, no randoms
SEQ2_DIR=/iscratch/i/worms/cb2/randomContigs.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/ce3/bed/blastzCb2.2005_08_10/randomContigs

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp -p /cluster/data/ce3/chrom.sizes ./S1.len
    twoBitInfo /iscratch/i/worms/cb2/randomContigs.2bit stdout | \
	sort -rn +1 > S2.len

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -stop chainMerge \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	real    104m9.081s
    #	user    0m1.180s
    #	sys     0m0.150s

    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kk
    screen -d -r
#	blastz kluster run
# Completed: 28 of 28 jobs
# CPU time in finished jobs:      36050s     600.84m    10.01h    0.42d  0.001 y
# IO & Wait Time:                   259s       4.31m     0.07h    0.00d  0.000 y
# Average job time:                1297s      21.61m     0.36h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6144s     102.40m     1.71h    0.07d
# Submission to last job:          6144s     102.40m     1.71h    0.07d

#	doCatRun
# Completed: 14 of 14 jobs
# CPU time in finished jobs:          5s       0.08m     0.00h    0.00d  0.000 y
# IO & Wait Time:                    43s       0.72m     0.01h    0.00d  0.000 y
# Average job time:                   3s       0.06m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:               5s       0.08m     0.00h    0.00d
# Submission to last job:             7s       0.12m     0.00h    0.00d

#	doChainRun
# Completed: 7 of 7 jobs
# CPU time in finished jobs:         29s       0.48m     0.01h    0.00d  0.000 y
# IO & Wait Time:                    25s       0.42m     0.01h    0.00d  0.000 y
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              16s       0.27m     0.00h    0.00d
# Submission to last job:            16s       0.27m     0.00h    0.00d

    #	Now, lift those results to their chr*_random coordinates:

    ssh kkstore02
    cd /cluster/data/ce3/bed/blastzCb2.2005_08_10/randomContigs/axtChain
    mkdir liftedChain
    for F in chain/*.chain
    do
	C=${F#chain/}
	C=${C/.chain/}
	echo "$F -> liftedChain/$C"
	liftUp -chainQ liftedChain/${C}.lifted.chain \
	    /cluster/bluearc/scratch/hg/cb2/liftRandoms.lft warn $F
    done
    # now merge these lifted chain files with the existing chain files for 
    # the chroms and then sort by score using chainSort 
    cd /cluster/data/ce3/bed/blastzCb2.2005_08_10/axtChain
    # gzipped file is only chains for chromsI-X and chrM so rename
    mv ce3.cb2.all.chain.gz ce3.cb2.chroms.chain.gz
    mv chain chromChain
    mkdir chain chainUnSorted
    # get all chains to be merged in chainUnSorted dir
    cp -p ./chromChain/*.chain ./chainUnSorted/
    # copy contig chains, these are *.lifted.chain so they do not 
    # write over the chrom chains.
    cp -p ../randomContigs/axtChain/liftedChain/*.chain ./chainUnSorted/
    # then merge and sort all these chains. they must be merged and all 
    # sorted together so that all IDs are unique across all chroms.
    # IDs are reassigned by chainMergeSort so that IDs are unique.
    chainMergeSort chainUnSorted/*.chain | gzip > ce3.cb2.all.chain.gz
     # use chainSplit to split this into chains again
    chainSplit chain ce3.cb2.all.chain.gz

    # then pick up the doBlastzChainNet.pl script with the net step
    ssh kk
    cd /cluster/data/ce3/bed/blastzCb2.2005_08_10
    cp -p DEF DEF.chroms
    # edit DEF so SEQ2_DIR=/iscratch/i/worms/cb2/allNib
    #	as need all nib files now
    #	And complete S2.len
    nibSize /iscratch/i/worms/cb2/nib/*.nib | \
	awk '{printf "%s\t%s\n", $2, $3}' | sort -rn +1 > S2.len
    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -continue net \
	`pwd`/DEF > net.run.out 2>&1 &
    #	real    5m32.249s
    #	user    0m0.510s
    #	sys     0m0.090s

    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kk
    screen -d -r

    #	swap results to place ce3 alignments onto cb2
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
	swap.run.out 2>&1 &
    #	real    7m27.028s
    #	user    0m1.770s
    #	sys     0m0.190s

##########################################################################
# BLASTZ C. remanei caeRem1 (DONE - 2005-08-18 - Hiram)
    ssh pk
    mkdir /cluster/data/ce3/bed/blastzCaeRem1.2005_08_18
    cd /cluster/data/ce3/bed/blastzCaeRem1.2005_08_18
    cat << '_EOF_' > DEF
# Ce3 vs Cb2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Cb3
SEQ1_DIR=/san/sanvol1/scratch/worms/ce3/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: CaeRem1
SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/ce3/bed/blastzCaeRem1.2005_08_18

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp -p /cluster/data/ce3/chrom.sizes ./S1.len
    twoBitInfo /san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit stdout | \
	sort -rn +1  > S2.len

    #	establish a screen to control this job
    #	altered local copy of doBlastzChainNet.pl to fix santest location
    #	and run everything on pk since the san location is unique here
    #	and these jobs are not going to be very large anyway
    screen
    time ./doBlastzChainNet.pl \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-workhorse=pk \
	-fileServer=kkstore02 \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	real    33m25.177s
    #	user    0m0.561s
    #	sys     0m0.222s

    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kk
    screen -d -r

    #	swap results to place ce3 alignments onto caeRem1
    cd /cluster/data/ce3/bed/blastzCaeRem1.2005_08_18
    time ./doBlastzChainNet.pl -swap \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-workhorse=pk \
	-fileServer=kkstore02 \
	`pwd`/DEF > swap.run.out 2>&1 &
    #	real    17m13.957s
    #	user    0m0.395s
    #	sys     0m0.148s

########################################################################
# Aceview (Acembly) Gene set:  (DONE - 2005-09-07 - Hiram)
    # Data is obtained from 
    # Danielle et Jean Thierry-Mieg     mieg@ncbi.nlm.nih.gov
    ssh hgwdev
    mkdir /cluster/data/ce3/bed/aceview
    cd /cluster/data/ce3/bed/aceview
    wget --timestamping \
'ftp://anonymous:ucsc@ftp.ncbi.nlm.nih.gov/repository/acedb/wormgenes/aceview.wormgenes.2005_09_06.gff.gz'
    #	Verify no reversed blocks
    zcat aceview.wormgenes.2005_09_06.gff.gz \
    | awk -F"\t" '{if ($4 > $5) {printf "reverse blocks problem at:\n$0" } }'

    zcat aceview.wormgenes.2005_09_06.gff.gz | \
	sed -e "s/CHROMOSOME_/chr/; s/MtDNA/M/" \
	| ldHgGene -gtf ce3 acembly stdin
    #	Read 22456 transcripts in 412173 lines in 1 files
    #	22456 groups 7 seqs 1 sources 5 feature types
    #	Check chrom names:
    checkTableCoords -table=acembly ce3
    hgsql -N -e "select chrom from acembly;" ce3 | sort | uniq -c
    #	3539 chrI
    #	3836 chrII
    #	3491 chrIII
    #	3734 chrIV
    #	   7 chrM
    #	4629 chrV
    #	3220 chrX
    featureBits ce3 acembly
    #	25509966 bases of 100291840 (25.436%) in intersection

#############################################################################
# BLASTZ Cb2 (WORKING - 2005-09-09 - Hiram)
    ssh kk
    mkdir /cluster/data/ce3/bed/blastzCb2.2005-09-09
    cd /cluster/data/ce3/bed
    ln -s blastzCb2.2005-09-09 blastz.cb2
    cd blastzCb2.2005-09-09

    #	Utilizing tiny target chunks and a single query chunk to use the
    #	dynamic masking available via the BLASTZ_M parameter
    cat << '_EOF_' > DEF
# ce3 vs cb2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: elegans Ce3
SEQ1_DIR=/san/sanvol1/scratch/worms/ce3/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=1000000
SEQ1_LAP=100

# QUERY: briggsae Cb2
SEQ2_DIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/ce3/bed/blastzCb2.2005-09-09

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # happy emacs

    nibSize /san/sanvol1/scratch/worms/ce3/nib/*.nib | \
	awk '{printf "%s\t%s\n", $2, $3}' | sort -rn +1 > S1.len
    twoBitInfo /san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit S2.len
    twoBitInfo /san/sanvol1/scratch/worms/cb2/cb2.2bit stdout \
	| grep random >> S2.len
    twoBitInfo /san/sanvol1/scratch/worms/cb2/cb2.2bit stdout \
	| grep chrUn >> S2.len

XXXX - working on modifications to doBlastzChainNet.pl 2005-09-06
XXXX - to do this random contigs business properly

    #	establish a screen to control this job
    screen
    time ./doBlastzChainNet.pl -stop chainMerge \
	-bigClusterHub=kk \
	`pwd`/DEF > toChainMerge.run.out 2>&1 &
    #	STARTED - 2005-09-09 - 16:18
    #	with lots of kluster contention
# Completed: 104 of 104 jobs
# CPU time in finished jobs:     125426s    2090.44m    34.84h    1.45d  0.004 y
# IO & Wait Time:                  1821s      30.35m     0.51h    0.02d  0.000 y
# Average job time:                1224s      20.39m     0.34h    0.01d
# Longest finished job:            2849s      47.48m     0.79h    0.03d
# Submission to last job:         18148s     302.47m     5.04h    0.21d
    time ./doBlastzChainNet.pl -continue cat -stop chainRun \
	-bigClusterHub=kk \
	`pwd`/DEF > toChainRun.run.out 2>&1 &
# Completed: 104 of 104 jobs
# CPU time in finished jobs:         31s       0.52m     0.01h    0.00d  0.000 y
# IO & Wait Time:                   351s       5.84m     0.10h    0.00d  0.000 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest finished job:              12s       0.20m     0.00h    0.00d
# Submission to last job:           211s       3.52m     0.06h    0.00d
    time ./doBlastzChainNet.pl -continue chainRun -stop chainMerge \
	-bigClusterHub=kk \
	`pwd`/DEF > toChainMerge.run.out 2>&1 &
#Completed: 7 of 7 jobs
#CPU time in finished jobs:        347s       5.78m     0.10h    0.00d  0.000 y
#IO & Wait Time:                    65s       1.08m     0.02h    0.00d  0.000 y
#Average job time:                  59s       0.98m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             120s       2.00m     0.03h    0.00d
#Submission to last job:           696s      11.60m     0.19h    0.01d
    time ./doBlastzChainNet.pl -continue chainMerge -stop net \
	-bigClusterHub=kk \
	`pwd`/DEF > toNet.run.out 2>&1 &
    time ./doBlastzChainNet.pl -continue load -stop load \
	-bigClusterHub=kk \
	`pwd`/DEF > loadStep.run.out 2>&1 &
    time ./doBlastzChainNet.pl -continue download -stop cleanup \
	-bigClusterHub=kk \
	`pwd`/DEF > thruCleanup.run.out 2>&1 &
    #	swap results to place ce3 alignments onto cb2
    time ./doBlastzChainNet.pl -swap \
	-bigClusterHub=kk \
	`pwd`/DEF > \
	swap.run.out 2>&1 &


    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:

#######################################################################
#  3-WAY Multiz (DONE - 2005-12-13 - Hiram)
    mkdir /cluster/data/ce3/bed/multiz3way
    cd /cluster/data/ce3/bed/multiz3way

    cat << '_EOF_' > 3way.nh
((ce3:0.2,caeRem1:0.2):0.3,cb2:0.3)
'_EOF_'

    /cluster/bin/phast/draw_tree 3way.nh > 3way.ps
    convert 3way.ps 3way.jpg

#                         featureBits chainLink measures
#                                           chainMm7Link   chain     linearGap
#    distance                       on Ce3      on other   minScore
#  1  0.3 - remanei caeRem1       (% 46.128)  (% 37.675)   1000      loose
#  2  0.2 - briggsae cb2          (% 42.047)  (% 39.062)   1000      loose
 

    mkdir mafLinks
    mkdir mafLinks/caeRem1
    mkdir mafLinks/ce3
    cd mafLinks/caeRem1
    ln -s ../../../blastzCaeRem1.2005_08_18/mafNet/*.maf.gz .
    cd ../ce3
    ln -s ../../../blastzCb2.2005-09-09/mafNet/*.maf.gz .

    ssh pk
    mkdir -p /san/sanvol1/scratch/ce3/multiz3way
    cd /san/sanvol1/scratch/ce3/multiz3way
    rsync -a --progress --copy-links \
	/cluster/data/ce3/bed/multiz3way/mafLinks/ .
    
    mkdir penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn

    cd /cluster/data/ce3/bed/multiz3way
    mkdir -p maf
    mkdir -p run
    cd run


    # create scripts to run var_multiz on cluster

cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
    set c = $1
    set db = ce3
    set multi = /scratch/tmp/$db/multiz3way.$c
    set pairs = /san/sanvol1/scratch/$db/multiz3way
    set penn = $pairs/penn

    # special mode --
    # with 1 arg, cleanup
    if ($#argv == 1) then
	echo "cleanup"
        echo "rm -fr $multi"
        rm -fr $multi
	echo "rmdir --ignore-fail-on-non-empty /scratch/tmp/$db"
	rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
        exit
    endif

    # special mode --
    # with 3 args, saves an alignment file
    if ($#argv == 3) then
        echo "cp $multi/$2/$c.maf $3"
        ls -og $multi/$2/$c.maf
        cp $multi/$2/$c.maf $3
        exit
    endif 
        
    set s1 = $2
    set s2 = $3
    set flag = $4
    
    # locate input files -- in pairwise dir, or multiple dir
    set d1 = $multi
    set d2 = $multi 
    if (-d $pairs/$s1) then
        set d1 = $pairs
	set f1 = $d1/$s1/$c.maf.gz
	set t1 = /tmp/$s1.$c.maf
	zcat $f1 > $t1
    else
	set f1 = $d1/$s1/$c.maf
	set t1 = /tmp/$s1.$c.maf
	cp -p $f1 $t1
    endif
    if (-d $pairs/$s2) then
        set d2 = $pairs
	set f2 = $d2/$s2/$c.maf.gz
	set t2 = /tmp/$s2.$c.maf
	zcat $f2 > $t2
    else
	set f2 = $d2/$s2/$c.maf
	set t2 = /tmp/$s2.$c.maf
	cp -p $f2 $t2
    endif
    # write to output dir
    set out = $multi/${s1}${s2}
    mkdir -p $out

    # check for empty input file
    if (-s $t1 && -s $t2) then
        echo "Aligning $f1 $f2 $flag"
        $penn/multiz $t1 $t2 $flag $out/$c.unused1.maf \
		$out/$c.unused2.maf > $out/$c.full.maf
        cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
                $out/$c.tmp.maf
        echo "Ordering $c.maf"
        $penn/maf_project $out/$c.tmp.maf $db.$c > $out/$c.maf
	rm -f $t1 $t2
    else if (-s $t1) then
        cp -p $t1 $out/$c.maf
	rm -f $t1
    else if (-s $t2) then
        cp -p $t2 $out/$c.maf
	rm -f $t2
    endif
'EOF'
# << keep emacs coloring happy
    chmod +x oneMultiz.csh
    cp -p oneMultiz.csh /san/sanvol1/scratch/ce3/multiz3way/penn/oneMultiz.csh

    #	using your tree diagram printed above, arrange these alignments
    #	in order of the tree branches

cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
    # multiple alignment steps:
set c = $1
set db = ce3
set s = "/san/sanvol1/scratch/$db/multiz3way/penn/oneMultiz.csh"

$s $c cb2 caeRem1 0
#	 get final alignment file
$s $c cb2caeRem1 /cluster/data/$db/bed/multiz3way/maf/$c.maf
#cleanup
$s $c
'EOF'
# happy emacs
    chmod +x allMultiz.csh

cat  << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/ce3/bed/multiz3way/maf/$(root1).maf}
#ENDLOOP
'EOF'

    awk '{print $1}' ../../../chrom.sizes > chrom.lst

    gensub2 chrom.lst single template jobList
    para create jobList
    para try; para check ... etc
    para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs:        836s      13.93m     0.23h    0.01d  0.000 y
# IO & Wait Time:                    30s       0.50m     0.01h    0.00d  0.000 y
# Average job time:                 124s       2.06m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             184s       3.07m     0.05h    0.00d
# Submission to last job:           184s       3.07m     0.05h    0.00d

    ssh kkstore02
    -d /cluster/data/ce3/bed/multiz3way
    time catDir maf > multiz3way.maf
    ls -og
# -rw-rw-r--  1 195770329 Dec 13 12:03 multiz3way.maf

    #   Create per-chrom individual maf files for downloads
    mkdir mafDownloads
    for M in maf/chr*.maf
    do
        B=`basename $M`
        cp -p ${M} mafDownloads/${B}
        gzip mafDownloads/${B}
        echo ${B} done
    done

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/ce3/multiz3way
    cd /usr/local/apache/htdocs/goldenPath/ce3/multiz3way
    ln -s /cluster/data/ce3/bed/multiz3way/mafDownloads/chr*.maf.gz .

    # Load into database
    ssh hgwdev
    cd /cluster/data/ce3/bed/multiz3way
    mkdir /gbdb/ce3/multiz3way
    ln -s /cluster/data/ce3/bed/multiz3way/multiz3way.maf \
        /gbdb/ce3/multiz3way
    time hgLoadMaf ce3 multiz3way
    #   Loaded 203120 mafs in 1 files from /gbdb/ce3/multiz3way
    #   real    0m30.526s

    time hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 ce3 \
        multiz3waySummary multiz3way.maf
    #   Processed 305166 components in 203120 mafs from multiz3way.maf
    #   real    0m26.164s

#######################################################################
#	phastCons conservation scores on the 3-way
#	DONE - 2005-12-13 - 2005-12-14 - Hiram

    ssh kkstore02
    mkdir /cluster/data/ce3/bed/multiz3way/cons
    cd /cluster/data/ce3/bed/multiz3way/cons

    /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \
        --refseq ../../../softMask/chrV.fa --in-format MAF \
        --windows 100000000,1000 --out-format SS \
        --between-blocks 5000 --out-root s1

     /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
	--tree "(ce3,(caeRem1,cb2))" \
	    --out-root starting-tree

    rm s1.*.ss
    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.372
    #	This 0.372 is used in the --gc argument below

    # Create big bad bloated SS files on san filesystem
    ssh kkstore02
    mkdir -p  /san/sanvol1/scratch/ce3/cons/ss
    cd  /san/sanvol1/scratch/ce3/cons/ss
    for C in `awk '{print $1}' /cluster/data/ce3/chrom.sizes`
    do
      if [ -s /cluster/data/ce3/bed/multiz3way/maf/${C}.maf ]; then
	mkdir ${C}
	echo msa_split $C
	chrN=${C/chr/}
	chrN=${chrN/_random/}
	/cluster/bin/phast/$MACHTYPE/msa_split \
	    /cluster/data/ce3/bed/multiz3way/maf/${C}.maf \
	    --refseq /cluster/data/ce3/softMask/${C}.fa \
	    --in-format MAF --windows 1000000,0 --between-blocks 5000 \
	    --out-format SS --out-root ${C}/${C}
      fi
    done

    # Create a random list of 50 1 mb regions  (do not use the _randoms)
    cd /san/sanvol1/scratch/ce3/cons/ss
    ls -1l chr*/chr*.ss | grep -v random | \
	awk '$5 > 3000000 {print $9;}' | randomLines stdin 50 ../randomSs.list

    # Set up parasol directory to calculate trees on these 50 regions
    ssh pk
    mkdir /san/sanvol1/scratch/ce3/cons/treeRun4
    cd /san/sanvol1/scratch/ce3/cons/treeRun4
    mkdir tree log

    #	Tuning this loop should come back to here to recalculate 
    # Create  script that calls phastCons with right arguments
    #	We are tuning target-coverage and expected-length
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
      /cluster/data/ce3/bed/multiz3way/cons/starting-tree.mod \
      --gc 0.372 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-length 27 --target-coverage 0.45 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    #	emacs happy
    chmod a+x makeTree.csh

    # Create gensub file
    cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    # Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs:        919s      15.31m     0.26h    0.01d 0.000 y
# IO & Wait Time:                   197s       3.29m     0.05h    0.00d 0.000 y
# Average job time:                  22s       0.37m     0.01h    0.00d
# Longest finished job:              37s       0.62m     0.01h    0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kkstore02
    cd /san/sanvol1/scratch/ce3/cons/treeRun4
    ls -1 tree/chr*/*.cons.mod > cons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
	--output-average ave.cons.mod > cons_summary.txt 2>&1
    ls -1 tree/chr*/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
	--output-average ave.noncons.mod > noncons_summary.txt
    cd ..
    cp -p ave.*.mod /cluster/data/ce3/bed/multiz3way/cons

    #	measuring entropy
    #	consEntopy <target coverage> <expected lengths>
    #		 ave.cons.mod ave.noncons.mod --NH 9.78
    #	never stops with the --NH argument
    /cluster/bin/phast/$MACHTYPE/consEntropy .45 27 \
                        ave.cons.mod ave.noncons.mod

#	aiming for PIT=10 bits
#	With a PIT of 10 bits at 27 and .45 we will use these numbers
#		for the full run
#	at --expected-length 27 --target-coverage 0.45
# Transition parameters: gamma=0.450000, omega=27.000000, mu=0.037037,
#	nu=0.030303
# Relative entropy: H=0.326190 bits/site
# Expected min. length: L_min=30.684329 sites
# Expected max. length: L_max=24.341047 sites
# Phylogenetic information threshold: PIT=L_min*H=10.008930 bits
#	at --expected-length 22 --target-coverage 0.45
# Transition parameters: gamma=0.450000, omega=22.000000, mu=0.045455,
#	nu=0.037190
# Relative entropy: H=0.343713 bits/site
# Expected min. length: L_min=27.428995 sites
# Expected max. length: L_max=21.416559 sites
# Phylogenetic information threshold: PIT=L_min*H=9.427716 bits

#	at --expected-length 25 --target-coverage 0.45
# Transition parameters: gamma=0.450000, omega=25.000000, mu=0.040000,
#	nu=0.032727
# Relative entropy: H=0.332608 bits/site
# Expected min. length: L_min=29.436557 sites
# Expected max. length: L_max=23.218502 sites
# Phylogenetic information threshold: PIT=L_min*H=9.790834 bits

#	at --expected-length 12 --target-coverage 0.17
# Transition parameters: gamma=0.170000, omega=12.000000, mu=0.083333,
#	nu=0.017068
# Relative entropy: H=0.363555 bits/site
# Expected min. length: L_min=35.407009 sites
# Expected max. length: L_max=16.168434 sites
# Phylogenetic information threshold: PIT=L_min*H=12.872407 bits


    ssh pk
    # Create cluster dir to do main phastCons run
    mkdir /san/sanvol1/scratch/ce3/cons/consRun1
    cd /san/sanvol1/scratch/ce3/cons/consRun1
    mkdir ppRaw bed
    cp -p /san/sanvol1/scratch/ce3/cons/treeRun4/ave.*.mod .

    # Create script to run phastCons with right parameters
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
   --expected-length 27 --target-coverage 0.45 --quiet \
	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.*.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # emacs happy
    chmod a+x doPhast.csh

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list

    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
    #	These jobs are very fast and very I/O intensive, even on the san
    #	they can hang it up as they work at full tilt.
# Completed: 104 of 104 jobs
# CPU time in finished jobs:        331s       5.51m     0.09h    0.00d  0.000 y
# IO & Wait Time:                  1066s      17.77m     0.30h    0.01d  0.000 y
# Average job time:                  13s       0.22m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              54s       0.90m     0.01h    0.00d
# Submission to last job:            75s       1.25m     0.02h    0.00d

    # combine predictions and transform scores to be in 0-1000 interval
    ssh kkstore02
    cd /san/sanvol1/scratch/ce3/cons/consRun1

    #	The sed's and the sort get the file names in chrom,start order
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/ce3/bed/multiz3way

    # Figure out how much is actually covered by the mostConserved data as so:
    #	The 100291840 comes from the non-n genome size,
    #	from faSize on all chroms:
    cd /cluster/data/ce3
    faSize softMask/*.fa 
    #	100291840 bases (0 N's 100291840 real 86891657 upper 13400183 lower)
    #	in 7 sequences in 7 files

    cd /cluster/data/ce3/bed/multiz3way
    awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/100291840\n",100.0*sum/100291840,sum}' \
	mostConserved.bed
    -target-coverage 0.45: % 22.53 = 100.0*22595226/100291840


    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/ce3/bed/multiz3way
    hgLoadBed -strict ce3 phastConsElements mostConserved.bed
    #	Loaded 185737 elements of size 5
    #	~1 minute load time
    featureBits ce3 -enrichment sangerGene:cds phastConsElements
# -target-coverage 0.45 and expected lengths 27:
# sangerGene:cds 25.226%, phastConsElements 22.529%, both 13.627%,
#	cover 54.02%, enrich 2.40x

    # Create merged posterier probability file and wiggle track data files
    # the sed business gets the names sorted by chromName, chromStart
    #	so that everything goes in numerical order into wigEncode
    ssh kkstore02
    cd /san/sanvol1/scratch/ce3/cons/consRun1
    find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	    | wigEncode stdin phastCons3.wig phastCons3.wib
    # about 40 seconds for above
# -rw-rw-r--  1 54890327 Dec 14 12:34 phastCons3.wib
# -rw-rw-r--  1  9271833 Dec 14 12:34 phastCons3.wig

    cp -p phastCons3.wig phastCons3.wib /cluster/data/ce3/bed/multiz3way

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/ce3/bed/multiz3way
    ln -s `pwd`/phastCons3.wib /gbdb/ce3/wib/phastCons3.wib
    hgLoadWiggle ce3 phastCons3 phastCons3.wig
    #  ~ 1.5 minute load

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=ce3 phastCons3 > histogram.data 2>&1
    #	about 8 minutes to scan all data

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " C. elegans Ce3 Histogram phastCons3 track"
set xlabel " phastCons3 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	happy emacs

    display histo.png &

    #	prepare compressed copy of ascii data values for downloads
    ssh pk
    cd /san/sanvol1/scratch/ce3/cons/consRun1
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons3Scores

for D in ppRaw/chr*
do
    C=${D/ppRaw\/}
    out=phastCons3Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
	    gzip > ${out}
done
'_EOF_'
    #	happy emacs
    chmod +x gzipAscii.sh
    time ./gzipAscii.sh
    #	takes 1 minute, makes 95 Mb of data
    #	copy them for downloads
    ssh kkstore02
    mkdir /cluster/data/ce3/bed/multiz3way/phastCons3Scores
    cd /cluster/data/ce3/bed/multiz3way/phastCons3Scores
    rsync -a --progress \
	pk:/san/sanvol1/scratch/ce3/cons/consRun1/phastCons3Scores/ .
    #	10 second copy

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/ce3/phastCons3Scores
    cd /usr/local/apache/htdocs/goldenPath/ce3/phastCons3Scores
    ln -s /cluster/data/ce3/bed/multiz3way/phastCons3Scores/*.gz .

###########################################################################
#	preparing downloads  (WORKING - 2005-12-14 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/ce3/goldenPath
    mkdir /cluster/data/ce3/goldenPath/bigZips
    mkdir /cluster/data/ce3/goldenPath/chromosomes
    cd /cluster/data/ce3/
    cp -p softMask/chr*.fa goldenPath/chromosomes
    gzip goldenPath/chromosomes/chr*.fa

    cat RMRun/chrI.fa.out > goldenPath/bigZips/rmsk.out
    tail +4 RMRun/chrII.fa.out >> goldenPath/bigZips/rmsk.out
    tail +4 RMRun/chrIII.fa.out >> goldenPath/bigZips/rmsk.out
    tail +4 RMRun/chrIV.fa.out >> goldenPath/bigZips/rmsk.out
    tail +4 RMRun/chrV.fa.out >> goldenPath/bigZips/rmsk.out
    tail +4 RMRun/chrX.fa.out >> goldenPath/bigZips/rmsk.out
    tail +4 RMRun/chrM.fa.out >> goldenPath/bigZips/rmsk.out
    gzip goldenPath/bigZips/rmsk.out
    cd softMask
    tar cvzf ../goldenPath/bigZips/chromFa.tar.gz ./chr*.fa

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/ce3/bigZips
    mkdir /usr/local/apache/htdocs/goldenPath/ce3/chromosomes
    cd /usr/local/apache/htdocs/goldenPath/ce3/bigZips
    ln -s /cluster/data/ce3/goldenPath/bigZips/* .
    cd /usr/local/apache/htdocs/goldenPath/ce3/chromosomes
    ln -s /cluster/data/ce3/goldenPath/chromosomes/chr*.fa.gz .


###########################################################################
# MAKE 11.OOC FILE FOR BLAT and GENBANK (DONE - 2006-01-11 - hiram)
# Use -repMatch=40 (based on size, for human use 1024)
    ssh kkr1u00
    cd /iscratch/i/worms/Celegans3
    ls -1 `pwd`/nib/chr*.nib > nib.lst
    time /cluster/bin/x86_64/blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=11.ooc -repMatch=40
    #	Wrote 43651 overused 11-mers to 11.ooc
    #	real    0m4.562s
    rm nib.lst

    #	copy to other iservers
for i in 2 3 4 5 6 7 8
do
    rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/Celegans3/ \
	kkr${i}u00:/iscratch/i/worms/Celegans3/
done

###########################################################################
# ENABLE GENBANK UPDATE (DONE 2006-01-11 - Hiram)

# add ce3 to the following three files and check them in.
#	The genbank.conf entry is a copy of the existing ce2 entry with
#	appropriate changes for pathnames, etc ...

    src/hg/makeDb/genbank/etc/align.dbs
    src/hg/makeDb/genbank/etc/hgwdev.dbs
    src/hg/makeDb/genbank/etc/genbank.conf

    # go to /cluster/data/genbank/etc and do cvs update on these three files.

    ssh kkstore02
    cd /cluster/data/genbank
    nice ./bin/gbAlignStep -initial ce3
    #	logFile: var/build/logs/2006.01.11-11:30:41.ce3.initalign.log

    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -initialLoad ce3
    #	var/dbload/hgwdev/logs/2006.01.11-12:24:14.dbload.log

###########################################################################
# MAKE WORMBASE GENEFINDER TRACKS (DONE, 2006-01-11, hartera)
    # next machine
    ssh kkstore02
    mkdir -p /cluster/data/ce3/bed/sangerGenefinder
    cd /cluster/data/ce3/bed/sangerGenefinder
    # the perl trims these files down to a reasonable size.  As they are
    #   they cause ldHgGene to hangup due to memory size.
    for C in I II III IV V X M
    do
        echo -n "${C} "
        zcat ../../WS140/chr${C}.gff.gz | \
        sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
        -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
        -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
        -e "s/CHROMOSOME_M/chrM/g" \
        -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
        perl -ne '@a = split;
        print if(($a[1] =~ m/Genefinder/i) &&
        ($a[2] =~ m/intron|exon|cds|sequence|transcri/i));' | \
	    sed -e 's/CDS //g; s/"//g' > chr${C}.gff
    done
    #  check file sizes, should be reasonable
    ls -ogrt
total 25024
-rw-rw-r--  1 3927756 Jan 11 15:49 chrI.gff
-rw-rw-r--  1 4205150 Jan 11 15:50 chrII.gff
-rw-rw-r--  1 3567022 Jan 11 15:50 chrIII.gff
-rw-rw-r--  1 4167290 Jan 11 15:51 chrIV.gff
-rw-rw-r--  1 5547317 Jan 11 15:51 chrV.gff
-rw-rw-r--  1 4165641 Jan 11 15:52 chrX.gff
-rw-rw-r--  1       0 Jan 11 15:52 chrM.gff

    # now load database with those transformed gff files
    ssh hgwdev
    cd /cluster/data/ce3/bed/sangerGenefinder
    time ldHgGene ce3 sangerGenefinder *.gff
    #	Read 62263 transcripts in 401688 lines in 7 files
    #	62263 groups 6 seqs 1 sources 4 feature types
    #	21182 gene predictions
    #	real    0m7.532s

########################################################################
# TWINSCAN GENE PREDICTIONS (DONE, 2006-01-11, hartera)
    # These are Iscan (new version of Twinscan) gene predictions
    # e-mailed Michael Brent at WUSTL: brent@cse.wustl.edu to obtain data
    #	*if* it isn't already at the location below.  If it has been
    #	some time since the WS release, it is probably already there.
    ssh hgwdev
    mkdir /cluster/data/ce3/bed/twinscan
    cd /cluster/data/ce3/bed/twinscan

    for D in chr_gtf chr_ptx chr_tx
    do
	mkdir ${D}
	cd ${D}
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=1 --no-parent \
	--no-host-directories --cut-dirs=6 \
"http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS140/Twinscan/${D}/"
	cd ..
    done
    #	clean up index files
    rm chr*/index*

    #	There used to be a lot of chrom name fixups to these files, but
    #	it appears with this set the names are already in our usual chrN
    #	format rather than the previous CHROMOSOME_N format
    #	Except for chrM
    sed -e "s/chrMtDNA/chrM/g" chr_gtf/CHROMOSOME_MtDNA.gtf \
	> chr_gtf/CHROMOSOME_M.gtf
    mv chr_gtf/CHROMOSOME_MtDNA.gtf chr_gtf/CHROMOSOME_MtDNA.gtf.original

    sed -e "s/chrMtDNA/chrM/g" chr_ptx/CHROMOSOME_MtDNA.ptx \
	> chr_ptx/CHROMOSOME_M.ptx
    mv chr_ptx/CHROMOSOME_MtDNA.ptx chr_ptx/CHROMOSOME_MtDNA.ptx.original

    # load data into database, need -gtf option as no stop codons in
    # these predictions. also use new -frame -id and -geneName options
    ldHgGene -gtf -genePredExt ce3 twinscan chr_gtf/CHROMOSOME_*.gtf
    
    #	Read 21985 transcripts in 169772 lines in 7 files
    #	21985 groups 7 seqs 1 sources 3 feature types
    #	21985 gene predictions
    hgPepPred ce3 generic twinscanPep chr_ptx/CHROMOSOME_*.ptx

########################################################################
# MAKE ALT. SPLICING TRACK (DONE - 2006-01-12 - Hiram)
    # create rnaCluster table and Gene Bounds track
    ssh hgwdev
    mkdir /cluster/data/ce3/bed/rnaCluster
    cd /cluster/data/ce3/bed/rnaCluster
    for C in I II III IV V X
    do
        echo "clusterRna ce3 /dev/null chr${C}.bed -chrom=${C}"
        clusterRna ce3 /dev/null chr${C}.bed -chrom=chr${C}
    done
    #	Are we getting about the same numbers as before ?
    #	Do a wc on these bed files, should be something like:
    wc *.bed | sort -n
    #	2628   31536  193683 chrIII.bed
    #	2767   33204  194963 chrX.bed
    #	2878   34536  202197 chrI.bed
    #	3245   38940  235165 chrIV.bed
    #	3506   42072  251726 chrII.bed
    #	4981   59772  353344 chrV.bed
    #  20005  240060 1431078 total

    hgLoadBed -strict ce3 rnaCluster *.bed

    hgsql -N -e "select chrom from chromInfo" ce3 > chrom.list

    (perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce3 | egrep -v chromStart | cut -f2\-13 > $_.ce3.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.list > rnaCluster.sh

    chmod a+x rnaCluster.sh
    ./rnaCluster.sh
    #	And the result of that should look like:
    wc *.rnaCluster.bed | sort -n
    #	   0       0       0 chrM.ce3.rnaCluster.bed
    #	2628   31536  193683 chrIII.ce3.rnaCluster.bed
    #	2767   33204  194963 chrX.ce3.rnaCluster.bed
    #	2878   34536  202197 chrI.ce3.rnaCluster.bed
    #	3245   38940  235165 chrIV.ce3.rnaCluster.bed
    #	3506   42072  251726 chrII.ce3.rnaCluster.bed
    #	4981   59772  353344 chrV.ce3.rnaCluster.bed
    #  20005  240060 1431078 total

    mkdir /cluster/data/ce3/bed/altSplice
    cd /cluster/data/ce3/bed/altSplice

    hgsql -N -e "select chrom from chromInfo" ce3 > chrom.list
    perl -e 'while(<>) {chomp($_); print "altSplice -db=ce3 -beds=../rnaCluster/$_.ce3.rnaCluster.bed -agxOut=agx/$_.ce3.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.list > altSplice.para.spec
    # these jobs are no big deal on the worm, just run this file:
    chmod +x altSplice.para.spec
    mkdir agx
    time ./altSplice.para.spec
    #	real    6m55.219s
    #	The result of that should look like:
    wc agx/*.agx | sort -n
    #	2579   46422 2379008 agx/chrIII.ce3.rnaCluster.agx
    #	2681   48258 2320195 agx/chrX.ce3.rnaCluster.agx
    #	2817   50706 2516155 agx/chrI.ce3.rnaCluster.agx
    #	3121   56178 2420981 agx/chrIV.ce3.rnaCluster.agx
    #	3388   60984 2454789 agx/chrII.ce3.rnaCluster.agx
    #	4873   87714 2939995 agx/chrV.ce3.rnaCluster.agx

    cat agx/*.agx > altSplice.agx
    agxToBed altSplice.agx altSplice.bed

    hgLoadBed -strict -notItemRgb \
	-sqlTable=$HOME/kent/src/hg/lib/altGraphX.sql ce3 \
	    altGraphX altSplice.agx
    #	Loaded 19459 elements of size 18

########################################################################
# Create frames based page views of alt splice areas of interest
#	(DONE - 2005-01-12 - Hiram)
#	A link to this business was added to the description page
#	on each of the ce1 ce2 and ce3 browsers and all their
#	frames based viewpoints were placed in goldenPath to you can
#	click through from the description page
    ssh hgwdev
    mkdir /cluster/data/ce3/bed/altSplice/altAnalysis
    cd /cluster/data/ce3/bed/altSplice/altAnalysis
    altAnalysis ce3 ../altSplice.agx altSummary.txt links.html \
	frame.html -bedName=all
    # creates files, (as well as several .bed files):
    #	-rw-rw-r--    1      282 May 16 09:19 frame.html
    #	-rw-rw-r--    1   391029 May 16 09:19 links.html
    #	Copy these to the Intronerator WS120 location, change
    #	default opening location:

    mkdir /usr/local/apache/htdocs/goldenPath/ce3/altGraphX
    sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \
	> /usr/local/apache/htdocs/goldenPath/ce3/altGraphX/links.html

    sed -e \
    "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
	-e "s/NM_005487/chrII:14689493-14690232/" \
	-e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \
	 frame.html \
	    > /usr/local/apache/htdocs/goldenPath/ce3/altGraphX/frame.html

########################################################################
# HGNEAR PROTEIN BLAST TABLES (DONE - 2005-01-12 - Hiram)
    ssh pk

    #	Fetch WormBase peptides for this release
    mkdir /cluster/data/ce3/bed/blastp
    cd /cluster/data/ce3/bed/blastp
    wget --timestamping -O wormPep140.faa \
	ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep140/wormpep140

    mkdir /cluster/data/ce3/bed/hgNearBlastp
    cd /cluster/data/ce3/bed/hgNearBlastp
    cat << _EOF_ > config.ra
# This elegans release vs. other Gene Sorter orgs:
# self, human, mouse, rat, zebrafish, yeast

targetGenesetPrefix wormBase
targetDb ce3
queryDbs ce3 hg17 mm7 rn3 danRer3 sacCer1

ce3Fa /cluster/data/ce3/bed/blastp/wormPep140.faa
hg17Fa /cluster/data/hg17/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa

buildDir /cluster/data/ce3/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/ce3HgNearBlastp
_EOF_
    doHgNearBlastp.pl config.ra >& do.log &
    tail -f do.log

    cat << _EOF_ > config.ra
# This elegans release vs. other Gene Sorter orgs:
# self, human, mouse, rat, zebrafish, yeast

targetGenesetPrefix wormBase
targetDb ce3
queryDbs ce3

ce3Fa /cluster/data/ce3/bed/blastp/wormPep140.faa

buildDir /cluster/data/ce3/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/ce3HgNearBlastp
_EOF_
    doHgNearBlastp.pl config.ra >& do.log &
    tail -f do.log

#########################################################################
# Waba alignment with briggsae  (WORKING - 2006-01-12 - Hiram)
    ssh pk
    #	We need some unmasked FA for these things
    mkdir /san/sanvol1/scratch/worms/Celegans3/unmaskedFa
    cd /san/sanvol1/scratch/worms/Celegans3/unmaskedFa
    for C in I II III IV V X M
    do
	echo -n "chr${C}: "
	twoBitToFa -seq=chr${C} ../ce3.2bit stdout | \
		tr [a-z] [A-Z] | sed -e "s/CHR${C}/chr${C}/" > chr${C}.fa
	faSize chr${C}.fa
    done

    mkdir /cluster/data/ce3/bed/cb2.waba
    cd /cluster/data/ce3/bed/cb2.waba
    for C in I II III IV V X I_random II_random III_random IV_random \
	V_random X_random Un
    do
	ls -1S /san/sanvol1/scratch/worms/cb2/split/${C}/*.fa
    done > cb2.list

    ls -1S /san/sanvol1/scratch/worms/Celegans3/unmaskedFa/chr*.fa \
	> elegans.list
    mkdir out
    cat elegans.list | while read FN
    do
	b=`basename ${FN}`
	mkdir out/${b%%.fa}
    done
    # create scripts to be used here
    cat << '_EOF_' > wabaRun
#!/bin/csh -fe
#
#	$1 - full pathname to a briggsae contig
#	$2 - file path to an elegans chrom.fa
#	$3 - result file full pathname
#
set f = $1:t
set chr = $2:t
set d = $chr:r
mkdir -p /scratch/tmp/$d/$f
cp $1 /scratch/tmp/$d/$f
pushd .
cd /scratch/tmp/$d/$f
set t = $f:r
/cluster/bin/x86_64/waba 1 $f $2 $t.1
/cluster/bin/x86_64/waba 2 $t.1 $t.2
/cluster/bin/x86_64/waba 3 $t.2 $t.wab
popd
cp -p /scratch/tmp/$d/$f/$t.wab $3
rm -f /scratch/tmp/$d/$f/$t.*
rmdir --ignore-fail-on-non-empty /scratch/tmp/$d/$f
rmdir --ignore-fail-on-non-empty /scratch/tmp/$d
'_EOF_'
    #	happy emacs
    chmod +x wabaRun

    cat << '_EOF_' > jobTemplate
#LOOP
./wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists out/$(root2)/$(root1).wab}
#ENDLOOP
'_EOF_'
    #	happy emacs

    gensub2 cb2.list elegans.list jobTemplate jobList
    para create jobList
    para try
    para check
    para push
    # one of the jobs takes quite a while.  Most of the others are OK:
Completed: 6950 of 6951 jobs
Crashed: 1 jobs
CPU time in finished jobs:   19284472s  321407.87m  5356.80h  223.20d  0.612 y
IO & Wait Time:                 63556s    1059.26m    17.65h    0.74d  0.002 y
Average job time:                2784s      46.40m     0.77h    0.03d
Longest job:                    47017s     783.62m    13.06h    0.54d
Submission to last job:         58555s     975.92m    16.27h    0.68d

    #	The failed job is:
    # /cluster/store5/worm/ce3/bed/waba/wabaRun \
    #	/iscratch/i/worms/Cbriggsae/contigs/c0907.fa \
    #	/iscratch/i/worms/Celegans2/unmaskedFa/chrI.fa \
    #	/cluster/store5/worm/ce3/bed/waba/out/chrI/c0907.wab
XXXX - running 2004-04-07 - retrying this stand-along on kkr1u00
    # after about 1.5 hours fails with the error:
    # waba: xenbig.c:550: mergeTwo: Assertion `c->qStart == qStart &&
    #	c->qEnd == qEnd' failed.
    # Abort

    # Ignoring that error, and moving on

    # next machine
    ssh hgwdev

    cd /cluster/data/ce3/bed/waba
    mkdir Load
    cd Load
    #  The cat through the pipe to hgWaba will avoid making large files
    # 	that are not needed.
    cat << '_EOF_' > loadEm.sh
#!/bin/sh
#
for c in I II III IV V X M
do
	echo -n "${c} "
        cat /cluster/store/data/ce3/bed/waba/out/chr${c}/c*.wab |
        hgWaba ce3 Cbr chr${c} 0 stdin > proc${c}.out 2>&1
done

exit 0
'_EOF_'
    chmod +x loadEm.sh

    # run it to load the waba track:
    ./loadEm.sh
    # remove garbage temp file:
    rm full_waba.tab chrom_waba.tab

    # worm/ce3/trackDb.ra entry:
#   track cbrWaba
#   shortLabel Briggsae Waba
#   longLabel C. briggsae Waba Alignments
#   group compGeno
#   priority 20
#   visibility dense
#   color 140,0,200
#   altColor 210,140,250