# for emacs: -*- mode: sh; -*-


# This file describes browser build for the Orangutan
# genome, July 31 2007
#
#	"$Id: ponAbe2.txt,v 1.23 2010/02/12 23:51:36 hiram Exp $"
#
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABGA00
#
######################################################################
## DOWNLOAD SEQUENCE (DONE - 2007-09-06 - Hiram)
    ssh kkstore02
    mkdir /cluster/store5/ponAbe2
    ln -s /cluster/store5/ponAbe2 /cluster/data/ponAbe2
    mkdir /cluster/data/ponAbe2/wustl
    cd /cluster/data/ponAbe2/wustl
    # LaDeana Hillier provided us with a private copy before it got to the
    # public site
    wget --timestamping \
"http://genome.wustl.edu/pub/user/lhillier/private/P_abelii/orang_070731.tar.gz"

    tar xvzf orang_070731.tar.gz
    #	creates a directory FINAL_070731
    ls -ogrt
# drwxrwsr-x  2      4096 Sep  6 03:29 FINAL_070731
# -rw-rw-r--  1 906641840 Sep  6 05:01 orang_070731.tar.gz

    # chrX.agp and chr20.agp had blank lines at the end, remove those manually

#######################################################################
## create config.ra and run makeGenomeDb.pl (DONE - 2007-09-12 - Hiram)
    ssh kkstore02
    cd /cluster/data/ponAbe2

    cat << '_EOF_' > ponAbe2.config.ra
# Config parameters for makeGenomeDb.pl:
db ponAbe2
scientificName Pongo pygmaeus abelii
commonName Orangutan
assemblyDate Jul. 2007
assemblyLabel WUSTL 3.0
orderKey 31
# clade already on in ponAbe1
# clade mammal
# genomeCladePriority 11
# mitoAcc gi:1743294
mitoAcc X97707.1
fastaFiles /cluster/data/ponAbe2/wustl/FINAL_070731/chr*.fa
agpFiles /cluster/data/ponAbe2/wustl/FINAL_070731/chr*.agp
# qualFiles /dev/null
dbDbSpeciesDir orangutan
'_EOF_'
    # << happy emacs
    time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \
	ponAbe2.config.ra > makeGenomeDb.out 2>&1 &
    #	real    19m49.887s
    # failed in the makeDb due to chrM vs. AGP confusion.  Fixed the code,
    #	and ran the jkStuff/makeDb.csh manually.  Then continuing:
    time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \
	-continue=dbDb -stop=dbDb ponAbe2.config.ra > dbDb.out 2>&1 &
    #	no need to run the trackDb business since we can simply
    #	copy the ponAbe1 directory in the source tree

##########################################################################
## Repeat masker (DONE - 2007-09-15 - Hiram)
    ssh kkstore02
    ## use screen for this
    mkdir /cluster/data/ponAbe2/bed/RepeatMasker
    cd /cluster/data/ponAbe2/bed/RepeatMasker
    time nice -n +19 ~/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-bigClusterHub=pk \
	-buildDir=/cluster/data/ponAbe2/bed/RepeatMasker ponAbe2 > do.out 2>&1 &
    #	real    1831m46.301s

##############################################################################
## simpleRepeat masking (DONE - 2007-09-19 - Hiram)
    ## create a kki kluster run
    ssh kkr1u00
    mkdir /iscratch/i/ponAbe2
    cd /iscratch/i/ponAbe2
    cp -p /cluster/data/ponAbe2/ponAbe2.unmasked.2bit .
    cp -p /cluster/data/ponAbe2/chrom.sizes .
    twoBitToFa ponAbe2.unmasked.2bit ponAbe2.unmasked.fa
    mkdir split
    #  split sequence into individual chrom sequences
    faSplit sequence ponAbe2.unmasked.fa 100 split/s_
    rm ponAbe2.unmasked.fa ponAbe2.unmasked.2bit

    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/
done

    ssh kki
    mkdir -p /cluster/data/ponAbe2/bed/simpleRepeat/trf
    cd /cluster/data/ponAbe2/bed/simpleRepeat/trf

    cat << '_EOF_' > runTrf
#!/bin/csh -fe 
#
set C = $1:r
set SRC = /iscratch/i/ponAbe2/split/$C.fa
mkdir -p /scratch/tmp/$C
cp -p $SRC /scratch/tmp/$C/$C.fa
pushd /scratch/tmp/$C
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf $C.fa \
	/dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C
popd
rm -f $C.bed
cp -p /scratch/tmp/$C/$C.bed .
rm -fr /scratch/tmp/$C
'_EOF_'
    # << happy emacs
    chmod +x runTrf

    cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line $(root1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls /iscratch/i/ponAbe2/split > part.list
    gensub2 part.list single template jobList
    para create jobList
    para try ... check ... push ... etc ...
# Completed: 55 of 55 jobs
# CPU time in finished jobs:      57854s     964.23m    16.07h    0.67d  0.002 y
# IO & Wait Time:                   473s       7.88m     0.13h    0.01d  0.000 y
# Average job time:                1060s      17.67m     0.29h    0.01d
# Longest finished job:           32538s     542.30m     9.04h    0.38d
# Submission to last job:         32538s     542.30m     9.04h    0.38d

    ssh kkstore02
    cd /cluster/data/ponAbe2/bed/simpleRepeat/trf
    cat s_*.bed > ../simpleRepeat.bed
    cd ..
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/simpleRepeat
    time nice -n +19 hgLoadBed ponAbe2 simpleRepeat \
      simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #	Loaded 1036457 elements of size 16

    nice -n +19 featureBits ponAbe2 simpleRepeat \
	> fb.simpleRepeat.ponAbe2.txt 2>&1 &
    cat fb.simpleRepeat.ponAbe2.txt
    #	120386097 bases of 3093572278 (3.891%) in intersection

    #	add the trfMask to the rmsk masked sequence to get our final
    #	masked sequence
    ssh kkstore02
    cd /cluster/data/ponAbe2
    time nice -n +19 cat bed/simpleRepeat/trfMask.bed \
	| twoBitMask -add -type=.bed ponAbe2.rmsk.2bit stdin ponAbe2.2bit
    #	can safely ignore the warning about >= 13 fields
    #	measure it
    time nice -n +19 twoBitToFa ponAbe2.2bit stdout \
	| faSize stdin > faSize.ponAbe2.2bit.txt 2>&1
    grep masked faSize.ponAbe2.2bit.txt
    #	%45.68 masked total, %50.89 masked real

    ## clean up the /iscratch/i/ponAbe2/ directory
    ssh kkr1u00
    cd /iscratch/i/ponAbe2
    rm -fr *
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress --delete --stats /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/
    ssh kkr${R}u00 rmdir /iscratch/i/ponAbe2
done
    cd ..
    rmdir ponAbe2

############################################################################
#  BLATSERVERS ENTRY (DONE - 2007-09-19 - Hiram)
#	Replaced the ponAbe1 blat server with this sequence

    ssh hgwdev
    hgsql -e 'update blatServers set db="ponAbe2" where db="ponAbe1"' \
	hgcentraltest

    #	Do not need to do this insert, just the update above
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("ponAbe2", "blat13", "17784", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("ponAbe2", "blat13", "17785", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence, i.e. the GABRA3 protein from Hg18
    #	to establish the defaultPos next:

############################################################################
    #	Reset default position to GABRA3 gene location (like Hg18)
    hgsql -e \
'update dbDb set defaultPos="chrX:152299762-152501081" where name="ponAbe2";' \
	hgcentraltest

############################################################################
# SWAP BLASTZ Mouse Mm9 (DONE - 2007-09-20 - Hiram)
    # the original
    ssh kkstore02
    screen  # control with a screen
    cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
    cat fb.mm9.chainPonAbe2Link.txt
    #	914561309 bases of 2620346127 (34.902%) in intersection

    #	And, for the swap
    mkdir /cluster/data/ponAbe2/bed/blastz.mm9.swap
    cd /cluster/data/ponAbe2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
	-chainMinScore=3000 -swap -chainLinearGap=medium \
	-bigClusterHub=pk > swap.log 2>&1 &
    #	real    102m23.209s
    cat fb.ponAbe2.chainMm9Link.txt
    #	948458190 bases of 3093572278 (30.659%) in intersection

    #	create syntenicNets for 7-way conservation maf use
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
	-continue=syntenicNet -syntenicNet \
	-chainMinScore=3000 -swap -chainLinearGap=medium \
	-bigClusterHub=pk > synNet.log 2>&1 &
    #	real    80m44.564s

############################################################################
# GENBANK AUTO UPDATE (DONE - 2007-02-20 - Hiram)
    #	Create a lift file as per the procedures for Chimp from the AGP:
    ssh kolossus
    cd /cluster/data/ponAbe2
    cat ponAbe2.agp | /cluster/bin/scripts/agpToLift -revStrand \
	> jkStuff/genbank.lft
    # MAKE 11.OOC FILE FOR BLAT
    blat ponAbe2.2bit \
	 /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
    #	Wrote 36396 overused 11-mers to 11.ooc
    # copy that over to /iscratch/i/ponAbe2/ and get that directory copied
    #	to all the Iservers

    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add ponAbe2 just after panTro2
    # ponAbe2
# Orangutan
ponAbe2.serverGenome = /cluster/data/ponAbe2/ponAbe2.2bit
ponAbe2.clusterGenome = /iscratch/i/ponAbe2/ponAbe2.2bit
ponAbe2.ooc = /iscratch/i/ponAbe2/11.ooc
ponAbe2.align.unplacedChroms = chrUn,chr*_random
ponAbe2.lift = /cluster/data/ponAbe2/jkStuff/genbank.lft
ponAbe2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
ponAbe2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
ponAbe2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
ponAbe2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
ponAbe2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
ponAbe2.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
ponAbe2.downloadDir = ponAbe2
ponAbe2.genbank.est.xeno.load = no
ponAbe2.refseq.mrna.native.load  = yes
ponAbe2.refseq.mrna.xeno.load = yes
ponAbe2.refseq.mrna.xeno.loadDesc  = yes

    cvs ci -m "Added ponAbe2." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    # Edit src/lib/gbGenome.c to add new species.  With these two lines:
    #	static char *ponAbeNames[] = {"Pongo abelii", "Pongo pygmaeus",
    #		"Pongo pygmaeus pygmaeus", "Pongo pygmaeus abelii", NULL};

    cvs ci -m "Added Pongo pygmaeus abelii (Orangutan)." src/lib/gbGenome.c
    make install-server

    ssh genbank
    screen  # control this business with a screen since it takes a while
    cd /cluster/data/genbank

    #	2008-02-14 12:28 add other Pongo names to gbGeneom.c,
    #	ssh to genbank machine, remove data/aligned/*/ponAbe2
    #	and start this build again.
    # This is a call to a script that will push our jobs out to the cluster
    # since it's a big job.  
    time nice -n +19 bin/gbAlignStep -initial ponAbe2 &
    # logFile: var/build/logs/2008.02.14-12:28:36.ponAbe2.initalign.log
    #	real    702m19.137s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ponAbe2
    #	logFile: var/dbload/hgwdev/logs/2008.02.22-17:08:01.dbload.log
    #	real    21m12.354s

    # enable daily alignment and update of hgwdev (DONE - 2007-11-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add ponAbe2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added ponAbe2." etc/align.dbs etc/hgwdev.dbs
    make etc-update

#########################################################################


#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (SCRIPT DONE 10/1/07 angie -- README EDITS TODO)
    cd /cluster/data/ponAbe2
    # Thought this was done in most chrom-based assemblies:
    ssh -x kkstore02 splitFileByColumn \
      `pwd`/bed/simpleRepeat/trfMask.bed `pwd`/bed/simpleRepeat/trfMaskChrom
    # And these aren't quite where expected, so make some symbolic links:
    foreach c (5_h2_hap1 \
               6_cox_hap1 6_qbl_hap2 6_cox_hap1_random 6_qbl_hap2_random)
      set cOldBase = `echo $c | sed -e 's/_random$//;'`
      set cNewBase = `echo $c | sed -e 's/_.*//;'`
      ln -s `pwd`/$cOldBase/chr$c.agp $cNewBase/chr$c.agp
      ln -s `pwd`/$cOldBase/chr$c.fa.out $cNewBase/chr$c.fa.out
    end

    ln -s bed/RepeatMasker/ponAbe2.fa.out .
    makeDownloads.pl ponAbe2 -verbose=2 \
      >& jkStuff/downloads.log & tail -f jkStuff/downloads.log

#TODO:
# *** Edit each README.txt to resolve any notes marked with "***":
#     /cluster/data/ponAbe2/goldenPath/database/README.txt
#     /cluster/data/ponAbe2/goldenPath/bigZips/README.txt
#     /cluster/data/ponAbe2/goldenPath/chromosomes/README.txt
#     (The htdocs/goldenPath/ponAbe2/*/README.txt "files" are just links to those.)
# *** If you have to make any edits that would always apply to future
#     assemblies from the same sequencing center, please edit them into
#     ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help).

#########################################################################
## genscan run (DONE - 2007-11-08 - Hiram)
##	create hard masked sequence
    ssh kkstore02
    cd /cluster/data/ponAbe2
    for C in `cut -f1 chrom.sizes`
do
    CN=${C/_*}
    CN=${CN/chr}
    echo "${CN}/${C}.hard.fa"
    twoBitToFa -seq=${C} ponAbe2.2bit stdout \
	| maskOutFa stdin hard ${CN}/${C}.hard.fa
    ls -ld "${CN}/${C}.hard.fa"
done

    #	And, make sure there aren't any sequences in this lot that have
    #	become all N's with no sequence left in them.  This drives genscan nuts
    faCount ?/*hard.fa ??/*hard.fa > faCount.hard.txt
    #	the lowest three are:
    egrep -v "^#|^total" faCount.hard.txt \
	| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
    #	chr6_qbl_hap2_random 44531
    #	chr5_h2_hap1 25615
    #	chrM 16079
    # this is OK, the numbers mean there is plenty of sequence in those bits

    ssh kkr1u00
    mkdir /iscratch/i/ponAbe2/hardChunks
    cd /iscratch/i/ponAbe2/hardChunks
    #	creating 4,000,000 sized chunks, the chroms stay together as
    #	single pieces.  The contigs get grouped together into 4,000,000
    #	sized fasta files.  You don't want to break these things up
    #	because genscan will be doing its own internal 2.4 million
    #	window on these pieces, and the gene names are going to be
    #	constructed from the sequence name in these fasta files.
    cat /cluster/data/ponAbe2/?/*.hard.fa /cluster/data/ponAbe2/??/*.hard.fa \
	| faSplit about stdin 4000000 c_
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress ./ kkr${R}u00:/iscratch/i/ponAbe2/hardChunks/
done

    ssh hgwdev
    mkdir /cluster/data/ponAbe2/bed/genscan
    cd /cluster/data/ponAbe2/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/ponAbe2/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    #	Since we split on gaps, we have no chunks like that.  You can
    #	verify with faCount on the chunks.
    ls -1Sr /iscratch/i/ponAbe2/hardChunks/c_*.fa > genome.list

    # Create run-time script to operate gsBig in a cluster safe manner
    cat << '_EOF_' > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
'_EOF_'
    # << happy emacs
    chmod +x runGsBig
    cat << '_EOF_' > template
#LOOP
runGsBig /iscratch/i/ponAbe2/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed} 
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 genome.list single template jobList
    para create jobList
    para try, check, push, check, ...
# Completed: 44 of 48 jobs
# Crashed: 4 jobs
# CPU time in finished jobs:     327299s    5454.99m    90.92h    3.79d  0.010 y
# IO & Wait Time:                  1091s      18.18m     0.30h    0.01d  0.000 y
# Average job time:                7463s     124.39m     2.07h    0.09d
# Longest finished job:           93797s    1563.28m    26.05h    1.09d
# Submission to last job:        104573s    1742.88m    29.05h    1.21d
    #	four jobs will not finish.  They don't finish on kolossus either.
    #	The sequence involved is:
    #	c_12.fa -> chr8
    #	c_08.fa -> chr6
    #	c_06.fa -> chr5
    #	c_04.fa -> chr4
    #	So, split up these chroms on a couple of the non-bridged gaps
    ssh kkr1u00
    cd /iscratch/i/ponAbe2/hardChunks
    mkdir ../genscanSplit
    # examine the gap sizes to see if there is a good size range to
    #	break them up on, c_12 turns out to be about 30,000
    faGapSizes c_12.fa
    # this makes 6 or 7 pieces for each of these
    faSplit -minGapSize=29999 -lift=../genscanSplit/c_12.lift gap c_12.fa \
	40000000 ../genscanSplit/c_12_
    faSplit -minGapSize=29999 -lift=../genscanSplit/c_08.lift gap c_08.fa \
	40000000 ../genscanSplit/c_08_
    faSplit -minGapSize=29999 -lift=../genscanSplit/c_06.lift gap c_06.fa \
	40000000 ../genscanSplit/c_06_
    faSplit -minGapSize=29999 -lift=../genscanSplit/c_04.lift gap c_04.fa \
	40000000 ../genscanSplit/c_04_
    #	copy the genscanSplit directory to the other Iservers
    cd ../genscanSplit
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress ./ kkr${R}u00:/iscratch/i/ponAbe2/genscanSplit/
done
    # and run these four items in a small kluster job
    #	can re-use the runGsBig script from before, just a different
    #	source directory location
    ssh kki
    mkdir /cluster/data/ponAbe2/bed/genscan/split
    cd /cluster/data/ponAbe2/bed/genscan/split
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    cat << '_EOF_' > template
#LOOP
../runGsBig /iscratch/i/ponAbe2/genscanSplit $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed} 
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls -1Sr /iscratch/i/ponAbe2/genscanSplit/c_*.fa > genome.list
    gensub2 genome.list single template jobList
    para create jobList
    #	even after this split, three of these jobs would not complete
# Completed: 22 of 25 jobs
# Crashed: 3 jobs
# CPU time in finished jobs:      17154s     285.89m     4.76h    0.20d  0.001 y
# IO & Wait Time:                    77s       1.29m     0.02h    0.00d  0.000 y
# Average job time:                 783s      13.05m     0.22h    0.01d
# Longest finished job:            1135s      18.92m     0.32h    0.01d
# Submission to last job:         22398s     373.30m     6.22h    0.26d
    #	so, take those three jobs and split them up and run them like this
    #	after all that is done, lifting all results back to their chroms
    for F in c_04 c_08 c_12
do
sort -k1,1 -k4,4n gtf/${F}_*.gtf ../secondSplit/${F}_?.gtf | liftUp -type=.gtf stdout split.lift error stdin \
	| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
        | sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
        > ${F}.gtf
sort -k1,1 -k2,2n subopt/${F}_*.bed ../secondSplit/${F}_?.bed | liftUp -type=.bed stdout split.lift error stdin \
	| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
        | sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
        > ${F}.bed
cat pep/${F}_*.pep ../secondSplit/${F}_?.pep \
	| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
        | sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
        > ${F}.pep
echo $F
done
    #	that includes the secondary splits for three jobs too.
    #	the result of all this is copied to ../gtf ../pep and ../subopt
    #	to fully populate the original result, now continuing with the
    #	processing of all results

    # cat and lift the results into single files
    ssh kkstore02
    cd /cluster/data/ponAbe2/bed/genscan
    sort -k1,1 -k4.4n gtf/c_*.gtf > genscan.gtf
    sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed 
    cat pep/c_*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/genscan
    ldHgGene ponAbe2 -gtf genscan genscan.gtf
    #	Read 42190 transcripts in 314789 lines in 1 files
    #	42190 groups 54 seqs 1 sources 1 feature types
    #	42190 gene predictions

    hgPepPred ponAbe2 generic genscanPep genscan.pep
    hgLoadBed ponAbe2 genscanSubopt genscanSubopt.bed
    #	Loaded 517874 elements of size 6

    #	check the numbers
    time nice -n +19 featureBits ponAbe2 genscan
    #	52587049 bases of 3093572278 (1.700%) in intersection

###########################################################################
# Blastz Platypus ornAna1 (DONE - 2007-11-20 - Hiram)
    ssh kkstore05
    screen # use screen to control this job
    mkdir /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
    cd /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13

    cat << '_EOF_' > DEF
# Orangutan vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Orangutan PonAbe2
SEQ1_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Platypus ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    2585m50.826s
    # some of the jobs got confused during kk parasol hub reboots, where they
    # think they aren't done, but they actually are.  A 'para recover' finds
    # nothing to re-run, all the result files are present.
# Completed: 263242 of 263488 jobs
# Crashed: 246 jobs
# CPU time in finished jobs:   48351127s  805852.12m 13430.87h  559.62d  1.533 y
# IO & Wait Time:               7609407s  126823.45m  2113.72h   88.07d  0.241 y
# Average job time:                 213s       3.54m     0.06h    0.00d
# Longest finished job:            2146s      35.77m     0.60h    0.02d
# Submission to last job:        259048s    4317.47m    71.96h    3.00d

    #	after verifying all jobs are actually finished, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-continue=cat -bigClusterHub=kk > cat.log 2>&1 &
    #	real    53m18.757s
    cat fb.ponAbe2.chainOrnAna1Link.txt
    #	214557704 bases of 3093572278 (6.936%) in intersection

    mkdir /cluster/data/ornAna1/bed/blastz.ponAbe2.swap
    cd /cluster/data/ornAna1/bed/blastz.ponAbe2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-swap -bigClusterHub=kk > swap.log 2>&1 &
    #	real    120m20.860s
    cat fb.ornAna1.chainPonAbe2Link.txt
    #	201368027 bases of 1842236818 (10.931%) in intersection

###########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-19 - Hiram)
    ssh kkstore02
    screen # use screen to control this job
    mkdir /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
    cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18

    cat << '_EOF_' > DEF
# Orangutan vs. Marmoset
BLASTZ_M=50

# TARGET: Orangutan PonAbe2
SEQ1_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Marmoset calJac1
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-bigClusterHub=kk > do.log 2>&1 &
    #	real    1441m
# Completed: 97520 of 97520 jobs
# CPU time in finished jobs:   21651347s  360855.79m  6014.26h  250.59d  0.687 y
# IO & Wait Time:              10087025s  168117.08m  2801.95h  116.75d  0.320 y
# Average job time:                 325s       5.42m     0.09h    0.00d
# Longest finished job:            5645s      94.08m     1.57h    0.07d
# Submission to last job:         67173s    1119.55m    18.66h    0.78d
    cat fb.ponAbe2.chainCalJac1Link.txt
    #	2310720863 bases of 3093572278 (74.694%) in intersection

    # reciprocal best net mafs for 7-way multiz
    time nice -n +19 doRecipBest.pl -workhorse=hgwdev ponAbe2 calJac1 \
	> rbest.log 2>&1
    #	this failed at the download step, expects to run on hgwdev
    #	So, let's see what it makes on hgwdev
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
    time nice -n +19 doRecipBest.pl -continue=download -workhorse=hgwdev \
	ponAbe2 calJac1 > rbest.download.log 2>&1 &
    # it populates the directory:
    # /usr/local/apache/htdocs/goldenPath/ponAbe2/vsCalJac1/reciprocalBest

    ssh kkstore02
    mkdir /cluster/data/calJac1/bed/blastz.ponAbe2.swap
    cd /cluster/data/calJac1/bed/blastz.ponAbe2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-swap -bigClusterHub=kk > swap.log 2>&1 &
    #	real    341m54.548s
    cat fb.calJac1.chainPonAbe2Link.txt
    #	2253236255 bases of 2929139385 (76.925%) in intersection

#########################################################################
## 8-Way Multiz (DONE - 2007-11-27 - Hiram)
##
    ssh hgwdev
    mkdir /cluster/data/ponAbe2/bed/multiz8way
    cd /cluster/data/ponAbe2/bed/multiz8way
    #	take the 30-way tree from mm9 and eliminate genomes not in
    #	this alignment
    #	rearrange to get ponAbe2 on the top of the graph
    #	paste this tree into the on-line phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to create the image for the tree diagram

    #	select the 8 organisms from the 30-way recently done on mouse mm9
    /cluster/bin/phast/tree_doctor \
	--prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Orangutan_ponAbe2,Rhesus_rheMac2,Marmoset_calJac1,Platypus_ornAna1,Opossum_monDom4 \
	/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
	> 8-way.fullNames.nh

    #	looks something like this:
(((Mouse_mm9:0.325818,
((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,
Marmoset_calJac1:0.070000):0.058454):0.263313,
Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110);

    #	rearrange to get Orangutan at the top:
    # this leaves us with:
    cat << '_EOF_' > ponAbe2.8-way.nh
((((((Orangutan_ponAbe2:0.020000,
      (Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037):0.013037,
     Rhesus_rheMac2:0.031973):0.036500,
    Marmoset_calJac1:0.070000):0.058454,
  Mouse_mm9:0.325818):0.263313,
 Opossum_monDom4:0.320721):0.088647,
Platypus_ornAna1:0.488110);
'_EOF_'
    #	<< happy emacs

    #	create a species list from that file:
    sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' ponAbe2.8-way.nh \
        | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
        | sed -e "s/.*_//; s/:.*//" | sort > species.list
    #	verify that has 8 db names in it
    # create a stripped down nh file for use in autoMZ run
    echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' ponAbe2.8-way.nh \
	| sed -e "s/  / /g"` > tree.8.nh
    #	that looks like, as a single line:
    # ((((((ponAbe2 (hg18 panTro2)) rheMac2) calJac1) mm9) monDom4) ornAna1)

    # verify all blastz's exists
    cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/ponAbe2/bed/multiz8way
foreach db (`cat species.list`)
    set bdir = /cluster/data/ponAbe2/bed/blastz.$db
    if (-e $bdir/mafRBestNet/chr1.maf.gz) then
	echo "$db mafRBestNet"
    else if (-e $bdir/mafSynNet/chr1.maf.gz) then
	echo "$db mafSynNet"
    else if (-e $bdir/mafNet/chr1.maf.gz) then
	echo "$db mafNet"
    else
	echo "$db mafs not found"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./listMafs.csh
    #	see what it says, the "mafs not found" should only show up on ponAbe2
    ./listMafs.csh
# calJac1 mafRBestNet
# hg18 mafSynNet
# mm9 mafSynNet
# monDom4 mafNet
# ornAna1 mafNet
# panTro2 mafSynNet
# ponAbe2 mafs not found
# rheMac2 mafSynNet


    /cluster/bin/phast/all_dists ponAbe2.8-way.nh > 8way.distances.txt
    grep -i ponabe 8way.distances.txt | sort -k3,3n
# Orangutan_ponAbe2       Human_hg18      0.038910
# Orangutan_ponAbe2       Chimp_panTro2   0.040705
# Orangutan_ponAbe2       Rhesus_rheMac2  0.065010
# Orangutan_ponAbe2       Marmoset_calJac1        0.139537
# Orangutan_ponAbe2       Mouse_mm9       0.453809
# Orangutan_ponAbe2       Opossum_monDom4 0.712025
# Orangutan_ponAbe2       Platypus_ornAna1        0.968061

    #	use the calculated
    #	distances in the table below to order the organisms and check
    #	the button order on the browser.  Zebrafish ends up before
    #	tetraodon and fugu on the browser despite its distance.
    #	And if you can fill in the table below entirely, you have
    #	succeeded in finishing all the alignments required.
    #
#                         featureBits chainLink measures
#                                       chainPonAbe2Link   chain   linearGap
#    distance                     on PonAbe2    on other   minScore
#  1  0.038910 Human_hg18         (% 91.302)   (% 92.892)   3000     medium
#  2  0.040705 Chimp_panTro2      (% 89.431)   (% 91.033)   3000     medium
#  3  0.065010 Rhesus_rheMac2     (% 82.623)   (% 88.157)   3000     medium
#  4  0.139537 Marmoset_calJac1   (% 74.694)   (% 76.925)   3000     medium
#  5  0.453809 Mouse_mm9          (% 30.659)   (% 34.902)   3000     medium
#  6  0.712025 Opossum_monDom4    (% 11.306)   (% 13.137)   5000     loose
#  7  0.968061 Platypus_ornAna1   (%  6.936)   (% 10.931)   5000     loose

    # copy net mafs to cluster-friendly storage, splitting chroms
    # into 50MB chunks  to improve run-time
    # NOTE: splitting will be different for scaffold-based reference asemblies
    ssh hgwdev
    mkdir /cluster/data/ponAbe2/bed/multiz8way/run.split
    cd /cluster/data/ponAbe2/bed/multiz8way/run.split
    #	this works by examining the rmsk table for likely repeat areas
    #	that won't be used in blastz
    mafSplitPos ponAbe2 50 mafSplit.bed

    ssh kki
    cd /cluster/data/ponAbe2/bed/multiz8way/run.split
 
    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set targDb = "ponAbe2"
set db = $1
set sdir = /san/sanvol1/scratch/$targDb/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
    echo "directory $sdir/$db already exists -- remove and retry"
    exit 1
endif
set bdir = /cluster/data/$targDb/bed/blastz.$db
if (! -e $bdir) then
    echo "directory $bdir not found"
    exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
    set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
    set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
    set mdir = $bdir/mafNet
else
    echo "$bdir maf dir not found"
    exit 1
endif
echo $mdir
foreach f ($mdir/*)
    set c = $f:t:r:r
    echo "  $c"
    nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
endif
echo $mdir > $db.done
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    grep -v ponAbe2  ../species.list > split.list
    cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'_EOF_'
    gensub2 split.list single template jobList
    para create jobList
    # 7 jobs
    # start these gently, this is a good load on the san filesystem
    para -maxPush=3 push
    #	wait a while, verify these are running OK
    para push
    # let that run to a couple completions, a few minutes, then again:
    para try
    # etc ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs:       6048s     100.81m     1.68h    0.07d  0.000 y
# IO & Wait Time:                   593s       9.88m     0.16h    0.01d  0.000 y
# Average job time:                 949s      15.81m     0.26h    0.01d
# Longest finished job:            1451s      24.18m     0.40h    0.02d
# Submission to last job:          1471s      24.52m     0.41h    0.02d

    # ready for the multiz run
    ssh pk
    cd /cluster/data/ponAbe2/bed/multiz8way
    #	actually, the result directory here should be maf.split instead of maf
    mkdir -p maf run
    cd run
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    # list chrom chunks, any db dir will do; better would be for the
    # splitter to generate this file
    # We temporarily use __ instead of . to delimit chunk in filename
    # so we can use $(root) to get basename
    find /san/sanvol1/scratch/ponAbe2/splitStrictMafNet -type f \
	| while read F; do basename $F; done \
	| sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list
    wc -l chromChunks.list
        # 104

cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef

    set db = ponAbe2
    set c = $1
    set maf = $2
    set run = `pwd`
    set tmp = /scratch/tmp/$db/multiz.$c
    set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
    rm -fr $tmp
    mkdir -p $tmp
    cp ../tree.8.nh ../species.list $tmp
    pushd $tmp
    foreach s (`cat species.list`)
        set c2 = `echo $c | sed 's/__/./'`
        set in = $pairs/$s/$c2.maf
        set out = $db.$s.sing.maf
        if ($s == ponAbe2) then
            continue
        endif
        if (-e $in.gz) then
            zcat $in.gz > $out
        else if (-e $in) then
            cp $in $out
        else
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    end
    set path = ($run/penn $path); rehash
    $run/penn/autoMZ + T=$tmp E=$db "`cat tree.8.nh`" $db.*.sing.maf $c.maf
    popd
    cp $tmp/$c.maf $maf
    rm -fr $tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/ponAbe2/bed/multiz8way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << emacs
    gensub2 chromChunks.list single template jobList
    para create jobList
    # the 104 jobs run time:
# Completed: 104 of 104 jobs
# CPU time in finished jobs:     174748s    2912.47m    48.54h    2.02d  0.006 y
# IO & Wait Time:                  2952s      49.20m     0.82h    0.03d  0.000 y
# Average job time:                1709s      28.48m     0.47h    0.02d
# Longest finished job:            3340s      55.67m     0.93h    0.04d
# Submission to last job:          4413s      73.55m     1.23h    0.05d

    # put the split maf results back together into single chroms
    ssh kkstore02
    cd /cluster/data/ponAbe2/bed/multiz8way
    # here is where the result directory maf should have already been maf.split
    mv maf maf.split
    mkdir maf
    # going to sort out the redundant header garbage to leave a cleaner maf
    for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
    echo ${C}
    head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf
    grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \
	sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf
    grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf
    tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf
done

    # load tables for a look
    ssh hgwdev
    mkdir -p /gbdb/ponAbe2/multiz8way/maf
    ln -s /cluster/data/ponAbe2/bed/multiz8way/maf/*.maf \
                /gbdb/ponAbe2/multiz8way/maf
    # this generates a large 1 Gb multiz8way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/ponAbe2/multiz8way/maf ponAbe2 multiz8way
    #	real    6m37.534s
    #	Loaded 6981959 mafs in 55 files from /gbdb/ponAbe2/multiz8way/maf

    # load summary table
    time nice -n +19 cat /gbdb/ponAbe2/multiz8way/maf/*.maf \
	| hgLoadMafSummary ponAbe2 -minSize=30000 -mergeGap=1500 \
	 -maxSize=200000  multiz8waySummary stdin
    #	Created 1417364 summary blocks from 29928557 components
    #	and 6981421 mafs from stdin
    #	real    21m35.057s

    # Gap Annotation
    # prepare bed files with gap info
    ssh kkstore02
    mkdir /cluster/data/ponAbe2/bed/multiz8way/anno
    cd /cluster/data/ponAbe2/bed/multiz8way/anno
    mkdir maf run

    #	these actually already all exist from previous multiple alignments
    for DB in `cat ../species.list`
do
    CDIR="/cluster/data/${DB}"
    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
	echo "creating ${DB}.N.bed"
	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
	ls -og ${CDIR}/${DB}.N.bed
    fi
done

    cd run
    rm -f nBeds sizes
    for DB in `grep -v ponAbe2 ../../species.list`
do
    echo "${DB} "
    ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    ssh kki
    cd /cluster/data/ponAbe2/bed/multiz8way/anno/run

    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
    set dir = /cluster/data/ponAbe2/bed/multiz8way
    set c = $1
    cat $dir/maf/${c}.maf | \
        nice mafAddIRows -nBeds=nBeds stdin /cluster/data/ponAbe2/ponAbe2.2bit $2
'_EOF_'
    # << happy emacs
    chmod +x doAnno.csh

    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/ponAbe2/bed/multiz8way/anno/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 /cluster/data/ponAbe2/chrom.sizes > chrom.list
    gensub2 chrom.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 55 of 55 jobs
# CPU time in finished jobs:       2278s      37.97m     0.63h    0.03d  0.000 y
# IO & Wait Time:                  9909s     165.15m     2.75h    0.11d  0.000 y
# Average job time:                 222s       3.69m     0.06h    0.00d
# Longest finished job:            3340s      55.67m     0.93h    0.04d
# Submission to last job:          3897s      64.95m     1.08h    0.05d

    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way/anno
    mkdir -p /gbdb/ponAbe2/multiz8way/anno/maf
    ln -s /cluster/data/ponAbe2/bed/multiz8way/anno/maf/*.maf \
                /gbdb/ponAbe2/multiz8way/anno/maf
    #	by loading this into the table multiz8way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/ponAbe2/multiz8way/anno/maf \
                ponAbe2 multiz8way
    #	Loaded 7331265 mafs in 55 files from /gbdb/ponAbe2/multiz8way/anno/maf
    #	real    8m31.092s

    cat /cluster/data/ponAbe2/chrom.sizes | \
	awk '{if ($2 > 1000000) { print $1 }}' |
	while read C
do
    echo /gbdb/ponAbe2/multiz8way/anno/maf/$C.maf
done | xargs cat | \
        hgLoadMafSummary ponAbe2 -minSize=30000 -mergeGap=1500 \
            -maxSize=200000  multiz8waySummary stdin
    #	Created 1417364 summary blocks from 29928557 components and 7330669
    #	mafs from stdin
    #	by loading this into the table multiz8waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	remove the multiz8way*.tab files in this /scratch/tmp directory
    rm multiz8way*

#############################################################################
## Annotate 8-way multiple alignment with gene annotations
##		(DONE - 2007-11-27 - Hiram)
    # Gene frames
    ## survey all genomes to see what type of gene track to use
    ssh hgwdev
    mkdir /cluster/data/ponAbe2/bed/multiz8way/frames
    cd /cluster/data/ponAbe2/bed/multiz8way/frames
    #	dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
    cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
    echo -n "${db}: "
    echo -n "Tables: "
    set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
    foreach table ($tables)
	if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
	    $table == "knownGene") then
		set count = `hgsql $db -N -e "select count(*) from $table"`
		echo -n "${table}: ${count}, "
	endif
    end
    set orgName = `hgsql hgcentraltest -N -e \
	    "select scientificName from dbDb where name='$db'"`
    set orgId = `hgsql ponAbe2 -N -e \
	    "select id from organism where name='$orgName'"`
    if ($orgId == "") then
	echo "Mrnas: 0"
    else
	set count = `hgsql ponAbe2 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
	echo "Mrnas: ${count}"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./showGenes.csh
    #	given this output, manually sorted for this display:
# hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 29037,
#	refGene: 25902, Mrnas: 209126
# mm9: Tables: knownGene: 49409, mgcGenes: 22950, refGene: 21146, Mrnas: 242485
# monDom4: Tables: ensGene: 33878, refGene: 163, Mrnas: 398
# ornAna1: Tables: ensGene: 25981, refGene: 3, Mrnas: 141
# panTro2: Tables: ensGene: 32852, refGene: 26179, Mrnas: 1297
# rheMac2: Tables: ensGene: 38561, refGene: 426, Mrnas: 3202
# calJac1: Tables: Mrnas: 952
# ponAbe2: Tables: Mrnas: 2

    #	from this information, conclusions:
    #	use knownGene for hg18, mm9
    #	use ensGene for monDom4, ornAna1, panTro2, rheMac2
    #	no annotations for calJac1, ponAbe2

    mkdir genes
    # knownGene
    for DB in hg18 mm9
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    # ensGene
    for DB in monDom4 ornAna1 panTro2 rheMac2
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    ls -og genes
# -rw-rw-r--  1 2008806 Nov 27 14:37 hg18.gp.gz
# -rw-rw-r--  1 1965274 Nov 27 14:37 mm9.gp.gz
# -rw-rw-r--  1 1751726 Nov 27 14:37 monDom4.gp.gz
# -rw-rw-r--  1 1232719 Nov 27 14:37 ornAna1.gp.gz
# -rw-rw-r--  1 1980696 Nov 27 14:37 panTro2.gp.gz
# -rw-rw-r--  1 1935916 Nov 27 14:37 rheMac2.gp.gz


    ##################################################
    # redmine GB - Feature #480 -  missing self frames on multiz 
    #   NwayFrames tables (DONE 2010-07-29) 
    #   re-run the genePredToMafFrames with ponAbe2 genes/ponAbe2
    #
    # fixed up steps:
    # mv frames.log frames.2007-11-27.log
    # mv multiz8way.mafFrames.gz multiz8way.mafFrames.2007-11-27.gz
    # Since ponAbe2 has ensGene: 41488, refGene: 1337, Mrnas: 2153
    #  use ensGene for ponAbe2:
   for DB in  ponAbe2
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    ssh hgwdv
    sizeG=188743680
    export sizeG
    ulimit -d $sizeG
    ulimit -v $sizeG

    cd /cluster/data/ponAbe2/bed/multiz8way/frames
    # leaving out calJac1 ponAbe2
    time (cat  ../maf/*.maf | nice -n +19 genePredToMafFrames ponAbe2 stdin stdout  ponAbe2 genes/ponAbe2.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz rheMac2 genes/rheMac2.gp.gz panTro2 genes/panTro2.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz8way.mafFrames.gz) > frames.log 2>&1
    # real    6m52.150s

    # see what it looks like in terms of number of annotations per DB:
    zcat multiz8way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
    # 192293 ponAbe2
    # 219627 hg18
    # 221459 panTro2
    # 230108 rheMac2
    # 238734 mm9
    # 412384 ornAna1
    # 459521 monDom4
    ## output from multiz8way.mafFrames.2007-11-27.gz matched
    # 219627 hg18
    # 221459 panTro2
    # 230108 rheMac2
    # 238734 mm9
    # 412384 ornAna1
    # 459521 monDom4

    #	load the resulting file
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way/frames
    time nice -n +19 hgLoadMafFrames ponAbe2 multiz8wayFrames \
	multiz8way.mafFrames.gz
    #	real    0m18.905s

    #	enable the trackDb entries:
# frames multiz8wayFrames
# irows on

#############################################################################
# phastCons 8-way (DONE - 2007-10-16 - Hiram)

    # split 8way mafs into 10M chunks and generate sufficient statistics 
    # files for # phastCons
    ssh kki
    mkdir /cluster/data/ponAbe2/bed/multiz8way/msa.split
    cd /cluster/data/ponAbe2/bed/multiz8way/msa.split
    mkdir -p /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/ponAbe2/bed/multiz8way/maf
set WINDOWS = /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss
pushd $WINDOWS
set c = $1
rm -fr $c
mkdir $c
twoBitToFa -seq=$c /scratch/data/ponAbe2/ponAbe2.2bit /scratch/tmp/ponAbe2.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF
/cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \
    -M /scratch/tmp/ponAbe2.$c.fa \
    -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $CLEAN_MAF /scratch/tmp/ponAbe2.$c.fa
popd
date >> $c.done
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	do the easy ones first to see some immediate results
    ls -1S -r ../maf | sed -e "s/.maf//" > maf.list

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... etc
    #	push these with a -maxPush number that would put a single job on
    #	each iserver, you don't want two jobs at a time on each iserver,
    #	they consume too much memory and cause swapping to the point where
    #	they slow to a crawl and take hours to finish.
# Completed: 55 of 55 jobs
# CPU time in finished jobs:       2679s      44.65m     0.74h    0.03d  0.000 y
# IO & Wait Time:                  2980s      49.67m     0.83h    0.03d  0.000 y
# Average job time:                 103s       1.71m     0.03h    0.00d
# Longest finished job:             716s      11.93m     0.20h    0.01d
# Submission to last job:         10391s     173.18m     2.89h    0.12d

    # XXXX Estimates were attempted, not really very useful, instead, as seen
    # below, merely take the cons and noncons trees from the mouse 30-way

    # Estimate phastCons parameters
    #	see also:
    #	http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html

    # Create a list of .ss files over 3,000,000 in length
    #	this is almost everything
    cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss
    ls -1l chr*/chr*.ss | egrep -v "_hap|chrUn|random" | \
	awk '$5 > 3000000 {print $9;}' > ../tuningRun.list

    # Set up parasol directory to calculate trees on these 50 regions
    ssh kk
    mkdir /cluster/data/ponAbe2/bed/multiz8way/treeRun2
    cd /cluster/data/ponAbe2/bed/multiz8way/treeRun2
    mkdir tree log most

    #	Tuning this loop should come back to here to recalculate 
    # Create script that calls phastCons with right arguments
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set SAN="/san/sanvol1/scratch/ponAbe2/multiz8way/cons"
set SS=$1
set C=$1:h
set F=$1:t
set tmpDir="/scratch/tmp/pA2_$2"
rm -fr $tmpDir
mkdir $tmpDir
mkdir -p log/${C} tree/${C} most/${C}
scp -p hiram@pk:$SAN/ss/$1 $tmpDir/$F
scp -p hiram@pk:$SAN/estimate/starting-tree.mod $tmpDir
pushd $tmpDir
/cluster/bin/phast/$MACHTYPE/phastCons $F starting-tree.mod \
      --gc 0.355 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-length 45 --target-coverage 0.3 --most-conserved $F.most \
      --quiet --log $F.log --estimate-trees $F.tree
popd
cp -p $tmpDir/$F.log log/$C
cp -p $tmpDir/$F.most most/$C
cp -p $tmpDir/$F.tree.*cons.mod tree/$C
rm -fr $tmpDir
'_EOF_'
#	<< happy emacs
      chmod a+x makeTree.csh

# Create gensub file
      cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(num1)
#ENDLOOP
'_EOF_'
#	<< happy emacs

# Make cluster job and run it
  scp -p hiram@pk:/san/sanvol1/scratch/ponAbe2/multiz8way/cons/tuningRun.list .
    gensub2 tuningRun.list single template jobList
    para create jobList
    para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs:      51439s     857.32m    14.29h    0.60d  0.002 y
# IO & Wait Time:                   948s      15.79m     0.26h    0.01d  0.000 y
# Average job time:                1048s      17.46m     0.29h    0.01d
# Longest finished job:            1388s      23.13m     0.39h    0.02d
# Submission to last job:          4376s      72.93m     1.22h    0.05d

# Now combine parameter estimates.  We can average the .mod files
# using phyloBoot.  This must be done separately for the conserved
# and nonconserved models
    ls -1 tree/chr*/*.cons.mod > cons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
	--output-average ave.cons.mod > cons_summary.txt
    ls -1 tree/chr*/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
	--output-average ave.noncons.mod > noncons_summary.txt
    sort -k1,1 -k2,2n  most/chr*/*.most > mostConserved.bed
    wc -l mostConserved.bed
    #	at --target_coverage .3 and --expected_length 45
    #	1159312 most conserved elements
    #	primates only at --target_coverage .75 and --expected_length 1000

#	measuring entropy
#	consEntopy <target coverage> <expected lengths>
#		 ave.cons.mod ave.noncons.mod --NH 9.78
    /cluster/bin/phast/$MACHTYPE/consEntropy .3 45 --NH 9.78 \
	ave.cons.mod ave.noncons.mod

# ( Solving for new omega: 45.000000 21.095519 13.646969 11.146376
#	10.499722 10.437794 10.437185 )
# Transition parameters: gamma=0.300000, omega=45.000000,
#	mu=0.022222, nu=0.009524
# Relative entropy: H=0.415553 bits/site
# Expected min. length: L_min=30.634230 sites
# Expected max. length: L_max=24.942235 sites
# Phylogenetic information threshold: PIT=L_min*H=12.730148 bits
# Recommended expected length: omega=10.437185 sites (for L_min*H=9.780000)

    #	and checking the most conserved areas, on hgwdev:
    #	at --target_coverage .3 and --expected_length 45
    featureBits -noRandom -noHap ponAbe2 mostConserved.bed
    #	221759187 bases of 2723012131 (8.144%) in intersection
    featureBits -noRandom -noHap -enrichment ponAbe2 genscan:cds \
	mostConserved.bed
    #	genscan:cds 1.688%, mostConserved.bed 8.144%, both 0.146%,
    #	cover 8.63%, enrich 1.06x

    #	primates only at --target_coverage .75 and --expected_length 1000
    featureBits -noRandom -noHap ponAbe2 mostConserved.bed
    #	524207695 bases of 2723012131 (19.251%) in intersection
    featureBits -enrichment -noRandom -noHap ponAbe2 genscan:cds \
	mostConserved.bed
    #	genscan:cds 1.688%, mostConserved.bed 19.251%, both 0.314%,
    #	cover 18.61%, enrich 0.97x

    #	Estimates could be made, but more correctly, take the 30-way
    #	.mod file, and re-use it here.
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way
    cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .

    ssh hgwdev
    screen # use screen to manage this long running job
    cd /cluster/data/ponAbe2/bed/multiz8way/msa.split
    #	using one of the sections of chr1 as an example
    time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
/san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss/chr1/chr1.50000209-60000000.ss \
--tree \
"((((((ponAbe2,(hg18,panTro2)),rheMac2),calJac1),mm9),monDom4),ornAna1)" \
    --out-root starting-tree
    #	real    1m35.979s

    #	create a single ss file from a whole chrom maf file
    /cluster/bin/phast/$MACHTYPE/msa_split chr1.maf -i MAF -M chr1.fa \
	-o SS -r chr1 -w 300000000,0 -I 1000 -B 5000 

    #	run a phyloFit on that large .ss file to find a starting-tree
    time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
	chr1.1-229942017.ss --tree \
"((((((ponAbe2,(hg18,panTro2)),rheMac2),calJac1),mm9),monDom4),ornAna1)" \
	--out-root starting-tree
# Reading alignment from chr1.1-229942017.ss ...
# Compacting sufficient statistics ...
# Fitting tree model to chr1.1-229942017.ss using REV ...
# Writing model to starting-tree.mod ...
# Done.

# real    5m30.470s

    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.355
    #	This 0.355 is used in the --gc argument below

    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    ssh pk
    mkdir -p /cluster/data/ponAbe2/bed/multiz8way/cons/run.cons
    cd /cluster/data/ponAbe2/bed/multiz8way/cons/run.cons

    #	there are going to be several different phastCons runs using
    #	this same script.  They trigger off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is one of:
    #	all gliers placentals

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/ponAbe2/bed/multiz8way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/ponAbe2/multiz8way/cons
if (-s $cons/$grp/$grp.non-inf) then
  cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf .
  cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
  cp -p $cons/$grp/$grp.mod .
  cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 4
touch $san/$grp/pp/$c $san/$grp/bed/$c
rm -f $san/$grp/pp/$c/$f.pp
rm -f $san/$grp/bed/$c/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    #	it may be better to have the result check file be the .pp file, since
    #	this .bed file is sometimes empty even though there are .pp results
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # Create parasol batch and run it
    pushd /san/sanvol1/scratch/ponAbe2/multiz8way/cons
    ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
        /cluster/data/ponAbe2/bed/multiz8way/cons/ss.list
    popd

    # run for all species
    cd ..
    mkdir -p all run.cons/all
    cd all
    /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
    --prune-all-but=ponAbe2,hg18,panTro2,rheMac2,calJac1,mm9,monDom4,ornAna1 \
	> all.mod
    cd ../run.cons/all

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    #	it may be better to have the result check file be the .pp file, since
    #	this .bed file is sometimes empty even though there are .pp results
    # Create template file for "all" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 368 of 368 jobs
# CPU time in finished jobs:      11954s     199.23m     3.32h    0.14d  0.000 y
# IO & Wait Time:                  3688s      61.47m     1.02h    0.04d  0.000 y
# Average job time:                  43s       0.71m     0.01h    0.00d
# Longest finished job:              52s       0.87m     0.01h    0.00d
# Submission to last job:           577s       9.62m     0.16h    0.01d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
    time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/ponAbe2/bed/multiz8way/cons/all

    # load into database
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
    time nice -n +19 hgLoadBed ponAbe2 phastConsElements8way mostConserved.bed
    #	Loaded 1096981 elements of size 5

    # Try for 5% overall cov, and 70% CDS cov 
    #	We don't have and gene tracks to compare CDS coverage
    #	--rho .31 --expected-length 45 --target-coverage .3
    featureBits ponAbe2 phastConsElements8way
    #	134611760 bases of 3093572278 (4.351%) in intersection

    # Create merged posterier probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons8wayScores

for D in pp/chr*
do
    C=${D/pp\/}
    out=phastCons8wayScores/${C}.data.gz
    echo "${D} > ${C}.data.gz"
    ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
	gzip > ${out}
done
'_EOF_'
    #	<< happy emacs
    chmod +x gzipAscii.sh
    time nice -n +19 ./gzipAscii.sh
    #	real    38m22.760s
    #	copy the phastCons8wayScores to:
# /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores
    #	for hgdownload downloads

    # Create merged posterier probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
    time nice -n +19 ls phastCons8wayScores/*.data.gz | xargs zcat \
	| wigEncode -noOverlap stdin phastCons8way.wig phastCons8way.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    23m55.813s
    time nice -n +19 cp -p *.wi? /cluster/data/ponAbe2/bed/multiz8way/cons/all
    #	real    2m2.107s

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
    ln -s `pwd`/phastCons8way.wib /gbdb/ponAbe2/multiz8way/phastCons8way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/multiz8way ponAbe2 \
	phastCons8way phastCons8way.wig
    #	real    1m27.666s


    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=ponAbe2 phastCons8way > histogram.data 2>&1
    #	real    5m0.608s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Orangutan PonAbe2 Histogram phastCons8way track"
set xlabel " phastCons8way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ### Create a phastCons data set for primates-only
    # XXX - this was only a test, this data was not used

    # setup primates-only run
    ssh pk
    cd /cluster/data/ponAbe2/bed/multiz8way/cons
    mkdir primates run.cons/primates
    cd primates
    # primates-only: exclude all but these for phastCons tree:
    /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
	--prune-all-but=ponAbe2,hg18,panTro2,rheMac2,calJac1 \
	    > primates.mod
    #	and place the removed ones in the non-inf file so phastCons will
    #	truly ignore them:
    echo "monDom4,ornAna1" > primates.non-inf

    cd ../run.cons/primates
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    #	it may be better to have the result check file be the .pp file, since
    #	this .bed file is sometimes empty even though there are .pp results
    # Create template file for "all" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
    #	Four of these jobs fail to produce any output:
    #	chr5_h2_hap1/chr5_h2_hap1.1-191932.bed
    #	chr6_cox_hap1_random/chr6_cox_hap1_random.1-239941.bed
    #	chr6_qbl_hap2_random/chr6_qbl_hap2_random.1-120011.bed
    #	chrM/chrM.1-16499.bed
    # this chrM missing one is interesting.  It should have something ?
    # I don't know.  They all have data in their pp files.

# Completed: 364 of 368 jobs
# Crashed: 4 jobs
# CPU time in finished jobs:      11969s     199.49m     3.32h    0.14d  0.000 y
# IO & Wait Time:                  3757s      62.61m     1.04h    0.04d  0.000 y
# Average job time:                  43s       0.72m     0.01h    0.00d
# Longest finished job:              56s       0.93m     0.02h    0.00d
# Submission to last job:          2362s      39.37m     0.66h    0.03d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/euarchontoglires
    cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/ponAbe2/bed/multiz8way/cons/primates

    # load into database
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way/cons/primates
    time nice -n +19 hgLoadBed ponAbe2 phastConsElements8wayPrimates \
	mostConserved.bed
    #	Loaded 298598 elements of size 5
    #	real    0m8.768s
    # verify coverage
    featureBits ponAbe2 phastConsElements8wayPrimates
    #	92315637 bases of 3093572278 (2.984%) in intersection

    #	Create the downloads .pp files, from which the phastCons wiggle data
    #	is calculated
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates
    mkdir downloads
    for D in pp/chr*
do
    C=${D/pp\//}
    ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
	> downloads/${C}.primates.pp.data.gz
    echo $D $C done
done

    # Create merged posterier probability file and wiggle track data files
    cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates
    time nice -n +19 ls downloads/chr*.data.gz | xargs zcat \
	| wigEncode -noOverlap stdin \
	    phastCons8wayPrimates.wig phastCons8wayPrimates.wib
# Converted stdin, upper limit 1.00, lower limit 0.00

    ## load table with wiggle data
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way/cons/primates
    cp -p /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates/*.wi? .
    ln -s `pwd`/phastCons8wayEuarch.wib \
	/gbdb/ponAbe2/multiz8way/phastCons8wayPrimates.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/multiz8way ponAbe2 \
	phastCons8wayPrimates phastCons8wayPrimates.wig
    #	real    0m44.161s

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=ponAbe2 phastCons8wayPrimates > histogram.data 2>&1
    #	real    5m15.581s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Orantutan PonAbe2 Histogram phastCons8wayPrimates track"
set xlabel " phastCons8wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
## Create downloads for 8-way business (DONE - 2007-12-27 - Hiram)
    ssh kkstore02
    mkdir -p /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
    cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
    cp -p /cluster/data/ponAbe2/bed/multiz8way/anno/maf/*.maf .
    time nice -n +19 gzip -v *.maf
    #	creating upstream files
    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
    #	creating upstream files from xenoRefGene, bash script:
DB=ponAbe2
GENE=xenoRefGene
NWAY=multiz8way
export DB GENE

for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
                stdin stdout \
                -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
        | gzip -c > upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done
    #
    ssh kkstore02
    cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
    md5sum *.maf.gz > md5sum.txt

    cd ..
    ln -s ../../ponAbe2.8-way.nh ./8way.nh
    ln -s $HOME/kent/src/hg/makeDb/trackDb/orangutan/ponAbe2/multiz8way.html .
    # copy README.txt from mm9 30-way downloads and edit to represent
    #	this directory
    cd ..
    mkdir -p phastCons8way/phastConsScores
    cd phastCons8way/phastConsScores
    cp -p \
/san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/phastCons8wayScores/*.data.gz .
    md5sum *.data.gz > md5sum.txt
    cd ..
    ln -s ../../cons/all/all.mod 8way.mod
    # copy README.txt from mm9 30way downloads and edit to represent
    #	this directory

    #	enable them for hgdownload rsync transfer
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/ponAbe2
    mkdir multiz8way phastCons8way
    cd multiz8way
    ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/8way.nh .
    ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/*.html .
    ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/R*.txt .
    mkdir maf
    cd maf
    ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf/*.maf.gz .
    ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf/md5sum.txt
    cd ../../phastCons8way
    mkdir phastConsScores
    cd phastConsScores
    ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores/*.gz .
    ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores/md5sum.txt .
    cd ..
    ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/8way.mod .
    ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/README.txt .
    
    
###########################################################################
# BLASTZ SWAP Human Hg18 (DONE - 2007-10-05 - Hiram)
    ssh kkstore02
    screen # use a screen to control this job
    #	the original blastz
    cd /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
    cat fb.hg18.chainPonAbe2Link.txt
    #	2676696124 bases of 2881515245 (92.892%) in intersection

    #	And the swap
    mkdir /cluster/data/ponAbe2/bed/blastz.hg18.swap
    cd /cluster/data/ponAbe2/bed/blastz.hg18.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-swap -bigClusterHub=pk > swap.log 2>&1 &
    #	real    123m9.197s
    cat fb.ponAbe2.chainHg18Link.txt
    #	2824501297 bases of 3093572278 (91.302%) in intersection

    #	create syntenicNets for 7-way conservation maf use
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-continue=syntenicNet -syntenicNet \
	-swap -bigClusterHub=pk > synNet.log 2>&1 &
    #	real    91m18.097s

###########################################################################
# BLASTZ SWAP Chimp PanTro2 (DONE - 2007-10-16 - Hiram)
    ssh kkstore04
    screen # use screen to control this job
    #	the original blastz
    cd /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13
    cat fb.panTro2.chainPonAbe2Link.txt
    #	2648603020 bases of 2909485072 (91.033%) in intersection

    #	and, for the swap
    mkdir /cluster/data/ponAbe2/bed/blastz.panTro2.swap
    cd /cluster/data/ponAbe2/bed/blastz.panTro2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-swap -bigClusterHub=kk > swap.log 2>&1 &
    #	real    160m51.639s
    cat fb.ponAbe2.chainPanTro2Link.txt
    #	2766615040 bases of 3093572278 (89.431%) in intersection
    #	real    2597m24.771s

    #	create syntenicNets for 7-way conservation maf use
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \
	-continue=syntenicNet -syntenicNet \
	-chainMinScore=3000 -chainLinearGap=medium \
	-swap -bigClusterHub=kk > synNet.log 2>&1 &
    #	real    131m30.045s

###########################################################################
# BLASTZ SWAP Rhesus rheMac2 (DONE - 2007-10-18 - Hiram)
    ssh kkstore01
    screen # use screen to control this job
    cd /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
    cat fb.rheMac2.chainPonAbe2Link.txt
    #	2333264093 bases of 2646704109 (88.157%) in intersection

    #	and, for the swap
    mkdir /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
    cd /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
	-swap -chainMinScore=3000 -chainLinearGap=medium \
	-bigClusterHub=kk > swap.log 2>&1 &
    #	real    289m9.479s
    cat fb.ponAbe2.chainRheMac2Link.txt
    #	2556010573 bases of 3093572278 (82.623%) in intersection

    #	create syntenicNets for 7-way conservation maf use
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
	-continue=syntenicNet -syntenicNet \
	-swap -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -bigClusterHub=kk > synNet.log 2>&1 &
    #	real    62m10.816s

###########################################################################
# BLASTZ SWAP Opossum monDom4 (DONE - 2007-11-26 - Hiram)
    ssh kkstore04
    screen # use screen to control this job
    #	the original blastz
    cd /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22
    cat fb.monDom4.chainPonAbe2Link.txt
    #	395887393 bases of 3501643220 (11.306%) in intersection

    #	and for the swap
    mkdir /cluster/data/ponAbe2/bed/blastz.monDom4.swap
    cd /cluster/data/ponAbe2/bed/blastz.monDom4.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
	/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22/DEF \
	-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> swap.log 2>&1 &
    #	real    223m38.325s
    cat fb.ponAbe2.chainMonDom4Link.txt
    #	406409435 bases of 3093572278 (13.137%) in intersection

###########################################################################
## PUSHQ entry creation (DONE - 2007-12-12 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/ponAbe2/pushQ
    cd /cluster/data/ponAbe2/pushQ
    #	had to touch
    #	goldenPath/ponAbe2/liftOver/ponAbe2ToPonAbe1.over.chain.gz
    #	to get it to pass that check.  (there isn't any ponAbe1 release)
    /cluster/bin/scripts/makePushQSql.pl -noGenbank ponAbe2 \
	> ponAbe2.sql 2> stderr.out
    ## check the stderr.out for anything that needs to be fixed
    ## copy ponAbe2.sql to hgwbeta:/tmp
    scp ponAbe2.sql hgwbeta:/tmp
    ## then on hgwbeta
    ssh hgwbeta
    cd /tmp
    ### XXX remember to set the pushQ id number in the
    #	last statement in this SQL file
    hgsql qapushq < ponAbe2.sql

###########################################################################
# Blastz Chicken galGal3 (DONE - 2007-12-13 - Hiram)
    ssh kkstore03
    screen # use screen to control this job
    mkdir /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13
    cd /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13

    cat << '_EOF_' > DEF
# Orangutan vs. chicken
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Orangutan PonAbe2
SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/hg/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	testing some changes to the script
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	`pwd`/DEF -chainMinScore=5000 \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    407m0.026s
    #	some of the jobs failed due to out of memory on kk nodes
    #	did a para recover and ran the last set of jobs on pk
    #	then, continuing and testing the new memk kluster:
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-continue=cat `pwd`/DEF -chainMinScore=5000 -smallClusterHub=memk \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
    #	failed when it came to the kolossus run because kkr1u00 was out
    #	which provides some filesystems to kolossus.  Got that fixed
    #	and finished the netChains.csh script manually. 
    #	real    6m2.271s
    #	Then, continuing:
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-continue=load `pwd`/DEF -chainMinScore=5000 -smallClusterHub=memk \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > load.log 2>&1 &
    #	real    4m21.364s
    cat fb.ponAbe2.chainGalGal3Link.txt
    #	134448942 bases of 3093572278 (4.346%) in intersection

    mkdir /cluster/data/galGal3/bed/blastz.ponAbe2.swap
    cd /cluster/data/galGal3/bed/blastz.ponAbe2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13/DEF \
	-chainMinScore=5000 -chainLinearGap=loose -smallClusterHub=memk \
	-swap -bigClusterHub=kk > swap.log 2>&1 &
    cat fb.galGal3.chainPonAbe2Link.txt
    #	115578761 bases of 1042591351 (11.086%) in intersection

###########################################################################
#  Add quality score track (DONE - 2008-03-10 - Hiram)
    ssh kkstore02
    cd /cluster/data/ponAbe2/wustl
    #	use mScore=98 since the "Finished" contigs are not in the qual file
    time nice -n +19 zcat contigs.fa.qual.gz \
	| qaToQac stdin stdout \
	| qacAgpLift -mScore=98 ../ponAbe2.agp stdin ponAbe2.qual.qac
    #	3m35.075s
    mkdir /cluster/data/ponAbe2/bed/qual
    cd /cluster/data/ponAbe2/bed/qual
    time nice -n +19 qacToWig -fixed \
	/cluster/data/ponAbe2/wustl/ponAbe2.qual.qac stdout \
	| wigEncode stdin qual.wig qual.wib
    #	real    21m33.899s

    ssh hgwdev
    cd /cluster/data/ponAbe2/bed/qual
    rm -f /gbdb/ponAbe2/wib/qual.wib
    ln -s /cluster/data/ponAbe2/bed/qual/qual.wib /gbdb/ponAbe2/wib/
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/wib \
	ponAbe2 quality qual.wig
    #	real    1m30.877s

#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-03-11 markd)

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    cd /cluster/data/ponAbe2/bed/nscan/
    wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/ponAbe2.gtf
    wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/ponAbe2.prot.fa
    wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/readme.html
    bzip2 ponAbe2.*
    chmod a-w *

    # load track
    gtfToGenePred -genePredExt ponAbe2.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt ponAbe2 nscanGene stdin
    hgPepPred ponAbe2 generic nscanPep  ponAbe2.prot.fa.bz2
    rm *.tab

    # update trackDb; need a ponAbe2-specific page to describe informants
    orangutan/ponAbe2/nscanGene.html   (copy from readme.html)
    orangutan/ponAbe2/trackDb.ra
    # set search regex to
        termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]

############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################
# Create quality download files (DONE - 2008-10-14 - Hiram)
    cd /hive/data/genomes/ponAbe2/bed/qual
    qacToQa /hive/data/genomes/ponAbe2/wustl/ponAbe2.qual.qac ponAbe2.qa.fa
    cat << '_EOF_' > splitByName.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $chrom = "";
while (my $line = <>) {
    if ($line =~ m/^>/) {
	close FH if (length($chrom) > 0);
	chomp $line;
	$chrom = $line;
	$chrom =~ s/>//;
	$chrom =~ s/\s+//g;
	open (FH, "| gzip -c > qa/$chrom.qa.fa.gz") or
		die "can not write to qa/$chrom.qa.fa.gz";
	print FH ">$chrom\n";
    } else {
	print FH "$line";
    }
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./splitByName.pl
    mkdir qa
    ./splitByName.pl < ponAbe2.qa.fa
    cd qa
    tar cvzf ../ponAbe2.quality.fa.gz ./*.fa.gz
    #	check byte counts with these files to make sure they are OK
    for F in *.fa.gz; do zcat $F; done | wc
    #	172337823 3446754951 10512603041
    cd ..
    wc ponAbe2.qa.fa
    #	172337823  3446754951 10512603041 ponAbe2.qa.fa
    # identical counts, OK.
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11
    cd /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11

    #	same kind of parameters as used in human<->marmoset
    cat << '_EOF_' > DEF
# Orangutan vs. marmoset
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Orangutan PonAbe2
SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/scratch/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Marmoset (calJac3)
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_LIMIT=50
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    329m55.081s
    cat fb.ponAbe2.chainCalJac3Link.txt 
    #	2086557592 bases of 3093572278 (67.448%) in intersection

    mkdir /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap
    cd /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    146m12.301s
    cat fb.calJac3.chainHg19Link.txt 
    #	1978857628 bases of 2752505800 (71.893%) in intersection

############################################################################
# LASTZ Gorilla GorGor3 (DONE - 2011-10-21,23 - Hiram)
    mkdir /hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21
    cd /hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21

    cat << '_EOF_' > DEF
# Orangutan vs gorilla
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Orangutan PonAbe2
SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/scratch/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Frog gorGor3
SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit
SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	1441m58s
    cat fb.ponAbe2.chainGorGor3Link.txt  
    #	2584628377 bases of 3093572278 (83.548%) in intersection

    cd /hive/data/genomes/ponAbe2/bed
    ln -s lastzGorGor3.2011-10-21 lastz.gorGor3

    #	running the swap
    mkdir /hive/data/genomes/gorGor3/bed/blastz.ponAbe2.swap
    cd /hive/data/genomes/gorGor3/bed/blastz.ponAbe2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21/DEF \
	-swap -syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    86m19.901s
    cat fb.gorGor3.chainPonAbe2Link.txt 
    #	2437562544 bases of 2822760080 (86.354%) in intersection

############################################################################
# LASTZ Mouse Mm10 (DONE - 2012-03-08 - Hiram)
    mkdir /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08
    cd /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08

    cat << '_EOF_' > DEF
# chimp vs mouse
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Orangutan PonAbe2
SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/scratch/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Mouse Mm10
SEQ2_DIR=/scratch/data/mm10/mm10.2bit
SEQ2_LEN=/scratch/data/mm10/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen -S ponAbe2Mm10
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    636m35.769s
    cat fb.ponAbe2.chainMm10Link.txt
    #	946932454 bases of 3093572278 (30.610%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/ponAbe2/bed
    ln -s lastzMm10.2012-03-08 lastz.mm10

    mkdir /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap
    cd /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    72m38.550s
    cat fb.mm10.chainPonAbe2Link.txt 
    #	915093866 bases of 2652783500 (34.496%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/mm10/bed
    ln -s blastz.ponAbe2.swap lastz.ponAbe2

##########################################################################
# LASTZ  Rhesus rheMac3 Swap (DONE - 2012-04-26 - Chin)
    cd /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18
    cat fb.rheMac3.chainPonAbe2Link.txt
    #   2354152230 bases of 2639145830 (89.201%) in intersection

    #   and, for the swap
    mkdir /hive/data/genomes/ponAbe2/bed/blastz.rheMac3.swap
    cd /hive/data/genomes/ponAbe2/bed/blastz.rheMac3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18/DEF \
       -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -swap -chainMinScore=3000 -chainLinearGap=medium  > swap.log 2>&1 &
    #   real    289m9.479s
    cat fb.ponAbe2.chainRheMac3Link.txt
    #   2565157739 bases of 3093572278 (82.919%) in intersection
    cd /hive/data/genomes/ponAbe2/bed
    ln -s blastz.rheMac3.swap lastz.rheMac3

##############################################################################