# for emacs: -*- mode: sh; -*-


# This file describes browser build for the Sea Lamprey
# genome, Petromyzon marinus, March 2007, v.3.0 from WUSTL
#
#       "$Id: petMar1.txt,v 1.23 2009/11/25 21:48:41 hiram Exp $"
#

#	http://www.ncbi.nlm.nih.gov/genome/287
#	http://www.ncbi.nlm.nih.gov/bioproject/50489 - Mich. State
#	http://www.ncbi.nlm.nih.gov/bioproject/12880 - WashU - SRA
#	http://www.ncbi.nlm.nih.gov/sra/SRX028154 - SRA records
#	http://www.ncbi.nlm.nih.gov/bioproject/11809 - NCBI 2004 chrMt

##########################################################################
### Fetch sequence       (DONE - 2008-01-14 - Hiram)
    #	determine estimated disk usage, from example genome like this one,
    #	braFlo1 which used about 3.5 Gb, do not need much space, store04
    #	has 42 Gb, plenty of space on store9
    ssh kkstore04
    mkdir /cluster/store9/petMar1
    ln -s /cluster/store9/petMar1 /cluster/data/petMar1
    mkdir /cluster/data/petMar1/downloads
    cd /cluster/data/petMar1/downloads

    wget --timestamping \
    'ftp://genome.wustl.edu/pub/organism/Other_Vertebrates/Petromyzon_marinus/assembly/Petromyzon_marinus-3.0/output/*'

##########################################################################
#  Fixup the agp file to make a chrUn agp file, add a 1000 base gap
#	between the supercontigs

##########################################################################
#  Run the makeGenomeDb.pl script (DONE - 2008-01-27 - Hiram)
    ssh kkstore04
    cd /cluster/data/petMar1/downloads
    #	eliminate zero-length fragments from the AGP file
    zcat supercontigs.agp.gz | awk '
{
size=$3-$2
if (size >= 0) print
}
' | gzip -c > supers.agp.gz

    cd /cluster/data/petMar1
    cat << '_EOF_' > petMar1.config.ra
# Config parameters for makeGenomeDb.pl:
db petMar1
clade vertebrate
genomeCladePriority 130
scientificName Petromyzon marinus
commonName Lamprey
assemblyDate Mar. 2007
assemblyLabel WUSTL v.3.0 (March 2007)
orderKey 480
mitoAcc 5835065
fastaFiles /cluster/data/petMar1/downloads/supercontigs.fa.gz
agpFiles /cluster/data/petMar1/downloads/supers.agp.gz
# qualFiles none
dbDbSpeciesDir lamprey
'_EOF_'

    #	can stop at db since trackDb and dbDb were created in first
    #	attempt
    makeGenomeDb.pl -stop=db petMar1.config.ra > makeGenome.log 2>&1 &
    twoBitToFa petMar1.unmasked.2bit stdout | faSize stdin
    #	1135497967 bases (303801529 N's 831696438 real 831696438
    #	upper 0 lower) in 2 sequences in 1 files

    #	chrM has a gi:5835065 fragment name, I would rather have NC_001626
    hgsql -e 'update gold set frag="NC_001626" where chrom="chrM";' petMar1

##########################################################################
#  Pick up an image from the EPA, usage rights are free
#	http://www.epa.gov/glnpo/image/restrictions.htm
    mkdir /cluster/data/petMar1/photo
    cd /cluster/data/petMar1/photo
    wget --timestamping 'http://www.epa.gov/glnpo/image/vbig/229.jpg' \
	-O petromyzon_marinus.epa.jpg

##########################################################################
#  Running WindowMasker (DONE - 2008-01-26 - Hiram)
    ssh kolossus
    mkdir /cluster/data/petMar1/bed/WindowMasker
    cd /cluster/data/petMar1/bed/WindowMasker
    /cluster/bin/scripts/doWindowMasker.pl \
	-buildDir /cluster/data/petMar1/bed/WindowMasker \
	-workhorse=kolossus -verbose=2 petMar1 > wm.log 2>&1 &
    twoBitToFa petMar1.wmsk.sdust.2bit stdout | faSize stdin
    #	1027258967 bases (195562529 N's 831696438 real 494032631 upper
    #	337663807 lower) in 108241 sequences in 1 files

    #	%32.87 masked total, %40.60 masked real

    ssh hgwdev
    cd /cluster/data/petMar1/bed/WindowMasker
    hgLoadBed petMar1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 4938211 elements of size 3
    featureBits petMar1 windowmaskerSdust
    #	533224658 bases of 831696438 (64.113%) in intersection

    #	WM has a bug where it masks the gaps too, don't like that,
    #	fetch the WM track without the gaps, can do this in the
    #	table browser with an intersection of WMask AND ! gap
    #	or via featureBits:
    featureBits petMar1 -not gap -bed=notGap.bed
    #	hint: also works with the gold table instead of notGap
    time nice -n +19 featureBits petMar1 windowmaskerSdust notGap.bed \
	-bed=stdout | gzip -c > cleanWMask.bed.gz
    #	337663807 bases of 831696438 (40.599%) in intersection
    #	real    182m45.469s
    hgLoadBed petMar1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 4919722 elements of size 4
    featureBits petMar1 windowmaskerSdust > fb.petMar1.cleanWMask.txt 2>&1
    #	337663807 bases of 831696438 (40.599%) in intersection

##########################################################################
#	Running TRF simple repeats (DONE - 2008-01-26 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/petMar1/bed/simpleRepeat
    cd /cluster/data/petMar1/bed/simpleRepeat
    doSimpleRepeat.pl petMar1 -smallClusterHub=memk \
	-workhorse=kolossus \
	-buildDir=/cluster/data/petMar1/bed/simpleRepeat > do.log 2>&1 &
    #	ran into a bug with the nice difficulty on the featureBits
    featureBits petMar1 simpleRepeat > fb.petMar1.simpleRepeat.txt 2>&1
    #	75844864 bases of 831696438 (9.119%) in intersection
    doSimpleRepeat.pl petMar1 -smallClusterHub=memk \
	-continue=cleanup -workhorse=kolossus \
	-buildDir=/cluster/data/petMar1/bed/simpleRepeat > cleanup.log 2>&1 &

##########################################################################
# Repeat Masker (DONE - 2008-01-26 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/petMar1/bed/RepeatMasker
    cd /cluster/data/petMar1/bed/RepeatMasker
    doRepeatMasker.pl petMar1 -bigClusterHub=kk -workhorse=kolossus \
	-buildDir=/cluster/data/petMar1/bed/RepeatMasker \
	-verbose=2 > do.log 2>&1
##########################################################################
# Add WindowMasker and trfMask masking to the 2bit sequenct
#	(DONE - 2008-01-26 - Hiram)
    ssh kkstore04
    cd /cluster/data/petMar1
    gzip bed/simpleRepeat/trfMask.bed
    zcat bed/WindowMasker/cleanWMask.bed.gz bed/simpleRepeat/trfMask.bed.gz \
	| twoBitMask -type=.bed petMar1.unmasked.2bit stdin petMar1.2bit
    #	ignore the warning about bed file > 13 fields, that is OK
    twoBitToFa petMar1.2bit stdout | faSize stdin
    #	1027258967 bases (195562529 N's 831696438 real 493479824 upper
    #	338216614 lower) in 108241 sequences in 1 files
    #	%32.92 masked total, %40.67 masked real

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-01-15 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("petMar1", "blat15", "17788", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("petMar1", "blat15", "17789", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
## Reset default position to
#	Where RHO1 protein is found via blat:
#	GEMSLWSLVV LAIERYIVIC KPMGNFRFGS THAYMGVAFT WFMALSCAAP PLVGWSR
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="Contig11034:8429-10197"
	where name="petMar1";' hgcentraltest

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2008-01-29 - Hiram)
    # Use -repMatch=200 (based on size -- for human we use 1024, and
    # medaka size is ~20% of human judging by gapless oryLat1 vs. hg18
    # genome sizes from featureBits.
    ssh kkstore04
    cd /cluster/data/petMar1
    blat petMar1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=petMar1.11.ooc -repMatch=200
    #	Wrote 49155 overused 11-mers to petMar1.11.ooc

##########################################################################
# GENBANK AUTO UPDATE (DONE - 2008-01-15 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add petMar1 just after gasAcu1

# petMar1 (Branchiostoma floridae - Lancelet)
petMar1.serverGenome = /cluster/data/petMar1/petMar1.2bit
petMar1.clusterGenome = /cluster/bluearc/scratch/data/petMar1/petMar1.2bit
petMar1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
petMar1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
petMar1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
petMar1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
petMar1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
petMar1.ooc = /cluster/data/petMar1/petMar1.11.ooc
petMar1.lift = /cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft
petMar1.refseq.mrna.native.load = yes
petMar1.refseq.mrna.xeno.load = yes
petMar1.genbank.mrna.xeno.load = yes
petMar1.genbank.est.native.load = yes
petMar1.downloadDir = petMar1
petMar1.perChromTables = no

    cvs ci -m "Added petMar1." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    # Edit src/lib/gbGenome.c to add new species.
    cvs ci -m "Added Petromyzon marinus (Sea Lamprey)." src/lib/gbGenome.c
    make install-server

    ssh genbank
    screen # control this with a screen so you can get back to it
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial petMar1 &
    #	logFile: var/build/logs/2008.01.29-11:27:10.petMar1.initalign.log
    #	real    332m36.025s
    #	There was some kind of problem with this alignment, although the
    #	kluster run actually finished:
# Completed: 366520 of 366520 jobs
# CPU time in finished jobs:    6854057s  114234.28m  1903.90h   79.33d  0.217 y
# IO & Wait Time:               2300432s   38340.53m   639.01h   26.63d  0.073 y
# Average job time:                  25s       0.42m     0.01h    0.00d
# Longest finished job:            1292s      21.53m     0.36h    0.01d
# Submission to last job:         21121s     352.02m     5.87h    0.24d
    #	So, continuing:
    time nice -n +19 bin/gbAlignStep -continue=finish -initial petMar1 &
    #	logFile: var/build/logs/2008.01.30-09:18:56.petMar1.initalign.log
    #	real    70m20.669s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad petMar1
    #	logFile: var/dbload/hgwdev/logs/2008.01.15-16:21:10.dbload.log
    #	real    19m34.853s

    # enable daily alignment and update of hgwdev (2008-01-30 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add petMar1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added petMar1 - Petromyzon marinus (lamprey)" \
	etc/align.dbs etc/hgwdev.dbs
    make etc-update

#########################################################################
# BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-15 - Hiram)
#	Try 1 failed, run again with contigs
    ssh kkstore04
    screen # use screen to control this job
    mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
    cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15

    cat << '_EOF_' > DEF
# Medaka vs. Lamprey

# using the "close" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50

# TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
#       chrUn in Scaffolds for this alignment run
SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LIMIT=30
SEQ1_LAP=10000

# QUERY: Lamprey petMar1
SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=memk > do.log 2>&1 &
    #	abandon this run, ended up using the swap from oryLat1 in:
    #	/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
    #	real    1053m9.027s
    #	netChains failed, during chainStitchId:
    #	q end mismatch 1073742062 vs 1111312799 line 510 of stdin

############################################################################
#  Create contigs-only masked 2bit file for non-bridged gap free blastz runs
#	 (DONE - 2008-01-15 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/petMar1/contigSequence
    cd /cluster/data/petMar1/contigSequence
    cp -p /cluster/data/priPac1/jkStuff/lft2BitToFa.pl ../jkStuff
    ../jkStuff/lft2BitToFa.pl ../petMar1.2bit \
	../jkStuff/petMar1.nonBridged.lft > supercontigs.fa
    #	this took quite a long time, about 6 hours or so
    #	this doesn't have chrM yet
    faSize supercontigs.fa ../M/chrM.fa
    #	1027258967 bases (195562529 N's 831696438 real 493405059 upper
    #	338291379 lower) in 108241 sequences in 2 files
    #	%32.93 masked total, %40.67 masked real
    cat supercontigs.fa ../M/chrM.fa | faToTwoBit stdin petMar1.supers.2bit
    twoBitToFa petMar1.supers.2bit stdout | faSize stdin
    #	1027258967 bases (195562529 N's 831696438 real 493405059 upper
    #	338291379 lower) in 108241 sequences in 1 files
    #	%32.93 masked total, %40.67 masked real
    twoBitInfo petMar1.supers.2bit stdout | sort -k2nr > super.sizes
    cp -p petMar1.supers.2bit /cluster/bluearc/scratch/data/petMar1
    cd ..
    ln -s contigSequence/super.sizes .

#########################################################################
# BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-16 - 2008-01-30 - Hiram)
#	Try 2 with contigs for Lamprey
    ssh kkstore04
    screen # use screen to control this job
    mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
    cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15

    cat << '_EOF_' > DEF
# Medaka vs. Lamprey

# using the "close" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50

# TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
#       chrUn in Scaffolds for this alignment run
SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LIMIT=30
SEQ1_LAP=10000

# QUERY: Lamprey petMar1
SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/scratch/data/petMar1/petMar1.supers.2bit
SEQ2_CTGLEN=/cluster/data/petMar1/super.sizes
SEQ2_LIFT=/cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=150
SEQ2_LAP=0

BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=kk > do.log 2>&1 &
    #	real    218m14.787s
# Completed: 69774 of 70854 jobs
# Crashed: 1080 jobs
# CPU time in finished jobs:    3950133s   65835.55m  1097.26h   45.72d  0.125 y
# IO & Wait Time:               1218716s   20311.93m   338.53h   14.11d  0.039 y
# Average job time:                  74s       1.23m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1629s      27.15m     0.45h    0.02d
# Submission to last job:         12951s     215.85m     3.60h    0.15d
    #	had some parasol bookeeping problems, it actually finished
    #	the kluster run but got confused.  Checked manually with
    #	para recover -> nothing in new job list, and check files in
    #	psl result directory finding the correct number of files for the
    #	the job count.  Make the run.time file and continuing:
    time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=cat -chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=kk > cat.log 2>&1 &
    #	real    7m47.963s
    #	something failed
    #	had the specification to the lft file incorrect, failed during
    #	chain run.  Fix that, finish the run, now continuing:
    time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=chainMerge -chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=kk > chainMerge.log 2>&1 &
    #	had the same failure as the first time test of this alignment.
    #	something is wrong with netChainSubset, needs to be debugged.
    #	ignore the step to create the "over.chain" file, finish the net
    #	step manually, then continuing:
    time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=load -chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=kk > load.log 2>&1 &
    #	real    3m53.686s
    cat fb.oryLat1.chainPetMar1Link.txt
    #	24990733 bases of 700386597 (3.568%) in intersection

    #	And, for the swap:
    mkdir /cluster/data/petMar1/bed/blastz.oryLat1.swap
    cd /cluster/data/petMar1/bed/blastz.oryLat1.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=kk > swap.log 2>&1 &
    #	real    1m3.572s
    #	same problem in netChainSubset trying to make the "over.chain" file.
    #	skip that, finish the chains manually:
    #	real    14m35.340s
    #	then continuing:
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \
	-continue=load -chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=kk > load.log 2>&1 &
    cat fb.petMar1.chainOryLat1Link.txt
    #	23358156 bases of 831696438 (2.808%) in intersection

    #	create reciprocal best chains/nets for 9-way maf alignments
    ssh hgwdev
    cd /cluster/data/petMar1/bed/blastz.oryLat1.swap
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 oryLat1 \
	> rbest.log 2>&1 &
    #	real    22m20.900s

#############################################################################
# BLASTZ SWAP Lamprey - Lanclet (DONE - 2008-04-12 - Hiram)
    ssh kkstore05
    screen	#	control this job with a screen
    #	the original
    cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
    cat fb.braFlo1.chainPetMar1Link.txt
    #	27418217 bases of 923355587 (2.969%) in intersection

    #	and the swap
    mkdir /cluster/data/petMar1/bed/blastz.braFlo1.swap
    cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
    #	real    194m53.514s
    cat fb.petMar1.chainBraFlo1Link.txt
    #	27373264 bases of 831696438 (3.291%) in intersection

    #	create reciprocal best chains/nets for 9-way maf alignments
    ssh hgwdev
    cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 braFlo1 \
	> rbest.log 2>&1 &
    #	real    110m25.613s

#############################################################################
# BLASTZ SWAP Chicken GalGal3 (DONE - 2008-04-15 - Hiram)
    #	the original
    ssh kkstore03
    cd /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
    cat fb.galGal3.chainPetMar1Link.txt
    #	20134896 bases of 1042591351 (1.931%) in intersection

    #	and the swap
    mkdir /cluster/data/petMar1/bed/blastz.galGal3.swap
    cd /cluster/data/petMar1/bed/blastz.galGal3.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=memk -verbose=2 > swap.log 2>&1 &
    #	real    33m41.587s
    cat fb.petMar1.chainGalGal3Link.txt
    #	22308118 bases of 831696438 (2.682%) in intersection

    #	create reciprocal best chains/nets for 9-way maf alignments
    ssh hgwdev
    cd /cluster/data/petMar1/bed/blastz.galGal3.swap
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 galGal3 \
	> rbest.log 2>&1 &
    #	real    13m33.139s

#############################################################################
# BLASTZ SWAP Human hg18 (DONE - 2008-01-30 - Hiram)
    ssh kkstore02
    screen # use screen to control this job
    # the original
    cd /cluster/data/hg18/bed/blastzPetMar1.2008-01-29
    cat fb.hg18.chainPetMar1Link.txt
    #	36042598 bases of 2881515245 (1.251%) in intersection

    #	That is OK, now for the swap:
    mkdir /cluster/data/petMar1/bed/blastz.hg18.swap
    cd /cluster/data/petMar1/bed/blastz.hg18.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/hg18/bed/blastzPetMar1.2008-01-29/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk > swap.log 2>&1 &
    #	real    60m1.928s
    cat  fb.petMar1.chainHg18Link.txt
    #	26751073 bases of 831696438 (3.216%) in intersection

    #	create reciprocal best chains/nets for 9-way maf alignments
    ssh hgwdev
    cd /cluster/data/petMar1/bed/blastz.hg18.swap
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 hg18 \
	> rbest.log 2>&1 &
    #	real    22m34.274s

#############################################################################
# BLASTZ SWAP Mouse mm9 (DONE - 2008-04-15 - Hiram)
    ssh kkstore06
    screen # use screen to control this job
    #	the original
    cd /cluster/data/mm9/bed/blastzPetMar1.2008-04-14
    cat fb.mm9.chainPetMar1Link.txt
    #	29113438 bases of 2620346127 (1.111%) in intersection

    #	That is OK, now for the swap:
    mkdir /cluster/data/petMar1/bed/blastz.mm9.swap
    cd /cluster/data/petMar1/bed/blastz.mm9.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust -bigClusterHub=pk > swap.log 2>&1 &
    #	real    33m29.076s
    cat  fb.petMar1.chainMm9Link.txt
    #	26052507 bases of 831696438 (3.132%) in intersection

    #	create reciprocal best chains/nets for 9-way maf alignments
    ssh hgwdev
    cd /cluster/data/petMar1/bed/blastz.mm9.swap
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 mm9 \
	> rbest.log 2>&1 &
    #	real    21m9.320s

#############################################################################
## 6-Way Multiz (DONE - 2008-04-17 - Hiram)
##
    ssh hgwdev
    mkdir /cluster/data/petMar1/bed/multiz6way
    cd /cluster/data/petMar1/bed/multiz6way
    #	take the 30-way tree from mm9 and eliminate genomes not in
    #	this alignment.  Manually add in petMar1 and braFlo1
    #	arbitrarily somewhere outside of the fishes
    #	rearrange to get petMar1 on the top of the graph
    #	paste this tree into the on-line phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to create the image for the tree diagram

    #	select the 6 organisms from the 30-way recently done on mouse mm9
    /cluster/bin/phast/tree_doctor \
	--prune-all-but Human_hg18,Mouse_mm9,Chicken_galGal3,Medaka_oryLat1 \
	/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
	> 6-way.fullNames.nh

    #	looks something like this:
    (((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
	Chicken_galGal3:0.488824):0.325954,Medaka_oryLat1:1.077873);

    #	Adding Lamprey and Lanclet:
((Lamprey_petMar1:1.6,
  (((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
   Chicken_galGal3:0.488824):0.325954,Medaka_oryLat1:1.077873):0.4):0.4,
    Lanclet_braFlo1:2.0);

((Lamprey_petMar1:1.6,
  (Medaka_oryLat1:1.077873,((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
   Chicken_galGal3:0.488824):0.325954):0.4):0.4,
    Lanclet_braFlo1:2.0);

((Lamprey_petMar1:1.6,
  (Medaka_oryLat1:1.077873,(Chicken_galGal3:0.488824,
    (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.325954):0.4):0.4,
    Lanclet_braFlo1:2.0);

    #	rearrange to get Marmoset at the top:
    # this leaves us with:
    cat << '_EOF_' > petMar1.6-way.nh
((Lamprey_petMar1:1.6,
  (Medaka_oryLat1:1.077873,(Chicken_galGal3:0.488824,
    (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.325954):0.4):0.4,
    Lanclet_braFlo1:2.0);
'_EOF_'
    #	<< happy emacs

    #	create a species list from that file:
    sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' petMar1.6-way.nh \
        | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
        | sed -e "s/.*_//; s/:.*//" | sort > species.list
    #	verify that has 6 db names in it
    # create a stripped down nh file for use in autoMZ run
    echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:[012].[0-9]*//g; s/[,;]/ /g' petMar1.6-way.nh \
	| sed -e "s/  / /g"` > tree.6.nh
    #	that looks like, as a single line:
# ((petMar1 (oryLat1 (galGal3 (mm9 hg18)))) braFlo1)

    # verify all blastz's exists
    cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/petMar1/bed/multiz6way
foreach db (`grep -v petMar1 species.list`)
    set bdir = /cluster/data/petMar1/bed/blastz.$db
    if (-e $bdir/mafRBestNet/petMar1.$db.rbest.maf.gz) then
	echo "$db mafRBestNet"
    else if (-e $bdir/mafSynNet/petMar1.$db.net.maf.gz) then
	echo "$db mafSynNet"
    else if (-e $bdir/mafNet/petMar1.$db.net.maf.gz) then
	echo "$db mafNet"
    else
	echo "$db mafs not found"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./listMafs.csh
    #	see what it says, the "mafs not found" should only show up on petMar1
    ./listMafs.csh
# braFlo1 mafRBestNet
# galGal3 mafRBestNet
# hg18 mafRBestNet
# mm9 mafRBestNet
# oryLat1 mafRBestNet

    /cluster/bin/phast/all_dists petMar1.6-way.nh > 6way.distances.txt
    grep -i petmar 6way.distances.txt | sort -k3,3n
Lamprey_petMar1 Chicken_galGal3 2.814778
Lamprey_petMar1 Human_hg18      2.923612
Lamprey_petMar1 Medaka_oryLat1  3.077873
Lamprey_petMar1 Mouse_mm9       3.122529
Lamprey_petMar1 Lanclet_braFlo1 4.000000

    #	use the calculated
    #	distances in the table below to order the organisms and check
    #	the button order on the browser.  Zebrafish ends up before
    #	tetraodon and fugu on the browser despite its distance.
    #	And if you can fill in the table below entirely, you have
    #	succeeded in finishing all the alignments required.
    #
#                         featureBits chainLink measures
#                                       chainCalJac1Link   chain   linearGap
#                     distance    on PetMar1    on other   minScore
#  1  Chicken_galGal3 2.814778     (% 2.682)    (% 1.931)   5000      loose
#  2  Human_hg18      2.923612     (% 3.216)    (% 1.251)   5000      loose
#  3  Medaka_oryLat1  3.077873     (% 2.808)    (% 3.568)   3000      medium
#  4  Mouse_mm9       3.122529     (% 3.132)    (% 1.111)   5000      loose
#  5  Lanclet_braFlo1 4.000000     (% 3.291)    (% 2.969)   5000      loose

    # copy net mafs to cluster-friendly storage, splitting chroms
    mkdir mafLinks
    cd mafLinks
    ln -s ../../blastz.*.swap/mafRBestNet/*.rbest.maf.gz .

    #	need to split these things up by Contig number for efficient kluster run
    ssh kkstore04
    mkdir -p /san/sanvol1/scratch/petMar1/multiz6way/contigMaf
    cd /scratch/tmp
    #	the 16201 is from petMar/chrom.sizes
    echo "chrM 0 16201" > chrM.bed

    for D in `grep -v petMar1 /cluster/data/petMar1/bed/multiz6way/species.list`
do
    echo ${D}
    zcat \
/cluster/data/petMar1/bed/multiz6way/mafLinks/petMar1.${D}.rbest.maf.gz \
	> ${D}.maf
    mkdir /scratch/tmp/${D}
    cd /scratch/tmp/${D}
    mafSplit -verbose=2 /dev/null -byTarget -useSequenceName Contig \
	../${D}.maf -outDirDepth=2
    mafsInRegion ../chrM.bed 0/0/chrM.maf ../${D}.maf
    rsync -a --progress ./ \
	/san/sanvol1/scratch/petMar1/multiz6way/contigMaf/${D}
    cd /scratch/tmp
    rm -fr ${D} ${D}.maf
done
    #	create a run-time list of contigs to operate on, not all contigs
    #	exist in all alignments, but we want all contig names used in any
    #	alignment:
    cd /san/sanvol1/scratch/petMar1/multiz6way/contigMaf
    for D in *
do
    cd "${D}"
    find . -type f
    cd ..
done | sort -u > /tmp/6-way.contig.list
    wc -l /tmp/6-way.contig.list
    mkdir /cluster/data/petMar1/bed/multiz6way/splitRun
    cp -p /tmp/6-way.contig.list \
	/cluster/data/petMar1/bed/multiz6way/splitRun
    #	31488 /tmp/6-way.contig.list

    # ready for the multiz run
    ssh pk
    cd /cluster/data/petMar1/bed/multiz6way/splitRun
    mkdir -p maf run
    cd run
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = petMar1
set subdir = $1
set c = $2
set result = $3
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz6way/contigMaf
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.6.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
    set in = $pairs/$s/$subdir/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.6.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/petMar1/bed/multiz6way/splitRun/maf/$(dir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << emacs

    sed -e "s/^\.\///" ../6-way.contig.list \
	| gensub2 stdin single template jobList

    para create jobList
    para try ... check ... push ... etc
# Completed: 31488 of 31488 jobs
# CPU time in finished jobs:       5723s      95.38m     1.59h    0.07d  0.000 y
# IO & Wait Time:                 84949s    1415.82m    23.60h    0.98d  0.003 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:              12s       0.20m     0.00h    0.00d
# Submission to last job:           900s      15.00m     0.25h    0.01d
# Estimated complete:                 0s       0.00m     0.00h    0.00d

    # put the split maf results back together into a single maf file
    #	eliminate duplicate comments
    ssh kkstore04
    cd /cluster/data/petMar1/bed/multiz6way
    mkdir togetherMaf
    grep "^##maf version" splitRun/maf/0/0/Contig00000.maf \
	| sort -u > togetherMaf/petMar1.6way.maf
    for F in `find ./splitRun/maf -type f -depth`
do
    grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \
	| sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g"
done | sort -u >> togetherMaf/petMar1.6way.maf
    for F in `find ./splitRun/maf -type f -depth`
do
    grep -v -h "^#" "${F}"
done >> togetherMaf/petMar1.6way.maf

    grep "^##eof maf" splitRun/maf/0/0/Contig00000.maf \
	| sort -u >> togetherMaf/petMar1.6way.maf

    # load tables for a look
    ssh hgwdev
    mkdir -p /gbdb/petMar1/multiz6way/maf
    ln -s /cluster/data/petMar1/bed/multiz6way/togetherMaf/*.maf \
                /gbdb/petMar1/multiz6way/maf/multiz6way.maf
    # this generates an immense multiz6way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/petMar1/multiz6way/maf petMar1 multiz6way
    #	real    0m6.011s
    #	Loaded 232205 mafs in 1 files from /gbdb/petMar1/multiz6way/maf

    # load summary table *NOT* - no Contigs are long enough to create
    #	anything for the summary table.  This command produces an empty
    #	table.
    time nice -n +19 cat /gbdb/petMar1/multiz6way/maf/*.maf \
	| hgLoadMafSummary petMar1 -minSize=30000 -mergeGap=1500 \
	 -maxSize=200000  multiz6waySummary stdin
    #	real    5m58.150s

    # Gap Annotation
    # prepare bed files with gap info
    ssh kkstore04
    mkdir /cluster/data/petMar1/bed/multiz6way/anno
    cd /cluster/data/petMar1/bed/multiz6way/anno
    mkdir maf run

    #	these actually already all exist from previous multiple alignments
    #	if there are done you will see an ls of them
    #	or else the twoBitInfo command will be echoed, run it if you want
    for DB in `cat ../species.list`
do
    CDIR="/cluster/data/${DB}"
    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
	echo "creating ${DB}.N.bed"
	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
	ls -og ${CDIR}/${DB}.N.bed
    fi
done

    cd run
    rm -f nBeds sizes
    for DB in `grep -v petMar1 ../../species.list`
do
    echo "${DB} "
    ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    ssh memk
    #	temporarily copy the petMar1.6way.maf file onto the memk
    #	nodes /scratch/data/petMar1/maf/ directory
    for R in 0 1 2 3 4 5 6 7
do
    ssh mkr0u${R} mkdir /scratch/data/petMar1/maf
    ssh mkr0u${R} rsync -a --progress \
/cluster/data/petMar1/bed/multiz6way/togetherMaf/petMar1.6way.maf \
	/scratch/data/petMar1/maf/
done
    mkdir /cluster/data/petMar1/bed/multiz6way/anno/splitMaf
    #	need to split up the single maf file into individual
    #	per-scaffold maf files to run annotation on
    cd /cluster/data/petMar1/bed/multiz6way/anno/splitMaf
    #	create bed files to list approximately 1553 scaffolds in
    #	a single list, approximately 33 lists
    cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $bedCount = 0;
my $i = 0;

my $bedFile = sprintf("file_%d.bed", $bedCount);

open (BF,">$bedFile") or die "can not write to $bedFile $!";

open (FH,"</cluster/data/petMar1/chrom.sizes") or
        die "can not read /cluster/data/petMar1/chrom.sizes $!";
while (my $line = <FH>) {
    chomp $line;
    if ( (($i + 1) % 1553) == 0 )  {
        printf "%s\n", $line;
        close (BF);
        ++$bedCount;
        $bedFile = sprintf("file_%d.bed", $bedCount);
        open (BF,">$bedFile") or die "can not write to $bedFile $!";
    }
    ++$i;
    my ($chr, $size) = split('\s+',$line);
    printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
close (BF);
'_EOF_'
    # << happy emacs
    chmod +x mkBedLists.pl
    ./mkBedLists.pl

    #	now, run a mafsInRegion on each one of those lists
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/petMar1/bed/multiz6way/anno/splitMaf"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/petMar1/$resultDir
pushd /scratch/tmp/petMar1/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
        /scratch/data/petMar1/maf/petMar1.6way.maf
popd
rsync -q -a /scratch/tmp/petMar1/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/petMar1/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/petMar1
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls file*.bed > runList
    gensub2 runList single template jobList
    para create jobList
    para try ... check ... push ... etc
# Completed: 70 of 70 jobs
# CPU time in finished jobs:        255s       4.24m     0.07h    0.00d  0.000 y
# IO & Wait Time:                  1904s      31.74m     0.53h    0.02d  0.000 y
# Average job time:                  31s       0.51m     0.01h    0.00d
# Longest finished job:              72s       1.20m     0.02h    0.00d
# Submission to last job:           143s       2.38m     0.04h    0.00d

    cd /cluster/data/petMar1/bed/multiz6way/anno/run

    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /scratch/data/petMar1/petMar1.2bit $result
'_EOF_'
    # << happy emacs
    chmod +x doAnno.csh

    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../splitMaf -type f -name "*.maf" > maf.list
    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 31488 of 31488 jobs
# CPU time in finished jobs:      10711s     178.52m     2.98h    0.12d  0.000 y
# IO & Wait Time:                 79430s    1323.83m    22.06h    0.92d  0.003 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:              12s       0.20m     0.00h    0.00d
# Submission to last job:          3237s      53.95m     0.90h    0.04d

    ssh kkstore04
    #	put them all back together into a single file
    cd /cluster/data/petMar1/bed/multiz6way/anno
    grep "^##maf version" maf/file_0/Contig0.maf \
	| sort -u > petMar1.anno.6way.maf
    find ./maf -type f -depth -name "*.maf" | while read F
do
    grep -v -h "^#" "${F}"
done >> petMar1.anno.6way.maf
    echo "##eof maf" >> petMar1.anno.6way.maf
XXX - running 2008-04-23 09:58
    ls -og *.maf
# -rw-rw-r--   1 285783792 Apr 23 09:58 petMar1.anno.6way.maf

    ssh hgwdev
    cd /cluster/data/petMar1/bed/multiz6way/anno
    mkdir -p /gbdb/petMar1/multiz6way/anno
    ln -s `pwd`/petMar1.anno.6way.maf \
                /gbdb/petMar1/multiz6way/anno/multiz6way.maf
    #	by loading this into the table multiz6way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/petMar1/multiz6way/anno \
                petMar1 multiz6way
    #	Loaded 296783 mafs in 1 files from /gbdb/petMar1/multiz6way/anno
    #	real    0m6.690s

    #	normally filter this for chrom size > 1,000,000 and only load
    #	those chroms.  But this is a scaffold assembly, load everything.
    #	*NOT* Does not work on petMar1, all contigs are too small,
    #	this makes an empty table.
    hgLoadMafSummary petMar1 -minSize=30000 -mergeGap=1500 \
	-maxSize=200000  multiz6waySummary \
	    /gbdb/petMar1/multiz6way/anno/multiz6way.maf
    #	Created 121083 summary blocks from 3410157 components and 749940 mafs
    #	from /gbdb/petMar1/multiz6way/anno/multiz6way.maf

    #	by loading this into the table multiz6waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	remove the multiz6way*.tab files in this /scratch/tmp directory
    rm multiz6way*.tab
    #	And, you can remove the previously loaded non-annotated maf file link:
    rm /gbdb/petMar1/multiz6way/maf/multiz6way.maf
    rmdir /gbdb/petMar1/multiz6way/maf

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-19)

# (auto) establish native host of /cluster/data/<assembly>
    ssh kkstore04
    # bash  if not using bash shell already

    mkdir /cluster/data/petMar1/blastDb
    cd /cluster/data/petMar1
    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    if test -s 1meg.lst; then
	twoBitToFa -seqList=1meg.lst  petMar1.unmasked.2bit temp.fa
	faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
	rm temp.fa
    fi
    rm 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    if test -s less1meg.lst; then
	twoBitToFa -seqList=less1meg.lst  petMar1.unmasked.2bit temp.fa
	faSplit about temp.fa 1000000 blastDb/y
	rm temp.fa
    fi
    rm less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 1018

    mkdir -p /san/sanvol1/scratch/petMar1/blastDb
    cd /cluster/data/petMar1/blastDb
    for i in nhr nin nsq;
    do
	echo $i
	cp *.$i /san/sanvol1/scratch/petMar1/blastDb
    done

    mkdir -p /cluster/data/petMar1/bed/tblastn.hg18KG
    cd /cluster/data/petMar1/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/petMar1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst

# 1018 query.lst

   # we want around 100,000 jobs per gig (0.0001 jobs per base)
   numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/petMar1/chrom.sizes`

   # we want around numJobs jobs
   numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`
   numQueries=`wc query.lst | awk '{print $1}'`
   numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' `

   mkdir -p /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa
   split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do
     nice pslxToFa $i $i.fa;
     rm $i;
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/petMar1/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	if test -s /cluster/data/petMar1/blastDb.lft
	then
	    liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/petMar1/blastDb.lft carry $f.2
	else
	    mv $f.2 $f.3
	fi
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit

    ssh pk
    cd /cluster/data/petMar1/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 103836 of 103836 jobs
# CPU time in finished jobs:    9418283s  156971.39m  2616.19h  109.01d  0.299 y
# IO & Wait Time:                867856s   14464.26m   241.07h   10.04d  0.028 y
# Average job time:                  99s       1.65m     0.03h    0.00d
# Longest finished job:             683s      11.38m     0.19h    0.01d
# Submission to last job:         38789s     646.48m    10.77h    0.45d

    ssh kkstore05
    cd /cluster/data/petMar1/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh memk
    cd /cluster/data/petMar1/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para try, check, push, check etc.

# Completed: 102 of 102 jobs
# CPU time in finished jobs:       4354s      72.57m     1.21h    0.05d  0.000 y
# IO & Wait Time:                 70795s    1179.91m    19.67h    0.82d  0.002 y
# Average job time:                 737s      12.28m     0.20h    0.01d
# Longest finished job:            1308s      21.80m     0.36h    0.02d
# Submission to last job:          2927s      48.78m     0.81h    0.03d

    ssh kkstore04
    cd /cluster/data/petMar1/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/petMar1/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl

    # load table
    ssh hgwdev
    cd /cluster/data/petMar1/bed/tblastn.hg18KG
    hgLoadPsl petMar1 blastHg18KG.psl

    # check coverage
    featureBits petMar1 blastHg18KG
# 7171010 bases of 831696438 (0.862%) in intersection

    featureBits petMar1 xenoRefGene:cds blastHg18KG  -enrichment
# xenoRefGene:cds 0.676%, blastHg18KG 0.862%, both 0.362%, cover 53.51%, enrich 62.06x

    ssh kkstore05
    rm -rf /cluster/data/petMar1/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
#end tblastn

###########################################################################
## Annotate 6-way multiple alignment with gene annotations
##		(DONE - 2008-01-08 - Hiram)
    # Gene frames
    ## given previous survey done for 8-way alignment on Orangutan,
    ## try using the following tables for this gene annotation:
    #	hg18 is in a bit of transition at the moment, so use ensGene
    #	mm9 can use knownGene
    #	xenoMrna for braFlo1, petMar1
    #	ensGene v49 for galGal3, oryLat1

    ssh hgwdev
    mkdir /cluster/data/petMar1/bed/multiz6way/frames
    cd /cluster/data/petMar1/bed/multiz6way/frames
    mkdir genes
    # knownGene
    for DB in mm9
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    # ensGene
    for DB in hg18 galGal3 oryLat1
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    #	use xenoMrna for braFlo1, petMar1
    for DB in braFlo1 petMar1
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \
	   from xenoMrna,gbCdnaInfo,cds \
	   where (xenoMrna.qName = gbCdnaInfo.acc) and \
	     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done

    ls -og genes
# -rw-rw-r--  1 1040642 Apr 28 14:30 braFlo1.gp.gz
# -rw-rw-r--  1 1557911 Apr 28 14:29 galGal3.gp.gz
# -rw-rw-r--  1 2151412 Apr 28 14:29 hg18.gp.gz
# -rw-rw-r--  1 1965274 Apr 28 14:28 mm9.gp.gz
# -rw-rw-r--  1 1713692 Apr 28 14:29 oryLat1.gp.gz
# -rw-rw-r--  1  596014 Apr 28 14:31 petMar1.gp.gz

    ssh kkstore04
    cd /cluster/data/petMar1/bed/multiz6way/frames
    #	hg18 is in a bit of transition at the moment, so use ensGene
    #	mm9 can use knownGene
    #	xenoMrna for braFlo1, petMar1
    #	ensGene v49 for galGal3, oryLat1
    #	anything to annotate is in a pair, e.g.: petMar1 genes/petMar1.gp.gz
    time (cat  ../anno/petMar1.anno.6way.maf | nice -n +19 genePredToMafFrames petMar1 stdin stdout petMar1 genes/petMar1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz oryLat1 genes/oryLat1.gp.gz braFlo1 genes/braFlo1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
    # see what it looks like in terms of number of annotations per DB:
    zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
    #	 37711 braFlo1
    #	 55189 petMar1
    #	 75566 oryLat1
    #	 95715 galGal3
    #	107885 mm9
    #	110835 hg18

    #	load the resulting file
    ssh hgwdev
    cd /cluster/data/petMar1/bed/multiz6way/frames
    time nice -n +19 hgLoadMafFrames petMar1 multiz6wayFrames \
	multiz6way.mafFrames.gz
    #	real    0m7.652s

    #	enable the trackDb entries:
# frames multiz6wayFrames
# irows on
#############################################################################
# phastCons 6-way (DONE - 2008-04-28 - Hiram)

    # split 6way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh memk
    mkdir /cluster/data/petMar1/bed/multiz6way/msa.split
    cd /cluster/data/petMar1/bed/multiz6way/msa.split
    mkdir -p /san/sanvol1/scratch/petMar1/multiz6way/cons/ss

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/petMar1/bed/multiz6way/anno/maf
set WINDOWS = /san/sanvol1/scratch/petMar1/multiz6way/cons/ss
pushd $WINDOWS
set resultDir = $1
set c = $2
rm -fr $resultDir/$c
mkdir -p $resultDir
twoBitToFa -seq=$c /scratch/data/petMar1/petMar1.2bit /scratch/tmp/petMar1.$c.fa
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$resultDir/$c.maf -i MAF \
    -M /scratch/tmp/petMar1.$c.fa \
    -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/petMar1.$c.fa
popd
mkdir -p $resultDir
date > $resultDir/$c.out
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	create list of maf files:
    (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... etc
# Completed: 31488 of 31488 jobs
# CPU time in finished jobs:       2289s      38.15m     0.64h    0.03d  0.000 y
# IO & Wait Time:                 82557s    1375.95m    22.93h    0.96d  0.003 y
# Average job time:                   3s       0.04m     0.00h    0.00d
# Longest finished job:              40s       0.67m     0.01h    0.00d
# Submission to last job:          3052s      50.87m     0.85h    0.04d

    # take the cons and noncons trees from the mouse 30-way

    #	Estimates are not easy to make, probably more correctly,
    #	take the 30-way .mod file, and re-use it here.
    ssh hgwdev
    cd /cluster/data/petMar1/bed/multiz6way
    cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .

    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    ssh memk
    mkdir -p /cluster/data/petMar1/bed/multiz6way/cons/run.cons
    cd /cluster/data/petMar1/bed/multiz6way/cons/run.cons

    #	there are going to be several different phastCons runs using
    #	this same script.  They trigger off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is one of:
    #	all gliers placentals
    #	Well, that's what it was when used in the Mm9 30-way,
    #	in this instance, there is only the directory "all"

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin
set subDir = $1
set f = $2
set c = $2:r
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/petMar1/bed/multiz6way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/petMar1/multiz6way/cons
if (-s $cons/$grp/$grp.non-inf) then
  cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
  cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
  cp -p $cons/$grp/$grp.mod $tmp
  cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # Create parasol batch and run it
    pushd /san/sanvol1/scratch/petMar1/multiz6way/cons
    find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
	> /cluster/data/petMar1/bed/multiz6way/cons/ss.list
    popd

    # run for all species
    cd ..
    mkdir -p all run.cons/all
    cd all
    /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \
    ../../mm9.30way.mod \
--prune-all-but=petMar1,hg18,galGal3,mm9,oryLat1,braFlo1 \
	> all.mod
    #	Actually, this doesn't work because petMar1 and braFlo1 are not
    #	in that business.  So, manually place them in the TREE: line in all.mod,
    #	from the manufactured tree above, and the numbers from mm9.30way.mod:

# TREE: ((petMar1:1.6,(oryLat1:1.077873,(galGal3:0.488824,(mm9:0.325818,hg18:0.136019):0.470757):0.325954):0.4):0.4,braFlo1:2.0);

    cd ../run.cons/all

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create template file for "all" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 10471 of 10471 jobs
# CPU time in finished jobs:        991s      16.52m     0.28h    0.01d  0.000 y
# IO & Wait Time:                 68810s    1146.83m    19.11h    0.80d  0.002 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest finished job:              10s       0.17m     0.00h    0.00d
# Submission to last job:          4767s      79.45m     1.32h    0.06d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
    find ./bed -type f -name "Contig*.bed" | xargs cat \
	| sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 3 minutes
    cp -p mostConserved.bed /cluster/data/petMar1/bed/multiz6way/cons/all

    # load into database
    ssh hgwdev
    cd /cluster/data/petMar1/bed/multiz6way/cons/all
    time nice -n +19 hgLoadBed petMar1 phastConsElements6way mostConserved.bed
    #	Loaded 76016 elements of size 5

    # Try for 5% overall cov, and 70% CDS cov
    #	We don't have any gene tracks to compare CDS coverage
    #	--rho .31 --expected-length 45 --target-coverage .3
    featureBits petMar1 phastConsElements6way
    #	23909522 bases of 831696438 (2.875%) in intersection

    # Create merged posterier probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    # sort by chromName, chromStart so that items are in numerical order
    #  for wigEncode
    cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
    mkdir -p phastCons6wayScores

for D in `ls -1d pp/file* | sort -t_ -k2n`
do
    F=${D/pp\/}
    out=phastCons6wayScores/${F}.data.gz
    echo "${D} > ${F}.data.gz"
    ls -S ${D}/*.pp | xargs cat | gzip > ${out}
done

    #	copy the phastCons6wayScores to:
# /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way/phastConsScores
    #	for hgdownload downloads

    # Create merged posterier probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
    ls -1 phastCons6wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \
	| wigEncode stdin phastCons6way.wig phastCons6way.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    time nice -n +19 cp -p *.wi? /cluster/data/petMar1/bed/multiz6way/cons/all
    #	real    0m1.729s

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/petMar1/bed/multiz6way/cons/all
    ln -s `pwd`/phastCons6way.wib /gbdb/petMar1/multiz6way/phastCons6way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/petMar1/multiz6way petMar1 \
	phastCons6way phastCons6way.wig
    #	real    0m2.068s
    # remove garbage
    rm wiggle.tab

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/petMar1/bed/multiz6way/cons/all
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=petMar1 phastCons6way > histogram.data 2>&1
    #	real    5m0.608s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Lamprey PetMar1 Histogram phastCons6way track"
set xlabel " phastCons6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    #	These trackDb entries turn on the wiggle phastCons data track:
    #	type wigMaf 0.0 1.0
    #	maxHeightPixels 100:40:11
    #	wiggle phastCons6way
    #	spanList 1
    #	autoScale Off
    #	windowingFunction mean
    #	pairwiseHeight 12
    #	yLineOnOff Off

#############################################################################
#  Downloads (DONE - 2008-04-29 - Hiram)
    #	Let's see if the downloads will work
    ssh hgwdev
    cd /cluster/data/petMar1
    #	expecting to find repeat masker .out file here:
    ln -s bed/RepeatMasker/petMar1.fa.out .
    gunzip bed/simpleRepeat/trfMask.bed.gz
    time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
	-workhorse=hgwdev petMar1 > jkStuff/downloads.log 2>&1
    #	real    7m13.080s

    #	failed making upstream sequences:
    #	featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout
    #	setpriority: Permission denied.
    #	the 'nice' from my bash shell causes trouble inside the csh
    #	script which uses nice.  Finish off the install step manually
    #	with the mgcGenes upstreams ...

#############################################################################
#  PushQ entries (DONE - 2008-04-29 - Hiram)
    ssh hgwdev
    /cluster/data/petMar1
    /cluster/bin/scripts/makePushQSql.pl petMar1 > jkStuff/pushQ.sql
    #	output warnings:
# petMar1 does not have seq
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
#	gbWarn
#	multiz6wayFrames
#	phastCons6way
    scp -p jkStuff/pushQ.sql hgwbeta:/tmp/petMar1.sql
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < petMar1.sql

#############################################################################
#  Create 6-way downloads (DONE - 2008-05-01 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way
    cd /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way
    #	if these haven't been copied here from before:
    cp -p \
/san/sanvol1/scratch/petMar1/multiz6way/cons/all/phastCons6wayScores/* .
    ln -s ../../cons/all/all.mod ./6way.mod
    cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt .
    # edit that README.txt to be correct for this 6-way alignment
    cd ..
    mkdir multiz6way
    cd multiz6way
    cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt .
    # edit that README.txt to be correct for this 6-way alignment
    ssh kkstore04
    cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
    ln -s ../../petMar1.6-way.nh ./6way.nh

    time nice -n +19 gzip -c ../../anno/petMar1.anno.6way.maf \
	> petMar1.6way.maf.gz
    #	real    0m49.861s

    ssh hgwdev
    cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
    #	creating upstream files from xenoRefGene, bash script:
    cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=petMar1
GENE=xenoRefGene
NWAY=multiz6way
export DB GENE

for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
                stdin stdout \
                -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
        | gzip -c > upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done
'_EOF_'
    # << happy emacs
    chmod +x ./mkUpstream.sh
    time nice -n +19 ./mkUpstream.sh
    #	real    2m26.254s
# -rw-rw-r--  1    61861 May  1 16:10 upstream1000.maf.gz
# -rw-rw-r--  1   104203 May  1 16:11 upstream2000.maf.gz
# -rw-rw-r--  1   195382 May  1 16:12 upstream5000.maf.gz

    #	check the names in these upstream files to ensure sanity:
    zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
	| sort | uniq -c | sort -rn | less
    #	should be a list of the other 4 species with a high count,
    #	then xenoRefGene names, e.g.:
    #	206 oryLat1
    #	206 mm9
    #	206 hg18
    #	206 galGal3
    #	206 braFlo1
    #	  8 NM_054045
    #	  7 NM_013550
    #	  5 NM_069752

    ssh kkstore04
    cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
    md5sum *.maf.gz > md5sum.txt
    cd ../phastCons6way
    md5sum *.data.gz *.mod > md5sum.txt

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way
    mkdir /usr/local/apache/htdocs/goldenPath/petMar1/phastCons6way
    cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way
    rm /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way/mkUpstream.sh
    cd ../phastCons6way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/petMar1/phastCons6way
    #	if your ln -s `pwd`/* made extra links to files you don't want there,
    #	check the goldenPath locations and remove those extra links

#############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.

############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram
    ssh kkstore01	# not too important since everything moved to hive
    screen # use screen to control this job
    cd /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26
    cat fb.oryLat2.chainPetMar1Link.txt
    #	41568923 bases of 700386597 (5.935%) in intersection

    #	And, for the swap:
    mkdir /cluster/data/petMar1/bed/blastz.oryLat2.swap
    cd /cluster/data/petMar1/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-smallClusterHub=pk -bigClusterHub=kk > swap.log 2>&1 &
    #	real    52m27.942s
    cat fb.petMar1.chainOryLat2Link.txt
    #	39181422 bases of 831696438 (4.711%) in intersection

############################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
petMar1.upstreamGeneTbl = xenoRefGene
petMar1.upstreamMaf = multiz6way /hive/data/genomes/petMar1/bed/multiz6way/species.list

############################################################################
# QUALITY TRACK (DONE - 2008-11-25 - Hiram)
    cd /hive/data/genomes/petMar1/downloads
    qaToQac contigs.fa.quals.gz assembly.qac
    qacAgpLift ../petMar1.agp assembly.qac scaffolds.qac
    mkdir /hive/data/genomes/petMar1/bed/qual
    cd /hive/data/genomes/petMar1/bed/qual
    qacToWig -fixed ../../downloads/scaffolds.qac stdout \
        | wigEncode stdin qual.wig qual.wib
    ln -s `pwd`/qual.wib /gbdb/petMar1/wib
    hgLoadWiggle -pathPrefix=/gbdb/petMar1/wib petMar1 quality qual.wig

############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# construct liftOver to petMar2 (DONE - 2012-10-16 - Hiram)
    screen	# manage this longish running job in a screen
    mkdir /hive/data/genomes/petMar1/bed/blat.petMar2.2012-10-16
    cd /hive/data/genomes/petMar1/bed/blat.petMar2.2012-10-16
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/petMar1/petMar1.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev petMar1 petMar2 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/petMar1/petMar1.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev petMar1 petMar2 > do.log 2>&1
    #	real    87m15.687s

    # verify this file exists:
    #	/gbdb/petMar1/liftOver/petMar1ToPetMar2.over.chain.gz
    # and try out the conversion on genome-test from petMar1 to petMar2

############################################################################