# for emacs: -*- mode: sh; -*-
 
#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-----------+-------------------------+
#  | tableName | type                    |
#  +-----------+-------------------------+
#  | refGene   | genePred refPep refMrna |
#  | mgcGenes  | genePred                |
#  | genscan   | genePred genscanPep     |
#  +-----------+-------------------------+


# CREATE BUILD DIRECTORY (DONE 4/17/06 angie)
    # df -h /cluster/store*, choose the one with the most space...
    ssh kkstore04
    mkdir /cluster/store8/xenTro2
    ln -s /cluster/store8/xenTro2 /cluster/data/xenTro2


# DOWNLOAD SEQUENCE (DONE 4/17/06 angie)
    ssh kkstore04
    mkdir /cluster/data/xenTro2/downloads
    cd /cluster/data/xenTro2/downloads

    wget --timestamp ftp://ftp.jgi-psf.org/pub/JGI_data/Frog/v4.1/Xentr4.fasta.gz
    faSize Xentr4.fasta.gz
#1513925492 bases (154525475 N's 1359400017 real 1193882410 upper 165517607 lower) in 19759 sequences in 1 files
#Total size: mean 76619.5 sd 381240.5 min 2001 (scaffold_20095) max 7817814 (scaffold_1) median 7909
#N count: mean 7820.5 sd 19478.8
#U count: mean 60422.2 sd 323863.1
#L count: mean 8376.8 sd 44842.2


# MAKE JKSTUFF AND BED DIRECTORIES (DONE 4/17/06 angie)
    # This used to hold scripts -- better to keep them inline here so 
    # they're in CVS.  Now it should just hold lift file(s) and 
    # temporary scripts made by copy-paste from this file.  
    mkdir /cluster/data/xenTro2/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/xenTro2/bed


# REPEATMASKER (DONE 4/17/06 angie)
    ssh kkstore04
    mkdir /cluster/data/xenTro2/RMRun
    cd /cluster/data/xenTro2/RMRun
    # Record RM version used:
    ls -l /cluster/bluearc/RepeatMasker
#lrwxrwxrwx  1 angie protein 18 Mar 20 16:50 /cluster/bluearc/RepeatMasker -> RepeatMasker060320/
    grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC   RELEASE 20060315;                                            *
    # Run RepeatMasker on a dummy input, just to make it initialize its 
    # species libraries once before the cluster run:
    /cluster/bluearc/RepeatMasker/RepeatMasker -spec "xenopus tropicalis" \
      /dev/null
#Building species libraries in: /cluster/bluearc/RepeatMasker060320/Libraries/20060315/xenopus_tropicalis

    # Apparently RepeatMasker's default lib is smaller than the lib that 
    # was provided along with assembly version 3 (our xenTro1), so we will 
    # run with both the default lib and JGI's lib.
    faSize /cluster/bluearc/RepeatMasker060320/Libraries/20060315/xenopus_tropicalis/specieslib
#84811 bases (92 N's 84719 real 0 upper 84719 lower) in 85 sequences in 1 files
    faSize /cluster/data/xenTro1/downloads/xt3.lib1.fasta
#505033 bases (113 N's 504920 real 504920 upper 0 lower) in 367 sequences in 1 files
    mkdir /cluster/bluearc/xenTro2
    cp -p /cluster/data/xenTro1/downloads/xt3.lib1.fasta \
      /cluster/bluearc/xenTro2/
    /cluster/bluearc/RepeatMasker/RepeatMasker \
      -lib /cluster/bluearc/xenTro2/xt3.lib1.fasta /dev/null

    #- Split sequence into 500kb chunks, at gaps if possible:
    mkdir /cluster/data/xenTro2/scaffoldsSplit500k
    cd /cluster/data/xenTro2/scaffoldsSplit500k
    faSplit -outDirDepth=2 -lift=ss500k.lft \
      gap ../downloads/Xentr4.fasta.gz 500000 ss500k
#22243 pieces of 22243 written


    #- Make the run directory and job list:
    # Run RepeatMasker twice, once with default xenopus lib and once 
    # with the larger lib that was distributed with xenTro1...
    cd /cluster/data/xenTro2
    cat << '_EOF_' > jkStuff/RMXenopus
#!/bin/csh -fe
# usage: RMXenopus <chunkDir> <chunkFile> <outFile>
# Runs RepeatMasker twice on one sequence chunk in local scratch space:
# first with the JGI xt3 library (result saved as <outFile>.jgi), then with
# RepeatMasker's own "xenopus tropicalis" library (result saved as <outFile>).

set chunkDir = $1
set chunk = $2
set result = $3
set repeatMasker = /cluster/bluearc/RepeatMasker/RepeatMasker
set scratchTop = /scratch/tmp/xenTro2
set work = $scratchTop/$chunk

# Stage the chunk in its own scratch directory and work from there.
/bin/mkdir -p $work
/bin/cp $chunkDir/$chunk $work/
cd $work

# Pass 1: JGI library.
$repeatMasker -s -lib /cluster/bluearc/xenTro2/xt3.lib1.fasta $chunk
/bin/cp $work/$chunk.out $result.jgi

# Start pass 2 from a clean directory with a fresh copy of the chunk.
/bin/rm -fr $work/*
/bin/cp $chunkDir/$chunk $work/

# Pass 2: RepeatMasker's default species library.
$repeatMasker -s -spec "xenopus tropicalis" $chunk
/bin/cp $work/$chunk.out $result

# Clean up; the parent is only removed once no other job is using it.
/bin/rm -fr $work/*
/bin/rmdir --ignore-fail-on-non-empty $work
/bin/rmdir --ignore-fail-on-non-empty $scratchTop
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMXenopus
    # Build the parasol job list: one RMXenopus job per 500k chunk, with the
    # output tree RMOut/<i>/<j>/ mirroring scaffoldsSplit500k/<i>/<j>/.
    # -p: RMRun already exists (it was created at the start of the
    # REPEATMASKER section), so a plain mkdir would report an error for it.
    mkdir -p RMRun RMOut
    cp /dev/null RMRun/RMJobs
    foreach i (0 1 2 3 4 5 6 7 8 9)
      mkdir RMOut/$i
      foreach j (0 1 2 3 4 5 6 7 8 9)
        mkdir RMOut/$i/$j
        foreach f (/cluster/data/xenTro2/scaffoldsSplit500k/$i/$j/ss500k*.fa)
          # $f:h is the chunk's directory, $f:t its file name; the braces
          # are quoted so the {check out line+ ...} clause reaches the job
          # list literally.
          echo /cluster/data/xenTro2/jkStuff/RMXenopus \
               $f:h $f:t \
             '{'check out line+ /cluster/data/xenTro2/RMOut/$i/$j/$f:t.out'}' \
            >> RMRun/RMJobs
        end
      end
    end
    wc -l RMRun/RMJobs
#22243 RMRun/RMJobs

    #- Do the run
    ssh pk
    cd /cluster/data/xenTro2/RMRun
    para make RMJobs; para time | mail -s 'RM cluster run finished' $USER
    para time
#Completed: 22243 of 22243 jobs
#CPU time in finished jobs:    6136121s  102268.68m  1704.48h   71.02d  0.195 y
#IO & Wait Time:                 66562s    1109.37m    18.49h    0.77d  0.002 y
#Average job time:                 279s       4.65m     0.08h    0.00d
#Longest finished job:            2325s      38.75m     0.65h    0.03d
#Submission to last job:         23238s     387.30m     6.46h    0.27d

    #- Lift up the 500KB chunk .out's
    # Also, remove the individual .out file headers and remove duplicate
    # items found by the two runs (too bad there's no -libOnly).  Duplicate
    # items are identical in their first 97-105ish characters (everything
    # up to the sequential ID field, which we ignore anyway), hence the
    # "uniq -w 97" below.
    ssh kkstore04
    cd /cluster/data/xenTro2/RMOut
    # Keep one copy of the 3-line .out header at the top of the merged file.
    head -3 0/0/ss500k000.fa.out > scaffolds.out
    foreach i (0 1 2 3 4 5 6 7 8 9)
      foreach j (0 1 2 3 4 5 6 7 8 9)
        echo $i/$j/
        # *.fa.out matches only the default-library results; the matching
        # JGI-library result is picked up explicitly as $f.jgi.
        foreach f ($i/$j/*.fa.out)
          # "tail -n +4" (start output at line 4) drops the header lines;
          # the obsolete "tail +4" spelling is rejected by current tail.
          liftUp -type=.out stdout \
            /cluster/data/xenTro2/scaffoldsSplit500k/ss500k.lft warn \
            $f $f.jgi \
          | tail -n +4 \
          | sort -k 5,5 -k 6n,6n \
          | uniq -w 97 \
          >> scaffolds.out
        end
      end
    end
    wc -l scaffolds.out
#2001365 scaffolds.out

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/xenTro2/RMOut
    hgLoadOut xenTro2 -nosplit scaffolds.out
    featureBits -chrom=scaffold_1 xenTro2 rmsk
#1222207 bases of 7578677 (16.127%) in intersection
    featureBits -chrom=scaffold_1 xenTro1 rmsk
#704506 bases of 7406505 (9.512%) in intersection

    # Clean up unmasked split scaffolds.
    ssh kkstore04
    rm -r /cluster/data/xenTro2/scaffoldsSplit500k


# CREATING DATABASE (DONE 4/17/06 angie)
    ssh hgwdev
    hgsql '' -e 'create database xenTro2'
    # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  1.5T  168G  90% /var/lib/mysql


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/17/06 angie) 
    ssh hgwdev
    hgsql xenTro2 -e "create table grp (PRIMARY KEY(NAME)) select * from rn4.grp"


# CREATE AGP FILES AND GAP/GOLD TABLES (DONE 4/17/06 angie)
    ssh kkstore04
    cd /cluster/data/xenTro2
    # Look for overrepresented round-number run-of-N sizes:
    faGapSizes -niceSizes=5,10,20,25,50,100,200,500,1000,2000,5000,10000,20000,50000,100000 downloads/Xentr4.fasta.gz
    # Of those, 50 is the only overwhelmingly overrepresented round number.
    # Use that as the -minContigGap (instead of the default 25).
    # There are 65 gaps > 50000 (none exactly 50k or 100k) but I don't see 
    # any other obvious threshold to use for -minScaffoldGap and 65 is pretty 
    # few in the grand scheme of things (all we have are scaffolds though).
    hgFakeAgp -minContigGap=50 downloads/Xentr4.fasta.gz xenTro2.agp

    ssh hgwdev
    cd /cluster/data/xenTro2
    hgGoldGapGl -noGl xenTro2 xenTro2.agp


# SIMPLE REPEATS (TRF) (DONE 4/17/06 angie)
    ssh kkr8u00
    mkdir /cluster/data/xenTro2/bed/simpleRepeat
    cd /cluster/data/xenTro2/bed/simpleRepeat
    trfBig -trf=/cluster/bin/i386/trf ../../downloads/Xentr4.fasta.gz \
      /dev/null -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp
    # Took just over 5 hours.

    # Load into the database:
    ssh hgwdev
    hgLoadBed xenTro2 simpleRepeat \
      /cluster/data/xenTro2/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    nice featureBits -chrom=scaffold_1 xenTro2 simpleRepeat
#138459 bases of 7578677 (1.827%) in intersection
    # Compare to xenTro1:
    nice featureBits -chrom=scaffold_1 xenTro1 simpleRepeat
#123690 bases of 7406505 (1.670%) in intersection

# CREATE MICROSAT TRACK (done 2006-7-5 JK)
     ssh hgwdev
     cd /cluster/data/xenTro2/bed
     mkdir microsat
     cd microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed 
    /cluster/bin/i386/hgLoadBed xenTro2 microsat microsat.bed

# PROCESS SIMPLE REPEATS INTO MASK (DONE 4/17/06 angie)
    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore04
    cd /cluster/data/xenTro2/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed


# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 4/17/06 angie)
    ssh kkstore04
    cd /cluster/data/xenTro2
    # Soft-mask scaffolds with RepeatMasker and filtered TRF:
    maskOutFa -soft downloads/Xentr4.fasta.gz bed/simpleRepeat/trfMask.bed \
      xenTro2.fa
    maskOutFa -softAdd xenTro2.fa RMOut/scaffolds.out xenTro2.fa
    # See how many bases are lower-cased now:
    faSize xenTro2.fa
#1513925492 bases (154525475 N's 1359400017 real 1092281194 upper 267118823 lower) in 19759 sequences in 1 files
    # ~19.6% of non-N bases are lower-cased, sounds reasonable.
    # Hard-mask scaffolds:
    maskOutFa xenTro2.fa hard xenTro2.fa.masked
    # Make 2bit (for hgBlat, browser):
    faToTwoBit xenTro2.fa xenTro2.2bit


# PUT 2BIT ON /SCRATCH (DONE 4/17/06 angie)
    ssh kkstore04
    mkdir /cluster/bluearc/scratch/hg/xenTro2
    cp -p /cluster/data/xenTro2/xenTro2.2bit \
      /cluster/bluearc/scratch/hg/xenTro2/
    # Ask cluster-admin to distribute to /scratch on big & small cluster


# MAKE GCPERCENT (DONE 4/17/06 angie)
    # Compute GC percent in 5-base windows and encode it as a wiggle track.
    ssh kkr7u00
    mkdir /cluster/data/xenTro2/bed/gc5Base
    cd /cluster/data/xenTro2/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 xenTro2 \
       /cluster/data/xenTro2 \
    | wigEncode stdin gc5Base.wig gc5Base.wib
    # Link the .wib under /gbdb and load the .wig table.
    ssh hgwdev
    # -p: /gbdb/xenTro2 itself is only created in the MAKE CHROMINFO section
    # below, so make the parent here if it does not exist yet.
    mkdir -p /gbdb/xenTro2/wib
    cd /cluster/data/xenTro2/bed/gc5Base
    ln -s `pwd`/gc5Base.wib /gbdb/xenTro2/wib
    hgLoadWiggle -pathPrefix=/gbdb/xenTro2/wib xenTro2 gc5Base gc5Base.wig


# MAKE CHROMINFO TABLE WITH 2BIT (DONE 4/17/06 angie)
    ssh kkstore04
    cd /cluster/data/xenTro2
    mkdir bed/chromInfo
    # One row per sequence: name, size, and the /gbdb path of the 2bit file.
    twoBitInfo xenTro2.2bit stdout \
    | awk '{print $1 "\t" $2 "\t/gbdb/xenTro2/xenTro2.2bit";}' \
      > bed/chromInfo/chromInfo.tab

    # Link to 2bit from /gbdb/xenTro2/:
    ssh hgwdev
    cd /cluster/data/xenTro2
    # -p: tolerate /gbdb/xenTro2 already existing (the gc5Base step above
    # places a wib/ directory underneath it).
    mkdir -p /gbdb/xenTro2
    ln -s /cluster/data/xenTro2/xenTro2.2bit /gbdb/xenTro2/
    # Load /gbdb/xenTro2/xenTro2.2bit paths into database and save size info.
    hgLoadSqlTab xenTro2 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql \
      /cluster/data/xenTro2/bed/chromInfo/chromInfo.tab
    echo "select chrom,size from chromInfo" | hgsql -N xenTro2 > chrom.sizes
    # take a look at chrom.sizes size
    wc chrom.sizes
#  19759   39518  392532 chrom.sizes


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 4/17/06 angie)
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvsup

    # Add trackDb directories and a description.html
    mkdir xenTro/xenTro2
    cvs add xenTro/xenTro2
    cvs add xenTro/xenTro2/description.html
    cvs ci xenTro/xenTro2
    # Edit that makefile to add xenTro2 in all the right places and do
    make update DBS=xenTro2

    mkdir /gbdb/xenTro2/html
    cvs ci makefile
    # Go public on genome-test.  In a clean tree (no mods, up-to-date):
    cvs up makefile
    make alpha
    # Note: hgcentral*.genome values must correspond
    # with defaultDb.genome values
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("xenTro2", "Aug. 2005", "/gbdb/xenTro2", "X. tropicalis", \
               "scaffold_19", 1, 36, "X. tropicalis", \
                "Xenopus tropicalis", "/gbdb/xenTro2/html/description.html", \
                0, 0, "JGI v4.1");'

    ## re-work orderKey 2007-02-20 to get Lizard above frog - Hiram
    hgsql -e 'update dbDb set orderKey="445" where name="xenTro2";' \
	hgcentraltest

# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 4/17/06 angie)
    ssh kkr8u00
    cd /cluster/data/xenTro2
    #- Build the .gz files -- no genbank for now.
    mkdir bigZips
    gzip -c xenTro2.fa > bigZips/xenTro2.fa.gz
    gzip -c xenTro2.fa.masked > bigZips/xenTro2.hardmasked.fa.gz
    gzip -c RMOut/scaffolds.out > bigZips/xenTro2.rmsk.out.gz
    gzip -c bed/simpleRepeat/simpleRepeat.bed > bigZips/xenTro2.trf.bed.gz
    cd bigZips
    md5sum *.gz > md5sum.txt
    # Make a README.txt

    #- Link the .gz and .txt files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    set gp = /usr/local/apache/htdocs/goldenPath/xenTro2
    mkdir -p $gp/bigZips
    ln -s /cluster/data/xenTro2/bigZips/*.{gz,txt} $gp/bigZips
    # Take a look at bigZips/*
    mkdir $gp/database
    # Create README.txt file in database/ to explain the files.


# MAKE 11.OOC (DONE 4/17/06 angie)
    # Use -repMatch=540 as in makeXenTro1.doc (roughly scaled from human
    # repMatch by ratio of frog size to human size)
    ssh kkr7u00
    cd /cluster/data/xenTro2
    # -p: /cluster/bluearc/xenTro2 was already created in the REPEATMASKER
    # section (it holds xt3.lib1.fasta), so a plain mkdir would fail here.
    mkdir -p /cluster/bluearc/xenTro2
    # No query and no output: this blat run only writes the .ooc file of
    # overused 11-mers.
    blat xenTro2.2bit /dev/null /dev/null \
      -tileSize=11 -makeOoc=/cluster/bluearc/xenTro2/11.ooc -repMatch=540
#Wrote 25734 overused 11-mers to /cluster/bluearc/xenTro2/11.ooc


# GENSCAN GENE PREDICTIONS (DONE 4/17/06 angie)
    ssh hgwdev
    mkdir /cluster/data/xenTro2/bed/genscan
    cd /cluster/data/xenTro2/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # create hard masked .fa files
    ssh kkstore04
    cd /cluster/data/xenTro2
    mkdir hardMasked
    faSplit about xenTro2.fa.masked 5000000 hardMasked/
    # Generate a list file, genome.list, of all the hard-masked contig chunks:
    ls -1S /cluster/data/xenTro2/hardMasked/* > bed/genscan/genome.list
    wc -l bed/genscan/genome.list
#262 bed/genscan/genome.list

    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/xenTro2/bed/genscan
    # Make 3 subdirectories for genscan to put its output files in
    mkdir gtf pep subopt
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << emacs
    gensub2 genome.list single gsub jobList
    para make jobList
    para time
#Completed: 261 of 262 jobs
#Crashed: 1 jobs
#CPU time in finished jobs:      33758s     562.63m     9.38h    0.39d  0.001 y
#IO & Wait Time:                   905s      15.08m     0.25h    0.01d  0.000 y
#Average job time:                 133s       2.21m     0.04h    0.00d
#Longest finished job:             320s       5.33m     0.09h    0.00d
#Submission to last job:          3605s      60.08m     1.00h    0.04d

    # If there are crashes, diagnose with "para problems" and "para crashed".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    ssh kkr8u00
    cd /cluster/data/xenTro2/bed/genscan
    /cluster/bin/x86_64/gsBig /cluster/data/xenTro2/hardMasked/30.fa gtf/30.gtf -trans=pep/30.pep -subopt=subopt/30.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000

    ls -1 gtf | wc -l
#262
    endsInLf gtf/*

    # Concatenate results:
    ssh kkstore04
    cd /cluster/data/xenTro2/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat pep/*.pep > genscan.pep
    cat subopt/*.bed > genscanSubopt.bed

    # Load into the database (without -genePredExt because no frame info):
    # Don't load the Pep anymore -- redundant since it's from genomic.
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/genscan
    ldHgGene -gtf xenTro2 genscan genscan.gtf
    hgLoadBed xenTro2 genscanSubopt genscanSubopt.bed
    featureBits -chrom=scaffold_1 xenTro2 genscan
#249658 bases of 7578677 (3.294%) in intersection
    featureBits -chrom=scaffold_1 xenTro1 genscan
#275145 bases of 7406505 (3.715%) in intersection
    # Strange that the coverage dropped a bit.  Well, more seq is masked...?


# GENBANK AUTO UPDATE (DONE 4/18/06 angie)
    # align with revised genbank process. drop xeno ESTs.
    cd ~/kent/src/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add xenTro2

# xenTro2 (X. tropicalis) 19759 scaffolds
xenTro2.serverGenome = /cluster/data/xenTro2/xenTro2.2bit
xenTro2.clusterGenome = /scratch/hg/xenTro2/xenTro2.2bit
xenTro2.ooc = /cluster/bluearc/xenTro2/11.ooc
xenTro2.lift = no
xenTro2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
xenTro2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
xenTro2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
xenTro2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
xenTro2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
xenTro2.refseq.mrna.native.load = no
xenTro2.genbank.mrna.xeno.load = no
xenTro2.downloadDir = xenTro2
xenTro2.perChromTables = no
xenTro2.mgcTables.default = full
xenTro2.mgcTables.mgc = all

# N.B. the above was changed later to include RefSeqs (see the 2006-05-11
# entry below); don't just copy this or a spell will be cast on your
# descendants.

    cvs ci etc/genbank.conf
    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial xenTro2 &
    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad xenTro2 &

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/makeDb/genbank
    cvsup
    # add xenTro2 to:
        etc/align.dbs
        etc/hgwdev.dbs 
    cvs commit
    make etc-update

# enabled native refSeq: 2006-05-11 markd
    # set this in genbank.conf:
    xenTro2.refseq.mrna.native.load = yes
    # kick off an alignment to verify
    ssh kkstore02
    cd /cluster/data/genbank
    (nice ./bin/gbAlignStep xenTro2)|&mail markd&

# CPGISLANDS (WUSTL) (DONE 4/18/06 angie)
    ssh hgwdev
    mkdir -p /cluster/data/xenTro2/bed/cpgIsland
    cd /cluster/data/xenTro2/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/xenTro2/bed/cpgIsland/
    
    ssh kkr8u00
    cd /cluster/data/xenTro2/bed/cpgIsland
    ./cpglh.exe ../../xenTro2.fa > xenTro2.cpg

    # Transform cpglh output to bed +
    # NOTE: awk has no /* ... */ comments -- such a line is parsed as a
    # pattern-only regular-expression rule.  Use '#' comments inside the
    # awk program instead.
    cat << '_EOF_' > filter.awk
# Input columns:
#   chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected
#   chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94
# Output columns:
#   chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp
#   chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94
{
# Convert the 1-based start to a 0-based bed start, then recompute the
# island width from the adjusted coordinates.
$2 = $2 - 1;
width = $3 - $2;
# name is "CpG: <cpgNum>" ($5 $6); gcNum is derived from perGc ($7);
# perCpg counts both bases of each CpG dinucleotide.
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk *.cpg > cpgIsland.bed
    wc -l cpgIsland.bed
#  42984 cpgIsland.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/cpgIsland
    hgLoadBed xenTro2 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    featureBits -chrom=scaffold_1 xenTro2 cpgIslandExt
#54127 bases of 7578677 (0.714%) in intersection
    featureBits -chrom=scaffold_1 xenTro1 cpgIslandExt
#48512 bases of 7406505 (0.655%) in intersection
    featureBits xenTro1 cpgIslandExt
#19279778 bases of 1381238994 (1.396%) in intersection


# CPGISLANDS (ANDY LAW) (DONE 4/18/06 angie)
    # See notes in makeGalGal2.doc
    # (host was written "kr8u00" here; every other step in this doc uses
    # the workhorse kkr8u00)
    ssh kkr8u00
    mkdir /cluster/data/xenTro2/bed/cpgIslandGgfAndy
    cd /cluster/data/xenTro2/bed/cpgIslandGgfAndy
    # Build the preProcGgfAndy program in
    # kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE
    # Use soft-masked sequence since this is not a mammal...
    ~/bin/x86_64/preProcGgfAndy ../../xenTro2.fa \
    | /cluster/home/angie/ggf-andy-cpg-island-ucsc.pl \
      > cpgIslandGgfAndy.bed
    # Compare island counts against the WUSTL cpgIsland track:
    wc -l ../cpgIsland/cpgIsland.bed *bed
#   42984 ../cpgIsland/cpgIsland.bed
#  184980 cpgIslandGgfAndy.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
    hgLoadBed xenTro2 cpgIslandGgfAndy -tab \
      -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
    featureBits -chrom=scaffold_1 xenTro2 cpgIslandExt
#54127 bases of 7578677 (0.714%) in intersection
    featureBits -chrom=scaffold_1 xenTro2 cpgIslandGgfAndy
#251543 bases of 7578677 (3.319%) in intersection


# SWAP CHAINS/NET MM8 (DONE 4/21/06 hiram -- see makeMm8.doc)


# SWAP CHAINS/NET HG18 (DONE 4/24/06 angie)
    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/blastz.hg18.swap
    cd /cluster/data/xenTro2/bed/blastz.hg18.swap
    doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.xenTro2/DEF \
      -workhorse kkr8u00 >& do.log & tail -f do.log
    ln -s blastz.hg18.swap /cluster/data/xenTro2/bed/blastz.hg18
    

# SWAP CHAINS/NET RN4 (DONE 4/24/06 angie)
    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/blastz.rn4.swap
    cd /cluster/data/xenTro2/bed/blastz.rn4.swap
    doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.xenTro2/DEF \
      -workhorse kkr8u00 >& do.log & tail -f do.log
    ln -s blastz.rn4.swap /cluster/data/xenTro2/bed/blastz.rn4


# SWAP CHAINS/NET MONDOM4 (DONE 4/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/blastz.monDom4.swap
    cd /cluster/data/xenTro2/bed/blastz.monDom4.swap
    doBlastzChainNet.pl -swap /cluster/data/monDom4/bed/blastz.xenTro2/DEF \
      -workhorse kkr8u00 >& do.log & tail -f do.log
    # hgLoadChain ran out of memory while sorting the giant chain set.
    # Pre-sort on a machine with big RAM:
    ssh kkr8u00
    cd /cluster/data/xenTro2/bed/blastz.monDom4.swap
    time nice chainSort -target axtChain/xenTro2.monDom4.all.chain.gz \
      axtChain/all.tSorted.chain
#176.750u 66.580s 4:36.82 87.9%  0+0k 0+0io 0pf+0w
    # -- it's a 5-minute job on a machine with sufficient memory, but will 
    # thrash all day on a machine that doesn't have enough.
    # Manually run hgLoadChain -noSort, then run the rest of loadUp.csh:
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/blastz.monDom4.swap/axtChain
    nice hgLoadChain -noSort -tIndex xenTro2 chainMonDom4 all.tSorted.chain
#Loading 10580431 chains into xenTro2.chainMonDom4
    grep -v hgLoadChain loadUp.csh > tmp.csh
    nice csh -efx tmp.csh >>& ../do.log & tail -f ../do.log
    # back on kkstore04
    cd /cluster/data/xenTro2/bed/blastz.monDom4.swap
    doBlastzChainNet.pl -swap /cluster/data/monDom4/bed/blastz.xenTro2/DEF \
      -continue download -workhorse kkr8u00 >>& do.log & tail -f do.log
    ln -s blastz.monDom4.swap /cluster/data/xenTro2/bed/blastz.monDom4


# SWAP CHAINS/NET GALGAL2 (DONE 4/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/blastz.galGal2.swap
    cd /cluster/data/xenTro2/bed/blastz.galGal2.swap
    doBlastzChainNet.pl -swap /cluster/data/galGal2/bed/blastz.xenTro2/DEF \
      -workhorse kkr8u00 >& do.log & tail -f do.log
    ln -s blastz.galGal2.swap /cluster/data/xenTro2/bed/blastz.galGal2


# SWAP CHAINS/NET DANRER4 (DONE 4/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/blastz.danRer4.swap
    cd /cluster/data/xenTro2/bed/blastz.danRer4.swap
    doBlastzChainNet.pl -swap /cluster/data/danRer4/bed/blastz.xenTro2/DEF \
      -workhorse kkr8u00 >& do.log & tail -f do.log
    ln -s blastz.danRer4.swap /cluster/data/xenTro2/bed/blastz.danRer4


# MULTIZ7WAY (DONE 4/28/06 angie)
# ((xenTro2 (galGal2 (monDom4 (hg18 (mm8 rn4))))) danRer4)
    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27

    # Prune the hg17 17way tree to just these 7 and update db names:
    /cluster/bin/phast/tree_doctor \
      --prune-all-but=rat_rn3,mouse_mm7,human_hg17,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \
      --rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4 ; xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \
      /cluster/data/hg17/bed/multiz17way/17way.nh > 7way.nh
    # *carefully* edit 7way.nh to put frog first.
    # create species list and stripped down tree for autoMZ
    sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 7way.nh \
      > tree-commas.nh
    sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
    sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst

    # Split MAFs by sequence onto cluster-friendly server
    mkdir /cluster/bluearc/xenTro2/mafNet
    foreach s (galGal2 monDom4 hg18 mm8 rn4 danRer4)
      echo $s
      mafSplit -byTarget -outDirDepth=2 -useSequenceName \
        dummyArg /cluster/bluearc/xenTro2/mafNet/$s/split \
        /cluster/data/xenTro2/bed/blastz.$s/mafNet/*
    end
    # Change the split%05d names to scaffold_1 etc. so they exactly match
    # sequence names.
    cd /cluster/bluearc/xenTro2/mafNet
    foreach db ( danRer4 galGal2 hg18 mm8 monDom4 rn4 )
      echo $db
      foreach d0 ($db/*)
        foreach f ($d0/*/*)
          set g = `echo $f | sed -e 's/split0*/scaffold_/'`
          mv $f $g
        end
      end
    end

    ssh kki
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27
    mkdir maf run
    cd run

    # stash binaries 
    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn

# autoMultiz.csh <dir> <seqName> <mafOut>
#   Per-sequence cluster job.  Works in a scratch directory
#   /scratch/tmp/xenTro2/multiz.<seqName> (removed first if left over from
#   an earlier attempt): copies tree.nh and species.lst there, then for each
#   species in species.lst other than xenTro2 copies
#   /cluster/bluearc/xenTro2/mafNet/<species>/<dir>/<seqName>.maf to
#   xenTro2.<species>.sing.maf, or writes a header-only maf when that
#   pairwise file does not exist.  It then puts the stashed penn/ binaries
#   first on the path, runs penn/autoMZ with the tree read from tree.nh,
#   copies the resulting <seqName>.maf to <mafOut> and removes the scratch
#   directory.
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
    set db = xenTro2
    set d = $1
    set c = $2
    set mafOut = $3
    set run = `pwd`
    set tmp = /scratch/tmp/$db/multiz.$c
    set pairs = /cluster/bluearc/$db/mafNet
    rm -fr $tmp
    mkdir -p $tmp
    cp ../{tree.nh,species.lst} $tmp
    pushd $tmp
    foreach s (`cat species.lst`)
        if ($s == $db) then
            continue
        endif
        set clusterMaf = $pairs/$s/$d/$c.maf
        set localMaf = $db.$s.sing.maf
        if (-e $clusterMaf) then
            cp $clusterMaf $localMaf
        else
            echo "##maf version=1 scoring=autoMZ" > $localMaf
        endif
    end
    set path = ($run/penn $path); rehash
    $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
    popd
    cp $tmp/$c.maf $mafOut
    rm -fr $tmp
'EOF'
    # << emacs
    chmod +x autoMultiz.csh

cat  << 'EOF' > spec
#LOOP
./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/xenTro2/bed/multiz7way.2006-04-27/maf/$(dir1)/$(root1).maf}
#ENDLOOP
'EOF'
    # << emacs

    # List scaffolds in the dir structure created by mafSplit above:
    # Each chrom.sizes line "scaffold_N <size>" is blanked by the
    # substitution and replaced by "d1/d0/scaffold_N", where d0 is the ones
    # digit of N and d1 its tens digit; the newline left over from the
    # blanked line (printed by -p) terminates each entry.
    perl -wpe 's/^scaffold_(\d+)\s+.*$//; $d0 = $1 % 10; $d1 = ($1/10) % 10; \
               print "$d1/$d0/scaffold_$1";' \
      /cluster/data/xenTro2/chrom.sizes > scaf.lst
    # Make corresponding output directory structure:
    # (../maf/<d1>/<d0>/ for every digit pair, matching the paths in scaf.lst)
    foreach d1 (0 1 2 3 4 5 6 7 8 9)
      mkdir ../maf/$d1
      foreach d0 (0 1 2 3 4 5 6 7 8 9)
        mkdir ../maf/$d1/$d0
      end
    end

    gensub2 scaf.lst single spec jobList
    para make jobList
    para time
#Completed: 19759 of 19759 jobs
#CPU time in finished jobs:       8981s     149.68m     2.49h    0.10d  0.000 y
#IO & Wait Time:                 51309s     855.15m    14.25h    0.59d  0.002 y
#Average job time:                   3s       0.05m     0.00h    0.00d
#Longest finished job:              44s       0.73m     0.01h    0.00d
#Submission to last job:          3771s      62.85m     1.05h    0.04d


# ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE 4/28/2006 angie)
# -- mafFilter'd and reloaded 6/9/2006
    ssh kkr8u00
    mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno
    mkdir maf run
    cd run
    rm -f sizes nBeds
    foreach db (`cat /cluster/data/xenTro2/bed/multiz7way.2006-04-27/species.lst`)
      ln -s /cluster/data/$db/chrom.sizes $db.len
      if (! -e /cluster/data/$db/$db.N.bed) then
        twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
      endif
      ln -s  /cluster/data/$db/$db.N.bed $db.bed
      echo $db.bed  >> nBeds
      echo $db.len  >> sizes
    end

    ssh kki
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno/run
    cp /dev/null jobList
    # Write one mafAddIRows job per multiz maf, mirroring the two-level
    # maf/<d1>/<d0>/ layout under ../maf.  The outer loop variable has its
    # own name (topDir) so that the per-file "set d1" below does not
    # overwrite the variable the outer foreach is iterating with.
    foreach topDir (/cluster/data/xenTro2/bed/multiz7way.2006-04-27/maf/*)
      echo $topDir
      foreach f ($topDir/*/*.maf)
        # $f is .../maf/<d1>/<d0>/<name>.maf
        set maf = $f:t
        set d1 = $f:h:h:t
        set d0 = $f:h:t
        echo mafAddIRows -nBeds=nBeds -sizes=sizes $f \
          /scratch/hg/xenTro2/xenTro2.2bit ../maf/$d1/$d0/$maf >> jobList
      end
    end
    # Make corresponding output directory structure:
    foreach d1 (0 1 2 3 4 5 6 7 8 9)
      mkdir ../maf/$d1
      foreach d0 (0 1 2 3 4 5 6 7 8 9)
        mkdir ../maf/$d1/$d0
      end
    end
    para make jobList
    para time
#Completed: 19759 of 19759 jobs
#CPU time in finished jobs:      40336s     672.27m    11.20h    0.47d  0.001 y
#IO & Wait Time:                 50270s     837.83m    13.96h    0.58d  0.002 y
#Average job time:                   5s       0.08m     0.00h    0.00d
#Longest finished job:               8s       0.13m     0.00h    0.00d
#Submission to last job:          5667s      94.45m     1.57h    0.07d

    # Consolidate multi-level maf to monolithic file
    # No need to sort chunks by position because the chunk size is greater 
    # than the largest scaffold size.  That may not be true in other 
    # assemblies.
    ssh kkstore04
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno
    cp /dev/null xenTro2.maf
    # Files are appended in chrom.sizes order.  The perl one-liner maps
    # scaffold_N to maf/<d1>/<d0>/scaffold_N.maf, where d0 = N % 10 and
    # d1 = (N/10) % 10.
    foreach scaf (`awk '{print $1;}' /cluster/data/xenTro2/chrom.sizes`)
      set f = `echo $scaf | perl -wpe 's/scaffold_(\d+)//; \
                              $d0 = $1 % 10; $d1 = ($1/10) % 10; \
                              print "maf/$d1/$d0/scaffold_$1.maf";'`
      cat $f >> xenTro2.maf
    end
    # 6/9/2006 -- mafFilter rejected two single-row (xenTro2 only) blocks due to 
    # its default minRow of 2.  That's reasonable, so replace the original with 
    # the filtered version (and reload the db tables based on the file).
    mafFilter -overlap -reject=rf xenTro2.maf > xenTro2.mf.maf
#rejected 2 blocks
    # Keep a compressed copy of the unfiltered file before replacing it.
    gzip -c xenTro2.maf > xenTro2.preFilter.maf.gz
    mv xenTro2.mf.maf xenTro2.maf

    # Load annotated maf
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno
    # The table is loaded from the /gbdb symlink, not from the build path.
    mkdir -p /gbdb/xenTro2/multiz7way/anno
    ln -s /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno/xenTro2.maf \
      /gbdb/xenTro2/multiz7way/anno/
    time nice hgLoadMaf -pathPrefix=/gbdb/xenTro2/multiz7way/anno xenTro2 multiz7way
#Loaded 1610296 mafs in 1 files from /gbdb/xenTro2/multiz7way/anno
#old hgwdev:
#126.780u 41.500s 7:15.57 38.6%  0+0k 0+0io 264pf+0w
#new hgwdev:
#31.754u 8.324s 1:12.56 55.2%    0+0k 0+0io 3pf+0w

    # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
    # machine and then load on hgwdev:
    ssh kkr8u00
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno
    # NOTE(review): -test presumably leaves multiz7waySummary.tab in this
    # directory without loading it; that is the file loaded on hgwdev below.
    time nice hgLoadMafSummary xenTro2 -minSize=30000 -mergeGap=1500 \
             -maxSize=200000 -test multiz7waySummary xenTro2.maf
#Created 570630 summary blocks from 3284353 components and 1610296 mafs from xenTro2.maf
#55.767u 11.404s 1:08.32 98.3%   0+0k 0+0io 4pf+0w

    ssh hgwdev
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/anno
    # Derive the table definition from the generic mafSummary.sql by renaming
    # the table, then load the precomputed .tab file with it.
    sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
      > /tmp/multiz7waySummary.sql
    time nice hgLoadSqlTab xenTro2 multiz7waySummary \
      /tmp/multiz7waySummary.sql multiz7waySummary.tab
#old hgwdev:
#0.000u 0.000s 2:01.79 0.0%      0+0k 0+0io 234pf+0w
#new hgwdev:
#0.000u 0.001s 0:10.88 0.0%      0+0k 0+0io 3pf+0w
    rm *.tab
    # Relative symlink so that bed/multiz7way points at the dated build dir.
    ln -s multiz7way.2006-04-27 /cluster/data/xenTro2/bed/multiz7way


# MULTIZ7WAY DOWNLOADABLES (DONE 6/9/2006 angie)
    # Annotated MAF is now documented, so use anno/maf for downloads.
    ssh hgwdev
    mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/mafDownloads
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/mafDownloads
    # upstream mafs 
    # For each upstream size, featureBits writes the mgcGenes upstream regions
    # as bed, awk rewrites them as 6 columns (name truncated, fifth column
    # set to 0), and mafFrags extracts the multiz7way alignments for them.
    # NOTE(review): awk string positions are 1-based, so substr($4, 0, 9) may
    # keep 8 or 9 characters depending on the awk implementation -- confirm
    # which was intended before reusing this recipe.
cat > mafFrags.csh << 'EOF'
    date
    foreach i (1000 2000 5000)
        echo "making upstream$i.maf"
        nice featureBits xenTro2 mgcGenes:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
        rm up.bad
        nice mafFrags xenTro2 multiz7way up.bed upstream$i.maf \
                -orgs=../species.lst
        rm up.bed
    end
    date
'EOF'
    # << emacs
    # Run in the background and follow the log (stdout+stderr, overwriting).
    time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
# old hgwdev:
#628.650u 89.300s 14:54.56 80.2% 0+0k 0+0io 4617pf+0w
# new hgwdev:
#89.631u 24.492s 2:58.28 64.0%   0+0k 0+0io 0pf+0w

    # Make a gzipped version of the monolithic annotated maf file:
    ssh kkstore04
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27
    time nice gzip -c anno/xenTro2.maf > mafDownloads/xenTro2.maf.gz
#416.982u 2.804s 7:10.50 97.5%   0+0k 0+0io 0pf+0w

    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/mafDownloads
    # Compress the upstream*.maf files in place, then checksum every .gz here.
    time nice gzip up*.maf
#6.301u 0.096s 0:06.50 98.3%     0+0k 0+0io 0pf+0w
    time nice md5sum *.gz > md5sum.txt
#1.812u 0.369s 0:02.24 96.8%     0+0k 0+0io 0pf+0w

    # Publish: symlink the files into the htdocs download directory and use
    # the rn4 multiz9way README as the starting point for this one.
    ssh hgwdev
    set dir = /usr/local/apache/htdocs/goldenPath/xenTro2/multiz7way
    mkdir $dir
    ln -s /cluster/data/xenTro2/bed/multiz7way.2006-04-27/mafDownloads/{*.gz,md5sum.txt} $dir
    cp /usr/local/apache/htdocs/goldenPath/rn4/multiz9way/README.txt $dir
    # edit README.txt


# MULTIZ7WAY MAF FRAMES (DONE 4/28/2006 angie - REDONE 2006-06-09 markd)
    ssh hgwdev
    mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/frames
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/frames
    # The following is adapted from MarkD's Makefile used for mm7...

    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using mrna table as genes: danRer4
    mkdir genes
    foreach queryDb (danRer4)
      # mktemp creates a placeholder file in the current directory; its name
      # is used only as a unique suffix for the temporary files below and the
      # placeholder itself is removed at the end of the iteration.
      set tmpExt = `mktemp temp.XXXXXX`
      set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
      set tmpMrna = ${queryDb}.mrna.${tmpExt}
      set tmpCds = ${queryDb}.cds.${tmpExt}
      echo $queryDb
      hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                   from all_mrna,gbCdnaInfo,cds \
                   where (all_mrna.qName = gbCdnaInfo.acc) and \
                     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
       ${queryDb} > ${tmpMrnaCds}
      # Columns 1-2 (qName, cds.name) become the CDS file; the all_mrna row
      # starts at column 3.  NOTE(review): column 3 is presumably all_mrna's
      # bin column, which is why the psl is taken from column 4 on -- confirm.
      cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
      cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
      mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
        stdout \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$queryDb.tmp.gz
      rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
      # Written to /scratch/tmp first and only then moved into genes/.
      mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
      rm -f $tmpExt
    end
    # using mgcGenes for xenTro2
    # using knownGene for rn4 mm8 hg18
    # using refGene for galGal2
    # no genes for monDom4
    # genePreds; (must keep only the first 10 columns for knownGene)
    # NOTE(review): cut -f 1-10 assumes none of these tables has a leading
    # bin column; with a bin column the last genePred field would be dropped.
    # This loop creates no mktemp placeholder, so it has no temp file to
    # remove.  (An "rm -f $tmpExt" here would refer to a variable this loop
    # never sets, which csh treats as an error if the variable is undefined.)
    foreach queryDb (xenTro2 rn4 mm8 hg18 galGal2)
      if ($queryDb == "xenTro2") then
        set geneTbl = mgcGenes
      else if ($queryDb == "galGal2") then
        set geneTbl = refGene
      else
        set geneTbl = knownGene
      endif
      hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$queryDb.tmp.gz
      # Written to /scratch/tmp first and only then moved into genes/.
      mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
    end

    #------------------------------------------------------------------------
    # create frames
    # All cluster-side inputs and outputs live under $clusterDir.
    set clusterDir = /cluster/bluearc/xenTro2/multiz7wayFrames
    set multizDir = /cluster/data/xenTro2/bed/multiz7way.2006-04-27
    set mafDir = $multizDir/mafDownloads
    set geneDir = $multizDir/frames/genes
    set clusterMafDir = ${clusterDir}/maf
    set clusterGeneDir = ${clusterDir}/genes
    set clusterFramesDir = ${clusterDir}/mafFrames.kki

    # copy mafs to cluster storage
    mkdir $clusterDir
    ssh -x kkstore04 "rsync -av $mafDir/xenTro2.maf.gz $clusterMafDir/"

    # copy genes to cluster storage
    ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"

    # run cluster jobs
    # mktemp creates a placeholder file here; its name only makes the para
    # directory name unique and the placeholder is removed after the loop.
    set tmpExt = `mktemp temp.XXXXXX`
    set paraDir = $multizDir/frames/para.${tmpExt}
    mkdir mafFrames $paraDir
    rm -f $paraDir/jobList
    mkdir ${clusterFramesDir}
    # One output directory per species in species.lst, but a job only for
    # species that have a gene set in $clusterGeneDir.
    foreach queryDb (`cat /cluster/data/xenTro2/bed/multiz7way.2006-04-27/species.lst`)
      mkdir ${clusterFramesDir}/${queryDb}
      if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then
        echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} xenTro2 \
          ${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/xenTro2.maf.gz \
          ${clusterFramesDir}/${queryDb}/xenTro2.mafFrames \
          >> $paraDir/jobList
      endif
    end
    rm -f $tmpExt
    ssh -x kki "cd ${paraDir} && para make jobList && para time"
#Completed: 12 of 12 jobs
#CPU time in finished jobs:        354s       5.89m     0.10h    0.00d  0.000 y
#IO & Wait Time:                    46s       0.77m     0.01h    0.00d  0.000 y
#Average job time:                  33s       0.56m     0.01h    0.00d
#Longest finished job:              37s       0.62m     0.01h    0.00d
#Submission to last job:            37s       0.62m     0.01h    0.00d

    # combine results from cluster
    # Uses the $clusterFramesDir and $multizDir variables set earlier in this
    # section; writes one gzipped mafFrames file per species.
    foreach queryDb (`cat ../species.lst`)
      echo $queryDb
      ssh -x kolossus "cat ${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz"
    end

    #------------------------------------------------------------------------
    # load the database
    hgLoadMafFrames xenTro2 multiz7wayFrames mafFrames/*.mafFrames.gz

    #------------------------------------------------------------------------
    # clean up
    # Removes the cluster copies of the mafs and genes and the per-species
    # frames, all of which are under $clusterDir.
    rm -rf ${clusterDir}

    ###
    # rebuild frames to get bug fix, using 1-pass maf methodology
    # (2006-06-09 markd)
    ssh kkstore04
    cd /cluster/data/xenTro2/bed/multiz7way/frames
    mv mafFrames/ mafFrames.old
    nice tcsh # easy way to get process niced
    # One pass over all annotated mafs, with (db, genePred file) pairs for
    # every species that has a gene set.  Runs in the background; stdout and
    # stderr of the whole pipeline (including the time output) go to "log".
    (find  ../anno/maf -name '*.maf'|xargs cat | time genePredToMafFrames xenTro2 stdin stdout danRer4 genes/danRer4.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rn4 genes/rn4.gp.gz xenTro2 genes/xenTro2.gp.gz | gzip >multiz7way.mafFrames.gz)>&log&
    # NOTE(review): wait for the background job above to finish before
    # loading.  The load below also redirects to "log" in this same
    # directory, so it overwrites the log of the genePredToMafFrames run.
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/multiz7way/frames

    hgLoadMafFrames xenTro2 multiz7wayFrames multiz7way.mafFrames.gz >&log&


# PHASTCONS (DONE 5/2/2006 angie - REDONE w/pruned ENCODE model 6/14/06)
# Using Kate's process from makeHg17.doc.
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track.  Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model 
# from the (37-way) ENCODE alignments.

    ssh kkstore04
    # Prune Elliot's model to just our 7 species:
    # The model is pruned using the assembly names listed in --prune-all-but
    # and the species are then renamed to the assemblies used in this 7-way
    # alignment; galGal2 keeps its name, so it has no rename entry.
    mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons
    /cluster/bin/phast/tree_doctor \
      --prune-all-but=rn3,mm7,hg17,monDom2,galGal2,xenTro1,danRer3 \
      --rename="rn3 -> rn4 ; mm7 -> mm8 ; hg17 -> hg18 ; monDom2 -> monDom4 ; \
                xenTro1 -> xenTro2 ; danRer3 -> danRer4" \
      /cluster/data/hg17/bed/multiz17way/cons/elliotsEncode.mod \
      > elliotsEncodePruned.mod

    # Split MAF into windows and use to generate
    # "sufficient statistics" (ss) files for phastCons input
    # 6/14/06: For a chrom-based genome we would run a splitting job on the 
    # small cluster.  However, since this is scaffold-based and we have 
    # thousands of files (for sequences so small that they don't even get 
    # split), the I/O time of a cluster run is huge and wasteful.  
    # So run this directly on the fileserver.
    # The script below removes and recreates $WINDOWS, then for every
    # non-empty ../maf/<d1>/<d0>/scaf*.maf extracts that scaffold's sequence
    # from the 2bit into /scratch/tmp, runs msa_split on the maf with it, and
    # writes the .ss output under $WINDOWS/<d1>/<d0>/ (same layout as the maf).
    # The script runs with csh -fex, so it stops at the first failing command.
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons
    cat > doSplitOnFileserver.csh << '_EOF_'
#!/bin/csh -fex
set WINDOWS = /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ss
set tmpDir = /scratch/tmp
rm -fr $WINDOWS
mkdir -p $WINDOWS
date
# Make directory structure corresponding to maf and fill with .ss:
foreach d1 (0 1 2 3 4 5 6 7 8 9)
  mkdir $WINDOWS/$d1
  foreach d0 (0 1 2 3 4 5 6 7 8 9)
    mkdir $WINDOWS/$d1/$d0
    foreach f (../maf/$d1/$d0/scaf*.maf)
      # skip the maf files that have only comments -- those crash msa_split:
      if (`grep -v ^\# $f | wc -l`) then
        set c = $f:t:r
        twoBitToFa /cluster/data/xenTro2/xenTro2.2bit -seq=$c $tmpDir/$c.fa
        /cluster/bin/phast/$MACHTYPE/msa_split $f -i MAF \
            -M $tmpDir/$c.fa \
            -o SS -r $WINDOWS/$d1/$d0/$c -w 10000000,0 -I 1000 -B 5000
        rm -f $tmpDir/$c.fa
      endif
    end
  end
end
date
'_EOF_'
    # << emacs
    chmod a+x doSplitOnFileserver.csh
    nice ./doSplitOnFileserver.csh >& split.log & tail -f split.log
    # Took 15 minutes (was 1 hour on small cluster due to I/O)

    # check tree model on a single chunk, using params recommended by Adam,
    # (to verify branch lengths on 2X species -- though we aren't using any 
    #  of those here)
    # The chunk used is scaffold_1 (largest scaffold, 7817814 bases); the
    # output root is phyloFit.tree, and the resulting phyloFit.tree.mod is
    # read again later when 7way.phyloFit.nh is made.
    ssh kolossus
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
        --tree "`cat ../tree-commas.nh`" \
        /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ss/0/1/scaffold_1.1-7817814.ss \
        -o phyloFit.tree
    # Comment from makeHg17.doc:
    # # he ok'ed the results -- not necessary for next human run

    # 6/14/06:
    # The elliotsEncode.mod gives about twice as much coverage as 
    # I thought we wanted, despite extremely low --target-coverage, 
    # but Adam and Jim say the coverage from phyloFit.tree is too low.
    # When I loaded wiggles made with elliotsEncode and phyloFit into 
    # the browser for visual comparison, the elliotsEncode wiggle was maxed 
    # out (~1 wherever anything aligned) while the phyloFit wiggle showed a 
    # lot more dynamics.  So initially I went with phyloFit for that reason,
    # but it was not a good reason.  I'll rerun with elliotsEncode.

    # Run phastCons
    # doPhast.csh <dir> <file> <expected-length> <target-coverage> <rho>
    #   <dir>/<file>.ss is looked up under the bluearc phastCons/ss directory.
    #   The script copies the .ss and the pruned model to a private
    #   /scratch/tmp/<file> directory, runs phastCons there, then moves the
    #   .pp (scores, from stdout) and .bed (--viterbi elements) output to
    #   pp/<dir> and bed/<dir> on bluearc and removes the temp directory.
    #   $f:r:r strips up to two dot-suffixes from <file> to get the sequence
    #   name used for --seqname and --idpref.
    mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons
    cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set d = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons
cp -p $san/ss/$d/$f.ss ../elliotsEncodePruned.mod $tmp
pushd $tmp > /dev/null
set c = $f:r:r
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \
  --rho $rho --expected-length $len --target-coverage $cov --quiet \
  --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$d $san/bed/$d
sleep 1
mv $tmp/$f.pp $san/pp/$d
mv $tmp/$f.bed $san/bed/$d
rm -fr $tmp
'EOF'
    # << emacs
    chmod a+x doPhast.csh

    # 6/14/06: Estimate rho on scaffold_1
    # Run from run.cons, hence ../elliotsEncodePruned.mod.  The estimate
    # reported below (0.255026) is rounded to .26 in the template.
    /cluster/bin/phast/$MACHTYPE/phastCons --estimate-rho /tmp/estimatedRho.mod \
      --target-coverage 0.005 --expected-length 12 --no-post-probs \
      /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ss/0/1/scaffold_1.*.ss \
      ../elliotsEncodePruned.mod
#(rho = 0.255026)
# for target-coverage 0.1: (rho = 0.223770)

    # Create gsub file
    # The three numbers are doPhast.csh's expected-length, target-coverage
    # and rho arguments, in that order.
    cat > template << 'EOF'
#LOOP
doPhast.csh $(dir1) $(file1) 12 .005 .26
#ENDLOOP
'EOF'
    # << emacs

    # Create parasol batch and run it
    ssh kki
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons
    # Build in.list from inside the ss directory so that each entry is a
    # relative <d1>/<d0>/<file> path with the .ss suffix removed; within each
    # directory the files are listed largest first (ls -S).
    pushd /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/ss
    cp /dev/null /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons/in.list
    foreach d (*/*)
      ls -1S $d/*.ss | sed 's/.ss$//' \
        >> /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/run.cons/in.list
    end
    popd

    gensub2 in.list single template jobList
    para make jobList
    para time
#Completed: 5587 of 5587 jobs
#CPU time in finished jobs:       3281s      54.68m     0.91h    0.04d  0.000 y
#IO & Wait Time:                 20478s     341.31m     5.69h    0.24d  0.001 y
#Average job time:                   4s       0.07m     0.00h    0.00d
#Longest finished job:              28s       0.47m     0.01h    0.00d
#Submission to last job:          1489s      24.82m     0.41h    0.02d

    # create Most Conserved track
    ssh kolossus
    cd /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons
    # The sed's and the sort get the file names in scaffold,start order.
    # (Hiram tricks -- split into columns on [.-/] with identifying x,y,z,
    #  to allow column sorting and restoring the filename.)
    # Warning: the sort columns depend on how deep you are in the dir tree.
    # Here the paths are ./bed/<d1>/<d0>/scaffold_N.<start>-<end>.bed, which
    # the first sed turns into the whitespace-separated columns
    #   y x bed x <d1> x <d0> x scaffold_N y <start> z <end> y bed
    # so the scaffold name is column 9 and the start is column 11.
    # (The keys -k7,7 -k9,9n, presumably carried over from a layout with one
    #  directory level fewer, would sort on <d0> and on the non-numeric name.)
    find ./bed -name "*.bed" \
    | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
    | sort -k9,9 -k11,11n \
    | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
    | xargs cat \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements7way.bed
    # Keep a copy in the build directory, where coverage is measured and the
    # track is loaded from.
    cp -p phastConsElements7way.bed /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons

    # Measure coverage.  If good, load elements into database and proceed with wiggle.
    # Try for somewhere in the neighborhood of 5% overall cov, and 70% CDS cov.
    # (Jim tried for 4% overall in xenTro1)
    # The results recorded for each iteration below are the output of this
    # featureBits command; after each run the bed file is renamed with the
    # (len cov rho) parameters that produced it.
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons
    featureBits xenTro2 -enrichment mgcGenes:cds phastConsElements7way.bed

    # FIRST ITERATION: elliotsEncode, doPhast (len cov rho) = (14 .007 .27)
#mgcGenes:cds 0.235%, phastConsElements7way.bed 10.972%, both 0.196%, cover 83.49%, enrich 7.61x
    mv phastConsElements7way.bed phastConsElements7way_14_007_27.bed

    # SECOND ITERATION: elliotsEncode, doPhast (len cov rho) = (12 .005 .27)
#mgcGenes:cds 0.235%, phastConsElements7way.bed 10.561%, both 0.195%, cover 82.94%, enrich 7.85x
# just scaffold_1:
#mgcGenes:cds 0.246%, phastConsElements7way.bed 11.117%, both 0.212%, cover 86.33%, enrich 7.77x
    mv phastConsElements7way.bed phastConsElements7way_12_005_27.bed

    # THIRD ITERATION: phyloFit, doPhast (len cov rho) = (12 .100 .339)
#mgcGenes:cds 0.235%, phastConsElements7way.bed 0.727%, both 0.059%, cover 25.02%, enrich 34.43x
    mv phastConsElements7way.bed phastConsElements7way_12_100_339.bed

    # FOURTH ITERATION: phyloFit, doPhast (len cov rho) = (12 .100 .3)
#mgcGenes:cds 0.235%, phastConsElements7way.bed 0.683%, both 0.054%, cover 22.86%, enrich 33.45x
    mv phastConsElements7way.bed phastConsElements7way_12_100_300.bed

    # FIFTH ITERATION: phyloFit, doPhast (len cov rho) = (12 .50 .339)
#mgcGenes:cds 0.235%, phastConsElements7way.bed 1.353%, both 0.091%, cover 38.78%, enrich 28.67x
    mv phastConsElements7way.bed phastConsElements7way_12_500_339.bed

    # SIXTH ITERATION: elliotsEncode, doPhast (len cov rho) = (12 .005 .26)
#mgcGenes:cds 0.235%, phastConsElements7way.bed 10.377%, both 0.194%, cover 82.42%, enrich 7.94x
    mv phastConsElements7way.bed phastConsElements7way_12_005_26.bed

    # When happy:
    hgLoadBed -strict xenTro2 phastConsElements7way phastConsElements7way_12_005_26.bed

    # Create merged posterior probability file and wiggle track data files
    ssh kolossus
    cd /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    # The paths are ./pp/<d1>/<d0>/scaffold_N.<start>-<end>.pp, which the
    # first sed turns into the whitespace-separated columns
    #   y x pp x <d1> x <d0> x scaffold_N y <start> z <end> y pp
    # so the scaffold name is column 9 and the start is column 11.
    # (The keys -k7,7 -k9,9n, presumably carried over from a layout with one
    #  directory level fewer, would sort on <d0> and on the non-numeric name.)
    time find ./pp -name "*.pp" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
        sort -k9,9 -k11,11n | \
        sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
        xargs cat | \
        nice wigEncode -noOverlap stdin phastCons7way.wig phastCons7way.wib

    # Copy both the .wig and the .wib to the build directory.
    cp -p phastCons7way.wi? \
      /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons
    ln -s `pwd`/phastCons7way.wib /gbdb/xenTro2/multiz7way/
    hgLoadWiggle -pathPrefix=/gbdb/xenTro2/multiz7way xenTro2 \
        phastCons7way phastCons7way.wig

    # Make .jpg for tree, check in to browser/images and install in
    # htdocs/images/phylo/... don't forget to request a push of that
    # file.  The treeImage setting in trackDb.ra is
    # phylo/xenTro2_7way.jpg (relative to htdocs/images).  
    # Use 7way.nh from the model used in the final run: 
    # elliotsEncodePruned.mod .
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27
    mv 7way.nh 7way.elliotsEncode.nh
    # For comparison, also extract the phyloFit tree: the last line of
    # phyloFit.tree.mod with its "TREE: " prefix removed.
    tail -1 phastCons/phyloFit.tree.mod | sed -e 's/^TREE: //' \
      > 7way.phyloFit.nh

    # distances.txt lists, for every all_dists line mentioning xenTro2, the
    # distance (column 3) and the name in column 2, sorted by distance.
    /cluster/bin/phast/all_dists 7way.elliotsEncode.nh > 7way.distances.txt
    grep xenTro2 7way.distances.txt | sort -k3,3n | \
        awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
    cat distances.txt
#1.3604  chicken_galGal2
#1.4357  human_hg18
#1.4659  monodelphis_monDom4
#1.7936  mouse_mm8
#1.7983  rat_rn4
#1.8771  zebrafish_danRer4
    # the order in the browser display will be by tree topology,
    # not by distance.

    # Just for reference, these were the distances from phyloFit.tree.mod...
    # Quite different!
#0.4899  galGal2
#0.5621  monDom4
#0.5738  hg18
#0.6038  mm8
#0.6074  danRer4
#0.6075  rn4

    # Draw the tree as PostScript, then convert to PDF and to the .jpg used
    # by the browser.
    /cluster/bin/phast/draw_tree 7way.elliotsEncode.nh > 7way.ps
    # ps2pdf takes the output file as an argument; it does not write the PDF
    # to stdout, so do not redirect.
    ps2pdf 7way.ps 7way.pdf
    pstopnm -stdout 7way.ps | pnmtojpeg > xenTro2_7way.jpg

    # If you haven't already, check out the browser CVS tree in your ~/:
    # (cvs's global -d <cvsroot> option goes before the "co" command; after
    #  "co", -d would instead name the directory to check out into.)
    # (cd; cvs -d hgwdev:/projects/hg/cvsroot co browser)
    cp xenTro2_7way.jpg ~/browser/images/phylo/
    cd ~/browser/images/phylo
    cvs add xenTro2_7way.jpg
    cvs ci xenTro2_7way.jpg
    cd ../..
    cvsup
    make alpha


# PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE 5/2/06 angie - REDONE 6/14/06)
    ssh kolossus
    mkdir /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastConsDownloads
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastConsDownloads
    set ppDir = /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons/pp
    cp /dev/null xenTro2.pp
    # No need to sort chunks by position because the chunk size is greater 
    # than the largest scaffold size.  That may not be true in other 
    # assemblies.
    # Scores are appended in chrom.sizes order.  The perl one-liner maps
    # scaffold_N to its <d1>/<d0> directory (d0 = N % 10, d1 = (N/10) % 10);
    # the egrep then picks that scaffold's scaffold_N.1-<end>.pp file, and
    # scaffolds with no .pp file are skipped.
    foreach scaf (`awk '{print $1;}' /cluster/data/xenTro2/chrom.sizes`)
      set d = `echo $scaf | perl -wpe 's/scaffold_(\d+)//; \
                              $d0 = $1 % 10; $d1 = ($1/10) % 10; \
                              print "$d1/$d0";'`
      set f = `ls -1 $ppDir/$d | egrep ^$scaf'\.1-[0-9]+\.pp'`
      if ("x$f" != "x") then
        cat $ppDir/$d/$f >> xenTro2.pp
      endif
    end
    nice gzip xenTro2.pp
    md5sum xenTro2.pp.gz > md5sum.txt

    # Publish: symlink the files into the htdocs download directory and use
    # the rn4 phastCons9way README as the starting point for this one.
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastConsDownloads
    set dir = /usr/local/apache/htdocs/goldenPath/xenTro2/phastCons7way
    mkdir $dir
    ln -s /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastConsDownloads/{*.gz,md5sum.txt} $dir
    cp /usr/local/apache/htdocs/goldenPath/rn4/phastCons9way/README.txt $dir
    # edit README.txt
    # Clean up after phastCons run.
    # This removes the whole bluearc phastCons directory (ss, pp and bed), so
    # do it only after the elements, wiggle and downloads have been made.
    ssh kkstore04
    rm /cluster/data/xenTro2/bed/multiz7way.2006-04-27/phastCons/*.tab
    rm -r /cluster/bluearc/xenTro2/multiz7way.2006-04-27/phastCons


# ENSEMBL is on 4.0 not 4.1... do the coords mostly carry over???

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney2006-06-16)
    ssh kkstore04
    bash # if not using bash shell already

    # Build nucleotide blast databases: split the genome by sequence into
    # about 500 fasta files and run formatdb on each, keeping only the
    # database files.
    mkdir -p /cluster/data/xenTro2/blastDb
    cd /cluster/data/xenTro2/blastDb
    faSplit sequence ../xenTro2.fa  500 x

    for i in *.fa
    do
       /cluster/bluearc/blast229/formatdb -p F -i $i
    done
    rm *.log *.fa 

    # Copy the database files (.nhr .nin .nsq) to san scratch for the cluster.
    mkdir -p /san/sanvol1/scratch/xenTro2/blastDb
    cd /cluster/data/xenTro2/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/xenTro2/blastDb
    done

    # query.lst: one blast database per line (path without .nsq), listed
    # largest first (ls -S).
    mkdir -p /cluster/data/xenTro2/bed/tblastn.hg18KG
    cd /cluster/data/xenTro2/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/xenTro2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 496 query.lst
   # we want around 150000 jobs
   # lines per protein chunk = (psl lines) / (150000 / number of databases)
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(150000/496) = 121.443947
   # Split the psl into 121-line chunks on bluearc and convert each chunk to
   # fasta, removing the psl chunk afterwards.
   mkdir -p /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/kgfa
   split -l 121 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   # Output lives on bluearc (symlinked here as blastOut), with one
   # subdirectory per protein chunk.
   mkdir -p /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/xenTro2/bed/tblastn.hg18KG
   # gensub2 template: one blastSome job per (blast database, protein chunk)
   # pair; the job's output is blastOut/<chunk>/q.<database>.psl.
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   # blastSome <blastDb> <proteinFa> <outPsl>
   #   Runs tblastn, retrying with progressively stricter e-values until
   #   blastall succeeds, converts the report to psl with blastToPsl, and
   #   lifts the query (protein) side with protein.lft.  The result is
   #   written to <outPsl>.tmp and renamed to <outPsl> only if pslCheck
   #   accepts it.
   #   The blast databases were built from whole sequences (faSplit
   #   sequence), so there is no target-side lift: the protein liftUp must
   #   read blastToPsl's output, $f.2.  (Nothing in the script creates $f.4,
   #   so reading $f.4 here could never produce any output.)
   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2

        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit # back to bash
    
    # Run the blastSpec batch created by gensub2 on the pk cluster.
    ssh pk
    cd /cluster/data/xenTro2/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time
# Completed: 150784 of 150784 jobs
# CPU time in finished jobs:   12376974s  206282.91m  3438.05h  143.25d  0.392 y
# IO & Wait Time:                603271s   10054.51m   167.58h    6.98d  0.019 y
# Average job time:                  86s       1.43m     0.02h    0.00d
# Longest finished job:             462s       7.70m     0.13h    0.01d
# Submission to last job:         80194s    1336.57m    22.28h    0.93d

    ssh kkstore04
    cd /cluster/data/xenTro2/bed/tblastn.hg18KG
    tcsh
    mkdir chainRun
    cd chainRun
    # gensub2 template: one chainOne job per blastOut/<chunk> directory.
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    # chainOne <chunkDir>: chain all q.*.psl in the directory into
    # blastOut/c.<chunk>.psl on bluearc.
    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    # Stay in tcsh until chain.lst and chainSpec have been made: the cd into
    # chainRun happened inside tcsh, so after "exit" the working directory is
    # back to tblastn.hg18KG, where chainOne does not exist and where
    # "para create chainSpec" (run from chainRun below) would not look.
    chmod +x chainOne
    ls -1dS /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    exit
    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/xenTro2/bed/tblastn.hg18KG/chainRun
    para create chainSpec
#    para try, check, push, check etc.
# Completed: 304 of 304 jobs
# CPU time in finished jobs:        764s      12.73m     0.21h    0.01d  0.000 y
# IO & Wait Time:                 11950s     199.17m     3.32h    0.14d  0.000 y
# Average job time:                  42s       0.70m     0.01h    0.00d
# Longest finished job:              77s       1.28m     0.02h    0.00d
# Submission to last job:          2117s      35.28m     0.59h    0.02d

    ssh kkstore04
    cd /cluster/data/xenTro2/bed/tblastn.hg18KG/blastOut
    bash # if using another shell
    # PSL columns used below: 1 = matches, 11 = qSize, 12 = qStart,
    # 13 = qEnd, 14 = tName, 16 = tStart, 17 = tEnd.
    # For each protein chunk:
    #   c60 = chains spanning more than 60% of the query,
    #   u   = c60 sorted by descending match count and passed through pslUniq,
    #   m60 = c60 chains whose matches exceed 60% of the query size.
    for i in kg??
    do
       awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    # Merge, sort by target position and drop duplicate lines.
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/xenTro2/bed/tblastn.hg18KG/blastHg18KG.psl
    # The merged file was written to the parent directory, not to blastOut.
    cd ..
    pslCheck blastHg18KG.psl
    # this is ok.
    # load table 
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/tblastn.hg18KG
    hgLoadPsl xenTro2 blastHg18KG.psl
    # check coverage
    featureBits xenTro2 refGene:cds  blastHg18KG -enrichment
# refGene:cds 0.337%, blastHg18KG 1.477%, both 0.263%, cover 78.14%, enrich 52.91x
    
    # Remove the blastOut symlink in the build directory and the real
    # output directory on bluearc.
    ssh kkstore04
    rm -rf /cluster/data/xenTro2/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/xenTro2/bed/tblastn.hg18KG/blastOut
#end tblastn

##########################################################################
# SWAP CHAINS/NET GALGAL3 (DONE 7/20/06 angie)
    # Swap the galGal3-referenced blastz run (identified by its DEF file)
    # into a xenTro2 build directory.
    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/blastz.galGal3.swap
    cd /cluster/data/xenTro2/bed/blastz.galGal3.swap
    # the pipeline runs in the background with stdout and stderr going to
    # do.log; tail -f follows that log in the foreground
    doBlastzChainNet.pl -swap /cluster/data/galGal3/bed/blastz.xenTro2/DEF \
      -workhorse kkr8u00 >& do.log & tail -f do.log
    # blastz.galGal3 becomes a relative symlink to the swap directory
    ln -s blastz.galGal3.swap /cluster/data/xenTro2/bed/blastz.galGal3


##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

   # genbank release 157.0 now contains misc_diff fields for MGC clones
   # reloading mRNAs results in gbMiscDiff table being created.
   # NOTE(review): the command uses a relative path and the working directory
   # is not recorded here -- presumably the genbank build root; confirm.
   ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna xenTro2

##########################################################################
## WindowMasker masked sequence (DONE - 2007-02-20 - Hiram)
    # Three stages on three hosts, all in the same work directory:
    # counts + sdust on kolossus, masking + TRF on kkstore05, load on hgwdev.
    ssh kolossus
    ##	This directory was placed on store12 since xenTro2 filesystem
    ##	was full.  This is actually a symlink
    mkdir /cluster/data/xenTro2/bed/WindowMasker.2007-02-19
    cd /cluster/data/xenTro2/bed/WindowMasker.2007-02-19
    ## copy the .csh scripts from mm8 WindowMasker run, edit to fixup
    #	reference to correct DB and work directory.
    #	(the scripts used below are doCounts.csh, doSdust.csh, applyMask.csh,
    #	addTrf.csh and load.csh)
    time nice -n +19 ./doCounts.csh > doCounts.out 2>&1
    time nice -n +19 ./doSdust.csh > doSdust.out 2>&1
    
    ssh kkstore05
    cd /cluster/data/xenTro2/bed/WindowMasker.2007-02-19
    #	compress the *.counts and *.bed files before applying the mask
    gzip *.counts *.bed
    nice -n +19 ./applyMask.csh
    #	this addTrf properly gets the n's changed to N which WM masked
    nice -n +19 ./addTrf.csh
    #	measuring faSize of resulting xenTro2.sdTrf.2bit:
    #	1513925492 bases (154525475 N's 1359400017 real
    #	835902481 upper 523497536 lower) in 19759 sequences in 1 files
    #	38.51% = 523497536 / 1359400017
    #	vs. existing xenTro2.2bit
    #	1513925492 bases (154525475 N's 1359400017 real
    #	1092281194 upper 267118823 lower) in 19759 sequences in 1 files
    #	19.65% = 267118823 / 1359400017

    ssh hgwdev
    cd /cluster/data/xenTro2/bed/WindowMasker.2007-02-19
    time nice -n +19 ./load.csh > load.out 2>&1
    #	recorded result of the load:
    #	Loaded 7013938 elements of size 3
    #	real    2m29.659s

#########################################################################
## BLASTZ SWAP Lizard/anoCar1 - (DONE - 2007-02-22 - Hiram)
    #	measurement of anoCar1 coverage by Frog
    #	NOTE(review): host and working directory of this first featureBits
    #	run are not recorded here; its output file lands in whatever the
    #	current directory was.
    time nice -n +19 featureBits anoCar1 chainXenTro2Link \
	> fb.anoCar1.chainXenTro2Link.txt 2>&1
    #	real    11m33.086s
    #	83873500 bases of 1741478929 (4.816%) in intersection

    ## the swap
    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/blastz.anoCar1.swap
    cd /cluster/data/xenTro2/bed/blastz.anoCar1.swap

    #	runs in the background; stdout and stderr are collected in swap.log
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20/DEF \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap > swap.log 2>&1 &
    #	real    136m3.288s

    #	the same coverage measurement, now on the xenTro2 side
    ssh hgwdev
    cd /cluster/data/xenTro2/bed/blastz.anoCar1.swap
    time nice -n +19 featureBits xenTro2 chainAnoCar1Link \
	> fb.xenTro2.chainAnoCar1Link.txt 2>&1
    #	84514985 bases of 1359412157 (6.217%) in intersection
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
xenTro2.upstreamGeneTbl = mgcGenes
xenTro2.upstreamMaf = multiz7way /hive/data/genomes/xenTro2/bed/multiz7way/species.lst
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-23 - Galt)
    # original alignment to danRer6
    # (coverage recorded by the danRer6-referenced run, for comparison)
    cd /hive/data/genomes/danRer6/bed/lastzXenTro2.2009-12-22
    cat fb.danRer6.chainXenTro2Link.txt
    #  100078259 bases of 1506896106 (6.641%) in intersection

    #   running the swap - DONE - 2009-12-23
    #   NOTE(review): "nice +19" and ">&" look like tcsh syntax (other
    #   sections use "nice -n +19" and "2>&1") -- confirm the shell before
    #   pasting this into bash.
    mkdir /hive/data/genomes/xenTro2/bed/blastz.danRer6.swap
    cd /hive/data/genomes/xenTro2/bed/blastz.danRer6.swap
    time nice +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/danRer6/bed/lastzXenTro2.2009-12-22/DEF \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -swap >& swap.log &

    #   coverage after the swap, read from the swap directory
    cat fb.xenTro2.chainDanRer6Link.txt
    #   92089833 bases of 1359412157 (6.774%) in intersection

#######################################################################
# lastz swap from danRer7 (DONE - 2010-12-20 - Hiram)
    # Coverage of the original, danRer7-referenced alignment:
    cat /hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17/fb.danRer7.chainXenTro2Link.txt
    #	90625809 bases of 1409770109 (6.428%) in intersection

    # Swap into a fresh xenTro2 build directory; the job runs in the
    # background with all of its output going to swap.log.
    swapDir=/hive/data/genomes/xenTro2/bed/blastz.danRer7.swap
    mkdir "$swapDir"
    cd "$swapDir"
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    32m57.901s
    # Coverage after the swap, xenTro2-referenced:
    cat fb.xenTro2.chainDanRer7Link.txt
    #	89862892 bases of 1359412157 (6.610%) in intersection

#######################################################################
# lastz swap from melGal1 (DONE - 2011-04-02 - Chin)
    # Coverage of the original, melGal1-referenced alignment:
    cat /hive/data/genomes/melGal1/bed/lastzXenTro2.2011-04-02/fb.melGal1.chainXenTro2Link.txt
    # 36288270 bases of 935922386 (3.877%) in intersection

    # Swap into a fresh xenTro2 build directory (run in the foreground,
    # output collected in swap.log).
    bedDir=/hive/data/genomes/xenTro2/bed
    mkdir "$bedDir/blastz.melGal1.swap"
    cd "$bedDir/blastz.melGal1.swap"
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/melGal1/bed/lastzXenTro2.2011-04-02/DEF \
        -swap \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
    #   real    3m40.138s
    # Coverage after the swap, xenTro2-referenced:
    cat fb.xenTro2.chainMelGal1Link.txt
    #   41967849 bases of 1359412157 (3.087%) in intersection
    # lastz.melGal1 becomes a relative symlink to the swap directory.
    cd "$bedDir"
    ln -s blastz.melGal1.swap lastz.melGal1


#######################################################################
# lastz swap from anoCar2 (DONE - 2011-04-26 - Hiram)
    # Coverage of the original, anoCar2-referenced alignment:
    cat /hive/data/genomes/anoCar2/bed/lastzXenTro2.2011-04-25/fb.anoCar2.chainXenTro2Link.txt
    #	85962319 bases of 1701353770 (5.053%) in intersection

    # Swap (DONE - 2011-04-26) into a fresh xenTro2 build directory; the job
    # runs in the background with all of its output going to swap.log.
    swapDir=/hive/data/genomes/xenTro2/bed/blastz.anoCar2.swap
    mkdir "$swapDir"
    cd "$swapDir"
    time nice -n +25 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/anoCar2/bed/lastzXenTro2.2011-04-25/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-syntenicNet -swap -qRepeats=windowmaskerSdust \
	-tRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    29m36.856s
    # Coverage after the swap, xenTro2-referenced:
    cat fb.xenTro2.chainAnoCar2Link.txt
    #	91934327 bases of 1359412157 (6.763%) in intersection

##############################################################################
# construct liftOver to xenTro3 (DONE - 2011-09-19 - Hiram) 
    screen	# manage this longish running job in a screen
    liftDir=/hive/data/genomes/xenTro2/bed/blat.xenTro3.2011-09-19
    mkdir "$liftDir"
    cd "$liftDir"
    # First pass with -debug, to see whether the run is going to work:
    time doSameSpeciesLiftOver.pl -buildDir="$(pwd)" -bigClusterHub=swarm \
	-ooc=/scratch/data/xenTro2/11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev xenTro2 xenTro3 > do.log 2>&1
    # When that is OK, the same command without -debug does the real run:
    time doSameSpeciesLiftOver.pl -buildDir="$(pwd)" -bigClusterHub=swarm \
	-ooc=/scratch/data/xenTro2/11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev xenTro2 xenTro3 > do.log 2>&1
    #	real    597m54.341s

    # verify this file exists:
    #	/gbdb/xenTro2/liftOver/xenTro2ToXenTro3.over.chain.gz
    # and try out the conversion on genome-test from xenTro2 to xenTro3 
############################################################################

##########################################################################pubStart
# Publications track (DONE - 04-27-12 - Max)

# article download and conversion is run every night on hgwdev:
# 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh
# the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/
# then converts them to text into /hive/data/outside/literature/{pmc,elsevier}

# all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py

# data processing was run manually like this
# PATH as set for that session; its last entry is the pubtools directory
export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/
# pmc
# init in the run directory, the annot-tables step on swarm, then (back on
# the starting host after "exit") the load
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc
ssh swarm 
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat steps:annot-tables
exit
pubBlat load

# elsevier
# same sequence as for pmc, in the elsBlat run directory
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier
ssh swarm 
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat steps:annot-tables
exit
pubBlat load
#--pubEnd