# for emacs: -*- mode: sh; -*-

# This file describes the browser build for the Fugu
# genome, Takifugu rubripes, October 2004, Fugu v4.0 from JGI
#
#	"$Id: fr2.txt,v 1.25 2009/09/20 17:16:43 markd Exp $"
#
##########################################################################
### Fetch sequence (DONE - 2007-01-22 - Cory McLean and Hiram)
    ssh kkstore02
    mkdir /cluster/store5/fr2
    ln -s /cluster/store5/fr2 /cluster/data/fr2
    cd /cluster/data/fr2
    mkdir jgi
    cd jgi
    cat << '_EOF_' > fetch.sh
#!/bin/sh
wget --timestamping "ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/v4.0/*"
gunzip fugu.041029.scaffolds.fasta.gz
scaffoldFaToAgp fugu.041029.scaffolds.fasta
gzip fugu.041029.scaffolds.fasta
'_EOF_'
    # << happy emacs
    chmod +x fetch.sh
    ./fetch.sh

##########################################################################
# Run the makeGenomeDb.pl script (DONE - 2007-01-22 - Cory and Hiram)
    # prepare for the makeGenomeDb.pl script:
    ssh hgwdev
    cd /cluster/data/fr2
    # the config.ra file pretty much specifies everything
    cat << '_EOF_' > config.ra
db fr2
scientificName Takifugu rubripes
assemblyDate Oct. 2004
assemblyLabel JGI V4.0
# orderKey = fr1.orderKey - 1
orderKey 464
# NC_004299.1
mitoAcc 23397366
fastaFiles /cluster/data/fr2/jgi/fugu.041029.scaffolds.fasta.gz
dbDbSpeciesDir fugu
agpFiles /cluster/data/fr2/jgi/fugu.041029.scaffolds.agp
commonName Fugu
clade Vertebrate
genomeCladePriority 110
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl config.ra > mgdb.out 2>&1

    # This sequence creates and loads the following tables into the
    # new database fr2:
    #	chr*_gap, chr*_gold, chromInfo, gc5Base, grp
    # And, when you follow the instructions it gives at the end to check in
    # the trackDb files it creates, and you do a make in your trackDb
    # hierarchy, you will then create the trackDb and hgFindSpec tables
    # (with a 'make alpha'), or your specific trackDb_logname and
    # hgFindSpec_logname tables with a simple 'make' with no arguments.
    # The sequence also adds an entry to the dbDb table to turn on this
    # organism in the drop-down menus.

############################################
# Checked in trackDb fr2 files to the source tree.
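    # A quick sanity check one could run at this point (hypothetical command,
    # not recorded in the original build log): confirm the tables that
    # makeGenomeDb.pl is expected to load are actually present in fr2:
    hgsql -N -e 'show tables;' fr2 | egrep 'chromInfo|gap|gold|gc5Base|grp'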
# Instructions are located in ~/kent/src/product/README.trackDb
    ssh hgwdev
    cd /gbdb/fr2
    ln -s /cluster/data/fr2/fr2.unmasked.2bit ./fr2.2bit

################################################
## WINDOWMASKER (DONE - 2007-01-22 - Cory and Hiram)
    cd /cluster/data/fr2/bed/
    ~/kent/src/hg/utils/automation/doWindowMasker.pl fr2 \
        -workhorse=kolossus > wmRun.log 2>&1 &
    # Save the log
    mv wmRun.log WindowMasker.2007-01-22
    # Masking statistics
    cd WindowMasker.2007-01-22
    twoBitToFa fr2.wmsk.2bit stdout | faSize stdin
    # 400525790 bases (49313545 N's 351212245 real 284686886 upper
    # 66525359 lower)
    hgLoadBed -strict fr2 windowmaskerSdust windowmasker.sdust.bed.gz
    # Loaded 1747418 elements of size 3

#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2007-01-22 - Cory and Hiram)
    ssh kolossus
    mkdir /cluster/data/fr2/bed/simpleRepeat
    cd /cluster/data/fr2/bed/simpleRepeat
    # This missed chrM sequence
    time nice -n 19 twoBitToFa ../../fr2.unmasked.2bit stdout \
        | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
            -bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
    # ~31m

    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
    splitFileByColumn trfMask.bed trfMaskChrom

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed fr2 simpleRepeat \
        /cluster/data/fr2/bed/simpleRepeat/simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits fr2 simpleRepeat
    # 9915088 bases of 393312790 (2.521%) in intersection
    featureBits fr1 simpleRepeat
    # 6801339 bases of 315518167 (2.156%) in intersection

    # recovery attempt to get chrM masked
    time nice -n 19 twoBitToFa -seq=chrM ../../fr2.unmasked.2bit stdout \
        | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
            -bedAt=chrM.simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
    # It finds nothing!
    # make an empty trfMaskChrom file:
    touch trfMaskChrom/chrM.bed

#########################################################################
## Add TRF mask to WindowMasker masked sequence
    ssh kkstore02
    cd /cluster/data/fr2
    twoBitMask bed/WindowMasker.2007-01-22/fr2.wmsk.sdust.2bit \
        -add bed/simpleRepeat/trfMask.bed fr2.2bit
    # Received ignorable warning:
    # Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
    # might contain block coordinates, but this program uses only the first three
    # fields (the entire span -- no support for blocks).

    # Make this the actual file that the browser's "get DNA" function sees
    ssh hgwdev
    cd /gbdb/fr2/
    rm fr2.2bit
    ln -s /cluster/data/fr2/fr2.2bit fr2.2bit

#########################################################################
## Lift our .2bit file against the .lft file to create a scaffold fasta file
    cd /cluster/data/fr2/jkStuff
    cp ../jgi/fugu.041029.scaffolds.lft liftAll.lft
    cp /cluster/data/tetNig1/jkStuff/lft2BitToFa.pl .
    cd ..
    mkdir noUn
    cd noUn
    time ../jkStuff/lft2BitToFa.pl ../fr2.2bit ../jkStuff/liftAll.lft \
        > chrUn.scaffolds.fa
    # real    5m4.520s
    twoBitToFa -seq=chrM ../fr2.2bit chrM.fa
    faToTwoBit *.fa fr2.scaffolds.2bit
    twoBitInfo *.2bit stdout | sort -k2nr > fr2.scaffolds.sizes

##########################################################################
## Move the data out to the cluster
    cd /san/sanvol1/scratch/
    mkdir fr2
    cd fr2
    cp -p /cluster/data/fr2/jkStuff/liftAll.lft .
    cp -p /cluster/data/fr2/chrom.sizes .
    cp -p /cluster/data/fr2/fr2.2bit .
    cp -p /cluster/data/fr2/noUn/*2bit .
    cp -p /cluster/data/fr2/noUn/*sizes .

## Edit the kent/src/hg/makeDb/doc/gasAcu1.txt doc to show what we're going to
## do there.
## ## To display the new chains and nets in the gasAcu1 browser, we had to edit the ## trackDb.ra file to include the new chain and net: ## ## track chainFr2 ## shortLabel $o_db Chain ## longLabel $o_Organism ($o_date/$o_db) Chained Alignments ## group compGeno ## priority 140 ## visibility hide ## color 100,50,0 ## altColor 255,240,200 ## matrix 16 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 ## spectrum on ## type chain fr2 ## otherDb fr2 ## track netFr2 ## shortLabel $o_db Net ## longLabel $o_Organism ($o_date/$o_db) Alignment Net ## group compGeno ## priority 140.1 ## visibility hide ## spectrum on ## type netAlign fr2 chainFr2 ## otherDb fr2 ## Then we verified that the chain covered a higher percentage of gasAcu1 than fr1 did: ## Look into the gasAcu1.txt file to see the commands going on there. ## Then we needed to update the links to the detail pages of chains and nets: ## /cluster/home/cmclean/kent/src/hg/makeDb/trackDb/chainFr2.html and netFr2.html ########################################################### ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-24 - Hiram and Cory) # This will find repeats within the genome that should not be matched # against. Uses 11-mers. # Use -repMatch=128 (based on size -- for human we use 1024, and # fugu size is ~12% of human judging by gapless fr2 vs. hg18 # genome sizes from featureBits. featureBits hg18 gap featureBits -countGaps hg18 gap ssh kolossus blat /cluster/data/fr2/fr2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/data/fr2/11.ooc -repMatch=128 # Wrote 8898 overused 11-mers to /cluster/data/fr2/11.ooc cp -p /cluster/data/fr2/11.ooc /cluster/bluearc/fugu/fr2 cp -p /cluster/data/fr2/jkStuff/liftAll.lft /cluster/bluearc/fugu/fr2 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-01-24 - Hiram and Cory) # Make a liftAll.lft that specifies 5M chunks for genbank: # Actually not necessary, since our chunks are small enough. If we had # to, would look like this: # ssh kkstore05 # cd /cluster/data/fr2 # simplePartition.pl fr2.2bit 5000000 /tmp/fr2 # cat /tmp/fr2/*/*.lft > jkStuff/liftAll.lft # rm -r /tmp/fr2 # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add fr2 just after fr1 # fr2 fr2.serverGenome = /cluster/data/fr2/fr2.2bit fr2.clusterGenome = /cluster/bluearc/fugu/fr2/fr2.2bit fr2.ooc = /cluster/bluearc/fugu/fr2/11.ooc fr2.align.unplacedChroms = chrUn fr2.lift = /cluster/bluearc/fugu/fr2/liftAll.lft fr2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} fr2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} fr2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} fr2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} fr2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} fr2.genbank.mrna.xeno.loadDesc = yes fr2.refseq.mrna.native.load = no fr2.refseq.mrna.xeno.load = no cvs ci -m "Added fr2." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. Not necessary here since # fugu already exists. # # cvs ci -m "Added Oryzias latipes (medaka)." src/lib/gbGenome.c # make install-server cd /cluster/data/genbank screen # This is a call to a script that will push our jobs out to the cluster # since it's a big job. 
nice -n +19 bin/gbAlignStep -initial fr2 & # logFile: var/build/logs/2007.01.24-12:09:11.fr2.initalign.log # We had an error because machine kkr4u02 was unable to ssh to. This # happened in the middle of the -run subroutine. para problems > problems.out 2>&1 grep host problems.out | sort | uniq -c | sort -n # 62021 host: kkr4u02.kilokluster.ucsc.edu parasol list machines | grep kkr4u02 # kkr4u02.kilokluster.ucsc.edu parasol remove machine kkr4u02.kilokluster.ucsc.edu "unable to ssh" para push -retries=5 # updated job database on disk # Pushed Jobs: 14820 # Retried jobs: 14820 # We still need to finish the rest of the gbAlignStep since it failed # because the -run subroutine did not finish correctly. We must manually # call the rest of the routine. nice -n 19 bin/gbAlignStep -continue=finish -initial fr2 & # logFile: var/build/logs/2007.01.24-15:48:19.fr2.initalign.log # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad fr2 # enable daily alignment and update of hgwdev (DONE - 2007-02-05 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add fr2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added fr2." etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # ENSEMBL GENES (DONE - 2007-01-24 - Hiram and Cory) # Good luck with the biomart interface. It seems to be different each time # it is used. mkdir /cluster/data/fr2/bed/ensembl cd /cluster/data/fr2/bed/ensembl # Get the ensembl gene data from # http://www.biomart.org/biomart/martview/ # # Follow this sequence through the pages: # The default dataset will be Human. In the right side # frame, if you scroll it up and down, you will come to a pull-down # dataset menu where you can select the organism. There appear to be # two pull-down menus in this frame, one for: # Database: ENSEMBL 42 GENE (SANGER) # and you can select the second: # Dataset: Takifugu rubripes genes (FUGU4) # After selecting the Dataset, in the left frame, click on the # "Attributes" (Structures) label, now the right frame changes to radio # buttons, Features, Sequences, Structures # Click the "Structures" button, the three optional buttons can be # expanded to select elements of these: # REGION: # GENE: # EXON: # REGION: has Chromosome checked # GENE: has Ensembl Gene ID and Biotype selected # EXON: has no selections # In the GENE: menu: # Unselect Biotype # and Select # Ensembl Gene ID # Ensembl Transcript ID # External Gene ID # Click on the "Filters" section on the left-side frame. # Under GENE in the right-side frame, select Gene type checkbox and # highlight "protein_coding". # Check: Click "Count" from top buttons, and 22,008/22,409 genes will # be reported. # Then, in the black menu bar above these frames, click the "Results" # it shows the first ten rows. For this organism, there appear to be no # External Gene ID in the HTML view. Change the "Display maximum" # "rows as" pull-down # to GFF, and use the "Export all results to" pull-down to # Compressed web file (notify by email), press the "Go" button to download. # After retrieving the URL where our data is located # (http://www.biomart.org/biomart/martresults?file=martquery_0124222017_859.txt.gz), # we get it from that place: wget http://www.biomart.org/biomart/martresults?file=martquery_0124222017_859.txt.gz \ -O fr2.ensembl42.gff.gz # Ensemble gives us coordinates on each scaffold relative to the beginning # of *that scaffold.* We want to have them in chrUn coordinates instead. 
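    # For reference, each line of a liftUp .lft file is:
    #   offset  oldName  oldSize  newName  newSize
    # so a feature on a scaffold is shifted by that scaffold's offset on chrUn.
    # Illustrative only (made-up scaffold name and offset, not from liftAll.lft):
    #   a GFF exon at scaffold_123:500-900, with scaffold_123 placed at offset
    #   1,000,000 on chrUn, becomes chrUn:1000500-1000900 after liftUp.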
# We use the liftUp program on the Ensembl data, with our liftAll.lft file # we created a long time ago, to achieve this, and name it # fr2.ensembl42.protein_coding.gff since we only took the protein coding # genes. zcat fr2.ensembl42.gff.gz | liftUp -type=.gff fr2.ensembl42.protein_coding.gff \ ../../jkStuff/liftAll.lft error stdin # On other files, sometimes we need to massage the input names to match # the UCSC naming conventions. Below is an example, though it was not # necessary today. # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software, and liftUp to get scaffolds onto chrUn # The scaffolds and ultracontigs mentioned in this are not the scaffolds # we have in our lift file for chrUn ... can't use them. # zcat oryLat1.ensembl42.gff.gz | egrep -v "scaffold|ultracontig" \ # | sed -e "s/^\([0-9][0-9]*\)/chr\1/" | gzip -c > ensembl42.gff.gz # Verify names OK: # zcat ensembl42.gff.gz | awk '{print $1}' | sort | uniq -c # 22938 chr1 # 15887 chr10 # 18645 chr11 # 20162 chr12 # 21474 chr13 # 21302 chr14 # 20148 chr15 # 22978 chr16 # 26164 chr17 # 13671 chr18 # 17109 chr19 # 10988 chr2 # 15705 chr20 # 21361 chr21 # 21030 chr22 # 12984 chr23 # 19009 chr24 # 21767 chr3 # 26204 chr4 # 24335 chr5 # 24300 chr6 # 24329 chr7 # 23323 chr8 # 27795 chr9 cd /cluster/data/fr2/bed/ensembl ldHgGene -gtf -genePredExt fr2 ensGene fr2.ensembl42.protein_coding.gff # Read 22102 transcripts in 407112 lines in 1 files # 22102 groups 1 seqs 1 sources 4 feature types # 22102 gene predictions # The genome-test database will already populate this into our browser # since Ensembl genes are a default track. However, the link to the # Ensembl website is broken because it automatically assumes we are # referencing the human genome. We need to edit the # ~/kent/src/hg/makeDb/trackDb/fugu/trackDb.ra file in the ensGene track # and update the URL to point to the correct spot. In this case, we want # the incorrect URL: # url http://www.ensembl.org/perl/transview?transcript=$$track genscan # to the correct URL: # url http://dec2006.archive.ensembl.org/Takifugu_rubripes/transview?transcript=$$ # Also, we add the following lines at the end # urlName gene # archive dec2006 ## Now we want to get the proteins that are derived from these genes as a link at # the top of the detail browser. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # for the Attributes, choose the "Features" box, and then In "GENE:" # select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Results, choose txt output and a Compressed web file (notify by email). # Save this as # ensGtp42.txt.gz wget http://www.biomart.org/biomart/martresults?file=martquery_0124225551_471.txt.gz \ -O ensGtp42.txt.gz # Strip the first lines which is merely column heading labels zcat ensGtp42.txt.gz | headRest 1 stdin | sed -e "s/ /\t/g" > ensGtp.txt # We want to load our genes, but unfortunately the gene names are larger # than the standard ensGtp.sql file that the following command handles: # hgLoadSqlTab fr2 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt # Instead, we make our own temporary .sql file to handle the insert. sed -e "s/20/21/; s/18/21/" ~/kent/src/hg/lib/ensGtp.sql > ensGtp.bigcols.sql; # And then perform the insert. 
hgLoadSqlTab fr2 ensGtp ensGtp.bigcols.sql ensGtp.txt rm ensGtp.bigcols.sql hgsql -N -e "select count(*) from ensGtp;" fr2 # +-------+ # | 22102 | # +-------+ wc -l ensGtp.txt # 22102 ensGtp.txt # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # for the Attributes, choose the "Sequences" box, and then # SEQUENCES: Peptide and # Header Information "Ensembl Transcript ID" # Results output as FASTA # Save file as ensPep.fa.gz # XXX Still waiting for the proteins to be sent to us so that we can add # this part. # hgPepPred fr2 generic ensPep ensPep.fa ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-01-25 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("fr2", "blat3", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("fr2", "blat3", "17787", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # BLASTZ/CHAIN/NET Hg18 (DONE - 2007-01-26 - Hiram) ## Swap back to fr2 mkdir /cluster/data/fr2/bed/blastz.hg18.swap cd /cluster/data/fr2/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/hg18/bed/blastz.fr2.2007-01-24/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -swap > swap.log 2>&1 & # real 47m14.554s ssh hgwdev cd /cluster/data/fr2/bed/blastz.hg18.swap time nice -n +19 featureBits fr2 chainHg18Link \ > fb.fr2.chainHg18Link.txt 2>&1 & # 42875664 bases of 393312790 (10.901%) in intersection ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-26) ssh kkstore02 bash # if not using bash shell already mkdir /cluster/data/fr2/blastDb cd /cluster/data/fr2 zcat jgi/fugu.041029.scaffolds.fasta.gz > temp.fa faSplit sequence temp.fa 500 blastDb/ rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/fr2/blastDb cd /cluster/data/fr2/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/fr2/blastDb done mkdir -p /cluster/data/fr2/bed/tblastn.hg18KG cd /cluster/data/fr2/bed/tblastn.hg18KG echo /san/sanvol1/scratch/fr2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 495 query.lst # we want around 100000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(100000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(100000/495) = 181.798650 mkdir -p /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa split -l 180 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/fr2/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit # back to bash ssh pk cd /cluster/data/fr2/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 101475 of 101475 jobs # CPU time in finished jobs: 3924977s 65416.29m 1090.27h 45.43d 0.124 y # IO & Wait Time: 1118884s 18648.06m 310.80h 12.95d 0.035 y # Average job time: 50s 0.83m 0.01h 0.00d # Longest finished job: 3925s 65.42m 1.09h 0.05d # Submission to last job: 46557s 775.95m 12.93h 0.54d ssh kkstore04 cd /cluster/data/fr2/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/fr2/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 205 of 205 jobs # CPU time in finished jobs: 2262s 37.70m 0.63h 0.03d 0.000 y # IO & Wait Time: 40989s 683.15m 11.39h 0.47d 0.001 y # Average job time: 211s 3.52m 0.06h 0.00d # Longest finished job: 360s 6.00m 0.10h 0.00d # Submission to last job: 1492s 24.87m 0.41h 0.02d ssh kkstore04 cd /cluster/data/fr2/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/fr2/bed/tblastn.hg18KG/unliftBlastHg18KG.psl cd .. 
    pslCheck unliftBlastHg18KG.psl
    liftUp blastHg18KG.psl ../../jkStuff/liftAll.lft warn unliftBlastHg18KG.psl

    # load table
    ssh hgwdev
    cd /cluster/data/fr2/bed/tblastn.hg18KG
    hgLoadPsl fr2 blastHg18KG.psl

    # check coverage
    featureBits fr2 blastHg18KG
    # 19761405 bases of 393312790 (5.024%) in intersection
    featureBits fr2 ensGene:cds blastHg18KG -enrichment
    # ensGene:cds 8.216%, blastHg18KG 5.024%, both 4.401%, cover 53.57%,
    # enrich 10.66x

    ssh kkstore04
    rm -rf /cluster/data/fr2/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut
    #end tblastn

#########################################################################
# BLASTZ/CHAIN/NET TetNig1 SWAP (DONE - 2007-01-29 - Hiram)
    ## Align to fr2 scaffolds,
    ## results lifted to fr2 chrUn coordinates
    ## Swap to fr2
    mkdir /cluster/data/fr2/bed/blastz.tetNig1.swap
    cd /cluster/data/fr2/bed/blastz.tetNig1.swap
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -bigClusterHub=pk -swap > swap.log 2>&1 &
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=net -bigClusterHub=pk -swap > net_swap.log 2>&1 &
    # real    40m40.471s
    ssh hgwdev
    cd /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25
    time nice -n +19 featureBits tetNig1 chainFr2Link \
        > fb.tetNig1.chainFr2Link.txt 2>&1
    # 246828605 bases of 342403326 (72.087%) in intersection
    cd /cluster/data/fr2/bed/blastz.tetNig1.swap
    time nice -n +19 featureBits fr2 chainTetNig1Link \
        > fb.fr2.chainTetNig1.txt 2>&1
    # 247086553 bases of 393312790 (62.822%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET gasAcu1 swap (DONE - 2007-01-23 - Hiram)
    ## no chrUn in gasAcu1, and align to fr2 scaffolds,
    ## results lifted to fr2 chrUn coordinates
    ssh kkstore05
    mkdir /cluster/data/fr2/bed/blastz.gasAcu1.swap
    cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl \
        /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    # real    24m33.761s
    ssh hgwdev
    cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
    time nice -n 19 featureBits fr2 chainGasAcu1Link \
        > fb.fr2.chainGasAcu1Link.txt 2>&1 &
    # 158383996 bases of 393312790 (40.269%) in intersection

#########################################################################
## BLASTZ/CHAIN/NET to gasAcu1 chrUn - the above swap does not include
## gasAcu1 chrUn - thus its browser would be empty for any fr2 alignments.
## This procedure will get fr2 alignments added to the gasAcu1 browser
## for chrUn
    ssh kkstore02
    mkdir /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
    cd /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
    cat << '_EOF_' > DEF
# Stickleback vs.
# Fugu, Stickleback chrUn in contigs only, to fugu contigs
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Fugu fr2
#	Align to the scaffolds, results lifted up to chrUn.sdTrf
#	coordinates
SEQ1_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ1_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ1_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ1_CHUNK=20000000
SEQ1_LIMIT=30
SEQ1_LAP=10000

# QUERY: Stickleback gasAcu1 chrUn only
#	chrUn in contigs for this alignment run
#	The largest is 418,000 bases and there are 5,000 of them.
SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -blastzOutRoot /cluster/bluearc/fr2GasAcu1 \
        -stop=net -bigClusterHub=pk > do.log 2>&1 &
    # real    73m13.156s

    ## swap back to gasAcu1
    mkdir /cluster/data/gasAcu1/bed/blastz.fr2.swap
    cd /cluster/data/gasAcu1/bed/blastz.fr2.swap
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -stop=chainMerge -bigClusterHub=pk > swap.log 2>&1 &
    ## Now, with that chain result in hand, place it manually back in with
    ## the full chroms chains and re-run the nets and so forth.

#########################################################################
## swap oryLat1 results back to fr2 (DONE - 2007-01-24 - Hiram)
    mkdir /cluster/data/fr2/bed/blastz.oryLat1.swap
    cd /cluster/data/fr2/bed/blastz.oryLat1.swap
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    ssh hgwdev
    cd /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24
    time nice -n +19 featureBits oryLat1 chainFr2Link \
        > fb.oryLat1.chainFr2Link.txt 2>&1
    # 177508958 bases of 700386597 (25.344%) in intersection
    cd /cluster/data/fr2/bed/blastz.oryLat1.swap
    time nice -n +19 featureBits fr2 chainOryLat1Link \
        > fb.fr2.chainOryLat1Link.txt 2>&1
    # 143996507 bases of 393312790 (36.611%) in intersection

#########################################################################
## 5-Way Multiz (DONE - 2007-02-03 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/fr2/bed/multiz5way
    cd /cluster/data/fr2/bed/multiz5way
    cp /cluster/data/gasAcu1/bed/multiz8way/8way.nh .
    /cluster/bin/phast/tree_doctor \
        --prune human_hg18,mouse_mm8,chicken_galGal3 8way.nh
    # use the output of that to manually construct this tree.
# Arbitrarily set 0.2 distances for this added branch # All other distances remain as specified in the 17way.nh cat << '_EOF_' > 5way.nh (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2, (stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961, zebrafish_danRer4:0.782561); '_EOF_' # << happy emacs # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/fr2_5way.gif /cluster/bin/phast/all_dists 5way.nh > 5way.distances.txt # Use this output to create the table below grep -y fr2 5way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainGasAcu1Link chain linearGap # distance on fr2 on other minScore # 1 0.4393 - tetraodon tetNig1 (% 62.822) (% 72.087) 2000 loose # 2 0.8399 - medaka oryLat1 (% 36.611) (% 25.344) 2000 loose # 3 0.8399 - stickleback gasAcu1 (% 40.269) (% 37.574) 2000 loose # 4 1.5156 - zebrafish danRer4 (% 20.585) (% 8.543) 5000 loose cd /cluster/data/fr2/bed/multiz5way # bash shell syntax here ... export H=/cluster/data/fr2/bed mkdir mafLinks for G in oryLat1 tetNig1 gasAcu1 danRer4 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore02 mkdir /san/sanvol1/scratch/fr2/multiz5way cd /san/sanvol1/scratch/fr2/multiz5way time rsync -a --copy-links --progress \ /cluster/data/fr2/bed/multiz5way/mafLinks/ . mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn # the autoMultiz cluster run, there are only 2 jobs, kki is perfect ssh kki cd /cluster/data/fr2/bed/multiz5way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 5way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = fr2 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz5way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz5way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/data/fr2/bed/multiz5way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/fr2/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 2 jobs para try ... check ... push ... etc ... 
# Completed: 2 of 2 jobs # CPU time in finished jobs: 7246s 120.77m 2.01h 0.08d 0.000 y # IO & Wait Time: 357s 5.94m 0.10h 0.00d 0.000 y # Average job time: 3802s 63.36m 1.06h 0.04d # Longest finished job: 7601s 126.68m 2.11h 0.09d # Submission to last job: 7601s 126.68m 2.11h 0.09d # combine results into a single file for loading and gbdb reference ssh kkstore02 cd /cluster/data/fr2/bed/multiz5way nice -n +19 catDir maf > multiz5way.maf # makes a 1.3 Gb file: # -rw-rw-r-- 1 1341786986 Feb 3 19:52 multiz5way.maf ############################################################################ # ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2007-02-03 - Hiram) ## re-done 2007-03-27 with corrected nBeds and sizes files - Hiram ssh kolossus mkdir /cluster/data/fr2/bed/multiz5way/anno cd /cluster/data/fr2/bed/multiz5way/anno mkdir maf run cd run rm -f sizes nBeds twoBitInfo -nBed /cluster/data/fr2/fr2.{2bit,N.bed} for DB in `cat /cluster/data/fr2/bed/multiz5way/species.lst` do ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds echo ${DB}.len >> sizes echo $DB done echo '#!/bin/csh -ef' > jobs.csh echo date >> jobs.csh # do smaller jobs first so you can see some progress immediately: for F in `ls -1rS ../../maf/*.maf` do echo mafAddIRows -nBeds=nBeds -sizes=sizes $F \ /cluster/data/fr2/fr2.2bit ../maf/`basename $F` >> jobs.csh echo "echo $F" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh time nice -n +19 ./jobs.csh > jobs.log 2>&1 & # to watch progress; tail -f jobs.log # real 165m43.716s # Load anno/maf ssh hgwdev cd /cluster/data/fr2/bed/multiz5way/anno/maf mkdir -p /gbdb/fr2/multiz5way/anno/maf ln -s /cluster/data/fr2/bed/multiz5way/anno/maf/*.maf \ /gbdb/fr2/multiz5way/anno/maf time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/fr2/multiz5way/anno/maf fr2 multiz5way # Loaded 1469786 mafs in 2 files from /gbdb/fr2/multiz5way/anno/maf # real 0m41.123s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/fr2/bed/multiz5way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary fr2 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz5waySummary stdin # Created 146928 summary blocks from 3159326 components # and 1469786 mafs from stdin # real 0m58.171s ssh hgwdev cd /cluster/data/fr2/bed/multiz5way/anno/maf sed -e 's/mafSummary/multiz5waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz5waySummary.sql time nice -n +19 hgLoadSqlTab fr2 multiz5waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz5waySummary.tab # real 0m1.941 ####################################################################### # MULTIZ5WAY MAF FRAMES (DONE - 2007-02-03 - Hiram) ssh hgwdev mkdir /cluster/data/fr2/bed/multiz5way/frames cd /cluster/data/fr2/bed/multiz5way/frames mkdir genes # The following is adapted from the gasAcu1 sequence #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. 
single select to get cds+psl, then split that up and # create genePred # using refGene for danRer4 for qDB in danRer4 do geneTbl=refGene echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done # using genscan for tetNig1 # using ensGene for gasAcu1, oryLat1 and fr2 # genePreds; (must keep only the first 10 columns for knownGene) for qDB in gasAcu1 oryLat1 fr2 tetNig1 do if [ $qDB = "gasAcu1" -o $qDB = "oryLat1" -o $qDB = "fr2" ]; then geneTbl=ensGene elif [ $qDB = "tetNig1" ]; then geneTbl=genscan else exit 255 fi echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done ### ssh kkstore02 cd /cluster/data/fr2/bed/multiz5way/frames time cat ../maf/*.maf | nice -n +19 genePredToMafFrames fr2 stdin stdout fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat1 genes/oryLat1.gp.gz tetNig1 genes/tetNig1.gp.gz danRer4 genes/danRer4.gp.gz | gzip > multiz5way.mafFrames.gz # real 0m52.606 ssh hgwdev cd /cluster/data/fr2/bed/multiz5way/frames time nice -n +19 hgLoadMafFrames fr2 multiz5wayFrames \ multiz5way.mafFrames.gz # real 0m20.580s ######################################################################### # Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram) # edit src/hg/makeDb/genbank/genbank.conf to add: fr2.upstreamGeneTbl = ensGene fr2.upstreamMaf = multiz5way /hive/data/genomes/fr2/bed/multiz5way/species.lst ######################################################################### # MULTIZ5WAY DOWNLOADABLES (DONE - 2007-02-05 - Hiram) ssh hgwdev mkdir /cluster/data/fr2/bed/multiz5way/mafDownloads cd /cluster/data/fr2/bed/multiz5way # upstream mafs # rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst # did not have fr2 as the first one # There isn't any refGene table, using ensGene instead for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 fr2 \ ensGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags fr2 multiz5way \ stdin stdout -orgs=species.lst \ | gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz echo "done ensGene.upstream${S}.maf.gz" done ssh kkstore05 cd /cluster/data/fr2/bed/multiz5way ## re-done 2007-03-27 after correction to annotation step - Hiram time for M in anno/maf/chr*.maf do B=`basename $M` nice -n +19 gzip -c ${M} > mafDownloads/${B}.gz echo ${B}.gz done done # real 4m39.440s cd mafDownloads nice -n +19 md5sum *.maf.gz > md5sum.txt # Make a README.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/fr2/multiz5way cd /usr/local/apache/htdocs/goldenPath/fr2/multiz5way ln -s /cluster/data/fr2/bed/multiz5way/mafDownloads/{*.gz,*.txt} . 
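    # Hypothetical spot check (not part of the original log): after the
    # symlinks are in place, the delivered maf downloads should still match
    # the md5sum.txt built above.
    cd /usr/local/apache/htdocs/goldenPath/fr2/multiz5way
    md5sum -c md5sum.txt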
############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS # (DONE - 2007-02-05 - Hiram) # Estimate phastCons parameters ssh kkstore05 mkdir /cluster/data/fr2/bed/multiz5way/cons cd /cluster/data/fr2/bed/multiz5way/cons # Create a starting-tree.mod based on one 25,000,000 window of chrUn time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrUn.maf \ --refseq ../../../Un/chrUn.fa --in-format MAF \ --windows 25000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # real 4m27.989s time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS \ s1.174992629-199991578.ss \ --tree "(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer4)" \ --out-root starting-tree # As an experiment, ran all of these ss files through this prediction # and the resulting stats of the add up the C and G: ## min Q1 median Q3 max mean N sum stddev # 0.45 0.457 0.463 0.469 0.479 0.461941 17 7.853 0.00759621 # Using the one closest to the mean: s1.174992629-199991578.ss rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.463 # This 0.463 is used in the --gc argument below ## the fa files are needed for the sequence and they are created during # this loop if they haven't been done before # Create SS files on san filesystem ssh kkstore05 mkdir -p /san/sanvol1/scratch/fr2/cons/ss cd /san/sanvol1/scratch/fr2/cons/ss time for C in \ `awk '{print $1}' /cluster/data/fr2/chrom.sizes | sed -e "s/chr//"` do mkdir -p chr${C} echo msa_split $C nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/fr2/bed/multiz5way/maf/chr${C}.maf \ --refseq /cluster/data/fr2/${C}/chr${C}.fa \ --in-format MAF --windows 2500000,0 --between-blocks 5000 \ --out-format SS --out-root chr${C}/chr${C} done & # real 4m2.736s # Create a random list of 50 1 mb regions cd /san/sanvol1/scratch/fr2/cons/ss ls -1l chr*/chr*.ss \ | awk '$5 > 4000000 {print $9;}' \ | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk mkdir /san/sanvol1/scratch/fr2/cons/treeRun1 cd /san/sanvol1/scratch/fr2/cons/treeRun1 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/fr2/bed/multiz5way/cons/starting-tree.mod \ --gc 0.463 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 10 --target-coverage 0.20 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP ./makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 5204s 86.74m 1.45h 0.06d 0.000 y # IO & Wait Time: 204s 3.39m 0.06h 0.00d 0.000 y # Average job time: 108s 1.80m 0.03h 0.00d # Longest finished job: 138s 2.30m 0.04h 0.00d # Submission to last job: 141s 2.35m 0.04h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. 
This must be done separately for the conserved # and nonconserved models ssh pk cd /san/sanvol1/scratch/fr2/cons/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt 2>&1 & ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt cp -p ave.*.mod .. cd .. cp -p ave.*.mod /cluster/data/fr2/bed/multiz5way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \ 0.20 10 ave.{cons,noncons}.mod ## 0.20 10 ( Solving for new omega: 10.000000 10.467305 10.449210 10.449184 ) Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000 Relative entropy: H=0.903779 bits/site Expected min. length: L_min=10.726004 sites Expected max. length: L_max=6.735382 sites Phylogenetic information threshold: PIT=L_min*H=9.693936 bits Recommended expected length: omega=10.449184 sites (for L_min*H=9.783400) ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/fr2/cons/consRun1 cd /san/sanvol1/scratch/fr2/cons/consRun1 mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.{cons,noncons}.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 10 --target-coverage 0.20 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP ./doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive, even on the san # they will hang it up as they work at full tilt. 
# Completed: 162 of 162 jobs # CPU time in finished jobs: 1501s 25.02m 0.42h 0.02d 0.000 y # IO & Wait Time: 924s 15.40m 0.26h 0.01d 0.000 y # Average job time: 15s 0.25m 0.00h 0.00d # Longest finished job: 23s 0.38m 0.01h 0.00d # Submission to last job: 44s 0.73m 0.01h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/fr2/cons/consRun1 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less # if that looks right, then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved5Way.bed # ~ 16 seconds cp -p mostConserved5Way.bed /cluster/data/fr2/bed/multiz5way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore05 cd /cluster/data/fr2 faSize Un/chrUn.fa M/chrM.fa # 400525790 bases (49313545 N's 351212245 real 284435760 # upper 66776485 lower) in 2 sequences in 2 files cd /san/sanvol1/scratch/fr2/cons/consRun1 # The 351212245 comes from the non-n genome as counted above. awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/351212245\n",100.0*sum/351212245,sum}' \ mostConserved5Way.bed # % 15.82 = 100.0*55573943/351212245 --exp-len 10 --tar-cov 0.20 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits fr2 \ -enrichment ensGene:cds mostConserved5Way.bed # --expected-length 10 --target-coverage 0.20 fr2 # ensGene:cds 8.216%, mostConserved5Way.bed 14.130%, both 5.188%, cover # 63.14%, enrich 4.47x # Load most conserved track into database ssh hgwdev cd /cluster/data/fr2/bed/multiz5way # ended up using the set: --expected-length 10 --target-coverage 0.20 time nice -n +19 hgLoadBed -strict fr2 phastConsElements5way \ mostConserved5Way.bed # Loaded 1140341 elements of size 5 # real 0m28.545 # should measure the same as above time nice -n +19 \ featureBits fr2 -enrichment ensGene:cds phastConsElements5way # At: --expected-length 10 --target-coverage 0.20 fr2 # ensGene:cds 8.216%, phastConsElements5way 14.130%, both 5.188%, cover # 63.14%, enrich 4.47x # Create merged posterier probability file and wiggle track data files ssh pk cd /san/sanvol1/scratch/fr2/cons/consRun1 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \ phastCons5.wig phastCons5.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 3m49.865s # -rw-rw-r-- 1 288320445 Feb 5 13:51 phastCons5.wib # -rw-rw-r-- 1 37122305 Feb 5 13:51 phastCons5.wig time nice -n +19 cp -p phastCons5.wi? 
	/cluster/data/fr2/bed/multiz5way/

    # prepare compressed copy of ascii data values for downloads
    ssh pk
    cd /san/sanvol1/scratch/fr2/cons/consRun1
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons5Scores

for D in ppRaw/chr*
do
    C=${D/ppRaw\/}
    out=phastCons5Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out}
done
'_EOF_'
    # << happy emacs
    chmod +x gzipAscii.sh
    time nice -n +19 ./gzipAscii.sh
    # real    5m21.400s

    # copy them for downloads
    ssh kkstore05
    # this directory is actually a symlink from store9 to store8 to
    # avoid the data full problem on store9
    mkdir /cluster/data/fr2/bed/multiz5way/phastCons5Scores
    cd /cluster/data/fr2/bed/multiz5way/phastCons5Scores
    cp -p /san/sanvol1/scratch/fr2/cons/consRun1/phastCons5Scores/* .
    # make a README.txt file here, and an md5sum
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/fr2/phastCons5Scores
    cd /usr/local/apache/htdocs/goldenPath/fr2/phastCons5Scores
    ln -s /cluster/data/fr2/bed/multiz5way/phastCons5Scores/* .

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/fr2/bed/multiz5way
    ln -s `pwd`/phastCons5.wib /gbdb/fr2/wib/phastCons5.wib
    # ended up using the set: --expected-length 10 --target-coverage 0.20
    time nice -n +19 hgLoadWiggle fr2 phastCons5 phastCons5.wig
    # real    0m9.256s

    # Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/fr2/bed/multiz5way
    time nice -n +19 hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
        -db=fr2 phastCons5 > histogram.data 2>&1
    # real    0m30.744

    # create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
	xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Fugu fr2 Histogram phastCons5 track"
set xlabel " phastCons5 score - --expected-length 10 --target-coverage 0.20"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
	"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs

    display histo.png &

    # The mostConserved track can also be characterized by a histogram
    awk '{print $3-$2}' mostConserved5Way.bed > mostCons.txt
    textHistogram -verbose=2 -autoScale=1000 -pValues mostCons.txt \
        > mostCons.histo.txt
    cat << '_EOF_' | gnuplot > mostCons.png
set terminal png small color \
	xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Fugu fr2 histogram: lengths of mostConserved track elements"
set xlabel " mostConserved element length - --expected-length 10 --target-coverage 0.20"
set ylabel " # of elements at this length"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set xrange [0:200]
set y2tics
set boxwidth 2
set style fill solid

plot "mostCons.histo.txt" using 2:3 title " # of elements" with boxes, \
	"mostCons.histo.txt" using 2:6 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs

############################################################################
## Set a more interesting default position, location of CABIN1 gene
##	DONE - 2007-02-08 - Hiram
    ssh hgwdev
    hgsql -e \
'update dbDb set defaultPos="chrUn:29,305,920-29,330,760" where
name = "fr2";' \ -h genome-testdb hgcentraltest ############################################################################ ## DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram) ssh hgwdev cd /cluster/data/fr2 ~/kent/src/hg/utils/automation/makeDownloads.pl fr2 \ > makeDownloads.out 2>&1 # Doesn't work due to missing Repeat masker outputs # Create WindowMasker separate files by chrom, for downloads ssh kkstore05 cd /cluster/data/fr2/bed/WindowMasker.2007-01-22 # This name change here is so the names created by splitFileByColumn # will be reasonable zcat windowmasker.sdust.bed.gz > chr.fr2.WMSdust.bed splitFileByColumn chr.fr2.WMSdust.bed chrWM # Creating chrWM/chr1.fr2.WMSdust.bed # Creating chrWM/chr2.fr2.WMSdust.bed # ... etc ... cd chrWM tar cvzf ../chromWMSdust.bed.tar.gz *.bed cd .. # Verify this process didn't destroy anything: cat chrWM/*.bed | awk '{sum += $3-$2}END{printf "total size: %d\n",sum}' # total size: 115825307 zcat windowmasker.sdust.bed.gz \ | awk '{sum += $3-$2}END{printf "total size: %d\n",sum}' # total size: 115825307 # deliver to bigZips cd /cluster/data/fr2/goldenPath/bigZips ln -s \ /cluster/data/fr2/bed/WindowMasker.2007-01-22/chromWMSdust.bed.tar.gz . # remove the chromOut.tar.gz file and re-make the md5sum.txt md5sum *gz > md5sum.txt # go back to simpleRepeat and attempt to make a chrM.bed - turns out to # be an empty result, so leave an empty file there. Then running this # makeDownloads.pl again, create two empty .out files to get through # that. cd /cluster/data/fr2 touch Un/chrUn.fa.out touch M/chrM.fa.out ~/kent/src/hg/utils/automation/makeDownloads.pl fr2 \ > makeDownloads.out 2>&1 cd goldenPath/bigZips ln -s \ /cluster/data/fr2/bed/WindowMasker.2007-01-22/chromWMSdust.bed.tar.gz . # get GenBank native mRNAs ssh hgwdev cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=fr2 -native \ GenBank mrna /cluster/data/fr2/goldenPath/bigZips/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=fr2 -xeno \ GenBank mrna /cluster/data/fr2/goldenPath/bigZips/xenoMrna.fa cd /cluster/data/fr2/goldenPath/bigZips ssh kkstore05 gzip mrna.fa xenoMrna.fa md5sum *.gz > md5sum.txt # Edit the README.txt file to be correct ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/fr2/bigZips ln -s /cluster/data/fr2/goldenPath/bigZips/* . 
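    # Hypothetical check (not in the original log): count how many native and
    # xeno GenBank mRNAs were delivered to bigZips, using only standard tools.
    cd /usr/local/apache/htdocs/goldenPath/fr2/bigZips
    zcat mrna.fa.gz | grep -c '^>'
    zcat xenoMrna.fa.gz | grep -c '^>'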
############################################################################ ## Fugu Photograph - obtained from Byrappa Venkatesh in email 2007-02-15 ## (DONE - 2007-02-16 - Hiram) ssh hgwdev mkdir /cluster/data/fr2/photograph cd /cluster/data/fr2/photograph ## original: Byrappa.Venkatesh.Fugu.jpg identify By* # Byrappa.Venkatesh.Fugu.jpg JPEG 700x286 DirectClass 43kb 0.000u 0:01 convert -quality 80 -geometry 300x200 Byrappa.Venkatesh.Fugu.jpg \ Takifugu_rubripes.jpg ## check this file into the browser/images CVS source tree and ## copy to /usr/local/apache/htdocs/images ########################################################################## ## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram) ssh kkstore02 mkdir /cluster/data/fr2/bed/RepeatMasker cd /cluster/data/fr2/bed/RepeatMasker time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \ -buildDir=/cluster/data/fr2/bed/RepeatMasker fr2 > do.log 2>&1 ########################################################################### ## Chicken/Fugu chain/net swap - (DONE - 2007-03-12 - Hiram) mkdir /cluster/data/fr2/bed/blastz.galGal3.swap cd /cluster/data/fr2/bed/blastz.galGal3.swap time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \ /cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \ /cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net -swap > swap_net.log 2>&1 & # real 3m1.239s cat fb.fr2.chainGalGal3Link.txt # 36175581 bases of 393312790 (9.198%) in intersection ########################################################################### # Create liftover fr1 to fr2 (DONE - 2007-04-09 - Hiram) ssh kkstore02 mkdir /cluster/data/fr2/bed/blat.fr1 cd /cluster/data/fr2/bed/blat.fr1 time nice -n +19 doSameSpeciesLiftOver.pl fr1 fr2 -bigClusterHub pk \ -buildDir=/cluster/data/fr2/bed/blat.fr1 > do.log 2>&1 cp -p fr1ToFr2.over.chain.gz ../liftOver cd ../liftOver md5sum *.gz > ../../goldenPath/liftOver/md5sum.txt ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/fr2/liftOver ln -s /cluster/data/fr2/bed/liftOver/fr1ToFr2.over.chain.gz . ############################################################################## # SWAP DANRER5 BLASTZ RESULT TO CREATE DANRER5 CHAINS AND NETS TRACKS, # AXTNET, MAFNET AND ALIGNMENT DOWNLOADS # (DONE, 2007-09-19 and 2007-09-22, hartera) ssh kkstore02 mkdir /cluster/data/fr2/bed/blastz.swap.danRer5 cd /cluster/data/fr2/bed/blastz.swap.danRer5 # blastz parameters used to align fr2 as query to danRer5 as target: # BLASTZ_H=2500 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # Results for fr2 blastz on danRer5 are in: # /cluster/data/danRer5/bed/blastz.fr2.2007-09-18 time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -swap \ /cluster/data/danRer5/bed/blastz.fr2.2007-09-18/DEF \ >& swap.log & # 0.139u 0.098s 14:09.59 0.0% 0+0k 0+0io 0pf+0w ssh hgwdev cat \ /cluster/data/fr2/bed/blastz.danRer5.swap/fb.fr2.chainDanRer5Link.txt # 78259559 bases of 393312790 (19.898%) in intersection # look at coverage of ensGene CDS, there is no native RefSeqs # track for fugu, fr2. 
featureBits fr2 ensGene:cds chainDanRer5Link -enrichment # ensGene:cds 8.216%, chainDanRer5Link 19.898%, both 7.130%, cover 86.78%, # enrich 4.36x featureBits fr2 ensGene:cds chainDanRer4Link -enrichment # ensGene:cds 8.216%, chainDanRer4Link 20.585%, both 7.030%, cover 85.56%, # enrich 4.16x featureBits fr2 ensGene:cds netDanRer5 -enrichment # ensGene:cds 8.216%, netDanRer5 66.051%, both 7.845%, cover 95.48%, # enrich 1.45x featureBits fr2 ensGene:cds netDanRer4 -enrichment # ensGene:cds 8.216%, netDanRer4 65.374%, both 7.766%, cover 94.51%, # enrich 1.45x # clean up a little (2007-09-22, hartera) ssh kkstore02 cd /cluster/data/fr2/bed mv ./blastz.swap.danRer5/swap.log ./blastz.danRer5.swap rm -r blastz.swap.danRer5 ln -s blastz.danRer5.swap blastz.danRer5 ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram) ssh kkstore04 # not too important since everything moved to the hive screen # use a screen to control this job cd /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25 cat fb.oryLat2.chainFr2Link.txt # 180945351 bases of 700386597 (25.835%) in intersection mkdir /cluster/data/fr2/bed/blastz.oryLat2.swap cd /cluster/data/fr2/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 24m32.826s cat fb.fr2.chainOryLat2Link.txt # 153621820 bases of 393312790 (39.058%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. 
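    # Referring back to the danRer5 featureBits -enrichment numbers above:
    # assuming "cover" is both/ensGene:cds and "enrich" is cover divided by
    # the chain track's genome coverage, the reported 86.78% and 4.36x can be
    # reproduced from the printed percentages (illustrative arithmetic only):
    awk 'BEGIN { cover = 100*7.130/8.216; enrich = cover/19.898;
        printf "cover %.2f%%  enrich %.2fx\n", cover, enrich }'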
############################################################################
# LASTZ/CHAIN/NET tetNig2 swap (DONE - 2009-09-15,18 - Hiram)
    # original alignment to tetNig2:
    cd /hive/data/genomes/tetNig2/bed/lastzFr2.2009-09-15
    cat fb.tetNig2.chainFr2Link.txt
    # 243965150 bases of 302314788 (80.699%) in intersection

    # and the swap to here, fr2:
    mkdir /hive/data/genomes/fr2/bed/blastz.tetNig2.swap
    cd /hive/data/genomes/fr2/bed/blastz.tetNig2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/tetNig2/bed/lastzFr2.2009-09-15/DEF \
        -swap -tRepeats=windowmaskerSdust \
        -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        > swap.log 2>&1 &
    # real    21m56.576s
    cat fb.fr2.chainTetNig2Link.txt
    # 248984008 bases of 393312790 (63.304%) in intersection

############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and
loaded by a single Makefile.  This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# construct liftOver to fr3 (DONE - 2012-02-22 - Hiram)
    screen	# manage this longish running job in a screen
    mkdir /hive/data/genomes/fr2/bed/blat.fr3.2012-02-22
    cd /hive/data/genomes/fr2/bed/blat.fr3.2012-02-22
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
        -ooc=/scratch/data/fr2/11.ooc \
        -debug -dbHost=hgwdev -workhorse=hgwdev fr2 fr3 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
        -ooc=/scratch/data/fr2/11.ooc \
        -dbHost=hgwdev -workhorse=hgwdev fr2 fr3 > do.log 2>&1
    # real    17m59.258s
    # verify this file exists:
    #	/gbdb/fr2/liftOver/fr2ToFr3.over.chain.gz
    # and try out the conversion on genome-test from fr2 to fr3
############################################################################
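    # Hypothetical spot check of the new liftOver chain (not part of the
    # original log); assumes liftOver can read the gzipped chain directly,
    # otherwise zcat it to a temporary file first.  Lift the old default
    # position and confirm it maps onto fr3:
    echo -e "chrUn\t29305920\t29330760" > /tmp/fr2.defaultPos.bed
    liftOver /tmp/fr2.defaultPos.bed /gbdb/fr2/liftOver/fr2ToFr3.over.chain.gz \
        /tmp/fr3.defaultPos.bed /tmp/fr3.defaultPos.unmapped
    cat /tmp/fr3.defaultPos.bed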