# This file describes how we made the browser database on the mouse
# genome, June 2004 build. - Mm5
#
#
# NOTE: There is a new chrMT sequence in build 32
# >gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion
#
# Will have to beware of this NC_ contig in the processing since
# all previous builds had only NT_ contigs
#
# NOTE: The README_PREBUILD file for this assembly mentions several
# differences from the previous release (build 30):
# 1. seq_contig.md - new first line is a comment containing column names
#    Also, last two columns (group label and weight) have been swapped
#    Also, some lines have id with CONTIG: prepended, and upper-case
#    feature type (CONTIG)
# 2. contig.idmap - has an additional column "contig label"
# This required changing the jkStuff ncbi* utilities (7/1/03 KRR)
#
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2004-06-27 - Fan)
ssh kksilo
mkdir -p /cluster/store6/mm5/ncbi
ln -s /cluster/store6/mm5 /cluster/data
cd /cluster/data/mm5/ncbi
mkdir chrfasta contigfasta
ftp ftp.ncbi.nih.gov
# user hgpguest, password from /cse/faculty/kent/buildHg6.doc
cd mouse_33
prompt
bin
mget *
quit
gunzip *.agp.gz
# compress chrY.fa (at the NCBI site, this one file somehow was not compressed)
cd chrfasta
gzip chrY.fa
cd ..
# use chrMT.fa.gz from mm4 instead because its first line format is correct
cp -p /cluster/store6/mm4/ncbi/chrfasta/chrMT.fa.gz chrfasta
cp -p /cluster/store6/mm4/ncbi/contigfasta/chrMT.fa.gz contigfasta

# Fix the troubles caused by chrMT being released later, separately
# Fixed allcontig.agp
# add the last line of .../mm4/ncbi/allcontig.agp to allcontig.agp
# Fixed allrefcontig.chr.agp
# add the last line of .../mm4/ncbi/allrefcontig.chr.agp to allrefcontig.chr.agp
# Fix contig.idmap
cat contig.idmap chrMT/contig.idmap >new.idmap
mv new.idmap contig.idmap
# Fix seq_contig.md
# Edit seq_contig.md to add 3 lines (from mm4) in its middle before Un|...
10090 MT 0 0 + start -1 CONTIG C57BL/6J 10
10090 MT 1 16299 + NC_005089 GI:34538597 CONTIG C57BL/6J na
10090 MT 16299 16299 + end -2 CONTIG C57BL/6J 10
# ctg_coords, contig_overlaps.agp and sequence.inf not fixed.

# Check chromosome files (DONE - 2004-06-27 - Fan)
cd chrfasta
foreach f (*.fa.gz)
echo $f:r >> faSize.out
gunzip $f
/cluster/bin/i386/faSize $f:r >> faSize.out
echo $f:r done
end
/cluster/bin/i386/faSize *.fa >> faSize.out
grep "^>" *.fa > ../chrfasta.all.fa.headers
gzip *.fa
cd ../contigfasta
gunzip *.fa.gz
grep "^>" *.fa > ../contigfasta.all.fa.headers
gzip *.fa

# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
# (DONE - 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
gunzip ncbi/allrefcontig.chr.agp.gz
# splitFaIntoContigs doesn't do the right thing with agp lines arriving in a
# different order than the fasta chrom sequences, so split up the agp
# into one per chrom.
foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y MT Un)
mkdir $c
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./ncbi/allrefcontig.chr.agp \
> $c/chr$c.agp
gunzip -c ./ncbi/chrfasta/chr$c.fa.gz \
| perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' \
| splitFaIntoContigs $c/chr$c.agp \
stdin /cluster/data/mm5 -nSize=5000000
end
# gzip ncbi/chrfasta/chr*.fa

# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5/ncbi
gunzip seq_contig.md.gz
# reorder random contigs in allrefcontig agp file to match seq_contig.md
# this is required by the ncbiToRandomAgps scripts
# had to fix up ncbiToRandomAgps from previous use to match the
# lines better, and to do the MT/NC_ mitochondrion thing
mkdir /cluster/store6/mm5/jkStuff
# copy scripts used from previous trial mm5 build
cd /cluster/data/mm5
cp -p ~/mm50/jkStuff/* jkStuff
cd /cluster/data/mm5/ncbi
../jkStuff/ncbiFixAgp allrefcontig.chr.agp > \
allrefcontig.chr.ordered.agp
# Edit MANUALLY ../jkStuff/ncbiToRandomAgps, to change build 32 to build 33.
../jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.ordered.agp \
contig.idmap ..
# creating ../mm5/1/chr1_random.agp...
# ... creating ../mm5/Un/chrUn_random.agp...
# The chrUn_random.agp created by this is too large with the 5000
# gaps.  It will work with 1000 gaps, so fix up the chrUn_random agp:
../jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \
seq_contig.md allrefcontig.chr.ordered.agp contig.idmap ..

ssh kksilo
cd /cluster/data/mm5
foreach c (?{,?})
if (-e $c/chr${c}_random.ctg.agp) then
echo building $c/chr${c}_random.fa
gunzip -c ./ncbi/contigfasta/chr$c.fa.gz \
| perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \
> ./tmp.fa
agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \
$c/chr${c}_random.fa ./tmp.fa
rm tmp.fa
endif
end
# building 1/chr1_random.fa
# ... etc ...
# building Un/chrUn_random.fa
# Writing 102265694 bases to Un/chrUn_random.fa
# Clean these up to avoid confusion later... they're easily rebuilt
# with the ncbiToRandomAgps script above
rm ?/*.ctg.agp ??/*.ctg.agp

# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (DONE 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
foreach c (?{,?})
if (-e $c/chr${c}_random.agp) then
splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \
-nSize=5000000
mkdir -p $c/lift
mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst
mv ${c}_random/lift/ordered.lft $c/lift/random.lft
mv ${c}_random/lift/ordered.lst $c/lift/random.lst
rmdir ${c}_random/lift
rm ${c}_random/chr${c}_random.{agp,fa}
mv ${c}_random/* $c
rmdir ${c}_random
endif
end
# This has a lot of output.  It is difficult to see if anything
# goes wrong.

# Fixup chrMT name to be chrM (DONE - 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
mv MT MT.ncbi
mkdir M
mkdir M/chrM_1
mkdir M/lift
cd MT.ncbi
bash
find . -type f | while read FN
do
NF=`echo $FN | sed -e "s/MT/M/g"`
sed -e "s/chrMT/chrM/g" $FN > ../M/$NF
done

# MAKE LIFTALL.LFT (DONE - 2004-06-27 - Fan)
cd /cluster/data/mm5
cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft

# 7:40 PM 6/27/04, used dark blue color above.
# Now changed to use dark pink color for things done.

# CREATING DATABASE (DONE 2004-06-27 - Fan)
# First, clean out mm5 tables built by the previous trial build.
# Rename all mm5.* tables to mm5_old4.*,
# then drop database mm5

o - Create the database.
ssh hgwdev
hgsql -e 'create database mm5;' ''
# if you need to delete this database:  !!! WILL DELETE EVERYTHING !!!
# hgsql -e "drop database mm5;" mm5

o - Use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
Filesystem Size Used Avail Use% Mounted on
/dev/sdc1 1.8T 383G 1.3T 24% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2004-06-27 - Fan)
# Use any of the newest databases to ensure that the organization
# of the grp table is up to date
ssh hgwdev
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" mm5

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE - 2004-06-27 - Fan)
# Create (unmasked) nib files
ssh kksilo
cd /cluster/data/mm5
mkdir -p unmaskedNib
foreach f (?{,?}/chr?{,?}{,_random}.fa)
echo $f:t:r
faToNib $f unmaskedNib/$f:t:r.nib
end

# Create symbolic links from /gbdb/mm5/nib to real nib files
# These unmasked nib files are temporary, just to get the browser
# up and running immediately.  After the masking is done and masked
# sequence is created, these nibs will be replaced with the masked
# nibs
ssh hgwdev
mkdir -p /gbdb/mm5/nib
cd /gbdb/mm5/nib
ln -s /cluster/data/mm5/unmaskedNib/chr*.nib .

# Load /gbdb nib paths into database and save size info.
ssh hgwdev
cd /cluster/data/mm5
hgsql mm5 < ~/kent/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib mm5 /gbdb/mm5/nib ?{,?}/chr?{,?}{,_random}.fa
# 3164952073 total bases
# NOTE: mm4 was 2952612207, an increase of 212 Mb (~7.2%)
hgsql -N -e "select chrom,size from chromInfo;" mm5 > chrom.sizes
# check the resulting file chrom.sizes

# Store o+o info in database.
cd /cluster/data/mm5/ncbi
gunzip sequence.inf
cd /cluster/data/mm5
ln -s ncbi ffa
# remove so as not to confuse hgGoldGap -- they are easily regenerated
rm */chr*.ctg.agp
# to undo/redo:
# jkStuff/dropSplitTable.csh gap
# jkStuff/dropSplitTable.csh gold
/cluster/bin/i386/hgGoldGapGl mm5 /cluster/data/mm5 .
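# (Optional sanity check, not part of the original procedure: confirm that the
# gold and gap tables together tile each chromosome.  This assumes the split
# chrN_gold / chrN_gap table layout that hgGoldGapGl creates here.)
hgsql -N -e "select size from chromInfo where chrom='chr1';" mm5
hgsql -N -e "select max(chromEnd) from chr1_gold;" mm5
hgsql -N -e "select max(chromEnd) from chr1_gap;" mm5
# The larger of the two max(chromEnd) values should equal the chromInfo size;
# repeat per chromosome if anything looks off.  The featureBits numbers below
# give the genome-wide view of the same thing.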
featureBits mm5 gold # 2615483787 bases of 2615483787 (100.000%) in intersection featureBits mm4 gold # 2627444668 bases of 2627444668 (100.000%) in intersection featureBits mm5 gap # 549468286 bases of 2615483787 (21.008%) in intersection featureBits mm4 gap # 325167539 bases of 2627444668 (12.376%) in intersection featureBits mm3 gap # 202319873 bases of 2505900260 (8.074%) in intersection # Make and load GC percent table (DONE - 2004-06-27 - Fan) # NOT REQUIRED, been replaced by gc5Base procedure below ssh hgwdev mkdir -p /cluster/data/mm5/bed/gcPercent cd /cluster/data/mm5/bed/gcPercent hgsql mm5 < ~/kent/src/hg/lib/gcPercent.sql hgGcPercent mm5 ../../unmaskedNib # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR MM5 (DONE - 2004-06-27 - Fan) # using the Mm3 position blatted onto Mm5: # Enter mm5 into hgcentraltest.dbDb so test browser knows about it: hgsql -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, defaultPos, \ active, orderKey, genome, scientificName, htmlPath, \ hgNearOk, hgPbOk, sourceName) \ VALUES("mm5", "May 2004", "/gbdb/mm5/nib", "Mouse", \ "chr6:121658238-121674165", \ 1, 20, "Mouse", "Mus musculus", "/gbdb/mm5/html/description.html",\ 0, 0, "NCBI Build 33");' \ -h genome-testdb hgcentraltest # If you need to delete that entry: hgsql -e 'delete from dbDb where name="mm5";' -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add mm5 in all the right places and do make update make alpha cvs commit makefile # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR MM5 (DONE - 2004-07-14 Fan) ssh hgwdev # Make one big 2bit file as well, and make a link to it in # /gbdb/mm5/nib because hgBlat looks there: cd /cluster/data/mm5 faToTwoBit */chr*.fa mm5.2bit ln -s /cluster/data/mm5/mm5.2bit /gbdb/mm5/nib/ hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm5", "snort", "17778", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm5", "snort", "17779", "0", "1");' \ -h genome-testdb hgcentraltest # REPEAT MASKING (Working on 2004-06-27 Fan) # TRF simpleRepeat below can be run at the same time # Split contigs, run RepeatMasker, lift results # * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make # RepeatMasker runs manageable on the cluster ==> results need lifting. # * For the NCBI assembly we repeat mask on the sensitive mode setting # (RepeatMasker -m -s -ali) #- Split contigs into 500kb chunks: ssh kksilo cd /cluster/data/mm5 foreach d ( */chr?{,?}{,_random}_?{,?} ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. end # ... # 11 pieces of 11 written # 1 pieces of 1 written # ... #- Make the run directory and job list: cd /cluster/data/mm5 cat << '_EOF_' > jkStuff/RMMouse #!/bin/csh -fe cd $1 pushd . 
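# Stage the 500 kb chunk on the compute node's local /tmp, run RepeatMasker
# on it there, then copy the .out (plus .align, .tbl and .cat when present)
# back to the contig directory and clean up the temporary space.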
/bin/mkdir -p /tmp/mm5/$2 /bin/cp $2 /tmp/mm5/$2 cd /tmp/mm5/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species mus $2 popd /bin/cp /tmp/mm5/$2/$2.out ./ if (-e /tmp/mm5/$2/$2.align) /bin/cp /tmp/mm5/$2/$2.align ./ if (-e /tmp/mm5/$2/$2.tbl) /bin/cp /tmp/mm5/$2/$2.tbl ./ if (-e /tmp/mm5/$2/$2.cat) /bin/cp /tmp/mm5/$2/$2.cat ./ /bin/rm -fr /tmp/mm5/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/mm5/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/mm5 '_EOF_' chmod +x jkStuff/RMMouse mkdir -p RMRun rm -f RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) foreach f ( $d/chr*_?{,?}_?{,?}.fa ) set f = $f:t echo /cluster/data/mm5/jkStuff/RMMouse \ /cluster/data/mm5/$d $f \ '{'check out line+ /cluster/data/mm5/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd /cluster/data/mm5/RMRun para create RMJobs para try, para check, para check, para push, para check,... [kk:RMRun> para check 6885 jobs in batch 8 jobs (including everybody's) in Parasol queue. Checking finished jobs. ranOk: 6885 total jobs in batch: 6885 [kk:RMRun> para time 6885 jobs in batch 8 jobs (including everybody's) in Parasol queue. Checking finished jobs Completed: 6885 of 6885 jobs CPU time in finished jobs: 40084305s 668071.74m 11134.53h 463.94d 1.271 y IO & Wait Time: 122589s 2043.16m 34.05h 1.42d 0.004 y Average job time: 5840s 97.33m 1.62h 0.07d Longest job: 9804s 163.40m 2.72h 0.11d Submission to last job: 46771s 779.52m 12.99h 0.54d # Done 11:57 AM 6/28/04 #- Lift up the split-contig .out's to contig-level .out's ssh kksilo cd /cluster/data/mm5 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up the contig-level .out's to chr-level ssh kksilo cd /cluster/data/mm5 ./jkStuff/liftOut5.csh # This one error is OK # Can not find Un/lift/ordered.lft . #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/mm5 # to redo: # ./jkStuff/dropSplitTable.csh rmsk # make sure there's no chrUn -- rm Un/chrUn.fa.out hgLoadOut mm5 ?/*.fa.out ??/*.fa.out # VERIFY REPEATMASKER RESULTS (DONE - 2004-06-28 Fan) # Run featureBits on mm5 and on a comparable genome build, and compare: ssh hgwdev featureBits mm5 rmsk #1137310280 bases of 2615483787 (43.484%) in intersection #featureBits mm4 rmsk 1130883581 bases of 2627444668 (43.041%) in intersection #featureBits mm3 rmsk 1080265553 bases of 2505900260 (43.109%) in intersection #cd /cluster/data/mm5 #awk '{print $1}' chrom.sizes | sed -e "s/chr//" | grep -v random > chrom.lst # SIMPLE REPEAT TRACK (DONE - 2004-06-29 Fan) # TRF can be run in parallel with RepeatMasker on the file server # since it doesn't require masked input sequence. ssh kksilo mkdir /cluster/data/mm5/bed/simpleRepeat cd /cluster/data/mm5/bed/simpleRepeat mkdir trf rm -f jobs.csh echo '#\!/bin/csh -fe' > jobs.csh # create job list of 5MB chunks foreach f \ (/cluster/data/mm5/?{,?}/chr?{,?}_[0-9]*/chr?{,?}_?{,?}.fa \ /cluster/data/mm5/?{,?}/chr*_random_?{,?}/chr*_random_?{,?}.fa) set fout = $f:t:r.bed echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end chmod +x jobs.csh wc jobs.csh # 640 3836 90839 jobs.csh ./jobs.csh >&! jobs.log & # in bash: ./jobs.csh > jobs.log 2>&1 & tail -f jobs.log # Done 3:07 PM 6/29/04, took about 6 hours. 
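# (Optional check, not in the original run log: before lifting, confirm that
# every 5 MB chunk listed in jobs.csh produced a bed file under trf/.)
grep -c trfBig jobs.csh
ls trf/*.bed | wc -l
# The two counts should match; a shortfall points at a chunk whose trfBig job
# needs to be rerun.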
# When job is done lift output files liftUp simpleRepeat.bed /cluster/data/mm5/jkStuff/liftAll.lft warn trf/*.bed # Load into the database ssh hgwdev cd /cluster/data/mm5/bed/simpleRepeat hgLoadBed mm5 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 1150615 elements of size 16 featureBits mm5 simpleRepeat # 81414259 bases of 2615483787 (3.113%) in intersection featureBits mm4 simpleRepeat # 82600648 bases of 2627444668 (3.144%) in intersection featureBits mm3 simpleRepeat # 75457193 bases of 2505900260 (3.011%) in intersection # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-06-29 - Fan) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kksilo cd /cluster/data/mm5/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords cd /cluster/data/mm5 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` else echo "WARNING NO FILE: $c/lift/ordered.lst" endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # NOTE: ignore warning about non-existent Un/Lift/ordered.lift # since there is no chrUn # MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF # (Working on - 2004-06-29 Fan) ssh kksilo cd /cluster/data/mm5 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.csh >&! maskFa.out & # bash: ./jkStuff/makeFaMasked.csh > maskFa.out 2>&1 & tail -100f maskFa.out #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.csh Edited ./jkStuff/makeNib.csh to comment out "if ..." and "endif" as below: #!/bin/csh -fe mkdir -p nib mixedNib maskedNib foreach i (?{,?}) cd $i # foreach j (chr$i{,_random}.fa) foreach j (*.fa) # if (-e "${j}") set r = $j:r /cluster/bin/i386/faToNib $j ../nib/$r.nib /cluster/bin/i386/faToNib -softMask $j ../mixedNib/$r.nib /cluster/bin/i386/faToNib -hardMask $j ../maskedNib/$r.nib # endif echo done $j end cd .. 
end #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.csh # ignore complaints about missing chrUn # Redo symbolic links from /gbdb/mm5/nib to # mixed (RM and TRF) soft-masked nib files ssh hgwdev rm -fr /gbdb/mm5/nib/* ln -s /cluster/data/mm5/mixedNib/chr*.nib /gbdb/mm5/nib # Copy data to /cluster/bluearc for cluster runs ssh kksilo # masked contigs rm -fr /cluster/bluearc/scratch/mus/mm5/trfFa mkdir -p /cluster/bluearc/scratch/mus/mm5/trfFa cp -p /cluster/data/mm5/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ /cluster/bluearc/scratch/mus/mm5/trfFa # masked chrom nibs cd /cluster/data/mm5 rm -fr /cluster/bluearc/scratch/mus/mm5/softNib mkdir -p /cluster/bluearc/scratch/mus/mm5/softNib cp -p mixedNib/chr*.nib /cluster/bluearc/scratch/mus/mm5/softNib rm -fr /cluster/bluearc/scratch/mus/mm5/hardNib mkdir -p /cluster/bluearc/scratch/mus/mm5/hardNib cp -p maskedNib/chr*.nib /cluster/bluearc/scratch/mus/mm5/hardNib # fasta files rm -fr /cluster/bluearc/scratch/mus/mm5/fasta mkdir -p /cluster/bluearc/scratch/mus/mm5/fasta cp -p ?/*.fa ??/*.fa /cluster/bluearc/scratch/mus/mm5/fasta # RepeatMasker *.out files rm -rf /cluster/bluearc/scratch/mus/mm5/rmsk mkdir -p /cluster/bluearc/scratch/mus/mm5/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /cluster/bluearc/scratch/mus/mm5/rmsk # lift file, for mrna processing cp -p jkStuff/liftAll.lft /cluster/bluearc/scratch/mus/mm5 #above was done 6/29/04 4:50PM # also copy to iservers ssh kkr1u00 #cd ~/mm5 cd /cluster/bluearc/scratch/mus/mm5 mkdir /iscratch/i/mus/mm5 cp -p liftAll.lft /iscratch/i/mus/mm5 mkdir -p /iscratch/i/mus/mm5/softNib cp -p /cluster/bluearc/scratch/mus/mm5/softNib/chr*.nib /iscratch/i/mus/mm5/softNib mkdir -p /iscratch/i/mus/mm5/trfFa cd /cluster/store6/mm5 cp ?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /cluster/bluearc/scratch/mus/mm5/trfFa /cluster/bin/scripts/iSync ssh kkr1u00 mkdir /iscratch/i/mus/mm5 cd /iscratch/i/mus rsync -arlv /cluster/bluearc/scratch/mus/mm5 . #wrote 8660800915 bytes read 15380 bytes 17729409.00 bytes/sec #total size is 10242205742 speedup is 1.18 cd /iserver/kkr1u00/i/mus/mm5 mv trfFa maskedContigs cd /cluster/bluearc/scratch/mus/mm5 mv trfFa maskedContigs # PREPARE CLUSTER FOR BLASTZ RUN (DONE - 2004-06-29 - Fan) ssh kksilo mkdir -p /cluster/bluearc/scratch/mus/mm5/rmsk.spec cd /cluster/bluearc/scratch/mus/mm5/rmsk.spec ln -s ../rmsk/*.out . # NOTE: DON't leave indentations in the script below. cat << '_EOF_' > runArian.sh #!/bin/sh for FN in *.out do echo ${FN} /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \ ${FN} -query mouse -comp human -comp rat done '_EOF_' chmod +x runArian.sh ./runArian.sh cd /cluster/bluearc/scratch/mus/mm5 mkdir linSpecRep.notInHuman mkdir linSpecRep.notInRat foreach f (rmsk.spec/*.out_hum_rat) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInHuman/$base.out.spec end foreach f (rmsk.spec/*.out_hum_rat) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 2 $f > \ linSpecRep.notInRat/$base.out.spec end cp rmsk.spec /iscratch/i/mus/mm5 -Rp cp linSpecRep.notInRat /iscratch/i/mus/mm5 -Rp cp linSpecRep.notInHuman /iscratch/i/mus/mm5 -Rp /cluster/bin/scripts/iSync # Request rsync /cluster/bluearc/scratch/mus/mm5 to the KiloKluster # GC5BASE WIGGLE TRACK (DONE - 2004-06-24 - Hiram) # This previously was a script that ran through each nib. # Recently transformed into a mini cluster run. 
ssh kki mkdir /cluster/data/mm5/bed/gc5Base cd /cluster/data/mm5/bed/gc5Base mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRun.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \ /cluster/data/mm5/mixedNib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigAsciiToBinary -dataSpan=5 -chrom=${chr} \ -wibFile=wigData5/gc5Base_${chrom} \ -name=${chrom} stdin 2> dataLimits5/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRun.sh ls /cluster/data/mm5/mixedNib > nibList cat << '_EOF_' > gsub #LOOP ./kkRun.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsub jobList para create jobList para try, check, ... etc # Completed: 43 of 43 jobs # CPU time in finished jobs: 4969s 82.81m 1.38h 0.06d 0.000 y # IO & Wait Time: 611s 10.19m 0.17h 0.01d 0.000 y # Average job time: 130s 2.16m 0.04h 0.00d # Longest job: 370s 6.17m 0.10h 0.00d # Submission to last job: 598s 9.97m 0.17h 0.01d # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/mm5/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base mm5 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/mm5/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/mm5/wib/gc5Base # And then the zoomed data view ssh kki cd /cluster/data/mm5/bed/gc5Base mkdir wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRunZoom.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \ /cluster/data/mm5/mixedNib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \ -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \ -name=${chrom} stdin 2> dataLimits5_1K/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRunZoom.sh cat << '_EOF_' > gsubZoom #LOOP ./kkRunZoom.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsubZoom jobListZoom para create jobListZoom para try ... check ... etc ... # Completed: 43 of 43 jobs # CPU time in finished jobs: 4878s 81.29m 1.35h 0.06d 0.000 y # IO & Wait Time: 488s 8.14m 0.14h 0.01d 0.000 y # Average job time: 125s 2.08m 0.03h 0.00d # Longest job: 378s 6.30m 0.10h 0.00d # Submission to last job: 665s 11.08m 0.18h 0.01d # Then load these .wig files into the same database as above ssh hgwdev cd /cluster/data/mm5/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base \ -oldTable mm5 gc5Base wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm5/wib/gc5Base # GC5BASE WIGGLE TRACK (DONE - 2004-07-01 - Hiram) # This previously was a script that ran through each nib. # Recently transformed into a mini cluster run. 
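# Each cluster job below computes 5-base-window GC percent for one chromosome
# with hgGcPercent on the soft-masked nibs, reformats the output to
# position/value pairs with awk, and packs it into a binary .wib file with
# wigAsciiToBinary; the zoomed view additionally pipes through wigZoom to
# produce the 1000-base-span data.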
ssh kki mkdir /cluster/data/mm5/bed/gc5Base cd /cluster/data/mm5/bed/gc5Base mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRun.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \ /cluster/data/mm5/mixedNib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigAsciiToBinary -dataSpan=5 -chrom=${chr} \ -wibFile=wigData5/gc5Base_${chrom} \ -name=${chrom} stdin 2> dataLimits5/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRun.sh ls /cluster/data/mm5/mixedNib > nibList cat << '_EOF_' > gsub #LOOP ./kkRun.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsub jobList para create jobList para try, check, ... etc # Completed: 43 of 43 jobs # CPU time in finished jobs: 4857s 80.94m 1.35h 0.06d 0.000 y # IO & Wait Time: 121s 2.02m 0.03h 0.00d 0.000 y # Average job time: 116s 1.93m 0.03h 0.00d # Longest job: 335s 5.58m 0.09h 0.00d # Submission to last job: 516s 8.60m 0.14h 0.01d # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/mm5/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base mm5 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/mm5/wib mkdir /gbdb/mm5/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/mm5/wib/gc5Base # And then the zoomed data view ssh kki cd /cluster/data/mm5/bed/gc5Base mkdir wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRunZoom.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \ /cluster/data/mm5/mixedNib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \ -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \ -name=${chrom} stdin 2> dataLimits5_1K/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRunZoom.sh cat << '_EOF_' > gsubZoom #LOOP ./kkRunZoom.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsubZoom jobListZoom para create jobListZoom para try ... check ... etc ... # Completed: 43 of 43 jobs # CPU time in finished jobs: 4819s 80.31m 1.34h 0.06d 0.000 y # IO & Wait Time: 82s 1.37m 0.02h 0.00d 0.000 y # Average job time: 114s 1.90m 0.03h 0.00d # Longest job: 336s 5.60m 0.09h 0.00d # Submission to last job: 500s 8.33m 0.14h 0.01d # Then load these .wig files into the same database as above ssh hgwdev cd /cluster/data/mm5/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base \ -oldTable mm5 gc5Base wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm5/wib/gc5Base # BLASTZ HG17 (WORKING - 2004-07-06 - Hiram) ssh kk mkdir -p /cluster/data/mm5/bed/blastz.hg17.2004-07-06 cd /cluster/data/mm5/bed ln -s blastz.hg17.2004-07-06 blastz.hg17 cd blastz.hg17 cat << '_EOF_' > DEF # mouse vs. 
human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
# not used
SEQ1_RMSK=/scratch/mus/mm5/rmsk
# not used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# RMSK not currently used
SEQ2_RMSK=
# FLAG not currently used
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.hg17
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy

# prepare first cluster run
ssh kk
cd /cluster/data/mm5/bed/blastz.hg17
# OK to use this script here, it is generic, works anywhere
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 46717 of 46717 jobs
# CPU time in finished jobs: 16171136s 269518.93m 4491.98h 187.17d 0.513 y
# IO & Wait Time: 534501s 8908.35m 148.47h 6.19d 0.017 y
# Average job time: 358s 5.96m 0.10h 0.00d
# Longest job: 5263s 87.72m 1.46h 0.06d
# Submission to last job: 30066s 501.10m 8.35h 0.35d

# Second cluster run lifts the raw alignments into the lav dir; running it on
# the big cluster would bring the file server to its knees.  Run this on the
# small cluster.
ssh kki
cd /cluster/data/mm5/bed/blastz.hg17
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 2186s 36.43m 0.61h 0.03d 0.000 y
# IO & Wait Time: 1804s 30.07m 0.50h 0.02d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest job: 82s 1.37m 0.02h 0.00d
# Submission to last job: 3895s 64.92m 1.08h 0.05d

# Third cluster run to convert lav's to axt's
# Does not work on kki since /scratch on the iservers is not the
# same as /scratch on the other clusters.
ssh kk
cd /cluster/data/mm5/bed/blastz.hg17
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 2099s 34.98m 0.58h 0.02d 0.000 y
# IO & Wait Time: 6862s 114.37m 1.91h 0.08d 0.000 y
# Average job time: 208s 3.47m 0.06h 0.00d
# Longest job: 1276s 21.27m 0.35h 0.01d
# Submission to last job: 1291s 21.52m 0.36h 0.01d

# translate sorted axt files into psl
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17
mkdir -p pslChrom
set tbl = "blastzHg17"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# This takes more than an hour.  You can shorten this by changing
# that command to a simple echo, putting the results into a file,
# splitting the file into four parts and running the four files as shell
# scripts on kksilo to have four processes running at the same
# time.  Load on kksilo gets up to about 20 which is reasonable.

# Load database tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/pslChrom
bash   # for tcsh users
for F in chr*_blastzHg17.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${F}
echo "${F} done"
done
# this is a 40 minute job
# exit bash if you are tcsh

# featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of
# memory.
But if you reset your ~/.hg.conf to use the read-only # user and contact the hgwdev host, then use the x86_64 featureBits # featureBits mm5 blastzHg17 # 1057836001 bases of 2615483787 (40.445%) in intersection # featureBits mm4 blastzHg16 # 1068995521 bases of 2627444668 (40.686%) in intersection # CHAIN MM5 BLASTZ (DONE - 2004-07-02 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster ssh kki mkdir -p /cluster/data/mm5/bed/blastz.hg17/axtChain/run1 cd /cluster/data/mm5/bed/blastz.hg17/axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.hg17/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # May need -minScore=5000 for all chroms if chr19 won't finish on kolossus cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /iscratch/i/mus/mm5/softNib \ /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 46 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... # Completed: 43 of 43 jobs # CPU time in finished jobs: 5354s 89.23m 1.49h 0.06d 0.000 y # IO & Wait Time: 10543s 175.72m 2.93h 0.12d 0.000 y # Average job time: 370s 6.16m 0.10h 0.00d # Longest job: 1694s 28.23m 0.47h 0.02d # Submission to last job: 1694s 28.23m 0.47h 0.02d # now on the file server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 4m53.428s # user 4m3.040s # sys 0m29.440s time chainSplit chain all.chain # real 4m34.674s # user 3m38.370s # sys 0m29.990s # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtChain/chain bash # for tcsh users for I in *.chain do c=${I/.chain/} hgLoadChain mm5 ${c}_chainHg17 $I echo done $c done # exit bash if you are tcsh # This is a 50 minute job # featureBits mm5 chainHg17 # 2507720521 bases of 2615483787 (95.880%) in intersection # featureBits mm4 chainHg16 # 2558968088 bases of 2627444668 (97.394%) in intersection # NET MM5 (WORKING - 2004-07-02 - Hiram) ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain mkdir preNet cd chain bash # for tcsh users for I in *.chain do echo preNetting $I /cluster/bin/i386/chainPreNet $I /cluster/data/mm5/chrom.sizes \ /cluster/data/hg17/chrom.sizes ../preNet/$I done # exit bash if you are tcsh # 7 minute job cd .. mkdir n1 cd preNet bash # for tcsh users for I in *.chain do n=${I/.chain/}.net echo primary netting $I $n /cluster/bin/i386/chainNet $I -minSpace=1 /cluster/data/mm5/chrom.sizes \ /cluster/data/hg17/chrom.sizes ../n1/$n /dev/null done # exit bash if you are tcsh # 5 minute job cd .. 
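# (Optional sanity check, not part of the original pipeline: confirm one
# pre-netted chain and one net per chromosome before the netSyntenic step.)
ls chain | wc -l
ls preNet | wc -l
ls n1 | wc -l
# All three counts should agree (43 chromosome files for mm5).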
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 2546110464, utime 16327 s/100, stime 3546 ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtChain time netClass hNoClass.net mm5 hg17 human.net \ -tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse # real 9m45.271s # user 6m47.170s # sys 1m20.440s # If things look good do ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn human.net > humanSyn.net # real 12m3.701s # user 8m44.180s # sys 1m1.610s # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtChain netFilter -minGap=10 human.net | hgLoadNet mm5 netHg17 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet mm5 syntenyNetHg17 stdin # check results # featureBits mm5 netHg17 # 2504056038 bases of 2615483787 (95.740%) in intersection # featureBits mm4 netHg16 # 2553137690 bases of 2627444668 (97.172%) in intersection # featureBits mm5 syntenyNetHg17 # 2460442823 bases of 2615483787 (94.072%) in intersection # featureBits mm4 syntenyNetHg16 # 2495783103 bases of 2627444668 (94.989%) in intersection # Add entries for net and chain to mouse/hg17 trackDb # make net ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain mkdir humanNet time netSplit human.net humanNet # real 4m46.190s # user 3m27.740s # sys 0m38.900s # extract axt's from net, and convert to maf's ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain mkdir ../axtNet ../mafNet cat > makeMaf.csh << '_EOF_' #!/bin/csh -ef foreach f (humanNet/chr*.net) set c = $f:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt humanNet/$c.net chain/$c.chain \ /cluster/data/mm5/nib /cluster/data/hg17/nib stdout | \ axtSort stdin ../axtNet/$c.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/mm5/chrom.sizes /cluster/data/hg17/chrom.sizes \ ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=hg17. echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf" end '_EOF_' # << for emacs csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log # real 39m53.316s # user 20m2.530s # sys 4m40.120s ssh hgwdev mkdir /cluster/data/mm5/bed/blastz.hg17/axtBest cd /cluster/data/mm5/bed/blastz.hg17/axtBest ln -s ../axtNet/chr*.axt . # copy net axt's to download area ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet gzip *.axt XXX - running 2004-07-13 14;18 # add README.txt file to dir (use previous assembly's copy as template) # 32 minute gzip # Convert those axt files to psl ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo -n "processing $c.axt -> ${c}_blastzBesthg17.psl ..." 
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestHg17.psl echo "Done" end # Load tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/pslBest for I in chr*BestHg17.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done ${I}" done # check results # featureBits mm5 blastzBestHg17 # 1020692679 bases of 2615483787 (39.025%) in intersection # featureBits mm4 blastzBestHg16 # 1030510540 bases of 2627444668 (39.221%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm5/axtBest/Hg17 cd /gbdb/mm5/axtBest/Hg17 ln -s /cluster/data/mm5/bed/blastz.hg17/axtNet/chr*.axt . cd /cluster/data/mm5/bed/blastz.hg17/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/mm5/axtBest/Hg17/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg17','Blastz Best in Genome','$chr','$f');" \ >>! axtInfoInserts.sql end hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. hgsql mm5 < axtInfoInserts.sql # MM5 TO CANFAM1 LIFTOVER CHAIN (DONE 1/7/05 Andy) ssh kolossus cd /cluster/data/mm5/bed/blastz.canFam1/axtChain mkdir net netSplit dog.net net mkdir over for file in chain/*.chain; do chrom=`basename $file .chain` netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over cat over/$chrom.over >> /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain done rm -rf over/ ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver cp /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain . gzip mm5ToCanFam1.chain mkdir -p /gbdb/mm5/liftOver ln -s /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain /gbdb/mm5/liftOver/mm5ToCanFam1.over.chain hgAddLiftOverChain -multiple mm5 canFam1 # ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE Sept. 8th, 2004, Heather) ssh hgwdev cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/all.chain.gz \ /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.chain.gz cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/human.net.gz \ /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.net.gz cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17 md5sum *.gz */*.gz > md5sum.txt # Update the README.txt # LIFTOVER CHAIN TO MM6 (DONE 4/20/2005 Andy) ssh kkstore cd /cluster/data/mm6 mkdir liftSplits/ cat << _EOF_ > split.csh #!/bin/tcsh set liftDir = /cluster/data/mm6/liftSplits cd /cluster/data/mm6 foreach n (\`ls ?{,?}/*.fa\`) set d = \$n:h set c = \$n:t:r echo \$c faSplit -lift=\$liftDir/lift/\$c.lft size /cluster/data/mm6/\$d/\$c.fa -oneFile 3000 \$liftDir/split/\$c end _EOF_ chmod +x split.csh ./split.csh # kkstore not mounting /panasas ... weird. ssh hgwdev cd /cluster/data/mm6 cp -r liftSplits/ /panasas/store/mm6 ssh kk cd /cluster/data/mm5 makeLoChain-align mm5 /scratch/mus/mm5/softNib \ mm6 /panasas/store/mm6/liftSplits/split # Created parasol job in bed/blat.mm6.2005-04-20/run cd bed/blat.mm6.2005-04-20/run/ para create spec para push # para time was complicated by the fact I redid some hippos (mostly chrUn_random # alignments) on kk9. Basically, it took about a day. # In the end, the chrUn_random vs. chrUn_random just took wayyyyyy too long. # Later, if a more rigorous chain file is desired, it can be made after rerunning # that blat. # Lifting ssh kksilo cd /cluster/data/mm5/bed/blat.mm6 makeLoChain-lift mm5 mm6 /panasas/store/mm6/liftSplits/lift \ > lift.log & tail -f lift.log # OK so I remember this problem with makeLoChain-lift: it always stops with chr1. # I'll just do it manually. 
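# (The loop below does what makeLoChain-lift would have done: for each mm6
# chromosome it lifts the split-sequence psls in raw/ up to whole-chromosome
# mm6 coordinates with the liftSplits lift files, writing one psl per
# chromosome into ../psl/.)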
cd raw/ for nib in `ls /cluster/data/mm6/nib`; do chrom=${nib%.nib} echo $chrom liftUp -pslQ ../psl/${chrom}.psl /panasas/store/mm6/liftSplits/lift/${chrom}.lft warn chr*_${chrom}.psl echo done $chrom done ssh kk9 cd /cluster/data/mm5/bed ln -s blat.mm6.2005-04-20 blat.mm6.2005-04-22 makeLoChain-chain mm5 /cluster/data/mm5/nib mm6 /cluster/data/mm6/nib cd /cluster/data/mm5/bed/blat.mm5.2005-02-08/chainRun para try para check para push para time #Completed: 40 of 40 jobs #CPU time in finished jobs: 27315s 455.25m 7.59h 0.32d 0.001 y #IO & Wait Time: 67093s 1118.22m 18.64h 0.78d 0.002 y #Average job time: 2360s 39.34m 0.66h 0.03d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 11656s 194.27m 3.24h 0.13d #Submission to last job: 31329s 522.15m 8.70h 0.36d # That looks weird but I think it was because 8 jobs crashed because there was no disk space. # I freed up some space but then there wasn't much room for the netting stage. # It crashed twice when I tried it using the script makeLoChain-net after the # chainMergeSort/split. I figured out that it needed more memory. So I ran it manually on # kolossus ssh kolossus mkdir -p /tmp/andy cd /tmp/andy cp -r /cluster/data/mm5/bed/blat.mm6/chainRaw . rm -rf /cluster/data/mm5/bed/blat.mm6/chainRaw mkdir chain chainMergeSort chainRaw/*.chain | chainSplit chain stdin mkdir net over cd chain for c in *.chain; do echo ${c%.chain}; chainNet $c /cluster/data/mm5/chrom.sizes \ /cluster/data/mm6/chrom.sizes ../net/${c%.chain}.net /dev/null echo done $c done for chain in *; do c=${chain%.chain} netChainSubset ../net/$c.net $chain ../over/$c.over done cd ../over/ cat * >> ../mm5ToMm6.chain cd ../ cp mm5ToMm6.chain /cluster/data/mm5/bed/liftOver/ cd /cluster/data/mm5/bed/liftOver mv mm5ToMm6.chain mm5ToMm6.over.chain ssh hgwdev ln -s /cluster/data/mm5/bed/liftOver/mm5ToMm6.over.chain /gbdb/mm5/liftOver/mm5ToMm6.over.chain hgAddLiftOverChain mm5 mm6 /gbdb/mm5/liftOver/mm5ToMm6.over.chain cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver cp /gbdb/mm5/liftOver/mm5ToMm6.over.chain . gzip mm5ToMm6.over.chain # MAKING HUMAN SYNTENY (DONE - 2004-07-13 - Hiram) ssh hgwdev mkdir /cluster/data/mm5/bed/syntenyHg17 cd /cluster/data/mm5/bed/syntenyHg17 # Copy all the needed scripts from /cluster/data/hg16/bed/syntenyRn3 cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl . ./syntenicBest.pl -db=mm5 -table=blastzBestHg17 > synBest.out 2>&1 ./smooth.pl > smooth.out 2>&1 ./joinsmallgaps.pl > joingaps.out 2>&1 ./fillgap.pl -db=mm5 -table=blastzBestHg17 > fillgap.out 2>&1 ./synteny2bed.pl > syn2bed.out 2>&1 # The five commands above # real 168m43.627s # user 0m18.680s # sys 0m4.990s # Used to load this in syntenyHg17, but that type is misleading to # the table browser and fails the checkTableCoords check. 
# Better to use this ensRatMusHom type: # Need a new name here for the Hg17 to not conflict with the # others sed -e 's/ensPhusionBlast/ensRatMusHg17/g' \ $HOME/kent/src/hg/lib/ensPhusionBlast.sql \ > ensRatMusHg17.sql hgLoadBed mm5 ensRatMusHg17 ucsc100k.bed -sqlTable=ensRatMusHg17.sql # featureBits mm5 ensRatMusHg17 # 2366463967 bases of 2615483787 (90.479%) in intersection # featureBits mm4 syntenyHg16 # 2299774191 bases of 2627444668 (87.529%) in intersection # MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-07-13 - Hiram) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtNet mkdir -p ../axtTight bash # for tcsh users for I in *.axt do echo "axtNet/$I -> ../axtTight/$I" subsetAxt $I ../axtTight/$I \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 done # exit bash if you are tcsh # An 8 minute job # translate to psl cd ../axtTight mkdir ../pslTight bash # for tcsh users for I in *.axt do C=${I/.axt/} axtToPsl $I ../S1.len ../S2.len ../pslTight/${C}_blastzTightHg17.psl echo "Done: $I -> ${C}_blastzTightHg17.psl" done # exit bash if you are tcsh # Load tables into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/pslTight for I in chr*TightHg17.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done ${I}" done # Compare results with previous assembly: # featureBits mm5 blastzTightHg17 # 168148800 bases of 2615483787 (6.429%) in intersection # featureBits mm4 blastzTightHg16 # 170163839 bases of 2627444668 (6.476%) in intersection # copy axt's to download area ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtTight mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # 4 minute gzip #### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-07-13 - Fan) # PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS BUILT USING ENSMART DATA OF MOUSE BUILD 32. # THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER. # WHEN ENSEMBL FINISHES THEIR MOUSE BUILD 33 RELEASE, WE NEED TO REBUILD THIS # TABLE. # Get the ensembl gene/protein cross-reference data from # http://www.ensembl.org/Multi/martview?species=Mus_musculus # Follow this sequence through the pages: # Page 1) Make sure that the Mus musculus choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Feature" box, select Ensembl gene, transcript, and peptid IDs, SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC # Page 4) Choose "Text, tab separated". choose gzip compression. hit export. # Save as ensXref sed ensXref.tsv -e 's/\./\t/g' > ensemblXref3.tab hgsql mm5 -e "drop table ensemblXref3" hgsql mm5 < ~/src/hg/lib/ensemblXref3.sql hgsql mm5 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines' # CPGISLANDS (DONE - 2004-07-13 - Fan) ssh hgwdev mkdir -p /cluster/data/mm5/bed/cpgIsland cd /cluster/data/mm5/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe mv cpglh.exe /cluster/data/mm5/bed/cpgIsland/ # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
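# (Optional pre-flight check, not in the original log: make sure a hard-masked
# .fa.masked file exists for every chromosome-level .fa before running
# cpglh.exe over ../../*/chr*.fa.masked.)
ls /cluster/data/mm5/*/chr*.fa | wc -l
ls /cluster/data/mm5/*/chr*.fa.masked | wc -l
# The two counts should be identical; a missing .fa.masked means
# makeHardMasked.csh needs to be rerun for that chromosome.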
ssh kksilo cd /cluster/data/mm5/bed/cpgIsland foreach f (../../*/chr*.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpglh.exe $f > $fout end # the warnings: # Bad char 0x52 = 'R' at line 117472, base 5873535, sequence chr14 # Bad char 0x53 = 'S' at line 120651, base 6032462, sequence chr14 # Bad char 0x53 = 'S' at line 120652, base 6032546, sequence chr14 # real 21m47.823s # user 18m30.810s # sys 1m13.420s # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed ssh hgwdev cd /cluster/data/mm5/bed/cpgIsland hgLoadBed mm5 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 16238 elements of size 10 # Sorted # Saving bed.tab # Loading mm5 # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 2004-07-14 Fan) ssh kksilo cd /cluster/data/mm5 # Build the .zip files cp /cluster/data/rn3/jkStuff/zipAll.sh jkStuff # edit this zipAll.sh to produce output to /cluster/data/mm5/bigZips jkStuff/zipAll.sh > zipAll.log # bash: ./jkStuff/zipAll.sh > zipAll.log 2>&1 & tail -f zipAll.log mkdir zip mv *.zip zip cd zip # Look at zipAll.log to make sure all file lists look reasonable. # Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test # 46 chromAgp.zip.test # 45 chromFa.zip.test # 45 chromFaMasked.zip.test # 45 chromOut.zip.test # 45 chromTrf.zip.test # 641 contigAgp.zip.test # 641 contigFa.zip.test # 641 contigFaMasked.zip.test # 641 contigOut.zip.test # 641 contigTrf.zip.test #3431 total ssh hgwdev cd /cluster/data/mm5/jkStuff # create generic copy program cat << '_EOF_' > cpToWeb.sh #!/bin/sh if [ $# -ne 1 ]; then echo "usage: cpToWeb.sh " echo -e "\texample: cpToWeb.sh mm5" exit 255 fi GP=/usr/local/apache/htdocs/goldenPath/$1 mkdir -p ${GP} mkdir -p ${GP}/chromosomes for f in ../?/*.fa ../??/*.fa do BN=`basename ${f}` zip -j ${GP}/chromosomes/${BN}.zip ${f} echo "zipped: ${BN}" done mkdir -p ${GP}/bigZips for Z in *.zip do cp -p ${Z} ${GP}/bigZips echo "copied: ${Z}" done '_EOF_' # << this line keeps emacs coloring happy chmod +x cpToWeb.sh cd /cluster/data/mm5/zip ../jkStuff/cpToWeb.sh mm5 cd /usr/local/apache/htdocs/goldenPath/mm5 # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Make the upstream sequence files. # NOTE: must be redone due to bad gap track cd bigZips featureBits mm5 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits mm5 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits mm5 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa # mrna zips -- auto dump process takes care of this # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 7/15/04 angie) # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific for human-chicken blastz. Do the same for mouse. # Scripts expect *.out.spec filenames, so set that up: ssh kkr1u00 cd /cluster/data/mm5 mkdir /iscratch/i/mus/mm5/linSpecRep.notInChicken foreach f (/iscratch/i/mus/mm5/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mus/mm5/linSpecRep.notInChicken/$f:t:r:r.out.spec end iSync # Use these the next time we run human-chicken blastz. 
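# (Optional sanity check, not in the original log: every chrN.fa.out under the
# iserver rmsk directory should now have a matching chrN.out.spec copy.)
ls /iscratch/i/mus/mm5/rmsk/chr*.fa.out | wc -l
ls /iscratch/i/mus/mm5/linSpecRep.notInChicken/chr*.out.spec | wc -l
# The counts should match before the chicken blastz run below is started.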
# BLASTZ CHICKEN (GALGAL2) (DONE 7/19/04 angie) ssh kk mkdir /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 ln -s blastz.galGal2.2004-07-15 /cluster/data/mm5/bed/blastz.galGal2 cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 # Use human-chicken params: set L=10000 (higher threshold on blastz's # outer loop) and abridge repeats. cat << '_EOF_' > DEF # mouse vs. chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse SEQ1_DIR=/scratch/mus/mm5/softNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInChicken SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken SEQ2_DIR=/iscratch/i/galGal2/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.galGal2.2004-07-15 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # first cluster run: raw blastz alignments ssh kk bash # if a csh/tcsh user cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 source DEF mkdir $RAW run.0 /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j sh ./xdir.sh cd run.0 sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList para create jobList para try, check, push, check, .... #Completed: 51491 of 51491 jobs #Average job time: 357s 5.95m 0.10h 0.00d #Longest job: 1015s 16.92m 0.28h 0.01d #Submission to last job: 89841s 1497.35m 24.96h 1.04d # second cluster run: lift raw alignments -> lav dir ssh kki bash # if a csh/tcsh user cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 source DEF mkdir run.1 lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList para create jobList para try, check, push, etc ... #Completed: 341 of 341 jobs #Average job time: 11s 0.18m 0.00h 0.00d #Longest job: 55s 0.92m 0.02h 0.00d #Submission to last job: 245s 4.08m 0.07h 0.00d # third run: lav -> axt # NOTE: use axtRescore here because we used a non-default BLASTZ_Q matrix # and abridged repeats (Penn State's restore_rpts program rescores with # default matrix, oops). 
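# (The per-chromosome do.csh created below implements that: it concatenates
# the lav files, pipes them through lavToAxt, rescores with
# axtRescore -scoreScheme=HoxD55.q, sorts into axtChrom/ with axtSort, and
# writes the matching psl with axtToPsl.)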
ssh kki cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t set path = (/cluster/bin/x86_64 $path) cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin \ /iscratch/i/mus/mm5/softNib /iscratch/i/galGal2/nib stdout \ | axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q stdin stdout \ | axtSort stdin ../../axtChrom/$chr.axt axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 43 of 43 jobs #Average job time: 38s 0.63m 0.01h 0.00d #Longest job: 160s 2.67m 0.04h 0.00d #Submission to last job: 233s 3.88m 0.06h 0.00d # CHAIN CHICKEN BLASTZ (DONE 7/19/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \ -minScore=5000 $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/galGal2/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 43 of 43 jobs #Average job time: 60s 1.00m 0.02h 0.00d #Longest job: 355s 5.92m 0.10h 0.00d #Submission to last job: 355s 5.92m 0.10h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain mm5 ${c}_chainGalGal2 $i end featureBits mm5 chainGalGal2Link #78951466 bases of 2615483787 (3.019%) in intersection featureBits hg17 chainGalGal2Link #103882699 bases of 2866216770 (3.624%) in intersection # NET CHICKEN BLASTZ (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain netClass -noAr noClass.net mm5 galGal2 chicken.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn chicken.net > chickenSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain netFilter -minGap=10 chicken.net | hgLoadNet mm5 netGalGal2 stdin netFilter -minGap=10 chickenSyn.net | hgLoadNet mm5 syntenyNetGalGal2 stdin # Add entries for chainGalGal2, netGalGal2, 
syntenyNetGalGal2 to # mouse/mm5 trackDb # GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain netSplit chicken.net net cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/mixedNib \ /cluster/data/galGal2/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/mm5/chrom.sizes /cluster/data/galGal2/chrom.sizes \ $maf -tPrefix=mm5. -qPrefix=galGal2. end # XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk) # see makeXenTro1.doc and search for zb.mm5 # The results of this are also symlinked under mm5/bed # MAKE VSGALGAL2 DOWNLOADABLES (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 gzip axtNet/*.axt cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain ln all.chain chicken.chain zip /cluster/data/mm5/zip/chicken.chain.zip chicken.chain rm chicken.chain zip /cluster/data/mm5/zip/chicken.net.zip chicken.net zip /cluster/data/mm5/zip/chickenSyn.net.zip chickenSyn.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsGalGal2 cd /usr/local/apache/htdocs/goldenPath/mm5/vsGalGal2 mv /cluster/data/mm5/zip/chicken*.zip . cp -pR /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtNet . md5sum *.zip axtNet/* > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 7/15/04 angie) ssh kkr1u00 cd /cluster/bluearc/scratch/mus/mm5/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. # Even though we already have the mouse-human linSpecReps, # extractLinSpecReps requires two columns of DateRepsinRMoutput.pl # additions. So add human, then ignore it. # Dog in extra column 1, Human in extra column 2 foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \ ${outfl} -query mouse -comp dog -comp human end # Now extract dog (extra column 1), ignore human. cd /iscratch/i/mus/mm5 mkdir linSpecRep.notInDog foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/*.out_dog_hum) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInDog/$base.out.spec end # Clean up. rm /cluster/bluearc/scratch/mus/mm5/rmsk/*.out_dog_hum iSync # BLASTZ DOG (CANFAM1) (DONE 7/16/04 angie) ssh kk mkdir /cluster/data/mm5/bed/blastz.canFam1.2004-07-15 ln -s blastz.canFam1.2004-07-15 /cluster/data/mm5/bed/blastz.canFam1 cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15 # Use default (Human-Mouse) settings for starters. cat << '_EOF_' > DEF # mouse vs. 
dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.canFam1.2004-07-15
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
# cluster was mobbed...
#Completed: 93775 of 93775 jobs
#Average job time: 187s 3.11m 0.05h 0.00d
#Longest job: 3907s 65.12m 1.09h 0.05d
#Submission to last job: 76763s 1279.38m 21.32h 0.89d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time: 98s 1.63m 0.03h 0.00d
#Longest job: 281s 4.68m 0.08h 0.00d
#Submission to last job: 2102s 35.03m 0.58h 0.02d
# third run: lav -> axt
# (if non-default BLASTZ_Q is used in the future, put axtRescore in
# the pipe after lavToAxt)
ssh kki
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
  /iscratch/i/mus/mm5/softNib /iscratch/i/canFam1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
  echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 43 of 43 jobs
#Average job time: 671s 11.18m 0.19h 0.01d
#Longest job: 2398s 39.97m 0.67h 0.03d
#Submission to last job: 2417s 40.28m 0.67h 0.03d
# CHAIN DOG BLASTZ (DONE 7/16/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChrom/*.axt \
  > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
  /iscratch/i/mus/mm5/softNib \
  /iscratch/i/canFam1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 43 of 43 jobs #Average job time: 537s 8.96m 0.15h 0.01d #Longest job: 2071s 34.52m 0.58h 0.02d #Submission to last job: 2071s 34.52m 0.58h 0.02d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Lots of chaff with scores in the 3000's. Many very-high-scoring # chains. So filter the chain down somewhat... mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainCanFam1 $i end # mouse-dog gets significantly less coverage than human-dog: featureBits mm5 -chrom=chr1 chainCanFam1Link #63386139 bases of 185739816 (34.126%) in intersection featureBits hg17 -chrom=chr1 chainCanFam1Link #123999291 bases of 222827847 (55.648%) in intersection # mouse-dog isn't a whole lot less than mouse-human though: featureBits mm5 -chrom=chr1 chainHg17Link #75492250 bases of 185739816 (40.644%) in intersection # NET DOG BLASTZ (DONE 7/16/04 angie) ssh kolossus cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain netClass -noAr noClass.net mm5 canFam1 dog.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn dog.net > dogSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain netFilter -minGap=10 dog.net | hgLoadNet mm5 netCanFam1 stdin netFilter -minGap=10 dogSyn.net | hgLoadNet mm5 syntenyNetCanFam1 stdin # Add entries for chainCanFam1, netCanFam1 to mouse/mm5 trackDb # MAKE VSCANFAM1 DOWNLOADABLES (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15 gzip axtNet/chr*.axt cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain ln all.chain dog.chain zip /cluster/data/mm5/zip/dog.chain.zip dog.chain rm dog.chain zip /cluster/data/mm5/zip/dog.net.zip dog.net zip /cluster/data/mm5/zip/dogSyn.net.zip dogSyn.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1 cd /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1 mv /cluster/data/mm5/zip/dog*.zip . cp -pR /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtNet . md5sum *.zip axtNet/* > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
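# (Editor's note, not a command from the original 2004 build: before pushing the
# vsCanFam1 downloads it can be handy to re-verify the copied files against the
# checksum file just written; "md5sum -c" is standard GNU coreutils and assumed
# to be available on hgwdev.)
cd /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1
md5sum -c md5sum.txt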
# GENERATE CANFAM1 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain netSplit dog.net net cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/nib \ /cluster/data/canFam1/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/mm5/chrom.sizes /cluster/data/canFam1/chrom.sizes \ $maf -tPrefix=mm5. -qPrefix=canFam1. end ### MAKE THE affyU74 TRACK - needed for the Gene Sorter # (DONE - 2004-07-16 - Fan) # MAKE THE affyU74 TRACK using Affy consensus sequences instead of # target sequences. Recalculate alignments and load data ---------------------------------- # Load up semi-local disk with target sequences for Affy mouse U74 chips. ssh kkr1u00 mkdir -p /iscratch/i/affy # This /projects filesystem is not available on kkr1u00 # but it is on kk ssh kk cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy ssh kkr1u00 iSync # Run cluster job to do alignments ssh kk mkdir /cluster/data/mm5/bed/affyU74.2004-07-16 cd /cluster/data/mm5/bed/affyU74.2004-07-16 mkdir run cd run mkdir psl echo /scratch/mus/mm5/maskedContigs/*.fa | wordLine stdin > genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst affy.lst gsub jobList para create jobList para try # do usual para check/para push etc. until the job is done. # Completed: 1917 of 1917 jobs # CPU time in finished jobs: 14240s 237.34m 3.96h 0.16d 0.000 y # IO & Wait Time: 7946s 132.43m 2.21h 0.09d 0.000 y # Average job time: 12s 0.19m 0.00h 0.00d # Longest job: 40s 0.67m 0.01h 0.00d # Submission to last job: 307s 5.12m 0.09h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. ssh kksilo cd /cluster/data/mm5/bed/affyU74.2004-07-16/run pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null # Processed 44630 alignments liftUp ../all_affyU74.psl ../../../jkStuff/liftAll.lft warn contig.psl # Sort by chromosome and load into database. ssh hgwdev cd /cluster/data/mm5/bed/affyU74.2004-07-16 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadPsl mm5 affyU74.psl rm -fr chrom temp run ## MAKE THE affyGnfU74 TRACKs (DONE - 2004-07-18 - Fan) # Make bed files and load consensus sequences for Affy U74 chip set. # Fix broken symlinks to microarray data after directory structure changed # (DONE, 2005-05-03, hartera) ---------------------------------- #This needs to be done after affyU74 is already made. 
ssh hgwdev mkdir -p /cluster/data/mm5/bed/affyGnf.2004-07-16 cd /cluster/data/mm5/bed/affyGnf.2004-07-16 # may need to build this command in src/hg/affyGnf affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadBed mm5 affyGnfU74A affyGnfU74A.bed hgLoadBed mm5 affyGnfU74B affyGnfU74B.bed hgLoadBed mm5 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. # Copy consensus sequence to /gbdb if it isn't already # [THE SYM LINKS WERE ALREADY DONE.] mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4. # reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm5 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm5 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm5 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa ### GNF ATLAS 2 [DONE Fan 7/18/2004] # Align probes from GNF1M chip. ssh kk cd /cluster/data/mm5/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run mkdir -p /cluster/bluearc/geneAtlas2 cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2 ls -1 /scratch/mus/mm5/maskedContigs/ > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm5/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try para check para push para time # Completed: 639 of 639 jobs # CPU time in finished jobs: 58174s 969.57m 16.16h 0.67d 0.002 y # IO & Wait Time: 4833s 80.55m 1.34h 0.06d 0.000 y # Average job time: 99s 1.64m 0.03h 0.00d # Longest job: 189s 3.15m 0.05h 0.00d # Submission to last job: 1749s 29.15m 0.49h 0.02d # Do sort, best in genome filter, and convert to chromosome coordinates # to create gnf1h.psl. pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp ../affyGnf1m.psl ../../../jkStuff/liftAll.lft warn contig.psl rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1H into database. 
ssh hgwdev
cd /cluster/data/mm5/bed/geneAtlas2
ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
hgLoadPsl mm5 affyGnf1m.psl
hgLoadSeq mm5 /gbdb/hgFixed/affyProbes/gnf1m.fa
# Load up track
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
  affyGnf1m.psl
# Note that the unmapped 5000 records are from all-N sequences.
hgLoadBed mm5 gnfAtlas2 gnfAtlas2.bed
# MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2004-07-19, Fan)
mkdir -p /projects/compbio/data/microarray/affyMouse
# Download MOE430A and MOE430B consensus sequences from Affymetrix web site
# http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
unzip MOE430*_consensus.zip
# check for duplicate probes: there are none, all have unique names
# remove "consensus:" and ";" from FASTA headers to shorten probeset
# names for database
sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
  /cluster/bluearc/affy/
# THE ABOVE WAS ALREADY DONE BY RACHEL 4/16/04.
# Set up cluster job to align MOE430 consensus sequences to mm5
ssh kkr1u00
cd /cluster/data/mm5/bed
mkdir -p affyMOE430
cd affyMOE430
mkdir -p /iscratch/i/affy
cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/mm5/bed/affyMOE430
ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
ls -1 /scratch/mus/mm5/maskedContigs/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm5/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# para time
# Completed: 639 of 639 jobs
# CPU time in finished jobs: 24369s 406.14m 6.77h 0.28d 0.001 y
# IO & Wait Time: 2263s 37.72m 0.63h 0.03d 0.000 y
# Average job time: 42s 0.69m 0.01h 0.00d
# Longest job: 63s 1.05m 0.02h 0.00d
# Submission to last job: 671s 11.18m 0.19h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyMOE430.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyMOE430.psl ../../jkStuff/liftAll.lft warn contig.psl
# Load alignments and sequences into database
ssh hgwdev
cd /cluster/data/mm5/bed/affyMOE430
# shorten names in psl file
sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
mv affyMOE430.psl.bak affyMOE430.psl
# load track into database
hgLoadPsl mm5 affyMOE430.psl
# 1 warning on loading: Blat error so that 1449824_at has a
# negative entry (-195) in the qBaseInsert field.
# Loading into the database forces this to 0.
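# (Editor's hedged sketch, not part of the original run: a quick way to spot the
# negative qBaseInsert entries mentioned above before loading.  In a PSL line,
# field 6 is qBaseInsert and field 10 is qName; matching a leading "-" avoids
# tripping over any psLayout header lines.)
awk '$6 ~ /^-/ {print $10, $6}' affyMOE430.psl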
# Add consensus sequences for MOE430 # Copy sequences to gbdb is they are not there already mkdir -p /gbdb/hgFixed/affyProbes ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ /gbdb/hgFixed/affyProbes hgLoadSeq -abbr=MOE430 mm5 /gbdb/hgFixed/affyProbes/MOE430_all.fa # Clean up rm batch.bak contig.psl raw.psl # BELOW TWO THINGS WERE DONE BY RACHEL ALREDAY FOR MM4 # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/ # add affyMOE430.html file and then do make alpha to add to trackDb table ######## MAKING GENE SORTER TABLES ####### (STARTED - 2004-07-15 - Hiram) # These are instructions for building the # Gene Sorter. Don't start these until # there is a knownGene track. and the affy tracks # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev cd /tmp hgClusterGenes mm5 knownGene knownIsoforms knownCanonical # You may need to build this binary in src/hg/near/hgClusterGenes # Got 24603 clusters, from 41208 genes in 43 chromosomes # featureBits mm5 knownCanonical # 853516995 bases of 2615483787 (32.633%) in intersection # featureBits mm4 knownCanonical # 840021165 bases of 2627444668 (31.971%) in intersection # featureBits mm3 knownCanonical # 825943052 bases of 2505900260 (32.960%) in intersection # ! ! ! Can not do featureBits on knownIsoforms # Extract peptides from knownGenes into fasta file # and create a blast database out of them. ssh hgwdev mkdir -p /cluster/data/mm5/bed/geneSorter/blastp cd /cluster/data/mm5/bed/geneSorter/blastp pepPredToFa mm5 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /cluster/bluearc/blast229/formatdb -i known.faa -t known -n known # Copy over database to bluearc scratch mkdir /cluster/bluearc/scratch/mus/mm5/blastp cp -p /cluster/data/mm5/bed/geneSorter/blastp/known.* \ /cluster/bluearc/scratch/mus/mm5/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm5/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm5/bed/geneSorter/blastp/self cd /cluster/data/mm5/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 120685s 2011.42m 33.52h 1.40d 0.004 y # IO & Wait Time: 22722s 378.69m 6.31h 0.26d 0.001 y # Average job time: 19s 0.31m 0.01h 0.00d # Longest job: 147s 2.45m 0.04h 0.00d # Submission to last job: 705s 11.75m 0.20h 0.01d # Load into database. This takes about an hour. ssh hgwdev cd /cluster/data/mm5/bed/geneSorter/blastp/self/run/out hgLoadBlastTab mm5 knownBlastTab *.tab # Scanning through 7739 files # Loading database with 8017562 rows # real 17m9.104s # user 3m8.980s # sys 0m28.800s # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) 
# DONE (04-07-18 Fan) hgMapToGene mm5 affyGnf1m knownGene knownToGnf1m hgExpDistance mm5 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnf1m # Create table that maps between known genes and RefSeq hgMapToGene mm5 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene mm5 affyU74 knownGene knownToU74 hgMapToGene mm5 affyMOE430 knownGene knownToMOE430 hgMapToGene mm5 affyMOE430 -prefix=A: knownGene knownToMOE430A # Format and load Rinn et al sex expression data mkdir /cluster/data/mm5/bed/rinnSex cd !$ hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \ ../affyMOE430/affyMOE430.psl hgLoadBed mm5 rinnSex rinnSex.bed # Format and load the GNF data mkdir /cluster/data/mm5/bed/affyGnf95 cd /cluster/data/mm5/bed/affyGnf95 affyPslAndAtlasToBed -newType ../affyU95.psl \ /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm5 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. These commands take about 15 minutes each # The affyGnfU74?Exps arguments appear to be unused in hgExpDistance hgExpDistance mm5 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \ -lookup=knownToU74 # Got 13593 unique elements in affyGnfU74A hgExpDistance mm5 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \ -lookup=knownToU74 # Got 8512 unique elements in affyGnfU74B hgExpDistance mm5 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \ -lookup=knownToU74 # Got 2318 unique elements in affyGnfU74C # C.ELEGANS BLASTP FOR GENE SORTER (DONE 7/20/04 Fan) # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to iscratch/i # if it doesn't exist already: ssh eieio mkdir /cluster/data/ce2/bed/blastp cd /cluster/data/ce2/bed/blastp # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/ # to find out the latest version. Then use that in place of 128 below. wget -O wormPep128.faa \ ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep128/wormpep128 formatdb -i wormPep128.faa -t wormPep128 -n wormPep128 ssh kkr1u00 if (-e /iscratch/i/ce2/blastp) then rm -r /iscratch/i/ce2/blastp endif mkdir -p /iscratch/i/ce2/blastp cp /cluster/data/ce2/bed/blastp/wormPep128.p?? /iscratch/i/ce2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm5/bed/blastp/ce2/run/out cd /cluster/data/mm5/bed/blastp/ce2/run # Make blast script cat > blastSome < gsub <split.lst #ls -1S ../../split/*.fa > split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 54871s 914.51m 15.24h 0.64d 0.002 y # IO & Wait Time: 26157s 435.95m 7.27h 0.30d 0.001 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest job: 41s 0.68m 0.01h 0.00d # Submission to last job: 210s 3.50m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm5/bed/blastp/ce2/run/out hgLoadBlastTab mm5 ceBlastTab -maxPer=1 *.tab # HUMAN BLASTP FOR GENE SORTER (DONE 7/20/04 Fan) # Make human ortholog column using blastp on human known genes. 
# First make human protein database and copy it to iscratch/i
# if it doesn't exist already:
mkdir /cluster/data/hg17/bed/blastp
cd /cluster/data/hg17/bed/blastp
pepPredToFa hg17 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/hg17/blastp) then
  rm -r /iscratch/i/hg17/blastp
endif
mkdir -p /iscratch/i/hg17/blastp
cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/hg17/run/out
cd /cluster/data/mm5/bed/blastp/hg17/run
# Make blast script
cat > blastSome < gsub <split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 125830s 2097.17m 34.95h 1.46d 0.004 y
# IO & Wait Time: 22740s 379.00m 6.32h 0.26d 0.001 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest job: 137s 2.28m 0.04h 0.00d
# Submission to last job: 301s 5.02m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/hg17/run/out
hgLoadBlastTab mm5 hgBlastTab -maxPer=1 *.tab
# ZEBRAFISH BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already:
ssh kkstore
mkdir /cluster/data/danRer1/bed/blastp
cd /cluster/data/danRer1/bed/blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH3.may.pep.fa.gz
zcat Dan*.pep.fa.gz > ensembl.faa
formatdb -i ensembl.faa -t ensembl -n ensembl
ssh kkr1u00
if (-e /iscratch/i/danRer1/blastp) then
  rm -r /iscratch/i/danRer1/blastp
endif
mkdir -p /iscratch/i/danRer1/blastp
cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp
iSync
# THE ABOVE IS ALREADY DONE BY ANGIE
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/danRer1/run/out
cd /cluster/data/mm5/bed/blastp/danRer1/run
# Make blast script
cat > blastSome < gsub <split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 96773s 1612.89m 26.88h 1.12d 0.003 y
# IO & Wait Time: 29356s 489.26m 8.15h 0.34d 0.001 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 73s 1.22m 0.02h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/danRer1/run/out
hgLoadBlastTab mm5 drBlastTab -maxPer=1 *.tab
# YEAST BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on
# RefSeq. First make protein database and copy it to iscratch/i
# if it doesn't exist already:
mkdir /cluster/data/sacCer1/bed/blastp
cd /cluster/data/sacCer1/bed/blastp
wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
zcat orf_trans.fasta.gz > sgdPep.faa
formatdb -i sgdPep.faa -t sgdPep -n sgdPep
#ABOVE WAS ALREADY DONE BY JIM
ssh kkr1u00
# Note: sacCer1 is a name conflict with SARS coronavirus... oh well,
# fortunately we won't be looking for homologs there. :)
if (-e /iscratch/i/sacCer1/blastp) then
  rm -r /iscratch/i/sacCer1/blastp
endif
mkdir -p /iscratch/i/sacCer1/blastp
cp /cluster/data/sacCer1/bed/blastp/sgdPep.p??
/iscratch/i/sacCer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm5/bed/blastp/sacCer1/run/out cd /cluster/data/mm5/bed/blastp/sacCer1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 16348s 272.46m 4.54h 0.19d 0.001 y # IO & Wait Time: 23063s 384.39m 6.41h 0.27d 0.001 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest job: 14s 0.23m 0.00h 0.00d # Submission to last job: 203s 3.38m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm5/bed/blastp/sacCer1/run/out hgLoadBlastTab mm5 scBlastTab -maxPer=1 *.tab # DM1 BLASTP FOR GENE SORTER (DONE 7/20/04 Fan) # Make Drosophila melanagaster ortholog column using blastp on FlyBase. # First make protein database and copy it to iscratch/i # if it doesn't exist already: # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data ssh kkr1u00 if (-e /iscratch/i/dm1/blastp) then rm -r /iscratch/i/dm1/blastp endif mkdir -p /iscratch/i/dm1/blastp cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp iSync # THE ABOVE IS ALREADY DONE BY ANGIE # Make parasol run directory ssh kk mkdir -p /cluster/data/mm5/bed/blastp/dm1/run/out cd /cluster/data/mm5/bed/blastp/dm1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 64033s 1067.22m 17.79h 0.74d 0.002 y # IO & Wait Time: 20868s 347.79m 5.80h 0.24d 0.001 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest job: 45s 0.75m 0.01h 0.00d # Submission to last job: 351s 5.85m 0.10h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm5/bed/blastp/dm1/run/out hgLoadBlastTab mm5 dmBlastTab -maxPer=1 *.tab # Create table that maps between known genes and LocusLink (DONE 7/20/04 Fan) hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm5 \ > refToLl.txt hgMapToGene mm5 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 30303 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm5 knownGene name proteinID Pfam knownToPfam # row count is 29069 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm5 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # Create table that maps between known genes and genePix database (DONE 3/15/05 JK) knownToGenePix mm5 # ENABLE GENE SORTER FOR mm5 IN HGCENTRALTEST (DONE 7/20/04 Fan) echo "update dbDb set hgNearOk = 1 where name = 'mm5';" \ | hgsql -h genome-testdb hgcentraltest # RAT BLASTP FOR GENE SORTER (DONE 4/20/05 Fan) # Make RAT ortholog column using blastp on RAT known genes. # First make RAT protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/rn3/bed/blastp cd /cluster/data/rn3/bed/blastp pepPredToFa rn3 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/rn3/blastp) then rm -r /iscratch/i/rn3/blastp endif mkdir -p /iscratch/i/rn3/blastp cp /cluster/data/rn3/bed/blastp/known.p?? 
/iscratch/i/rn3/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/rn3/run/out
cd /cluster/data/mm5/bed/blastp/rn3/run
# Make blast script
cat > blastSome < gsub <split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
Completed: 7739 of 7739 jobs
CPU time in finished jobs: 24369s 406.14m 6.77h 0.28d 0.001 y
IO & Wait Time: 21867s 364.46m 6.07h 0.25d 0.001 y
Average job time: 6s 0.10m 0.00h 0.00d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 25s 0.42m 0.01h 0.00d
Submission to last job: 276s 4.60m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/rn3/run/out
hgLoadBlastTab mm5 rnBlastTab -maxPer=1 *.tab
# END OF GENE SORTER STUFF
#############################################################################
# BLASTZ RAT RN3 (DONE - 2004-07-15 - Fan)
# NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-30. Fan.
ssh kk
mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-07-14
cd /cluster/data/mm5/bed
ln -s blastz.rn3.2004-07-14 blastz.rn3
cd blastz.rn3
cat << '_EOF_' > DEF
# rat vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.rn3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/mm5/bed/blastz.rn3
bash
source ./DEF
# script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
cp /cluster/data/hg17/jkStuff/BlastZ_run0.sh \
  /cluster/data/mm5/jkStuff/BlastZ_run0.sh
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
Completed: 41943 of 41943 jobs
CPU time in finished jobs: 16854319s 280905.31m 4681.76h 195.07d 0.534 y
IO & Wait Time: 448464s 7474.41m 124.57h 5.19d 0.014 y
Average job time: 413s 6.88m 0.11h 0.00d
Longest job: 9358s 155.97m 2.60h 0.11d
Submission to last job: 73416s 1223.60m 20.39h 0.85d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/mm5/bed/blastz.rn3
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh \
  /cluster/data/mm5/jkStuff/BlastZ_run1.sh
vi /cluster/data/mm5/jkStuff/BlastZ_run1.sh
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs # CPU time in finished jobs: 7859s 130.98m 2.18h 0.09d 0.000 y # IO & Wait Time: 104771s 1746.19m 29.10h 1.21d 0.003 y # Average job time: 330s 5.50m 0.09h 0.00d # Longest job: 1625s 27.08m 0.45h 0.02d # Submission to last job: 8535s 142.25m 2.37h 0.10d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/mm5/bed/blastz.rn3 bash source ./DEF # The copy of this in mm4 was broken, use the hg17 one instead cp /cluster/data/hg17/jkStuff/BlastZ_run2.sh \ /cluster/data/mm5/jkStuff/BlastZ_run2.sh # vi /cluster/data/mm5/jkStuff/BlastZ_run2.sh /cluster/data/mm5/jkStuff/BlastZ_run2.sh cd run.2 #edited gsub to change /scratch/mus/mm5 to /cluster/bluearc/scratch/mus/mm5 # and recreated jobList by: gensub2 chrom.list single gsub jobList para create jobList para try, check, push, etc ... # Completed: 42 of 43 jobs # Crashed: 1 jobs # CPU time in finished jobs: 2050s 34.17m 0.57h 0.02d 0.000 y # IO & Wait Time: 143135s 2385.58m 39.76h 1.66d 0.005 y # Average job time: 3457s 57.61m 0.96h 0.04d # Longest job: 14460s 241.00m 4.02h 0.17d # Submission to last job: 14849s 247.48m 4.12h 0.17d # Note: previous numbers were: # Completed: 46 of 46 jobs # CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y # IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y # Average job time: 168s 2.79m 0.05h 0.00d # Longest job: 642s 10.70m 0.18h 0.01d # Submission to last job: 642s 10.70m 0.18h 0.01d # probably due to data on bluearc instead of on kki nodes. # One job failed consistently because short of memory error # went to kkr4u00 to run the following job: # Per Angie's advice, created /cluster/bin/scripts/blastz-chromlav2axtLargeMem # by from /cluster/bin/scripts/blastz-chromlav2axt and changed /cluster/bin/i386 # to /cluster/bin/x86_64 and then ran: /cluster/bin/scripts/blastz-chromlav2axtLargeMem /cluster/data/mm5/bed/blastz.rn3/lav/chr2 /cluster/data/mm5/bed/blastz.rn3/axtChrom/chr2.axt /cluster/bluearc/scratch/mus/mm5/softNib /iscratch/i/rn3/bothMaskedNibs # It worked! # translate sorted axt files into psl ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3 mkdir pslChrom set tbl = "blastzRn3" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 2 hours # Load database tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/pslChrom bash for I in *.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done: ${I}" done # Check results # featureBits hg16 blastzRn3 # 1013603401 bases of 2865248791 (35.376%) in intersection # featureBits mm5 blastzRn3 ran out of memory. # So check a few specific chromosomes # featureBits mm5 blastzRn3 -chrom=chr17 # 61029084 bases of 86658738 (70.425%) in intersection # featureBits mm4 blastzRn3 -chrom=chr17 # 62824556 bases of 89616841 (70.104%) in intersection # featureBits mm5 blastzRn3 -chrom=chr18 # 61442155 bases of 86685738 (70.879%) in intersection # featureBits mm4 blastzRn3 -chrom=chr18 # 57158006 bases of 81388777 (70.228%) in intersection # CHAIN RN3 BLASTZ (DONE - 2004-07-22 - Fan) # NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-30. Fan. 
# The axtChain is best run on the small kluster, or the kk9 kluster ssh kki mkdir -p /cluster/data/mm5/bed/blastz.rn3/axtChain/run1 cd /cluster/data/mm5/bed/blastz.rn3/axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.rn3/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/rn3/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 46 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... # Completed: 43 of 43 jobs # CPU time in finished jobs: 18318s 305.30m 5.09h 0.21d 0.001 y # IO & Wait Time: 41906s 698.44m 11.64h 0.49d 0.001 y # Average job time: 1401s 23.34m 0.39h 0.02d # Longest job: 5598s 93.30m 1.55h 0.06d # Submission to last job: 5635s 93.92m 1.57h 0.07d # now on the file server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain time chainMergeSort run1/chain/*.chain > all.chain & # real 26m14.694s # user 16m16.190s # sys 2m19.520s time chainSplit chain all.chain & # real 26m29.801s # user 15m40.780s # sys 2m40.610s # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainRn3 $i echo done $c end # featureBits mm4 chainRn3Link -chrom=chr16 # 67474802 bases of 95076222 (70.969%) in intersection # featureBits mm5 chainRn3Link -chrom=chr16 # 66703715 bases of 92679592 (71.972%) in intersection # featureBits mm4 chainRn3Link -chrom=chr17 # 61932430 bases of 89616841 (69.108%) in intersection # featureBits mm5 chainRn3Link -chrom=chr17 # 60676019 bases of 86658738 (70.017%) in intersection # NET RN3 (DONE - 2004-07-23 - Fan) # NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-31. Fan. ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/mm5/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/mm5/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 1850904576, utime 9294 s/100, stime 2079 # The netClass operations requires an "ancientRepeat" table to exist # in either mm5 or rn3. So, create the table: ssh hgwdev mkdir -p /cluster/data/mm5/bed/ancientRepeat cd /cluster/data/mm5/bed/ancientRepeat # mysqldump needs write permission to this directory # and you need to use your read/write enabled user with password chmod 777 . hgsqldump --all --tab=. mm4 ancientRepeat chmod 775 . hgsql mm5 < ancientRepeat.sql mysqlimport -u -p mm5 ancientRepeat.txt # This is a hand curated table obtained from Arian. 
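# (Editor's hedged sketch, not part of the original run: netClass only needs the
# ancientRepeat table to exist and be non-empty in mm5 or rn3, so a quick row
# count after the import above is a cheap sanity check.)
hgsql mm5 -e 'select count(*) from ancientRepeat;'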
ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/axtChain time netClass hNoClass.net mm5 rn3 rat.net \ -tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat \ -qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse # 508.060u 89.340s 12:10.36 81.7% 0+0k 0+0io 201pf+0w # If things look good do ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn rat.net > ratSyn.net # real 5m5.494s # user 3m52.710s # sys 0m32.670s # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/axtChain netFilter -minGap=10 rat.net | hgLoadNet mm5 netRn3 stdin netFilter -minGap=10 ratSyn.net | hgLoadNet mm5 syntenyNetRn3 stdin # real 8m50.781s # user 4m59.660s # sys 0m52.840s # check results # featureBits mm4 netRn3 # 96806381 bases of 95076222 (101.820%) in intersection # featureBits mm5 netRn3 # 2638255333 bases of 2615483787 (100.871%) in intersection # featureBits mm4 syntenyNetRn3 # 96760405 bases of 95076222 (101.771%) in intersection # featureBits mm5 syntenyNetRn3 # 2600849289 bases of 2615483787 (99.440%) in intersection # Add entries for net and chain to mouse/mm5 trackDb # make net ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain mkdir ratNet time netSplit rat.net ratNet # real 5m28.037s # user 3m58.150s # sys 0m37.870s # extract axts from net mkdir ../axtNet foreach n (ratNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt ratNet/$c.net chain/$c.chain \ /cluster/data/mm5/nib \ /cluster/data/rn3/nib ../axtNet/$c.axt echo "Complete: $c.net -> axtNet/$c.axt" end # sort axt's and convert to maf format mkdir ../mafNet cat << 'EOF' > makeMaf.csh foreach f (../axtNet/chr*.axt) set c=$f:t:r echo $c.axt mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt rm ../axtNet/$c.unsorted.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \ ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3. end 'EOF' #csh makeMaf.csh >&! makeMaf.log & csh makeMaf.csh > makeMaf.log & tail -100f makeMaf.log # THE ABOVE DID NOT WORK. TRIED THE FOLLOWING: foreach f (../axtNet/chr*.axt) set c=$f:t:r echo $c.axt mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt rm ../axtNet/$c.unsorted.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \ ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3. end ssh hgwdev mkdir -p /cluster/data/mm5/bed/blastz.rn3/axtBest cd /cluster/data/mm5/bed/blastz.rn3/axtBest ln -s ../axtNet/chr*.axt . 
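# (Editor's hedged sketch, not part of the original run: confirm every per-chromosome
# net axt is non-empty before it is linked into axtBest and copied to the download
# area in the next step.)
foreach f (../axtNet/chr*.axt)
  if (-z $f) echo "EMPTY: $f"
end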
# copy net axt's to download area ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # Convert those axt files to psl ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestRn3.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestRn3.psl echo "Done: ${c}_blastzBestRn3.psl" end # Load tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/pslBest bash for I in chr*BestRn3.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done ${I}" done # check results # featureBits mm5 blastzBestRn3 # 1778653886 bases of 2615483787 (68.005%) in intersection # featureBits mm4 blastzBestRn3 # 1780774716 bases of 2627444668 (67.776%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm5/axtBest/Rn3 cd /gbdb/mm5/axtBest/Rn3 ln -s /cluster/data/mm5/bed/blastz.rn3/axtNet/chr*.axt . cd /cluster/data/mm5/bed/blastz.rn3/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/mm5/axtBest/Rn3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. hgsql mm5 < axtInfoInserts.sql # BLASTZ RN3 CLEAN UP (DONE - 2004-07-26 - Fan) # NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-31. Fan. ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3 nice rm -rf raw & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} & # MAKE VSRN3 DOWNLOADABLES (DONE 9/14/04 Fan) ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain ln all.chain rat.chain foreach f (rat.chain rat.net) gzip -c $f > $f.gz end rm rat.chain # Make chain-format of raw alignments ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3 mkdir blastzECF foreach f (axtChrom/chr*.axt) set chr = $f:t:r axtToChain $f S1.len S2.len stdout \ | gzip -c - > blastzECF/$chr.ecf.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsRn3 cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3 mv /cluster/data/mm5/bed/blastz.rn3/axtChain/rat*.gz . cp -p /cluster/data/mm5/bed/blastz.rn3/axtChain/all.chain.gz \ /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/rat.chain.gz md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # Not for pushing -- handle separately. mv /cluster/data/mm5/bed/blastz.rn3/blastzECF . cd blastzECF md5sum *.gz > md5sum.txt # BLASTZ ZEBRAFISH (DANRER1) (DONE, 2004-07-29, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats # Treat all repeats as lineage-specific. 
mkdir -p /iscratch/i/mm5/linSpecRep.notInZebrafish foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mm5/linSpecRep.notInZebrafish/$f:t:r:r.out.spec end mkdir -p /iscratch/i/danRer1/linSpecRep.notInMouse foreach f (/iscratch/i/danRer1/rmsk/chr*.fa.out) cp -p $f /iscratch/i/danRer1/linSpecRep.notInMouse/$f:t:r:r.out.spec end iSync ssh kk mkdir -p /cluster/data/mm5/bed/blastz.danRer1.2004-07-27 ln -s /cluster/data/mm5/bed/blastz.danRer1.2004-07-27 \ /cluster/data/mm5/bed/blastz.danRer1 cd /cluster/data/mm5/bed/blastz.danRer1 # use same parameters as for danRer1-hg17 cat << '_EOF_' > DEF # mouse (mm5) vs zebrafish (danRer1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1 and danRer1-hg17. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm5) SEQ1_DIR=/cluster/bluearc/scratch/mus/mm5/softNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer1) SEQ2_DIR=/iscratch/i/danRer1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/danRer1/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.danRer1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.mm5-danRer1.2004-07-27 # setup cluster run # copy shell scripts for blastz runs if not there already cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/ # edit BlastZ_run0.sh # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/ # this is the directory for the latest version of blastz-run # source the DEF file bash . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 57970 of 57970 jobs # CPU time in finished jobs: 18228826s 303813.77m 5063.56h 210.98d 0.578 y # IO & Wait Time: 1019215s 16986.92m 283.12h 11.80d 0.032 y # Average job time: 332s 5.53m 0.09h 0.00d # Longest job: 2211s 36.85m 0.61h 0.03d # Submission to last job: 45422s 757.03m 12.62h 0.53d # Took about 12 hours to run and output is 1.7G # second cluster run to convert the .out's to .lav's cd /cluster/data/mm5/bed/blastz.danRer1 bash # if a csh/tcsh user . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# para time # Checking finished jobs # Completed: 341 of 341 jobs # CPU time in finished jobs: 4536s 75.60m 1.26h 0.05d 0.000 y # IO & Wait Time: 65931s 1098.85m 18.31h 0.76d 0.002 y # Average job time: 207s 3.44m 0.06h 0.00d # Longest job: 636s 10.60m 0.18h 0.01d # Submission to last job: 1282s 21.37m 0.36h 0.01d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/mm5/bed/blastz.danRer1 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/bluearc/scratch/mus/mm5/softNib \ /iscratch/i/danRer1/nib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.danRer1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 246s 4.10m 0.07h 0.00d 0.000 y # IO & Wait Time: 4985s 83.08m 1.38h 0.06d 0.000 y # Average job time: 122s 2.03m 0.03h 0.00d # Longest job: 446s 7.43m 0.12h 0.01d # Submission to last job: 653s 10.88m 0.18h 0.01d # translate sorted axt files into psl ssh kolossus cd /cluster/data/mm5/bed/blastz.danRer1 mkdir -p pslChrom set tbl = "blastzDanRer1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl mm5 $f end # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1 -enrichment #refGene:cds 0.763%,blastzDanRer1 2.918%,both 0.512%,cover 67.12%,enrich 23.00x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L4000 -enrichment # refGene:cds 0.763%, blastzDanRer1L4000 17.878%, both 0.581%, cover 76.18%, # enrich 4.26x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L5000 -enrichment # refGene:cds 0.763%,blastzDanRer1L5000 6.013%,both 0.540%,cover 70.81%, # enrich 11.78x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L6500 -enrichment # refGene:cds 0.763%, blastzDanRer1L6500 2.386%, both 0.495%, cover 64.91%, # enrich 27.20x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L7000 -enrichment # refGene:cds 0.763%, blastzDanRer1L7000 2.062%, both 0.480%, cover 62.87%, # enrich 30.50x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1HumanParams -enrichment # refGene:cds 0.763%,blastzDanRer1HumanParams 1.661%,both 0.502%, cover 65.82%, # enrich 39.64x # row counts: 172167 blastzDanRer1, # 2288714 blastzDanRer1HumanParams, # 3373525 blastzDanRer1L4000 # 700927 blastzDanRer1L5000 # 13719318 blastzDanRer1L3000 # 103190 blastzDanRer1L6500 # 76758 blastzDanRer1L7000 # Do test runs - repeat above using L=4000 and then try the mm5-hg17 parameters # also L=2000, L=3000 and L=5000. Use only mm5 chr1 for tests. # L=2000 and L=3000 lavToAxt crashed so re-do on kolossus. L2000 crashed again # probably ran out of memory. # The orginal blastzDanRer1 with L= 6000 looks best: good coverage and # enrichment without too many alignments in the database table. # RESCORE DANRER1 BLASTZ ALIGNMENTS (DONE, 2004-08-02, hartera) # Low scores can occur with repeats abridged and using the # HoxD55.q matrix. 
PSU's restore_rpts program rescored alignments # with the default matrix instead of the BLASTZ_Q matrix. # Rescore them here so the chainer sees the higher scores: ssh kolossus cd /cluster/data/mm5/bed/blastz.danRer1 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.orig mv axtChrom.rescore axtChrom # psl files and blastz tables will be the same regardless of score so # no need to reload # CHAIN DANRER1 BLASTZ (DONE, 2004-08-03, hartera) # FILTERED WITH A HIGHER MINSCORE THRESHOLD (DONE, 2004-08-04, hartera) # RELOADED TABLES (DONE, 2004-08-18, hartera) # removed all chainDanRer1 and chainDanRer1Link tables, some extra tables had # been accidentally loaded with this name from a different genome so there # were duplicate chain ids causing joinerCheck to complain. # Re do chains with rescored blastz danRer1 # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.danRer1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.danRer1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties, # in hopes of getting longer chains: cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap \ -minScore=5000 $1 \ /cluster/bluearc/scratch/mus/mm5/softNib \ /iscratch/i/danRer1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 2260s 37.67m 0.63h 0.03d 0.000 y # IO & Wait Time: 863s 14.38m 0.24h 0.01d 0.000 y # Average job time: 73s 1.21m 0.02h 0.00d # Longest job: 342s 5.70m 0.10h 0.00d # Submission to last job: 36951s 615.85m 10.26h 0.43d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer1/axtChain chainMergeSort run1/chain/*.chain > all.chain # filter again to use minScore of 7500 (see featureBits below) (2004-08-04) mv all.chain all.chain.filt5k chainFilter -minScore=7500 all.chain.unfiltered > all.chain # remove old chains rm -r chain chainSplit chain all.chain gzip all.chain.filt5k # take a look at score distr's,try also with smaller bin size. foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist.out textHistogram -binSize=10000 /tmp/score.$f:t:r >> hist.out echo "" end # also hist5000.out has bin size 5000. 
looks good so load into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainDanRer1 $i echo done $c end # featureBits still shows good coverage and enrichment # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Link -enrichment # refGene:cds 0.763%, chainDanRer1Link 2.246%, both 0.508%, cover 66.61%, # enrich 29.65x # Human Parameters Blastz Chain with minScore = 5,000 filter: # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1HPLink -enrichment # refGene:cds 0.763%, chainDanRer1HPLink 1.208%, both 0.484%, cover 63.43%, # enrich 52.49x # L=5000 Blastz Chain with minScore = 5,000 filter: # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1L5kLink -enrichment # refGene:cds 0.763%, chainDanRer1L5kLink 4.137%, both 0.534%, cover 69.96%, # enrich 16.91x # L=5000 Blastz Chain with minScore =10,000 filter: # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1L5k10kLink -enrichment # refGene:cds 0.763%, chainDanRer1L5k10kLink 1.038%, both 0.448%, cover 58.69%, # enrich 56.54x # filter too stringent, coverage has dropped a lot # with less filtering of blastzDanRer1 where minScore =3000 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt3kLink -enrichment # refGene:cds 0.763%, chainDanRer1Filt3kLink 2.487%, both 0.509%, cover 66.78%, # enrich 26.86x # with more filtering, minScore = 6000 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt6kLink -enrichment # refGene:cds 0.763%, chainDanRer1Filt6kLink 2.172%, both 0.508%, cover 66.54%, # enrich 30.64x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt7500Link -enrichment # refGene:cds 0.763%, chainDanRer1Filt75kLink 2.022%, both 0.504%, cover 66.10%,# enrich 32.70x # rows in database table: # chr1_blastzDanRer1Link: 515119 # chr1_chainDanRer1L5kLink: 1241480 # chr1_chainDanRer1L5k10kLink: 74963 # chr1_chainDanRer1HPLink: 309740 # chr1_chainDanRer1Filt3k: 594057 # chr1_chainDanRer1Filt6kLink: 479368 # chr1_chainDanRer1Filt7500Link: 378954 # Using the original parameters is a good compromise between high coverage # and high enrichment but a filter of 7500 on the score produces only a tiny # reduction in coverage with higher enrichment as there are a lot less # alignments of low score of the same regions or other low scoring alignments. # NET DANRER1 BLASTZ (DONE, 2004-08-04, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer1/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
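# (Editor's hedged sketch, not part of the original run: a quick check that chainNet
# wrote one net per chain before they are combined by netSyntenic below.)
ls chain/*.chain | wc -l
ls n1/*.net | wc -l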
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 103493632, utime 668 s/100, stime 127 # Add classification info using db tables: cd /cluster/data/mm5/bed/blastz.danRer1/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \ /cluster/bluearc/mm5/linSpecRep.notInZebrafish cp /iscratch/i/danRer1/linSpecRep.notInMouse/* \ /cluster/bluearc/danRer1/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer1/axtChain time netClass noClass.net mm5 danRer1 danRer1.net \ -tNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish \ -qNewR=/cluster/bluearc/danRer1/linSpecRep.notInMouse -noAr # 77.700u 46.610s 3:05.75 66.9% 0+0k 0+0io 215pf+0w netFilter -minGap=10 danRer1.net | hgLoadNet mm5 netDanRer1 stdin # EXTRACT AXTs AND MAFs FROM ZEBRAFISH (danRer1) NET # (DONE, 2004-08-04, hartera) ssh eieio # create axts cd /cluster/data/mm5/bed/blastz.danRer1/axtChain netSplit danRer1.net danRer1Net mkdir -p ../axtNet cat > axtNet.csh << 'EOF' foreach f (danRer1Net/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt danRer1Net/$c.net chain/$c.chain \ /cluster/data/mm5/mixedNib \ /cluster/data/danRer1/nib ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end 'EOF' chmod +x axtNet.csh csh axtNet.csh >&! axtNet.log & tail -100f axtNet.log # sort axts before making mafs - must be sorted for multiz cd /cluster/data/mm5/bed/blastz.danRer1 mv axtNet axtNet.unsorted mkdir axtNet foreach f (axtNet.unsorted/*.axt) set c = $f:t:r echo "Sorting $c" axtSort $f axtNet/$c.axt end # create maf ssh eieio cd /cluster/data/mm5/bed/blastz.danRer1 cd axtNet mkdir ../mafNet cat > makeMaf.csh << 'EOF' foreach f (chr*.axt) set maf = $f:t:r.danRer1.maf echo translating $f to $maf axtToMaf $f \ /cluster/data/mm5/chrom.sizes /cluster/data/danRer1/chrom.sizes \ ../mafNet/$maf -tPrefix=mm5. -qPrefix=danRer1. end 'EOF' chmod +x makeMaf.csh csh makeMaf.csh >&! 
makeMaf.log & tail -100f makeMaf.log # BLASTZ DANRER1 CLEAN UP (DONE, 2004-08-04, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer1 nice rm -rf raw & nice rm -rf lav & nice rm -rf axtChrom.orig & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} & # unzip all.chain.gz and danRer1.net.gz to make vsDanRer1 downloadables # then zip these again (hartera, 2004-09-10) # UPDATE BACEND SEQUENCES (DONE - 2004-07-20 - Fan) # Download new files ssh kksilo mkdir –p /cluster/data/mm5/bed/bacends/ncbi cd /cluster/data/mm5/bed/bacends/ncbi wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/AllBACends.mfa.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/cl_acc_gi_len.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/README gunzip AllBACends.mfa.gz gunzip cl_acc_gi_len.gz # Convert fa file cat << '_EOF_' > convert.pl #!/usr/local/bin/perl -w use strict; while (my $line = <>) { if (substr($line,0,1) ne ">") { print $line; } else { my @fields = split(/\|/, $line); my $printed = 0; for (my $i = 0; $i < $#fields; $i++) { if ($fields[$i] eq "gb") { (my $name, my $vers) = split(/\./,$fields[$i+1]); print ">$name\n"; $i= $#fields; $printed = 1; } } if (!$printed) { die("Failed for $line\n"); } } } '_EOF_' chmod +x convert.pl ./convert.pl < AllBACends.mfa > BACends.fa # Create new pairs files convertBacEndPairInfo cl_acc_gi_len # Split file into pieces and copy to cluster to propagate ssh kksilo cd /cluster/data/mm5/bed/bacends/ncbi /cluster/bin/i386/faSplit sequence BACends.fa 100 BACends rm -rf /cluster/bluearc/scratch/mus/mm5/bacEnds mkdir /cluster/bluearc/scratch/mus/mm5/bacEnds mv BACends???.fa /cluster/bluearc/scratch/mus/mm5/bacEnds cp -p BACends.fa /cluster/bluearc/scratch/mus/mm5/bacEnds # Ask for propagation from sysadmin # Load the sequences (change bacends.# to match correct location) ssh hgwdev mkdir /gbdb/mm5/bacends cd /gbdb/mm5/bacends ln -s /cluster/data/mm5/bed/bacends/ncbi/BACends.fa . cd /tmp hgLoadSeq mm5 /gbdb/mm5/bacends/BACends.fa #Adding /gbdb/mm5/bacends/BACends.fa #452237 sequences #Updating seq table # One additional step 9/10/04 Fan. # Create a composite index to speed up hgTracks display when BAC Ends track selected. hgsql mm5 -e 'create index bacIndex2 on all_bacends(bin, qName(8));' # This will take hours. #All done # BACEND SEQUENCE ALIGNMENTS (DONE - 2004-07-23 - Fan) # (alignments done without RepeatMasking) # We need an ooc file for this genome ssh kksilo mkdir /cluster/data/mm5/ooc cd /cluster/data/mm5/ooc ls ../unmaskedNib/chr*.nib > nib.list blat -makeOoc=11.ooc -repMatch=1024 nib.list nib.list output.psl # Wrote 26077 overused 11-mers to 11.ooc # Did not end using this. Used an old one instead. 
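# (For reference: an .ooc file simply lists the 11-mers that occur more
# than -repMatch times in the genome; blat skips those over-represented
# tiles when seeding alignments.  The BAC end runs below pass the older
# /scratch/hg/h/mouse11.ooc to blat instead, as noted.)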
# Create full sequence alignments ssh kk cd /cluster/data/mm5/bed/bacends /cluster/bin/scripts/splitContigList -scratch /iscratch/i/mus/mm5/maskedContigs 1 # allow blat to run politely in /tmp while it writes output, then # copy results to results file: cat << '_EOF_' > runBlat.sh #!/bin/sh path1=$1 path2=$2 root1=$3 root2=$4 result=$5 rm -fr /tmp/${root1}_${root2} mkdir /tmp/${root1}_${root2} pushd /tmp/${root1}_${root2} /cluster/bin/i386/blat ${path1} ${path2} -ooc=/scratch/hg/h/mouse11.ooc \ ${root1}.${root2}.psl popd rm -f ${result} mv /tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /tmp/${root1}_${root2} '_EOF_' # << this line keeps emacs coloring happy chmod +x runBlat.sh cat << '_EOF_' > template #LOOP ./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy #ls -1S /iscratch/i/mm5/bacEnds/BACends???.fa > bacEnds.lst ls -1S /scratch/mus/mm5/bacEnds/BACends???.fa > bacEnds.lst mkdir bacEnds.out # create results directories for each to avoid the all result files in # one directory problem foreach f (`cat bacEnds.lst`) set b = $f:t:r echo $b mkdir bacEnds.out/$b end gensub2 contig.lst bacEnds.lst template jobList para create jobList # 62622 jobs written to batch para try, check, push, etc ... # Completed: 62622 of 62622 jobs # CPU time in finished jobs: 3760354s 62672.57m 1044.54h 43.52d 0.119 y # IO & Wait Time: 3216480s 53608.00m 893.47h 37.23d 0.102 y # Average job time: 111s 1.86m 0.03h 0.00d # Longest job: 2841s 47.35m 0.79h 0.03d # Submission to last job: 9395s 156.58m 2.61h 0.11d # Compile alignments and lift the files. # First attempt failed due to /cluster/store6 ran out of space. # Redoing it 7/22/04. ssh kksilo cd /cluster/data/mm5/bed/bacends mkdir /cluster/store8/fanTemp time pslSort dirs raw.psl /cluster/store8/fanTemp bacEnds.out/* \ > time.out & # This may take over over 14 hours! ssh kolossus cd /cluster/data/mm5/bed/bacends time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl bacEnds.psl /dev/null # Processed 562840490 alignments rmdir temp # You will want to keep this file around until later processing is # proven correct rm raw.psl # 72 Gb ! It takes a while even to remove it. ssh kksilo cd /cluster/data/mm5/bed/bacends time /cluster/bin/scripts/lifter -psl -mouse /cluster/data/mm5 bacEnds.psl # real 130m36.149s # user 82m38.180s # sys 10m59.580s cp -p ~booch/clusterJobs/bacends/split.pl . cp -p ~booch/clusterJobs/bacends/header . 
time ./split.pl header < bacEnds.psl.lifted # real 2m16.354s # user 0m36.390s # sys 0m42.290s cp -p bacEnds.psl.lifted bacEnds.psl.lifted.save time pslSort dirs bacEnds.psl.lifted temp split # real 17m2.353s # user 14m17.040s # sys 1m38.560s rmdir temp rm -r split # Copy files to final destination and remove mkdir /cluster/data/mm5/bacends cp -p bacEnds.psl.lifted /cluster/data/mm5/bacends # BACEND PAIRS TRACK (DONE 2004-07-27 - Fan) ssh kolossus cd /cluster/data/mm5/bacends bash time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl.lifted \ ../bed/bacends/ncbi/bacEndPairs.txt all_bacends bacEnds # create header required by "rdb" tools echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header # edit header to make sure \t is/become tab character cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg - del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \ | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed # The following took too long, break it into 3 steps. # extractPslLoad -noBin bacEnds.psl.lifted bacEndPairs.bed \ # bacEndPairsBad.bed | sorttbl tname tstart | headchg -del > bacEnds.load.psl extractPslLoad -noBin bacEnds.psl.lifted bacEndPairs.bed \ bacEndPairsBad.bed >j1.out cat j1.out| sorttbl tname tstart >j2.out cat j2.out | headchg -del > bacEnds.load.psl rm j1.out j2.out # load into database ssh hgwdev cd /cluster/data/mm5/bacends # edit bacEndPairs.bed to fix one ID that has a blank character in it. hgLoadBed mm5 bacEndPairs bacEndPairs.bed \ -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql # Loaded 168535 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed mm5 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 43182 #hgLoadPsl mm5 -nobin -table=all_bacends bacEnds.load.psl # NOTE: truncates file to 0 if -nobin is used hgLoadPsl mm5 -table=all_bacends bacEnds.load.psl # load of all_bacends did not go as planned: 14426473 record(s), 0 row(s) skipped, 4519 warning(s) loading psl.tab # featureBits mm5 all_bacends # 268502414 bases of 2615483787 (10.266%) in intersection # featureBits mm4 all_bacends # 243096171 bases of 2627444668 (9.252%) in intersection # featureBits mm5 bacEndPairs # 2567958504 bases of 2615483787 (98.183%) in intersection # featureBits mm4 bacEndPairs # 2549945356 bases of 2627444668 (97.050%) in intersection # featureBits mm5 bacEndPairsBad # 541027882 bases of 2615483787 (20.686%) in intersection # featureBits mm4 bacEndPairsBad # 1074505863 bases of 2627444668 (40.895%) in intersection # BLASTZ FUGU (FR1) (WORKING 7/28/04 kate) # Using Angie's hg17/fugu as a model # Treat all mouse repeats as lineage-specific (same as chicken, so just # reuse linSpecRep.Chicken). ssh kkr1u00 ln -s /iscratch/i/mus/mm5/linSpecRep.notInChicken \ /iscratch/i/mus/mm5/linSpecRep.notInFugu iSync ssh kk cd /cluster/data/mm5/bed mkdir blastz.fr1.2004-07-28 ln -s blastz.fr1.2004-07-28 blastz.fr1 cd blastz.fr1 cat << '_EOF_' > DEF # mouse vs. 
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from human-chicken, except L=6000 (more relaxed) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse SEQ1_DIR=/iscratch/i/mus/mm5/softNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInFugu SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Fugu SEQ2_DIR=/iscratch/i/fr1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/fr1/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.fr1.2004-07-28 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # first cluster run: raw blastz alignments ssh kk bash # if a csh/tcsh user cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 source DEF mkdir $RAW run.0 /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j sh ./xdir.sh cd run.0 sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList para create jobList para try, check, push, check, .... # GOT HERE #Completed: 93775 of 93775 jobs #Average job time: 187s 3.11m 0.05h 0.00d #Longest job: 3907s 65.12m 1.09h 0.05d #Submission to last job: 76763s 1279.38m 21.32h 0.89d # second cluster run: lift raw alignments -> lav dir ssh kki bash # if a csh/tcsh user cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 source DEF mkdir run.1 lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList para create jobList para try, check, push, etc ... #Completed: 341 of 341 jobs #Average job time: 98s 1.63m 0.03h 0.00d #Longest job: 281s 4.68m 0.08h 0.00d #Submission to last job: 2102s 35.03m 0.58h 0.02d # third run: lav -> axt # (if non-default BLASTZ_Q is used in the future, put axtRescore in # the pipe after lavToAxt) ssh kki cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | $HOME/bin/x86_64/lavToAxt stdin \ /iscratch/i/mus/mm5/softNib /iscratch/i/fr1/nib stdout \ | $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt $HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 43 of 43 jobs #Average job time: 671s 11.18m 0.19h 0.01d #Longest job: 2398s 39.97m 0.67h 0.03d #Submission to last job: 2417s 40.28m 0.67h 0.03d # CHAIN FUGU BLASTZ (WORKING 7/16/04 kate) # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chainchimpSuperQuals ls -1S /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/fr1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 43 of 43 jobs #Average job time: 537s 8.96m 0.15h 0.01d #Longest job: 2071s 34.52m 0.58h 0.02d #Submission to last job: 2071s 34.52m 0.58h 0.02d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Lots of chaff with scores in the 3000's. Many very-high-scoring # chains. So filter the chain down somewhat... mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainFr1 $i end # mouse-fugu gets significantly less coverage than human-fugu: featureBits mm5 -chrom=chr1 chainFr1Link #63386139 bases of 185739816 (34.126%) in intersection featureBits hg17 -chrom=chr1 chainFr1Link #123999291 bases of 222827847 (55.648%) in intersection # mouse-fugu isn't a whole lot less than mouse-human though: featureBits mm5 -chrom=chr1 chainHg17Link #75492250 bases of 185739816 (40.644%) in intersection featureBits mm5 -chrom=chr1 chainCanFam1Link #63386139 bases of 185739816 (34.126%) in intersection # NET FUGU BLASTZ (WORKING 7/16/04 kate) ssh kolossus cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain netClass -noAr noClass.net mm5 fr1 fugu.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn fugu.net > fuguSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain netFilter -minGap=10 fugu.net | hgLoadNet mm5 netFr1 stdin netFilter -minGap=10 fuguSyn.net | hgLoadNet mm5 syntenyNetFr1 stdin # Add entries for chainFr1, netFr1 to mouse/mm5 trackDb # MAKE VSFR1 DOWNLOADABLES (WORKING 7/19/04 kate) ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 gzip axtNet/chr*.axt cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain ln all.chain fugu.chain zip /cluster/data/mm5/zip/fugu.chain.zip fugu.chain rm fugu.chain zip /cluster/data/mm5/zip/fugu.net.zip fugu.net zip /cluster/data/mm5/zip/fuguSyn.net.zip fuguSyn.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsFr1 cd /usr/local/apache/htdocs/goldenPath/mm5/vsFr1 mv /cluster/data/mm5/zip/fugu*.zip . cp -pR /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtNet . md5sum *.zip axtNet/* > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
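# (Quick check of the downloadables, sketch only: re-verify the checksums
# just written, from the download directory.)
# cd /usr/local/apache/htdocs/goldenPath/mm5/vsFr1
# md5sum -c md5sum.txt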
# GENERATE FR1 MAF FOR MULTIZ FROM NET (WORKING 7/19/04 kate) ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain netSplit fugu.net net cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/nib \ /cluster/data/fr1/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.mc.maf axtToMaf $f \ /cluster/data/mm5/chrom.sizes /cluster/data/fr1/chrom.sizes \ $maf -tPrefix=mm5. -qPrefix=fr1. end # BLASTZ FR1 CLEAN UP (WORKING - 2004-07-28 - kate) ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1 nice rm -rf raw & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} & # CONSERVATION TRACK - MULTIZ AND PHASTCONS (WORKING 2004-07-29 kate) ssh kksilo set multizDir = multiz.2004-07-29 set workingDir = /cluster/bluearc/mm5/$multizDir ln -s $workingDir /cluster/bluearc/mm5/multiz5way mkdir -p $workingDir mkdir -p /cluster/data/mm5/bed/$multizDir cd /cluster/data/mm5/bed/$multizDir # wrapper script for multiz # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) # NOTE: next time, modify script so it only needs one arg -- saves the # multiple dirname in a file for use by the next run cat << 'EOF' > doMultiz.csh #!/bin/csh -fe mkdir -p $3:h /cluster/bin/penn/multiz $1 $2 - > $3 'EOF' # << for emacs cat << 'EOF' > gsub #LOOP ../doMultiz.csh {check in line /cluster/bluearc/mm5/multiz.2004-07-29/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/mm5/multiz.2004-07-29/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/mm5/multiz.2004-07-29/$(root1)$(dir1)/$(root2).maf} #ENDLOOP 'EOF' # << for emacs chmod +x doMultiz.csh # copy mafs to bluearc -- rat ssh kksilo set workingDir = /cluster/bluearc/mm5/multiz.2004-07-29 mkdir $workingDir/rn3 cp /cluster/data/mm5/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3 ls $workingDir/rn3/*.maf > chrom.lst # human mkdir $workingDir/hg17 cp /cluster/data/mm5/bed/blastz.hg17/mafNet/chr*.maf $workingDir/hg17 # dog mkdir $workingDir/canFam1 cp /cluster/data/mm5/bed/blastz.canFam1/mafNet/chr*.maf $workingDir/canFam1 # chicken mkdir $workingDir/galGal2 cp /cluster/data/mm5/bed/blastz.galGal2/mafNet/chr*.maf $workingDir/galGal2 # first multiz - add in human to mouse/rat # ssh kki set multizDir = multiz.2004-07-29 set workingDir = /cluster/bluearc/mm5/$multizDir cd /cluster/data/mm5/bed/$multizDir mkdir run.hg17 cd run.hg17 echo "hg17/rn3" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList para create jobList # 43 jobs para try, check, push, check cd .. # dog mkdir run.canFam1 cd run.canFam1 echo "canFam1/rn3hg17" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList para create jobList para try, check, push, check cd .. # chicken mkdir run.galGal2 cd run.galGal2 echo "galGal2/rn3hg17canFam1" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList # no alignment file for chr18_random -- create one so we can create jobList para create jobList para try, check, push, check cd .. 
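# (For reference: with the species.lst entry "galGal2/rn3hg17canFam1" the
# gsub template above expands, per chromosome, to roughly
#   ../doMultiz.csh galGal2/chrN.maf rn3hg17canFam1/chrN.maf \
#       rn3hg17canFam1galGal2/chrN.maf
# i.e. the pairwise chicken maf is folded into the existing 4-way multiple
# alignment, producing the 5-way rn3hg17canFam1galGal2 directory used
# below.  Paths shown are abbreviated; the real jobList carries the full
# /cluster/bluearc/mm5/multiz.2004-07-29 prefixes.)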
# copy 5-way mafs to build directory ssh kksilo set multizDir = multiz.2004-07-29 set workingDir = /cluster/bluearc/mm5/$multizDir ln -s $workingDir/rn3hg17canFam1galGal2 $workingDir/maf cd /cluster/data/mm5/bed/multiz.2004-07-29 mkdir maf cp $workingDir/maf/*.maf maf # PHYLO-HMM CONSERVATION FOR 5-WAY MULTIZ (DONE 2004-07-29 kate) # updated 09-13-04 acs ssh kksilo set path = ($path /cluster/bin/phast) cd /cluster/data/mm5/bed/multiz.2004-07-29 mkdir cons cd cons #break up the genome-wide MAFs into pieces mkdir /cluster/bluearc/mm5/chrom cd /cluster/data/mm5 foreach f (?{,?}/*.fa) echo $f cp $f /cluster/bluearc/mm5/chrom end ssh kki cd /cluster/data/mm5/bed/multiz.2004-07-29/cons mkdir run.split cd run.split set WINDOWS = /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/sh PHAST=/cluster/bin/phast FA_SRC=/cluster/bluearc/mm5/chrom WINDOWS=/cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS maf=$1 c=`basename $maf .maf` echo $c mkdir -p /scratch/msa_split ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O mm5,rn3,hg17,canFam1,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000 [ $? -eq 0 ] || exit 1 echo "Copying..." cd /scratch/msa_split for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done [ $? -eq 0 ] || exit 1 rm -f /scratch/msa_split/$c.*.ss echo "Done copying" echo "Done" >> ${WINDOWS}/$c.done 'EOF' # << for emacs chmod +x doSplit.sh rm -f jobList foreach file (/cluster/bluearc/mm5/multiz.2004-07-29/maf/*.maf) set c = $file:t:r echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 43 jobs para try para check para push #CPU time in finished jobs: 4354s 72.57m 1.21h 0.05d 0.000 y #IO & Wait Time: 6102s 101.70m 1.70h 0.07d 0.000 y #Average job time: 243s 4.05m 0.07h 0.00d #Longest job: 728s 12.13m 0.20h 0.01d #Submission to last job: 1300s 21.67m 0.36h 0.02d cd .. # generate conservation scoring using phastCons ssh kk cd /cluster/data/mm5/bed/multiz.2004-07-29/cons mkdir run.cons cd run.cons # skip parameter estimation step: use parameters already estimated for # hg17 (see makeHg17.doc) cp /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/ave.cons.mod /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/ave.noncons.mod . cat << 'EOF' > doPhastCons.sh #!/bin/sh mkdir -p /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/ELEMENTS pref=`basename $1 .ss.gz` chr=`echo $pref | awk -F\. '{print $1}'` tmpfile=/scratch/phastCons.$$ zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 12 --target-coverage 0.15 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/mm5/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile gzip -c $tmpfile > /cluster/bluearc/mm5/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile EOF chmod u+x doPhastCons.sh rm -fr /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/ELEMENTS rm -f jobs.lst for f in /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs.lst ; done # run cluster job para create, ... 
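# (For one window a job looks like this -- window name is illustrative:)
# ./doPhastCons.sh \
#   /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS/chr19.3000001-4000000.ss.gz
# which writes POSTPROBS/chr19.3000001-4000000.pp.gz and an ELEMENTS bed
# for that window.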
# took about 10 minutes # combine predictions and transform scores to be in 0-1000 interval # do in a way that avoids limits on numbers of args find /cluster/bluearc/mm5/phastCons/ELEMENTS -name "*.bed" > files rm -f splitfiles* all.raw.bed split files splitfiles for s in splitfiles* ; do awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed ; done /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed rm files splitfiles* hgLoadBed mm5 phastConsElements all.bed # check coverage featureBits mm5 phastConsElements #135605549 bases of 2615483787 (5.185%) in intersection # This should be close enough. If necessary, you can rerun the # steps above with a different target coverage. When hitting the # target is important, you may want to perform several iterations # using a representative subset of the entire dataset (in human, chr1 # seems to work pretty well) # set up wiggle mkdir -p /cluster/bluearc/mm5/phastCons/wib cat << 'EOF' > doWigAsciiToBinary.sh #!/bin/sh chr=$1 zcat `ls /cluster/bluearc/mm5/phastCons/POSTPROBS/$chr.*.pp.gz | sort -t\. -k2,2n` | wigAsciiToBinary -chrom=$chr -wibFile=/cluster/bluearc/mm5/phastCons/wib/${chr}_phastCons stdin EOF chmod u+x doWigAsciiToBinary.sh rm -f jobs2.lst for chr in `ls /cluster/bluearc/mm5/phastCons/POSTPROBS | awk -F\. '{print $1}' | sort -u` ; do echo doWigAsciiToBinary.sh $chr >> jobs2.lst ; done # run a little wigAsciiToBinary cluster job ssh kk, etc. # copy wibs and wigs from bluearc rsync -av /cluster/bluearc/mm5/phastCons/wib . # load track hgLoadWiggle mm5 phastCons -pathPrefix=/gbdb/mm5/phastCons/wib \ wib/chr*_phastCons.wig mkdir -p /gbdb/mm5/phastCons/wib rm -f /gbdb/mm5/phastCons/wib/chr*phastCons.wib ln -s /cluster/data/mm5/bed/multiz.2004-07-29/cons/run.cons/wib/*.wib /gbdb/mm5/phastCons/wib chmod 775 . wib /gbdb/mm5/phastCons /gbdb/mm5/phastCons/wib chmod 664 wib/*.wib # move postprobs over and clean up bluearc rsync -av /cluster/bluearc/mm5/phastCons/POSTPROBS . # (people sometimes want the raw scores) rm -r /cluster/bluearc/mm5/phastCons/ELEMENTS /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/wib # load data for track name "multiz5way" # load multiz maf tables ssh hgwdev cd /cluster/data/mm5/bed/multiz.2004-07-29 set mafDir = /gbdb/mm5/multiz5way/maf set table = multiz5way mkdir -p $mafDir/$table ln -s `pwd`/maf/*.maf $mafDir/$table cd maf hgLoadMaf mm5 -warn multiz5way -pathPrefix=$mafDir/$table # load blastz maf tables # TODO: change mafWiggle to use db names instead of species names # in speciesOrder ssh hgwdev cd /cluster/data/mm5/bed ln -s multiz.2004-07-29 multiz5way cat > multiz5way/loadMaf.csh << 'EOF' set mafDir = /gbdb/mm5/multiz5way/maf foreach s (rn3 hg17 canFam1 galGal2) set O = `echo "select genome from dbDb where name='$s'" | \ hgsql -s -h genome-testdb hgcentraltest` set o = $O:l set table = ${o}_netBlastz mkdir -p $mafDir/$table ln -s `pwd`/blastz.$s/mafNet/*.maf $mafDir/$table echo $o hgLoadMaf mm5 -warn ${o}_netBlastz -pathPrefix=$mafDir/$table end 'EOF' # <&! 
multiz5way/loadMaf.log & # track multiz5way # shortLabel Conservation # longLabel Rat/Human/Dog/Chicken Multiz Alignments & PhyloHMM Cons # group compGeno # priority 149 # visibility pack #color 0, 10, 100 # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons # yLineOnOff Off # autoScale Off # pairwise netBlastz # speciesOrder rat human dog chicken # MULTIZ DOWNLOAD FILES (DONE kate 2004-08-03) ssh kksilo cd /cluster/data/mm5/bed/multiz5way # multiz mkdir gzMaf foreach f (maf/*.maf) gzip -c $f > gzMaf/$f:t.gz echo $f end ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/multiz5way cd /usr/local/apache/htdocs/goldenPath/mm5/multiz5way mv /cluster/data/mm5/bed/multiz5way/gzMaf/* . rmdir /cluster/data/mm5/bed/multiz5way/gzMaf md5sum *.gz > md5sum.txt # make a README.txt file # PHASTCONS SCORES DOWNLOADABLES (DONE 10/11/04 angie) ssh kksilo mkdir /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 cd /cluster/data/mm5/bed/multiz5way/cons/run.cons/POSTPROBS foreach chr (`awk '{print $1;}' /cluster/data/mm5/chrom.sizes`) echo $chr zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` \ | gzip -c \ > /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2/$chr.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/phastCons # Doh! /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 is 8.6G now -- too much # to dump on hgwdev's / which is at 94%. Instead of doing this: #mv /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 . # make symbolic links: mkdir /usr/local/apache/htdocs/goldenPath/mm5/phastCons/mzRn3Hg17Cf1Gg2 cd /usr/local/apache/htdocs/goldenPath/mm5/phastCons/mzRn3Hg17Cf1Gg2 ln -s /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2/* . md5sum *.gz > md5sum.txt # make a README.txt. # PREP FOR LIFTOVER CHAINS TO THIS ASSEMBLY (2004-08-02 kate) # split into 3K chunks ssh kksilo cd /cluster/data/mm5 set liftDir = /iscratch/i/mm5/liftOver/liftSplit mkdir -p $liftDir cd $liftDir mkdir -p split lift cat > split.csh << 'EOF' set liftDir = /iscratch/i/mm5/liftOver/liftSplit cd /cluster/data/mm5 foreach n (`ls ?{,?}/*.fa`) set d = $n:h set c = $n:t:r echo $c faSplit -lift=$liftDir/lift/$c.lft size \ /cluster/data/mm5/$d/$c.fa -oneFile 3000 $liftDir/split/$c end 'EOF' # << for emacs csh split.csh >&! split.log & tail -100f split.log ssh kkr1u00 iSync # LOAD GENEID GENES (DONE 8/2/04 Fan) # reloaded 3/16/04 with -gtf instead of -exon=CDS (nec. now! for stop_codon) mkdir -p /cluster/data/mm5/bed/geneid/download cd /cluster/data/mm5/bed/geneid/download foreach f (/cluster/data/mm5/*/chr*.fa) set chr = $f:t:r wget \ http://genome.imim.es/genepredictions/M.musculus/mmMay2004/geneid_v1.2/$chr.gtf wget \ http://genome.imim.es/genepredictions/M.musculus/mmMay2004/geneid_v1.2/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene -genePredExt -gtf mm5 geneid download/*.gtf hgPepPred mm5 generic geneidPep download/*-fixed.prot # PRODUCING GENSCAN PREDICTIONS (DONE 08-03-04 Fan) ssh hgwdev mkdir /cluster/data/mm5/bed/genscan cd /cluster/data/mm5/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). 
ssh kki cd /cluster/data/mm5/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/data/mm5/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... # Completed: 638 of 639 jobs # Crashed: 1 jobs # CPU time in finished jobs: 386282s 6438.03m 107.30h 4.47d 0.012 y # IO & Wait Time: 3735s 62.25m 1.04h 0.04d 0.000 y # Average job time: 611s 10.19m 0.17h 0.01d # Longest job: 22687s 378.12m 6.30h 0.26d # Submission to last job: 33710s 561.83m 9.36h 0.39d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". /cluster/bin/i386/gsBig /cluster/data/mm5/19/chr19_1/chr19_1.fa.masked gtf/chr19_1.fa.gtf -trans=pep/chr19_1.fa.pep -subopt=subopt/chr19_1.fa.bed - exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat - tmp=/tmp -window=1200000 # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/mm5/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/mm5/bed/genscan # Reloaded without -genePredExt 1/6/05: ldHgGene mm5 -gtf genscan genscan.gtf hgPepPred mm5 generic genscanPep genscan.pep hgLoadBed mm5 genscanSubopt genscanSubopt.bed # MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie) ssh hgwdev mkdir /cluster/data/mm5/bed/mitopred cd /cluster/data/mm5/bed/mitopred wget http://mitopred.sdsc.edu/data/mus_30.out perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' mus_30.out > mitopred.tab cat > mitopred.sql << '_EOF_' # Prediction of nuclear-encoded mito. 
proteins from http://mitopred.sdsc.edu/ CREATE TABLE mitopred ( name varchar(10) not null, # SwissProt ID confidence varchar(8) not null, # Confidence level #Indices PRIMARY KEY(name(6)) ); '_EOF_' # << this line makes emacs coloring happy hgsql mm5 < mitopred.sql hgsql mm5 -e 'load data local infile "mitopred.tab" into table mitopred' # STS MARKERS TRACK (RE-BUILT - 2004-08-24- Fan) ssh kksilo mkdir -p /cluster/data/mm5/bed/STSmarkers/downloads cd /cluster/data/mm5/bed/STSmarkers/downloads # these files appear to be new almost every day wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # these map files appear to be old, 2002 Data wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/* # Picks up files: # 345184 Feb 20 2002 10090.MGD.txt # 173294 Jun 27 2002 10090.WI_Mouse_Genetic.txt # 240637 Jun 27 2002 10090.WI_Mouse_YAC.txt # 390088 Jun 27 2002 10090.Whitehead-MRC_RH.txt # If these files have not been changing, then no need to worry about # them. We are just picking them up to see if they have changed # since the last time we worked on this. # these reports from jax.org appear to be changing daily wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt # compare them with previous versions. Before this these were # in /cluster/store5/mouseMarker/orig # these newly picked up files: sum -r 10090* # 48882 338 10090.MGD.txt # 24176 381 10090.Whitehead-MRC_RH.txt # 62367 170 10090.WI_Mouse_Genetic.txt # 50616 235 10090.WI_Mouse_YAC.txt sum -r *.rpt # 21267 4442 MRK_Dump2.rpt # 51274 3743 MRK_Sequence.rpt # 35293 2315 PRB_PrimerSeq.rpt sum -r UniSTS* # 40884 10502 UniSTS.aliases # 14407 2931 UniSTS_mouse.sts # the previous copies cd /cluster/store5/mouseMarker/orig sum -r 10090* # 48882 338 10090.MGD.txt # 24176 381 10090.Whitehead-MRC_RH.txt # 62367 170 10090.WI_Mouse_Genetic.txt # 50616 235 10090.WI_Mouse_YAC.txt sum -r *.rpt # 36880 4160 MRK_Dump2.rpt # 02447 3132 MRK_Sequence.rpt # 57914 2220 PRB_PrimerSeq.rpt sum -r UniSTS* # 36201 8843 UniSTS.aliases # 58524 970 UniSTS_mouse.alias # 42464 2291 UniSTS_mouse.sts # back to our work area, update the bed file # to do this we need a new UniSTS_mouse.alias file # it is created by a combination of information from several # of the above files ! AND ! the previous stsInfoMouse.bed file cp /cluster/data/mm4/bed/STSmarkers/downloads/*.sh . -p cp /cluster/data/mm4/bed/STSmarkers/downloads/*.pl . -p # This process has been captured in the script: # /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh # which uses a couple of perl scripts in that same directory. 
# briefly it is: # cd /cluster/data/mm5/bed/STSmarkers/downloads # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0 # grep MGI: UniSTS.aliases > MGI.aliases # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \ # stsInfoAliases.txt # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \ # | sort -n > UniSTS_mouse.alias /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh # with that, we can create a new stsInfoMouse.bed file: bash cd /cluster/data/mm5/bed/STSmarkers /cluster/store5/mouseMarker/code/updateBed.pl \ /cluster/store5/mouseMarker/stsInfoMouse.bed \ downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \ downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \ downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04 /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed # copy the stsInfoMouse.bed file from working dir to the marker info storage fold. # added 2 new steps by Yontao mv /cluster/store5/mouseMarker/stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed_mm3 cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed # comparing to Mm4, this file was used there: # /cluster/store6/mm4/bed/STSmarkers # a wc of it shows: # 56406 786036 6425721 stsInfoMouse.bed # Now we have: # 58488 790056 6602318 stsInfoMouse.bed # and from that, create new primer fa, epcr, etc: /cluster/store5/mouseMarker/code/luConvertPrimerToFa \ stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty wc mouse?.* # 0 0 0 mouseC.fa # 286740 286686 6474893 mouseP.fa # 32232 161234 2044810 mouseP.info # 318972 447920 8519703 total # the equivalent Mm4 versions: # 0 0 0 mouseC.fa # 258307 258245 5815248 mouseP.fa # 29906 149545 1890926 mouseP.info # copy the primers over to the bluearc for the kluster run cp -p mouseP.fa /cluster/bluearc/scratch/mus/mm5 cp -p mouseP.info /cluster/bluearc/scratch/mus/mm5 # CLUSTER RUN FOR THE STS PRIMERS ssh kk mkdir -p /cluster/data/mm5/bed/STSmarkers/primer mkdir -p /cluster/data/mm5/bed/STSmarkers/ePCR cd /cluster/data/mm5/bed/STSmarkers/primer # the mouseP.fa comes from above echo "/cluster/bluearc/scratch/mus/mm5/mouseP.fa" > primers.lst # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. cat << '_EOF_' > template #LOOP /cluster/bin/i386/blat.2 $(path1) $(path2) -ooc=/scratch/hg/h/mouse11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl} #ENDLOOP '_EOF_' mkdir primers.out /cluster/bin/scripts/splitContigList -mouse -scratch \ /cluster/bluearc/scratch/mus/mm5/maskedContigs 1 /cluster/bin/i386/gensub2 contig.lst primers.lst template jobList para create jobList para try para check para push ... etc ... 
# Completed: 639 of 639 jobs # CPU time in finished jobs: 334066s 5567.76m 92.80h 3.87d 0.011 y # IO & Wait Time: 72565s 1209.42m 20.16h 0.84d 0.002 y # Average job time: 636s 10.61m 0.18h 0.01d # Longest job: 800s 13.33m 0.22h 0.01d # Submission to last job: 1090s 18.17m 0.30h 0.01d # on the file server ssh kksilo cd /cluster/data/mm5/bed/STSmarkers/primer /cluster/bin/i386/pslSort dirs primers.psl temp primers.out rmdir temp # comparing results to Mm4: wc primers.psl # 5719969 120119288 590806241 primers.psl # Mm4 wc primers.psl /cluster/data/mm4/bed/STSmarkers/primer/primers.psl # 5745617 120657896 592135728 primers.psl # another kluster run ssh kk cd /cluster/data/mm5/bed/STSmarkers/ePCR ls -1S /cluster/bluearc/scratch/mus/mm5/maskedContigs > contig.lst # Edit this list to get full path names! mkdir epcr.out cat << '_EOF_' > template #LOOP /cluster/bin/scripts/luRunEpcr $(path1) $(path2) epcr.out/$(num2).epcr #ENDLOOP '_EOF_' # the mouseP.info was created above echo "/cluster/bluearc/scratch/mus/mm5/mouseP.info" > epcr.lst gensub2 epcr.lst contig.lst template jobList para create jobList para try para check para push ... etc ... # Completed: 639 of 639 jobs # CPU time in finished jobs: 146365s 2439.41m 40.66h 1.69d 0.005 y # IO & Wait Time: 67691s 1128.19m 18.80h 0.78d 0.002 y # Average job time: 335s 5.58m 0.09h 0.00d # Longest job: 427s 7.12m 0.12h 0.00d # Submission to last job: 485s 8.08m 0.13h 0.01d ssh hgwdev cd /cluster/data/mm5/bed/STSmarkers/ePCR # all those results become all.epcr cat epcr.out/*.epcr > all.epcr # comparing results to Mm4: wc *.epcr # 55677 222708 2945623 all.epcr wc /cluster/store6/mm4/bed/STSmarkers/ePCR/*.epcr # 74705 298820 3971712 /cluster/store6/mm4/bed/STSmarkers/ePCR/all.epcr cd /cluster/data/mm5/bed/STSmarkers/primer /cluster/bin/scripts/filterSTSPrimers \ -mouse ../stsInfoMouse.bed primers.psl \ ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat # The output should show an increasing count: # Reading name info # Reading primer info # Processing file # 100000 # 200000 # 300000 # ... 
# 5700000 # Determining ePCR not found # wc primers.psl.filter.blat # 33476 702996 3442402 primers.psl.filter.blat # Mm4: wc primers.psl.filter.blat # 32729 687309 3331894 primers.psl.filter.blat # create accession_info.rdb (chrM added to Terry's script for mouse) touch empty_sequence.inf /cluster/bin/scripts/compileAccInfo -mouse \ /cluster/data/mm5 empty_sequence.inf # works with two seemingly errors: # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory mv accession_info.rdb accession_info.rdb.tmp /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \ accession_info.rdb rm accession_info.rdb.tmp # comparing results to Mm4: # Mm5 wc accession_info.rdb # 131845 1450299 9681940 accession_info.rdb # Mm4 wc accession_info.rdb # 86935 956289 6374930 accession_info.rdb # # 219652 1885501 11875772 total # wc /cluster/data/mm5/?/*.agp /cluster/data/mm5/??/*.agp # 252515 2152346 13568720 total # creates epcr.not.found.nomatch and epcr.not.found.psl /cluster/bin/scripts/epcrToPsl -mouse \ epcr.not.found ../mouseP.info \ accession_info.rdb /cluster/data/mm5 # Comparing results to Mm4: # Mm5 wc epcr* # 463 1852 17080 epcr.not.found # 61 732 5845 epcr.not.found.nomatch # 402 8442 39011 epcr.not.found.psl # Mm4 wc epcr* # 328 1312 12011 epcr.not.found # 57 684 5474 epcr.not.found.nomatch # 266 5586 25711 epcr.not.found.psl # there is a single error being propagated here from the file # /cluster/store5/mouseMarker/stsInfoMouse.bed which has an error # at line 53958: 62943 D2J3 91947 D2J3 CAACCAGCTCAC CAACCAGCTCAC 1825, 1025BP 0 MUS MUSCULUS # The value '1825,' is incorrect. Should be a small integer here. # to work around this problem, I'm manually eliminating this problem # from the epcr.not.found.psl file where it has now become four bad # lines: # 24 0 0 0 1 1801 1 1789 + 27119 1825 0 1825chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, # 24 0 0 0 1 1801 1 1789 + 27119 1825 0 1825chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, 216a219,220 # 24 0 0 0 1 1801 1 1789 + 62943 1825, 0 1825,chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, # 24 0 0 0 1 1801 1 1789 + 62943 1825, 0 1825,chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, # taking those four lines out. 
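# (One non-interactive way to drop those four lines, sketch only -- the
# pattern keys on the bogus 1825/"1825," qSize field shown above; check
# the match count with grep -c before overwriting.)
# egrep -v '(27119|62943)[[:space:]]+1825' epcr.not.found.psl > tmp.psl
# mv tmp.psl epcr.not.found.psl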
cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter # lift those primers (added chrM to this lifter script for mouse) # creates primers.psl.filter.lifted /cluster/bin/scripts/lifter -mouse -psl \ /cluster/data/mm5 primers.psl.filter # wc primers.psl.filter.lifted # 33691 707511 3601164 primers.psl.filter.lifted # create primers.psl.filter.lifted.initial bash PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo primers.psl.filter.lifted # wc primers.psl.filter.lifted.initial # 33689 202134 1799016 primers.psl.filter.lifted.initial # create primers.psl.filter.lifted.initial.acc /cluster/bin/scripts/findAccession -agp \ -mouse primers.psl.filter.lifted.initial /cluster/data/mm5 # wc primers.psl.filter.lifted.initial.acc # 33689 235823 2158029 primers.psl.filter.lifted.initial.acc # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse /cluster/bin/scripts/getStsId -rat \ ../stsInfoMouse.bed primers.psl.filter.lifted.initial.acc \ > primers.initial.acc.trans # wc primers.initial.acc.trans # 33689 235823 1834889 primers.initial.acc.trans sort -k 4n primers.initial.acc.trans > primers.final rm primers.psl.filter.lifted.initial.acc primers.initial.acc.trans # comparing results to Mm4: # Mm5 wc primers.final # 33689 235823 1834889 primers.final # Mm4 wc primers.final # 32983 230881 1771293 primers.final cd /cluster/data/mm5/bed/STSmarkers # stsMarkers.final is empty for mouse touch stsMarkers.final dummy bash PATH=/cluster/bin/scripts:$PATH \ /cluster/bin/scripts/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb # Comparing results to Mm4 # Mm5 wc stsMarkers_pos.rdb # 32085 224595 1862816 stsMarkers_pos.rdb # Mm4 wc stsMarkers_pos.rdb # 31270 218890 1869417 stsMarkers_pos.rdb /projects/cc/hg/ytlu/bin/script/perl/createStsBed \ stsInfoMouse.bed stsMarkers_pos.rdb 500 > stsMapMouse.bed # wc stsMapMouse.bed # 29069 301535 2123622 stsMapMouse.bed # loading STS markers tables ssh hgwdev cd /cluster/data/mm5/bed/STSmarkers cp -p /cluster/store6/mm4/bed/STSmarkers/ucscAlias.pl . bash ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings # wc ucscStsAlias.tab # 126624 379859 3037850 ucscStsAlias.tab hgsql -e "drop table stsAlias;" mm5 hgsql mm5 < ~/kent/src/hg/lib/stsAlias.sql hgsql -e \ 'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm5 hgsql -e "drop table stsMapMouseNew;" mm5 hgsql mm5 < ~/kent/src/hg/lib/stsMapMouseNew.sql hgsql -e \ 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm5 hgsql -e "drop table stsInfoMouseNew;" mm5 hgsql mm5 < ~/kent/src/hg/lib/stsInfoMouseNew.sql hgsql -e \ 'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm5 hgLoadPsl -nobin -table=all_sts_primer mm5 primer/primers.psl.filter.lifted # load primer sequences mkdir /gbdb/mm5/stsMarker ln -s /cluster/data/mm5/bed/STSmarkers/mouseP.fa \ /gbdb/mm5/stsMarker/mouseP.fa # PLEASE NOTE THAT THE -replace option is used because this is a rebuild, # otherwise there will be a problem that the seq and extFile tables # will be out of sync. hgLoadSeq -replace mm5 /gbdb/mm5/stsMarker/mouseP.fa # Adding /gbdb/mm5/stsMarker/mouseP.fa # 32232 sequences # DONE - 2004-08-24 17:02 # QA repush 2006-02-08 seq table to remove old STS sequences with no extFile reference (Jen) Heather found problem found on rr. RR table matched dev and beta was correct, so no joinerCheck errors for the mismatch were flagged for review. 
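# (A quick consistency check for the seq/extFile sync issue noted above --
# sketch only, assuming the standard seq/extFile schema:)
# hgsql mm5 -e 'select count(*) from seq left join extFile \
#   on seq.extFile = extFile.id where extFile.id is null'
# should return 0 once the stale rows are gone.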
# BLASTZ RAT RN3 (RE-DONE - 2004-08-30 - Fan) # !!! PLEASE NOTE AS OF 9/2/04, THE 8/30/04-8/31/04 REBUILD OF BLASTZ, CHAIN, AND NET # FOR MM5-RN3 IS NO LONG USED FOR MM5. THE OLD MM5-RN3 CHAIN AND NET BUILD OF 7/14/04 # IS REVERSE PUSHED FROM RR BACK TO HGWDEV. # Reason for rebuild is to use more stringent blastz parameters to reduce size # of output files. # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=50000 # scoring matrix # BLASTZ_Q=/cluster/data/blastz/mus_rat.q # MAKE SURE TO INCLUDE THE RESCORE STEP TO CORRECT A BLASTZ PROBLEM. # (axtRescore -scoreScheme=/cluster/data/blastz/mus_rat.q ...) ssh kk mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-08-29 cd blastz.rn3.2004-08-29 cat << '_EOF_' > DEF # rat vs. mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=50000 BLASTZ_T=2 # scoring matrix BLASTZ_Q=/cluster/data/blastz/mus_rat.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm5/softNib # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInRat SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Rat SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.rn3.2004-08-29 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # prepare first cluster run ssh kk cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 bash source ./DEF # script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh # it is a generic script and works for any assembly cp -p /cluster/data/hg17/jkStuff/BlastZ_run0.sh \ /cluster/data/mm5/jkStuff/BlastZ_run0.sh /cluster/data/mm5/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # Completed: 41943 of 41943 jobs # CPU time in finished jobs: 4656727s 77612.11m 1293.54h 53.90d 0.148 y # IO & Wait Time: 460782s 7679.70m 128.00h 5.33d 0.015 y # Average job time: 122s 2.03m 0.03h 0.00d # Longest job: 2042s 34.03m 0.57h 0.02d # Submission to last job: 8307s 138.45m 2.31h 0.10d # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kki cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh # fixup machine check, should be kki, not kk cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh \ /cluster/data/mm5/jkStuff/BlastZ_run1.sh vi /cluster/data/mm5/jkStuff/BlastZ_run1.sh /cluster/data/mm5/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # Completed: 341 of 341 jobs # CPU time in finished jobs: 1293s 21.54m 0.36h 0.01d 0.000 y # IO & Wait Time: 2113s 35.22m 0.59h 0.02d 0.000 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest job: 54s 0.90m 0.01h 0.00d # Submission to last job: 719s 11.98m 0.20h 0.01d # NOTE: BlastZ_run2.sh is not used here. Instead Angie's approach # (using Rescore) is adopted here. # third run: lav -> axt # NOTE: use axtRescore here because we used a non-default BLASTZ_Q matrix # and abridged repeats (Penn State's restore_rpts program rescores with # default matrix, oops). 
ssh kki cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 # mv old subdirectories mv axtChrom axtChrom.old mv run.2 run.2.old mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t set path = (/cluster/bin/x86_64 $path) cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin \ /iscratch/i/mus/mm5/softNib /iscratch/i/rn3/bothMaskedNibs stdout \ | axtRescore -scoreScheme=/cluster/data/blastz/mus_rat.q stdin stdout \ | axtSort stdin ../../axtChrom/$chr.axt axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check # Completed: 43 of 43 jobs # CPU time in finished jobs: 498s 8.31m 0.14h 0.01d 0.000 y # IO & Wait Time: 3367s 56.11m 0.94h 0.04d 0.000 y # Average job time: 90s 1.50m 0.02h 0.00d # Longest job: 299s 4.98m 0.08h 0.00d # Submission to last job: 685s 11.42m 0.19h 0.01d # CHAIN RAT BLASTZ (RE-DONE 8/30/04 Fan) # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/mus_rat.q \ -minScore=5000 $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/rn3/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 43 of 43 jobs # CPU time in finished jobs: 3145s 52.42m 0.87h 0.04d 0.000 y # IO & Wait Time: 989s 16.48m 0.27h 0.01d 0.000 y # Average job time: 96s 1.60m 0.03h 0.00d # Longest job: 280s 4.67m 0.08h 0.00d # Submission to last job: 1219s 20.32m 0.34h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain mm5 ${c}_chainRn3 $i end featureBits mm5 chainRn3Link # 1677291680 bases of 2615483787 (64.129%) in intersection nice featureBits hg17 chainRn3Link # 982059013 bases of 2866216770 (34.263%) in intersection # NET RAT BLASTZ (RE-DONE 8/31/04 Fan) ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin hNoClass.net # memory usage 1710399488, utime 7360 s/100, stime 1891 # The above adapted from Angie's approach # The netClass operations requires an "ancientRepeat" table to exist # in either mm5 or rn3. 
So, create the table: ssh hgwdev mkdir -p /cluster/data/mm5/bed/ancientRepeat cd /cluster/data/mm5/bed/ancientRepeat # mysqldump needs write permission to this directory # and you need to use your read/write enabled user with password chmod 777 . hgsqldump --all --tab=. mm4 ancientRepeat chmod 775 . hgsql mm5 < ancientRepeat.sql mysqlimport -u -p mm5 ancientRepeat.txt # This is a hand curated table obtained from Arian. # The ancientRepeat table was loaded during the first build of NET RAT BLASTZ. ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain time netClass hNoClass.net mm5 rn3 rat.net \ -tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat \ -qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse # 491.210u 96.250s 12:27.37 78.6% 0+0k 0+0io 249pf+0w # If things look good do ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain rm -r hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn rat.net > ratSyn.net # 216.290u 34.220s 4:27.60 93.6% 0+0k 0+0io 119pf+0w # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain netFilter -minGap=10 rat.net | hgLoadNet mm5 netRn3 stdin netFilter -minGap=10 ratSyn.net | hgLoadNet mm5 syntenyNetRn3 stdin # check results # featureBits mm4 netRn3 # 96806381 bases of 95076222 (101.820%) in intersection # featureBits mm5 netRn3 # 2601384082 bases of 2615483787 (99.461%) in intersection # featureBits mm4 syntenyNetRn3 # 96760405 bases of 95076222 (101.771%) in intersection # featureBits mm5 syntenyNetRn3 # 2575035774 bases of 2615483787 (98.454%) in intersection # Add entries for net and chain to mouse/mm5 trackDb # make net ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain mkdir ratNet time netSplit rat.net ratNet # 218.990u 29.290s 4:27.86 92.6% 0+0k 0+0io 190pf+0w # extract axts from net mkdir ../axtNet foreach n (ratNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt ratNet/$c.net chain/$c.chain \ /cluster/data/mm5/nib \ /cluster/data/rn3/nib ../axtNet/$c.axt echo "Complete: $c.net -> axtNet/$c.axt" end # sort axt's and convert to maf format mkdir ../mafNet foreach f (../axtNet/chr*.axt) set c=$f:t:r echo $c.axt mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt rm ../axtNet/$c.unsorted.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \ ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3. end ssh hgwdev mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtBest cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtBest ln -s ../axtNet/chr*.axt . 
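# (Note: axtBest here is just symlinks to the net axt's -- the net already
# keeps only the best-in-genome chain for each region, so no separate
# axtBest run is needed for the downstream steps.)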
# copy net axt's to download area ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet nice gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # Convert those axt files to psl ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestRn3.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestRn3.psl echo "Done: ${c}_blastzBestRn3.psl" end # Load tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/pslBest bash for I in chr*BestRn3.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done ${I}" done # check results # featureBits mm5 blastzBestRn3 # 1674716868 bases of 2615483787 (64.031%) in intersection # featureBits mm4 blastzBestRn3 # 1780774716 bases of 2627444668 (67.776%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm5/axtBest/Rn3 cd /gbdb/mm5/axtBest/Rn3 rm * ln -s /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet/chr*.axt . ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/mm5/axtBest/Rn3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end # these axtInfo file entries should be appended to the table, # not replacing it. The previous hg17 entries are needed -- bob kuhn hgsql mm5 -e 'drop table mm5.axtInfo;' hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql hgsql mm5 < axtInfoInserts.sql cd /cluster/data/mm5/bed rm blastz.rn3 ln -s blastz.rn3.2004-08-29 blastz.rn3 # BLASTZ RN3 CLEAN UP (RE-DONE - 2004-08-31 - Fan) ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 nice rm -rf raw nice rm axtChain/run1/chain/* # do the following later, after rn3-mm5 net and chain done. nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} & # The above line done on 9/7/04. Fan. : # CREATE CYTOBAND TRACK (DONE - 2004-09-7 - Fan) # Should be done after NCBI updated their MapViewer to the latest release. ssh hgwdev cd /cluster/data/mm5 mkdir cytoBand cd cytoBand # Get file from NCBI wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.33/ideogram.gz gunzip ideogram # Create bed file /cluster/bin/scripts/createNcbiCytoBand ideogram # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm5 cytoBand cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. # Make the cytoBand track (above) and then: echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql mm5 # REBUILD CYTOBAND TRACK (DONE - 2004-09-15 - Fan) # NCBI updated the ideogram.gz file and also changed its format, # added a new density field after stein. 
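# (After re-downloading ideogram.gz below, a quick look at the file is
# enough to confirm the extra density column before re-running
# createNcbiCytoBand, e.g.:)
# zcat ideogram.gz | head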
ssh hgwdev cd /cluster/data/mm5 mv cytoBand cytoBand.old mkdir cytoBand cd cytoBand # Get file from NCBI wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.33/ideogram.gz gunzip ideogram # Create bed file /cluster/bin/scripts/createNcbiCytoBand ideogram # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm5 cytoBand cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. # First, drop the cytoBandIdeo table in mm5. # Make the cytoBand track (above) and then: echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"|hgsql mm5 # ADD MAP CONTIGS TRACK (DONE - 2004-09-07 - Fan) ssh hgwdev mkdir -p /cluster/data/mm5/bed/ctgPos cd /cluster/data/mm5/bed/ctgPos # hgCtgPos uses the lift files... but mouse lift files are for the # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs # from the assembly. (In the future, we should go with the NT's!) # So... just for this release, go straight from the seq_contig.md # to the table def'n: contig, size, chrom, chromStart, chromEnd cat << '_EOF_' > parseSeqContig.pl #!/usr/local/bin/perl -w use strict; while (<>) { if (/^\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(N[TC]_\d+)\s+(\S+)\s+contig\s+\S+\s+\S+\s*$/i) { my $chr=$1; my $start=$2; $start -= 1; my $end=$3; my $ctg=$5; if ($chr !~ /N/ ) { print "$ctg\t" . ($end-$start) . "\tchr$chr\t$start\t$end\n"; } } } '_EOF_' chmod +x parseSeqContig.pl ./parseSeqContig.pl ../../ncbi/seq_contig.md > ctgPos.tab hgsql mm5 < ~/kent/src/hg/lib/ctgPos.sql echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql mm5 # Note: the info is there in seq_contig.md to also do the _random's, # but we'd have to do some more work: duplicate the gaps of 50000 between # contigs for all _random's except chrUn_random (1000 between). # featureBits mm5 ctgPos # 2557516950 bases of 2615483787 (97.784%) in intersection # featureBits mm4 ctgPos # 2554101163 bases of 2627444668 (97.209%) in intersection # featureBits mm3 ctgPos # 2500661074 bases of 2505900260 (99.791%) in intersection # RELOAD MAP CONTIGS TRACK (DONE - 2005-Mar-03 - Heather) # /cluster/data/mm5/ncbi/seq_contig.md contains more than just C57BL/6J. # Filter those out. ssh hgwdev cd /cluster/data/mm5/bed/ctgPos cp /cluster/data/mm5/ncbi/seq_contig.md . grep C57BL seq_contig.md > contig.C57BL # contig.C57BL has 41061 lines (252 lines fewer than seq_contig.md) ./parseSeqContig.pl contig.C57BL > ctgPosFiltered.tab # ctgPosFiltered.tab has 302 rows (227 fewer than ctgPos.tab) echo "delete from ctgPos" | hgsql mm5 echo "load data local infile 'ctgPosFiltered.tab' into table ctgPos" | hgsql mm5 # echo "update ctgPos set chrom = "chrM" where chrom = "chrMT" | hgsql mm5 # featureBits mm5 ctgPos # 2557064874 bases of 2615483787 (97.766%) in intersection # FUGU BLAT ALIGNMENTS (DONE 2004-09-08 Fan) ssh kk mkdir /cluster/data/mm5/bed/blatFr1 cd /cluster/data/mm5/bed/blatFr1 ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst ls -1S /scratch/mus/mm5/softNib/*.nib > mouse.lst cat << '_EOF_' > gsub #LOOP blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy mkdir psl gensub2 mouse.lst fugu.lst gsub spec para create spec para try, check, push, check, ... 
Completed: 24854 of 24854 jobs CPU time in finished jobs: 8215774s 136929.56m 2282.16h 95.09d 0.261 y IO & Wait Time: 1415723s 23595.39m 393.26h 16.39d 0.045 y Average job time: 388s 6.46m 0.11h 0.00d Longest job: 46761s 779.35m 12.99h 0.54d Submission to last job: 46761s 779.35m 12.99h 0.54d # Sort alignments: ssh kksilo cd /cluster/data/mm5/bed/blatFr1 pslCat -dir psl | pslSortAcc nohead chrom temp stdin # Processed 1116383 lines into 5 temp files # lift query side to Fugu browser chrUn coordinates liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl # load into database: ssh hgwdev cd /cluster/data/mm5/bed/blatFr1 hgLoadPsl -fastLoad -table=blatFr1 mm5 all.psl # Processing all.psl # load of blatFr1 did not go as planned: 1116383 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab # a record is already in trackDb as type xeno psl fr1, with colorChromDefault off # BLASTZ TETRAODON (tetNig1) (DONE, 2004-09-08, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats # Treat all repeats as lineage-specific. mkdir -p /iscratch/i/mm5/linSpecRep.notInTetraodon foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mm5/linSpecRep.notInTetraodon/$f:t:r:r.out.spec end mkdir -p /iscratch/i/tetNig1/linSpecRep.notInMouse foreach f (/iscratch/i/tetNig1/rmsk/chr*.fa.out) cp -p $f /iscratch/i/tetNig1/linSpecRep.notInMouse/$f:t:r:r.out.spec end iSync ssh kksilo # more space on store8 than store6 mkdir -p /cluster/store8/mm5/blastz.tetNig1.2004-09-02 ln -s /cluster/store8/mm5/blastz.tetNig1.2004-09-02 \ /cluster/data/mm5/bed ln -s /cluster/data/mm5/bed/blastz.tetNig1.2004-09-02 \ /cluster/data/mm5/bed/blastz.tetNig1 ssh kk cd /cluster/data/mm5/bed/blastz.tetNig1 # use same parameters as for danRer1-mm5 cat << '_EOF_' > DEF # mouse (mm5) vs Tetraodon (tetNig1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1 and danRer1-hg17. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm5) SEQ1_DIR=/iscratch/i/mus/mm5/test SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInTetraodon SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tetraodon (tetNig1) SEQ2_DIR=/iscratch/i/tetNig1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/tetNig1/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.tetNig1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.mm5-tetNig1.2004-09-02 # setup cluster run # copy shell scripts for blastz runs if not there already cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/ # edit BlastZ_run0.sh # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/ # this is the directory for the latest version of blastz-run # source the DEF file bash . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... 
# para time # Completed: 19437 of 19437 jobs # CPU time in finished jobs: 4681483s 78024.71m 1300.41h 54.18d 0.148 y # IO & Wait Time: 176260s 2937.67m 48.96h 2.04d 0.006 y # Average job time: 250s 4.17m 0.07h 0.00d # Longest job: 790s 13.17m 0.22h 0.01d # Submission to last job: 5475s 91.25m 1.52h 0.06d # second cluster run to convert the .out's to .lav's ssh kki cd /cluster/data/mm5/bed/blastz.tetNig1 bash # if a csh/tcsh user . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 341 of 341 jobs # CPU time in finished jobs: 262s 4.37m 0.07h 0.00d 0.000 y # IO & Wait Time: 981s 16.35m 0.27h 0.01d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest job: 9s 0.15m 0.00h 0.00d # Submission to last job: 108s 1.80m 0.03h 0.00d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/mm5/bed/blastz.tetNig1 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /iscratch/i/mus/mm5/softNib \ /iscratch/i/tetNig1/nib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.tetNig1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 41s 0.68m 0.01h 0.00d 0.000 y # IO & Wait Time: 414s 6.90m 0.12h 0.00d 0.000 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest job: 28s 0.47m 0.01h 0.00d # Submission to last job: 396s 6.60m 0.11h 0.00d # translate sorted axt files into psl ssh kolossus cd /cluster/data/mm5/bed/blastz.tetNig1 mkdir -p pslChrom set tbl = "blastzTetNig1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.tetNig1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl mm5 $f end # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1 -enrichment # refGene:cds 0.765%, blastzTetNig1 1.709%, both 0.519%, cover 67.80%, # enrich 39.67x # default with H=2000 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2000 -enrichment # refGene:cds 0.765%, blastzTetNig1H2000 1.239%, both 0.502%, cover 65.59%, # enrich 52.92x # blastzDanRer1 with L=8000 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1L8k -enrichment # refGene:cds 0.765%, blastzTetNig1L8k 1.333%, both 0.444%, cover 58.05%, # enrich 43.56x # too much drop in coverage # H=2000, L=4000 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2kL4k -enrichment # refGene:cds 0.765%, blastzTetNig1H2kL4k 1.166%, both 0.489%, cover 63.91%, # enrich 54.81x # H=2000, L=6000 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2kL6k -enrichment # refGene:cds 0.765%, blastzTetNig1H2kL6k 1.014%, both 0.437%, cover 57.15%, # enrich 56.36x # too much drop in coverage # number of rows in table # blastzTetNig1 38196 # blatzTetNig1H2000 38314 # blastzTetNig1L8k 24749 # blastzTetNig1H2kL4k 31433 # blastzTetNig1H2kL6k 21389 # use blastzTetNig1 as this has the best coverage. enrich is quite high too. 
# featureBits -chrom=chr1 hg17 refGene:cds blastzFr1 -enrichment # refGene:cds 1.246%, blastzFr1 2.319%, both 0.833%, cover 66.87%, enrich 28.83x # similar coverage to blastzFr1 for hg17 # RESCORE TETNIG1 BLASTZ (DONE, 2004-09-08, hartera) # Low scores can occur with repeats abridged and using the # HoxD55.q matrix. PSU's restore_rpts program rescored alignments # with the default matrix instead of the BLASTZ_Q matrix. # Rescore them here so the chainer sees the higher scores: ssh kolossus cd /cluster/data/mm5/bed/blastz.tetNig1 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.orig mv axtChrom.rescore axtChrom # CHAIN TETRAODON (TETNIG1) BLASTZ (DONE, 2004-09-08, hartera) # Re do chains with rescored blastz Hg17 # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.tetNig1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.tetNig1/axtChrom/*.axt \ > input.lst # Reuse gap penalties from hg16 vs chicken run. cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' cat << '_EOF_' > doChain #!/bin/csh axtChain -linearGap=../../chickenHumanTuned.gap $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/tetNig1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 524s 8.74m 0.15h 0.01d 0.000 y # IO & Wait Time: 140s 2.33m 0.04h 0.00d 0.000 y # Average job time: 15s 0.26m 0.00h 0.00d # Longest job: 25s 0.42m 0.01h 0.00d # Submission to last job: 632s 10.53m 0.18h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's,try also with larger bin size. 
foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # not a large amount of changes with score < 5000 # load chr1 into database to check ssh hgwdev cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chain hgLoadChain mm5 chr1_chainTetNig1 chr1.chain # featureBits -chrom=chr1 mm5 refGene:cds chainTetNig1Link -enrichment # refGene:cds 0.765%, chainTetNig1Link 1.563%, both 0.512%, cover 66.84%, # enrich 42.76x # try filtering with minScore=5000 ssh kksilo cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain chainSplit chainFilt5k all.chain ssh hgwdev cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chainFilt5k hgLoadChain mm5 chr1_chainTetNig1Filt5k chr1.chain # featureBits -chrom=chr1 mm5 refGene:cds chainTetNig1Filt5kLink -enrichment # refGene:cds 0.765%, chainTetNig1Filt5kLink 1.398%, both 0.504%, cover 65.91%, # enrich 47.13x # chr1_chainTetNig1 21782 # chr1_chainTetNig1Filt5k 9670 # loses very little in coverage so use filtering with minScore=5000 # remove chain rm -r chain mv chainFilt5k chain rm all.chain.unfiltered ssh hgwdev # remove test tables hgsql -e "drop table chr1_chainTetNig1Filt5k;" mm5 hgsql -e "drop table chr1_chainTetNig1Filt5kLink;" mm5 # load chains into database cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainTetNig1 $i echo done $c end # NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-09-08, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
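# (Optional check, a minimal sketch assuming the layout above: confirm that
#  chainNet wrote a net for every chain before they are combined with
#  netSyntenic below.)
bash
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
for f in chain/*.chain; do
    c=`basename $f .chain`
    [ -s n1/$c.net ] || echo "no net produced for $c"
done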
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 69083136, utime 402 s/100, stime 37 # Add classification info using db tables: cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInTetraodon mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInTetraodon/* \ /cluster/bluearc/mm5/linSpecRep.notInTetraodon cp /iscratch/i/tetNig1/linSpecRep.notInMouse/* \ /cluster/bluearc/tetNig1/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain # there is no ancient repeats table for rodent vs fish so use -noAr flag time netClass noClass.net mm5 tetNig1 tetNig1.net \ -tNewR=/cluster/bluearc/mm5/linSpecRep.notInTetraodon \ -qNewR=/cluster/bluearc/tetNig1/linSpecRep.notInMouse -noAr # 59.490u 37.630s 2:41.82 60.0% 0+0k 0+0io 216pf+0w netFilter -minGap=10 tetNig1.net | hgLoadNet mm5 netTetNig1 stdin # featureBits mm5 refGene:cds netTetNig1 -enrichment # refGene:cds 0.921%, netTetNig1 23.633%, both 0.725%, cover 78.70%, # enrich 3.33x # MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2004-09-10, hartera) ssh kksilo # zip chains and nets cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain cp all.chain tetNig1.chain zip -j /cluster/data/mm5/zip/tetNig1.chain.zip tetNig1.chain rm tetNig1.chain zip -j /cluster/data/mm5/zip/tetNig1.net.zip tetNig1.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/mm5 mkdir -p $gp/vsTetNig1 cd $gp/vsTetNig1 mv /cluster/data/mm5/zip/tetNig1*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/mm5/bed/blastz.tetNig1/axtChrom mkdir -p $gp/vsTetNig1/axtChrom cp -p *.axt $gp/vsTetNig1/axtChrom cd $gp/vsTetNig1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # MAKE VSDANRER1 DOWNLOADABLES (DONE, 2004-09-10, hartera) ssh kksilo # zip chains and nets cd /cluster/data/mm5/bed/blastz.danRer1/axtChain gunzip all.chain.gz cp all.chain danRer1.chain zip -j /cluster/data/mm5/zip/danRer1.chain.zip danRer1.chain rm danRer1.chain gunzip danRer1.net.gz zip -j /cluster/data/mm5/zip/danRer1.net.zip danRer1.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/mm5 mkdir -p $gp/vsDanRer1 cd $gp/vsDanRer1 mv /cluster/data/mm5/zip/danRer1*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/mm5/bed/blastz.danRer1/axtChrom mkdir -p $gp/vsDanRer1/axtChrom cp -p *.axt $gp/vsDanRer1/axtChrom cd $gp/vsDanRer1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # add the axtNet *.axt in blastz.danRer1/axtNet cd /cluster/data/mm5/bed/blastz.danRer1/axtNet set gp = /usr/local/apache/htdocs/goldenPath/mm5 mkdir -p $gp/vsDanRer1/axtNet nice cp -p *.axt $gp/vsDanRer1/axtNet cd $gp/vsDanRer1/axtNet nice gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # BLASTZ TETNIG1 CLEAN UP (DONE, 2004-09-10, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.tetNig1 nice rm -rf raw & nice rm -rf lav & nice rm -rf axtChrom.orig & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} & # SGP GENES (REDONE 5/24/05 angie) # Originally loaded 9/17/04; user noticed chrX was missing; IMIM folks # regenerated & we reloaded. 
ssh kksilo mkdir /cluster/data/mm5/bed/sgp cd /cluster/data/mm5/bed/sgp foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/M.musculus/mmMay2004/SGP/humangp200405/$chr.gtf wget http://genome.imim.es/genepredictions/M.musculus/mmMay2004/SGP/humangp200405/$chr.prot end # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf cp /dev/null sgpPep.fa foreach f (chr*.prot) nice perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa end ssh hgwdev cd /cluster/data/mm5/bed/sgp ldHgGene -gtf -genePredExt mm5 sgpGene chr*.gtf hgPepPred mm5 generic sgpPep sgpPep.fa # SGP GENES (UPDATE 1/18/2006) sgpPep table dropped, replaced by hgc generated protein seq in browser # MAKE mm5-hg17 OVER.CHAIN FOR LIFTOVER (DONE 2004-09-24 braney) ssh kolossus mkdir -p /cluster/data/mm5/bed/bedOver/mm5Tohg17 cd /cluster/data/mm5/bed/bedOver/mm5Tohg17 set chainDir = /cluster/data/mm5/bed/blastz.hg17/axtChain netSplit $chainDir/human.net net mkdir subset foreach f ($chainDir/chain/*.chain) echo subsetting $f:t:r netChainSubset net/$f:t:r.net $f subset/$f:t end cat subset/*.chain > /cluster/data/mm5/bed/bedOver/mm5Tohg17.chain hgAddLiftOverChain -multiple mm5 hg17 # miRNA track (DONE - 2004-09-30 - Fan) # data from: Sam Griffiths-Jones # and Michel.Weber@ibcg.biotoul.fr # notify them when done. cd /cluster/data/mm5/bed mkdir miRNA cd miRNA wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/mmu.bed grep -v "tion" mmu.bed | sed -e "s/ /\t/g" > mm5.bed # check previous release track before update nice featureBits mm4 miRNA # 17782 bases of 2627444668 (0.001%) in intersection hgLoadBed mm5 miRNA mm5.bed # entry in trackDb/trackDb.ra already there # and verify similar numbers after: nice featureBits mm5 miRNA # 17957 bases of 2615483787 (0.001%) in intersection # BLASTZSELF Done (Tue Oct 19 18:06:45 PDT 2004) sugnet # blastzSelf run for mm5. This took about a week due to # being busy with other things and some crashed jobs in a # few places. Think all of the instructions ended up here. # based off of Hiram's instructions for blastzSelf in hg16 & hg17 mkdir -p /cluster/store6/mm5/bed/blastzSelf cd /cluster/store6/mm5/bed/blastzSelf # Create the definitions file. cat << '_EOF_' > DEF # mouse vs. mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm5/softNib # RMSK not currently used SEQ1_RMSK=/scratch/mus/mm5/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInMouse SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/scratch/mus/mm5/softNib # RMSK not currently used SEQ2_RMSK=/scratch/mus/mm5/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastzSelf DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy ssh kk cd /cluster/store6/mm5/bed/blastzSelf /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para try, push, check # on mini-cluster, otherwise I/O gets very demanding.... 
ssh kki
cd /cluster/store6/mm5/bed/blastzSelf
mkdir -p run.1
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > run.1/jobList
cd run.1
wc -l jobList
# 341 jobList
head jobList
para create jobList
para try

# Third cluster run to convert lav's to axt's
mkdir run.2
cd run.2
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/data/mm5/bed/blastzSelf/lav/$(root1) {check out line+ /cluster/data/mm5/bed/blastzSelf/axtChrom/$(root1).axt} /scratch/mus/mm5/softNib /scratch/mus/mm5/softNib
#ENDLOOP
'_EOF_'
ls -1S /cluster/data/mm5/bed/blastzSelf/lav > chrom.list
gensub2 chrom.list single gsub jobList
para create jobList
para push
# This seems to beat up on the file server a little, load up to 56 on kksilo
# Number of jobs died, unsure why. Try them on kksilo:
ssh kksilo
cat << '_EOF_' > doStragglers.csh
#!/bin/tcsh
cd /cluster/store6/mm5/bed/blastzSelf
set base=/cluster/data/mm5/bed/blastzSelf
set seq1_dir=/cluster/data/mm5/nib
set seq2_dir=/cluster/data/mm5/nib
foreach c (lav/chr17 lav/chr2 lav/chr3 lav/chr7 lav/chrUn_random lav/chrX lav/chrY)
    echo "Doing $c"
    pushd $c
    set chr=$c:t
    set out=axtChrom/$chr.axt
    echo "Translating $chr lav to $out"
    foreach d (*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
          | axtDropSelf stdin stdout \
          | axtSort stdin $smallout
    end
    cat `ls -1 *.lav.axt | sort -g` > $base/$out
    popd
end
'_EOF_'
# Need to drop overlaps to eliminate diagonals
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo "doing $c"
    /cluster/bin/i386/axtDropOverlap axtChrom/$c.axt chromSizes.tab chromSizes.tab \
        /cluster/store6/mm5/bed/blastzSelf/axtChromDropped/$c.axt
    echo "Done: $c"
end
cd axtChromDropped
gzip *.axt
# Translate to psls
cd /cluster/data/mm5/bed/blastzSelf
mkdir pslChrom
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo "Processing chr $c"
    zcat /cluster/data/mm5/bed/blastzSelf/axtChromDropped/${c}.axt.gz | \
        /cluster/bin/i386/axtToPsl stdin S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load files into the database
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 *_blastzSelf.psl
# end BLASTZSELF

# CREATE kgSpAlias TABLE FOR PB (Done 10/20/04)
hgsql mm5 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql mm5 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >mm5.kgSpAlias.tab
rm j.tmp
hgsql mm5 -e 'drop table kgSpAlias';
hgsql mm5 < ~/src/hg/lib/kgSpAlias.sql
hgsql mm5 -e 'load data local infile "mm5.kgSpAlias.tab" into table kgSpAlias'

# ECGENE TRACK (DONE, 2004-10-29, hartera)
ssh kksilo
mkdir -p /cluster/data/mm5/bed/ECgene.2004-10-29
ln -s /cluster/data/mm5/bed/ECgene.2004-10-29 \
    /cluster/data/mm5/bed/ECgene
cd /cluster/data/mm5/bed/ECgene
wget \
    "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_mm5_low_gene.txt.gz"
wget \
    "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_mm5_low_pep.txt.gz"
gunzip *.gz
# load database
ssh hgwdev
cd /cluster/data/mm5/bed/ECgene
ldHgGene -predTab mm5 ECgene v1.2_mm5_low_gene.txt
# 343337 gene predictions
hgPepPred mm5 tab ECgenePep v1.2_mm5_low_pep.txt
rm *.tab
nice gzip *.txt

## NIA Mouse Gene Index - (DONE - 2004-11-16 Fan)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
# pick up data
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/NIAGene
cd /cluster/data/mm5/bed/NIAGene
wget --timestamp http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex4-Transcript-to-Genome.psl
wget --timestamping \
http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex4-Transcripts.fasta hgLoadPsl mm5 -table=NIAGene NIA-Mouse-GeneIndex4-Transcript-to-Genome.psl mkdir /gbdb/mm5/NIAGene ln -s /cluster/data/mm5/bed/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta \ /gbdb/mm5/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta hgLoadSeq mm5 /gbdb/mm5/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta Added and edited NIAGene.html and trackDb.ra under kent/src/hg/makeDb/trackDb/mouse/mm5 # CREATE jaxQTL3 (MOUSE QTL) TRACK (DONE - 2004-11-18 Fan) cd /cluster/data/mm5/bed mkdir qtl.2004-11-08 ln -s qtl.2004-11-08 qtl cd qtl # Get the raw data file, mouse_qtl_100804.txt, sent by Carol Bult [cjb@informatics.jax.org]. hgsql mm5 -e 'drop table jaxQtlRaw' hgsql mm5 < ~/src/hg/lib/jaxQtlRaw.sql hgsql mm5 -e 'load data local infile "mouse_qtl_100804.txt" into table jaxQtlRaw ignore 1 lines' # Make sure hgJaxQtl binary executable exist. hgJaxQtl is under ~/src/hg/hgJaxQtl hgJaxQtl mm5 wc jaxQTL3.tab # 981 15310 105164 jaxQTL3.tab hgLoadBed -nobin -tab -sqlTable=$HOME/src/hg/lib/jaxQTL3.sql mm5 jaxQTL3 jaxQTL3.tab # TWINSCAN (DONE 11/29/04 angie) ssh kksilo mkdir /cluster/data/mm5/bed/twinscan cd /cluster/data/mm5/bed/twinscan foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genes.cs.wustl.edu/predictions/mouse/mm5_11-24-04/chr_gtf/$chr.gtf wget http://genes.cs.wustl.edu/predictions/mouse/mm5_11-24-04/chr_ptx/$chr.ptx end # Add '.a' to end of protein fasta id's, to match gtf transcript_id's: perl -wpe 's/^(>\S+).*/$1.a/' *.ptx > twinscanPep.fa # load. ssh hgwdev cd /cluster/data/mm5/bed/twinscan ldHgGene -gtf -genePredExt mm5 twinscan chr*.gtf hgPepPred mm5 generic twinscanPep twinscanPep.fa featureBits -enrichment mm5 refGene twinscan #refGene 1.551%, twinscan 1.245%, both 0.783%, cover 50.46%, enrich 40.52x # Create mm5GeneList.html (to be used by Google). # This step was done 12/08/04. cd /cluster/data/mm5/bed mkdir geneList cd geneList wget -O mm5GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=mm5" cp -p mm5GeneList.html /usr/local/apache/htdocs/goldenPath # Check this html file into CVS. # BLASTZ ZEBRAFISH (danRer2) (DONE, 2004-12-12, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats # Treat all repeats as lineage-specific. 
# this directory of mouse repeats exists already mkdir -p /iscratch/i/mm5/linSpecRep.notInZebrafish foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mm5/linSpecRep.notInZebrafish/$f:t:r:r.out.spec end mkdir -p /iscratch/i/danRer2/linSpecRep.notInMouse foreach f (/iscratch/i/danRer2/rmsk/chr*.fa.out) cp -p $f /iscratch/i/danRer2/linSpecRep.notInMouse/$f:t:r:r.out.spec end iSync ssh kk mkdir -p /cluster/data/mm5/bed/blastz.danRer2.2004-12-10 ln -s /cluster/data/mm5/bed/blastz.danRer2.2004-12-10 \ /cluster/data/mm5/bed/blastz.danRer2 cd /cluster/data/mm5/bed/blastz.danRer2 # use same parameters as for danRer[1|2]-hg17 and for hg16-fr1 and mm5-danRer1 # and similar to those originally used for hg17-galGal2 cat << '_EOF_' > DEF # mouse (mm5) vs zebrafish (danRer2) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer1 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm5) SEQ1_DIR=/cluster/bluearc/scratch/mus/mm5/softNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer2) SEQ2_DIR=/iscratch/i/danRer2/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/danRer2/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.danRer2 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.mm5-danRer2.2004-12-10 # setup cluster run # copy shell scripts for blastz runs if not there already cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/ # edit BlastZ_run0.sh # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/ # this is the directory for the latest version of blastz-run # source the DEF file bash . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 58993 of 58993 jobs # CPU time in finished jobs: 17513361s 291889.35m 4864.82h 202.70d 0.555 y # IO & Wait Time: 1506128s 25102.13m 418.37h 17.43d 0.048 y # Average job time: 322s 5.37m 0.09h 0.00d # Longest job: 2552s 42.53m 0.71h 0.03d # Submission to last job: 50001s 833.35m 13.89h 0.58d # output is 864M # second cluster run to convert the .out's to .lav's ssh kki cd /cluster/data/mm5/bed/blastz.danRer2 bash # if a csh/tcsh user . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# para time # Checking finished jobs # Completed: 341 of 341 jobs # CPU time in finished jobs: 689s 11.48m 0.19h 0.01d 0.000 y # IO & Wait Time: 1305s 21.76m 0.36h 0.02d 0.000 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest job: 14s 0.23m 0.00h 0.00d # Submission to last job: 250s 4.17m 0.07h 0.00d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/mm5/bed/blastz.danRer2 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/bluearc/scratch/mus/mm5/softNib \ /iscratch/i/danRer2/nib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.danRer2/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 82s 1.37m 0.02h 0.00d 0.000 y # IO & Wait Time: 1429s 23.82m 0.40h 0.02d 0.000 y # Average job time: 35s 0.59m 0.01h 0.00d # Longest job: 91s 1.52m 0.03h 0.00d # Submission to last job: 1421s 23.68m 0.39h 0.02d # translate sorted axt files into psl ssh kolossus cd /cluster/data/mm5/bed/blastz.danRer2 mkdir -p pslChrom set tbl = "blastzDanRer2" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer2/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl mm5 $f end # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1 -enrichment #refGene:cds 0.763%,blastzDanRer1 2.918%,both 0.512%,cover 67.12%,enrich 23.00x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer2 -enrichment # refGene:cds 0.780%, blastzDanRer2 2.816%, both 0.529%, cover 67.89%, # enrich 24.11x # RESCORE DANRER2 BLASTZ ALIGNMENTS (DONE, 2004-12-12, hartera) # Low scores can occur with repeats abridged and using the # HoxD55.q matrix. PSU's restore_rpts program rescored alignments # with the default matrix instead of the BLASTZ_Q matrix. 
# Rescore them here so the chainer sees the higher scores: ssh kolossus cd /cluster/data/mm5/bed/blastz.danRer2 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.orig mv axtChrom.rescore axtChrom # psl files and blastz tables will be the same regardless of score so # no need to reload # CHAIN ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-13, hartera) # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE PRIMARILY THE RESULTS OF # REPEATS AND DEGENERATE DNA (DONE, 2004-12-22, hartera) # Make chains with rescored blastz danRer2 # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.danRer2 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.danRer2/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties, # in hopes of getting longer chains: cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap $1 \ /cluster/bluearc/scratch/mus/mm5/softNib \ /iscratch/i/danRer1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 1797s 29.95m 0.50h 0.02d 0.000 y # IO & Wait Time: 575s 9.59m 0.16h 0.01d 0.000 y # Average job time: 55s 0.92m 0.02h 0.00d # Longest job: 133s 2.22m 0.04h 0.00d # Submission to last job: 514s 8.57m 0.14h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer2/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's,try also with smaller bin size. foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # filter on minScore = 5000 mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain.filt5k # remove old chains rm -r chain chainSplit chain all.chain.filt5k # remove repeats from chains and reload into database # (2004-12-22, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer2/axtChain mv chain chainRaw mkdir chain cd chainRaw foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /cluster/bluearc/scratch/mus/mm5/softNib \ /cluster/bluearc/danRer2/nib $f \ ../chain/$c.chain end cd .. 
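# (Optional, a minimal sketch using the chainRaw/ and chain/ directories above:
#  record how many chains chainAntiRepeat dropped per chromosome before merging.)
bash
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
for f in chainRaw/*.chain; do
    c=`basename $f`
    echo $c `grep -c "^chain" chainRaw/$c` `grep -c "^chain" chain/$c`
done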
chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # load filtered chains with chains removed that are mostly due to repeats ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer2/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainDanRer2 $i echo done $c end # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2 -enrichment # refGene:cds 0.780%, chainDanRer2 22.478%, both 0.604%, cover 77.48%, # enrich 3.45x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment # refGene:cds 0.780%, chainDanRer2Link 2.164%, both 0.526%, cover 67.43%, # enrich 31.17x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1 -enrichment # refGene:cds 0.780%, chainDanRer1 20.053%, both 0.593%, cover 75.99%, # enrich 3.79x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Link -enrichment # refGene:cds 0.780%, chainDanRer1Link 2.022%, both 0.512%, cover 65.64%, # enrich 32.47x # after chainAntiRepeat: # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment # refGene:cds 0.785%, chainDanRer2Link 2.058%, both 0.530%, cover 67.53%, # enrich 32.81x # NET ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-13, hartera) # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-22,hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer2/axtChain rm -r preNet mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 105357312, utime 632 s/100, stime 117 # Add classification info using db tables: cd /cluster/data/mm5/bed/blastz.danRer2/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \ /cluster/bluearc/mm5/linSpecRep.notInZebrafish cp /iscratch/i/danRer2/linSpecRep.notInMouse/* \ /cluster/bluearc/danRer2/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer2/axtChain time netClass noClass.net mm5 danRer2 zfishdanRer2.net \ -tNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish \ -qNewR=/cluster/bluearc/danRer2/linSpecRep.notInMouse -noAr # 87.010u 56.100s 5:15.16 45.4% 0+0k 0+0io 207pf+0w netFilter -minGap=10 zfishdanRer2.net | hgLoadNet mm5 netDanRer2 stdin # featureBits mm5 refGene:cds netDanRer2 -enrichment # refGene:cds 0.938%, netDanRer2 21.447%, both 0.714%, cover 76.17%, # enrich 3.55x # featureBits mm5 refGene:cds netDanRer1 -enrichment # refGene:cds 0.938%, netDanRer1 19.993%, both 0.702%, cover 74.87%, # enrich 3.74x # after chainAntiRepeat: # featureBits mm5 refGene:cds netDanRer2 -enrichment # refGene:cds 0.942%, netDanRer2 21.161%, both 0.717%, cover 76.14%, # enrich 3.60x # add trackDb.ra entries and html for details pages # TIGR GENE INDEX (DONE 2004-12-13 Fan) mkdir -p /cluster/data/mm5/bed/tigr cd /cluster/data/mm5/bed/tigr wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm5_05-2004.tgz tar xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) echo $o 
setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/mm5/bed/tigr hgsql mm5 -e "drop table tigrGeneIndex" hgsql mm5 < ~/kent/src/hg/lib/tigrGeneIndex.sql foreach f (*.gff) echo Processing $f ... /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm5 tigrGeneIndex $f hgsql mm5 -e "select count(*) from tigrGeneIndex" end # Total of 354491 entries created in tigrGeneIndex table. hgsql mm5 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql mm5 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords mm5 tigrGeneIndex gzip *.gff *TCs # TIGR GENE INDEX (RE-DONE 2004-12-21 Fan) # This track is re-done due to an error (no strand info) in the original files provided by TIGR. cd /cluster/data/mm5/bed mv tigr tigr_old_wrong mkdir -p /cluster/data/mm5/bed/tigr cd /cluster/data/mm5/bed/tigr wget --timestamp ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm5_12-2004.tgz tar xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) echo $o setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/mm5/bed/tigr hgsql mm5 -e "drop table tigrGeneIndex" hgsql mm5 < ~/kent/src/hg/lib/tigrGeneIndex.sql foreach f (*.gff) echo Processing $f ... /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm5 tigrGeneIndex $f hgsql mm5 -e "select count(*) from tigrGeneIndex" end # Total of 385814 entries created in tigrGeneIndex table. hgsql mm5 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql mm5 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords mm5 tigrGeneIndex gzip *.gff *TCs #### LOAD ENSEMBL GENES (DONE - 2004-12-17 Fan) # ADDDED STABLE URL TO TRACKDB BLOCK (V27, DEC 2004) (2008-01-11, rhead) # needed for Gene Sorter procedure below # Ensembl released Mouse build 33 the week of Dec 4 2004 mkdir /cluster/data/mm5/bed/ensembl cd /cluster/data/mm5/bed/ensembl Get the ensembl gene data from http://www.ensembl.org/ Go to the EnsMart link Choose Mus musculus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Structures" tab. Page 4) Choose GTF as the ouput, choose gzip compression , name the output file ensGeneMm5.gtf.gz and then hit Export # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. zcat ensGene.gtf.gz | grep -v ^6_DR51 | grep -v _NT_ > unrandom.gtf # Let's see how much it loses: # None. 
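# (Optional, a minimal sketch to make the "how much it loses" check concrete;
#  file names follow the zcat above.  Counts distinct transcript ids before
#  and after the random-contig filtering.)
zcat ensGene.gtf.gz | sed -n 's/.*transcript_id "\([^"]*\)".*/\1/p' | sort -u | wc -l
sed -n 's/.*transcript_id "\([^"]*\)".*/\1/p' unrandom.gtf | sort -u | wc -l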
# Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene sed -e "s/^/chr/" unrandom.gtf > ensGene.gtf # (should also fixup chrMT name here too - 2005-02-28 - Hiram) # sed -e "s/^/chr/" unrandom.gtf | sed -e "s/chrMT/chrM/" > ensGene.gtf ldHgGene mm5 ensGene ensGene.gtf # Read 31035 transcripts in 551352 lines in 1 files # 31035 groups 22 seqs 1 sources 4 feature types # 31035 gene predictions # save space, gzip them: gzip unrandom.gtf gzip ensGene.gtf # The name on chrM was incorrect, fixed (2005-02-28 - Hiram) hgsql mm5 -e 'update ensGene set chrom="chrM" where chrom="chrMT";' # Load Ensembl peptides: Get the ensembl protein data from http://www.ensembl.org/ Go to the EnsMart link Choose Mus musculus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Sequences" tab. Page 4) Choose Transcripts/Proteins and peptide Only as the output, choose text/fasta and gzip compression, name the file ensGeneMm5.pep.gz and then hit export. #delete * at end of each protein bash zcat ensGeneMm5.pep.gz | sed "s/\*$//" > ensembl.pep ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep hgPepPred mm5 generic ensPep fixPep_ensembl.pep # # The chrMT (chrM) peptides as obtained via EnsMart have only # aa's of: X (2005-02-28 - Hiram) # These 13 peptides were fixed up manually by fetching each # one individually by following the 13 links from our browser # to the ensemble protein, asking it to dump the protein # sequence, cut and paste that answer to a local file. # The 13 peptides were dropped from ensPep table via: hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082392.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082396.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082402.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082405.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082407.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082408.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082409.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082411.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082413.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082414.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082418.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082419.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082421.1";' # Then explicitly reloaded with SQL statements such as: INSERT into ensPep (name, seq) VALUES ('ENSMUST00000082407.1', 'MPQLDTSTWFITIISSMITLFILFQLKVSSQTFPLAPSPKSLTTMKVKTPWELKWTKIYLPHSLPQQ'); # The 13 SQL statements were left in the file: # /cluster/data/mm5/bed/ensembl/chrMPep.sql # loaded via: hgsql mm5 < chrMPep.sql # The following files were "touched" on the RR/MGC after the chrMT/M # change to prevent false errors with joinerCheck. J.Jackson 2005-03-01 # mm5.superfamily.name # mm5.ensGtp.transcript # mm5.ensPep.name # mm5.knownToEnsembl.value # mm5.sfDescription.name # Load ensGtp table. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" tab. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format, gzip. 
#    Name the result file ensGtpMm5.tab.gz
gunzip ensGtpMm5.tab.gz
hgsql mm5 < ~/kent/src/hg/lib/ensGtp.sql
hgsql -N -e 'load data local infile "ensGtpMm5.tab" into table ensGtp ignore 1 lines;' mm5
# Create knownToEnsembl column
hgMapToGene mm5 ensGene knownGene knownToEnsembl
# Compress everything to save space
gzip *.tab
gzip *.pep

#### RE-BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-11-17 - Fan)
# PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS RE-BUILT USING ENSMART DATA OF MOUSE BUILD 33.
# THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER.
# Get the ensembl gene/protein cross-reference data from
# http://www.ensembl.org/Multi/martview?species=Mus_musculus
# Follow this sequence through the pages:
# Page 1) Make sure that the Mus musculus choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene, transcript, and peptide IDs,
#         SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
# Page 4) Choose "Text, tab separated". Choose gzip compression. Hit export.
#         Save as ensXref
zcat ensXref.tsv.gz|sed -e 's/\./\t/g' > ensemblXref3.tab
hgsql mm5 -e "drop table ensemblXref3"
hgsql mm5 < ~/src/hg/lib/ensemblXref3.sql
hgsql mm5 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'

# CREATE SUPERFAMILY TRACK (DONE 2004-12-17 - Fan)
mkdir /cluster/data/mm5/bed/superfamily
cd /cluster/data/mm5/bed/superfamily
hgSuperfam mm5 superfam041128 > sf.log
wc *
# It is normal that many proteins do not have corresponding Superfamily entries.
# Load the sfDescription table.
hgsql mm5 < ~/src/hg/lib/sfDescription.sql
hgsql mm5 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm5.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed mm5 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
cat /cluster/data/superfamily/041128/ass_28-Nov-2004.tab | hgKnownToSuper mm5 mm stdin
# created 21899 records output

# MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-22, hartera)
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer2/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/mm5
mkdir -p $gp/vsDanRer2/axtChrom
cp -p *.axt $gp/vsDanRer2/axtChrom
cd $gp/vsDanRer2/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and nets to downloads area
# re-make chains and net downloadables (2004-12-22, hartera)
rm $gp/vsDanRer2/zebrafish*.gz $gp/vsDanRer2/md5sum.txt
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
gzip -c all.chain.antirepeat > \
    /cluster/data/mm5/zip/zebrafishDanRer2.chain.gz
gzip -c zfishdanRer2.net > /cluster/data/mm5/zip/zebrafishDanRer2.net.gz
cd $gp/vsDanRer2
mv /cluster/data/mm5/zip/zebrafish*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
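# (Optional, a minimal sketch: after staging the vsDanRer2 downloads, re-verify
#  the checksums in place; paths follow the $gp layout used above.)
cd /usr/local/apache/htdocs/goldenPath/mm5/vsDanRer2
md5sum -c md5sum.txt
cd axtChrom
md5sum -c md5sum.txt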
# BLASTZ DANRER2 CLEANUP (DONE, 2004-12-14, hartera) # RE-DONE (DONE, 2004-12-22, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer2 nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/all.chain.filt5k axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & # MOUSE PHOTOGRAPH added to gateway page # Obtained from Jackson Labs press office via email: # # Subject: Re: mouse press photographs # Date: Wed, 29 Dec 2004 14:26:15 -0500 # From: Joyce Peterson # To: Hiram Clawson # References: <41D2FF0B.3090207@soe.ucsc.edu> # Hi, Hiram. You may use the attached photo, noting credit to "The # Jackson Laboratory." # # Cheers, # --Joyce # # Joyce Peterson # Public Information Manager # The Jackson Laboratory # 610 Main Street, Mailbox 664 # Bar Harbor, ME 04609-1526 # Tel. 207-288-6058 # Mobile 207-266-5745 # E-mail joyce@jax.org # http://www.jax.org/news # # Original from this email placed into /cluster/data/mm5/html/C57BL_6J.JPG ssh hgwdev cd /cluster/data/mm5/html # view that image in 'display' to determine crop edges, then: convert -crop 890x690+330+70 -quality 80 -sharpen 0 \ -normalize C57BL_6J.JPG mm.jpg convert -geometry 300x200 -quality 80 mm.jpg Mus_musculus.jpg rm -f mm.jpg cp -p Mus_musculus.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # ANDY LAW CPGISSLANDS (DONE 1/14/05 angie) # See notes about this in makeGalGal2.doc. # Running only on masked sequence. ssh kksilo mkdir /cluster/data/mm5/bed/cpgIslandGgfAndy cd /cluster/data/mm5/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyMasked.bed foreach f (../../?{,?}/chr*.fa.masked) set chr = $f:t:r:r echo preproc masked $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc echo running on $chr masked /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # load into database: ssh hgwdev cd /cluster/data/mm5/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed mm5 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed featureBits mm5 cpgIslandExt #10422989 bases of 2615483787 (0.399%) in intersection featureBits mm5 cpgIslandGgfAndyMasked #38305840 bases of 2615483787 (1.465%) in intersection wc -l ../cpgIsland/cpgIsland.bed cpgIslandGgfAndyMasked.bed # 16238 ../cpgIsland/cpgIsland.bed # 67737 cpgIslandGgfAndyMasked.bed # 1/26/05: Make better island names in cpgIslandGgfAndyMasked, # for Dave Burt's cross-species island comparisons. ssh kksilo cd /cluster/data/mm5/bed/cpgIslandGgfAndy mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig perl -wpe '@w=split("\t"); $w[3] = "mm5.$w[0]." . ($w[1]+1) . 
".$w[2]"; \ $_ = join("\t", @w);' \ cpgIslandGgfAndyMasked.bed.orig \ > cpgIslandGgfAndyMasked.bed ssh hgwdev cd /cluster/data/mm5/bed/cpgIslandGgfAndy hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \ mm5 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed # MAKE MM5-RN3 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/mm5/bed/blastz.rn3/axtChain mkdir -p /cluster/data/mm5/bed/bedOver mkdir /tmp/mm5ToRn3 foreach f ($chainDir/ratNet/chr*.net.gz) set chr = $f:t:r:r echo $chr netChainSubset $f $chainDir/chain/$chr.chain.gz \ /tmp/mm5ToRn3/$chr.chain end cat /tmp/mm5ToRn3/*.chain \ > /cluster/data/mm5/bed/bedOver/mm5ToRn3.over.chain rm -r /tmp/mm5ToRn3 # MAKE MM5-GALGAL2 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/mm5/bed/blastz.galGal2/axtChain mkdir -p /cluster/data/mm5/bed/bedOver netChainSubset $chainDir/chicken.net $chainDir/all.chain \ /cluster/data/mm5/bed/bedOver/mm5ToGalGal2.over.chain # UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan) # Add new mm5 protein display IDs to the alias table to support user search ssh hgwdev mkdir -p /cluster/data/mm5/bed/pb/newDisplayId cd /cluster/data/mm5/bed/pb/newDisplayId hgsql proteome -e 'select mm5.kgSpAlias.kgID, mm5.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, mm5.kgSpAlias where spOldNew.acc=mm5.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >mm5.tab # get rid of the header line at the end of the file vi mm5.tab hgsql mm5 -e 'load data local infile "mm5.tab" into table mm5.kgSpAlias' # UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan) # Add new mm5 protein display IDs to the alias table to support user search ssh hgwdev cd /cluster/data/mm5/bed/pb/newDisplayId hgsql proteome -e 'select mm5.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, mm5.kgSpAlias where spOldNew.acc=mm5.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >mm5.kgProtAlias.tab # get rid of the header line at the end of the file vi mm5.kgProtAlias.tab hgsql mm5 -e 'load data local infile "mm5.kgProtAlias.tab" into table mm5.kgProtAlias' # BLASTZ/CHAIN/NET BOSTAU1 (DONE 2/21/05 angie) ssh kksilo mkdir /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19 cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19 cat << '_EOF_' > DEF # mouse vs. cow # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm5/softNib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/mm5/chrom.sizes # QUERY # Cow SEQ2_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes BASE=/cluster/data/mm5/bed/blastz.bosTau1.2005-02-19 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /cluster/bluearc/mouseVsCow >& do.log & tail -f do.log # kksilo was rebooted so original invocation of doBlastzChainNet.pl # was killed in the middle of the cluster run. I watched the job # progress and restarted 70 failed jobs like this: ssh kk cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19/run.blastz para check para push para check ... 
# When the batch was complete: para time > run.time # (doBlastzChainNet.pl uses run.time as a checkpoint) # Then to continue the run: ssh kksilo cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19 doBlastzChainNet.pl -continue=cat DEF \ -blastzOutRoot /cluster/bluearc/mouseVsCow >>& do.log & tail -f do.log # For some reason the script got hung waiting for tty input; I # foregrounded it, hit return a few times, and it eventually completed. # That should be fixed in a future version of doBlastzChainNet.pl. ln -s blastz.bosTau1.2005-02-19 /cluster/data/mm5/bed/blastz.bosTau1 # Add chainBosTau1 and netBosTau1 to mm5/trackDb.ra # Add /usr/local/apache/htdocs/goldenPath/mm5/vsBosTau1/README.txt # LOAD SNPS (Done; March 3, 2005; Heather) # directory structure ssh hgwdev cd /cluster/bluearc/snp mkdir mm5.heather cd mm5.heather mkdir det loc seq str xml # get data ftp ftp.ncbi.nih.gov cd snp/mouse/XML prompt mget ds_ch*.xml.gz # make sure script is current (should add makefile so general build does this) cp -f /cluster/home/heather/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts # build jobList for parsing touch jobList foreach file ( ds_ch*.xml.gz ) set out = $file:t:r echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/mm5.heather $out.contig >> jobList end # do the parsing ssh kk cd /cluster/bluearc/snp/mm5.heather para create jobList para try para check para push # output goes to det, loc, seq, str and xml directories # concatenate details ssh hgwdev zcat det/ds_ch*.xml.contig.det.gz > in.bed # couldn't find contig-based lift file from mm5 # generate from ctgPos echo "select chromStart, chrom, contig, size, chrom from ctgPos;" > ctgPos.sql hgsql mm5 < ctgPos.sql > ctgPos.out # edit ctgPos.out to put in proper format -- next time write script for this # lift # expect warnings from non-reference assemblies (limited to first 10) liftUp out.bed ctgPos.out warn in.bed # load (exception column will be empty for all rows) hgLoadBed mm5 snp out.bed -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp.sql # generate exceptions 1-20; drop 7 and 9 as they will be changing cd /usr/local/apache/htdocs/qa/test-results/snpException mkdir mm5 cd mm5 snpException mm5 0 mm5snpException # Invariant 1 has 0 exceptions, written to this file: mm5snpException.01.bed # Invariant 2 has 0 exceptions, written to this file: mm5snpException.02.bed # Invariant 3 has 0 exceptions, written to this file: mm5snpException.03.bed # Invariant 4 has 0 exceptions, written to this file: mm5snpException.04.bed # Invariant 5 has 0 exceptions, written to this file: mm5snpException.05.bed # Invariant 6 has 3 exceptions, written to this file: mm5snpException.06.bed # Invariant 7 has 1 exceptions, written to this file: mm5snpException.07.bed # Invariant 8 has 0 exceptions, written to this file: mm5snpException.08.bed # Invariant 9 has 22 exceptions, written to this file: mm5snpException.09.bed # Invariant 10 has 0 exceptions, written to this file: mm5snpException.10.bed # Invariant 11 has 0 exceptions, written to this file: mm5snpException.11.bed # Invariant 12 has 0 exceptions, written to this file: mm5snpException.12.bed # Invariant 13 has 0 exceptions, written to this file: mm5snpException.13.bed # Invariant 14 has 0 exceptions, written to this file: mm5snpException.14.bed # Invariant 15 has 0 exceptions, written to this file: mm5snpException.15.bed # Invariant 16 has 0 exceptions, written to this file: mm5snpException.16.bed # Invariant 17 has 0 exceptions, written to this file: mm5snpException.17.bed # Invariant 18 has 
# generate exceptions 1-20; drop 7 and 9 as they will be changing
cd /usr/local/apache/htdocs/qa/test-results/snpException
mkdir mm5
cd mm5
snpException mm5 0 mm5snpException
# Invariant 1 has 0 exceptions, written to this file: mm5snpException.01.bed
# Invariant 2 has 0 exceptions, written to this file: mm5snpException.02.bed
# Invariant 3 has 0 exceptions, written to this file: mm5snpException.03.bed
# Invariant 4 has 0 exceptions, written to this file: mm5snpException.04.bed
# Invariant 5 has 0 exceptions, written to this file: mm5snpException.05.bed
# Invariant 6 has 3 exceptions, written to this file: mm5snpException.06.bed
# Invariant 7 has 1 exceptions, written to this file: mm5snpException.07.bed
# Invariant 8 has 0 exceptions, written to this file: mm5snpException.08.bed
# Invariant 9 has 22 exceptions, written to this file: mm5snpException.09.bed
# Invariant 10 has 0 exceptions, written to this file: mm5snpException.10.bed
# Invariant 11 has 0 exceptions, written to this file: mm5snpException.11.bed
# Invariant 12 has 0 exceptions, written to this file: mm5snpException.12.bed
# Invariant 13 has 0 exceptions, written to this file: mm5snpException.13.bed
# Invariant 14 has 0 exceptions, written to this file: mm5snpException.14.bed
# Invariant 15 has 0 exceptions, written to this file: mm5snpException.15.bed
# Invariant 16 has 0 exceptions, written to this file: mm5snpException.16.bed
# Invariant 17 has 0 exceptions, written to this file: mm5snpException.17.bed
# Invariant 18 has 3634 exceptions, written to this file: mm5snpException.18.bed
# Invariant 19 has 0 exceptions, written to this file: mm5snpException.19.bed
# Invariant 20 has 0 exceptions, written to this file: mm5snpException.20.bed
# Invariant 21 has no query string
# Invariant 22 has no query string
# Invariant 23 has no query string
# Invariant 24 has no query string
mv mm5snpException.07.bed mm5snpException.07.bed.notused
mv mm5snpException.09.bed mm5snpException.09.bed.notused
# snpValid
cd /cluster/bluearc/snp/mm5.heather/seq
nice snpValid mm5 . >& snpValid.out &
tail -20 snpValid.out
# Grand Totals:
# matches:             494545
# mismatches:          246 (exceptionId #22)
# missing from flanks: 0 (exceptionId #23)
# rev compl matches:   56285
# not rptd strand:     1 (exceptionId #24)
# assembly = -:        0
# nib in gap:          0 (must be 0)
# Total rows in snp:   494791
# no dna found for:    0
# Total goodExact:     493886
# Total badExact:      534 (exceptionId #21)
# copy 21-24 exceptions to location of 1-20
cp *bed /usr/local/apache/htdocs/qa/test-results/snpException/mm5
# add exception data to snp table
cp ../build124/updateExceptionList.pl .
tail +3 mm5snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
hgsql mm5 < updateExceptionList.sql

# HUMAN BLASTP FOR GENE SORTER (RE-DONE 7/28/05 Fan)
# Make human ortholog column using blastp on human known genes.
# First make human protein database and copy it to iscratch/i
# if it doesn't exist already:
# NOTE: THE SECTION BELOW WAS ALREADY DONE.
cd /cluster/data/hg17/bed/blastp
pepPredToFa hg17 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/hg17/blastp) then
    rm -r /iscratch/i/hg17/blastp
endif
mkdir -p /iscratch/i/hg17/blastp
cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
iSync
# THE SECTION ABOVE WAS ALREADY DONE PREVIOUSLY.
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/hg17/run/out
cd /cluster/data/mm5/bed/blastp/hg17/run
# Make blast script (blastSome), the gensub2 template (gsub) and split.lst.
# The here-doc bodies of blastSome and gsub did not survive in this doc;
# a hedged sketch of what they contained follows this section.
cat > blastSome
cat > gsub
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
Completed: 7739 of 7739 jobs
CPU time in finished jobs:     113019s    1883.65m    31.39h    1.31d  0.004 y
IO & Wait Time:                 22145s     369.08m     6.15h    0.26d  0.001 y
Average job time:                  17s       0.29m     0.00h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:             124s       2.07m     0.03h    0.00d
Submission to last job:           495s       8.25m     0.14h    0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/hg17/run/out
hgLoadBlastTab mm5 hgBlastTab -maxPer=1 *.tab
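# The blastSome script and gsub template referenced above were not preserved
# in this doc.  The sketch below is modeled on gene-sorter blastp runs in
# other assembly docs and is NOT the exact text used here; the blastall
# path, BLASTMAT setting and split-file location are assumptions.
cat > blastSome <<'_EOF_'
#!/bin/sh
# run one chunk of mouse known-gene peptides against the hg17 known-gene
# protein database; tabular output (-m 8), best hit only (-b 1)
BLASTMAT=/iscratch/i/blast/data /scratch/blast/blastall \
    -p blastp -d /iscratch/i/hg17/blastp/known -i $1 -o $2 \
    -e 0.01 -m 8 -b 1
_EOF_
chmod a+x blastSome
cat > gsub <<'_EOF_'
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
_EOF_
# list the split known-gene peptide fasta files, biggest first, for gensub2
# (here given with the path already prepended, so no edit of split.lst needed)
ls -1S ../../../geneSorter/blastp/split/kg*.fa > split.lst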
# KNOWN GENES
# This was built using ~/kent/src/hg/protein/KGprocess.sh
# and it was not documented.

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
ssh hgwdev
cd /cluster/data/mm5/bed/
mkdir -p kgMm5/index
cd kgMm5/index
hgKgGetText mm5 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/mm5/bed/kgMm5/index/knownGene.ix /gbdb/mm5/knownGene.ix
ln -s /cluster/data/mm5/bed/kgMm5/index/knownGene.ixx /gbdb/mm5/knownGene.ixx

# RE-BUILD cgapAlias TABLE
# ORIGINALLY TABLE WAS BUILT BY THE KNOWN GENES PROCESS
# cgapAlias table has replicate rows so remove (DONE, 2005-07-26, hartera)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
ssh hgwdev
cd /cluster/store6/kgDB/bed/kgMm5B
# DO TABLE RELOAD AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
# OR sort -n | uniq.
# USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ
# (hartera, 2005-10-06)
sort -n cgapAlias.tab | uniq > cgapAliasSorted.tab
hgsql mm5 -e "drop table cgapAlias"
hgsql mm5 < ~/kent/src/hg/lib/cgapAlias.sql
hgsql mm5 -e 'load data local infile "cgapAliasSorted.tab" \
    into table cgapAlias'
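# The sort subtlety noted above, illustrated (not part of the build): with
# -n and no key, sort -u decides uniqueness by the numeric comparison alone,
# so rows sharing a leading ID but differing in later fields collapse to one
# row; sort -n | uniq only drops fully identical adjacent lines.
printf '10\tfoo\n10\tbar\n' | sort -nu        # one row survives
printf '10\tfoo\n10\tbar\n' | sort -n | uniq  # both rows survive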
# Create table that maps between known genes and visiGene database (DONE 2005-10-10 galt)
knownToVisiGene mm5
#Made hashes of image: geneImageHash 2117, locusLinkImageHash 780, refSeqImageHash 780,
#genbankImageHash 1301
#knownToLocusLink 30303, knownToRefSeq 30291, knownToGene 266841

# RIKEN CAGE STUFF (DONE 11-16-2005 Andy)
# Make download area.
ssh hgwdev
cd /cluster/data/mm5/bed
mkdir rikenCageCtss
cd rikenCageCtss/
wget -r http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/
# stupid thing didn't work.  Tried tinkering with wget almost every way possible.
# Finally just did it the hard way.
wget -O /dev/stdout http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/ 2> /dev/null | egrep ".sql|.bz2" | grep href | sed 's/^.*href=\"//;s/\".*$//' > files.lst
rm -rf fantom*
for f in `cat files.lst`; do
    wget http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/$f
done
bunzip2 *.bz2
# Make the simple table of the CAGE-related TSSs.
awk 'BEGIN{FS="\t"};{printf("%s\t%s\t%s\t%s\t%s\t1000\t%s\n",$9,$4,$7,$8,$1,($6 == "F") ? "+" : "-")}' \
    tss_summary.tsv | grep "^CAGE" | cut -f2- > basicCAGE.bed
# Make CAGE wiggle tracks for plus and minus strands
awk 'BEGIN{FS="\t"}; {if ($4=="F") printf("%s\t%s\t%d\t%s\n", $2, $5, $5+1, $6)}' \
    ctss_summary.tsv | wigEncode stdin ctssForward.wig ctssForward.wib
awk 'BEGIN{FS="\t"}; {if ($4=="R") printf("%s\t%s\t%d\t%s\n", $2, $5-1, $5, $6)}' \
    ctss_summary.tsv | wigEncode stdin ctssReverse.wig ctssReverse.wib
mkdir wiggle
mv ctss*.wi{g,b} wiggle/
# Load stuff up:
hgLoadBed mm5 rikenCageTc basicCAGE.bed
ln -s /cluster/data/mm5/bed/rikenCAGE/wiggle /gbdb/mm5/wib/ctssForward
ln -s /cluster/data/mm5/bed/rikenCAGE/wiggle /gbdb/mm5/wib/ctssReverse
hgLoadWiggle mm5 ctssForward ctssForward.wig
hgLoadWiggle mm5 ctssReverse ctssReverse.wig
# OK make them bedGraphs instead.
cd ../
rm -rf wiggle/
rm /gbdb/mm5/wib/ctss*
hgsql mm5 -e 'drop table ctssForward'
hgsql mm5 -e 'drop table ctssReverse'
awk 'BEGIN{FS="\t"}; {if ($4=="F") printf("%s\t%s\t%d\t%s\n", $2, $5, $5+1, $6)}' \
    ctss_summary.tsv | hgLoadBed -strict -bedGraph=4 mm5 rikenCageCtssPlus stdin
awk 'BEGIN{FS="\t"}; {if ($4=="R") printf("%s\t%s\t%d\t%s\n", $2, $5-1, $5, $6)}' \
    ctss_summary.tsv | hgLoadBed -strict -bedGraph=4 mm5 rikenCageCtssMinus stdin
# track html:
cp rikenCageCtss.html ~/kent/src/hg/makeDb/trackDb/mouse/
# trackDb:
track rikenCageTc
shortLabel Riken CAGE TC
longLabel Riken CAGE - Associated Transcript Clusters
group genes
priority 47.5
visibility hide
type bed 6 .

track rikenCageCtss
compositeTrack on
shortLabel Riken CAGE
longLabel Riken CAGE - Predicted Gene Start Sites
group genes
priority 47.51
visibility hide
type bedGraph 4
maxHeightPixels 128:16:16
minLimit 1
maxLimit 4316
viewLimits 1.0:10.0
windowingFunction mean
autoScale Off
origAssembly hg16

    track rikenCageCtssPlus
    subTrack rikenCageCtss
    shortLabel Riken CAGE +
    longLabel Riken CAGE Plus Strand - Predicted Gene Start Sites
    priority 1
    color 109,51,43

    track rikenCageCtssMinus
    subTrack rikenCageCtss
    shortLabel Riken CAGE -
    longLabel Riken CAGE Minus Strand - Predicted Gene Start Sites
    priority 2
    color 43,51,109

# MYTOUCH FIX - jen - 2006-01-24
sudo mytouch mm5 geneidPep 0408071900.00
sudo mytouch mm5 genscanPep 0501071300.00
sudo mytouch mm5 superfamily 0503011100.00
sudo mytouch mm5 ensGtp 0503011100.00
sudo mytouch mm5 knownToEnsembl 0503011100.00
sudo mytouch mm5 sfDescription 0503011100.00

############################################################################
# Mm7 to Mm5 liftOver creation (DONE - 2006-02-22 - 2006-02-24 - Hiram)
#   instructions lifted from Andy's sequence in makeMm7.doc

######## LIFTOVER PREPARATION
# Split up mm5
ssh kkr1u00
cd /iscratch/i/mm5
mkdir liftSplits
mkdir liftSplits/split
mkdir liftSplits/lift
for fa in /cluster/data/mm5/?/*.fa /cluster/data/mm5/??/*.fa
do
    c=`basename $fa .fa`
    echo $c
    faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 \
        liftSplits/split/$c
done
mkdir biggerSplits
mkdir biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chrX.fa 10 chrX_
rm chr{1,X}.fa
for R in 2 3 4 5 6 7 8
do
    rsync -a --progress /iscratch/i/mm5/ kkr${R}u00:/iscratch/i/mm5/
done

######## LIFTOVER BLATING
# MM7
ssh kk
cd /cluster/data/mm7
/cluster/bin/scripts/makeLoChain-align mm7 /scratch/hg/mm7/nib mm5 \
    /iscratch/i/mm5/biggerSplits/split
cd bed/blat.mm5.2006-02-22/run
# target is Mm7
# query is Mm5
cat << '_EOF_' > blat.csh
#!/bin/csh -fe
set target=$1
set query=$2
set output=$3
set chain=$4
set tPart=$target:t:r
set qPart=$query:t:r
set tmpDir=/scratch/tmp/${chain}.${tPart}_${qPart}
set tmpOutput=$tmpDir/$output:t
mkdir -p $tmpDir
sleep 2
/cluster/bin/$MACHTYPE/blat $target $query $tmpOutput \
    -tileSize=11 -minScore=100 -minIdentity=98 -fastMap \
    -ooc=/iscratch/i/mm5/11.ooc
mkdir -p `dirname $output`
cp $tmpOutput $output
rm $tmpOutput
rmdir --ignore-fail-on-non-empty $tmpDir
'_EOF_'
# happy emacs
chmod +x blat.csh
sed 's#^blat#./blat.csh#; s/\}.*$/}/; s/$/ mm7ToMm5/' spec > jobList
para create jobList
para -maxNode=200 -priority=25 push
para time
# Completed: 2451 of 2451 jobs
# CPU time in finished jobs:    1266001s   21100.02m   351.67h   14.65d  0.040 y
# IO & Wait Time:                 13972s     232.87m     3.88h    0.16d  0.000 y
# Average job time:                 522s       8.70m     0.15h    0.01d
# Longest finished job:            6769s     112.82m     1.88h    0.08d
# Submission to last job:         26506s     441.77m     7.36h    0.31d

######## LIFTOVER CHAINING
# LIFTING
ssh kki
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22
cat << '_EOF_' > mm5SplitLift.sh
#!/bin/bash
for C in chr1 chrX
do
    echo joining $C
    for P in `ls *_${C}_[0-9]*.psl | sed -e "s/_chr.*//" | sort -u`
    do
        echo "${P}_${C}.psl"
        tail --lines=+6 -q ${P}_${C}_[0-9]*.psl > ${P}_${C}.psl
    done
    for f in *_${C}.psl; do
        cat /san/sanvol1/scratch/andy/psl.header $f > tmp
        mv tmp $f
    done
done
echo Lifting...
for C in `awk '{print $1}' /cluster/data/mm5/chrom.sizes`; do
    echo "lifting $C ..."
    liftUp -pslQ ../psl/${C}.psl \
        /iscratch/i/mm5/biggerSplits/lift/${C}.lft error chr*_${C}.psl
    echo done $C
done
'_EOF_'
# happy emacs
chmod +x mm5SplitLift.sh
cat << "EOF" > mm5ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/andy/mm5Lifts
pushd /scratch/andy/mm5Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
EOF
chmod +x mm5ChainMergeSplit.sh
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22/raw
../mm5SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat << '_EOF_' > template
#LOOP
axtChain -linearGap=medium -verbose=0 -psl $(path1) /scratch/hg/mm7/nib /cluster/data/mm5/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'_EOF_'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single template jobList
para create jobList
para push
para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs:       7259s     120.98m     2.02h    0.08d  0.000 y
# IO & Wait Time:                  1086s      18.10m     0.30h    0.01d  0.000 y
# Average job time:                 194s       3.23m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1088s      18.13m     0.30h    0.01d
# Submission to last job:          2289s      38.15m     0.64h    0.03d

ssh kkstore02
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22
mkdir chain
time chainMergeSort chainRaw/* | chainSplit chain stdin
# real    29m42.365s
mkdir net over
cd chain
for c in *.chain
do
    echo ${c%.chain}
    nice chainNet $c /cluster/data/mm7/chrom.sizes \
        /cluster/data/mm5/chrom.sizes ../net/${c%.chain}.net /dev/null
    echo done $c
done
# real    15m33.593s
for chain in *.chain
do
    c=${chain%.chain}
    nice netChainSubset ../net/$c.net $chain ../over/$c.over
done
# real    10m48.898s

########## FINISHING
ssh kkstore02
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22/over
cat * > ../mm7ToMm5.over.chain
cd ..
gzip mm7ToMm5.over.chain
rm -rf psl net chain chainRaw over
ssh hgwdev
cd /cluster/data/mm7/bed
ln -s blat.mm5.2006-02-22 blat.mm5
ln -s `pwd`/blat.mm5/mm7ToMm5.over.chain.gz liftOver/mm7ToMm5.over.chain.gz
ln -s `pwd`/liftOver/mm7ToMm5.over.chain.gz \
    /gbdb/mm7/liftOver/mm7ToMm5.over.chain.gz
ln -s `pwd`/liftOver/mm7ToMm5.over.chain.gz \
    /usr/local/apache/htdocs/goldenPath/mm7/liftOver/mm7ToMm5.over.chain.gz
hgAddLiftOverChain mm7 mm5 /gbdb/mm7/liftOver/mm7ToMm5.over.chain.gz
############################################################################
# UPDATED mm5.knownToVisiGene (2006-03-21 galt)
ssh hgwdev
knownToVisiGene mm5

#######################################################################
## LIFTOVER To Mm8 (DONE - 2006-05-15 - 2006-06-05 - Hiram)
ssh kkr1u00
# do not need to run this command since /cluster/data/mm8/split10k
# already exists from previous liftOver jobs (mm7 to mm8)
# $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
#   mm8 /cluster/data/mm8/nib
# as it says, DO THIS NEXT:
ssh kk
# if bin/scripts is not in your PATH, add it for this command:
PATH=$PATH:/cluster/bin/scripts \
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
    mm5 /cluster/data/mm5/nib mm8 /iscratch/i/mm8/split10k \
    /cluster/data/mm8/11.ooc
# as it says, DO THIS NEXT:
cd /cluster/data/mm5/bed/blat.mm8.2006-05-15/run
para try, check, push, check, ...
# Completed: 1462 of 1462 jobs
# CPU time in finished jobs:    3990246s   66504.10m  1108.40h   46.18d  0.127 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                2371s      39.51m     0.66h    0.03d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           24307s     405.12m     6.75h    0.28d
# Submission to last job:       1474509s   24575.15m   409.59h   17.07d
# as it says, DO THIS NEXT:
# this does the liftUp and makes the psl files
# kkr1u00 is down these days
ssh kkr3u00
cd /cluster/data/mm5/bed
ln -s blat.mm8.2006-05-15 blat.mm8
# edit this script to allow use on kkr3u00
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm5 mm8
# real    16m5.091s
# as it says, DO THIS NEXT:
# this prepares the batch to run for the chaining
ssh kki
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
    mm5 /cluster/data/mm5/nib mm8 /cluster/data/mm8/nib
# as it says, DO THIS NEXT:
# running the chain batch
cd /cluster/data/mm5/bed/blat.mm8.2006-05-15/chainRun
para try, check, push, check, ...
# Completed: 34 of 34 jobs
# CPU time in finished jobs:       6893s     114.88m     1.91h    0.08d  0.000 y
# IO & Wait Time:                  7183s     119.72m     2.00h    0.08d  0.000 y
# Average job time:                 414s       6.90m     0.12h    0.00d
# Longest finished job:            1130s      18.83m     0.31h    0.01d
# Submission to last job:          1130s      18.83m     0.31h    0.01d
ssh kkstore03
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm5 mm8
# Created /cluster/data/mm5/bed/liftOver/mm5ToMm8.over.chain.gz
# as it says, DO THIS NEXT:
ssh hgwdev
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm5 mm8
# It says this:
#   Now, add link for
#   /usr/local/apache/htdocs/goldenPath/mm5/liftOver/mm5ToMm8.over.chain
#   to hgLiftOver
# But I believe that link was already done:
cd /gbdb/mm5/liftOver
ls -og mm5ToMm8*
# lrwxrwxrwx  1 53 Jun  5 16:10 mm5ToMm8.over.chain.gz ->
#       /cluster/data/mm5/bed/liftOver/mm5ToMm8.over.chain.gz
# (an example use of this chain file with liftOver is sketched at the end
#  of this file)

#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
# File emailed from Xinwei She
mkdir /cluster/data/mm5/bed/genomicSuperDups
cd /cluster/data/mm5/bed/genomicSuperDups
sed -e 's/\t_\t/\t-\t/' mm5_genomicSuperDup.tab \
    | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
    | hgLoadBed mm5 genomicSuperDups stdin \
        -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm5
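# Example use of the mm5ToMm8 liftOver chain produced above (not part of the
# original build; the BED file names are hypothetical).  liftOver can read
# the gzipped chain directly:
liftOver mm5Regions.bed /gbdb/mm5/liftOver/mm5ToMm8.over.chain.gz \
    mm5Regions.mm8.bed mm5Regions.unmapped.bed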