# This file describes how we made the browser database on the mouse # genome, October 2003 build. - Mm4 # # # NOTE: There is a new chrMT sequence in the build 32 # >gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion # # Will have to beware of this NC_ contig in the processing since # all previous builds had only NT_ contigs # # NOTE: The README_PREBUILD file for this assembly mentions several # differences from the previous release (build 30): # 1. seq_contig.md - new first line is a comment containing column name # Also, last two columns (group label and weight, have been swapped) # Also, some lines have id with CONTIG: prepended, and upper-case # feature type (CONTIG) # 2. contig.idmap - has an additional column "contig label" # This required changing the jkStuff ncbi* utilities (7/1/03 KRR) # # DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2003-10-16 - Hiram) ssh kksilo mkdir -p /cluster/store6/mm4/ncbi ln -s /cluster/store6/mm4 /cluster/data cd /cluster/data/mm4/ncbi mkdir chrfasta contigfasta ftp ftp.ncbi.nih.gov # user hgpguest, password from /cse/guests/kent/buildHg6.doc cd mouse_32 prompt bin mget * quit gunzip *.agp.gz # Check chromosome files (DONE - 2003-10-19 - Hiram) cd chrfasta foreach f (*.fa.gz) echo $f:r >> faSize.out gunzip $f /cluster/bin/i386/faSize $f:r >> faSize.out echo $f:r done end /cluster/bin/i386/faSize *.fa >> faSize.out grep "^>" *.fa > ../chrfasta.all.fa.headers gzip *.fa cd ../contigfasta gunzip *.fa.gz grep "^>" *.fa > ../contigfasta.all.fa.headers gzip *.fa # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS # (DONE - 2003-10-16 - Hiram) ssh kksilo cd /cluster/data/mm4 gunzip ncbi/allrefcontig.chr.agp.gz # splitFaIntoContigs doesn't do right with agp lines arriving in a # different order than fasta chrom sequences. so split up the agp # into one per chrom. 
foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y MT Un) mkdir $c perl -we "while(<>){if (/^chr$c\t/) {print;}}" \ ./ncbi/allrefcontig.chr.agp \ > $c/chr$c.agp gunzip -c ./ncbi/chrfasta/chr$c.fa.gz \ | perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' \ | splitFaIntoContigs $c/chr$c.agp \ stdin /cluster/data/mm4 -nSize=5000000 end gzip ncbi/chrfasta/chr*.fa # CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2003-10-17 - Hiram) ssh kksilo cd /cluster/data/mm4/ncbi gunzip seq_contig.md.gz # reorder random contigs in allrefcontig agp file to match seq_contig.md # this is required by the ncbiToRandomAgps scripts # had to fixup ncbiToRandomAgps from previous use to match the # lines better, and to do the MT/NC_ mitochondrion thing ../jkStuff/ncbiFixAgp allrefcontig.chr.agp > \ allrefcontig.chr.ordered.agp ../jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.ordered.agp \ contig.idmap .. # creating ../mm4/1/chr1_random.agp... # ... creating ../mm4/Un/chrUn_random.agp... # The chrUn_random.agp created by this is too large with the 5000 # gaps. it will work with 1000 gaps, so fixup the chrUn_random agp: ../jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \ seq_contig.md allrefcontig.chr.ordered.agp contig.idmap .. ssh kksilo cd /cluster/data/mm4 foreach c (?{,?}) if (-e $c/chr${c}_random.ctg.agp) then echo building $c/chr${c}_random.fa gunzip -c ./ncbi/contigfasta/chr$c.fa.gz \ | perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \ > ./tmp.fa agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \ $c/chr${c}_random.fa ./tmp.fa rm tmp.fa endif end # building 1/chr1_random.fa # ... etc ... # building Un/chrUn_random.fa # Writing 85112544 bases to Un/chrUn_random.fa # Clean these up to avoid confusion later... 
they're easily rebuilt # with the ncbiToRandomAgps script above rm ?/*.ctg.agp ??/*.ctg.agp # BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (2003-10-20 - Hiram) ssh kksilo cd /cluster/data/mm4 foreach c (?{,?}) if (-e $c/chr${c}_random.agp) then splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \ -nSize=5000000 mkdir -p $c/lift mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst mv ${c}_random/lift/ordered.lft $c/lift/random.lft mv ${c}_random/lift/ordered.lst $c/lift/random.lst rmdir ${c}_random/lift rm ${c}_random/chr${c}_random.{agp,fa} mv ${c}_random/* $c rmdir ${c}_random endif end # This has a lot of output. It is difficult to see if anything # goes wrong. # Fixup chrMT name to be chrM (DONE - 2003-10-21 - Hiram) ssh kksilo cd /cluster/data/mm4 mv MT MT.ncbi mkdir M mkdir M/chrM_1 mkdir M/lift cd MT.ncbi find . -type f | while read FN do NF=`echo $FN | sed -e "s/MT/M/g"` sed -e "s/chrMT/chrM/g" $FN > ../M/$NF done # The RepeatMasker for this was re-run separately since RM was # already done by the time this was renamed. # MAKE LIFTALL.LFT (DONE - 2003-10-20 - Hiram) cd /cluster/data/mm4 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft # CREATING DATABASE (DONE - 2003-10-20 - Hiram) o - Create the database. ssh hgwdev hgsql -e 'create database mm4;' '' # if you need to delete this database: !!! WILL DELETE EVERYTHING !!! 
# hgsql -e "drop database mm4;" mm4 o - Use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql # [hiram@hgwdev /] df -h /var/lib/mysql # Filesystem Size Used Avail Use% Mounted on # /dev/sda1 472G 414G 34G 93% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2003-10-20 - Hiram) # Use any of the newest databases to ensure that the organization # of the grp table is up to date ssh hgwdev hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" mm4 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE - 2003-10-20 - Hiram) # Create (unmasked) nib files ssh kksilo cd /cluster/data/mm4 mkdir -p unmaskedNib foreach f (?{,?}/chr?{,?}{,_random}.fa) echo $f:t:r faToNib $f unmaskedNib/$f:t:r.nib end # Create symbolic links from /gbdb/mm4/nib to real nib files # These unmasked Nib files are temporary just to get the browser # up an running immediately. After the masking is done and masked # sequence is created, these nibs will be replaced with the masked # nibs ssh hgwdev mkdir -p /gbdb/mm4/nib cd /gbdb/mm4/nib ln -s /cluster/data/mm4/unmaskedNib/chr*.nib . # Load /gbdb nib paths into database and save size info. ssh hgwdev cd /cluster/data/mm4 hgsql mm4 < ~/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib mm4 /gbdb/mm4/nib ?{,?}/chr?{,?}{,_random}.fa # 2952612207 total bases # NOTE: mm3 was 2708220133, an increase of 244 Mb (~9%) hgsql -N -e "select chrom,size from chromInfo;" mm4 > chrom.sizes # check the resulting file chrom.sizes # Store o+o info in database. cd /cluster/data/mm4/ncbi gunzip sequence.inf cd /cluster/data/mm4 ln -s ncbi fft # remove so as not to confuse hgGoldGap -- they are easily regenerated rm */chr*.ctg.agp # to undo/redo: # jkStuff/dropSplitTable.csh gap # jkStuff/dropSplitTable.csh gold /cluster/bin/i386/hgGoldGapGl mm4 /cluster/data/mm4 . 
featureBits mm4 gold # 2627444668 bases of 2627444668 (100.000%) in intersection featureBits mm3 gold # 2505900260 bases of 2505900260 (100.000%) in intersection featureBits mm4 gap # 325167539 bases of 2627444668 (12.376%) in intersection featureBits mm3 gap # 202319873 bases of 2505900260 (8.074%) in intersection # Make and load GC percent table (DONE - 2003-10-20 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm4/bed/gcPercent cd /cluster/data/mm4/bed/gcPercent hgsql mm4 < ~/kent/src/hg/lib/gcPercent.sql hgGcPercent mm4 ../../unmaskedNib # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR MM4 (DONE - 2003-10-20 - Hiram) # using the Mm3 position blatted onto Mm4: # Enter mm4 into hgcentraltest.dbDb so test browser knows about it: hgsql -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, defaultPos, \ active, orderKey, genome, scientificName, htmlPath, hgNearOK) \ VALUES( "mm4", "Oct. 2003", "/gbdb/mm4/nib", "Mouse", \ "chr6:122959016-122974672", \ 1, 20, "Mouse", "Mus musculus", "/gbdb/mm4/html/description.html", 0);' \ -h genome-testdb hgcentraltest # If you need to delete that entry: hgsql -e 'delete from dbDb where name="mm4";' \ -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add mm4 in all the right places and do make update make alpha cvs commit makefile # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR MM4 (DONE - 2003-07-13 kate) ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("mm4", "blat10", "17778", "1"); \ INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("mm4", "blat10", "17779", "0");' \ -h genome-testdb hgcentraltest # REPEAT MASKING (DONE 2003-10-21 Hiram) # TRF simpleRepeat below can be run at the same time # Split contigs, run RepeatMasker, lift results # * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make # RepeatMasker runs manageable on the cluster ==> results need 
lifting. # * For the NCBI assembly we repeat mask on the sensitive mode setting # (RepeatMasker -m -s -ali) #- Split contigs into 500kb chunks: ssh kksilo cd /cluster/data/mm4 foreach d ( */chr?{,?}{,_random}_?{,?} ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. end # ... # 11 pieces of 11 written # 1 pieces of 1 written # ... #- Make the run directory and job list: cd /cluster/data/mm4 cat << '_EOF_' > jkStuff/RMMouse #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/mm4/$2 /bin/cp $2 /tmp/mm4/$2 cd /tmp/mm4/$2 /scratch/hg/RepeatMasker/RepeatMasker -ali -s -species mus $2 popd /bin/cp /tmp/mm4/$2/$2.out ./ if (-e /tmp/mm4/$2/$2.align) /bin/cp /tmp/mm4/$2/$2.align ./ if (-e /tmp/mm4/$2/$2.tbl) /bin/cp /tmp/mm4/$2/$2.tbl ./ if (-e /tmp/mm4/$2/$2.cat) /bin/cp /tmp/mm4/$2/$2.cat ./ /bin/rm -fr /tmp/mm4/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/mm4/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/mm4 '_EOF_' chmod +x jkStuff/RMMouse mkdir -p RMRun rm -f RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) foreach f ( $d/chr*_?{,?}_?{,?}.fa ) set f = $f:t echo /cluster/data/mm4/jkStuff/RMMouse \ /cluster/data/mm4/$d $f \ '{'check out line+ /cluster/data/mm4/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd /cluster/data/mm4/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 6429 of 6429 jobs # CPU time in finished jobs: 39351255s 655854.25m 10930.90h 455.45d 1.248 y # IO & Wait Time: 185895s 3098.25m 51.64h 2.15d 0.006 y # Average job time: 6150s 102.50m 1.71h 0.07d # Longest job: 18075s 301.25m 5.02h 0.21d # Submission to last job: 56648s 944.13m 15.74h 0.66d #- Lift up the split-contig .out's to contig-level .out's ssh kksilo cd /cluster/data/mm4 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. 
end #- Lift up the contig-level .out's to chr-level ssh kksilo cd /cluster/data/mm4 ./jkStuff/liftOut5.csh # This one error is OK # Can not find Un/lift/ordered.lft . #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/mm4 # to redo: # ./jkStuff/dropSplitTable.csh rmsk # make sure there's no chrUn -- rm Un/chrUn.fa.out hgLoadOut mm4 ?/*.fa.out ??/*.fa.out # VERIFY REPEATMASKER RESULTS (DONE - 2003-10-21 Hiram) # Run featureBits on mm4 and on a comparable genome build, and compare: ssh hgwdev featureBits mm4 rmsk # 1130883581 bases of 2627444668 (43.041%) in intersection featureBits mm3 rmsk # 1080265553 bases of 2505900260 (43.109%) in intersection # SIMPLE REPEAT TRACK (DONE - 2003-10-20 Hiram) # TRF can be run in parallel with RepeatMasker on the file server # since it doesn't require masked input sequence. ssh kksilo mkdir /cluster/data/mm4/bed/simpleRepeat cd /cluster/data/mm4/bed/simpleRepeat mkdir trf rm -f jobs.csh echo '#\!/bin/csh -fe' > jobs.csh # create job list of 5MB chunks foreach f \ (/cluster/data/mm4/?{,?}/chr?{,?}_[0-9]*/chr?{,?}_?{,?}.fa \ /cluster/data/mm4/?{,?}/chr*_random_?{,?}/chr*_random_?{,?}.fa) set fout = $f:t:r.bed echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end chmod +x jobs.csh wc jobs.csh # 600 3596 90543 jobs.csh ./jobs.csh >&! 
jobs.log & # in bash: ./jobs.csh > jobs.log 2>&1 & tail -f jobs.log # When job is done lift output files liftUp simpleRepeat.bed /cluster/data/mm4/jkStuff/liftAll.lft warn trf/*.bed # Load into the database ssh hgwdev cd /cluster/data/mm4/bed/simpleRepeat hgLoadBed mm4 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 1164675 elements of size 16 featureBits mm4 simpleRepeat # 82600648 bases of 2627444668 (3.144%) in intersection featureBits mm3 simpleRepeat # 75457193 bases of 2505900260 (3.011%) in intersection # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2003-10-21 - Hiram) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kksilo cd /cluster/data/mm4/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords cd /cluster/data/mm4 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` else echo "WARNING NO FILE: $c/lift/ordered.lst" endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # NOTE: ignore warning about non-existent Un/Lift/ordered.lift # since there is no chrUn # MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF # (DONE - 2003-10-21 Hiram) ssh kksilo cd /cluster/data/mm4 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.csh >&! 
maskFa.out & # bash: ./jkStuff/makeFaMasked.csh > maskFa.out 2>&1 & tail -100f maskFa.out #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.csh #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.csh # ignore complaints about missing chrUn # Redo symbolic links from /gbdb/mm4/nib to # mixed (RM and TRF) soft-masked nib files ssh hgwdev rm -fr /gbdb/mm4/nib/* ln -s /cluster/data/mm4/mixedNib/chr*.nib /gbdb/mm4/nib # Copy data to /cluster/bluearc for cluster runs ssh kksilo # masked contigs rm -fr /cluster/bluearc/mm4/trfFa mkdir -p /cluster/bluearc/mm4/trfFa cp -p /cluster/data/mm4/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ /cluster/bluearc/mm4/trfFa # masked chrom nibs cd /cluster/data/mm4 rm -fr /cluster/bluearc/mm4/softNib mkdir -p /cluster/bluearc/mm4/softNib cp -p mixedNib/chr*.nib /cluster/bluearc/mm4/softNib rm -fr /cluster/bluearc/mm4/hardNib mkdir -p /cluster/bluearc/mm4/hardNib cp -p maskedNib/chr*.nib /cluster/bluearc/mm4/hardNib # fasta files rm -fr /cluster/bluearc/mm4/fasta mkdir -p /cluster/bluearc/mm4/fasta cp -p ?/*.fa ??/*.fa /cluster/bluearc/mm4/fasta # RepeatMasker *.out files rm -rf /cluster/bluearc/mm4/rmsk mkdir -p /cluster/bluearc/mm4/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /cluster/bluearc/mm4/rmsk # lift file, for mrna processing cp -p jkStuff/liftAll.lft /cluster/bluearc/mm4 # also copy to /scratch ssh kksilo mkdir -p /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/fasta /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/hardNib /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/softNib /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/rmsk /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/trfFa /cluster/bluearc/scratch/mus/mm4 mkdir -p /cluster/bluearc/scratch/mus/mm4/rmsk.spec cp -Rp /cluster/bluearc/mm4/rmsk.spec/* /cluster/bluearc/scratch/mus/mm4/rmsk.spec # also copy to iservers ssh kkr1u00 cd ~/mm4 cp -p liftAll.lft /iscratch/i/mm4 mkdir -p 
/iscratch/i/mm4/softNib
    # NOTE(review): destination fixed — original said /iscratch/i/softNib,
    # which does not match the /iscratch/i/mm4/softNib dir created above
    cp -p /cluster/bluearc/mm4/softNib/chr*.nib /iscratch/i/mm4/softNib
    mkdir -p /iscratch/i/mm4/trfFa
    # NOTE(review): destination fixed — original copied back to
    # /cluster/bluearc/mm4/trfFa (already populated above); the intent of
    # this "copy to iservers" step is the just-created /iscratch/i/mm4/trfFa
    cp ?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /iscratch/i/mm4/trfFa
    /cluster/bin/scripts/iSync

# PREPARE CLUSTER FOR BLASTZ RUN  (DONE - 2003-10-21 - Hiram)
    ssh kksilo
    mkdir -p /cluster/bluearc/mm4/rmsk.spec
    cd /cluster/bluearc/mm4/rmsk.spec
    ln -s ../rmsk/*.out .
    # classify repeats by lineage (mouse vs. human and rat) for use as
    # lineage-specific repeats during blastz
    cat << '_EOF_' > runArian.sh
#!/bin/sh
for FN in *.out
do
    echo ${FN}
    /cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
	${FN} -query mouse -comp human -comp rat
done
'_EOF_'
    chmod +x runArian.sh
    ./runArian.sh
    cd /cluster/bluearc/mm4
    mkdir linSpecRep.notInHuman
    mkdir linSpecRep.notInRat
    # extractLinSpecReps arg 1 = first -comp species (human),
    # arg 2 = second -comp species (rat)
    foreach f (rmsk.spec/*.out_hum_rat)
	set base = $f:t:r:r
	echo $base.out.spec
	/cluster/bin/scripts/extractLinSpecReps 1 $f > \
		linSpecRep.notInHuman/$base.out.spec
    end
    foreach f (rmsk.spec/*.out_hum_rat)
	set base = $f:t:r:r
	echo $base.out.spec
	/cluster/bin/scripts/extractLinSpecReps 2 $f > \
		linSpecRep.notInRat/$base.out.spec
    end
    cp -Rp /cluster/bluearc/mm4/linSpecRep.notInHuman /cluster/bluearc/scratch/mus/mm4
    cp -Rp /cluster/bluearc/mm4/linSpecRep.notInRat /cluster/bluearc/scratch/mus/mm4
    # Request rsync of /scratch to the KiloKluster

# AUTO UPDATE GENBANK MRNA RUN  (IN PROGRESS - 2003-10-21 - Hiram)
    ssh eieio
    cd /cluster/data/genbank/etc
    # edit genbank.conf to specify
    # # mm4
    # mm4.genome = /scratch/mus/mm4/softNib/*.nib
    # mm4.lift = /cluster/data/mm4/jkStuff/liftAll.lft
    # mm4.downloadDir = mm4
    # mm4.genbank.est.xeno.load = yes
    # mm4.mgcTables.default = full
    # mm4.mgcTables.mgc = all
    cd /cluster/data/genbank
    nice ./bin/gbAlignStep -initial -type=mrna -verbose=1 \
	-clusterRootDir=/cluster/bluearc/genbank -iserver=no mm4
    # this failed in the alignment step due to /scratch problems, to continue:
    ssh kk
    cd /cluster/bluearc/genbank/work/initial.mm4/align
    para recover align.jobs recover_align.jobs
    para create recover_align.jobs
    para try, push, check, etc ...
# then ssh eieio cd /cluster/data/genbank nice ./bin/gbAlignStep -initial -continue=finish -type=mrna -verbose=1 \ -clusterRootDir=/cluster/bluearc/genbank -iserver=no mm4 # to load that: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm4 # then, for the big est run ssh eieio cd /cluster/data/genbank nice ./bin/gbAlignStep -initial -type=est -verbose=1 \ -clusterRootDir=/cluster/bluearc/genbank -iserver=no mm4 # Completed: 256925 of 256971 jobs # CPU time in finished jobs: 79788950s 1329815.83m 22163.60h 923.48d 2.530 y # IO & Wait Time: 7185609s 119760.16m 1996.00h 83.17d 0.228 y # Time in running jobs: 290862s 4847.70m 80.80h 3.37d 0.009 y # Average job time: 339s 5.64m 0.09h 0.00d # Longest job: 69850s 1164.17m 19.40h 0.81d # Submission to last job: 141340s 2355.67m 39.26h 1.64d # cluster problems required that to be completed manually: ssh kk cd /cluster/bluearc/genbank/work/initial.mm4/align para recover align.jobs recover.jobs para create recover.jobs para try, push, check, etc ... # Completed: 46 of 46 jobs # CPU time in finished jobs: 11250s 187.50m 3.13h 0.13d 0.000 y # IO & Wait Time: 423s 7.05m 0.12h 0.00d 0.000 y # Average job time: 254s 4.23m 0.07h 0.00d # Longest job: 1532s 25.53m 0.43h 0.02d # Submission to last job: 1532s 25.53m 0.43h 0.02d ssh eieio cd /cluster/data/genbank nice ./bin/gbAlignStep -initial -continue=finish -type=est -verbose=1 \ -clusterRootDir=/cluster/bluearc/genbank -iserver=no mm4 # Then for an efficient load of everything, drop the mrna tables and # load all: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm4 # LOAD CPGISSLANDS (DONE 2003-10-21 Hiram) ssh kksilo mkdir -p /cluster/data/mm4/bed/cpgIsland cd /cluster/data/mm4/bed/cpgIsland # cpglh requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked) set fout=$f:t:r:r.cpg /cluster/bin/cpglh $f > $fout echo Done with $fout end cat << '_EOF_' > filter.awk /* chr1 1325 3865 754 CpG: 183 64.9 0.7 */ /* Transforms to: */ /* chr1 1325 3865 CpG: 183 754 183 489 64.9 0.7 */ { width = $3-$2; printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n", $1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);} '_EOF_' awk -f filter.awk chr*.cpg > cpgIsland.bed # Load into db ssh hgwdev cd /cluster/data/mm4/bed/cpgIsland /cluster/bin/i386/hgLoadBed mm4 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed # BLASTZ Human (DONE - 2003-10-28 - Hiram) ssh kk mkdir -p /cluster/data/mm4/bed/blastz.hg16 cd /cluster/data/mm4/bed/blastz.hg16 cat << '_EOF_' > DEF # mouse vs. human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm4/softNib # RMSK not currently used SEQ1_RMSK=/scratch/mus/mm4/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Human SEQ2_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs # RMSK not currently used SEQ2_RMSK=/iscratch/i/gs.17/build34/rmsk # FLAG not currently used SEQ2_FLAG=-primate SEQ2_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm4/bed/blastz.hg16 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # prepare first cluster run script source DEF cat << '_EOF_' > ../../jkStuff/BlastZ_run0.sh #!/bin/sh # prepare first cluster run for blastz processing # M=`uname -n` if [ "$M" != "kk" ]; then echo "ERROR: you are on machine: '$M'" echo -e "\tthis script expects machine kk" exit 255 fi 
source DEF mkdir ${RAW} mkdir run.0 echo "running 'make-joblist'" ~angie/hummus/make-joblist $DEF > $BASE/run.0/j if [ ! -f ./xdir.sh ]; then echo "ERROR: no ./xdir.sh file present. Should have been made by make-joblist." exit 255 fi echo "running 'xdir.sh'" sh ./xdir.sh cd run.0 sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > jobList rm j echo "running 'para create'" para create jobList echo "Ready for cluster run. para try, check, push, etc ..." '_EOF_' # << this line keeps emacs coloring happy chmod +x ../../jkStuff/BlastZ_run0.sh # prepare first cluster run ../../jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # CPU time in finished jobs: 15818152s 263635.86m 4393.93h 183.08d 0.502 y # IO & Wait Time: 6171331s 102855.52m 1714.26h 71.43d 0.196 y # Average job time: 507s 8.45m 0.14h 0.01d # Longest job: 8930s 148.83m 2.48h 0.10d # Submission to last job: 85042s 1417.37m 23.62h 0.98d # Prepare second cluster run script, .out's to .lav's ssh kk cd /cluster/data/mm4/bed/blastz.hg16 cat << '_EOF_' > ../../jkStuff/BlastZ_run1.sh #!/bin/sh # prepare second cluster run for blastz processing # M=`uname -n` if [ "$M" != "kk" ]; then echo "ERROR: you are on machine: '$M'" echo -e "\tthis script expects machine kk" exit 255 fi source DEF mkdir $BASE/run.1 mkdir $BASE/lav echo "running 'make-joblist'" /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList echo "running 'para create'" para create jobList echo "Ready for cluster run. para try, check, push, etc ..." '_EOF_' chmod +x ../../jkStuff/BlastZ_run1.sh # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kkr1u00 source DEF ../../jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 323 of 323 jobs # CPU time in finished jobs: 12410s 206.83m 3.45h 0.14d 0.000 y # IO & Wait Time: 335408s 5590.14m 93.17h 3.88d 0.011 y # Average job time: 1077s 17.95m 0.30h 0.01d # Longest job: 1966s 32.77m 0.55h 0.02d # Submission to last job: 1992s 33.20m 0.55h 0.02d # Prepare third cluster run script to convert lav's to axt's cat << '_EOF_' > ../../jkStuff/BlastZ_run2.sh #!/bin/sh # prepare third cluster run for blastz processing # M=`uname -n` if [ "$M" != "kk" ]; then echo "ERROR: you are on machine: '$M'" echo -e "\tthis script expects machine kk" exit 255 fi source DEF mkdir axtChrom mkdir run.2 cd run.2 # usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir echo '#LOOP' > gsub echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR} ${SEQ2_DIR} >> gsub echo '#ENDLOOP' >> gsub ls -1S ${BASE}/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList echo "running 'para create'" para create jobList echo "Ready for cluster run. para try, check, push, etc ..." '_EOF_' chmod +x ../../jkStuff/BlastZ_run2.sh # Third cluster run to convert lav's to axt's source DEF ../../jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... 
# Completed: 44 of 44 jobs # CPU time in finished jobs: 3981s 66.35m 1.11h 0.05d 0.000 y # IO & Wait Time: 49397s 823.28m 13.72h 0.57d 0.002 y # Average job time: 1213s 20.22m 0.34h 0.01d # Longest job: 7584s 126.40m 2.11h 0.09d # Submission to last job: 7597s 126.62m 2.11h 0.09d # translate sorted axt files into psl ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16 mkdir -p pslChrom set tbl = "blastzHg16" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 30 minutes # Load database tables ssh hgwdev set tbl = "blastzHg16" cd /cluster/data/mm4/bed/blastz.hg16/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx mm4 chr*_${tbl}.psl # this is a 55 minute job # CHAIN HG16 BLASTZ (DONE - 2003-10-24 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster # in this case, it was run on the kk kluster ssh kk mkdir -p /cluster/data/mm4/bed/blastz.hg16/axtChain/run1 cd /cluster/data/mm4/bed/blastz.hg16/axtChain/run1 mkdir out chain ls -1S /cluster/data/mm4/bed/blastz.hg16/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ_random $1 | axtChain stdin \ /scratch/mus/mm4/softNib \ /iscratch/i/gs.17/build34/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 41 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... 
# Completed: 44 of 44 jobs # CPU time in finished jobs: 27109s 451.82m 7.53h 0.31d 0.001 y # IO & Wait Time: 29921s 498.68m 8.31h 0.35d 0.001 y # Average job time: 1296s 21.60m 0.36h 0.02d # Longest job: 12356s 205.93m 3.43h 0.14d # Submission to last job: 12356s 205.93m 3.43h 0.14d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 8m55.100s # user 6m46.680s # sys 0m58.250s time chainSplit chain all.chain # real 8m36.259s # user 5m46.920s # sys 0m59.700s # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm4 ${c}_chainHg16 $i echo done $c end # NET HUMAN BLASTZ (DONE - 2003-10-31 - Hiram) ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/mm4/chrom.sizes \ /cluster/data/hg16/chrom.sizes ../preNet/$i end # real 11m58.018s # user 4m10.390s # sys 2m10.780s cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/mm4/chrom.sizes \ /cluster/data/hg16/chrom.sizes ../n1/$n /dev/null end cd .. 
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 1591906304, utime 6877 s/100, stime 1977 ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/axtChain time netClass hNoClass.net mm4 hg16 human.net \ -tNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse # real 11m35.367s # user 7m20.880s # sys 1m33.360s # manually fixup one line in the linSpecRep.notInHuman/chr3.out.spec file # RepeatMasker or the post-processing script is making an error # Remove the line that is missing the rest of the fields # the line : 2860 6.4 0.0 1 0 0 Expecting 7 words line 156272 of /cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman/chr3.out.spec got 6 2860 6.4 0.0 1.1 chr3 164080732 164081087 (231854) + L1Md_F2 LINE/L1 6208 6561 (21) 1 0 0 2860 6.4 0.0 1 0 0 1420 8.1 0.0 0.0 chr3 164081159 164081355 (231586) C ORR1A0-int LTR/MaLR (1593) 199 3 3 0 0 # If things look good do ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn human.net > humanSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/axtChain netFilter -minGap=10 human.net | hgLoadNet mm4 netHg16 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet mm4 syntenyNetHg16 stdin # Add entries for net and chain to mouse/mm4 trackDb # make net ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtChain mkdir humanNet time netSplit human.net humanNet # real 6m13.469s # user 3m22.670s # sys 0m44.880s mkdir ../axtNet foreach n (humanNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt humanNet/$c.net chain/$c.chain \ /cluster/data/mm4/nib \ /cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \ ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end ssh hgwdev mkdir -p /cluster/data/mm4/bed/blastz.hg16/axtBest cd /cluster/data/mm4/bed/blastz.hg16/axtBest ln -s ../axtNet/chr*.axt . 
# copy net axt's to download area ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtNet cd /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtNet gzip *.axt # add README.txt file to dir, if needed # Convert those axt files to psl ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestHg16.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestHg16.psl echo "Done: ${c}_blastzBestHg16.psl" end # Load tables ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/pslBest time /cluster/bin/i386/hgLoadPsl -noTNameIx mm4 chr*_blastzBestHg16.psl # real 10m47.853s # user 2m48.700s # sys 0m24.250s # check results # featureBits mm4 blastzBestHg16 # 1030510540 bases of 2627444668 (39.221%) in intersection # featureBits mm3 blastzBestHuman # 1019460260 bases of 2505900260 (40.682%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm4/axtBestHg16 cd /gbdb/mm4/axtBestHg16 ln -s /cluster/data/mm4/bed/blastz.hg16/axtNet/chr*.axt . cd /cluster/data/mm4/bed/blastz.hg16/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/mm4/axtBestHg16/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg16','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm4 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. 
hgsql mm4 < axtInfoInserts.sql # MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-10-31 - Hiram) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtNet mkdir -p ../axtTight tcsh foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHg16.psl echo "Done: $i" end # Load tables into database ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/pslTight hgLoadPsl -noTNameIx mm4 chr*_blastzTightHg16.psl # copy to axt's to download area cd /cluster/data/mm4/bed/blastz.hg16/axtTight mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtTight cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtTight cd /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtTight gzip *.axt # add README.txt file to dir, if needed # BLASTZ Rat (DONE - 2003-10-27 - Hiram) ssh kk mkdir -p /cluster/data/mm4/bed/blastz.rn3.2003-10-21 cd /cluster/data/mm4/bed ln -s blastz.rn3.2003-10-21 blastz.rn3 cd blastz.rn3 cat << '_EOF_' > DEF # mouse vs. 
rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm4/softNib # RMSK not currently used SEQ1_RMSK=/scratch/mus/mm4/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/scratch/mus/mm4/linSpecRep.notInRat SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Rat SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs # RMSK not currently used SEQ2_RMSK=/cluster/bluearc/rat/rn3/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm4/bed/blastz.rn3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # prepare first cluster run ../../jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # Completed: 39728 of 39729 jobs # Crashed: 1 jobs # CPU time in finished jobs: 16243094s 270718.23m 4511.97h 188.00d 0.515 y # IO & Wait Time: 1164706s 19411.77m 323.53h 13.48d 0.037 y # Average job time: 438s 7.30m 0.12h 0.01d # Longest job: 8391s 139.85m 2.33h 0.10d # Submission to last job: 189809s 3163.48m 52.72h 2.20d # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kkr1u00 cd /cluster/data/mm4/bed/blastz.rn3 source DEF ../../jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 323 of 323 jobs # CPU time in finished jobs: 10597s 176.62m 2.94h 0.12d 0.000 y # IO & Wait Time: 207530s 3458.83m 57.65h 2.40d 0.007 y # Average job time: 675s 11.26m 0.19h 0.01d # Longest job: 1613s 26.88m 0.45h 0.02d # Submission to last job: 2456s 40.93m 0.68h 0.03d # Third cluster run to convert lav's to axt's ssh kk cd /cluster/data/mm4/bed/blastz.rn3 source DEF ../../jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # Completed: 44 of 44 jobs # CPU time in finished jobs: 10649s 177.48m 2.96h 0.12d 0.000 y # IO & Wait Time: 129039s 2150.65m 35.84h 1.49d 0.004 y # Average job time: 3175s 52.91m 0.88h 0.04d # Longest job: 18946s 315.77m 5.26h 0.22d # Submission to last job: 19121s 318.68m 5.31h 0.22d # translate sorted axt files into psl ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3 mkdir -p pslChrom set tbl = "blastzRn3" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 20 minutes # Load database tables ssh hgwdev set tbl = "blastzRn3" cd /cluster/data/mm4/bed/blastz.rn3/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx mm4 chr*_${tbl}.psl # CHAIN Rn3 BLASTZ (DONE - 2003-10-30 - Hiram) # Experiment, run this on kolossus ssh kolossus cd ~/kent.$MACHTYPE/src/hg/mouseStuff/axtChain make cd ../axtFilter make mkdir -p /cluster/data/mm4/bed/blastz.rn3/axtChain/run1 cd /cluster/data/mm4/bed/blastz.rn3/axtChain/run1 mkdir out chain ls -1S /cluster/data/mm4/bed/blastz.rn3/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP ./doChain $(path1) chain/$(root1).chain out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh $HOME/bin/$MACHTYPE/axtFilter -notQ_random $1 | \ $HOME/bin/$MACHTYPE/axtChain stdin \ /cluster/store6/mm4/nib \ /cluster/store3/rn3/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain mkdir out chain gensub2 input.lst single gsub 
jobList # copy the jobList to runJobs.sh echo "#!/bin/sh" > runJobs.sh cat jobList >> runJobs.sh chmod +x runJobs.sh nohup ./runJobs.sh & # Well, maybe that wasn't such a good idea. That shell # script took about 7 hours to run. I'd recommend the small # kluster or the kk9 kluster for this job # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 22m15.194s # user 14m9.970s # sys 2m18.350s time chainSplit chain all.chain # real 99m40.570s # user 13m41.590s # sys 29m44.260s # Load chains into database # next machine ssh hgwdev cd /cluster/data/mm4/bed/blastz.rn3/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm4 ${c}_chainRn3 $i echo done $c end # problem loading chr13 and chr7 - rerun these two foreach i (chr13.chain chr7.chain) set c = $i:r hgLoadChain mm4 ${c}_chainRn3 $i echo done $c end # NET RAT BLASTZ (DONE - 2003-10-31 - Hiram) ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/mm4/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../preNet/$i end # real 38m28.442s # user 11m24.220s # sys 3m24.980s cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/mm4/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 1855098880, utime 8760 s/100, stime 2368 # The netClass operations requires an "ancientRepeat" table to exist # in either mm4 or rn3. So, create the table: ssh hgwdev mkdir -p /cluster/data/mm4/bed/ancientRepeat cd /cluster/data/mm4/bed/ancientRepeat # mysqldump needs write permission to this directory chmod 777 . hgsqldump --all --tab=. hg16 ancientRepeat chmod 775 . 
hgsql mm4 < ancientRepeat.sql
# load into mm4 (not hg16) -- the dump came from hg16, this copy lives in mm4,
# as confirmed by the import result below
mysqlimport -u <user> -p mm4 ancientRepeat.txt
# mm4.ancientRepeat: Records: 133 Deleted: 0 Skipped: 0 Warnings: 0
# This is a hand curated table obtained from Arian.
ssh hgwdev
cd /cluster/data/mm4/bed/blastz.rn3/axtChain
# NOTE(review): -tNewR points at linSpecRep.notInHuman; for the rat net this
# presumably should be linSpecRep.notInRat (cf. SEQ1_SMSK in the rat DEF) -- verify
time netClass hNoClass.net mm4 rn3 rat.net \
    -tNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman \
    -qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
# real    12m58.603s
# user    8m7.610s
# sys     1m40.280s
# If things look good do
ssh kksilo
cd /cluster/data/mm4/bed/blastz.rn3/axtChain
# n1 is a directory (created above), so a recursive remove is required
rm -fr n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn rat.net > ratSyn.net
# real    5m52.054s
# user    3m24.820s
# sys     0m44.020s
# Load the nets into database
ssh hgwdev
cd /cluster/data/mm4/bed/blastz.rn3/axtChain
time netFilter -minGap=10 rat.net | hgLoadNet mm4 netRn3 stdin
# real    17m0.651s
# user    4m51.440s
# sys     0m36.060s
time netFilter -minGap=10 ratSyn.net | hgLoadNet mm4 syntenyNetRn3 stdin
# real    14m4.567s
# user    4m41.660s
# sys     0m34.420s
# Add entries for net and chain to mouse/mm4 trackDb
# make net
ssh kksilo
cd /cluster/data/mm4/bed/blastz.rn3/axtChain
mkdir ratNet
time netSplit rat.net ratNet
# real    28m29.554s
# user    4m48.770s
# sys     4m3.580s
mkdir ../axtNet
foreach n (ratNet/chr*.net)
    set c=$n:t:r
    echo "netToAxt: $c.net -> $c.axt"
    rm -f ../axtNet/$c.axt
    netToAxt ratNet/$c.net chain/$c.chain \
        /cluster/data/mm4/nib \
        /cluster/data/rn3/nib \
        ../axtNet/$c.axt
    echo "Complete: $c.net -> ../axtNet/$c.axt"
end
ssh hgwdev
mkdir -p /cluster/data/mm4/bed/blastz.rn3/axtBest
cd /cluster/data/mm4/bed/blastz.rn3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area ssh hgwdev cd /cluster/data/mm4/bed/blastz.rn3/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtNet cd /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtNet gzip *.axt # add README.txt file to dir, if needed # Convert those axt files to psl ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestRn3.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestRn3.psl echo "Done: ${c}_blastzBestRn3.psl" end # Load tables ssh hgwdev cd /cluster/data/mm4/bed/blastz.rn3/pslBest time /cluster/bin/i386/hgLoadPsl -noTNameIx mm4 chr*_blastzBestRn3.psl # real 9m7.118s # user 2m32.780s # sys 0m25.300s # check results # the original axtBest stuff from the axtBest operation: # featureBits mm4 blastzBestRn3 # 1780774716 bases of 2627444668 (67.776%) in intersection # featureBits mm3 blastzBestRat # 1577582189 bases of 2505900260 (62.955%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm4/axtBestRn3 cd /gbdb/mm4/axtBestRn3 ln -s /cluster/data/mm4/bed/blastz.rn3/axtNet/chr*.axt . 
cd /cluster/data/mm4/bed/blastz.rn3/axtNet rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/mm4/axtBestRn3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm4 < ~/kent/src/hg/lib/axtInfo.sql hgsql mm4 < axtInfoInserts.sql # MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-10-31 - Hiram) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3/axtNet mkdir ../axtTight tcsh foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl echo "Done: $i" end # Load tables into database ssh hgwdev cd /cluster/data/mm4/bed/blastz.rn3/pslTight time hgLoadPsl -noTNameIx mm4 chr*_blastzTightRn3.psl # real 9m53.256s # user 2m2.170s # sys 0m17.370s # copy to axt's to download area cd /cluster/data/mm4/bed/blastz.rn3/axtTight mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtTight cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtTight cd /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtTight gzip *.axt # add README.txt file to dir, if needed # MAKING RAT AND HUMAN SYNTENY (DONE - 2003-11-05 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm4/bed/syntenyRn3 cd /cluster/data/mm4/bed/syntenyRn3 # updating the scripts in use here from # /cluster/data/hg16/bed/syntenyMm3 cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl . # fix the syntenicBest script to not try and work on empty # results from its queries. 
Also, set the db and table name
# in the script itself so the arguments are not needed
./syntenicBest.pl
# on the order of 3 to 4 hours to complete syntenicBest
# almost no time, or only a few minutes at most for any of
# the rest
./smooth.pl
./joinsmallgaps.pl
# set db and table name in fillgap.pl
./fillgap.pl
./synteny2bed.pl
hgLoadBed mm4 syntenyRn3 ucsc100k.bed
# same thing for Hg16
mkdir -p /cluster/data/mm4/bed/syntenyHg16
cd /cluster/data/mm4/bed/syntenyHg16
cp ../syntenyRn3/syntenicBest.pl .
# fixup table name to be Hg16
./syntenicBest.pl
../syntenyRn3/smooth.pl
../syntenyRn3/joinsmallgaps.pl
cp ../syntenyRn3/fillgap.pl .
# fixup table name to be Hg16
./fillgap.pl
../syntenyRn3/synteny2bed.pl
hgLoadBed mm4 syntenyHg16 ucsc100k.bed
# check results
# featureBits mm3 syntenyRat
# 2384684351 bases of 2505900260 (95.163%) in intersection
# featureBits mm4 syntenyRn3
# 2397585098 bases of 2627444668 (91.252%) in intersection
# featureBits mm3 syntenyHuman
# 2343924066 bases of 2505900260 (93.536%) in intersection
# featureBits mm4 syntenyHg16
# 2299774191 bases of 2627444668 (87.529%) in intersection
# BLASTZ Self (IN PROGRESS - 2003-10-21 - Hiram)
ssh kk
# Create artificial linSpecRep.notInMouse
# The procedure for lineage spec business with self is to simply
# use the actual repeat masker output for this mouse assembly as
# the lineage specific repeats for itself.  Thus, merely make
# symlinks to the repeat masker out files and name them as expected
# for blastz.  In this case they are called notInMouse but they
# really mean InMouse.  When asked what to use for this,
# Scott Schwartz says:
# It depends on what you want to do with the results.
# If you don't mind never having any RepeatMasker stuff
# align, then it should be fine.  But one could imagine
# other cases.
cd /cluster/bluearc/scratch/mus/mm4 mkdir linSpecRep.notInMouse cd linSpecRep.notInMouse foreach f (../rmsk/*.fa.out) set base = $f:t:r:r echo $base.out.spec ln -s $f $base.out.spec end mkdir -p /cluster/data/mm4/bed/blastzSelf cd /cluster/data/mm4/bed/blastzSelf cat << '_EOF_' > DEF # mouse vs. mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm4/softNib # RMSK not currently used SEQ1_RMSK=/scratch/mus/mm4/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInMouse SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/iscratch/i/mm4/softNib # RMSK not currently used SEQ2_RMSK=/iscratch/i/mm4/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm4/bed/blastzSelf DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # First cluster run cd /cluster/data/mm4/bed/blastzSelf source DEF ../../jkStuff/BlastZ_run0.sh cd run para try, check, push, check, .... # Completed: 41344 of 41344 jobs # CPU time in finished jobs: 21214466s 353574.44m 5892.91h 245.54d 0.673 y # IO & Wait Time: 2233362s 37222.69m 620.38h 25.85d 0.071 y # Average job time: 567s 9.45m 0.16h 0.01d # Longest job: 338309s 5638.48m 93.97h 3.92d # Submission to last job: 774549s 12909.15m 215.15h 8.96d # note, one of those jobs took 94 hours ! # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kkr1u00 cd /cluster/data/mm4/bed/blastzSelf source DEF ../../jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 323 of 323 jobs # CPU time in finished jobs: 70707s 1178.45m 19.64h 0.82d 0.002 y # IO & Wait Time: 14609s 243.48m 4.06h 0.17d 0.000 y # Average job time: 264s 4.40m 0.07h 0.00d # Longest job: 5502s 91.70m 1.53h 0.06d # Submission to last job: 10167s 169.45m 2.82h 0.12d # Third cluster run to convert lav's to axt's ssh kk cd /cluster/data/mm4/bed/blastzSelf source DEF ../../jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # Completed: 37 of 44 jobs # Crashed: 7 jobs # CPU time in finished jobs: 12417s 206.95m 3.45h 0.14d 0.000 y # IO & Wait Time: 186792s 3113.20m 51.89h 2.16d 0.006 y # Average job time: 5384s 89.73m 1.50h 0.06d # Longest job: 33349s 555.82m 9.26h 0.39d # Submission to last job: 33401s 556.68m 9.28h 0.39d # failed: chr17, chr1_random, chr2, chr3, chr5, chr7, chrUn_random # finish these on kolossus cat << '_EOF_' > runFailed.sh #!/bin/sh for c in 17 1_random 2 3 5 7 Un_random do time ../../../jkStuff/x86_64-chromlav2axt \ /cluster/data/mm4/bed/blastzSelf/lav/chr${c} \ /cluster/data/mm4/bed/blastzSelf/axtChrom/chr${c}.axt \ /cluster/data/mm4/nib /cluster/data/mm4/nib done '_EOF_' # chmod +x runFailed.sh ./runFailed.sh > runFailed.out 2>&1 XXXX running 2003-11-03 08:35 # PRODUCING GENSCAN PREDICTIONS (DONE - 2003-10-27 - Hiram) # Must run this on the small cluster since this # process requires more memory than is found in the KK nodes ssh kkr1u00 mkdir -p /cluster/data/mm4/bed/genscan cd /cluster/data/mm4/bed/genscan # Make subdirectories for genscan to put their output files in mkdir gtf pep subopt ls -1S /cluster/data/mm4/?{,?}/chr*_*/chr*_*.fa.masked \ > genome.list # Create template file, gsub, for gensub2. 
cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try para check para push # with only 6 CPUs available: # Completed: 598 of 599 jobs # Crashed: 1 jobs # CPU time in finished jobs: 906601s 15110.02m 251.83h 10.49d 0.029 y # IO & Wait Time: 206027s 3433.78m 57.23h 2.38d 0.007 y # Average job time: 1861s 31.01m 0.52h 0.02d # Longest job: 76738s 1278.97m 21.32h 0.89d # Submission to last job: 196259s 3270.98m 54.52h 2.27d # The failed job, rerun on Itanium: ssh snort cd kent.ia64/src/hg/gsBig make cd /cluster/data/mm4/bed/genscan # It would not run at window=2400000, but it did with 2000000 $HOME/bin/$MACHTYPE/gsBig /cluster/data/mm4/2/chr2_15/chr2_15.fa.masked gtf/chr2_15.fa.gtf -trans=pep/chr2_15.fa.pep -subopt=subopt/chr2_15.fa.bed -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2000000 # Run time 209 seconds : 0.2 s / kb # Got 118 exons # Got 18 genes # Got 216 suboptimal exons # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/mm4/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/chr*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/chr*.bed > \ /dev/null cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/mm4/bed/genscan ldHgGene mm4 genscan genscan.gtf # Read 46885 transcripts in 331775 lines in 1 files # 46885 groups 44 seqs 1 sources 1 feature types # 46885 gene predictions hgPepPred mm4 generic genscanPep genscan.pep 
hgLoadBed mm4 genscanSubopt genscanSubopt.bed > /dev/null
# MAKE DOWNLOADABLE SEQUENCE FILES (IN PROGRESS 2003-10-21 Hiram)
ssh kksilo
cd /cluster/data/mm4
# Build the .zip files
jkStuff/zipAll.sh >&! zipAll.log &
# bash: ./jkStuff/zipAll.sh > zipAll.log 2>&1 &
tail -f zipAll.log
mkdir zip
mv *.zip zip
cd zip
# Look at zipAll.log to make sure all file lists look reasonable.
# Check zip file integrity:
foreach f (*.zip)
    unzip -t $f > $f.test
    tail -1 $f.test
end
wc -l *.zip.test
# 47 chromAgp.zip.test
# 46 chromFa.zip.test
# 46 chromFaMasked.zip.test
# 46 chromOut.zip.test
# 46 chromTrf.zip.test
# 601 contigAgp.zip.test
# 601 contigFa.zip.test
# 601 contigFaMasked.zip.test
# 601 contigOut.zip.test
# 601 contigTrf.zip.test
#3236 total
ssh hgwdev
cd /cluster/data/mm4/jkStuff
# create generic copy program
cat << '_EOF_' > cpToWeb.sh
#!/bin/sh
if [ $# -ne 1 ]; then
    echo "usage: cpToWeb.sh <db>"
    echo -e "\texample: cpToWeb.sh mm4"
    exit 255
fi
GP=/usr/local/apache/htdocs/goldenPath/$1
mkdir -p ${GP}
mkdir -p ${GP}/chromosomes
for f in ../?/*.fa ../??/*.fa
do
    BN=`basename ${f}`
    zip -j ${GP}/chromosomes/${BN}.zip ${f}
    echo "zipped: ${BN}"
done
mkdir -p ${GP}/bigZips
for Z in *.zip
do
    cp -p ${Z} ${GP}/bigZips
    echo "copied: ${Z}"
done
'_EOF_'
chmod +x cpToWeb.sh
cd /cluster/data/mm4/zip
../jkStuff/cpToWeb.sh mm4
cd /usr/local/apache/htdocs/goldenPath/mm4
# Take a look at bigZips/* and chromosomes/*, update their README.txt's
# Make the upstream sequence files.
# NOTE: must be redone due to bad gap track cd bigZips featureBits mm4 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits mm4 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits mm4 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa # mrna zips -- auto dump process takes care of this # CREATE CYTOBAND TRACK # Can be done at any time ssh hgwdev cd /cluster/data/mm4 mkdir cytoBand cd cytoBand # Get file from NCBI - might be Mouse400.dat.gz for future wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.32/ideogram.gz gunzip ideogram # Had to modify chr16 coordinates because chrom size too big. Simply # scaled sizes down to fit in correct chrom size. # Create bed file /cluster/bin/scripts/createNcbiCytoBand ideogram # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm4 cytoBand cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. 
# Make the cytoBand track (above) and then: echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql mm4 # UPDATE BACEND SEQUENCES (DONE - 2003-10-27 - Hiram) # Download new files ssh kksilo mkdir /cluster/data/mm4/bed/bacends/ncbi cd /cluster/data/mm4/bed/bacends/ncbi wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/AllBACends.mfa.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/cl_acc_gi_len.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/README gunzip AllBACends.mfa.gz gunzip cl_acc_gi_len.gz # Convert fa file cat << '_EOF_' > convert.pl #!/usr/local/bin/perl -w use strict; while (my $line = <>) { if (substr($line,0,1) ne ">") { print $line; } else { my @fields = split(/\|/, $line); my $printed = 0; for (my $i = 0; $i < $#fields; $i++) { if ($fields[$i] eq "gb") { (my $name, my $vers) = split(/\./,$fields[$i+1]); print ">$name\n"; $i= $#fields; $printed = 1; } } if (!$printed) { die("Failed for $line\n"); } } } '_EOF_' chmod +x convert.pl ./convert.pl < AllBACends.mfa > BACends.fa # Create new pairs files convertBacEndPairInfo cl_acc_gi_len # Split file into pieces and copy to cluster to propagate ssh kksilo /cluster/bin/i386/faSplit sequence BACends.fa 100 BACends cd /cluster/data/mm4/bed/bacends/ncbi rm /cluster/bluearc/scratch/mus/mm4/bacEnds mv BACends???.fa /cluster/bluearc/scratch/mus/mm4/bacEnds cp -p BACends.fa /cluster/bluearc/scratch/mus/mm4/bacEnds # Ask for propagation from sysadmin # Load the sequences (change bacends.# to match correct location) ssh hgwdev mkdir /gbdb/mm4/bacends cd /gbdb/mm4/bacends ln -s /cluster/data/mm4/bed/bacends/ncbi/BACends.fa . cd /tmp hgLoadSeq mm4 /gbdb/mm4/bacends/BACends.fa #Adding /gbdb/mm4/bacends/BACends.fa) #452237 sequences #Updating seq table # One additional step 9/10/04 Fan. # Create a composite index to speed up hgTracks display when BAC Ends track selected. 
hgsql mm4 -e 'create index bacIndex2 on all_bacends(tName, qName(8));' # This will take hours. #All done # BACEND SEQUENCE ALIGNMENTS (DONE - 2003-10-28 - Hiram) # (alignments done without RepeatMasking) # We need an ooc file for this genome ssh kksilo mkdir /cluster/data/mm4/ooc cd /cluster/data/mm4/ooc ls ../unmaskedNib/chr*.nib > nib.list blat -makeOoc=11.ooc -repMatch=1024 nib.list nib.list output.psl # Wrote 26077 overused 11-mers to 11.ooc # Did not end using this. Used an old one instead. # Create full sequence alignments ssh kk cd /cluster/data/mm4/bed/bacends /cluster/bin/scripts/splitContigList -scratch /iscratch/i/mm4/trfFa 1 # allow blat to run politely in /tmp while it writes output, then # copy results to results file: cat << '_EOF_' > runBlat.sh #!/bin/sh path1=$1 path2=$2 root1=$3 root2=$4 result=$5 rm -fr /tmp/${root1}_${root2} mkdir /tmp/${root1}_${root2} pushd /tmp/${root1}_${root2} /cluster/bin/i386/blat ${path1} ${path2} -ooc=/scratch/hg/h/mouse11.ooc \ ${root1}.${root2}.psl popd rm -f ${result} mv /tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /tmp/${root1}_${root2} '_EOF_' # << this line keeps emacs coloring happy chmod +x runBlat.sh cat << '_EOF_' > template #LOOP ./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' ls -1S /iscratch/i/mm4/bacEnds/BACends???.fa > bacEnds.lst mkdir bacEnds.out # create results directories for each to avoid the all result files in # one directory problem foreach f (`cat bacEnds.lst`) set b = $f:t:r echo $b mkdir bacEnds.out/$b end gensub2 contig.lst bacEnds.lst template jobList para create jobList # 58702 jobs written to batch para try, check, push, etc ... 
# Completed: 58702 of 58702 jobs # CPU time in finished jobs: 3702485s 61708.08m 1028.47h 42.85d 0.117 y # IO & Wait Time: 1321608s 22026.80m 367.11h 15.30d 0.042 y # Average job time: 86s 1.43m 0.02h 0.00d # Longest job: 2767s 46.12m 0.77h 0.03d # Submission to last job: 5781s 96.35m 1.61h 0.07d # Compile alignments and lift the files (takes a while) ssh kksilo cd /cluster/data/mm4/bed/bacends time pslSort dirs raw.psl temp bacEnds.out/* # real 852m20.286s <- over 14 hours ! # user 522m19.890s # sys 65m43.910s ssh kolossus time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl bacEnds.psl /dev/null # Processed 540122688 alignments # 5289.06user 306.10system 1:56:57elapsed # 79%CPU (0avgtext+0avgdata 0maxresident)k # 0inputs+0outputs (146major+16134522minor)pagefaults 0swaps # time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl bacEnds.psl /dev/null # Processed 540122688 alignments # real 157m1.227s # user 113m57.480s # sys 17m13.940s rmdir temp # You will want to keep this file around until later processing is # proven correct rm raw.psl # 68 Gb ! It takes a while even to remove it. ssh kksilo time /cluster/bin/scripts/lifter -psl -mouse /cluster/data/mm4 bacEnds.psl # 4037.85 # user 743.34 # system 1:48:09elapsed 73%CP # real 28m29.554s == 9 hours # user 4m48.770s # sys 4m3.580s cp -p ~booch/bacends/split.pl . cp -p ~booch/bacends/header . 
time ./split.pl header < bacEnds.psl.lifted # real 169.41 # user 38.91 # sys 43.44 # These are 2 Gb files: # -rw-rw-r-- 1 2275160342 Nov 1 15:15 bacEnds.psl # -rw-rw-r-- 1 2385296298 Nov 1 22:19 bacEnds.psl.lifted # we are up to 8 Gb for these files: # -rw-rw-r-- 1 8476583727 Oct 30 10:49 bacEnds.psl # -rw-rw-r-- 1 8892639725 Oct 30 23:04 bacEnds.psl.lifted cp -p bacEnds.psl.lifted bacEnds.psl.lifted.save time pslSort dirs bacEnds.psl.lifted temp split # real 1135.23 # user 934.31 # sys 112.83 #real 72m23.363s #user 58m48.260s #sys 7m13.090s rmdir temp rm -r split # Copy files to final destination and remove mkdir /cluster/data/mm4/bacends cp -p bacEnds.psl.lifted /cluster/data/mm4/bacends XXX done 2003-11-01 23:50 # BACEND PAIRS TRACK (WORKING - 2003-10-28 - Hiram) ssh kolossus cd /cluster/data/mm4/bacends # two line header file for the cat: cat << '_EOF_' > header bin chr start end clone score strand all features starts sizes names rid rcov lid lcov 10N 10 10N 10N 10 10N 10 10 10N 10 10 10 10N 10N 10N 10N '_EOF_' time ${HOME}/bin/${MACHTYPE}/pslPairs -min=50000 -max=300000 \ -short -long -orphan -mismatch -verbose bacEnds.psl.lifted \ ../bed/bacends/ncbi/bacEndPairs.txt bacEnds XXX running 2003-11-01 23:55 XXXX - running 2003-10-31 14:45 cat header ../bed/bacends/ncbi/bacEndPairs.txt | \ /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairs.bed # edit this Makefile to add this executable path to the scripts: # /projects/compbiousr/booch/booch/scripts/ # Update Makefile with new location of pairs/singles # files, if necessary (DONE) # Make initial file of alignments make bacEnds.rdb # Try to fish out more pairs make bacEndsMiss.psl # Re-make bacEnds.rdb with new info make bacEnds.rdb # Create bacEndPairs track file make bacEndPairs.bed # Create bacEndPairsBad and bacEndPairsLong files make bacEndPairsBad.bed # Create psl file to load make bacEnds.load.psl # Create database tables ssh 
hgwdev cd /projects/hg2/booch/psl/tables hgsql mm4 < all_bacends.sql hgsql mm4 < bacEndPairs.sql hgsql mm4 < bacEndPairsBad.sql hgsql mm4 < bacEndPairsLong.sql # Load the tables cd /cluster/data/mm4/bacends hgsql -e \ 'load data local infile "bacEnds.load.psl" into table all_bacends;' mm4 hgsql -e \ 'load data local infile "bacEndPairs.bed" into table bacEndPairs;' mm4 hgsql -e \ 'load data local infile "bacEndPairsBad.bed" into table bacEndPairsBad;' \ mm4 hgsql -e \ 'load data local infile "bacEndPairsLong.bed" into table bacEndPairsLong;' \ mm4 # featureBits mm4 all_bacends # 243096171 bases of 2627444668 (9.252%) in intersection # featureBits mm3 all_bacends # 220398676 bases of 2505900260 (8.795%) in intersection # featureBits mm4 bacEndPairs # 2549945356 bases of 2627444668 (97.050%) in intersection # featureBits mm3 bacEndPairs # 2508331798 bases of 2505900260 (100.097%) in intersection # featureBits mm4 bacEndPairsBad # 1074505863 bases of 2627444668 (40.895%) in intersection # featureBits mm3 bacEndPairsBad # 547271514 bases of 2505900260 (21.839%) in intersection # featureBits mm4 bacEndPairsLong # 2833125883 bases of 2627444668 (107.828%) in intersection # featureBits mm3 bacEndPairsLong # 2500534101 bases of 2505900260 (99.786%) in intersection XXXX - running 2003-11-04 10:32 # ADD MAP CONTIGS TRACK (DONE - 2003-11-04 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm4/bed/ctgPos cd /cluster/data/mm4/bed/ctgPos # hgCtgPos uses the lift files... but mouse lift files are for the # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs # from the assembly. (In the future, we should go with the NT's!) # So... 
just for this release, go straight from the seq_contig.md # to the table def'n: contig, size, chrom, chromStart, chromEnd cat << '_EOF_' > parseSeqContig.pl #!/usr/local/bin/perl -w use strict; while (<>) { if (/^\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(N[TC]_\d+)\s+(\S+)\s+contig\s+\S+\s+\S+\s*$/i) { my $chr=$1; my $start=$2; $start -= 1; my $end=$3; my $ctg=$5; if ($chr !~ /N/ ) { print "$ctg\t" . ($end-$start) . "\tchr$chr\t$start\t$end\n"; } } } '_EOF_' chmod +x parseSeqContig.pl ./parseSeqContig.pl ../../ncbi/seq_contig.md > ctgPos.tab hgsql mm4 < ~/kent/src/hg/lib/ctgPos.sql echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql mm4 # Note: the info is there in seq_contig.md to also do the _random's, # but we'd have to do some more work: duplicate the gaps of 50000 between # contigs for all _random's except chrUn_random (1000 between). # featureBits mm4 ctgPos # 2554101163 bases of 2627444668 (97.209%) in intersection # featureBits mm3 ctgPos # 2500661074 bases of 2505900260 (99.791%) in intersection # KNOWN GENES TRACK (DONE - 2003-11-20 - Hiram) # you will probably need to make the programs in kent/src/hg/protein cd ~/kent/src/hg/protein make # The scripts run below will check for programs and let you know # which ones are missing # obtain new SwissProt database (should be done about once a month) # the swiss prot data is currently living on store5, first step is # on the fileserver. This script was used once as it was created, # it may need to be verified and improved as it is used again. See # comments at the top of the script. ssh eieio cd /cluster/data/swissprot ~/kent/src/hg/protein/mkSwissProtDB.sh # that obtains the data and unpacks it, second step is on hgwdev # to create the database ssh hgwdev cd /cluster/data/swissprot ~/kent/src/hg/protein/mkSwissProtDB.sh # Now the proteins database can be created from that. Must be on hgwdev # Again, a script that has been used once upon creation, see # comments in it. 
For example currently it is assumed these two # scripts have been run on the same day. In this case 03112 ssh hgwdev cd /cluster/data/proteins ~/kent/src/hg/protein/mkProteinsDb.sh # with those two databases existing, read for the actual known genes # track build. Must be on hgwdev since it is all mostly database # operations. The {Date} argument is the date stamp created by the # above two scripts. Something of the form YYMMDD, e.g.: 031112 # Again, a script that has been used only once at creation, see # comments at top of script. ssh hgwdev mkdir /cluster/data/mm4/bed/knownGenes cd /cluster/data/mm4/bed/knownGenes DateStamp=031112 ~/kent/src/hg/protein/KGprocess.sh ${DateStamp} # that runs to a point where it prepares data and jobList for a # cluster run. Continue with a cluster run on kk ssh kk cd /cluster/data/mm4/bed/knownGenes/kgBestMrna para create jobList para try para check para push # this is a quick cluster job. Less than five minutes. e.g.: # Completed: 43580 of 43580 jobs # CPU time in finished jobs: 114636s 1910.60m 31.84h 1.33d 0.004 y # IO & Wait Time: 111889s 1864.82m 31.08h 1.30d 0.004 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest job: 9s 0.15m 0.00h 0.00d # Submission to last job: 282s 4.70m 0.08h 0.00d # Continuing back on hgwdev, run the same script again ssh hgwdev cd /cluster/data/mm4/bed/knownGenes DateStamp=031112 ~/kent/src/hg/protein/KGprocess.sh ${DateStamp} # that should run to completion and the known genes track is ready # Add the proteins link into gdbPdb.hgcentral: hgsql -e 'INSERT INTO gdbPdb (genomeDb, proteomeDb) \ VALUES ("mm4","proteins031112");' \ -h genome-testdb hgcentraltest # CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK) # This depends on the go and uniProt databases as well as # the kgAlias and kgProAlias tables. The hgKgGetText takes # about 5 minutes when the database is not too busy. The rest # is real quick. 
ssh hgwdev cd /cluster/data/mm4/bed/ mkdir -p kgMm4/index cd kgMm4/index hgKgGetText mm4 knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/data/mm4/bed/kgMm4/index/knownGene.ix /gbdb/mm4/knownGene.ix ln -s /cluster/data/mm4/bed/kgMm4/index/knownGene.ixx /gbdb/mm4/knownGene.ixx # SET defaultDb in hgcentraltest to move from mm3 to mm4 hgsql -e 'update defaultDb set name = "mm4" where name = "mm3";' \ -h genome-testdb hgcentraltest # STS MARKERS TRACK (STARTED - 2003-11-25 - Hiram) # DONE - 2004-01-07 15:14 ! - this wasn't easy .... ssh kksilo mkdir -p /cluster/data/mm4/bed/STSmarkers/downloads cd /cluster/data/mm4/bed/STSmarkers/downloads # these files appear to be new almost every day wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # these map files appear to be old, 2002 Data wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/* # Picks up files: # 345184 Feb 20 2002 10090.MGD.txt # 173294 Jun 27 2002 10090.WI_Mouse_Genetic.txt # 240637 Jun 27 2002 10090.WI_Mouse_YAC.txt # 390088 Jun 27 2002 10090.Whitehead-MRC_RH.txt # If these files have not been changing, then no need to worry about # them. We are just picking them up to see if they have changed # since the last time we worked on this. # these reports from jax.org appear to be changing daily wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt # compare them with previous versions. 
Before this these were # in /cluster/store5/mouseMarker/orig # these newly picked up files: sum -r 10090* # 48882 338 10090.MGD.txt # 24176 381 10090.Whitehead-MRC_RH.txt # 62367 170 10090.WI_Mouse_Genetic.txt # 50616 235 10090.WI_Mouse_YAC.txt sum -r *.rpt # 21097 4161 MRK_Dump2.rpt # 15420 3139 MRK_Sequence.rpt # 41562 2233 PRB_PrimerSeq.rpt sum -r UniSTS* # 11981 8850 UniSTS.aliases # 42464 2291 UniSTS_mouse.sts # the previous copies cd /cluster/store5/mouseMarker/orig sum -r 10090* # 48882 338 10090.MGD.txt # 24176 381 10090.Whitehead-MRC_RH.txt # 62367 170 10090.WI_Mouse_Genetic.txt # 50616 235 10090.WI_Mouse_YAC.txt sum -r *.rpt # 36880 4160 MRK_Dump2.rpt # 02447 3132 MRK_Sequence.rpt # 57914 2220 PRB_PrimerSeq.rpt sum -r UniSTS* # 36201 8843 UniSTS.aliases # 58524 970 UniSTS_mouse.alias # 42464 2291 UniSTS_mouse.sts # back to our work area, update the bed file # to do this we need a new UniSTS_mouse.alias file # it is created by a combination of information from several # of the above files ! AND ! the previous stsInfoMouse.bed file # # This process has been captured in the script: # /cluster/data/mm4/bed/STSmarkers/downloads/fetchAllAliases.sh # which uses a couple of perl scripts in that same directory. 
# briefly it is: cd /cluster/data/mm4/bed/STSmarkers/downloads ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0 grep MGI: UniSTS.aliases > MGI.aliases ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \ stsInfoAliases.txt ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \ | sort -n > UniSTS_mouse.alias # with that, we can create a new stsInfoMouse.bed file: cd /cluster/data/mm4/bed/STSmarkers /cluster/store5/mouseMarker/code/updateBed.pl \ /cluster/store5/mouseMarker/stsInfoMouse.bed \ downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \ downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \ downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed # comparing to Mm3, this file was used there: # /projects/hg2/booch/psl/mm.2003.02/mm3/info/stsInfoMouseNew.bed # a wc of it shows: # 25341 339289 3572880 stsInfoMouseNew.bed # that is about half of what we have here: # 56406 786036 6425721 stsInfoMouse.bed # and from that, create new primer fa, epcr, etc: /cluster/store5/mouseMarker/code/luConvertPrimerToFa \ stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty wc mouse?.* # 0 0 0 mouseC.fa # 258307 258245 5815248 mouseP.fa # 29906 149545 1890926 mouseP.info # the equivalent Mm3 versions: # 243750 243721 5544726 musPrimer.fa == mouseP.fa # 25341 126705 1609168 mouse.primers == mouseP.info # copy the primers over to the bluearc for the kluster run cp -p mouseP.fa /cluster/bluearc/scratch/mus/mm4 cp -p mouseP.info /cluster/bluearc/scratch/mus/mm4 # CLUSTER RUN FOR THE STS PRIMERS ssh kk mkdir -p /cluster/data/mm4/bed/STSmarkers/primer mkdir -p /cluster/data/mm4/bed/STSmarkers/ePCR cd /cluster/data/mm4/bed/STSmarkers/primer # the mouseP.fa comes from above echo "/cluster/bluearc/scratch/mus/mm4/mouseP.fa" > primers.lst cat << '_EOF_' > 
template #LOOP /cluster/bin/i386/blat.2 $(path1) $(path2) -ooc=/scratch/hg/h/mouse11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl} #ENDLOOP '_EOF_' mkdir primers.out /cluster/bin/scripts/splitContigList -mouse -scratch \ /cluster/bluearc/scratch/mus/mm4/trfFa 1 /cluster/bin/i386/gensub2 contig.lst primers.lst template jobList para create jobList para try para check para push ... etc ... # Completed: 599 of 599 jobs # CPU time in finished jobs: 315430s 5257.17m 87.62h 3.65d 0.010 y # IO & Wait Time: 42867s 714.45m 11.91h 0.50d 0.001 y # Average job time: 598s 9.97m 0.17h 0.01d # Longest job: 709s 11.82m 0.20h 0.01d # Submission to last job: 729s 12.15m 0.20h 0.01d # on the file server ssh kksilo cd /cluster/data/mm4/bed/STSmarkers/primer /cluster/bin/i386/pslSort dirs primers.psl temp primers.out rmdir temp # comparing results to mm3: # /projects/hg2/booch/psl/mm.2003.02/mm3/primers/primers.psl # Mm3 wc primers.psl # 4263983 89543582 440456942 primers.psl # Mm4 wc primers.psl /cluster/data/mm4/bed/STSmarkers/primer/primers.psl # 5745617 120657896 592135728 primers.psl XXX This is quite an increase, I wonder if it is reasonable ? # another kluster run ssh kk cd /cluster/data/mm4/bed/STSmarkers/ePCR ls -1S /cluster/bluearc/scratch/mus/mm4/trfFa > contig.lst mkdir epcr.out cat << '_EOF_' > template #LOOP /cluster/bin/scripts/luRunEpcr $(path1) $(path2) epcr.out/$(num2).epcr #ENDLOOP '_EOF_' # the mouseP.info was created above echo "/cluster/bluearc/scratch/mus/mm4/mouseP.info" > epcr.lst gensub2 epcr.lst contig.lst template jobList para create jobList para try para check para push ... etc ... 
# Completed: 599 of 599 jobs # CPU time in finished jobs: 135807s 2263.45m 37.72h 1.57d 0.004 y # IO & Wait Time: 32256s 537.60m 8.96h 0.37d 0.001 y # Average job time: 281s 4.68m 0.08h 0.00d # Longest job: 353s 5.88m 0.10h 0.00d # Submission to last job: 394s 6.57m 0.11h 0.00d ssh hgwdev # all those results become all.epcr cat epcr.out/*.epcr > all.epcr # comparing results to Mm3: # /projects/hg2/booch/psl/mm.2003.02/mm3/primers/all.epcr # Mm3 wc all.epcr # 24627 98508 1329559 all.epcr # Mm4 wc all.epcr /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr # 74705 298820 3971712 all.epcr XXX This is an unusual increase ? cd /cluster/data/mm4/bed/STSmarkers/primer # create primers.psl.filter.blat and epcr.not.found # must use -rat here, not the -mouse /cluster/bin/scripts/filterPrimers -rat \ ../stsInfoMouse.bed primers.psl ../mouseP.info \ ../ePCR/all.epcr > primers.psl.filter.blat 2> filterPrimers.err # The filterPrimers.err file should show an increasing count: # Reading name info # Reading primer info # Processing file # 100000 # 200000 # 300000 # ... # 5800000 # 5900000 # Determining ePCR not found # # Mm4: wc primers.psl.filter.blat # 32729 687309 3331894 primers.psl.filter.blat # Checking vs. 
Mm3 on kks00:
# /projects/hg2/booch/psl/mm.2003.02/mm3/primers/primers.psl.filter.blat
# wc of that shows:
#	26088  547848 2612720 primers.psl.filter.blat
# create accession_info.rdb (chrM added to Terry's script for mouse)
touch empty_sequence.inf
/cluster/bin/scripts/compileAccInfo -mouse \
	/cluster/data/mm4 empty_sequence.inf
# works with one seeming error:
# cat: /cluster/data/mm4/M/chrM_random.agp: No such file or directory
mv accession_info.rdb accession_info.rdb.tmp
/cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > accession_info.rdb
rm accession_info.rdb.tmp
# comparing results to Mm3:
# Mm3 wc accession_info.rdb
#	173948 1913432 12738822 accession_info.rdb
# Mm4 wc accession_info.rdb
#	86935 956289 6374930 accession_info.rdb
#### This is an interesting difference.  The structure of the .agp files
# for Mm3 and Mm4 show a large difference in line counts:
# wc /cluster/data/mm3/?/*.agp /cluster/data/mm3/??/*.agp
#	339937 2893447 18141029 total
# wc /cluster/data/mm4/?/*.agp /cluster/data/mm4/??/*.agp
#	219652 1885501 11875772 total
# creates epcr.not.found.nomatch and epcr.not.found.psl
/cluster/bin/scripts/epcrToPsl -mouse \
	epcr.not.found ../mouseP.info \
	accession_info.rdb /cluster/data/mm4
# Comparing results to Mm3:
# Mm3 wc epcr*
#	205  820  7585 epcr.not.found
#	1   12   95 epcr.not.found.nomatch
#	204 4284 19817 epcr.not.found.psl
# Mm4 wc epcr*
#	328 1312 12011 epcr.not.found
#	57  684  5474 epcr.not.found.nomatch
#	271 5691 26224 epcr.not.found.psl
# there is a single error being propagated here from the file
# /cluster/store5/mouseMarker/stsInfoMouse.bed which has an error
# at line 53958:
62943 D2J3 91947 D2J3 CAACCAGCTCAC CAACCAGCTCAC 1825, 1025BP 0 MUS MUSCULUS
# The value '1825,' is incorrect.  Should be a small integer here.
# to work around this problem, I'm manually eliminating this problem # from the epcr.not.found.psl file where it has now become two bad # lines: # 24 0 0 0 1 1801 1 1789 + 62943 1825, 0 1825, chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, # 24 0 0 0 1 1801 1 1789 + 62943 1825, 0 1825, chr11_16 0 1115413 1117226 2 # taking those two lines out, find another two lines: # 24 0 0 0 1 951 1 994 + 64793 975, 0 975, chr5_23 0 3405277 3406295 2 12,12, 0,963, 3405277,3406283, # 24 0 0 0 1 951 1 994 + 64793 975, 0 975, chr5_23 0 3405277 3406295 2 12,12, 0,963, 3405277,3406283, # And one more line that had a 930BP in the wrong place, the last line # in this file. # combine them to one cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter # lift those primers (added chrM to this lifter script for mouse) # creates primers.psl.filter.lifted /cluster/bin/scripts/lifter -mouse -psl \ /cluster/data/mm4 primers.psl.filter # wc primers.psl.filter.lifted # 32995 692895 3488746 primers.psl.filter.lifted # create primers.psl.filter.lifted.initial PATH=/cluster/bin/scripts:$PATH \ /cluster/bin/scripts/extractPslInfo \ primers.psl.filter.lifted # wc primers.psl.filter.lifted.initial # 32983 197898 1758225 primers.psl.filter.lifted.initial # create primers.psl.filter.lifted.initial.acc /cluster/bin/scripts/findAccession -agp \ -mouse primers.psl.filter.lifted.initial /cluster/data/mm4 # wc primers.psl.filter.lifted.initial.acc # 32983 230881 2083248 primers.psl.filter.lifted.initial.acc # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse /cluster/bin/scripts/getStsId -rat \ ../stsInfoMouse.bed primers.psl.filter.lifted.initial.acc \ > primers.initial.acc.trans # wc primers.initial.acc.trans # 32983 230881 1771293 primers.initial.acc.trans sort -k 4n primers.initial.acc.trans > primers.final rm primers.psl.filter.lifted.initial.acc primers.initial.acc.trans # comparing results to Mm3: # Mm3 wc 
primers.final # 26292 184044 1253365 primers.final # wc primers.final # 32983 230881 1771293 primers.final cd /cluster/data/mm4/bed/STSmarkers # stsMarkers.final is empty for mouse touch stsMarkers.final dummy PATH=/cluster/bin/scripts:$PATH \ /cluster/bin/scripts/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb # Comparing results to Mm3 # Mm3 wc stsMarkers_pos.rdb # 25925 181475 1413458 stsMarkers_pos.rdb # Mm4 wc stsMarkers_pos.rdb # 31270 218890 1869417 stsMarkers_pos.rdb /projects/cc/hg/ytlu/bin/script/perl/createStsBed \ stsInfoMouse.bed stsMarkers_pos.rdb 500 > stsMapMouse.bed # wc stsMapMouse.bed # 27976 307388 2198948 stsMapMouse.bed # loading STS markers tables ssh hgwdev cd /cluster/data/mm4/bed/STSmarkers ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings # wc ucscStsAlias.tab # 114257 342771 2746002 ucscStsAlias.tab hgsql -e "drop table stsAlias;" mm4 hgsql mm4 < ~/kent/src/hg/lib/stsAlias.sql hgsql -e \ 'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm4 hgsql -e "drop table stsMapMouseNew;" mm4 hgsql mm4 < ~/kent/src/hg/lib/stsMapMouseNew.sql hgsql -e \ 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm4 hgsql -e "drop table stsInfoMouseNew;" mm4 hgsql mm4 < ~/kent/src/hg/lib/stsInfoMouseNew.sql hgsql -e \ 'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm4 hgLoadPsl -nobin -table=all_sts_primer mm4 primer/primers.psl.filter.lifted # load primer sequences mkdir /gbdb/mm4/stsMarker ln -s /cluster/data/mm4/bed/STSmarkers/mouseP.fa \ /gbdb/mm4/stsMarker/mouseP.fa hgLoadSeq mm4 /gbdb/mm4/stsMarker/mouseP.fa # Adding /gbdb/mm4/stsMarker/mouseP.fa # 29906 sequences # DONE - 2004-01-07 15:14 # STS Markers plots - needs to wait until stsMarkers_pos_all.rdb # is created correctly from above, also needs the three maps files ### XXXXX These are obsolete, not necessary /cluster/bin/scripts/combineAllSeqPrimer \ -rat stsInfoMouse.bed \ dummy 
primer/primers.final > stsMarkers_pos_all.rdb # can not find this result for Mm3 # Mm4 wc stsMarkers_pos_all.rdb # 26811 59539 385261 stsMarkers_pos_all.rdb mkdir /cluster/data/mm4/bed/STSmarkers/html cd /cluster/data/mm4/bed/STSmarkers/html mkdir maps plots sts # not sure how to make the three maps/*.map files /projects/compbio/usr/booch/booch/scripts/mergeTPFsts -rat \ ../primer/accession_info.rdb ../stsMarkers_pos.rdb \ maps/wi_rh.id.map 3.0 ../stsMarkers_pos_all.rdb \ /cluster/store5/mouseMarker/stsInfoMouse.bed > rh.rdb mv map.ranges rh.ranges /projects/compbio/usr/booch/booch/scripts/mergeTPFsts -rat \ ../primer/accession_info.rdb ../stsMarkers_pos.rdb \ maps/wi_gen.id.map 3.0 ../stsMarkers_pos_all.rdb \ /cluster/store5/mouseMarker/stsInfoMouse.bed > wi_gen.rdb mv map.ranges wi_gen.ranges /projects/compbio/usr/booch/booch/scripts/mergeTPFsts -rat \ ../primer/accession_info.rdb ../stsMarkers_pos.rdb \ maps/mgd_gen.id.map 3.0 ../stsMarkers_pos_all.rdb \ /cluster/store5/mouseMarker/stsInfoMouse.bed > mgd.rdb mv map.ranges mgd.ranges /projects/compbio/usr/booch/booch/scripts/combineTPFsts \ rh.rdb wi_gen.rdb mgd.rdb > sts.rdb /projects/compbio/usr/booch/booch/scripts/TPFrdb2html \ -mouse maps/wi_rh.id.map -plain ASSEM sts sts.rdb mm4 sts mv *.rdb sts/ mv *.ranges sts/ cd plots STSFLAGS="-dbl -top 0.3 -mouse -m mm4" /projects/compbio/usr/booch/booch/scripts/createUCSCplots \ ${STSFLAGS} /cluster/data/mm4 ../maps ../../stsMarkers_pos.rdb /projects/compbio/usr/booch/booch/scripts/createUCSCplots -x 10000000 \ ${STSFLAGS} /cluster/data/mm4 ../maps ../../stsMarkers_pos.rdb /projects/compbio/usr/booch/booch/scripts/createUCSCplots -x 50000000 \ ${STSFLAGS} /cluster/data/mm4 ../maps ../../stsMarkers_pos.rdb LOAD GENEID GENES (DONE - 2003-11-27 - Hiram RELOADED -gtf 2004-04-02 kate) mkdir -p /cluster/data/mm4/bed/geneid/download cd /cluster/data/mm4/bed/geneid/download foreach f (/cluster/data/mm4/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget \ 
http://genome.imim.es/genepredictions/M.musculus/mmOct2003/geneid_v1.1/$chr.gtf wget \ http://genome.imim.es/genepredictions/M.musculus/mmOct2003/geneid_v1.1/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene mm4 geneid download/*.gtf -gtf # 36974 gene predictions hgPepPred mm4 generic geneidPep download/*-fixed.prot # SGP GENE PREDICTIONS (DONE - 2003-12-30 - Hiram) mkdir -p /cluster/data/mm4/bed/sgp/download cd /cluster/data/mm4/bed/sgp/download #http://genome.imim.es/genepredictions/M.musculus/mmOct2003/SGP/humangp200307 foreach f (/cluster/data/mm4/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmOct2003/SGP/humangp200307/$chr.gtf wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmOct2003/SGP/humangp200307/$chr.prot end wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmOct2003/SGP/humangp200307/readme # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
ldHgGene mm4 sgpGene download/*.gtf -exon=CDS
# Read 43988 transcripts in 320332 lines in 21 files
# 43988 groups 21 seqs 1 sources 3 feature types
# 43988 gene predictions
hgPepPred mm4 generic sgpPep download/*-fixed.prot
# featureBits mm4 sgpGene
# 40100330 bases of 2627444668 (1.526%) in intersection
# featureBits mm3 sgpGene
# 38897454 bases of 2505900260 (1.552%) in intersection
# TWINSCAN 1.3 GENE PREDICTIONS (2003-12-15 braney)
cd /cluster/data/mm4/bed
rm -fr twinscan
mkdir twinscan.2003-12-15
ln -s twinscan.2003-12-15 twinscan
cd twinscan
tarFile=MGSCv3_v_NCBI34_11-24-03.tgz
wget http://genes.cs.wustl.edu/predictions/mouse/11-24-03/MGSCv3_v_NCBI34_11-24-03.tgz
wget http://genes.cs.wustl.edu/predictions/mouse/11-24-03/md5sum.txt
# check file transferred correctly
md5sum $tarFile | diff - md5sum.txt
tar xvfz $tarFile
unset tarFile
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y )
    echo chr$c
    perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
ldHgGene mm4 twinscan chr_gtf/chr*.gtf -gtf
hgPepPred mm4 generic twinscanPep chr_ptx/chr*-fixed.fa
#### LOAD ENSEMBL GENES (DONE - 2004-02-19 - Hiram)
# needed for Gene Sorter procedure below
# Ensembl released Mouse build 32 the week of Feb 16 2004
mkdir /cluster/data/mm4/bed/ensembl
cd /cluster/data/mm4/bed/ensembl
Get the ensembl gene data from http://www.ensembl.org/
Go to the EnsMart link
Choose Mus musculus as the organism
Follow this sequence through the pages:
Page 1) Choose the Ensembl Genes choice. Hit next.
Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
Page 3) Choose the "Structures" tab.
Page 4) Choose GTF as the output, choose gzip compression, name the
output file ensGeneMm4.gtf.gz and then hit Export
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
zcat ensGene.gtf.gz | grep -v ^6_DR51 | grep -v _NT_ > unrandom.gtf # Let's see how much it loses: # zcat ensGene.gtf.gz | wc # 542591 7596274 77247475 # wc unrandom.gtf # 525809 7361326 74690888 unrandom.gtf # Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene sed -e "s/^/chr/" unrandom.gtf > ensGene.gtf ldHgGene mm4 ensGene ensGene.gtf # Read 30665 transcripts in 525809 lines in 1 files # 30665 groups 21 seqs 1 sources 4 feature types # 30665 gene predictions # save space, gzip them: gzip unrandom.gtf gzip ensGene.gtf o - Load Ensembl peptides: Get the ensembl protein data from http://www.ensembl.org/ Go to the Martview link Choose Mus musculus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Sequences" tab. Page 4) Choose Transcripts/Proteins and peptide Only as the output, choose text/fasta and gzip compression, name the file ensGeneMm4.pep.gz and then hit export. If needed, substitute ENST for ENSP in ensPep with the program called subs edit subs.in to read: ENSP|ENST #subs -e mm4EnsGene.pep > /dev/null #delete * at end of each protein zcat ensGeneMm4.pep.gz | sed "s/\*$//" > ensembl.pep ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep hgPepPred mm4 generic ensPep fixPep_ensembl.pep o - Load ensGtp table. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" tab. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format, gzip. 
# Result name file as ensGtpMm4.tab.gz gunzip ensGtpMm4.tab.gz hgsql mm4 < ~/kent/src/hg/lib/ensGtp.sql hgsql -N -e 'load data local infile "ensGtpMm4.tab" into table ensGtp;' mm4 ### MAKE THE affyU74 TRACK - needed for the Gene Sorter # (DONE - 2003-01-06 - Hiram) # MAKE THE affyU74 TRACK using Affy consensus sequences instead of # target sequences. Recalculate alignments and load data # [DONE, hartera, 2004-02-06] ---------------------------------- # Load up semi-local disk with target sequences for Affy mouse U74 chips. ssh kkr1u00 mkdir -p /iscratch/i/affy # This /projects filesystem is not available on kkr1u00 # but it is on kk ssh kk cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy ssh kkr1u00 iSync # Run cluster job to do alignments ssh kk mkdir /cluster/data/mm4/bed/affyU74.2004-02-06 cd /cluster/data/mm4/bed/affyU74.2004-02-06 mkdir run cd run mkdir psl echo /scratch/mus/mm4/trfFa/*.fa | wordLine stdin > genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst affy.lst gsub jobList para create jobList para try # do usual para check/para push etc. until the job is done. # para time: 2004-02-06 # Completed: 1797 of 1797 jobs # CPU time in finished jobs: 13759s 229.31m 3.82h 0.16d 0.000 y # IO & Wait Time: 37615s 626.92m 10.45h 0.44d 0.001 y # Average job time: 29s 0.48m 0.01h 0.00d # Longest job: 73s 1.22m 0.02h 0.00d # Submission to last job: 176s 2.93m 0.05h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. ssh kksilo cd /cluster/data/mm4/bed/affyU74/run pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. 
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null # Processed 44630 alignments liftUp ../all_affyU74.psl ../../../jkStuff/liftAll.lft warn contig.psl # Sort by chromosome and load into database. ssh hgwdev cd /cluster/data/mm4/bed/affyU74.2004-02-06 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadPsl mm4 affyU74.psl rm -fr chrom temp run ## MAKE THE affyGnfU74 TRACKs (DONE - 2004-01-06 - Hiram) # Make bed files and load consensus sequences for Affy U74 chip set. # [DONE, hartera, 2004-02-06] ---------------------------------- #This needs to be done after affyU74 is already made. ssh hgwdev mkdir -p /cluster/data/mm4/bed/affyGnf.2004-02-06 cd /cluster/data/mm4/bed/affyGnf.2004-02-06 # may need to build this command in src/hg/affyGnf affyPslAndAtlasToBed ../affyU74.2004-02-06/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 affyPslAndAtlasToBed ../affyU74.2004-02-06/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 affyPslAndAtlasToBed ../affyU74.2004-02-06/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadBed mm4 affyGnfU74A affyGnfU74A.bed hgLoadBed mm4 affyGnfU74B affyGnfU74B.bed hgLoadBed mm4 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. 
# Copy consensus sequence to /gbdb if it isn't already # [DONE, hartera, Feb 6, 2004] # Fix broken symlinks to microarray data after directory structure changed # (DONE, 2005-05-03, hartera) mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm4 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm4 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm4 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa ### GNF ATLAS 2 [Done jk 3/29/2004] # Align probes from GNF1M chip. 
ssh kk cd /cluster/data/mm4/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run mkdir -p /cluster/bluearc/geneAtlas2 cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2 ls -1 /scratch/mus/mm4/trfFa/ > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm4/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try para check para push para time #Completed: 599 of 599 jobs #CPU time in finished jobs: 55883s 931.39m 15.52h 0.65d 0.002 y #IO & Wait Time: 1869s 31.15m 0.52h 0.02d 0.000 y #Average job time: 96s 1.61m 0.03h 0.00d #Longest job: 174s 2.90m 0.05h 0.00d #Submission to last job: 411s 6.85m 0.11h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create gnf1h.psl. pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp ../affyGnf1m.psl ../../../jkStuff/liftAll.lft warn contig.psl rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1H into database. ssh hgwdev cd /cluster/data/mm4/bed/geneAtlas2 ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes hgLoadPsl mm4 affyGnf1m.psl hgLoadSeq mm4 /gbdb/hgFixed/affyProbes/gnf1m.fa # Load up track hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \ affyGnf1m.psl # Note that the unmapped 5000 records are from all-N sequences. hgLoadBed mm4 gnfAtlas2 gnfAtlas2.bed ######## MAKING FAMILY BROWSER TABLES ####### (DONE - 2004-01-06 - Hiram) # These are instructions for building the # neighborhood browser. Don't start these until # there is a knownGene track. and the affy tracks # Cluster together various alt-splicing isoforms. 
# Creates the knownIsoforms and knownCanonical tables ssh hgwdev hgClusterGenes mm4 knownGene knownIsoforms knownCanonical # You may need to build this binary in src/hg/near/hgClusterGenes # Got 24849 clusters, from 40996 genes in 44 chromosomes # featureBits mm4 knownCanonical # 829173598 bases of 2627444668 (31.558%) in intersection # featureBits mm3 knownCanonical # 836749019 bases of 2505900260 (33.391%) in intersection # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir -p /cluster/data/mm4/bed/blastp cd /cluster/data/mm4/bed/blastp pepPredToFa mm4 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb # Copy over database to bluearc rm -fr /cluster/bluearc/mm4/blastp mkdir -p /cluster/bluearc/mm4/blastp cp /cluster/data/mm4/bed/blastp/known.* /cluster/bluearc/mm4/blastp # Load up cluster/bluearc with blastp and related files # if necessary if (! 
-e /cluster/bluearc/blast/blastall) then mkdir -p /cluster/bluearc/blast cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast mkdir -p /cluster/bluearc/blast/data cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data endif # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm4/bed/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk9 mkdir -p /cluster/data/mm4/bed/blastp/self cd /cluster/data/mm4/bed/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data export BLASTMAT /cluster/bluearc/blast/blastall -p blastp \ -d /cluster/bluearc/mm4/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~15 minutes if the cluster is free. # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 49565s 826.08m 13.77h 0.57d 0.002 y # IO & Wait Time: 20374s 339.57m 5.66h 0.24d 0.001 y # Average job time: 9s 0.15m 0.00h 0.00d # Longest job: 53s 0.88m 0.01h 0.00d # Submission to last job: 775s 12.92m 0.22h 0.01d # Load into database. This takes about an hour. ssh hgwdev cd /cluster/data/mm4/bed/blastp/self/run/out hgLoadBlastTab mm4 knownBlastTab *.tab # Scanning through 7737 files # Loading database with 7522182 rows # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) 
hgMapToGene mm4 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm4 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnf1m
# Create table that maps between known genes and RefSeq
hgMapToGene mm4 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene mm4 affyU74 knownGene knownToU74
hgMapToGene mm4 affyMOE430 knownGene knownToMOE430
hgMapToGene mm4 affyMOE430 -prefix=A: knownGene knownToMOE430A
# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm4/bed/rinnSex
cd !$
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
	../affyMOE430/affyMOE430.psl
hgLoadBed mm4 rinnSex rinnSex.bed
# Format and load the GNF data
mkdir /cluster/data/mm4/bed/affyGnf95
cd /cluster/data/mm4/bed/affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl \
	/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
	affyGnfU95.tab affyGnfU95Exps.tab -shortOut
# this .sql load was in preceding instructions, but this .sql file
# appears to not exist and it doesn't seem to be needed anyway.
# Everything below this seems to create tables OK.
# hgsql mm4 < ~/kent/src/hg/affyGnf/affyGnfU95.sql
# Create table that gives distance in expression space between
# GNF genes.  These commands take about 15 minutes each
# The affyGnfU74?Exps arguments appear to be unused in hgExpDistance
hgExpDistance mm4 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \
	-lookup=knownToU74
# Got 8900 unique elements in affyGnfU74A
hgExpDistance mm4 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \
	-lookup=knownToU74
# Got 6850 unique elements in affyGnfU74B
hgExpDistance mm4 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \
	-lookup=knownToU74
# Got 2145 unique elements in affyGnfU74C
# Make sure that GO database is up to date.  See README in /cluster/store1/geneOntology.
# Create knownToEnsembl column # XXXX do not have an ensGene table yet ! # XXXX TBD - to be done when they release Mouse build 32 gene data # OK to leave this our for now, will leave a couple of missing # columns in the family browser hgMapToGene mm4 ensGene knownGene knownToEnsembl # Make knownToCdsSnp column. This is a little complicated by # having to merge data form the snpTsc and the snpNih tracks. hgMapToGene mm4 snpNih knownGene knownToCdsSnp -all -cds # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # Create the ceBlastTab (the blastall binary only works on kk9 for now ...) ssh kk9 mkdir /cluster/data/mm4/bed/blastp/ce1 cd /cluster/data/mm4/bed/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~10 minutes if the cluster is free. # Here's the para time results # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 24219s 403.64m 6.73h 0.28d 0.001 y # IO & Wait Time: 20227s 337.12m 5.62h 0.23d 0.001 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest job: 20s 0.33m 0.01h 0.00d # Submission to last job: 494s 8.23m 0.14h 0.01d # Load into database. 
ssh hgwdev cd /cluster/data/mm4/bed/blastp/ce1/run/out hgLoadBlastTab mm4 ceBlastTab -maxPer=1 *.tab # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. See makeMm3.doc for procedure # the directory: /cluster/bluearc/hg16/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/blastp/hg16 cd /cluster/data/mm4/bed/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/hg16/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # (wordLine wouldn't run on kk9: # wordLine: /lib/i686/libc.so.6: version `GLIBC_2.3' not found # run this echo statement on hgwdev # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # takes about 15 minutes: # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 53172s 886.21m 14.77h 0.62d 0.002 y # IO & Wait Time: 20181s 336.34m 5.61h 0.23d 0.001 y # Average job time: 9s 0.16m 0.00h 0.00d # Longest job: 52s 0.87m 0.01h 0.00d # Submission to last job: 809s 13.48m 0.22h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/blastp/hg16/run/out hgLoadBlastTab mm4 hgBlastTab -maxPer=1 *.tab # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dr1/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/blastp/dr1 cd /cluster/data/mm4/bed/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/dr1/blastp/ensembl \ -i $1 -o $2 -e 0.005 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 33916s 565.26m 9.42h 0.39d 0.001 y # IO & Wait Time: 20370s 339.50m 5.66h 0.24d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest job: 29s 0.48m 0.01h 0.00d # Submission to last job: 655s 10.92m 0.18h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/blastp/dr1/run/out hgLoadBlastTab mm4 drBlastTab -maxPer=1 *.tab # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/sc1/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/blastp/sc1 cd /cluster/data/mm4/bed/blastp/sc1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/sc1/blastp/sgd \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 6732s 112.19m 1.87h 0.08d 0.000 y # IO & Wait Time: 21851s 364.19m 6.07h 0.25d 0.001 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest job: 9s 0.15m 0.00h 0.00d # Submission to last job: 324s 5.40m 0.09h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/blastp/sc1/run/out hgLoadBlastTab mm4 scBlastTab -maxPer=1 *.tab # Make Drosophila melanagaster ortholog column using blastp on FlyBase. 
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/blastp/dm1 cd /cluster/data/mm4/bed/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 27801s 463.35m 7.72h 0.32d 0.001 y # IO & Wait Time: 19866s 331.10m 5.52h 0.23d 0.001 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest job: 23s 0.38m 0.01h 0.00d # Submission to last job: 526s 8.77m 0.15h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/blastp/dm1/run/out hgLoadBlastTab mm4 dmBlastTab -maxPer=1 *.tab # CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev # Until we get an Ensmart update for Mm4, we need an empty table # here so kgKnownToSuper will function hgsql mm4 < ~/kent/src/hg/lib/ensGtp.sql zcat /cluster/store1/superFamily/genomes/ass_11-Jan-2004.tab.gz | \ hgKnownToSuper mm4 mm stdin LOAD SNPS (Done. 
Daryl Thomas; February 18, 2004) # SNP processing has been condensed into a single script, # which makes snpNih, snpTsc, and snpMap # ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh # snpBuild = 119 # Run from directory $oo/bed/snp/build$snpBuild/snpMap mkdir -p $oo/bed/snp/build$snpBuild/snpMap cd $oo/bed/snp/build$snpBuild/snpMap processSnpLocations.csh mm4 mouse 32 119 >& log & # check data: # wc -l snpTsc.bed; mm4 -e "select count(*) from snpTsc; # wc -l snpNih.bed; mm4 -e "select count(*) from snpNih; # wc -l snpMap.bed; mm4 -e "select count(*) from snpMap; # select * from snpNih limit 5; desc snpNih; show indexes from snpNih" # select * from snpTsc limit 5; desc snpTsc; show indexes from snpTsc" # select * from snpMap limit 5; desc snpMap; show indexes from snpMap" # remove temp files # rm mouse* *bed.gz # LOAD SNP DETAILS (Done. Daryl Thomas; February 18, 2004) # SNP processing has been condensed into a single script, # which makes dbSnpRsMm # ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh # snpBuild = 119 # Run from directory $oo/bed/snp/build$snpBuild/snpMap mkdir -p $oo/bed/snp/build$snpBuild/details/Done mkdir -p $oo/bed/snp/build$snpBuild/details/Observed cd $oo/bed/snp/build$snpBuild/details processSnpDetails.csh mm4 mouse 119 >& log & load data local infile "$fileBase.out" into table $database.$table gzip $fileBase.out # check data: # hgFixed -e "select count(*) from dbSnpRsMm; # select * from dbSnpRSMm limit 5; desc dbSnpRsMm; show indexes from dbSnpRSMm" # remove temp files # rm dbSnpRs* # KNOWN GENE UPDATE (DONE - 2004-02-19 - Hiram) # using proteins and swissProt as created for Hg16 recently ssh hgwdev mkdir /cluster/data/kgDB/bed/kgMm4 cd /cluster/data/kgDB/bed/kgMm4 ~/kent/src/hg/protein/KGprocess.sh kgMm4 mm4 040115 # that runs to the first cluster run, then on kk ssh kk cd /cluster/data/kgDB/bed/kgMm4/kgBestMrna para create jobList para try para push ... etc ... 
# Completed: 43352 of 43352 jobs # CPU time in finished jobs: 118572s 1976.21m 32.94h 1.37d 0.004 y # IO & Wait Time: 119951s 1999.18m 33.32h 1.39d 0.004 y # Average job time: 6s 0.09m 0.00h 0.00d # Longest job: 25s 0.42m 0.01h 0.00d # Submission to last job: 307s 5.12m 0.09h 0.00d # Continuing back on hgwdev ssh hgwdev cd /cluster/data/kgDB/bed/kgMm4 ~/kent/src/hg/protein/KGprocess.sh kgMm4 mm4 040115 # that runs until it prepares the second cluster run, then ssh kk cd /cluster/data/kgDB/bed/kgMm4/kgProtMap para create jobList para try para push ... etc # Completed: 4944 of 4944 jobs # CPU time in finished jobs: 1113456s 18557.60m 309.29h 12.89d 0.035 y # IO & Wait Time: 192756s 3212.60m 53.54h 2.23d 0.006 y # Average job time: 264s 4.40m 0.07h 0.00d # Longest job: 1269s 21.15m 0.35h 0.01d # Submission to last job: 1777s 29.62m 0.49h 0.02d # Continuing back on hgwdev ssh hgwdev cd /cluster/data/kgDB/bed/kgMm4 ~/kent/src/hg/protein/KGprocess.sh kgMm4 mm4 040115 # That runs to completion # existing tables were backed up into kgMm4_2003_11: # affyGnfU74ADistance affyGnfU74BDistance affyGnfU74CDistance # ceBlastTab cgapAlias cgapBiocDesc cgapBiocPathway dmBlastTab # drBlastTab dupSpMrna hgBlastTab keggMapDesc keggPathway # kgAlias kgProtAlias kgXref knownBlastTab knownCanonical knownGene # knownGeneLink knownGeneMrna knownGenePep knownIsoforms knownToCdsSnp # knownToRefSeq knownToU74 mrnaRefseq scBlastTab spMrna # until QA authorizes this build, then delete this table # rename the new tables into the mm4 database: cat << '_EOF_' > rename.sh #!/bin/sh for T in cgapAlias cgapBiocDesc cgapBiocPathway \ dupSpMrna keggMapDesc keggPathway kgAlias kgProtAlias \ kgProtMap kgXref knownGene knownGeneLink knownGeneMrna \ knownGenePep mrnaRefseq spMrna do hgsql -e "drop table ${T};" mm4 hgsql -e "rename table kgMm4.${T} to mm4.${T};" mysql echo $T done '_EOF_' chmod +x rename.sh ./rename.sh # continue with the FAMILY BROWSER UPDATE to complete this process ######## FAMILY BROWSER 
UPDATE ####### (DONE - 2004-02-19 - Hiram) # These are instructions for building the # neighborhood browser. Don't start these until # there is a knownGene track and the affy tracks # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/mm4/bed/famBro.2004-02-19 ln -s /cluster/data/mm4/bed/famBro.2004-02-19 /cluster/data/mm4/bed/famBro cd /cluster/data/mm4/bed/famBro hgClusterGenes mm4 knownGene knownIsoforms knownCanonical # Got 24894 clusters, from 41225 genes in 44 chromosomes # You may need to build this binary in src/hg/near/hgClusterGenes # featureBits mm4 knownCanonical # 830993563 bases of 2627444668 (31.627%) in intersection # previously was # 829173598 bases of 2627444668 (31.558%) in intersection # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/mm4/bed/famBro/blastp cd /cluster/data/mm4/bed/famBro/blastp pepPredToFa mm4 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa formatdb -i known.faa -t known -n known /scratch/blast/formatdb -i known.faa -t known -n known # Copy over database to bluearc rm -fr /cluster/bluearc/mm4/blastp mkdir /cluster/bluearc/mm4/blastp cp -p /cluster/data/mm4/bed/famBro/blastp/known.* /cluster/bluearc/mm4/blastp # Load up cluster/bluearc with blastp and related files # if necessary if (! 
-e /cluster/bluearc/blast/blastall) then mkdir -p /cluster/bluearc/blast cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast mkdir -p /cluster/bluearc/blast/data cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data endif # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm4/bed/famBro/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/self cd /cluster/data/mm4/bed/famBro/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data export BLASTMAT /scratch/blast/blastall -p blastp \ -d /cluster/bluearc/mm4/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~5 minutes if the cluster is free. # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 104031s 1733.85m 28.90h 1.20d 0.003 y # IO & Wait Time: 74885s 1248.08m 20.80h 0.87d 0.002 y # Average job time: 23s 0.39m 0.01h 0.00d # Longest job: 188s 3.13m 0.05h 0.00d # Submission to last job: 253s 4.22m 0.07h 0.00d # Load into database. This takes about 15 minutes. 
ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/self/run/out time hgLoadBlastTab mm4 knownBlastTab *.tab # Scanning through 7737 files # Loading database with 7707618 rows # real 15m43.413s # user 2m57.190s # sys 0m20.330s # Create table that maps between known genes and RefSeq hgMapToGene mm4 refGene knownGene knownToRefSeq # row count goes from 29675 to 29958 # Create a table that maps between known genes and # the nice affy expression data. # fixup problem with names cd /cluster/data/mm4/bed/famBro mkdir fixupAffyNames cd fixupAffyNames hgsqldump --all -c mm4 affyGnfU74A > affyGnfU74A.sql.orig hgsqldump --all -c mm4 affyGnfU74B > affyGnfU74B.sql.orig hgsqldump --all -c mm4 affyGnfU74C > affyGnfU74C.sql.orig sed -e "s/U74Av2://" affyGnfU74A.sql.orig > affyGnfU74A.sql sed -e "s/U74Bv2://" affyGnfU74B.sql.orig > affyGnfU74B.sql sed -e "s/U74Cv2://" affyGnfU74C.sql.orig > affyGnfU74C.sql hgsql -e "rename table affyGnfU74A to affyGnfU74AlongName;" mm4 hgsql -e "rename table affyGnfU74B to affyGnfU74BlongName;" mm4 hgsql -e "rename table affyGnfU74C to affyGnfU74ClongName;" mm4 hgsql mm4 < affyGnfU74A.sql hgsql mm4 < affyGnfU74B.sql hgsql mm4 < affyGnfU74C.sql hgsqldump --all -c mm4 affyU74 > affyU74.sql.orig sed -e "s/U74Av2://; s/U74Bv2://; s/U74Cv2://; s/;'/'/" affyU74.sql.orig > \ affyU74.sql hgsql -e "rename table affyU74 to affyU74longName;" mm4 hgsql mm4 < affyU74.sql hgMapToGene mm4 affyU74 knownGene knownToU74 # row count went from 18831 to 25613 # I believe this GNF business is taken care of elsewhere, or at # least it only needs to be done once. # Format and load the GNF data # mkdir /cluster/data/mm4/bed/affyGnf95 # cd /cluster/data/mm4/bed/affyGnf95 # affyPslAndAtlasToBed -newType ../affyU95.psl \ # /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ # affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. 
# Everything below this seems to create tables OK. # hgsql mm4 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. The first one takes about 20 minutes, the other two 5 and 1 # The affyGnfU74?Exps arguments appear to be unused in hgExpDistance # runs most efficiently in /tmp since it makes large temporary files cd /tmp hgExpDistance mm4 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \ -lookup=knownToU74 # Got 13450 unique elements in affyGnfU74A hgExpDistance mm4 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \ -lookup=knownToU74 # Got 8314 unique elements in affyGnfU74B hgExpDistance mm4 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \ -lookup=knownToU74 # Got 2288 unique elements in affyGnfU74C # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. # Create knownToEnsembl column cd /cluster/data/mm4/bed/famBro hgMapToGene mm4 ensGene knownGene knownToEnsembl # creates 22975 rows in knownToEnsembl # Make knownToCdsSnp column. hgMapToGene mm4 snpNih knownGene knownToCdsSnp -all -cds # row count went from 11832 to 11874 # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # Create the ceBlastTab (the blastall binary only works on kk9 for now ...) 
ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/ce1 cd /cluster/data/mm4/bed/famBro/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in 4 minutes if the cluster is free. # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 51883s 864.71m 14.41h 0.60d 0.002 y # IO & Wait Time: 66670s 1111.17m 18.52h 0.77d 0.002 y # Average job time: 15s 0.26m 0.00h 0.00d # Longest job: 60s 1.00m 0.02h 0.00d # Submission to last job: 217s 3.62m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/ce1/run/out hgLoadBlastTab mm4 ceBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 24776 rows # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. 
See makeMm3.doc for procedure # the directory: /cluster/bluearc/hg16/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/hg16 cd /cluster/data/mm4/bed/famBro/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/hg16/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # takes about 10 minutes: # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 118896s 1981.60m 33.03h 1.38d 0.004 y # IO & Wait Time: 47306s 788.43m 13.14h 0.55d 0.002 y # Average job time: 21s 0.36m 0.01h 0.00d # Longest job: 171s 2.85m 0.05h 0.00d # Submission to last job: 520s 8.67m 0.14h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/hg16/run/out hgLoadBlastTab mm4 hgBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 32908 rows # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dr1/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/famBro/blastp/dr1 cd /cluster/data/mm4/bed/famBro/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dr1/blastp/ensembl \ -i $1 -o $2 -e 0.005 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 72793s 1213.21m 20.22h 0.84d 0.002 y # IO & Wait Time: 30073s 501.22m 8.35h 0.35d 0.001 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest job: 82s 1.37m 0.02h 0.00d # Submission to last job: 367s 6.12m 0.10h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/dr1/run/out hgLoadBlastTab mm4 drBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 29369 rows # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/sc1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/sc1 cd /cluster/data/mm4/bed/famBro/blastp/sc1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/sc1/blastp/sgd \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 15306s 255.11m 4.25h 0.18d 0.000 y # IO & Wait Time: 22711s 378.51m 6.31h 0.26d 0.001 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest job: 17s 0.28m 0.00h 0.00d # Submission to last job: 160s 2.67m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/sc1/run/out hgLoadBlastTab mm4 scBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 15709 rows # Make Drosophila melanagaster ortholog column using blastp on FlyBase. 
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/dm1 cd /cluster/data/mm4/bed/famBro/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 59235s 987.24m 16.45h 0.69d 0.002 y # IO & Wait Time: 23669s 394.49m 6.57h 0.27d 0.001 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest job: 53s 0.88m 0.01h 0.00d # Submission to last job: 283s 4.72m 0.08h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/dm1/run/out hgLoadBlastTab mm4 dmBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 26213 rows # CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. 
Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev zcat /cluster/store1/superFamily/genomes/ass_11-Jan-2004.tab.gz | \ hgKnownToSuper mm4 mm stdin # creates 27319 rows # Update the proteins link into gdbPdb.hgcentraltest: hgsql \ -e 'update gdbPdb set proteomeDb = "proteins040115" where genomeDb = "mm4";' \ -h genome-testdb hgcentraltest # miRNA track (UPDATED - 2004-05-04 - Hiram) # (first version done 2004-03-02) # data from: Sam Griffiths-Jones # and Michel.Weber@ibcg.biotoul.fr # notify them if this assembly updates to renew this track # for the update, save the previous version cd /cluster/data/mm4/bed mv miRNA miRNA.2004_03_02 mkdir miRNA cd miRNA wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/mmu_ncbi32.*" grep -v "^track " mmu_ncbi32.bed | sed -e "s/ /\t/g" > mm4.bed # check existing track before update # featureBits mm4 miRNA # 17260 bases of 2627444668 (0.001%) in intersection hgLoadBed mm4 miRNA mm4.bed # entry in trackDb/trackDb.ra already there # and verify similar numbers after: # featureBits mm4 miRNA # 17782 bases of 2627444668 (0.001%) in intersection # gc5Base wiggle TRACK (DONE - 2004-04-07 - Hiram) # Perform a gc count with a 5 base # window. Also compute a "zoomed" view for display efficiency. mkdir /cluster/data/mm4/bed/gc5Base cd /cluster/data/mm4/bed/gc5Base # in the script below, the 'grep -w GC' selects the lines of # output from hgGcPercent that are real data and not just some # information from hgGcPercent. The awk computes the number # of bases that hgGcPercent claimed it measured, which is not # necessarily always 5 if it ran into gaps, and then the division # by 10.0 scales down the numbers from hgGcPercent to the range # [0-100]. Two columns come out of the awk print statement: # and which are fed into wigAsciiToBinary through # the pipe. It is set at a dataSpan of 5 because each value # represents the measurement over five bases beginning with # . 
The result files end up in ./wigData5. cat << '_EOF_' > runGcPercent.sh #!/bin/sh mkdir -p wigData5 mkdir -p dataLimits5 for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 mm4 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \ wigAsciiToBinary \ -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \ -name=${C} stdin 2> dataLimits5/${c} echo "done" done '_EOF_' chmod +x runGcPercent.sh # This is going to take perhaps two hours to run. It is a lot of # data. make sure you do it on the fileserver: ssh eieio cd /cluster/data/mm4/bed/gc5Base time ./runGcPercent.sh > run.out 2>&1 # real 105m11.151s # user 153m40.050s # sys 14m57.570s # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/mm4/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm4/wib/gc5Base mm4 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/mm4/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/mm4/wib/gc5Base # to speed up display for whole chromosome views, compute a "zoomed" # view and load that on top of the existing table. The savings # comes from the number of data table rows the browser needs to load # for a full chromosome view. Without the zoomed view there are # over 43,000 data rows for chrom 1. With the zoomed view there are # only 222 rows needed for the display. If your original data was # at 1 value per base the savings would be even greater. # Pretty much the same data calculation # situation as above, although this time note the use of the # 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average # together the data points coming out of the awk print statment over # a span of 1000 bases. Thus each coming out of wigZoom # will represent the measurement of GC in the next 1000 bases. Note # the use of -dataSpan=1000 on the wigAsciiToBinary to account for # this type of data. 
You want your dataSpan here to be an exact # multiple of your original dataSpan (5*200=1000) and on the order # of at least 1000, doesn't need to go too high. For data that is # originally at 1 base per value, a convenient span is: -dataSpan=1024 # A new set of result files ends up in ./wigData5_1K/*.wi[gb] cat << '_EOF_' > runZoom.sh #!/bin/sh mkdir -p wigData5_1K mkdir -p dataLimits5_1K for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 mm4 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \ -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \ -name=${C} stdin 2> dataLimits5_1K/${c} echo "done" done '_EOF_' chmod +x runZoom.sh # This is going to take even longer than above, certainly do this # on the fileserver ssh eieio time ./runZoom.sh > zoom.out 2>&1 # real 99m1.959s # user 151m1.660s # sys 11m11.070s # Then load these .wig files into the same database as above ssh hgwdev hgLoadWiggle -pathPrefix=/gbdb/mm4/wib/gc5Base -oldTable mm4 gc5Base \ wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm4/wib/gc5Base # MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2004-04-19, hartera) mkdir -p /projects/compbio/data/microarray/affyMouse # Download MOE430A and MOE430B consensus sequences from Affymetrix web site # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430 unzip MOE430*_consensus.zip # check for duplicate probes: there are none, all have unique names # check for duplicate probes: 100 from 136745_at to 1367551_a_at # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa cp 
/projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ /cluster/bluearc/affy/ # Set up cluster job to align MOE430 consensus sequences to mm4 ssh kkr1u00 cd /cluster/data/mm4/bed mkdir -p affyMOE430 cd affyMOE430 mkdir -p /iscratch/i/affy cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy iSync ssh kk cd /cluster/data/mm4/bed/affyMOE430 ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst ls -1 /scratch/mus/mm4/trfFa/ > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm4/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # para time # Completed: 599 of 599 jobs # CPU time in finished jobs: 25242s 420.71m 7.01h 0.29d 0.001 y # IO & Wait Time: 5969s 99.48m 1.66h 0.07d 0.000 y # Average job time: 52s 0.87m 0.01h 0.00d # Longest job: 102s 1.70m 0.03h 0.00d # Submission to last job: 6379s 106.32m 1.77h 0.07d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyRAE230.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. # low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyMOE430.psl ../../jkStuff/liftAll.lft warn contig.psl # Load alignments and sequences into database ssh hgwdev # shorten names in psl file sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak mv affyMOE430.psl.bak affyMOE430.psl # load track into database hgLoadPsl mm4 affyMOE430.psl # 1 warning on loading: Blat error so that 1449824_at has a # negative entry (-195) in the qBaseInsert field. # Loading into the database forces this to 0. 
# Add consensus sequences for MOE430
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
      /gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=MOE430 mm4 /gbdb/hgFixed/affyProbes/MOE430_all.fa
# Clean up
rm batch.bak contig.psl raw.psl
# add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
# add affyMOE430.html file and then do make alpha to add to trackDb table
# Add a knownToMOE430 table for purposes of mapping probesets
# to known Genes
hgMapToGene mm4 affyMOE430 knownGene knownToMOE430

# ####### FAMILY BROWSER UPDATE ####### (DONE - 2004-04-01 - Fan)
# This update is to complement the recent KG update based on
# SWISS-PROT data release of 3/15/04.
# These are instructions for updating the
# neighborhood browser. Don't start these until
# there is a knownGene track and the affy tracks

# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/mm4/bed/famBro.2004-04-01
rm /cluster/data/mm4/bed/famBro
ln -s /cluster/data/mm4/bed/famBro.2004-04-01 /cluster/data/mm4/bed/famBro
cd /cluster/data/mm4/bed/famBro
hgClusterGenes mm4 knownGene knownIsoforms knownCanonical
# Got 25004 clusters, from 41693 genes in 44 chromosomes
# row count on knownIsoforms is 41669
# row count on knownCanonical is 25004

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/mm4/bed/famBro/blastp cd /cluster/data/mm4/bed/famBro/blastp pepPredToFa mm4 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa # # expect to find /scratch/blast loaded with all required blast files /scratch/blast/formatdb -i known.faa -t known -n known # Copy over database to bluearc rm -fr /cluster/bluearc/mm4/blastp mkdir /cluster/bluearc/mm4/blastp cp -p /cluster/data/mm4/bed/famBro/blastp/known.* /cluster/bluearc/mm4/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm4/bed/famBro/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/self cd /cluster/data/mm4/bed/famBro/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \ -d /cluster/bluearc/mm4/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # This should finish in ~5 minutes # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 108579s 1809.65m 30.16h 1.26d 0.003 y # IO & Wait Time: 76232s 1270.54m 21.18h 0.88d 0.002 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 188s 3.13m 0.05h 0.00d # Submission to last job: 254s 4.23m 0.07h 0.00d # Load into database. This takes about 15 minutes. 
ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/self/run/out time hgLoadBlastTab mm4 knownBlastTab *.tab # Scanning through 7738 files # Loading database with 8019510 rows # 184.360u 28.660s 17:01.40 20.8% 0+0k 0+0io 202pf+0w # Create table that maps between known genes and RefSeq hgMapToGene mm4 refGene knownGene knownToRefSeq # row count goes from 27744 to 30116 # Create a table that maps between known genes and # the nice affy expression data. # fixup problem with names cd /cluster/data/mm4/bed/famBro hgMapToGene mm4 affyU74 knownGene knownToU74 # row count went to 25944 # I believe this GNF business is taken care of elsewhere, or at # least it only needs to be done once. # Format and load the GNF data # mkdir /cluster/data/mm4/bed/affyGnf95 # cd /cluster/data/mm4/bed/affyGnf95 # affyPslAndAtlasToBed -newType ../affyU95.psl \ # /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ # affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm4 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. The first one takes about 20 minutes, the other two 5 and 1 # The affyGnfU74?Exps arguments appear to be unused in hgExpDistance # runs most efficiently in /tmp since it makes large temporary files cd /tmp hgExpDistance mm4 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \ -lookup=knownToU74 # Got 13563 unique elements in affyGnfU74A # row count went to 13563000 hgExpDistance mm4 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \ -lookup=knownToU74 # Got 8473 unique elements in affyGnfU74B # row count went to 8473000 hgExpDistance mm4 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \ -lookup=knownToU74 # Got 2338 unique elements in affyGnfU74C # row count went to 2338000 # Make sure that GO database is up to date. 
See README in /cluster/store1/geneOntology. # Create knownToEnsembl column cd /cluster/data/mm4/bed/famBro hgMapToGene mm4 ensGene knownGene knownToEnsembl # row count went to 34357 # Make knownToCdsSnp column. hgMapToGene mm4 snpMap knownGene knownToCdsSnp -all -cds # row count went to 12061 # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm4.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # Create the ceBlastTab ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/ce1 cd /cluster/data/mm4/bed/famBro/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 52727s 878.78m 14.65h 0.61d 0.002 y # IO & Wait Time: 54604s 910.07m 15.17h 0.63d 0.002 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest job: 62s 1.03m 0.02h 0.00d # Submission to last job: 133s 2.22m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/ce1/run/out hgLoadBlastTab mm4 ceBlastTab -maxPer=1 *.tab # Scanning through 7738 files # Loading database with 25102 rows # row count went to 25102 # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. 
See makeMm4.doc for procedure # the directory: /cluster/bluearc/hg16/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/hg16 cd /cluster/data/mm4/bed/famBro/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/hg16/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 121081s 2018.02m 33.63h 1.40d 0.004 y # IO & Wait Time: 85030s 1417.17m 23.62h 0.98d 0.003 y # Average job time: 27s 0.44m 0.01h 0.00d # Longest job: 201s 3.35m 0.06h 0.00d # Submission to last job: 266s 4.43m 0.07h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/hg16/run/out hgLoadBlastTab mm4 hgBlastTab -maxPer=1 *.tab # Scanning through 7738 files # Loading database with 33319 rows # row count went to 33319 # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm4.doc for procedure # the directory: /cluster/bluearc/dr1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/dr1 cd /cluster/data/mm4/bed/famBro/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dr1/blastp/ensembl \ -i $1 -o $2 -e 0.005 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 74098s 1234.96m 20.58h 0.86d 0.002 y # IO & Wait Time: 65822s 1097.04m 18.28h 0.76d 0.002 y # Average job time: 18s 0.30m 0.01h 0.00d # Longest job: 105s 1.75m 0.03h 0.00d # Submission to last job: 176s 2.93m 0.05h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/dr1/run/out hgLoadBlastTab mm4 drBlastTab -maxPer=1 *.tab Scanning through 7738 files Loading database with 29757 rows # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm4.doc for procedure # the directory: /cluster/bluearc/sc1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/sc1 cd /cluster/data/mm4/bed/famBro/blastp/sc1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/sc1/blastp/sgd \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 15489s 258.14m 4.30h 0.18d 0.000 y # IO & Wait Time: 27686s 461.44m 7.69h 0.32d 0.001 y # Average job time: 6s 0.09m 0.00h 0.00d # Longest job: 19s 0.32m 0.01h 0.00d # Submission to last job: 54s 0.90m 0.01h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/sc1/run/out hgLoadBlastTab mm4 scBlastTab -maxPer=1 *.tab # Scanning through 7738 files # Loading database with 15945 rows # Make Drosophila melanagaster ortholog column using blastp on FlyBase. 
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm4.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/dm1 cd /cluster/data/mm4/bed/famBro/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 60361s 1006.02m 16.77h 0.70d 0.002 y # IO & Wait Time: 52628s 877.13m 14.62h 0.61d 0.002 y # Average job time: 15s 0.24m 0.00h 0.00d # Longest job: 77s 1.28m 0.02h 0.00d # Submission to last job: 142s 2.37m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/dm1/run/out hgLoadBlastTab mm4 dmBlastTab -maxPer=1 *.tab # Scanning through 7738 files # Loading database with 26563 rows # CREATING KNOWNTOSUPER (which enables Superfamily stuff in hgNear/hgGene) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. 
Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev zcat /cluster/store1/superFamily/040321/ass_28-Mar-2004.tab.gz | \ hgKnownToSuper mm4 mm stdin # 27405 records output # Update the proteins link into gdbPdb.hgcentraltest: hgsql \ -e 'update gdbPdb set proteomeDb = "proteins040315" where genomeDb = "mm4";' \ -h genome-testdb hgcentraltest # Perform preliminary review, using the Genome Browser and Gene Sorter for mm4. # After successful review, notify QA to start formal review. # Create table that maps between known genes and LocusLink (DONE 5/25/04 Fan) hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm4 \ > refToLl.txt hgMapToGene mm4 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 30313 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm4 knownGene name proteinID Pfam knownToPfam # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm4 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' ######## PROTEOME BROWSER BUILD ####### (REBUILT - 2004-04-19 - Fan) # These are instructions for building or rebuilding tables # needed for the Proteome Browser. 
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE BUILT (OR REBUILT) o Create the working directory mkdir /cluster/store6/mm4/bed/pb cd /cluster/store6/mm4/bed/pb o save old PB tables (if they exist) to the backup DB, kgMm4Sav3 From mysql prompt: alter table mm4.pepCCntDist rename as kgMm4Sav3.pepCCntDist; alter table mm4.pepExonCntDist rename as kgMm4Sav3.pepExonCntDist; alter table mm4.pepHydroDist rename as kgMm4Sav3.pepHydroDist; alter table mm4.pepIPCntDist rename as kgMm4Sav3.pepIPCntDist; alter table mm4.pepMolWtDist rename as kgMm4Sav3.pepMolWtDist; alter table mm4.pepMwAa rename as kgMm4Sav3.pepMwAa; alter table mm4.pepPi rename as kgMm4Sav3.pepPi; alter table mm4.pepPiDist rename as kgMm4Sav3.pepPiDist; alter table mm4.pepResDist rename as kgMm4Sav3.pepResDist; alter table mm4.pbAaDistA rename as kgMm4Sav3.pbAaDistA; alter table mm4.pbAaDistC rename as kgMm4Sav3.pbAaDistC; alter table mm4.pbAaDistD rename as kgMm4Sav3.pbAaDistD; alter table mm4.pbAaDistE rename as kgMm4Sav3.pbAaDistE; alter table mm4.pbAaDistF rename as kgMm4Sav3.pbAaDistF; alter table mm4.pbAaDistG rename as kgMm4Sav3.pbAaDistG; alter table mm4.pbAaDistH rename as kgMm4Sav3.pbAaDistH; alter table mm4.pbAaDistI rename as kgMm4Sav3.pbAaDistI; alter table mm4.pbAaDistK rename as kgMm4Sav3.pbAaDistK; alter table mm4.pbAaDistL rename as kgMm4Sav3.pbAaDistL; alter table mm4.pbAaDistM rename as kgMm4Sav3.pbAaDistM; alter table mm4.pbAaDistN rename as kgMm4Sav3.pbAaDistN; alter table mm4.pbAaDistP rename as kgMm4Sav3.pbAaDistP; alter table mm4.pbAaDistQ rename as kgMm4Sav3.pbAaDistQ; alter table mm4.pbAaDistR rename as kgMm4Sav3.pbAaDistR; alter table mm4.pbAaDistS rename as kgMm4Sav3.pbAaDistS; alter table mm4.pbAaDistT rename as kgMm4Sav3.pbAaDistT; alter table mm4.pbAaDistV rename as kgMm4Sav3.pbAaDistV; alter table mm4.pbAaDistW rename as kgMm4Sav3.pbAaDistW; alter table mm4.pbAaDistY rename as kgMm4Sav3.pbAaDistY; alter table mm4.pbAnomLimit rename as 
kgMm4Sav3.pbAnomLimit; alter table mm4.pbResAvgStd rename as kgMm4Sav3.pbResAvgStd; alter table mm4.pbStamp rename as kgMm4Sav3.pbStamp; o Define pep* tables in mm4 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql First edit out pepPred table definition, then hgsql mm4 < pepAll.sql o Build the pepMwAa table hgsql proteins040315 -e "select info.acc, molWeight, aaSize from sp040315.info, sp040315.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql mm4 < protAcc.lis pbCalPi protAcc.lis sp040315 pepPi.tab hgsql mm4 <pbAaAll.sql hgsql mm4 < pbAaAll.sql From mysql prompt: use mm4; load data local infile "pbAaDistW.tab" into table mm4.pbAaDistW; load data local infile "pbAaDistC.tab" into table mm4.pbAaDistC; load data local infile "pbAaDistM.tab" into table mm4.pbAaDistM; load data local infile "pbAaDistH.tab" into table mm4.pbAaDistH; load data local infile "pbAaDistY.tab" into table mm4.pbAaDistY; load data local infile "pbAaDistN.tab" into table mm4.pbAaDistN; load data local infile "pbAaDistF.tab" into table mm4.pbAaDistF; load data local infile "pbAaDistI.tab" into table mm4.pbAaDistI; load data local infile "pbAaDistD.tab" into table mm4.pbAaDistD; load data local infile "pbAaDistQ.tab" into table mm4.pbAaDistQ; load data local infile "pbAaDistK.tab" into table mm4.pbAaDistK; load data local infile "pbAaDistR.tab" into table mm4.pbAaDistR; load data local infile "pbAaDistT.tab" into table mm4.pbAaDistT; load data local infile "pbAaDistV.tab" into table mm4.pbAaDistV; load data local infile "pbAaDistP.tab" into table mm4.pbAaDistP; load data local infile "pbAaDistG.tab" into table mm4.pbAaDistG; load data local infile "pbAaDistE.tab" into table mm4.pbAaDistE; load data local infile "pbAaDistA.tab" into table mm4.pbAaDistA; load data local infile "pbAaDistL.tab" into table mm4.pbAaDistL; load data local infile "pbAaDistS.tab" into table mm4.pbAaDistS; o Create pbAnomLimit and pbResAvgStd tables hgsql mm4 < 
~/src/hg/lib/pbAnomLimit.sql
hgsql mm4 < ~/src/hg/lib/pbResAvgStd.sql
hgsql mm4 -e 'load data local infile "pbResAvgStd.tab" into table mm4.pbResAvgStd;'
hgsql mm4 -e 'load data local infile "pbAnomLimit.tab" into table mm4.pbAnomLimit;'

o Download data files from Ensembl and create ensGeneXref and ensTranscript tables

mkdir /cluster/bluearc/fan/ensembl/19_32aM
cd /cluster/bluearc/fan/ensembl/19_32aM
wget --timestamping \
"ftp://ftp.ensembl.org/pub/mouse-19.32a/data/mysql/mus_musculus_lite_19_32a/gene_xref.txt.table.gz"
wget --timestamping \
"ftp://ftp.ensembl.org/pub/mouse-19.32a/data/mysql/mus_musculus_lite_19_32a/transcript.txt.table.gz"
wget --timestamping \
"ftp://ftp.ensembl.org/pub/mouse-19.32a/data/mysql/mus_musculus_lite_19_32a/mus_musculus_lite_19_32a.sql.gz"
gzip -d *.gz

If mm4.ensGeneXref table already exists
alter table mm4.ensGeneXref rename as kgMm4Sav3.ensGeneXref;

ENSEMBL IS KNOWN FOR CHANGING THEIR TABLE DEFINITION AND CONTENT!!!
So, first check if the table definition of gene_xref in
mus_musculus_lite_19_32a.sql is the same as ~/src/hg/lib/ensGeneXref.sql.
If they are not the same, some programming change to hgTracks.c and possibly
other programs may be needed.

If they are the same, create ensGeneXref table in mm4 by:

hgsql mm4 < ~/src/hg/lib/ensGeneXref.sql
hgsql mm4 -e \
'load data local infile "gene_xref.txt.table" into table mm4.ensGeneXref;'

If mm4.ensTranscript exists
alter table mm4.ensTranscript rename as kgMm4Sav3.ensTranscript;

Again, check if the table definition of transcript in
mus_musculus_lite_19_32a.sql is the same as ~/src/hg/lib/ensTranscript.sql.
If they are not the same, some programming change to hgTracks.c and possibly
other programs may be needed.
If they are the same, create ensTranscript table in mm4 by:
hgsql mm4 < ~/src/hg/lib/ensTranscript.sql
hgsql mm4 -e 'load data local infile "transcript.txt.table" into table mm4.ensTranscript;'

o Download Superfamily data files and build the Superfamily DB

mkdir /cluster/store1/superFamily/040321
cd /cluster/store1/superFamily/040321
ftp over the following two files:
ass_28-Mar-2004.tab.gz
supfam_21-Mar-2004.sql.gz
hgsql mm4 -e "create database superfam040321"
hgsql superfam040321 < supfam_21-Mar-2004.sql
Make sure to add an index on id of the des table of superfam040321.
hgsql superfam040321 < ~/src/hg/lib/sfAssign.sql
hgsql superfam040321 -e 'load data local infile "ass_28-Mar-2004.tab" into table superfam040321.sfAssign;'

o Build or rebuild Superfamily track and create sf tables needed for PB

hgsql mm4 < ~/src/hg/lib/sfAssign.sql
cd /cluster/store1/superFamily/040321
hgsql mm4 -e 'load data local infile "ass_28-Mar-2004.tab" into table mm4.sfAssign;'

If mm4.sfDes already exists,
alter table mm4.sfDes rename as kgMm4Sav3.sfDes;

hgsql superfam040321 -e "select * from des" >sfDes.tab
hgsql mm4 < ~/src/hg/lib/sfDes.sql
hgsql mm4 -e 'load data local infile "sfDes.tab" into table mm4.sfDes ignore 1 lines;'

If mm4.superfamily already exists
alter table mm4.superfamily rename as kgMm4Sav3.superfamily;

hgSuperfam mm4 > sf.log
It is normal that many proteins do not have corresponding Superfamily entries.

If mm4.sfDescription exists,
alter table mm4.sfDescription rename as kgMm4Sav3.sfDescription;

hgsql mm4 < ~/src/hg/lib/sfDescription.sql
hgsql mm4 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm4.sfDescription;'

Finally, load the superfamily table.
hgLoadBed mm4 superfamily superfamily.tab -tab

o Create pbStamp table for PB
hgsql mm4 < ~/src/hg/lib/pbStamp.sql
cd ~/kent/src/hg/protein/pbTracks
hgsql mm4 -e 'load data local infile "pbStampMouse.tab" into table mm4.pbStamp;'

o Enable Proteome Browser
if ~/kent/src/hg/hgGene/hgGeneData/Mouse/links.ra does not exist:
cd ~/kent/src/hg/hgGene/hgGeneData/Mouse
mkdir mm4
cd mm4
Create links.ra with the following content

# mm4 Proteome Browser link info.
name protBrowser
shortLabel Proteome Browser
tables kgXref
idSql select spDisplayID from kgXref where kgID = '%s'
nameSql select spId from kgXref where kgID = '%s'
url ../cgi-bin/pbTracks?proteinID=%s
priority 12

cd ../..
make alpha

o Adjust drawing parameters for Proteome Browser stamps

Now invoke Proteome Browser and adjust various drawing parameters
(mostly the ymax of each stamp) by updating the pbStampMouse.tab file
and then reload the pbStamp table.

o Perform preliminary review of Proteome Browser for mm4, then notify QA
for formal review.
# LIFTOVER ENCODE REGIONS (DONE 2004-04-16 kate)
# preliminary test
ssh hgwdev
mkdir /cluster/data/mm4/bed/encodeLiftOver
cd /cluster/data/mm4/bed/encodeLiftOver
echo 'select * from hg16.encodeRegions' | hgsql hg16 -N > hg16.eR.bed
# Of 44 regions:
# -minMatch=.95 (default) -> nothing mapped
# -minMatch=.5 -> 6 mapped
# -minMatch=.2 -> 35 mapped
# -minMatch=.1 -> 43 mapped
liftOver -minMatch=.1 hg16.eR.bed \
    /cluster/data/hg16/bed/bedOver/hg16ToMm4.over.chain \
    encodeRegions.bed encodeRegions.unmapped
hgLoadBed mm4 encodeRegions encodeRegions.bed -noBin

# TIGR GENE INDEX (DONE 2004-05-20 Fan)
mkdir -p /cluster/data/mm4/bed/tigr
cd /cluster/data/mm4/bed/tigr
wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm4_05-2004.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
    set f1 = `echo $f | sed -e 's/cattle/cow/g'`
    mv $f $f1
end
foreach o (mouse cow human pig rat)
    echo $o
    setenv O $o
    foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's/THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
    end
end
ssh hgwdev
cd /cluster/data/mm4/bed/tigr
hgsql mm4 -e "drop table tigrGeneIndex"
hgsql mm4 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
    echo Processing $f ...
    /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm4 tigrGeneIndex $f
    hgsql mm4 -e "select count(*) from tigrGeneIndex"
end
# Total of 354491 entries created in tigrGeneIndex table.
hgsql mm4 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql mm4 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords mm4 tigrGeneIndex gzip *.gff *TCs # LIFTOVER CHAINS TO MM5 (WORKING 2004-08-02 kate) # run alignment # NOTE: split mm5 to /iscratch/i is doc'ed in makeHg17.doc ssh kk cd /cluster/data/mm4 makeLoChain-align mm4 /iscratch/i/mm4/softNib \ mm5 /iscratch/i/mm5/liftOver/liftSplit/split /scratch/hg/h/mouse11.ooc # Created parasol job in bed/blat.mm5.2004-08-02/run # 1892 jobs cd bed rm blat.mm5 ln -s blat.mm5.2004-08-02 blat.mm5 cd blat.mm5/run para try para check para push # lift results # the lift directory was defined in makeHg17.doc when split was performed # this expects data in bed/blat.mm5, so symlink must be there ssh kksilo cd /cluster/data/mm4/bed/blat.mm5 makeLoChain-lift mm4 mm5 /iscratch/i/mm5/liftOver/liftSplit/lift \ >&! lift.log & tail -100f lift.log # GOT HERE # Problems with lift: # liftUp: psl.c:112: pslLoad: Assertion `sizeOne == ret->blockCount' failed. 
# sizeOne tStarts 4 bs 17
# Abort

# chain alignments
ssh kk
makeLoChain-chain mm4 /cluster/data/mm4/nib mm5 /cluster/data/mm5/nib
# Created parasol job in /cluster/data/mm4/bed/blat.mm5/chainRun
cd /cluster/data/mm4/bed/blat.mm5/chainRun
para try
# 46 jobs
para check
para push

# make alignment net
ssh kolossus
makeLoChain-net mm4 mm5

# load into database and copy to download directory
ssh hgwdev
makeLoChain-load mm4 mm5
# Finished loading mm4ToMm5.over.chain
# Now, add download link for /usr/local/apache/htdocs/goldenPath/mm4/liftOver/mm4ToMm5.over.chain.gz

## NIA Mouse Gene Index - (DONE - 2004-10-08 Fan)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov

# pick up data
ssh hgwdev
mkdir -p /cluster/data/mm4/bed/NIAGene
cd /cluster/data/mm4/bed/NIAGene
wget --timestamping http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex3-Transcript-to-Genome.psl
wget --timestamping \
    http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex3-Transcripts.fasta
hgLoadPsl mm4 -table=NIAGene NIA-Mouse-GeneIndex3-Transcript-to-Genome.psl
mkdir /gbdb/mm4/NIAGene
ln -s /cluster/data/mm4/bed/NIAGene/NIA-Mouse-GeneIndex3-Transcripts.fasta \
    /gbdb/mm4/NIAGene/NIA-Mouse-GeneIndex3-Transcripts.fasta
hgLoadSeq mm4 /gbdb/mm4/NIAGene/NIA-Mouse-GeneIndex3-Transcripts.fasta
Added and edited NIAGene.html and trackDb.ra under kent/src/hg/makeDb/trackDb/mouse/mm4

# MM4->MM5 LIFTOVER CHAIN (DONE 2/8/05 Andy)
# I'm starting at the chaining point. Kate believes the alignment is OK, but there are no
# chrA_chrB.psl files in the /cluster/data/mm4/bed/blat.mm5/raw directory, so I can't
# start at the lifting stage.
ssh kk9
makeLoChain-chain mm4 /cluster/data/mm4/nib mm5 /cluster/data/mm5/nib
cd /cluster/data/mm4/bed/blat.mm5.2005-02-08/chainRun
para try
para check
para push
#Completed: 43 of 43 jobs
#CPU time in finished jobs:      21502s     358.36m     5.97h    0.25d  0.001 y
#IO & Wait Time:                 48317s     805.29m    13.42h    0.56d  0.002 y
#Average job time:                1624s      27.06m     0.45h    0.02d
#Longest job:                    21152s     352.53m     5.88h    0.24d
#Submission to last job:         26995s     449.92m     7.50h    0.31d
ssh kk
nice makeLoChain-net mm4 mm5

# LIFTOVER CHAINS TO MM5 (REDONE AGAIN 03-02-2005 TO 03-07-2005 Andy)
# run alignment
# NOTE: split mm5 to /iscratch/i is doc'ed in makeHg17.doc
ssh kk9
cd /cluster/data/mm4
makeLoChain-align mm4 /iscratch/i/mm4/softNib \
    mm5 /iscratch/i/mm5/liftOver/liftSplit/split /scratch/hg/h/mouse11.ooc
# Created parasol job in bed/blat.mm5.2005-03-02/run
cd bed
rm blat.mm5
ln -s blat.mm5.2005-03-02 blat.mm5
cd blat.mm5/run
grep "mm4/softNib/chr[1239].nib" spec | sed 's/=98/=94/' > spec.reduced
grep -v "mm4/softNib/chr[1239].nib" spec > spec.otherChroms
para create spec.reduced
# 172 jobs written to batch
para try
para check
para push

# lift results
# the lift directory was defined in makeHg17.doc when split was performed
# this expects data in bed/blat.mm5, so symlink must be there
ssh kksilo
cd /cluster/data/mm4/bed
ln -s blat.mm5 blat.mm5.2005-03-03
cd blat.mm5
makeLoChain-lift mm4 mm5 /iscratch/i/mm5/liftOver/liftSplit/lift > lift.log 2>&1 &
# That didn't quite work.
I'll do it manually:
cd raw
for chr in `awk '{print $1;}' /cluster/data/mm5/chrom.sizes`; do
    echo $chr
    liftUp -pslQ ../psl/$chr.psl /iscratch/i/mm5/liftOver/liftSplit/lift/$chr.lft warn chr*_$chr.psl
done

# chaining
ssh kki
makeLoChain-chain mm4 /cluster/data/mm4/nib mm5 /cluster/data/mm5/nib
cd /cluster/data/mm4/bed/blat.mm5/chainRun
para try
para check
para push

# netting (this step also creates the liftOver chain)
ssh kksilo
makeLoChain-net mm4 mm5

ssh hgwdev
makeLoChain-load mm4 mm5

# CREATE kgSpAlias TABLE FOR PB (Done 05/04/05)

hgsql mm4 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql mm4 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >mm4.kgSpAlias.tab
rm j.tmp

hgsql mm4 -e 'drop table kgSpAlias;'
hgsql mm4 < ~/src/hg/lib/kgSpAlias.sql
hgsql mm4 -e 'load data local infile "mm4.kgSpAlias.tab" into table kgSpAlias'