# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on the Rattus
# Norvegicus genome, June 2003 update.
#
# Things got a bit confused because the first Rat for June
# got started about June 22.  Processing proceeded on that for
# that week, then it was announced that portions of Chr 7 and Chr X
# were being moved around and a new Rat was released June 29.
# Also, had some difficulty with RepeatMasker.
# And, there were cluster difficulties to work around.
# What happens is that some things may not be exactly where they
# are claimed to be.  Although I believe everything has been
# corrected.

DOWNLOAD SEQUENCE (DONE Rnor3.1 - 2003-06-29 - Hiram)

    ssh kkstore
    mkdir /cluster/store3/rn3
    cd /cluster/store3/rn3
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/README
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/conditions_for_use
    # Three extra files that were not in Rn2:
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/bac.clones
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/bacfile.gz
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/bactigs.bacs
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/bactigs.contigs

    # Get BCM's chrom assemblies -- we will assemble our own chr*.fa from
    # contig fa + agp, and cross-check against this.
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Un)
      mkdir $c
      wget -O $c/chr$c.fa.bcm.gz \
        ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/chromosome/chr$c.fa.gz
      wget -O $c/chr${c}_random.fa.bcm.gz \
        ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/chromosome/chr$c.random.fa.gz
    end

    # Get BCM's contig fa + agp.  We will split into our own
    # conveniently-sized pseudo-contigs, and assemble chrom fa.
# These two files are not available in this rat June 2003 # But they don't seem to be used anywhere else here anyway # wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/bacfile2-1.gz # wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/record.dat.gz # Perhaps the extra three files bac.clones, bactigs.bacs and # bactigs.contigs can be used ? foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Un) wget -O $c/chr$c.agp \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.agp wget -O $c/chr$c.contig.fa.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.contig.fa.gz wget -O $c/chr${c}_random.agp \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.random.agp wget -O $c/chr${c}_random.contig.fa.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.random.contig.fa.gz end BUILD AND CHECK CHROM-LEVEL SEQUENCE (DONE - 2003-06-29 - Hiram) # Make chrom fa: ssh kkstore # The contig sequences are delivered with various upper and lower # case. We need them all in upper case to begin with. The first # line of the file is not a problem, it is already all upper case. # According to Kim Worley: # "The lowercase letters are regions where Phrap assigns a quality score # of less than 20. These can safely be ignored." 
# Translate each contig fasta to upper case (per Kim Worley's note:
# lower case only marks Phrap quality < 20 and can safely be ignored).
foreach c (?{,?})
  set f = $c/chr$c.contig.fa.gz
  set g = $f:r
  echo "${f}"
  zcat ${f} | tr '[a-z]' '[A-Z]' > ${g}
end
foreach c (?{,?})
  set f = $c/chr${c}_random.contig.fa.gz
  set g = $f:r
  if (-e ${f}) then
    echo "${f}"
    zcat ${f} | tr '[a-z]' '[A-Z]' > ${g}
  endif
end

# Assemble chrom-level fa from the contig fa + agp.
foreach c (?{,?})
  echo "Processing ${c}"
  $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.agp chr$c \
    $c/chr$c.fa $c/chr$c.contig.fa
  if (-e $c/chr${c}_random.agp) then
    $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr${c}_random.agp \
      chr${c}_random $c/chr${c}_random.fa $c/chr${c}_random.contig.fa
  endif
  echo "${c} - DONE"
end

# Check that the size of each chromosome .fa file is equal to the
# last coord of the .agp:
foreach f ( ?{,?}/*.agp )
  set agpLen = `tail -1 $f | awk '{print $3;}'`
  set g = $f:r
  set faLen = `faSize $g.fa | awk '{print $1;}'`
  if ($agpLen == $faLen) then
    echo "   OK: $f length = $g length = $faLen"
  else
    echo "ERROR: $f length = $agpLen, but $g length = $faLen"
  endif
end

# Check that our assembled chrom fa jive with the BCM chrom fa
# You will want a long window size in your terminal to run this
# It outputs quite a few lines
foreach c ( ?{,?} )
  set ucscLen = `faSize $c/chr$c.fa | awk '{print $1;}'`
  set bcmLen = `gunzip -c $c/chr$c.fa.bcm.gz | faSize stdin \
    | awk '{print $1;}'`
  if ($ucscLen == $bcmLen) then
    echo "   OK: chr$c.fa length = chr$c.fa.bcm length = $bcmLen"
  else
    # (fixed: this ERROR message pair was accidentally duplicated,
    # printing the same error twice)
    echo -n "ERROR: chr$c.fa length = $ucscLen, but chr$c.fa.bcm length"
    echo " = $bcmLen"
  endif
  if (-e $c/chr${c}_random.fa) then
    set ucscLen = `faSize $c/chr${c}_random.fa | awk '{print $1;}'`
    set bcmLen = `gunzip -c $c/chr${c}_random.fa.bcm.gz | faSize stdin \
      | awk '{print $1;}'`
    if ($ucscLen == $bcmLen) then
      echo -n "   OK: chr${c}_random.fa length = chr${c}_random.fa.bcm"
      echo " length = $bcmLen"
    else
      echo -n "ERROR: chr${c}_random.fa length = $ucscLen, but"
      echo " chr${c}_random.fa.bcm length = $bcmLen"
    endif
  endif
end

BREAK UP SEQUENCE
INTO 5 MB CHUNKS AT NON_BRIDGED CONTIGS (DONE - 2003-06-29) (DONE - 2003-06-29 - Hiram) # This will split the rat sequence into approx. 5 Mbase # supercontigs between non-bridged clone contigs and drop the # resulting dir structure in /cluster/store3/rn3. The resulting # dir structure will include 1 dir for each chromosome, each of # which has a set of subdirectories, one subdir per supercontig. ssh kkstore cd /cluster/store3/rn3 foreach c (?{,?}) echo "Working ${c}" cp -p $c/chr$c.agp $c/chr$c.agp.bak cp -p $c/chr$c.fa $c/chr$c.fa.bak splitFaIntoContigs $c/chr$c.agp $c/chr$c.fa . -nSize=5000000 if (-e $c/chr${c}_random.fa) then cp -p $c/chr${c}_random.agp $c/chr${c}_random.agp.bak cp -p $c/chr${c}_random.fa $c/chr${c}_random.fa.bak rm -fr ${c}_random splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \ -nSize=5000000 mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst mv ${c}_random/lift/ordered.lft $c/lift/random.lft mv ${c}_random/lift/ordered.lst $c/lift/random.lst rmdir ${c}_random/lift rm ${c}_random/chr${c}_random.{agp,fa} rm -fr $c/chr${c}_random_1 rm -fr $c/chr${c}_random_2 mv ${c}_random/* $c rmdir ${c}_random endif echo "DONE ${c}" end # Make sure the reconstructed .fa jives with the original: # four lines output for each chrom, long window required to be # able to catch all the output. Look for non-zero numbers. The # wc -l should always output zero foreach f ( */*.fa.bak ) echo -n $f:r " " diff $f $f:r | wc -l end # The .agp goes through a slight format change, but make sure it # at least ends up with the same number of lines: foreach f ( ?{,?}/*.agp.bak ) set l1 = `wc -l $f | awk '{print $1}'` set l2 = `wc -l $f:r | awk '{print $1}'` if ($l1 == $l2) then echo " OK: $f and $f:r have the same #lines" else echo "ERROR: $f has $l1 lines, but $f:r has $l2" endif end # Save some space - NO - the *.contig.fa.gz files already existing # here are the original downloads. These non-zipped versions are # the translated to upper case versions. 
# foreach c (?{,?}) # echo $c # gzip $c/chr*.contig.fa # end # Do need to at least move these out of the way because later # wild card expansions are looking for things like *.fa and we # don't want these in those expansions. foreach c (?{,?}) echo "$c/chr${c}.contig.fa -> $c/chr${c}.contig.fa.UC" mv $c/chr${c}_random.contig.fa $c/chr${c}_random.contig.fa.UC mv $c/chr${c}.contig.fa $c/chr${c}.contig.fa.UC end # This rm may be a good thing to do. But later, after repeat # masking is done and thus the soft masked results can be compared # with the *.bak originals. # rm */*.bak COPY OVER JKSTUFF SCRIPTS DIRECTORY (DONE - 2003-06-29 - Hiram) # Some of these scripts are being edited after they were # copied to here. Some cases of binaries in # ~kent/bin/i386 need to be changed either to # /cluster/bin/i386 or ~hiram/bin/i386 depending upon what # was being fixed as all this proceeded. # The "real" place should be /cluster/bin/i386 but before # we go that route, we need to have our build processes # working properly to keep that bin up to date. ssh kkstore ln -s /cluster/store3/rn3 ~/rn3 rm -f ~/lastRn ln -s /cluster/store4/rn2 ~/lastRn cd ~/rn3 cp -Rp ~/lastRn/jkStuff . rm jkStuff/*.{out,lst,lft} jkStuff/*~ CREATING DATABASE (DONE - 2003-06-23 - Hiram) # Create the database. ssh hgwdev echo 'create database rn3' | hgsql '' # make a semi-permanent read-only alias: # (I haven't noticed any use of this ? Useless ? 
- Hiram)
    alias rn3 "mysql -u hguser -phguserstuff -A rn3"

    # Use df to make sure there is at least 5 gig free on
    # hgwdev:/var/lib/mysql
    # [hiram@hgwdev /cluster/home/hiram] df -h /var/lib/mysql
    # Filesystem            Size  Used Avail Use% Mounted on
    # /dev/sda1             472G  384G   64G  86% /var/lib/mysql

CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2003-06-23 - Hiram)

    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql rn3

REPEAT MASKING (RUNNING - 2003-06-30 - Hiram)

Split contigs, run RepeatMasker, lift results

Notes:
 * If there is a new version of RepeatMasker, build it and ask the
   admins to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
 * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make
   RepeatMasker runs manageable on the cluster ==> results need lifting.
 * For the NCBI assembly we repeat mask on the sensitive mode setting
   (RepeatMasker -m -s)

#- Split contigs into 500kb chunks:
    ssh kkstore
    cd ~/rn3
    foreach d ( */chr*_?{,?} )
      cd $d
      echo "splitting $d"
      set contig = $d:t
      faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
        -maxN=500000
      cd ../..
    end

#- Make the run directory and job list:
    cd ~/rn3
    cat << '_EOF_' > jkStuff/RMRat
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/rn3/$2 /bin/cp $2 /tmp/rn3/$2/ cd /tmp/rn3/$2 /cluster/bluearc/RepeatMasker030619/RepeatMasker -ali -s -spec rat -comp mouse $2 popd /bin/cp /tmp/rn3/$2/$2.out ./ if (-e /tmp/rn3/$2/$2.align) /bin/cp /tmp/rn3/$2/$2.align ./ if (-e /tmp/rn3/$2/$2.tbl) /bin/cp /tmp/rn3/$2/$2.tbl ./ if (-e /tmp/rn3/$2/$2.cat) /bin/cp /tmp/rn3/$2/$2.cat ./ /bin/rm -fr /tmp/rn3/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/rn3/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/rn3 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMRat mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/store3/rn3/jkStuff/RMRat \ /cluster/store3/rn3/$d $f \ '{'check out line+ /cluster/store3/rn3/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd ~/rn3/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 6209 of 6209 jobs # CPU time in finished jobs: 37009055s 616817.58m 10280.29h 428.35d 1.174 y # IO & Wait Time: 96849s 1614.15m 26.90h 1.12d 0.003 y # Average job time: 5976s 99.60m 1.66h 0.07d # Longest job: 16048s 267.47m 4.46h 0.19d # Submission to last job: 49589s 826.48m 13.77h 0.57d #- Lift up the split-contig .out's to contig-level .out's ssh kkstore cd ~/rn3 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t /cluster/home/hiram/bin/i386/liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up the contig-level .out's to chr-level cd ~/rn3 ./jkStuff/liftOut5.sh # soft-mask contig .fa's with .out's foreach i (?{,?}) foreach j ($i/chr${i}_?{,?}/chr${i}_?{,?}.fa \ $i/chr${i}_random_?{,?}/chr${i}_random_?{,?}.fa) /cluster/home/hiram/bin/i386/maskOutFa $j $j.out $j -soft end echo done $i end # You will see a few WARNING lines from maskOutFa. This is a # a good sign. Much better than when maskOutFa used to exit. # With the fix, everything is getting masked properly. 
# Fixing a temporary glitch in repeat masker. This hasn't # affected anything above, but it will change the result of # the load below. Since the repeat masker run above, the # processrepeats script of RepeatMasker has been fixed. This # would not be necessary after that fix. foreach i (?{,?}) /cluster/bluearc/RepeatMasker030619/MakeRMoutSane.pl ${i}/*.fa.out foreach k ($i/chr${i}_?{,?}/chr${i}_?{,?}.fa.out \ $i/chr${i}_random_?{,?}/chr${i}_random_?{,?}.fa.out) /cluster/bluearc/RepeatMasker030619/MakeRMoutSane.pl ${k} end end #- Load the .out files into the database with: ssh hgwdev cd ~/rn3 hgLoadOut rn3 ?{,?}/*.fa.out MAKE LIFTALL.LFT (DONE - 2003-07-01 - Hiram) ssh kkstore cd ~/rn3 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft cp -p jkStuff/liftAll.lft /cluster/bluearc/rat/rn3 VERIFY REPEATMASKER RESULTS (DONE - 2003-06-24 - Hiram) # Something about this is too early at this point. It errors with: # Can't start query: # select chrom,size,fileName from chromInfo # mySQL error 1146: Table 'rn3.chromInfo' doesn't exist # you have to wait until the nibs have been loaded below and the # chrom.sizes works # Run featureBits on rn3 and on a comparable genome build, and compare: ssh hgwdev featureBits rn3 rmsk (rat version 3.1) # 1117483165 bases of 2571104688 (43.463%) in intersection featureBits rn3 rmsk (rat version 3.0) # 1117328265 bases of 2571104688 (43.457%) in intersection featureBits rn2 rmsk (Jan 2003) # 1100534407 bases of 2495551408 (44.100%) in intersection featureBits rn1 rmsk (Nov 2002) # 1081814344 bases of 2555848684 (42.327%) in intersection STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE - 2003-07-02 - Hiram) # XXXXX - These unmasked nibs are unnecessary. They will be # replaced below by soft masked nibs. This is only done here in # order to quickly get a browser going so that other tracks and # tests can be begun. 
# Make (unmasked) nibs ssh kkstore cd ~/rn3 mkdir nibNotMasked foreach i (?{,?}) cd $i foreach j ( chr?{,?}{,_random}.fa ) echo "nibbing $j" ~/bin/i386/faToNib $j ../nibNotMasked/$j:r.nib end cd .. end # Make symbolic links from /gbdb/rn3/nib to the real nibs. # Below, after the soft masked nibs are created, these # links will be reset to point to the soft masked nibs. ssh hgwdev mkdir -p /gbdb/rn3/nib foreach f (/cluster/store3/rn3/nibNotMasked/chr*.nib) ln -s $f /gbdb/rn3/nib end # Load /gbdb/rn3/nib paths into database and save size info. ssh hgwdev hgsql rn3 < ~/kent/src/hg/lib/chromInfo.sql cd ~/rn3 hgNibSeq -preMadeNib rn3 /gbdb/rn3/nib ?{,?}/chr?{,?}{,_random}.fa # 2835152425 total bases echo "select chrom,size from chromInfo" | hgsql -N rn3 > chrom.sizes # take a look at chrom.sizes, should be 44 lines and reasonable # numbers wc chrom.sizes # 44 88 778 chrom.sizes GOLD AND GAP TRACKS (DONE - 2003-07-02 - Hiram)(Re-load 2004-10-19 - Hiram) # since these were first loaded, the hgGoldGapGl has changed to # fixup the fragStart position to be 0 relative, hence the re-load ssh hgwdev cd /cluster/data/rn3 hgGoldGapGl -noGl rn3 /cluster/store3/rn3 . MAKE GCPERCENT (DONE - 2003-07-02 - Hiram) ssh hgwdev mkdir -p /cluster/store3/rn3/bed/gcPercent cd /cluster/store3/rn3/bed/gcPercent hgsql rn3 < ~/kent/src/hg/lib/gcPercent.sql hgGcPercent rn3 ../../nibNotMasked MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR RN3 (DONE - 2003-06-24 - Hiram) # Enter rn3 into hgcentraltest.dbDb so test browser knows about it: echo 'insert into dbDb values("rn3", "Jun 2003", \ "/gbdb/rn3/nib", "Rat", "Espn", 1, \ 30, "Rat");' | hgsql -h genome-testdb hgcentraltest # Temporary entry until mrna tables come along. 
Approximate # calculated position the same as rn2 # Note: for next assembly, set scientificName column to "Rattus norvegicus" echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName) values("rn3", "June 2003", \ "/gbdb/rn3/nib", "Rat", "chr5:169359914-169392264", 1, \ 30, "Rat", "Rattus norvegicus");' | hgsql -h genome-testdb hgcentraltest # Trying to request Espn in this new rat results in 4 different # hits on the genome. Thus, the specified position here is the # desired Espn position as was in the previous Rat # If you need to delete this dbDb entry to rewrite it: echo 'delete from dbDb where name="rn3";' \ | hgsql -h genome-testdb hgcentraltest # To change the default version of the Rat on the organism # menu. First delete it, then add the new one: echo 'delete from defaultDb where genome="Rat";' \ | hgsql -h genome-testdb hgcentraltest echo 'insert into defaultDb values("Rat", "rn3");' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add rn3 in all the right places and do make update make alpha cvs commit makefile MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR RN3 (DONE - 2003-07-04 - Jorge) ssh hgwdev echo 'insert into blatServers values("rn3", "blat11", "17778", "1"); \ insert into blatServers values("rn3", "blat11", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest SIMPLE REPEAT TRACK (DONE - 2003-07-01 - Hiram) # Need to be careful with this one on the cluster. This trf # thing does a lot of I/O on the input and output file. 
# Place the splits on /cluster/bluearc for cluster access ssh kkstore mkdir -p /cluster/bluearc/rat/rn3 cd /cluster/store3/rn3/ ls ?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ | tar -c -T - -f - | (cd /cluster/bluearc/rat/rn3.1.split; tar xfBp -) # now, a safe cluster job to trf mask ssh kk mkdir ~/rn3/bed/simpleRepeat cd ~/rn3/bed/simpleRepeat mkdir trf ls /cluster/bluearc/rat/rn3.1.split/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ > genome.lst cat << '_EOF_' > runTrf #!/bin/csh -fe # set path1 = $1 set inputFN = $1:t set outpath = $2 set outputFN = $2:t mkdir -p /tmp/$outputFN cp $path1 /tmp/$outputFN pushd . cd /tmp/$outputFN /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp popd rm -f $outpath cp -p /tmp/$outputFN/$outputFN $outpath rm -fr /tmp/$outputFN/* rmdir --ignore-fail-on-non-empty /tmp/$outputFN '_EOF_' # << this line makes emacs coloring happy chmod +x runTrf cat << '_EOF_' > gsub #LOOP ./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst single gsub spec para create spec para try para check para push ... etc ... 
Completed: 591 of 591 jobs CPU time in finished jobs: 27837s 463.95m 7.73h 0.32d 0.001 y IO & Wait Time: 13105s 218.41m 3.64h 0.15d 0.000 y Average job time: 69s 1.15m 0.02h 0.00d Longest job: 205s 3.42m 0.06h 0.00d Submission to last job: 263s 4.38m 0.07h 0.00d ssh kkstore cd ~/rn3/bed/simpleRepeat liftUp simpleRepeat.bed ~/rn3/jkStuff/liftAll.lft warn trf/*.bed # Load this into the database as so ssh hgwdev cd ~/rn3/bed/simpleRepeat hgLoadBed rn3 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql PROCESS SIMPLE REPEATS INTO MASK (DONE - 2003-07-02 - Hiram) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: # XXXX The next time we do this, we want to experiment with period <= 8 ssh kkstore cd ~/rn3/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd ~/rn3 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst endif liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` if (-e $c/lift/rTrf.lst) then liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2003-07-02) (DONE - 2003-07-02 - Hiram) # This used to be done right after RepeatMasking. Now, we mask with # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above. 
ssh kkstore cd ~/rn3 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.sh # You will see warnings from the above command, these are OK: # WARNING: negative rEnd: -203 chr1:36149039-36149198 L1M3c # WARNING: negative rEnd: -14 chr1:185432802-185432864 PB1D10 #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.sh #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.sh # Copy the masked contig fa to /scratch: ssh kkstore rm -rf /cluster/bluearc/rat/rn3/trfFa mkdir -p /cluster/bluearc/rat/rn3/trfFa cp -p ~/rn3/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ /cluster/bluearc/rat/rn3/trfFa RESET NIB POINTERS TO softNib (DONE - 2003-07-02 - Hiram) # As mentioned above the first time this was done, the first set # of nibs linked to were unnecessary. Here, we change the links to # point to softNib which is both repeat masker and trf masked ssh hgwdev cd /gbdb/rn3/nib rm -f * foreach f (/cluster/store3/rn3/softNib/chr*.nib) ln -s $f /gbdb/rn3/nib end # I wonder if this needs to be run again ? I'm going to do it # anyway. # Load /gbdb/rn3/nib paths into database and save size info. ssh hgwdev cd ~/rn3 hgNibSeq -preMadeNib rn3 /gbdb/rn3/nib ?{,?}/chr?{,?}{,_random}.fa # 2835152425 total bases echo "select chrom,size from chromInfo" | hgsql -N rn3 > chrom.sizes # take a look at chrom.sizes, should be 44 lines and reasonable # numbers wc chrom.sizes # 44 88 778 chrom.sizes MAKE DOWNLOADABLE SEQUENCE FILES (DONE - 2003-07-03 - 2003-07-09 - Hiram) ssh kkstore cd ~/rn3 #- Build the .zip files ./jkStuff/zipAll.sh |& tee zipAll.log mkdir zip mv *.zip* zip cd zip #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... 
ssh hgwdev cd ~/rn3/zip ../jkStuff/cpToWeb.sh cd /usr/local/apache/htdocs/goldenPath/rnJun2003 #- Take a look at bigZips/* and chromosomes/*, update their README.txt's # Then make the upstream sequence files. cd bigZips featureBits rn3 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits rn3 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits rn3 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa # CREATE CYTOBAND TRACK # Can be done at any time ssh hgwdev cd /cluster/data/rn3 mkdir cytoBand cd cytoBand # Get file from NCBI wget ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/maps/mapview/BUILD.2/ideogram.gz gunzip ideogram.gz # Create bed file /cluster/bin/scripts/createNcbiCytoBand ideogram # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql rn3 cytoBand cytoBand.bed # extend to cytoBandIdeo ssh hgwdev cd kent/src/hg/lib hgsql rn3 < cytoBandIdeo.sql # mysql: insert into cytoBandIdeo select * from cytoBand # add entry in kent/src/hg/makeDb/trackDb/rat/rn3/trackDb.ra # PREPARE CLUSTER FOR BLASTZ RUN (IN PROGRESS - 2003-07-10 - kate) # This needs to be done after trf-masking and nib generation. # NOTE: generated lineage-specific .out files # using "old" script, for use vs. human # Will need species-specific version for mouse # 2003-07-09 kate # Before mouse run: # XXXXXXXXX Currently stuck on this one 2003-07-09 - the lineage specific # XXXXXXXXX repeats business is now different. # XXXXXXXXX Trying to work the kinks out of that. Hiram ssh kkstore # Extract lineage-specific repeats using Arian Smit's script: mkdir -p /cluster/data/rn3/bed/linSpecRep cd /cluster/data/rn3/bed/linSpecRep # foreach f (~/rn3/*/chr*.out) # ln -sf $f . # end # don't step on existing .out's foreach f (/cluster/data/rn3/*/chr*.out) cp $f . 
end /cluster/bin/scripts/rodentSpecificRepeats.pl *.out /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec rm *.out cd /cluster/data/rn3/bed rm -rf /cluster/bluearc/rat/rn3/linSpecRep # This dir already exists from above # mkdir -p /cluster/bluearc/rat/rn3 cp -Rp linSpecRep /cluster/bluearc/rat/rn3 # RepeatMasker .out: cd ~/rn3 rm -rf /cluster/bluearc/rat/rn3/rmsk mkdir -p /cluster/bluearc/rat/rn3/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /cluster/bluearc/rat/rn3/rmsk # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /cluster/bluearc/rat/rn3/softNib mkdir -p /cluster/bluearc/rat/rn3/softNib cp -p softNib/chr*.nib /cluster/bluearc/rat/rn3/softNib # New RepeatMasker lineage-specific annotation # Hiram ran /cluster/bluearc/rat/rn3/rmsk.spec/runArian # to generate /cluster/bluearc/rat/rn3/rmsk.spec/chrN.fa.out_mus_hum # This script loops through .out's, running # DateRepsinRMoutput.pl <.out> -query rat -comp mouse -comp human # Extract repeats not in mouse (0 in column 1) cd /cluster/bluearc/rat/rn3 mkdir linSpecRep.notInMouse foreach f (rmsk.spec/*.out_mus_hum) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInMouse/$base.out.spec end # copy to iscratch and sync ssh kkr1u00 cd /iscratch/i mkdir -p mm3.RM030619 cd mm3.RM030619 #cp -r /cluster/bluearc/rat/rn3/linSpecRep.notInMouse . /cluster/bin/scripts/iSync # BLASTZ MOUSE MM3 (DONE 2003-07-28 kate) # NOTE: This uses mm3.RM030619, which is MM3 masked with new RepeatMasker ssh kkstore mkdir -p /cluster/data/rn3/bed/blastz.mm3 mkdir -p /cluster/bluearc/hg/rn3/bed/blastz.mm3 cd /cluster/data/rn3/bed/blastz.mm3 cat << '_EOF_' > DEF # mouse vs. 
rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=50000 BLASTZ_T=2 # scoring matrix BLASTZ_Q=/cluster/data/blastz/mus_rat.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Rat SEQ1_DIR=/scratch/rat/rn3/softNib/ # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse/ SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/iscratch/i/mm3.RM030619/mixedNib/ # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/mm3.RM030619/linSpecRep.notInRat/ SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/bluearc/hg/rn3/bed/blastz.mm3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.rn3-mm3.RM030619 ssh kk cd /cluster/data/rn3/bed/blastz.mm3 # source the DEF file bash . ./DEF cp DEF $BASE # follow the next set of directions slavishly mkdir -p run # give up on avoiding angie's directories # tcl script # creates xdir.sh and joblist run/j ~angie/hummus/make-joblist $DEF > run/j # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size cp $BASE/xdir.sh . sh xdir.sh cd run # now edit j to prefix path to executable name # NOTE: we should have a controlled version of schwartz bin executables sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 # make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j # 36504 jobs para try para check para push # ... etc ... # 37 min longest job # 2.3 hours elapsed # post-process blastz ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3 # source the DEF file again in case you are coming back to this bash . 
./DEF # a new run directory mkdir -p run.1 mkdir -p $BASE/lav # create a new job list to convert out files to lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \ > run.1/jobList cd run.1 # make sure the job list is OK wc -l jobList # 312 jobs head jobList # run on cluster #ssh kkr9u01 ssh kk cd /cluster/data/rn3/bed/blastz.mm3/run.1 para create jobList para try para check para push # etc. # 10 minutes total # convert lav files to axt ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 # create template file for gensub2 cat << '_EOF_' > gsub #LOOP /cluster/bin/scripts/blastz-chromlav2axt /cluster/bluearc/hg/rn3/bed/blastz.mm3/lav/$(root1) {check out line+ /cluster/data/rn3/bed/blastz.mm3/axtChrom/$(root1).axt} /cluster/bluearc/rat/rn3/softNib /iscratch/i/mm3.RM030619/mixedNib #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ls -1S /cluster/bluearc/hg/rn3/bed/blastz.mm3/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList # 44 jobs head jobList ssh kk # NOTE: would have run this on rack 9, but /iscratch/i/mm3.RM030619 # not available there cd /cluster/data/rn3/bed/blastz.mm3 cd run.2 para create jobList para try para check para push # 10 minutes ? 
# copy chrom axt's back to fileserver
cd /cluster/data/rn3/bed/blastz.mm3
mkdir -p axtChrom
# NOTE(review): $base is not set in this csh session ($BASE was only
# defined while sourcing DEF under bash, as
# /cluster/bluearc/hg/rn3/bed/blastz.mm3) -- confirm the intended
# source directory before re-using this step.
cp $base/axtChrom/*.axt axtChrom

# translate sorted axt files into psl
ssh kkstore
cd /cluster/data/rn3/bed/blastz.mm3
mkdir -p pslChrom
set tbl = "blastzMm3"
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  echo "Processing chr $c"
  /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end

# Load database tables
ssh hgwdev
set tbl = "blastzMm3"
cd /cluster/data/rn3/bed/blastz.mm3
cd pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_${tbl}.psl

# create trackDb/rat/rn3/trackDb.ra entry:
# track blastzMm3
# shortLabel BLASTZ Mouse
# longLabel All BLASTZ Mouse alignments
# group compGeno
# priority 111
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno hg15

# MAKE BLASTZ BEST MOUSE MM3 (DONE 2003-07-23 kate)
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh kkstore
cd /cluster/data/rn3/bed/blastz.mm3
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -r axtChrom /cluster/bluearc/hg/rn3/bed/blastz.mm3
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files:
# doBestAxt keeps only the best-in-genome mouse alignment for each rat
# region, then converts the filtered axt to psl.
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
# (fixed: was "-minScore-300", a typo; axtBest takes -minScore=N)
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/rn3/bed/blastz.mm3/S1.len /cluster/data/rn3/bed/blastz.mm3/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doBestAxt
cd axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) /cluster/bluearc/hg/rn3/bed/blastz.mm3/axtChrom/$(root1).axt {check out line+ /cluster/data/rn3/bed/blastz.mm3/axtBest/$(root1).axt} {check out line+ /cluster/data/rn3/bed/blastz.mm3/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chrom.list single gsub jobList
wc -l jobList
# 44 jobs
head jobList ssh kk cd /cluster/data/rn3/bed/blastz.mm3 cd run.3 para create jobList para try para check para push # 1 hour ? # chr1 too large for kk nodes -- reran on kkr1u00 (fileserver would do) ./doBestAxt chr1 /cluster/bluearc/hg/rn3/bed/blastz.mm3/axtChrom/chr1.axt /cluster/data/rn3/bed/blastz.mm3/axtBest/chr1.axt /cluster/data/rn3/bed/blastz.mm3/pslBest/chr1_blastzBestMm3.psl # Read 186121 elements from /cluster/bluearc/hg/rn3/bed/blastz.mm3/axtChrom/chr1.axt # Allocated 268113159 elements in array # 118172 elements in after soft filter. # Output 115210 alignments including 0 trimmed from overlaps # Bases in 236687179, bases out 164536193 (69.52%) # Load tables ssh hgwdev set base="/cluster/data/rn3/bed/blastz.mm3" set tbl="blastzBestMm3" cd $base/pslBest /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_${tbl}.psl # check results featureBits rn3 blastzBestMm3 # 1667035238 bases of 2571104688 (64.837%) in intersection # no mouse alignment on rn2 featureBits rn3 blastzMm3 # 1669090687 bases of 2571104688 (64.917%) in intersection # create trackDb/rat/rn3/trackDb.ra entry: # track blastzBestMm3 # shortLabel BLASTZ Mouse # longLabel All BLASTZ Mouse alignments # group compGeno # priority 111 # visibility hide # color 100,50,0 # altColor 255,240,200 # spectrum on # type psl xeno hg15 # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn3/axtBestMm3 cd /gbdb/rn3/axtBestMm3 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/rn3/axtBestMm3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end # already done, above # hgsql rn3 < ~/kent/src/hg/lib/axtInfo.sql hgsql rn3 < axtInfoInserts.sql # CHAIN MOUSE BLASTZ (DONE 2003-07-24 kate) ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain # Run axtChain on little cluster ssh kkr1u00 cd /cluster/data/rn3/bed/blastz.mm3 mkdir -p axtChain/run1 cd /cluster/data/rn3/bed/blastz.mm3/axtChain/run1 ls -1S /cluster/data/rn3/bed/blastz.mm3/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn_random $1 | axtChain stdin /iscratch/i/rn3/bothMaskedNibs /iscratch/i/mm3.RM030619/mixedNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain mkdir out chain gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... # now on the cluster server, sort chains ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm3/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain rn3 ${c}_chainMm3 $i echo done $c end # NET MOUSE BLASTZ (DONE 2003-07-24 kate) ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i chainPreNet $i /cluster/data/rn3/chrom.sizes \ /cluster/data/mm3/chrom.sizes ../preNet/$i end cd .. 
# This foreach loop will take about 15 min to execute. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i chainNet $i -minSpace=1 /cluster/data/rn3/chrom.sizes \ /cluster/data/mm3/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net ssh hgwdev cd /cluster/store3/rn3/bed/blastz.mm3/axtChain netClass hNoClass.net rn3 mm3 mouse.net -tNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse -qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInRat # If things look good do ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn mouse.net > mouseSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm3/axtChain netFilter -minGap=10 mouse.net | hgLoadNet rn3 netMm3 stdin netFilter -minGap=10 mouseSyn.net | hgLoadNet rn3 syntenyMm3 stdin # Add entries for net and chain to rat/rn3 trackDb # MOUSE MM3 BLASTZ CLEANUP (DONE 2004-09-20 kate) ssh eieio cd /cluster/data/rn3/bed/blastz.mm3 nice rm -rf raw lav nice rm axtChain/run1/chain/* axtChain/preNet axtChain/hNoClass.net nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} axtChain/*Net/*.net nice gzip axt*/*.axt psl*/*.psl # BLASTZ Self (DONE 2003-09-22 angie) ssh kkstore mkdir -p /cluster/data/rn3/bed/blastz.rn3 cd /cluster/data/rn3/bed/blastz.rn3 cat << '_EOF_' > DEF # rat vs. 
rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET # Rat SEQ1_DIR=/iscratch/i/rn3/bothMaskedNibs/ # not used SEQ1_RMSK= # not used SEQ1_FLAG= # not used SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse/ SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs/ # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse/ SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/store3/rn3/bed/blastz.rn3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.rn3-rn3 ssh kk cd /cluster/data/rn3/bed/blastz.rn3 # source the DEF file bash . ./DEF # follow the next set of directions slavishly mkdir -p $BASE/run # give up on avoiding angie's directories # tcl script # creates xdir.sh and joblist run/j ~angie/hummus/make-joblist $DEF > $BASE/run/j # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size sh $BASE/xdir.sh cd $BASE/run # now edit j to prefix path to executable name # NOTE: we should have a controlled version of schwartz bin executables sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 # make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j # 97344 jobs para try para check para push # ... etc ... 
# 1.3 hr longest job # make tSizes, qSizes files ssh hgwdev cd /cluster/store3/rn3/bed/blastz.rn3 echo 'select chrom,size from chromInfo' | hgsql -A -N rn3 \ > tSizes echo 'select chrom,size from chromInfo' | hgsql -A -N rn3 \ > qSizes # post-process blastz # --- normalize (PennSpeak for lift) ssh kk cd /cluster/store3/rn3/bed/blastz.rn3 # run bash shell if not running it already source DEF mkdir -p $BASE/run.1 mkdir -p $BASE/lav # create a new job list to convert out files to lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \ > run.1/jobList cd run.1 # make sure the job list is OK wc -l jobList # 312 jobList head jobList para create jobList para push #Completed: 312 of 312 jobs #CPU time in finished jobs: 42835s 713.92m 11.90h 0.50d 0.001 y #IO & Wait Time: 940201s 15670.02m 261.17h 10.88d 0.030 y #Average job time: 3151s 52.51m 0.88h 0.04d #Longest job: 4794s 79.90m 1.33h 0.06d #Submission to last job: 4795s 79.92m 1.33h 0.06d # convert lav files to axt ssh kk cd /cluster/store3/rn3/bed/blastz.rn3 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 # create template file for gensub2 # Use blastz-self-chromlav2axt, which includes axtDropOverlap in # the pipe (that helps to keep axtSort from barfing). 
# usage: blastz-self-chromlav2axt lav-dir axt-file seq1-dir seq2-dir cat << '_EOF_' > gsub #LOOP /cluster/bin/scripts/blastz-self-chromlav2axt /cluster/store3/rn3/bed/blastz.rn3/lav/$(root1) {check out line+ /cluster/store3/rn3/bed/blastz.rn3/axtChrom/$(root1).axt} /iscratch/rat/rn3/softNib /iscratch/rat/rn3/softNib /cluster/store3/rn3/bed/blastz.rn3/tSizes /cluster/store3/rn3/bed/blastz.rn3/qSizes #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1S /cluster/store3/rn3/bed/blastz.rn3/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList # 44 jobs head jobList cd /cluster/data/rn3/bed/blastz.rn3/run.2 para create jobList para try para check para push #Completed: 38 of 44 jobs #Crashed: 6 jobs #CPU time in finished jobs: 11390s 189.83m 3.16h 0.13d 0.000 y #IO & Wait Time: 179130s 2985.51m 49.76h 2.07d 0.006 y #Average job time: 5014s 83.56m 1.39h 0.06d #Longest job: 22358s 372.63m 6.21h 0.26d #Submission to last job: 22377s 372.95m 6.22h 0.26d # Chroms 1,2,3,4,Un,X crashed... run in 2 passes: ssh kkstore cd /cluster/data/rn3/bed/blastz.rn3 set base=/cluster/data/rn3/bed/blastz.rn3 set seq1_dir=/cluster/data/rn3/softNib set seq2_dir=/cluster/data/rn3/softNib foreach c (lav/chr1 lav/chr2 lav/chr3 lav/chr4 lav/chrX lav/chrUn) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" foreach d (*.lav) set smallout=$d.axt lavToAxt $d $seq1_dir $seq2_dir stdout \ | axtDropOverlap stdin /cluster/data/rn3/bed/blastz.rn3/tSizes \ /cluster/data/rn3/bed/blastz.rn3/qSizes stdout \ | axtSort stdin $smallout end cat `ls -1 *.lav.axt | sort -g` > $out popd end foreach f (axtChrom/chr*.axt) echo gzipping $f... gzip $f end # Make the axt's available on genome-test: ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/rnJun2003/alignments/vsRn3 cd /usr/local/apache/htdocs/goldenPath/rnJun2003/alignments/vsRn3 cp -p /cluster/data/rn3/bed/blastz.rn3/axtChrom/*.axt.gz . 
# CHAIN SELF BLASTZ (DONE 2/28/05 angie) ssh kki mkdir -p /cluster/data/rn3/bed/blastz.rn3/axtChain/run1 cd /cluster/data/rn3/bed/blastz.rn3/axtChain/run1 mkdir out chain ls -1S /cluster/data/rn3/bed/blastz.rn3/axtChrom/*.axt.gz > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdout \ | chainAntiRepeat \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check, ... # Both chrUn and chr1 crashed with out-of-mem.... #Completed: 42 of 44 jobs #Crashed: 2 jobs #Average job time: 342s 5.70m 0.09h 0.00d #Longest job: 1205s 20.08m 0.33h 0.01d #Submission to last job: 1205s 20.08m 0.33h 0.01d # We can tolerate the lack of chrUn self chains, but not chr1. # Try to split it in half around a large gap and chain halves, then merge. # chr1's largest assembly gap is 664000bp [68333401, 68997401), # but that doesn't divide it very evenly in half (chr1 is 268Mbp). # There is a 316000bp gap [129715113, 130031113), use that. 
ssh kolossus cd /cluster/data/rn3/bed/blastz.rn3/axtChain axtFilter ../axtChrom/chr1.axt.gz -tStartMax=129873113 > chr1_1.axt axtFilter ../axtChrom/chr1.axt.gz -tStartMin=129873113 > chr1_2.axt # line count sanity check: original == total new: zcat ../axtChrom/chr1.axt.gz | wc -l #21602304 wc -l chr1_?.axt # 11973760 chr1_1.axt # 9628544 chr1_2.axt # 21602304 total # Chain the two halves independently, assuming no self chain would # span such a large gap: axtChain -verbose=0 chr1_1.axt \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdout \ | chainAntiRepeat \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdin \ run1/chain/chr1_1.chain axtChain -verbose=0 chr1_2.axt \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdout \ | chainAntiRepeat \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdin \ run1/chain/chr1_2.chain rm chr1_1.axt chr1_2.axt # Sort and split: ssh kolossus cd /cluster/data/rn3/bed/blastz.rn3/axtChain chainMergeSort run1/chain/*.chain > all.chain rm run1/chain/* ssh eieio cd /cluster/data/rn3/bed/blastz.rn3/axtChain chainSplit chain all.chain # DONE TO HERE 2/28/05 angie Looks like Brian filtered -minScore=20000 and netted, 10/19/05. # BLASTZ Human hg15 (IN PROGRESS 2003-07-20 kate) # Extract lineage-specific repeats vs. human ssh kkstore cd /cluster/bluearc/rat/rn3 mkdir linSpecRep.notInHuman foreach f (rmsk.spec/*.out_mus_hum) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 2 $f > \ linSpecRep.notInHuman/$base.out.spec end mkdir -p /cluster/data/rn3/bed/blastz.hg15 cd /cluster/data/rn3/bed/blastz.hg15 cat << '_EOF_' > DEF # rat vs. 
human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Rat SEQ1_DIR=/iscratch/i/rn3/bothMaskedNibs/ # not used SEQ1_RMSK= # not used SEQ1_FLAG= # not used SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman/ SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY SEQ2_DIR=/iscratch/i/gs.16/build33/chromTrfMixedNib/ SEQ2_RMSK=/iscratch/i/gs.16/build33/rmsk/ SEQ2_SMSK=/iscratch/i/gs.16/build33/linSpecRep/ SEQ2_FLAG=-primate SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/store3/rn3/bed/blastz.hg15 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.rn3-hg15 ssh kk cd /cluster/data/rn3/bed/blastz.hg15 # source the DEF file bash . ./DEF # follow the next set of directions slavishly mkdir -p $BASE/run # give up on avoiding angie's directories # tcl script # creates xdir.sh and joblist run/j ~angie/hummus/make-joblist $DEF > $BASE/run/j # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size sh $BASE/xdir.sh cd $BASE/run # now edit j to prefix path to executable name # NOTE: we should have a controlled version of schwartz bin executables sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 # make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j # 42120 jobs para try para check para push # ... etc ... # post-process blastz # SWAPPING HUMAN-RAT BLASTZ ALIGNMENTS TO RAT-HUMAN (DONE 2003-07-11 kate) ssh kkstore # Human-rat alignments were already run and processed into axt. # Swap target and query to get rat-human alignments. 
set aliDir = "/cluster/data/hg15/bed/blastz.rn3" set revAliDir = "/cluster/data/rn3/bed/blastz.hg15.swap" mkdir $revAliDir cd $revAliDir # axtBest will need .len files - copy those, swap S1<->S2 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom # Swap target and query coords, then re-apportion alignments so that # unsorted/chrN.axt has all the alignments with chrN as target. cat $aliDir/axtChrom/chr*.axt \ | /cluster/bin/i386/axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | /cluster/bin/i386/axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end rm -r unsorted # Create psl from sorted axt ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15.swap set tbl = "blastzHg15" mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # load blastz results ssh hgwdev cd /cluster/data/rn3/bed mv blastz.hg15.swap blastz.hg15 cd blastz.hg15 cd pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_*.psl # create trackDb/rat/rn3/trackDb.ra entry: # track blastzHg15 # shortLabel BLASTZ Human # longLabel All BLASTZ Human alignments # group compGeno # priority 111 # visibility hide # color 100,50,0 # altColor 255,240,200 # spectrum on # type psl xeno hg15 # CHAIN HUMAN BLASTZ (DONE 2003-07-16 kate) ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain # create shell script to run jobs on file server # cat << '_EOF_' > serialJobList #LOOP # something like this #doChain $1 chain/$1.chain out/$1.out} #ENDLOOP # '_EOF_' cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn_random $1 | axtChain stdin /cluster/bluearc/rn3/softNib /cluster/data/hg15/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain csh serialJobList & # this took 8.5 hours on kkstore # Run axtChain on little cluster # ssh kkr1u00 # cd 
/cluster/data/rn3/bed/blastz.hg15 # mkdir -p axtChain/run1 # cd /cluster/data/rn3/bed/blastz.hg15/axtChain/run1 # ls -1S /cluster/data/rn3/bed/blastz.hg15/axtChrom/*.axt > input.lst # cat << '_EOF_' > gsub #LOOP # doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP #'_EOF_' # cat << '_EOF_' > doChain #!/bin/csh # axtFilter -notQ=chrUn_random $1 | axtChain stdin /iscratch/i/rn3/bothMaskedNibs /scratch/hg/gs.16/build33/chromTrfMixedNib $2 > $3 #'_EOF_' # chmod a+x doChain # mkdir out chain # gensub2 input.lst single gsub jobList # para create jobList # para try # para push # ... etc ... # now on the cluster server, sort chains ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg15/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain rn3 ${c}_chainHg15 $i echo done $c end # NET HUMAN BLASTZ (DONE 2003-07-16 kate) ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i chainPreNet $i /cluster/data/rn3/chrom.sizes \ /cluster/data/hg15/chrom.sizes ../preNet/$i end cd .. # This foreach loop will take about 15 min to execute. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i chainNet $i -minSpace=1 /cluster/data/rn3/chrom.sizes \ /cluster/data/hg15/chrom.sizes ../n1/$n /dev/null end cd .. 
cat n1/*.net | netSyntenic stdin hNoClass.net ssh hgwdev cd /cluster/store3/rn3/bed/blastz.hg15/axtChain netClass hNoClass.net rn3 hg15 human.net -tNewR=/cluster/data/rn3/bed/linSpecRep -qNewR=/cluster/data/hg15/bed/linSpecRep # If things look good do ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn human.net > humanSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg15/axtChain netFilter -minGap=10 human.net | hgLoadNet rn3 netHg15 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet rn3 syntenyHg15 stdin # Add entries for net and chain to rat/rn3 trackDb # MAKE THE BLASTZBESTHUMAN TRACK FROM RN3 AXT FILES (DONE 2003-07-16 baertsch, kate) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh kkstore set base="/cluster/data/rn3/bed/blastz.hg15" set seq1_dir="/cluster/data/rn3/mixedNib/" set seq2_dir="/cluster/data/hg15/mixedNib/" set tbl="blastzBestHg15" cd $base mkdir -p axtBest pslBest foreach f (axtChrom/chr*.axt) set chr=$f:t:r echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # If some chromosome's alignments were too big and caused axtSort to # run out of memory, split it in half (by 4-line axt records) and # run axtBest just on the halves. 
foreach chr (chr1) echo two-pass axtBesting $chr set len = `wc -l < axtChrom/$chr.axt` set numRec = `expr $len / 4` if (($numRec * 4) != $len) then echo "Uh-oh: length of axtChrom/$chr.axt is $len, not a multiple of 4" break endif set halfRec = `expr $numRec / 2` set halfLen = `expr $halfRec \* 4` set halfLenp1 = `expr $halfLen + 1` head -$halfLen axtChrom/$chr.axt > axtChrom/$chr.h0.axt tail +$halfLenp1 axtChrom/$chr.axt > axtChrom/$chr.h1.axt axtBest axtChrom/$chr.h0.axt $chr axtChrom/$chr.h0.axtBest -minScore=300 axtBest axtChrom/$chr.h1.axt $chr axtChrom/$chr.h1.axtBest -minScore=300 cat axtChrom/$chr.h{0,1}.axtBest > axtBest/$chr.axt axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl rm axtChrom/$chr.h* end # Load tables ssh hgwdev set base="/cluster/store3/rn3/bed/blastz.hg15" set tbl="blastzBestHg15" cd $base/pslBest hgLoadPsl -noTNameIx rn3 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn3/axtBestHg15 cd /gbdb/rn3/axtBestHg15 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/rn3/axtBestHg15/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg15','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql rn3 < ~/kent/src/hg/lib/axtInfo.sql hgsql rn3 < axtInfoInserts.sql # MAKE AXTTIGHT FROM AXTBEST (DONE 2003-07-21 kate) # After creating axtBest alignments above, use subsetAxt to get axtTight: # NOTE: currently the axtBest's are the blastz.hg15.2003-07-09 directory # since blastz.hg15 is in use for rerun of blastz with # updated lineage-specific repeats for new RepeatMasker ssh kkstore cd ~/rn3/bed/blastz.hg15.2003-07-09/axtBest mkdir -p ../axtTight foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ /cluster/data/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) echo "Processing $i" set c = $i:r /cluster/bin/i386/axtToPsl $i \ ../S1.len ../S2.len ../pslTight/${c}_blastzTightHg15.psl end # Load tables into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg15.2003-07-09 cd pslTight /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_blastzTightHg15.psl # MAKE MULTIPLE ALIGNMENT (MAF) FILES FROM BEST AXT'S (DONE 2003-07-22 kate) # Reran multiz and reloaded table on 8/27, as some chroms were truncated (2003-08-27 kate) ssh kkstore # rat/human cd /cluster/data/rn3/bed/blastz.hg15.2003-07-09 mkdir maf cd axtBest foreach i (*.axt) set c = $i:r echo "Making maf for chr $c" /cluster/bin/i386/axtToMaf -tPrefix=rn3. -qPrefix=hg15. 
$i \ /cluster/data/rn3/chrom.sizes /cluster/data/hg15/chrom.sizes \ ../maf/${c}.rh.maf /cluster/bin/scripts/fixMaf.pl \ < ../maf/${c}.rh.maf > ../maf/${c}.rh.maf.fixed mv ../maf/${c}.rh.maf ../maf/${c}.rh.maf.unfixed mv ../maf/${c}.rh.maf.fixed ../maf/${c}.rh.maf end # NOTE: remove .unfixed files after checking # rat/mouse cd /cluster/data/rn3/bed/blastz.mm3 mkdir maf cd axtBest foreach i (*.axt) set c = $i:r echo "Making maf for chr $c" /cluster/bin/i386/axtToMaf -tPrefix=rn3. -qPrefix=mm3. $i \ /cluster/data/rn3/chrom.sizes /cluster/data/mm3/chrom.sizes \ ../maf/${c}.rm.maf /cluster/bin/scripts/fixMaf.pl \ < ../maf/${c}.rm.maf > ../maf/${c}.rm.maf.fixed mv ../maf/${c}.rm.maf ../maf/${c}.rm.maf.unfixed mv ../maf/${c}.rm.maf.fixed ../maf/${c}.rm.maf end # 30 minutes ? # Run multiple alignment using Webb Miller's multiz # Note: use - for 3rd arg when doing rodent/rodent (close species) # See penn/multiz/README for details ssh kkstore cd /cluster/data/rn3/bed mkdir -p multiz.hg15-mm3/maf cd multiz.hg15-mm3 cat << "EOF" > doMultiz #!/bin/csh -f cd /cluster/data/rn3/bed cd blastz.hg15.2003-07-09/maf foreach f (*.rh.maf) set c = $f:r:r echo "Multiz on $c" nice /cluster/bin/penn/multiz $f ../../blastz.mm3/maf/${c}.rm.maf - \ > ../../multiz.hg15-mm3/maf/${c}.rmh.maf end "EOF" # << this line makes emacs coloring happy csh doMultiz >&! doMultiz.log & tail -100f doMultiz.log # 3 hours ? 
# copy files to cluster I/O server mkdir -p /cluster/bluearc/hg/rn3/bed/blastz.hg15.2003-07-09/maf cp -r /cluster/data/rn3/bed/blastz.hg15.2003-07-09/maf /cluster/bluearc/hg/rn3/bed/blastz.hg15.2003-07-09 mkdir -p /cluster/bluearc/hg/rn3/bed/blastz.mm3/maf cp -r /cluster/data/rn3/bed/blastz.mm3/maf /cluster/bluearc/hg/rn3/bed/blastz.mm3 set multizDir = /cluster/data/rn3/bed/multiz.hg15-mm3 cd $multizDir mkdir run cd run cat << "EOF" > doMultiz.kk /cluster/bin/penn/multiz /cluster/bluearc/hg/rn3/bed/blastz.hg15.2003-07-09/maf/$1.rh.maf /cluster/bluearc/hg/rn3/bed/blastz.mm3/maf/$1.rm.maf - > /cluster/data/rn3/bed/multiz.hg15-mm3/maf/$1.rmh.maf "EOF" # << this line makes emacs coloring happy chmod +x doMultiz.kk cat << "EOF" > gsub #LOOP doMultiz.kk $(root1) {check out line+ /cluster/data/rn3/bed/multiz.hg15-mm3/maf/$(root1).rmh.maf} #ENDLOOP "EOF" # << this line makes emacs coloring happy cd /cluster/bluearc/hg/rn3/bed/blastz.hg15.2003-07-09/maf # NOTE: probably want a better way to make the chrom list ls *.maf | awk -F. '{print $1}' > $multizDir/run/chrom.list gensub2 chrom.list single gsub jobList ssh kkr9u01 set multizDir = /cluster/data/rn3/bed/multiz.hg15-mm3 cd $multizDir/run para create jobList para try para check para push # setup up external files for database to reference ssh hgwdev mkdir -p /gbdb/rn3/multizHg15Mm3 cd /gbdb/rn3/multizHg15Mm3 foreach f (/cluster/data/rn3/bed/multiz.hg15-mm3/maf/*.maf) ln -s $f . end # load into database # NOTE: use "warn" option due to presence of score=0 aligments in MAF files /cluster/bin/i386/hgLoadMaf -warn rn3 multizHg15Mm3 #Couldn't find rn3. sequence line 819945 of /gbdb/rn3/multizHg15Mm3/chrX.rmh.maf #Indexing and tabulating /gbdb/rn3/multizHg15Mm3/chrX_random.rmh.maf #Couldn't find rn3. 
sequence line 4341 of /gbdb/rn3/multizHg15Mm3/chrX_random.rmh.maf #Loading multizHg15Mm3 into database #Warning: load of multizHg15Mm3 did not go as planned: 3512170 record(s), 0 row(s) skipped, 27410 warning(s) loading ./multizHg15Mm3.tab # AUTO UPDATE GENBANK MRNA RUN (DONE - 2003-07-07 - Hiram) ssh eieio cd /cluster/store5/genbank set db = rn3 set nibGlob = '/cluster/bluearc/rat/rn3/softNib/chr*.nib' set liftFile = /cluster/bluearc/rat/rn3/liftAll.lft # the iservers were acting up, the following uses the bluearc instead # make sure 'ssh localhost' works before running this: nice bin/gbAlignStep -verbose=1 -initial -iserver=localhost \ -clusterdir=/cluster/bluearc/genbank/work \ -srcDb=genbank $db "$nibGlob" $liftFile & # This job took almost two days # Checking finished jobsCompleted: 216306 of 216306 jobs # CPU time in finished jobs: 74560544s 1242675.73m 20711.26h 862.97d 2.364 y # IO & Wait Time: 1371840s 22864.00m 381.07h 15.88d 0.044 y # Average job time: 351s 5.85m 0.10h 0.00d # Longest job: 2478s 41.30m 0.69h 0.03d # Submission to last job: 151934s 2532.23m 42.20h 1.76d # To watch the progress of your cluster job, go to machine kk and # cd /cluster/store5/genbank/work/initial.rn3/align # where the batch file is. You can now do normal parasol checking # operations. Beware of "para check" it can take quite a bit of time # if your batch job is very large. 'parasol list batches' or # 'parasol list users' may be quicker to take a look at status. # After that is finished successfully, load the mRNAs: # The drop and load is faster if tables have been loaded before. ssh hgwdev cd /cluster/store5/genbank ./bin/i386/gbLoadRna -drop rn3 nice ./bin/gbDbLoadStep -verbose=1 -initialLoad rn3 # This load took about 7 hours # native mRNAs were suspect due to bug in gbBlat script, redoing them. # alignment now uses a config file, so add rn3 to etc/genbank.conf # remove suspect mRNAs. 
rm -f data/aligned/genbank.136.0/rn3/*/mrna.* # align genbank and refseq mrnas, using bluearc for cluster fs: nice bin/gbAlignStep -type=mrna -verbose=1 -initial -iserver=no -clusterRootDir=/cluster/bluearc/genbank rn3& # need to clean out all mRNAs from database, but leave ESTs inplace. - drop these tables: chr*_mrna all_mrna xenoMrna mrnaOrientInfo - selectively delete mrna related rows: delete from imageClone where type = 'mRNA'; delete from mrna where type = 'mRNA'; delete from gbStatus where type = 'mRNA'; delete from gbSeq where type = 'mRNA'; delete from gbExtFile where path like '%/mrna.fa'; # load genbank and refseq mRNAs into database nice ./bin/gbDbLoadStep -type=mrna -verbose=1 rn3 # refseq protein ids included version, which caused searches to fail. # clean our refseq sequences and reload: drop table refFlat,refGene,refLink,refSeqAli,refSeqStatus; delete from gbSeq where acc like 'NM_%' or acc like 'NP_%'; delete from gbStatus where acc like 'NM_%' or acc like 'NP_%'; delete from mrna where acc like 'NM_%'; delete from gbLoaded where srcDb='RefSeq'; nice ./bin/gbDbLoadStep -verbose=1 rn3 # GENBANK: add Non-Rat Refseqs (DONE May 9, 2005, Heather) # add to kent/src/hg/makeDb/genbank/etc/genbank.conf and commit: # rn3.refseq.mrna.xeno.load = yes cd /cluster/data/genbank/etc cvs update # new tables: xenoRefGene, xenoRefFlat, xenoRefSeqAli, refLink, refSeqStatus # refSeqSummary CREATE RNACLUSTER TABLE (TO DO) # Make sure that refSeqAli, estOrientInfo and mrnaOrientInfo tables are # made already (see above). ssh hgwdev mkdir -p ~/rn3/bed/rnaCluster/chrom cd ~/rn3/bed/rnaCluster foreach i (~/rn3/?{,?}) foreach f ($i/chr*.fa) set c = $f:t:r clusterRna rn3 /dev/null chrom/$c.bed -chrom=$c echo done $c end end hgLoadBed rn3 rnaCluster chrom/*.bed PRODUCING GENSCAN PREDICTIONS (DONE - 2003-07-09 - Hiram) # Log into kkr1u00 (not kk!). kkr1u00 is the driver node for the small # cluster (kkr2u00 -kkr8u00. 
(genscan has problem running on the # big cluster, due to limitation of memory and swap space on each # processing node). ssh kkr1u00 mkdir -p ~/rn3/bed/genscan cd ~/rn3/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/store3/rn3/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/home/hiram/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try para check para push # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # chr14_21, chr16_4 # There was one job that did crash in this manner. 
# It was run manually: # /cluster/home/hiram/bin/i386/gsBig /cluster/store3/rn3/10/chr10_3/chr10_3.fa.masked gtf/chr10_3.fa.gtf -trans=pep/chr10_3.fa.pep -subopt=subopt/chr10_3.fa.bed -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=1200000 # Checking finished jobsCompleted: 590 of 591 jobs # Crashed: 1 jobs # CPU time in finished jobs: 312553s 5209.22m 86.82h 3.62d 0.010 y # IO & Wait Time: 1927s 32.11m 0.54h 0.02d 0.000 y # Average job time: 533s 8.88m 0.15h 0.01d # Longest job: 23072s 384.53m 6.41h 0.27d # Submission to last job: 43744s 729.07m 12.15h 0.51d # Convert these to chromosome level files as so: ssh kkstore cd ~/rn3/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd ~/rn3/bed/genscan ldHgGene rn3 genscan genscan.gtf hgPepPred rn3 generic genscanPep genscan.pep hgLoadBed rn3 genscanSubopt genscanSubopt.bed TWINSCAN GENE PREDICTIONS (reloaded -gtf 2004-04-01 kate) mkdir -p ~/rn3/bed/twinscan cd ~/rn3/bed/twinscan wget http://genome.cse.wustl.edu/~bio/rat/Jan03/rat_Jan03_03-26-03.tgz gunzip -c *.tgz | tar xvf - rm -r chr_tx # clean up chrom field of GTF files foreach f (chr_gtf/chr*.gtf) set chr = $f:t:r sed -e "s/^[a-zA-Z0-9]*/$chr/" $f > chr_gtf/$chr-fixed.gtf end # pare down protein FASTA header to id and add missing .a: foreach f (chr_ptx/chr*.ptx) set chr = $f:t:r perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1.a/;' < \ chr_ptx/$chr.ptx > chr_ptx/$chr-fixed.fa end ldHgGene rn3 twinscan chr_gtf/chr*-fixed.gtf -gtf hgPepPred rn3 generic twinscanPep chr_ptx/chr*-fixed.fa /cluster/bin/scripts/runGeneCheck /cluster/data/rn3/bed/twinscan # 526 badFrame # 452 gap # 320 noStart # 305 noStop # 9 inFrameStop PRODUCING TETRAODON FISH ALIGNMENTS (TO DO) o - Download sequence from ... 
and put it on the cluster local disk at /scratch/hg/fish o - Do fish/rat alignments. ssh kk cd ~/rn3/bed mkdir blatFish cd blatFish mkdir psl ls -1S /scratch/hg/fish/* > fish.lst ls -1S /scratch/hg/rn3/trfFa/* > rat.lst cp ~/lastRn/blatFish/gsub . gensub2 rat.lst fish.lst gsub spec para create spec para try Make sure jobs are going ok with para check. Then para push wait about 2 hours and do another para push do para checks and if necessary para pushes until done or use para shove. o - Sort alignments as so pslCat -dir psl | liftUp -type=.psl stdout ~/rn3/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin o - Copy to hgwdev:/scratch. Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/rn3/bed/blatFish/chrom foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatFish.psl end hgLoadPsl -noTNameIx rn3 *.psl hgLoadRna addSeq rn3 /cluster/store2/fish/seq15jun2001/*.fa # PRODUCING SQUIRT ALIGNMENTS (TO DO) ssh kkstore mkdir -p ~/rn3/bed/blatCi1 cd ~/rn3/bed/blatCi1 ls -1S /iscratch/i/squirt/ci1/queryFa/*.fa > squirt.lst ls -1S /scratch/hg/rn3/trfFa/* > rat.lst rm -rf psl foreach ctg (`cat rat.lst`) mkdir -p psl/$ctg:t:r end # get gsub2D from someplace gensub2 rat.lst squirt.lst gsub2D spec ssh kk cd ~/rn3/bed/blatCi1 para create spec .... 
# When cluster run is done, sort alignments: ssh kkstore cd ~/rn3/bed/blatCi1 mkdir /tmp/$LOGNAME pslSort dirs raw.psl /tmp/$LOGNAME psl/* pslReps raw.psl cooked.psl /dev/null -minAli=0.05 liftUp -nohead lifted.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /tmp/$LOGNAME lifted.psl # Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/rn3/bed/blatCi1/chrom rm -f chr*_blatCi1.psl foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatCi1.psl end hgLoadPsl -noTNameIx rn3 *.psl # Make squirt /gbdb/ symlink mkdir /gbdb/rn3/squirtSeq cd /gbdb/rn3/squirtSeq ln -s /cluster/store5/squirt/ci1/ciona.rm.fasta PRODUCING FUGU FISH ALIGNMENTS (TO DO) # (Already done, for mm2:) # Download sequence to /cluster/store3/fuguSeq from ... and put it on the # cluster local disk at /scratch/hg/fugu on kkstore. # Sequence was downloaded from: # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z # mkdir split2.5Mb; cd split2.5Mb; # faSplit about ../fugu_v3_mask.fasta 2500000 fuguSplit ssh kkr1u00 rm -rf /iscratch/i/fugu mkdir /iscratch/i/fugu cp -p /cluster/store3/fuguSeq/split2.5Mb/*.fa /iscratch/i/fugu ~kent/bin/iSync ssh kk mkdir ~/rn3/bed/blatFugu cd ~/rn3/bed/blatFugu ls -1S /iscratch/i/fugu/* > fugu.lst ls -1S /scratch/hg/rn3/trfFa/* > rat.lst cp ~/lastRn/bed/blatFugu/gsub . 
mkdir psl
foreach f (~/rn3/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa)
    set c=$f:t:r
    mkdir psl/$c
end
gensub2 rat.lst fugu.lst gsub spec
para create spec
para try
para check
para push
para check

# Sort alignments:
ssh kkstore
cd ~/rn3/bed/blatFugu
pslCat -dir psl/* \
  | liftUp -type=.psl stdout ~/rn3/jkStuff/liftAll.lft warn stdin \
  | pslSortAcc nohead chrom /cluster/store2/temp stdin

# load into database:
ssh hgwdev
cd ~/rn3/bed/blatFugu/chrom
foreach i (chr?{,?}{,_random}.psl)
    set r = $i:r
    mv $i ${r}_blatFugu.psl
end
hgLoadPsl -noTNameIx rn3 *.psl
mkdir -p /gbdb/rn3/fuguSeq
cd /gbdb/rn3/fuguSeq
ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
cd /cluster/store2/temp
hgLoadRna addSeq rn3 /gbdb/rn3/fuguSeq/fugu_v3_mask.fasta

MAKE LIFT FILE FOR AGPS (TO DO)
    ssh kkstore
    # NOTE(review): the relative paths below (chrom.sizes, ?{,?}/chr*.agp,
    # jkStuff/liftRNOR.lft) only resolve from ~/rn3, not ~/rn3/jkStuff,
    # so cd to ~/rn3 here (original said "cd ~/rn3/jkStuff")
    cd ~/rn3
    ./jkStuff/agpToLift.pl chrom.sizes ?{,?}/chr?{,?}{,_random}.agp \
        > jkStuff/liftRNOR.lft

LOAD BACTIG POSITIONS (DONE - 2003-07-14 - Hiram)
    ssh hgwdev
    mkdir -p ~/rn3/bed/bactigPos
    cd ~/rn3/bed/bactigPos
    awk "-F\t" '{printf "%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6;}' < ../../bactigs.contigs > bactigPos.bed
    hgLoadBed rn3 bactigPos bactigPos.bed \
        -noBin -sqlTable=$HOME/kent/src/hg/lib/bactigPos.sql

LOAD CPGISLANDS (DONE - 2003-07-08 - Hiram)
    ssh hgwdev
    mkdir -p ~/rn3/bed/cpgIsland
    cd ~/rn3/bed/cpgIsland
    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # copy the tar file to the current directory
    cp -p /cluster/store4/rn2/bed/cpgIsland/cpg_dist.tar .
    tar xvf cpg_dist.tar
    cd cpg_dist
    gcc readseq.c cpg_lh.c -o cpglh.exe
    ssh kkstore
    cd ~/rn3/bed/cpgIsland
    foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
        set fout=$f:t:r:r.cpg
        echo running cpglh on $f to $fout
        # $fout already ends in .cpg, so redirect to $fout
        # (original ">$fout.cpg" would have produced doubled
        # chrN.cpg.cpg file names, contradicting the echo above)
        ./cpg_dist/cpglh.exe $f > $fout
    end
    # copy filter.awk from a previous release
    cp -p /cluster/store4/rn2/bed/cpgIsland/filter.awk .
awk -f filter.awk chr*.cpg > cpgIsland.bed # load into database: ssh hgwdev cd ~/rn3/bed/cpgIsland hgLoadBed rn3 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed LOAD SOFTBERRY GENES (DONE - 2003-07-17 - Hiram) cd /cluster/store3/rn3/bed mkdir softberry cd softberry wget \ ftp://www.softberry.com/pub/SC_RAT_JUN03/Softb_fgenesh_rat_jun03.tar.gz tar xvzf Softb_fgenesh_rat_jun03.tar.gz ldHgGene rn3 softberryGene chr*.gff # 43912 groups 22 seqs 1 sources 4 feature types # 43912 gene predictions hgPepPred rn3 softberry *.protein hgSoftberryHom rn3 *.protein LOAD GENEID GENES (DONE - 2003-07-08 - Hiram RELOADED -gtf 2004-04-02 kate) mkdir -p ~/rn3/bed/geneid/download cd ~/rn3/bed/geneid/download foreach f (~/rn3/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://genome.imim.es/genepredictions/R.norvegicus/rnJun2003/geneid_v1.1/$chr.gtf wget http://genome.imim.es/genepredictions/R.norvegicus/rnJun2003/geneid_v1.1/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene rn3 geneid download/*.gtf -gtf # 37571 gene predictions hgPepPred rn3 generic geneidPep download/*-fixed.prot SGP GENE PREDICTIONS (DONE - 2003-07-30 - Hiram) mkdir -p ~/rn3/bed/sgp/download cd ~/rn3/bed/sgp/download foreach f (~/rn3/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://monstre1.imim.es/genepredictions/R.norvegicus/rnJun2003/SGP/humangp20030410/$chr.gtf wget http://monstre1.imim.es/genepredictions/R.norvegicus/rnJun2003/SGP/humangp20030410/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
ldHgGene rn3 sgpGene download/*.gtf -exon=CDS hgPepPred rn3 generic sgpPep download/*-fixed.prot SGP GENES (UPDATE 1/18/2006) sgpPep table dropped, replaced by hgc generated protein seq in browser LOAD STS MAP (TO DO ) - login to hgwdev cd ~/rn3/bed rn3 < ~/src/hg/lib/stsMap.sql mkdir stsMap cd stsMap bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed - Enter database with "rn3" command. - At mysql> prompt type in: load data local infile 'stsMap.bed' into table stsMap; - At mysql> prompt type LOAD MGI IDs (TO DO) - The Locuslink ID to MGI IDs converstion data file, LL2MGI.txt, from Jackson Lab should be found under ~/rn3/bed/refSeq - login to hgwdev cd ~/rn3/bed/refSeq rn3 < ~/src/hg/lib/mgiID.sql - Enter database with "rn3" command. - At mysql> prompt type in: load data local infile 'LL2MGI.txt' into table MGIid; - At mysql> prompt type quit LOAD RATREF TRACK (TO DO) First copy in data from kkstore to ~/rn3/bed/ratRef. Then substitute 'genome' for the appropriate chromosome in each of the alignment files. Finally do: hgRefAlign webb rn3 ratRef *.alignments LOAD AVID RAT TRACK (TO DO) ssh cc98 cd ~/rn3/bed mkdir avidRat cd avidRat wget http://pipeline.lbl.gov/tableCS-LBNL.txt hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed hgLoadBed avidRepeat avidRepeat.bed hgLoadBed avidUnique avidUnique.bed LOAD SNPS (TO DO) - ssh hgwdev - cd ~/rn3/bed - mkdir snp - cd snp - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/rat.b27.out.gz - Unpack. createBed < rat.b27.out > snpNih.bed hgLoadBed rn3 snpNih snpNih.bed LOAD ENSEMBL ESTs (TO DO) ln -s /cluster/store3/rn3 ~/rn3 mkdir -p ~/rn3/bed/ensembl cd ~/rn3/bed/ensembl wget http://www.ebi.ac.uk/~stabenau/rat-est.gz wget http://www.ebi.ac.uk/~stabenau/rat-est.pep.gz gunzip -c rat-est.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > rat-est-fixed.gtf ldHgGene rn3 ensEst rat-est-fixed.gtf > The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing. 
# pick out the transcript= attribute -- that's the id to use: # also remove the first line: gunzip -c rat-est.pep.gz | tail +2 | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ rat-est-fixed.pep hgPepPred rn3 generic ensEstPep rat-est-fixed.pep #### LOAD ENSEMBL GENES (DONE - 2004-04-29 - Fan) # ensGene is needed for Superfamily track, Gene Sorter, and Proteome Browser # Ensembl released Rat build 3.1 the week of Feb 9th, 2004 mkdir /cluster/data/rn3/bed/ensembl cd /cluster/data/rn3/bed/ensembl Get the ensembl gene data from http://www.ensembl.org/ Go to the EnsMart link Choose Rattus norvegicus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Structures" tab. Page 4) Choose GTF as the ouput, choose gzip compression , name the output file ensGeneRn3 and then hit Export # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. zcat ensGeneRn3.gff.gz | grep -v ^6_DR51 | grep -v _NT_ > unrandom.gtf # Let's see how much it loses: # For Rn3 there is no difference # # Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene sed -e "s/^/chr/" unrandom.gtf > ensGene.gtf # change chr*.random to chr*_random perl -wpe 's/(chr.*)\.random/$1_random/' ensGene.gtf > ensGeneFixed.gtf ldHgGene rn3 ensGene ensGeneFixed.gtf # Read 28545 transcripts in 475909 lines in 1 files # 28545 groups 36 seqs 1 sources 4 feature types # 28545 gene predictions # save space, gzip them: gzip unrandom.gtf gzip ensGene.gtf o - Load Ensembl peptides: Get the ensembl protein data from http://www.ensembl.org/ Go to the EnsMart link Choose Rattus norvegicus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. 
Page 3) Choose the "Sequences" tab. Page 4) Choose Transcripts/Proteins and peptide Only as the output, choose text/fasta and gzip compression, name the file ensGeneRn3.pep and then hit export. If needed, substitute ENST for ENSP in ensPep with the program called subs edit subs.in to read: ENSP|ENST #subs -e rn3EnsGene.pep > /dev/null #delete * at end of each protein zcat ensGeneRn3.pep.fasta.gz | sed "s/\*$//" > ensembl.pep ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep hgPepPred rn3 generic ensPep fixPep_ensembl.pep o - Load ensGtp table. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" tab. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format, gzip. # Result name file as ensGtpRn3.tsv.gz gunzip ensGtpRn3.tsv.gz hgsql rn3 < ~/kent/src/hg/lib/ensGtp.sql hgsql -N -e 'load data local infile "ensGtpRn3.tsv" into table ensGtp ignore 1 lines;' rn3 LOAD RNAGENES (TO DO) - login to hgwdev - cd ~kent/src/hg/lib - rn3 < rnaGene.sql - cd /cluster/store3/rn3/bed - mkdir rnaGene - cd rnaGene - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz - gunzip *.gz - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff - hgRnaGenes rn3 chrom.gff LOAD EXOFISH (TO DO) (SEE ECORES TRACKS BELOW, hartera) - login to hgwdev - cd /cluster/store3/rn3/bed - mkdir exoFish - cd exoFish - rn3 < ~kent/src/hg/lib/exoFish.sql - Put email attatchment from Olivier Jaillon (ojaaillon@genoscope.cns.fr) into /cluster/store3/rn3/bed/exoFish/all_maping_ecore - awk -f filter.awk all_maping_ecore > exoFish.bed - hgLoadBed rn3 exoFish exoFish.bed LOAD GENIE (TO DO) mkdir -p ~/rn3/bed/genieAlt cd ~/rn3/bed/genieAlt wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz gunzip -c mgscv3.genie.gtf.tgz | tar xvf - ldHgGene rn3 genieAlt mgscv3.genie.gtf/chr*.gtf 
wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz
gunzip -c mgscv3.aa.tgz | tar xvf -
hgPepPred rn3 genie geniePep chr*.aa.fa

LOAD GENIE CLONE BOUNDS (TO DO)
    mkdir -p ~/rn3/bed/genieBounds
    cd ~/rn3/bed/genieBounds
    wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz
    gunzip -c mgscv3_cb.bed.tgz | tar xvf -
    - Trim the track definition from each file (these are actually
      custom track files):
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un)
        tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed
    end
    hgLoadBed rn3 genieBounds *-fixed.bed

LOAD SOFTBERRY GENES (TO DO)
    - ln -s /cluster/store3/rn3 ~/rn3
    - cd ~/rn3/bed
    - mkdir softberry
    - cd softberry
    - get ftp://www.softberry.com/pub/SC_MOU_NOV01/softb_mou_genes_nov01.tar.gz
    ldHgGene rn3 softberryGene chr*.gff
    hgPepPred rn3 softberry *.protein
    hgSoftberryHom rn3 *.protein

LOAD GENOMIC DUPES (TO DO)
    o - Load genomic dupes
    ssh hgwdev
    cd ~/rn3/bed
    mkdir genomicDups
    cd genomicDups
    wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip
    unzip *.zip
    awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
    hgsql rn3 < ~/src/hg/lib/genomicDups.sql
    # (fixed typo: load the genomicDups.bed file created above,
    # original said "genomicDupes.bed" which is never created)
    hgLoadBed rn3 -oldTable genomicDups genomicDups.bed

TWINSCAN GENE PREDICTIONS (DONE 07/14/03)
    mkdir -p ~/rn3/bed/twinscan
    cd ~/rn3/bed/twinscan
    wget http://genome.cse.wustl.edu/~bio/rat/Jun03/rat_Jun03_07-11-03.tgz
    gunzip -c *.tgz | tar xvf -
    rm -r chr_tx
    # clean up chrom field of GTF files
    foreach f (chr_gtf/chr*.gtf)
        set chr = $f:t:r
        sed -e "s/^[a-zA-Z0-9]*/$chr/" $f > chr_gtf/$chr-fixed.gtf
    end
    # pare down protein FASTA header to id and add missing .a:
    foreach f (chr_ptx/chr*.ptx)
        set chr = $f:t:r
        perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1.a/;' < \
            chr_ptx/$chr.ptx > chr_ptx/$chr-fixed.fa
    end
    ldHgGene rn3 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS
    hgPepPred rn3 generic twinscanPep chr_ptx/chr*-fixed.fa

AXTBEST
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh kkstore tcsh set base="/cluster/store3/rn3/bed/blastz.hg15" set tbl="blastzBestHg15" cd $base mkdir -p axtBest pslBest #chrom 1 is too big axtFilter -tStartMax=99999999 chr1.axt >chr1FirstHalf.axt axtFilter -tStartMin=100000000 chr1.axt >chr1SecondHalf.axt foreach chrdir (`cat chrom.lst`) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 end axtBest axtChrom/chr1FirstHalf.axt chr1 axtBest/chr1FirstHalf.axt -minScore=300 axtBest axtChrom/chr1SecondHalf.axt chr1 axtBest/chr1SecondHalf.axt -minScore=300 cd axtBest cat chr1FirstHalf.axt chr1SecondHalf.axt > chr1.axt rm chr1FirstHalf.axt chr1SecondHalf.axt cd ../axtChrom rm chr1FirstHalf.axt chr1SecondHalf.axt cd .. foreach chrdir (`cat chrom.lst`) set chr=$chrdir:t echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store3/rn3/bed/blastz.hg15" set tbl="blastzBestHg15" cd $base/pslBest hgLoadPsl -noTNameIx hg15 chr*_${tbl}.psl # TBA ALIGNMENT - NON-MAMMAL SPECIES INCLUDED (Fugu & Chicken) 2003-10-20 kate ssh hgwdev mkdir -p /cluster/data/rn3/bed/nisc/cftr ln -s /cluster/data/nisc/targets/cftr/CFTR.non-mammal/rat.out \ /cluster/data/rn3/bed/nisc/cftr/tbaFishBird.maf set table = tbaFishBirdCFTR mkdir -p /gbdb/rn3/$table cd /gbdb/rn3/$table ln -s /cluster/data/rn3/bed/nisc/cftr/tbaFishBird.maf tba.maf cd /cluster/data/rn3/bed/nisc/cftr /cluster/bin/i386/hgLoadMaf -WARN rn3 $table # 463 warnings # 4604 rows loaded # BLASTZ MOUSE MM4 (DONE - 2003-11-03 - Hiram) ssh kk mkdir -p /cluster/data/rn3/bed/blastz.mm4.2003-10-29 cd /cluster/data/rn3/bed ln -s blastz.mm4.2003-10-29 blastz.mm4 cd blastz.mm4 cat << '_EOF_' > DEF # rat vs. 
mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Rat SEQ1_DIR=/iscratch/i/rn3/bothMaskedNibs # RMSK not currently used SEQ1_RMSK=/cluster/bluearc/rat/rn3/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/scratch/mus/mm4/softNib # RMSK not currently used SEQ2_RMSK=/scratch/mus/mm4/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInRat SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/rn3/bed/blastz.mm4 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # prepare first cluster run source DEF cd /cluster/data/rn3/bed/blastz.mm4 /cluster/data/mm4/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # Completed: 39935 of 39936 jobs # Crashed: 1 jobs # CPU time in finished jobs: 16180312s 269671.87m 4494.53h 187.27d 0.513 y # IO & Wait Time: 687304s 11455.06m 190.92h 7.95d 0.022 y # Average job time: 422s 7.04m 0.12h 0.00d # Longest job: 5808s 96.80m 1.61h 0.07d # Submission to last job: 44812s 746.87m 12.45h 0.52d # the crashed job: # /cluster/home/angie/schwartzbin/blastz-run chr5.nib 1 10010000 chr14.nib 60000001 90000000 /cluster/data/rn3/bed/blastz.mm4/DEF # seq_read(/tmp/blastz.DkVf00/s1.fa): Input/output error # try it again and it works OK # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kkr1u00 cd /cluster/data/rn3/bed/blastz.mm4 source DEF /cluster/data/mm4/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 312 of 312 jobs # CPU time in finished jobs: 35956s 599.27m 9.99h 0.42d 0.001 y # IO & Wait Time: 15078s 251.30m 4.19h 0.17d 0.000 y # Average job time: 164s 2.73m 0.05h 0.00d # Longest job: 642s 10.70m 0.18h 0.01d # Submission to last job: 5811s 96.85m 1.61h 0.07d # Third cluster run to convert lav's to axt's ssh kk cd /cluster/data/rn3/bed/blastz.mm4 source DEF /cluster/data/mm4/jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # Completed: 37 of 44 jobs # Crashed: 7 jobs # CPU time in finished jobs: 5646s 94.10m 1.57h 0.07d 0.000 y # IO & Wait Time: 74584s 1243.07m 20.72h 0.86d 0.002 y # Average job time: 2168s 36.14m 0.60h 0.03d # Longest job: 18498s 308.30m 5.14h 0.21d # Submission to last job: 97292s 1621.53m 27.03h 1.13d # and a recover run of the 7 failing jobs # Completed: 4 of 7 jobs # Crashed: 3 jobs # CPU time in finished jobs: 2333s 38.89m 0.65h 0.03d 0.000 y # IO & Wait Time: 32031s 533.84m 8.90h 0.37d 0.001 y # Average job time: 8591s 143.18m 2.39h 0.10d # Longest job: 9314s 155.23m 2.59h 0.11d # Submission to last job: 10283s 171.38m 2.86h 0.12d # FAILED: chr1, chr4, chr5 # try these on kolossus ssh kolossus cd /cluster/data/rn3/bed/blastz.mm4/run.2 time /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/rn3/bed/blastz.mm4/lav/chr1 \ /cluster/data/rn3/bed/blastz.mm4/axtChrom/chr1.axt \ /cluster/data/rn3/nib /cluster/data/mm4/nib time /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/rn3/bed/blastz.mm4/lav/chr4 \ /cluster/data/rn3/bed/blastz.mm4/axtChrom/chr4.axt \ /cluster/data/rn3/nib /cluster/data/mm4/nib time /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/rn3/bed/blastz.mm4/lav/chr5 \ /cluster/data/rn3/bed/blastz.mm4/axtChrom/chr5.axt \ /cluster/data/rn3/nib /cluster/data/mm4/nib # about 75 minutes in the above jobs # translate sorted axt files into psl ssh kkstore cd /cluster/data/rn3/bed/blastz.mm4 mkdir -p pslChrom set tbl = "blastzMm4" foreach f (axtChrom/chr*.axt) set c=$f:t:r 
if (-e pslChrom/${c}_${tbl}.psl) then echo "Done pslChrom/${c}_${tbl}.psl" else echo "Processing chr ${c}" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl endif end # That takes about 20 minutes # Load database tables ssh hgwdev set tbl = "blastzMm4" cd /cluster/data/rn3/bed/blastz.mm4/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_${tbl}.psl # real 162m7.520s # user 32m40.530s # sys 5m31.500s # check results, need to run this on kolossus to avoid an # out-of-memory problem: # featureBits rn3 blastzMm4 # 1804072721 bases of 2571104688 (70.167%) in intersection # featureBits rn3 blastzMm3 # 1669090687 bases of 2571104688 (64.917%) in intersection # CHAIN Mm4 BLASTZ (WORKING - 2003-11-03 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster ssh kkr1u00 mkdir -p /cluster/data/rn3/bed/blastz.mm4/axtChain/run1 cd /cluster/data/rn3/bed/blastz.mm4/axtChain/run1 mkdir out chain ls -1S /cluster/data/rn3/bed/blastz.mm4/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/sh axtFilter -notQ_random $1 | axtChain stdin \ /iscratch/i/rn3/bothMaskedNibs \ /scratch/mus/mm4/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 41 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... 
# Completed: 44 of 44 jobs # CPU time in finished jobs: 53019s 883.66m 14.73h 0.61d 0.002 y # IO & Wait Time: 33812s 563.53m 9.39h 0.39d 0.001 y # Average job time: 1973s 32.89m 0.55h 0.02d # Longest job: 22348s 372.47m 6.21h 0.26d # Submission to last job: 22348s 372.47m 6.21h 0.26d # now on the cluster server, sort chains ssh kkstore cd /cluster/data/rn3/bed/blastz.mm4/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 8m55.100s # user 6m46.680s # sys 0m58.250s time chainSplit chain all.chain # real 8m36.259s # user 5m46.920s # sys 0m59.700s # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm4/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain rn3 ${c}_chainMm4 $i echo done $c end # NET Mm4 BLASTZ (TBD) ssh kkstore cd /cluster/data/rn3/bed/blastz.mm4/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/rn3/chrom.sizes \ /cluster/data/mm4/chrom.sizes ../preNet/$i end # real 11m58.018s # user 4m10.390s # sys 2m10.780s cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/rn3/chrom.sizes \ /cluster/data/mm4/chrom.sizes ../n1/$n /dev/null end cd .. 
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 1591906304, utime 6877 s/100, stime 1977

ssh hgwdev
cd /cluster/data/rn3/bed/blastz.mm4/axtChain
# NOTE(review): the -tNewR/-qNewR paths below look copied from a
# human/mouse make doc.  For this rat(rn3)-vs-mouse(mm4) net, the
# target lineage-specific repeats should presumably be rn3's
# linSpecRep.notInMouse and the query's mm4's linSpecRep.notInRat,
# not "notInHuman" / human gs.17 paths -- verify before reusing.
time netClass hNoClass.net rn3 mm4 mouse.net \
    -tNewR=/cluster/bluearc/scratch/mus/rn3/linSpecRep.notInHuman \
    -qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse
# real 11m35.367s
# user 7m20.880s
# sys 1m33.360s

# If things look good do
ssh kkstore
cd /cluster/data/rn3/bed/blastz.mm4/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net

# Load the nets into database
ssh hgwdev
cd /cluster/data/rn3/bed/blastz.mm4/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet rn3 netMm4 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet rn3 syntenyNetMm4 stdin
# Add entries for net and chain to rat/rn3 trackDb

# make net
ssh kkstore
cd /cluster/data/rn3/bed/blastz.mm4/axtChain
mkdir mouseNet
time netSplit mouse.net mouseNet
# real 6m13.469s
# user 3m22.670s
# sys 0m44.880s
mkdir ../axtNet
foreach n (mouseNet/chr*.net)
    set c=$n:t:r
    echo "netToAxt: $c.net -> $c.axt"
    rm -f ../axtNet/$c.axt
    # NOTE(review): the query nib directory below is the human
    # gs.17/build34 path; for the mm4 query it should presumably be
    # /cluster/data/mm4/nib -- verify before reusing.
    netToAxt mouseNet/$c.net chain/$c.chain \
        /cluster/data/rn3/nib \
        /cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
        ../axtNet/$c.axt
    echo "Complete: $c.net -> $c.axt"
end

ssh hgwdev
mkdir -p /cluster/data/rn3/bed/blastz.mm4/axtBest
cd /cluster/data/rn3/bed/blastz.mm4/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm4/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtNet cd /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtNet gzip *.axt # add README.txt file to dir, if needed # Convert those axt files to psl ssh kkstore cd /cluster/data/rn3/bed/blastz.mm4 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestMm4.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestMm4.psl echo "Done: ${c}_blastzBestMm4.psl" end # Load tables ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm4/pslBest time /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_blastzBestMm4.psl # real 10m47.853s # user 2m48.700s # sys 0m24.250s # check results # featureBits rn3 blastzBestMm4 # 1030510540 bases of 2627444668 (39.221%) in intersection # featureBits rn3 blastzBestMm3 # 1019460260 bases of 2505900260 (40.682%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn3/axtBestMm4 cd /gbdb/rn3/axtBestMm4 ln -s /cluster/data/rn3/bed/blastz.mm4/axtNet/chr*.axt . cd /cluster/data/rn3/bed/blastz.mm4/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/rn3/axtBestMm4/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql rn3 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. 
hgsql rn3 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST (TBD)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh kkstore
    cd /cluster/data/rn3/bed/blastz.mm4/axtNet
    mkdir -p ../axtTight
    tcsh
    foreach i (*.axt)
        echo $i
        subsetAxt $i ../axtTight/$i \
            ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir ../pslTight
    foreach i (*.axt)
        set c = $i:r
        axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
        echo "Done: $i"
    end
    # Load tables into database
    ssh hgwdev
    cd /cluster/data/rn3/bed/blastz.mm4/pslTight
    hgLoadPsl -noTNameIx rn3 chr*_blastzTightMm4.psl
    # copy the axt's to download area
    cd /cluster/data/rn3/bed/blastz.mm4/axtTight
    mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtTight
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtTight
    cd /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtTight
    gzip *.axt
    # add README.txt file to dir, if needed

# MOUSE MM4 BLASTZ CLEANUP (WORKING 2004-09-20 kate)
    ssh eieio
    # (fixed copy/paste slip: this is rn3's blastz.mm4 directory;
    # the original said /cluster/data/hg17/bed/blastz.mm4)
    cd /cluster/data/rn3/bed/blastz.mm4
    nice rm -rf raw lav
    nice rm axtChain/run1/chain/* axtChain/preNet axtChain/hNoClass.net
# GOT HERE
    nice gzip axtChrom/*.axt axtChain/{all.chain,*.net} axtChain/*Net/*.net
    nice gzip pslChrom/*.psl axtBest/*.axt

# LOAD ECgene tables (re-done with sort (braney, 2004-01-30))
    cd /cluster/data/rn3/bed
    rm -f ECgene
    mkdir ECgene.2003-12-19
    ln -s ECgene.2003-12-19 ECgene
    cd ECgene
    wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_rn3_v1.1_25oct2003_genes.txt.gz"
    wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_rn3_v1.1_25oct2003_genepep.txt.gz"
    gunzip *.gz
    ldHgGene -predTab rn3 ECgene ECgene_rn3_v1.1_25oct2003_genes.txt
    hgPepPred rn3 tab ECgenePep ECgene_rn3_v1.1_25oct2003_genepep.txt
    rm genePred.tab
    gzip *

LOAD SNPS (Done.
Daryl Thomas; February 18, 2004) # SNP processing has been condensed into a single script, # which makes snpNih, snpTsc, and snpMap # ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh # snpBuild = 119 # Run from directory $oo/bed/snp/build$snpBuild/snpMap mkdir -p $oo/bed/snp/build$snpBuild/snpMap cd $oo/bed/snp/build$snpBuild/snpMap processSnpLocations.csh rn3 rat 2 119 >& log & # check data: # wc -l snpNih.bed; rn3 -e "select count(*) from snpNih; # wc -l snpMap.bed; rn3 -e "select count(*) from snpMap; # select * from snpNih limit 5; desc snpNih; show indexes from snpNih" # select * from snpMap limit 5; desc snpMap; show indexes from snpMap" # remove temp files # rm rat* *bed.gz LOAD SNP DETAILS (Done. Daryl Thomas; February 18, 2004) # SNP processing has been condensed into a single script, # which makes dbSnpRsMm # ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh # snpBuild = 119 # Run from directory $oo/bed/snp/build$snpBuild/snpMap mkdir -p $oo/bed/snp/build$snpBuild/details/Done mkdir -p $oo/bed/snp/build$snpBuild/details/Observed cd $oo/bed/snp/build$snpBuild/details processSnpDetails.csh rn3 rat 119 >& log & load data local infile "$fileBase.out" into table $database.$table gzip $fileBase.out # check data: # hgFixed -e "select count(*) from dbSnpRsRn; # select * from dbSnpRsRn limit 5; desc dbSnpRsRn; show indexes from dbSnpRSRn" # remove temp files # rm dbSnpRs* # ECORES FROM GENOSCOPE [DONE, hartera, 2004-03-31] # download data from http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores# ecotigRF - ecores on Rat, genome conserved with Fugu # ecotigRT - ecores on Rat, genome conserved with Tetraodon ssh hgwdev mkdir /cluster/data/rn3/bed/ecores/ # add parse_ecotig.pl to this directory # FUGU mkdir /cluster/data/rn3/bed/ecores/fr1 cd /cluster/data/rn3/bed/ecores/fr1/ # download data for ecotigRF to this directory # parse ecotig files to produce a bed format file perl ../parse_ecotig.pl < ecotigRF > ecotigRF.bed # change from upper to 
lower case for "CHR"
    perl -pi.bak -e 's/CHR/chr/g' ecotigRF.bed
    hgLoadBed -tab rn3 ecoresFr1 ecotigRF.bed
    # TETRAODON
    mkdir /cluster/data/rn3/bed/ecores/tetraodon
    cd /cluster/data/rn3/bed/ecores/tetraodon/
    # download data for ecotigRT to this directory
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigRT > ecotigRT.bed
    # change from upper to lower case for "CHR"
    perl -pi.bak -e 's/CHR/chr/g' ecotigRT.bed
    hgLoadBed -tab rn3 ecoresTetraodon ecotigRT.bed
    # clean up
    rm *.bak
    # add entries in kent/src/hg/makeDb/trackDb/rat/rn3/trackDb.ra
    # add html for details pages to this directory:
    # ecoresFr1.html and ecoresTetraodon.html

# MAKE RAT 11.OOC FILE [DONE, 2004-05-04, hartera]
# OOC FILE REQUIRED FOR BLAT
# Use -repMatch=1024 (based on genome size, for human use 1024)
    ssh kkr1u00
    mkdir -p /cluster/bluearc/rat/rn3
    mkdir -p /cluster/data/rn3/bed/ooc
    cd /cluster/data/rn3/bed/ooc
    mkdir -p /cluster/bluearc/rat/rn3/softNib
    cp -p /cluster/data/rn3/softNib/chr*.nib /cluster/bluearc/rat/rn3/softNib
    ls -1 /cluster/data/rn3/softNib/chr*.nib > nib.lst
    /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \
        -makeOoc=/cluster/bluearc/rat/rn3/11.ooc -repMatch=1024
    # Writing /cluster/bluearc/rat/rn3/11.ooc
    # Wrote 25600 overused 11-mers to /cluster/bluearc/rat/rn3/11.ooc
    cp -p /cluster/bluearc/rat/rn3/11.ooc rn3_11.ooc
    cp -p /cluster/bluearc/rat/rn3/11.ooc /iscratch/i/rn3/rn3_11.ooc
    iSync

# Affymetrix RAE230 GeneChip track - affyRAE230
# RE-DO alignments using rat specific 11.ooc file
# [DONE, hartera, 2004-05-04]
    mkdir /projects/compbio/data/microarray/affyRat
    # Download RAE230A and RAE230B consensus sequences from Affymetrix web site
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=rae230
    # check for duplicate probes: 100 from 136745_at to 1367551_a_at
    # remove "consensus:" and ";" from FASTA headers to shorten probeset
    # names for database
    sed -e 's/consensus://' RAE230A_consensus | sed -e 's/;/ /' > RAE230_all.fa
    sed -e
's/consensus://' RAE230B_consensus | sed -e 's/;/ /' >> RAE230_all.fa # copy sequences to a directory visible on kkr1u00 cp /projects/compbio/data/microarray/affyRat/RAE230_all.fa /cluster/bluearc/affy/ # Set up cluster job to align RAE230 consensus sequences to rn3 ssh kkr1u00 cd /cluster/data/rn3/bed mkdir affyRat.2004-05-04 cd affyRat.2004-05-04 mkdir -p /iscratch/i/affy cp /cluster/bluearc/affy/RAE230_all.fa /iscratch/i/affy iSync ssh kk cd /cluster/data/rn3/bed/affyRat.2004-05-04 ls -1 /iscratch/i/affy/RAE230_all.fa > affy.lst ls -1 /scratch/rat/rn3/trfFa/ > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/rn3/rn3_11.ooc /scratch/rat/rn3/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # para time 2004-05-04 #Completed: 591 of 591 jobs # CPU time in finished jobs: 9950s 165.83m 2.76h 0.12d 0.000 y # IO & Wait Time: 5522s 92.04m 1.53h 0.06d 0.000 y # Average job time: 26s 0.44m 0.01h 0.00d # Longest job: 49s 0.82m 0.01h 0.00d # Submission to last job: 165s 2.75m 0.05h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyRAE230.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. minAli = 0.97 too high. 
    # low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp affyRAE230.psl ../../jkStuff/liftAll.lft warn contig.psl
    # shorten names in psl file
    sed -e 's/RAE230//' affyRAE230.psl > affyRAE230.psl.bak
    mv affyRAE230.psl.bak affyRAE230.psl
    # load track into database
    ssh hgwdev
    cd /cluster/data/rn3/bed/affyRat.2004-05-04
    hgsql -e 'drop table affyRAE230' rn3
    hgsql -e 'select * from history' rn3 | less
    # affyRAE230 was added as entry ix = 23
    # so add errata to say these alignments have been removed
    hgsql -e 'update history set errata = "Removed alignmnents and re-align sequences using rat specific 11.ooc file for Blat" where ix = 23' rn3
    hgLoadPsl rn3 affyRAE230.psl
    # Add consensus sequences for RAE230
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    ln -s /projects/compbio/data/microarray/affyRat/RAE230_all.fa .
    # sequences are still the same so no need to reload these
    hgLoadSeq -abbr=RAE230 rn3 /gbdb/hgFixed/affyProbes/RAE230_all.fa
    # Clean up
    rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab psl.tab

# UPDATE knownToRAE230 TABLE - USED BY THE FAMILY BROWSER [DONE, 2004-05-04, hartera]
# knownToRAE230 depends on affyRAE230 so update this table
    ssh hgwdev
    cd /cluster/data/rn3/bed/affyRat.2004-05-04
    hgsql -e 'drop table knownToRAE230' rn3
    # create and load knownToRAE230 (fixed truncated table name:
    # original said "knownToRAE23", but this section drops and
    # rebuilds knownToRAE230)
    hgMapToGene rn3 affyRAE230 knownGene knownToRAE230

# gc5Base wiggle TRACK (DONE - 2004-04-07 - Hiram)
# Perform a gc count with a 5 base
# window. Also compute a "zoomed" view for display efficiency.
    mkdir /cluster/data/rn3/bed/gc5Base
    cd /cluster/data/rn3/bed/gc5Base
    # in the script below, the 'grep -w GC' selects the lines of
    # output from hgGcPercent that are real data and not just some
    # information from hgGcPercent.
The awk computes the number # of bases that hgGcPercent claimed it measured, which is not # necessarily always 5 if it ran into gaps, and then the division # by 10.0 scales down the numbers from hgGcPercent to the range # [0-100]. Two columns come out of the awk print statement: # and which are fed into wigAsciiToBinary through # the pipe. It is set at a dataSpan of 5 because each value # represents the measurement over five bases beginning with # . The result files end up in ./wigData5. cat << '_EOF_' > runGcPercent.sh #!/bin/sh mkdir -p wigData5 mkdir -p dataLimits5 for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 rn3 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \ wigAsciiToBinary \ -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \ -name=${C} stdin 2> dataLimits5/${c} echo "done" done '_EOF_' # << emacs chmod +x runGcPercent.sh # This is going to take perhaps two hours to run. It is a lot of # data. make sure you do it on the fileserver: ssh eieio cd /cluster/data/rn3/bed/gc5Base ./runGcPercent.sh # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/rn3/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/rn3/wib/gc5Base rn3 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/rn3/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/rn3/wib/gc5Base # to speed up display for whole chromosome views, compute a "zoomed" # view and load that on top of the existing table. The savings # comes from the number of data table rows the browser needs to load # for a full chromosome view. Without the zoomed view there are # over 43,000 data rows for chrom 1. With the zoomed view there are # only 222 rows needed for the display. If your original data was # at 1 value per base the savings would be even greater. 
# Pretty much the same data calculation # situation as above, although this time note the use of the # 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average # together the data points coming out of the awk print statment over # a span of 1000 bases. Thus each coming out of wigZoom # will represent the measurement of GC in the next 1000 bases. Note # the use of -dataSpan=1000 on the wigAsciiToBinary to account for # this type of data. You want your dataSpan here to be an exact # multiple of your original dataSpan (5*200=1000) and on the order # of at least 1000, doesn't need to go too high. For data that is # originally at 1 base per value, a convenient span is: -dataSpan=1024 # A new set of result files ends up in ./wigData5_1K/*.wi[gb] cat << '_EOF_' > runZoom.sh #!/bin/sh mkdir -p wigData5_1K mkdir -p dataLimits5_1K for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 rn3 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \ -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \ -name=${C} stdin 2> dataLimits5_1K/${c} echo "done" done '_EOF_' # << emacs chmod +x runZoom.sh # This is going to take even longer than above, certainly do this # on the fileserver ssh eieio time ./runZoom.sh real 232m3.265s user 302m37.050s sys 16m13.770s # Then load these .wig files into the same database as above ssh hgwdev hgLoadWiggle -pathPrefix=/gbdb/rn3/wib/gc5Base -oldTable rn3 gc5Base \ wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/rn3/wib/gc5Base # Affymetrix RG-U34A GeneChip track - affyRGU34A # RG-U34A is the chip used for the GNF Atlas 2 expression data for Rat # (DONE, 2004-05-11, hartera) mkdir -p /projects/compbio/data/microarray/affyRat # Download RG-U34A exemplar sequences from Affymetrix web site 
# http://www.affymetrix.com/support/technical/byproduct.affx?product=rgu34 # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database sed -e 's/exemplar://' RG-U34A_exemplar | sed -e 's/;/ /' > RG-U34A_all.fa # copy sequences to a directory visible on kkr1u00 cp /projects/compbio/data/microarray/affyRat/RG-U34A_all.fa \ /cluster/bluearc/affy/ # Set up cluster job to align RG-U34A exemplar sequences to rn3 ssh kkr1u00 cd /cluster/data/rn3/bed mkdir affyU34A cd affyU34A mkdir -p /iscratch/i/affy cp /cluster/bluearc/affy/RG-U34A_all.fa /iscratch/i/affy iSync ssh kk cd /cluster/data/rn3/bed/affyU34A ls -1 /iscratch/i/affy/RG-U34A_all.fa > affy.lst ls -1 /scratch/rat/rn3/trfFa/ > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/rn3/rn3_11.ooc /scratch/rat/rn3/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # para time 2004-05-11 #Completed: 591 of 591 jobs # CPU time in finished jobs: 4705s 78.42m 1.31h 0.05d 0.000 y # IO & Wait Time: 6498s 108.29m 1.80h 0.08d 0.000 y # Average job time: 19s 0.32m 0.01h 0.00d # Longest job: 38s 0.63m 0.01h 0.00d # Submission to last job: 481s 8.02m 0.13h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyRGU34A.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. minAli = 0.97 too high. 
# low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyU34A.psl ../../jkStuff/liftAll.lft warn contig.psl # shorten names in psl file sed -e 's/RG-U34A://' affyU34A.psl > affyU34A.psl.bak mv affyU34A.psl.bak affyU34A.psl # load track into database ssh hgwdev cd /cluster/data/rn3/bed/affyU34A hgLoadPsl rn3 affyU34A.psl # Add exemplar sequences for RG-U34A # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes ln -s /projects/compbio/data/microarray/affyRat/RG-U34A_all.fa . # sequences are still the same so no need to reload these cd /cluster/data/rn3/bed/affyU34A hgLoadSeq -abbr=RG-U34A: rn3 /gbdb/hgFixed/affyProbes/RG-U34A_all.fa # Clean up rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab psl.tab # add trackDb.ra entry and affyU34A.html # CREATE knownToU34A TABLE - USED BY THE FAMILY BROWSER # [DONE, 2004-05-12, hartera] ssh hgwdev cd /cluster/data/rn3/bed/affyU34A # create and load knownToU34A hgMapToGene rn3 affyU34A knownGene knownToU34A # ADD EXPRESSION DATA TRACK BY MERGING EXPRESSION AND ALIGNMENT DATA # GNF ATLAS 2 TRACK (DONE, 2004-05-25, hartera) ssh hgwdev cd /cluster/data/rn3/bed/affyU34A hgMapMicroarray gnfRatAtlas2.bed hgFixed.gnfRatAtlas2MedianRatio \ affyU34A.psl # Loaded 8022 rows of expression data from hgFixed.gnfRatAtlas2MedianRatio # Mapped 7442, multiply-mapped 239, missed 694, unmapped 580 hgLoadBed rn3 gnfAtlas2 gnfRatAtlas2.bed ######## PROTEOME BROWSER BUILD ####### (PARTIALLY DONE - 2004-04-14 - Fan) o Create the working directory mkdir /cluster/store3/rn3/bed/pb cd /cluster/store3/rn3/bed/pb o save old PB tables (if they exist) to the backup DB, kgRn3Sav From mysql prompt: alter table rn3.pepCCntDist rename as kgRn3Sav.pepCCntDist; alter table rn3.pepExonCntDist rename as kgRn3Sav.pepExonCntDist; alter table rn3.pepHydroDist rename as kgRn3Sav.pepHydroDist; alter table 
rn3.pepIPCntDist rename as kgRn3Sav.pepIPCntDist; alter table rn3.pepMolWtDist rename as kgRn3Sav.pepMolWtDist; alter table rn3.pepMwAa rename as kgRn3Sav.pepMwAa; alter table rn3.pepPi rename as kgRn3Sav.pepPi; alter table rn3.pepPiDist rename as kgRn3Sav.pepPiDist; alter table rn3.pepResDist rename as kgRn3Sav.pepResDist; alter table rn3.pbAaDistA rename as kgRn3Sav.pbAaDistA; alter table rn3.pbAaDistC rename as kgRn3Sav.pbAaDistC; alter table rn3.pbAaDistD rename as kgRn3Sav.pbAaDistD; alter table rn3.pbAaDistE rename as kgRn3Sav.pbAaDistE; alter table rn3.pbAaDistF rename as kgRn3Sav.pbAaDistF; alter table rn3.pbAaDistG rename as kgRn3Sav.pbAaDistG; alter table rn3.pbAaDistH rename as kgRn3Sav.pbAaDistH; alter table rn3.pbAaDistI rename as kgRn3Sav.pbAaDistI; alter table rn3.pbAaDistK rename as kgRn3Sav.pbAaDistK; alter table rn3.pbAaDistL rename as kgRn3Sav.pbAaDistL; alter table rn3.pbAaDistM rename as kgRn3Sav.pbAaDistM; alter table rn3.pbAaDistN rename as kgRn3Sav.pbAaDistN; alter table rn3.pbAaDistP rename as kgRn3Sav.pbAaDistP; alter table rn3.pbAaDistQ rename as kgRn3Sav.pbAaDistQ; alter table rn3.pbAaDistR rename as kgRn3Sav.pbAaDistR; alter table rn3.pbAaDistS rename as kgRn3Sav.pbAaDistS; alter table rn3.pbAaDistT rename as kgRn3Sav.pbAaDistT; alter table rn3.pbAaDistV rename as kgRn3Sav.pbAaDistV; alter table rn3.pbAaDistW rename as kgRn3Sav.pbAaDistW; alter table rn3.pbAaDistY rename as kgRn3Sav.pbAaDistY; alter table rn3.pbAnomLimit rename as kgRn3Sav.pbAnomLimit; alter table rn3.pbResAvgStd rename as kgRn3Sav.pbResAvgStd; alter table rn3.pbStamp rename as kgRn3Sav.pbStamp; o Define pep* tables in rn3 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql First edit out pepPred table definition, then hgsql rn3 < pepAll.sql o Build the pepMwAa table hgsql proteins040315 -e "select info.acc, molWeight, aaSize from sp040315.info, sp040315.accToTaxon where accToTaxon.taxon=10116 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql rn3 < protAcc.lis pbCalPi 
protAcc.lis sp040315 pepPi.tab hgsql rn3 <pbAaAll.sql hgsql rn3 < pbAaAll.sql From mysql prompt: use rn3; load data local infile "pbAaDistW.tab" into table rn3.pbAaDistW; load data local infile "pbAaDistC.tab" into table rn3.pbAaDistC; load data local infile "pbAaDistM.tab" into table rn3.pbAaDistM; load data local infile "pbAaDistH.tab" into table rn3.pbAaDistH; load data local infile "pbAaDistY.tab" into table rn3.pbAaDistY; load data local infile "pbAaDistN.tab" into table rn3.pbAaDistN; load data local infile "pbAaDistF.tab" into table rn3.pbAaDistF; load data local infile "pbAaDistI.tab" into table rn3.pbAaDistI; load data local infile "pbAaDistD.tab" into table rn3.pbAaDistD; load data local infile "pbAaDistQ.tab" into table rn3.pbAaDistQ; load data local infile "pbAaDistK.tab" into table rn3.pbAaDistK; load data local infile "pbAaDistR.tab" into table rn3.pbAaDistR; load data local infile "pbAaDistT.tab" into table rn3.pbAaDistT; load data local infile "pbAaDistV.tab" into table rn3.pbAaDistV; load data local infile "pbAaDistP.tab" into table rn3.pbAaDistP; load data local infile "pbAaDistG.tab" into table rn3.pbAaDistG; load data local infile "pbAaDistE.tab" into table rn3.pbAaDistE; load data local infile "pbAaDistA.tab" into table rn3.pbAaDistA; load data local infile "pbAaDistL.tab" into table rn3.pbAaDistL; load data local infile "pbAaDistS.tab" into table rn3.pbAaDistS; o Create pbAnomLimit and pbResAvgStd tables hgsql rn3 < ~/src/hg/lib/pbAnomLimit.sql hgsql rn3 < ~/src/hg/lib/pbResAvgStd.sql hgsql rn3 -e 'load data local infile "pbResAvgStd.tab" into table rn3.pbResAvgStd;' hgsql rn3 -e 'load data local infile "pbAnomLimit.tab" into table rn3.pbAnomLimit;' o Download data files from Ensembl and create ensGeneXref and ensTranscript tables mkdir /cluster/bluearc/fan/ensembl/19_3bR cd /cluster/bluearc/fan/ensembl/19_3bR wget --timestamping \ "ftp://ftp.ensembl.org/pub/rat-19.3b/data/mysql/rattus_norvegicus_lite_19_3b/gene_xref.txt.table.gz" wget 
--timestamping \ "ftp://ftp.ensembl.org/pub/rat-19.3b/data/mysql/rattus_norvegicus_lite_19_3b/transcript.txt.table.gz" wget --timestamping \ "ftp://ftp.ensembl.org/pub/rat-19.3b/data/mysql/rattus_norvegicus_lite_19_3b/rattus_norvegicus_lite_19_3b.sql.gz" gzip -d *.gz If rn3.ensGeneXref table already exists alter table rn3.ensGeneXref rename as kgRn3Sav.ensGeneXref; ENSEMBL IS KNOWN FOR CHANGING THEIR TABLE DEFINITION AND CONTENT!!! So, first check if the table definition of gene_xref in rattus_norvegicus_lite_19_3b.sql is the same as ~/src/hg/lib/ensGeneXref.sql. If they are not the same, some programming change to hgTracks.c and possibly other programs may be needed. If they are the same, create ensGeneXref table in rn3 by: hgsql rn3 < ~/src/hg/lib/ensGeneXref.sql hgsql rn3 -e 'load data local infile "gene_xref.txt.table" into table rn3.ensGeneXref;' If rn3.ensTranscript exists alter table rn3.ensTranscript rename as kgRn3Sav.ensTranscript; Again, check if the table definition of transcript in rattus_norvegicus_lite_19_3b.sql.gz is the same as ~/src/hg/lib/ensTranscript.sql. If they are not the same, some programming change to hgTracks.c and possibly other programs may be needed. If they are the same, create ensTranscript table in rn3 by: hgsql rn3 < ~/src/hg/lib/ensTranscript.sql hgsql rn3 -e 'load data local infile "transcript.txt.table" into table rn3.ensTranscript;' o Download Superfamily data files and build the Superfamily DB Make sure an index on id is created for the des table of superfam040321. Skip this step if it is already done. 
mkdir /cluster/store1/superFamily/040321 cd /cluster/store1/superFamily/040321 ftp over the following two files: ass_28-Mar-2004.tab.gz supfam_21-Mar-2004.sql.gz hgsql superfam040321 < ~/src/hg/lib/sfAssign.sql hgsql superfam040321 -e \ 'load data local infile "ass_28-Mar-2004.tab" into table superfam040321.sfAssign;' o Load sfAssign and sfDes tables into rn3, they are needed by PB and Superfamily track hgsql rn3 < ~/src/hg/lib/sfAssign.sql hgsql rn3 -e 'load data local infile "ass_28-Mar-2004.tab" into table rn3.sfAssign;' If rn3.sfDes already exists, alter table rn3.sfDes rename as kgRn3Sav.sfDes; hgsql superfam040321 -e "select * from des" >des.tab hgsql rn3 < ~/src/hg/lib/sfDes.sql hgsql rn3 -e 'load data local infile "des.tab" into table rn3.sfDes ignore 1 lines' o Build or rebuild Superfamily track (Superfamily track needs Ensembl Genes track (ensGene table) to be built first) If rn3.superfamily already exists alter table rn3.superfamily rename as kgRn3Sav.superfamily; Go to rn3 build working directory. hgSuperfam rn3 > sf.log It is normal that many proteins do not have corresponding Superfamily entries. If rn3.sfDescription exists, alter table rn3.sfDescription rename as kgRn3Sav.sfDescription; hgsql rn3 < ~/src/hg/lib/sfDescription.sql hgsql rn3 -e 'LOAD DATA local INFILE "sfDescription.tab" into table rn3.sfDescription;' Finally, load the superfamily table. hgLoadBed rn3 superfamily superfamily.tab -tab o Create pbStamp table for PB If rn3.pbStamp already exists, hgsql rn3 -e "alter table rn3.pbStamp rename as kgRn3Sav.pbStamp;" Create pbStamp table hgsql rn3 < ~/src/hg/lib/pbStamp.sql cd ~/kent/src/hg/protein/pbTracks hgsql rn3 -e 'load data local infile "pbStampRn3.tab" into table rn3.pbStamp;' o Enable Proteome Browser if ~/kent/src/hg/hgGene/hgGeneData/Rat/links.ra does not exist: cd ~/kent/src/hg/hgGene/hgGeneData/Rat mkdir rn3 cd rn3 Create links.ra with the following content ----------------------------- # rn3 link info. 
name protBrowser shortLabel Proteome Browser tables kgXref idSql select spDisplayID from kgXref where kgID = '%s' nameSql select spId from kgXref where kgID = '%s' url ../cgi-bin/pbTracks?proteinID=%s hgsid on dbInUrl on priority 12 ----------------------------- cd ../.. make o Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) by updating the pbStampMouse.tab file and then reload the pbStamp table. o Perform preliminary review of Proteome Browser for rn3 o Notify QA for formal review, when all PB build steps done. ####### BUILD/REBUILD RGD TRACKS (DONE 5/26/04 Fan) ############## # download data files from RGD wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_EST.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_QTL.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_QTL.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_RH3.4_Markers.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_SSLP.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_curated_genes.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/chr_coordinates.gff # create rgdGeneLink.tab echo "Processing RGD_curated_genes.gff ..." 
awk '{print $12"\t"$10"\t"$14}' RGD_curated_genes.gff | sed -e 's/"//g' | sed -e 's/RGD://g'|sed -e 's/;//g' |sort -u > rgdGeneLink.tab # load rgdGeneLink table hgsql rn3 -e "drop table rn3.rgdGeneLink" hgsql rn3 < ~/kent/src/hg/lib/rgdGeneLink.sql hgsql rn3 -e 'load data local infile "rgdGeneLink.tab" into table rn3.rgdGeneLink;' # create rgdGene.tab hgsql rn3 -e "select refGene.name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene,rgdGeneLink where refGene.name=rgdGeneLink.refSeq;" >rgdGene.tab # load rgdGene table hgsql rn3 -e "drop table rn3.rgdGene" hgsql rn3 < ~/kent/src/hg/lib/rgdGene.sql hgsql rn3 -e 'load data local infile "rgdGene.tab" into table rn3.rgdGene ignore 1 lines;' # check rgdGene table echo "Performing checkTableCoords rn3 rgdGene ..." checkTableCoords rn3 rgdGene # create rgdEstLink.tab echo "Processing RGD_EST.gff ..." awk '{print $12"\t"$10"\t"$14}' RGD_EST.gff | sed -e 's/"//g' | sed -e 's/RGD://g'|sed -e 's/;//g' |sort -u > rgdEstLink.tab # load rgdEstLink table hgsql rn3 -e "drop table rn3.rgdEstLink" hgsql rn3 < ~/kent/src/hg/lib/rgdEstLink.sql hgsql rn3 -e 'load data local infile "rgdEstLink.tab" into table rn3.rgdEstLink;' # create rgdEst.tab hgsql rn3 -e "select all_est.* from all_est,rgdEstLink where all_est.qName=rgdEstLink.name;" >rgdEst.tab # load rgdEst table hgsql rn3 -e "drop table rn3.rgdEst" hgsql rn3 < ~/kent/src/hg/lib/rgdEst.sql hgsql rn3 -e 'load data local infile "rgdEst.tab" into table rn3.rgdEst ignore 1 lines;' # check rgdEst table echo "Performing checkTableCoords rn3 rgdEst ..." checkTableCoords rn3 rgdEst # create rgdQtl.tab echo "Processing RGD_QTL.gff ..." 
awk '{print $1"\t"$4-1"\t"$5"\t"$10}' RGD_QTL.gff |sed -e 's/Chr/chr/g'| sed -e 's/"//g' | sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab # create rgdQtlLink.tab awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' RGD_QTL.gff | sed -e 's/"//g' | sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtlLink.tab # load rgdQtl table hgLoadBed rn3 rgdQtl rgdQtl.tab # check rgdQtl table echo "Performing checkTableCoords rn3 rgdQtl ..." checkTableCoords rn3 rgdQtl # load rgdQtlLink table hgsql rn3 -e "drop table rn3.rgdQtlLink;" hgsql rn3 <~/kent/src/hg/lib/rgdQtlLink.sql hgsql rn3 -e 'load data local infile "rgdQtlLink.tab" into table rn3.rgdQtlLink;' # create rgdSslp.tab echo "Processing RGD_SSLP.gff ..." awk '{print $1"\t"$4-1"\t"$5"\t"$10}' RGD_SSLP.gff|sed -e 's/Chr/chr/g'| sed -e 's/;//g'|sed -e 's/"//g'|sed -e 's/RGD://g' >rgdSslp.tab # create rgdSslpLink.tab awk '{print $12"\t"$10}' RGD_SSLP.gff|sed -e 's/"//g'|sed -e 's/RGD://g'|sed -e 's/;//g' >rgdSslpLink.tab # load rgdSslp table hgLoadBed rn3 rgdSslp rgdSslp.tab # check rgdSslp table echo "Performing checkTableCoords rn3 rgdSslp ..." checkTableCoords rn3 rgdSslp # load rgdSslpLink table hgsql rn3 -e "drop table rn3.rgdSslpLink" hgsql rn3 < ~/kent/src/hg/lib/rgdSslpLink.sql hgsql rn3 -e 'load data local infile "rgdSslpLink.tab" into table rn3.rgdSslpLink;' # BLASTZ HUMAN HG17 (DONE - 2004-06-14 - Hiram) ssh kk mkdir /cluster/data/rn3/bed/blastz.hg17.2004-06-13 cd /cluster/data/rn3/bed ln -s blastz.hg17.2004-06-13 blastz.hg17 cd blastz.hg17 cat << '_EOF_' > DEF # rat vs. 
human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Rat SEQ1_DIR=/iscratch/i/rn3/bothMaskedNibs # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Human SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/rn3/bed/blastz.hg17 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # prepare first cluster run ssh kk cd /cluster/data/rn3/bed/blastz.hg17 source DEF # This is a generic script and works for any assembly /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # Completed: 42744 of 42744 jobs # CPU time in finished jobs: 15578956s 259649.27m 4327.49h 180.31d 0.494 y # IO & Wait Time: 924985s 15416.41m 256.94h 10.71d 0.029 y # Average job time: 386s 6.44m 0.11h 0.00d # Longest job: 4140s 69.00m 1.15h 0.05d # Submission to last job: 19077s 317.95m 5.30h 0.22d # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kki cd /cluster/data/rn3/bed/blastz.hg17 source DEF # This is a generic script and works for any assembly /cluster/data/hg17/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 312 of 312 jobs # CPU time in finished jobs: 1902s 31.70m 0.53h 0.02d 0.000 y # IO & Wait Time: 4320s 72.00m 1.20h 0.05d 0.000 y # Average job time: 20s 0.33m 0.01h 0.00d # Longest job: 85s 1.42m 0.02h 0.00d # Submission to last job: 453s 7.55m 0.13h 0.01d # Third cluster run to convert lav's to axt's source DEF cd /cluster/data/rn3/bed/blastz.hg17 # This is a generic script and works for any assembly /cluster/data/hg17/jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # Completed: 44 of 44 jobs # CPU time in finished jobs: 427s 7.11m 0.12h 0.00d 0.000 y # IO & Wait Time: 4707s 78.45m 1.31h 0.05d 0.000 y # Average job time: 117s 1.94m 0.03h 0.00d # Longest job: 536s 8.93m 0.15h 0.01d # Submission to last job: 536s 8.93m 0.15h 0.01d # translate sorted axt files into psl ssh eieio cd /cluster/data/rn3/bed/blastz.hg17 mkdir pslChrom set tbl = "blastzHg17" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 30 minutes # Load database tables ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/pslChrom # this is a 55 minute job for I in *.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 ${I} echo "done: ${I}" done # CHAIN HG17 BLASTZ (DONE - 2004-06-14 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster ssh kki mkdir -p /cluster/data/rn3/bed/blastz.hg17/axtChain/run1 cd /cluster/data/rn3/bed/blastz.hg17/axtChain/run1 mkdir out chain ls -1S /cluster/data/rn3/bed/blastz.hg17/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ_random $1 | axtChain stdin \ /iscratch/i/rn3/bothMaskedNibs \ /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod +x doChain # 44 jobs gensub2 
input.lst single gsub jobList para create jobList para try para push # ... etc ... # Completed: 44 of 44 jobs # CPU time in finished jobs: 5395s 89.91m 1.50h 0.06d 0.000 y # IO & Wait Time: 2637s 43.96m 0.73h 0.03d 0.000 y # Average job time: 183s 3.04m 0.05h 0.00d # Longest job: 1032s 17.20m 0.29h 0.01d # Submission to last job: 1032s 17.20m 0.29h 0.01d # now on the file server, sort chains ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 5m45.285s # user 4m51.730s # sys 0m34.290s time chainSplit chain all.chain # real 5m34.606s # user 4m47.860s # sys 0m26.010s # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain rn3 ${c}_chainHg17 $i echo done $c end # real 26m9.947s # user 14m36.240s # sys 1m54.300s # NET HG17 (WORKING - 2004-06-15 - Hiram) ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/rn3/chrom.sizes \ /cluster/data/hg17/chrom.sizes ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/rn3/chrom.sizes \ /cluster/data/hg17/chrom.sizes ../n1/$n /dev/null end cd .. 
cat n1/*.net | time /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 1499635712, utime 11819 s/100, stime 1808 ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtChain time netClass hNoClass.net rn3 hg17 human.net \ -tNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInRat # real 13m41.971s # user 7m30.930s # sys 1m53.450s # If things look good do ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn human.net > humanSyn.net # real 6m0.129s # user 5m1.590s # sys 0m30.010s # Load the nets into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtChain netFilter -minGap=10 human.net | hgLoadNet rn3 netHg17 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet rn3 syntenyNetHg17 stdin # real 17m7.948s # user 9m39.800s # sys 1m16.990s # check results # featureBits -countGaps rn3 netHg17 # 2633759972 bases of 2835152425 (92.897%) in intersection # featureBits -countGaps rn3 netHg15 # 2628441031 bases of 2835152425 (92.709%) in intersection # featureBits rn3 syntenyNetHg17 # featureBits -countGaps rn3 syntenyNetHg17 # 2581754861 bases of 2835152425 (91.062%) in intersection # featureBits -countGaps rn3 hg15SynNet # 2578335860 bases of 2835152425 (90.942%) in intersection # Add entries for net and chain to rat/rn3 trackDb # make net ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain mkdir humanNet time netSplit human.net humanNet # real 6m33.389s # user 5m7.570s # sys 0m52.390s mkdir ../axtNet foreach n (humanNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt humanNet/$c.net chain/$c.chain \ /cluster/data/rn3/nib \ /cluster/data/hg17/nib \ ../axtNet/$c.axt echo "Complete: $c.net -> axtNet/$c.axt" end ssh hgwdev mkdir -p /cluster/data/rn3/bed/blastz.hg17/axtBest cd /cluster/data/rn3/bed/blastz.hg17/axtBest ln -s ../axtNet/chr*.axt . 
# NOTE: some of these axt files were corrupted (chr4, 8, and X), # (I noticed this while generating maf's from them) # Remade below (kate) # copy net axt's to download area ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtNet cd /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtNet gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # Convert those axt files to psl ssh eieio cd /cluster/data/rn3/bed/blastz.hg17 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestHg17.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestHg17.psl echo "Done: ${c}_blastzBestHg17.psl" end # Load tables ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/pslBest for I in chr*BestHg17.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 ${I} echo "done ${I}" done # check results # featureBits -countGaps rn3 blastzBestHg17 # 978297270 bases of 2835152425 (34.506%) in intersection # featureBits -countGaps rn3 blastzBestHg15 # 998080478 bases of 2835152425 (35.204%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn3/axtBest/Hg17 cd /gbdb/rn3/axtBest/Hg17 ln -s /cluster/data/rn3/bed/blastz.hg17/axtNet/chr*.axt . cd /cluster/data/rn3/bed/blastz.hg17/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/rn3/axtBest/Hg17/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg17','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql rn3 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. 
hgsql rn3 < axtInfoInserts.sql # ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE May 13, 2005, Heather) ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtChain gzip all.chain cp -p all.chain.gz /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/human.chain.gz gzip human.net cp -p human.net.gz /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/human.net.gz cd /usr/local/apache/htdocs/goldenPath/rn3/vsHg17 md5sum *.gz */*.gz > md5sum.txt # Update the README.txt # MAKING HUMAN SYNTENY (DONE - 2004-06-16 - Hiram) ssh hgwdev mkdir /cluster/data/rn3/bed/syntenyHg17 cd /cluster/data/rn3/bed/syntenyHg17 # Copy all the needed scripts from /cluster/data/hg16/bed/syntenyMm3 cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl . cp -p /cluster/data/hg17/bed/syntenyRn3/*.sh . ./syntenicBest.pl -db=rn3 -table=blastzBestHg17 ./smooth.pl ./joinsmallgaps.pl ./fillgap.pl -db=g17 -table=blastzBestHg17 ./synteny2bed.pl # The five commands above # real 185m3.399s # user 0m18.620s # sys 0m3.770s # Used to load this in syntenyHg17, but that type is misleading to # the table browser and fails the checkTableCoords check. 
# Better to use this ensRatMusHom type: sed -e 's/ensPhusionBlast/ensRatMusHom/g' \ $HOME/kent/src/hg/lib/ensPhusionBlast.sql \ > ensRatMusHom.sql hgLoadBed rn3 ensRatMusHom ucsc100k.bed -sqlTable=ensRatMusHom.sql # MAKING HUMAN AXTTIGHT FROM AXTBEST (WORKING - 2004-06-15 - Hiram) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtNet mkdir -p ../axtTight foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHg17.psl echo "Done: $i" end # Load tables into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/pslTight for I in chr*TightHg17.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 ${I} echo "done ${I}" done # copy axt's to download area ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtTight mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtTight cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtTight cd /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtTight gzip *.axt XXX - README file not done yet - 2004-06-15 # add README.txt file to dir # HUMAN HG17 BLASTZ NET MAF FILES (WORKING 2004-09-21 kate) # NOTE: remade axt's on chr4, chrX and chr8, as they contained some garbage # probably due to file corruption on store3, as copies in download # directory are fine ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain foreach f (humanNet/chr{4,8,X}.net) set c = $f:t:r echo "axt of $c" netToAxt $f chain/$c.chain /cluster/data/rn3/nib \ /cluster/data/hg17/nib ../axtNet/$c.axt end cat > makeMaf.csh << 'EOF' foreach f (humanNet/chr*.net) set c = $f:t:r echo "$c" axtSort ../axtNet/$c.axt stdout | axtToMaf stdin \ /cluster/data/rn3/chrom.sizes /cluster/data/hg17/chrom.sizes \ ../mafNet/$c.maf -tPrefix=rn3. -qPrefix=hg17. end 'EOF' # << emacs csh makeMaf.csh >&! 
makeMaf.log & tail -100f makeMaf.log # ANDY LAW CPGISSLANDS (DONE 6/15/04 angie) # See notes about this in makeGalGal2.doc. ssh eieio mkdir /cluster/data/rn3/bed/cpgIslandGgfAndy cd /cluster/data/rn3/bed/cpgIslandGgfAndy cp /dev/null cpgIslandAndy.bed cp /dev/null cpgIslandGgfAndy.bed foreach f (../../*/chr*.fa) set chr = $f:t:r echo preproc $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc echo running original on $chr awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndy.bed echo running modified on $chr /cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndy.bed end # load into database: ssh hgwdev cd /cluster/data/rn3/bed/cpgIslandGgfAndy # this one is a bed 4: hgLoadBed rn3 cpgIAndy -tab -noBin cpgIslandAndy.bed # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql hgLoadBed rn3 cpgIslandGgfAndy -tab -noBin \ -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed # WOW, even masking out repeat bases from the results, there's a huge # increase in reported islands!! 
featureBits rn3 cpgIsland #9679881 bases of 2571104688 (0.376%) in intersection featureBits rn3 cpgIslandGgfAndy #70368349 bases of 2571104688 (2.737%) in intersection featureBits rn3 cpgIslandGgfAndy \!rmsk #51841401 bases of 2571104688 (2.016%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 15832 ../cpgIsland/cpgIsland.bed # 206593 cpgIslandAndy.bed # 152003 cpgIslandGgfAndy.bed # LaDeana asked for a run on masked seq for comparison. cp /dev/null cpgIslandAndyMasked.bed cp /dev/null cpgIslandGgfAndyMasked.bed foreach f (../../?{,?}/chr*.fa.masked) set chr = $f:t:r:r echo preproc masked $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc echo running original on $chr masked awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndyMasked.bed echo running modified on $chr masked /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # load into database: ssh hgwdev cd /cluster/data/rn3/bed/cpgIslandGgfAndy hgLoadBed rn3 cpgIAndyMasked -tab -noBin cpgIslandAndyMasked.bed sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed rn3 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed featureBits rn3 cpgIAndyMasked #62121817 bases of 2571104688 (2.416%) in intersection featureBits rn3 cpgIslandGgfAndyMasked #43903454 bases of 2571104688 (1.708%) in intersection wc -l cpgIslandAndyMasked.bed cpgIslandGgfAndyMasked.bed # 124457 cpgIslandAndyMasked.bed # 91015 cpgIslandGgfAndyMasked.bed # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's, # might not be the right thing to do (?). Give it a try with less-masked # sequence. ssh eieio cd /cluster/data/rn3/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyOnlyRM.bed cp /dev/null cpgIslandGgfAndyOnlyRMAlu.bed foreach f (../../?{,?}/chr*.fa) set chr = $f:t:r echo preproc, ggf-andy $chr onlyRM maskOutFa $f $f.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRM.bed echo preproc, ggf-andy $chr onlyRMAlu head -3 $f.out > /tmp/tmp2.fa.out awk '$11 == "SINE/Alu" {print;}' $f.out >> /tmp/tmp2.fa.out maskOutFa $f /tmp/tmp2.fa.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRMAlu.bed end # 91272 cpgIslandGgfAndyOnlyRM.bed # 149225 cpgIslandGgfAndyOnlyRMAlu.bed ssh hgwdev cd /cluster/data/rn3/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed rn3 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRM.bed sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRMAlu/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed rn3 cpgIslandGgfAndyOnlyRMAlu -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRMAlu.bed featureBits rn3 cpgIslandGgfAndyOnlyRM.bed #44021003 bases of 2571104688 (1.712%) in intersection featureBits rn3 cpgIslandGgfAndyOnlyRMAlu.bed #68483127 bases of 2571104688 (2.664%) in intersection # 1/26/05: Make better island names in cpgIslandGgfAndyMasked, # for Dave Burt's cross-species island comparisons. ssh eieio cd /cluster/data/rn3/bed/cpgIslandGgfAndy mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig perl -wpe '@w=split("\t"); $w[3] = "rn3.$w[0]." . ($w[1]+1) . ".$w[2]"; \ $_ = join("\t", @w);' \ cpgIslandGgfAndyMasked.bed.orig \ > cpgIslandGgfAndyMasked.bed ssh hgwdev cd /cluster/data/rn3/bed/cpgIslandGgfAndy hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \ rn3 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed ######## GENE SORTER BUILD ####### (DONE - 2004-06-01 - Fan) # This update is to complement the KG update using SWISS-PROT data release of 3/15/04. # These are instructions for updating the # neighborhood browser. Don't start these until # there is a knownGene track and the affy tracks # Cluster together various alt-splicing isoforms. 
# Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/rn3/bed/famBro.2004-04-05 rm /cluster/data/rn3/bed/famBro ln -s /cluster/data/rn3/bed/famBro.2004-04-05 /cluster/data/rn3/bed/famBro cd /cluster/data/rn3/bed/famBro hgClusterGenes rn3 knownGene knownIsoforms knownCanonical # Got 25004 clusters, from 41693 genes in 44 chromosomes # row count on knownIsoforms is 41669 # row count on knownCanonical is 25004 # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/rn3/bed/famBro/blastp cd /cluster/data/rn3/bed/famBro/blastp pepPredToFa rn3 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa # # expect to find /scratch/blast loaded with all required blast files /scratch/blast/formatdb -i known.faa -t known -n known # Copy over database to bluearc rm -fr /cluster/bluearc/rn3/blastp mkdir /cluster/bluearc/rn3 mkdir /cluster/bluearc/rn3/blastp cp -p /cluster/data/rn3/bed/famBro/blastp/known.* /cluster/bluearc/rn3/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/rn3/bed/famBro/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/rn3/bed/famBro/blastp/self cd /cluster/data/rn3/bed/famBro/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \ -d /cluster/bluearc/rn3/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << emacs chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << emacs # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # This should finish in ~5 minutes # Completed: 7738 of 7738 jobs # CPU time in 
finished jobs: 108579s 1809.65m 30.16h 1.26d 0.003 y # IO & Wait Time: 76232s 1270.54m 21.18h 0.88d 0.002 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 188s 3.13m 0.05h 0.00d # Submission to last job: 254s 4.23m 0.07h 0.00d # Load into database. This takes about 15 minutes. ssh hgwdev cd /cluster/data/rn3/bed/famBro/blastp/self/run/out time hgLoadBlastTab rn3 knownBlastTab *.tab # Scanning through 7738 files # Loading database with 8019510 rows # 184.360u 28.660s 17:01.40 20.8% 0+0k 0+0io 202pf+0w # Create table that maps between known genes and RefSeq hgMapToGene rn3 refGene knownGene knownToRefSeq # row count goes from 27744 to 30116 # Part 2, started 5/25/04 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene rn3 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' hgExpDistance rn3 hgFixed.gnfRatAtlas2MedianRatio \ hgFixed.gnfRatAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. GO database, go, downloaded and updated. # Create knownToEnsembl column cd /cluster/data/rn3/bed/famBro hgMapToGene rn3 ensGene knownGene knownToEnsembl # row count went to 34357 # Make knownToCdsSnp column. hgMapToGene rn3 snpMap knownGene knownToCdsSnp -all -cds # row count went to 12061 # CREATING KNOWNTOSUPER (which enables Superfamily stuff in hgNear/hgGene) (DONE 6/1/04 Fan.) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev cat /cluster/store1/superFamily/040516/ass_16-May-2004.tab| hgKnownToSuper rn3 rn stdin # 5984 records output # CREATE KNOWNTOLOCUSLINK (DONE 6/1/04 Fan.) 
# that maps between known genes and LocusLink hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" rn3 \ > refToLl.txt hgMapToGene rn3 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 6084 # MAP ENCODE ORTHOLOG REGIONS (2004-07-09 kate) ssh eieio cd /cluster/data/rn3/bed mkdir encodeRegions cd encodeRegions # Of 44 regions: # -minMatch=.20 -> 36 mapped liftOver -minMatch=.20 \ /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \ /cluster/data/hg16/bed/liftOver/hg16Torn3.chain \ encodeRegions.bed encodeRegions.unmapped ssh hgwdev cd /cluster/data/rn3/bed/encodeRegions hgLoadBed rn3 encodeRegions encodeRegions.bed -noBin ssh eieio liftOver -minMatch=.20 \ /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \ /cluster/data/hg16/bed/liftOver/hg16Torn3.chain \ encodeRegions.bed encodeRegions.unmapped ssh hgwdev cd /cluster/data/rn3/bed/encodeRegions hgLoadBed rn3 encodeRegions encodeRegions.bed -noBin # ACCESSIONS FOR CONTIGS (DONE 2004-08-24 kate) # Used for Assembly track details, and also ENCODE AGP's cd /cluster/data/rn3/bed mkdir -p contigAcc cd contigAcc # extract WGS contig to accession mapping from Entrez Nucleotide summaries # To get the summary file, access the Genbank page for the project # by searching: # genus[ORGN] AND WGS[KYWD] # At the end of the page, click on the list of accessions for the contigs. # Select summary format, and send to file. 
# output to file .acc contigAccession rat.acc > contigAcc.tab wc -l contigAcc.tab # 137910 contigAcc.tab # extract all records from chrN_gold, using table browser # (includes header line) wc -l rat* # 137911 ratcontigs.tab hgsql rn3 < $HOME/kent/src/hg/lib/contigAcc.sql echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \ hgsql rn3 hgsql rn3 -e "SELECT COUNT(*) FROM contigAcc" # 137910 # SWAP BLASTZ MOUSE-RAT TO RAT-MOUSE (MM5) (DONE 9/1/04 Fan) ssh eieio mkdir /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01 ln -s blastz.mm5.swap.2004-09-01 /cluster/data/rn3/bed/blastz.mm5 cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01 set aliDir = /cluster/data/mm5/bed/blastz.rn3 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end # This took about 2 hours. du -sh $aliDir/axtChrom unsorted axtChrom rm -r unsorted # CHAIN MOUSE BLASTZ (DONE 9/1/04 Fan) # Run axtChain on little cluster ssh kki cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/rn3/bothMaskedNibs \ /iscratch/i/mus/mm5/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
# Completed: 44 of 44 jobs # CPU time in finished jobs: 3611s 60.18m 1.00h 0.04d 0.000 y # IO & Wait Time: 1090s 18.17m 0.30h 0.01d 0.000 y # Average job time: 107s 1.78m 0.03h 0.00d # Longest job: 301s 5.02m 0.08h 0.00d # Submission to last job: 484s 8.07m 0.13h 0.01d # now on the cluster server, sort chains ssh eieio cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain limit vmemoryuse 5000000 chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Lots of chaff with scores in the 3000's. Many very-high-scoring # chains. So filter the chain down somewhat... mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain/chain foreach i (*.chain) set c = $i:r nice hgLoadChain rn3 ${c}_chainMm5 $i end # Similar to human-mouse coverage -- not nearly as high as rat-human. featureBits -chrom=chr1 rn3 chainMm5Link # 158815202 bases of 242966425 (65.365%) in intersection featureBits -chrom=chr2 rn3 chainMm5Link # 145927900 bases of 235737501 (61.903%) in intersection # NET MOUSE BLASTZ (DONE 9/1/04 Fan) ssh eieio cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len rat.net mouse.net limit vmemoryuse 8000000 netSyntenic rat.net hNoClass.net # memory usage 1499435008, utime 11055 s/100, stime 1385 # The netClass operations requires an "ancientRepeat" table to exist # in either rn3 or mm5. 
So, create the table: ssh hgwdev mkdir -p /cluster/data/rn3/bed/ancientRepeat cd /cluster/data/rn3/bed/ancientRepeat # mysqldump needs write permission to this directory # and you need to use your read/write enabled user with password chmod 777 . hgsqldump --all --tab=. mm4 ancientRepeat chmod 775 . hgsql rn3 < ancientRepeat.sql hgsql mm5 -e 'load data local infile "ancientRepeat.txt" into table ancientRepeat' ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain time netClass hNoClass.net rn3 mm5 mouse.net \ -tNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat # 386.820u 84.400s 20:15.00 38.7% 0+0k 0+0io 31184pf+0w # Make a 'syntenic' subset: ssh eieio cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain rm hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn mouse.net > mouseSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain nice netFilter -minGap=10 mouse.net | hgLoadNet rn3 netMm5 stdin nice netFilter -minGap=10 mouseSyn.net | hgLoadNet rn3 netSyntenyMm5 stdin # Add entries for chainMm5, netMm5, syntenyMm5 to rat/rn3 trackDb # GENERATE MM5 MAF FOR MULTIZ FROM NET (DONE 9/1/04 Fan) ssh eieio cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain netSplit mouse.net net cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/rn3/nib \ /cluster/data/mm5/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/rn3/chrom.sizes /cluster/data/mm5/chrom.sizes \ $maf -tPrefix=rn3. -qPrefix=mm5. 
end # MAKE VSMM5 DOWNLOADABLES (DONE 9/2/04 Fan) ssh eieio cd /cluster/data/rn3/bed/blastz.mm5/axtChain ln all.chain mouse.chain foreach f (mouse.chain mouse.net) gzip -c $f > $f.gz end rm mouse.chain # Make chain-format of raw alignments cd /cluster/data/rn3/bed/blastz.mm5 mkdir blastzECF foreach f (axtChrom/chr*.axt) set chr = $f:t:r axtToChain $f S1.len S2.len stdout \ | gzip -c - > blastzECF/$chr.ecf.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/rn3/vsMm5 cd /usr/local/apache/htdocs/goldenPath/rn3/vsMm5 mv /cluster/data/rn3/bed/blastz.mm5/axtChain/mouse*.gz . md5sum *.gz > md5sum.txt cd /cluster/data/rn3/bed/blastz.mm5/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsmm5/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsmm5/axtNet cd /usr/local/apache/htdocs/goldenPath/rn3/vsmm5/axtNet md5sum * > md5sum.txt gzip *.axt # Copy over & edit README.txt w/pointers to chain, net formats. # Not for pushing -- handle separately. mv /cluster/data/rn3/bed/blastz.mm5/blastzECF . 
cd blastzECF md5sum *.gz > md5sum.txt # CONSERVATION TRACK - MULTIZ AND PHASTCONS (WORKING 2004-09-21 kate) ssh eieio set multizDir = multiz.2004-09-21 set workingDir = /cluster/bluearc/rn3/$multizDir ln -s $workingDir /cluster/bluearc/rn3/multiz mkdir -p $workingDir mkdir -p /cluster/data/rn3/bed/$multizDir cd /cluster/data/rn3/bed ln -s multiz.2004-09-21 multiz cd multiz cut -f 1 /cluster/data/rn3/chrom.sizes > chrom.lst # copy mafs to bluearc -- mouse set workingDir = /cluster/bluearc/rn3/$multizDir mkdir $workingDir/mm5 cp /cluster/data/rn3/bed/blastz.mm5/mafNet/chr*.maf $workingDir/mm5 # copy mafs to bluearc -- human mkdir $workingDir/hg17 cp /cluster/data/rn3/bed/blastz.hg17/mafNet/chr*.maf $workingDir/hg17 # multiz - add in human to mouse/rat # ssh kki set multizDir = multiz.2004-09-21 set workingDir = /cluster/bluearc/rn3/$multizDir cd /cluster/data/rn3/bed/$multizDir # wrapper script for multiz # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) cat << 'EOF' > doMultiz.csh #!/bin/csh -fe mkdir -p $3:h /cluster/bin/penn/multiz $1 $2 - > $3 'EOF' # << for emacs cat << 'EOF' > gsub #LOOP ../doMultiz.csh {check in line /cluster/bluearc/rn3/multiz.2004-09-21/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/rn3/multiz.2004-09-21/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/rn3/multiz.2004-09-21/$(root1)$(dir1)/$(root2).maf} #ENDLOOP 'EOF' # << for emacs chmod +x doMultiz.csh # multiz - add in human to mouse/rat mkdir run.hg17 cd run.hg17 echo "hg17/mm5" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList para create jobList # 44 jobs para try, check, push, check cd .. 
# copy 3-way mafs to build directory ssh eieio set multizDir = multiz.2004-09-21 set workingDir = /cluster/bluearc/rn3/$multizDir ln -s $workingDir /cluster/bluearc/rn3/multiz ln -s $workingDir/mm5hg17 $workingDir/maf cd /cluster/data/rn3/bed/multiz.2004-09-21 mkdir maf cp $workingDir/maf/*.maf maf # CREATE kgSpAlias TABLE FOR PB (Done 10/20/04) hgsql rn3 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql rn3 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >rn3.kgSpAlias.tab rm j.tmp hgsql rn3 -e 'drop table kgSpAlias'; hgsql rn3 < ~/src/hg/lib/kgSpAlias.sql hgsql rn3 -e 'load data local infile "rn3.kgSpAlias.tab" into table kgSpAlias' # ECGENE TRACK - RELOAD NEW DATA (DONE, 2004-10-29, hartera) ssh eieio rm -f /cluster/data/rn3/bed/ECgene mkdir -p /cluster/data/rn3/bed/ECgene.2004-10-29 ln -s /cluster/data/rn3/bed/ECgene.2004-10-29 \ /cluster/data/rn3/bed/ECgene cd /cluster/data/rn3/bed/ECgene wget \ "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_rn3_low_gene.txt.gz" wget \ "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_rn3_low_pep.txt.gz" gunzip *.gz # load database ssh hgwdev cd /cluster/data/rn3/bed/ECgene ldHgGene -predTab rn3 ECgene v1.2_rn3_low_gene.txt # 119846 gene predictions # added peptide table 11/15/04 hgPepPred rn3 tab ECgenePep v1.2_rn3_low_pep.txt rm *.tab nice gzip *.txt # ENABLE AUTO BUILD MGC GENES TRACK (DONE markd) - add the following lines to rn3 entry in /cluster/data/genbank/etc/genbank.conf rn3.mgcTables.default = full rn3.mgcTables.mgc = all - wait one day for nightly build to align and load them into the db - rebuild trackDb # Create rn3GeneList.html (to be used by Google). # This step was done 12/08/04. 
cd /cluster/data/rn3/bed mkdir geneList cd geneList wget -O rn3GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=rn3" cp -p rn3GeneList.html /usr/local/apache/htdocs/goldenPath # Check this html file into CVS. # UPDATE TIGR GENE INDEX (DONE 2004-12-13 Fan) cd /cluster/data/rn3/bed mv tigr tigr-2004-05020 mkdir tigr-2004-12-13 ln -s tigr-2004-12-13 tigr cd tigr wget --timestamp ftp://ftp.tigr.org/pub/data/tgi/Rattus_norvegicus/TGI_track_RatGenome_rn3_12-2004.tgz tar -xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) echo $o setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/rn3/bed/tigr hgsql rn3 -e "drop table tigrGeneIndex" hgsql rn3 < ~/kent/src/hg/lib/tigrGeneIndex.sql foreach f (*.gff) echo Processing $f ... /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC rn3 tigrGeneIndex $f hgsql rn3 -e "select count(*) from tigrGeneIndex" end # Total of 367787 entries created in tigrGeneIndex table. hgsql rn3 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql rn3 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords rn3 tigrGeneIndex gzip *.gff *TCs # UPDATE TIGR GENE INDEX (RE-DONE 2004-12-21 Fan) # This track is re-done due to an error (missing strand info) in the original files from TIGR. 
cd /cluster/data/rn3/bed mv tigr tigr_old_wrong mkdir tigr-2004-12-21 ln -s tigr-2004-12-21 tigr cd tigr wget --timestamp ftp://ftp.tigr.org/pub/data/tgi/Rattus_norvegicus/TGI_track_RatGenome_rn3_12-2004.tgz tar -xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) echo $o setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/rn3/bed/tigr hgsql rn3 -e "drop table tigrGeneIndex" hgsql rn3 < ~/kent/src/hg/lib/tigrGeneIndex.sql foreach f (*.gff) echo Processing $f ... /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC rn3 tigrGeneIndex $f hgsql rn3 -e "select count(*) from tigrGeneIndex" end # Total of 367787 entries created in tigrGeneIndex table. hgsql rn3 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql rn3 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords rn3 tigrGeneIndex gzip *.gff *TCs # Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/rat_image.jpg' -O rat_image.jpg # view that image in 'display' to determine crop edges, then: convert -crop 1536x1040+0+560 -quality 80 -sharpen 0 \ -normalize -gamma 1.7 rat_image.jpg rrn.jpg convert -geometry 300x300 -quality 80 rrn.jpg Rattus_norvegicus.jpg rm -f rrn.jpg rat_image.jpg cp -p Rattus_norvegicus.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # MAKE RN3-HG17 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/rn3/bed/blastz.hg17/axtChain mkdir -p /cluster/data/rn3/bed/bedOver netChainSubset $chainDir/human.net $chainDir/all.chain \ /cluster/data/rn3/bed/bedOver/rn3ToHg17.over.chain # MAKE RN3-MM5 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/rn3/bed/blastz.mm5/axtChain mkdir -p /cluster/data/rn3/bed/bedOver 
netChainSubset $chainDir/mouse.net $chainDir/all.chain \ /cluster/data/rn3/bed/bedOver/rn3ToMm5.over.chain # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 1/26/05 angie) # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific for human-chicken blastz. Do the same for rat. # Scripts expect *.out.spec filenames, so set that up: ssh kkr1u00 cd /cluster/data/rn3 mkdir /iscratch/i/rn3/linSpecRep.notInChicken foreach f (/cluster/data/rn3/?{,?}/chr*.fa.out) cp -p $f /iscratch/i/rn3/linSpecRep.notInChicken/$f:t:r:r.out.spec end iSync # Use these the next time we run human-chicken blastz. # UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan) # Add new rn3 protein display IDs to the alias table to support user search ssh hgwdev mkdir -p /cluster/data/rn3/bed/pb/newDisplayId cd /cluster/data/rn3/bed/pb/newDisplayId hgsql proteome -e 'select rn3.kgSpAlias.kgID, rn3.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, rn3.kgSpAlias where spOldNew.acc=rn3.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >rn3.tab # get rid of the header line at the end of the file vi rn3.tab hgsql rn3 -e 'load data local infile "rn3.tab" into table rn3.kgSpAlias' # UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan) # Add new rn3 protein display IDs to the alias table to support user search ssh hgwdev cd /cluster/data/rn3/bed/pb/newDisplayId hgsql proteome -e 'select rn3.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, rn3.kgSpAlias where spOldNew.acc=rn3.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >rn3.kgProtAlias.tab # get rid of the header line at the end of the file vi rn3.kgProtAlias.tab hgsql rn3 -e 'load data local infile "rn3.kgProtAlias.tab" into table rn3.kgProtAlias' ######################################################################### # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram) # CREATE FULL TEXT INDEX FOR KNOWN GENES 
(DONE 1/19/2006 JK) # This depends on the go and uniProt databases as well as # the kgAlias and kgProAlias tables. The hgKgGetText takes # about 5 minutes when the database is not too busy. The rest # is real quick. ssh hgwdev cd /cluster/data/rn3/bed/knownGene mkdir index cd index hgKgGetText rn3 knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/data/rn3/bed/knownGene/index/knownGene.ix /gbdb/rn3/knownGene.ix ln -s /cluster/data/rn3/bed/knownGene/index/knownGene.ixx /gbdb/rn3/knownGene.ixx # BUILD KNOWN GENE LIST FOR GOOGLE. DONE 6/27/05 Fan. cd /cluster/data/rn3/bed rm -rf knownGeneList/rn3 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/rn3 hgKnownGeneList rn3 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/rn3 mkdir -p /usr/local/apache/htdocs/knownGeneList/rn3 cp -Rfp knownGeneList/rn3/* /usr/local/apache/htdocs/knownGeneList/rn3 # Build kgReactome table for KG to Reactome xref. Done 6/28/05 Fan. # First, make sure the reactome DB is built. See makeHg17.doc for details. ssh hgwdev mkdir -p /cluster/data/rn3/bed/reactome cd /cluster/data/rn3/bed/reactome hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, rn3.kgXref where identifier=spID' >kgReactome.tab; hgsql rn3 -e 'drop table kgReactome' hgsql rn3 < ~/src/hg/lib/kgReactome.sql hgsql rn3 -e 'load data local infile "kgReactome.tab" into table kgReactome' # ADD LINK TO GENENETWORK (Done. 9/6/05 Fan). # Received the file, rat.RefSeqId, list of RefSeq IDs from GeneNetwork. # remove extra CR (or LF?) at end of the line. 
rmLf rat.RefSeqId >rn3.geneNetworkId.tab hgsql rn3 -e 'drop table geneNetworkId' hgsql rn3 < ~/src/hg/lib/geneNetworkId.sql hgsql rn3 -e 'load data local infile "rn3.geneNetworkId.tab" into table geneNetworkId' # EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 8/12/05 angie) ssh kolossus cd /panasas/store/rn3/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query rat -comp dog end # Now extract dog (extra column 1): cd .. mkdir linSpecRep.notInDog foreach f (rmsk/*.out_canis-familiaris) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractRepeats 1 $f > \ linSpecRep.notInDog/$base.out.spec end # Clean up. rm rmsk/*.out_canis* # BLASTZ/CHAIN/NET CANFAM2 (DONE 8/15/05 angie) ssh kkstore01 mkdir /cluster/data/rn3/bed/blastz.canFam2.2005-08-12 cd /cluster/data/rn3/bed/blastz.canFam2.2005-08-12 cat << '_EOF_' > DEF # rat vs. 
dog # TARGET: Rat SEQ1_DIR=/panasas/store/rn3/nib SEQ1_RMSK=/panasas/store/rn3/rmsk SEQ1_SMSK=/panasas/store/rn3/linSpecRep.notInDog SEQ1_LEN=/cluster/data/rn3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog SEQ2_DIR=/scratch/hg/canFam2/nib #SEQ2_DIR=/iscratch/i/canFam2/nib SEQ2_RMSK=/panasas/store/canFam2/rmsk SEQ2_SMSK=/panasas/store/canFam2/linSpecRep.notInRat SEQ2_LEN=/cluster/data/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/rn3/bed/blastz.canFam2.2005-08-12 '_EOF_' # << for emacs doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/blastzRn3CanFam2Out >& do.log & # Bummer -- small cluster can't see /scratch/hg/canFam2 so edit # DEF to use SEQ2_DIR=/iscratch/i/canFam2/nib, then continue: doBlastzChainNet.pl DEF -continue=chainRun \ >>& do.log & ln -s blastz.canFam2.2005-08-12 /cluster/data/rn3/bed/blastz.canFam2 # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch rn3 geneidPep 0404031400.00 sudo mytouch rn3 twinscanPep 0404021800.00 sudo mytouch rn3 ensGeneXref 0404301100.00 sudo mytouch rn3 ensGtp 0404301100.00 sudo mytouch rn3 ensPep 0404301100.00 sudo mytouch rn3 ensTranscript 0404301100.00 sudo mytouch rn3 superfamily 0404301300.00 # LIFTOVER TO RN4 (DONE 2/9/06 angie) ssh kkr1u00 $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \ rn4 /cluster/data/rn4/nib # Do what its output says to do next: ssh kk (or kk9) $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \ rn3 /iscratch/i/rn3/bothMaskedNibs rn4 /iscratch/i/rn4/split10k \ /scratch/hg/h/rat11.ooc # Do what its output says to do next: cd /cluster/data/rn3/bed/blat.rn4.2006-02-07/run para make spec para time #Completed: 1980 of 1980 jobs #CPU time in finished jobs: 4111620s 68526.99m 1142.12h 47.59d 0.130 y #IO & Wait Time: 32154s 535.91m 8.93h 0.37d 0.001 y #Average job time: 2093s 34.88m 0.58h 0.02d #Longest finished job: 24496s 408.27m 6.80h 0.28d #Submission to last job: 24496s 408.27m 6.80h 0.28d ssh eieio 
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh rn3 rn4 \ /iscratch/i/rn4/split10k # Do what its output says to do next: ssh kki $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \ rn3 /iscratch/i/rn3/bothMaskedNibs rn4 /cluster/bluearc/scratch/hg/rn4/nib # Do what its output says to do next: cd /cluster/data/rn3/bed/blat.rn4.2006-02-07/chainRun para make spec #Completed: 45 of 45 jobs #CPU time in finished jobs: 6512s 108.53m 1.81h 0.08d 0.000 y #IO & Wait Time: 3369s 56.15m 0.94h 0.04d 0.000 y #Average job time: 220s 3.66m 0.06h 0.00d #Longest finished job: 919s 15.32m 0.26h 0.01d #Submission to last job: 14449s 240.82m 4.01h 0.17d ssh eieio $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh rn3 rn4 # Do what its output says to do next: ssh hgwdev $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh rn3 rn4 # Hmmm.. the scripts should be updated to follow the new convention # for placement of files, but I didn't realize that until this was # done. We use bed/liftOver/ now instead of bed/bedOver; the files # should be compressed everywhere; and goldenPath should have a # link not a real file. 
This was sufficient for now, though: # on eieio (this should be worked into the previous scripts): rm /cluster/data/rn3/bed/bedOver/rn3ToRn4.over.chain nice gzip /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain # on hgwdev (this should be worked into -load): rm /gbdb/rn3/liftOver/rn3ToRn4.over.chain ln -s /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \ /gbdb/rn3/liftOver/ hgAddLiftOverChain rn3 rn4 hgcentraltest \ /gbdb/rn3/liftOver/rn3ToRn4.over.chain.gz \ -path=/gbdb/rn3/liftOver/rn3ToRn4.over.chain.gz ########################################################################## # NSCAN track - (2006-05-29 markd) cd /cluster/data/rn3/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv http://genes.cs.wustl.edu/jeltje/rn3/rn3.nscan.gtf gzip rn3.nscan.gtf wget -nv -r -np http://genes.cs.wustl.edu/jeltje/rn3/chr_ptx/ cat genes.cs.wustl.edu/jeltje/rn3/chr_ptx/*.fa >rn3.nscan.pep.fa gzip rn3.nscan.pep.fa rm -rf genes.cs.wustl.edu chmod a-w *.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt rn3 nscanGene rn3.nscan.gtf.gz # add .a suffix to match transcript id WRONG, see below: hgPepPred -suffix=.a rn3 generic nscanPep rn3.nscan.pep.fa.gz rm *.tab # update trackDb; need a rn3-specific page to describe informants rat/rn3/nscanGene.html (from CanFam1_hg17_nscan_10_18_2005/description.html ) rat/rn3/trackDb.ra # replace twinScan track with n-scan when pushed. 
drop table twinscan; drop table twinscanPep; # 2006-07-24 markd: reloaded without -suffix hgPepPred rn3 generic nscanPep rn3.nscan.pep.fa.gz ##################################################################### # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie) # File emailed from Xinwei She mkdir /cluster/data/rn3/bed/genomicSuperDups cd /cluster/data/rn3/bed/genomicSuperDups sed -e 's/\t_\t/\t-\t/' rn3_genomicSuperDup.tab \ | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \ | hgLoadBed rn3 genomicSuperDups stdin \ -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql ########################################################################## # GenBank gbMiscDiff table (markd 2007-01-10) # Supports `NCBI Clone Validation' section of mgcGenes details page # genbank release 157.0 now contains misc_diff fields for MGC clones # reloading mRNAs results in gbMiscDiff table being created. ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna rn3 ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: rn3.upstreamGeneTbl = refGene