# for emacs: -*- mode: sh; -*-

# Danio rerio (zebrafish) from Sanger, version Zv3 (released 11/27/03)
# Project website:
#   http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
#   http://www.sanger.ac.uk/Projects/D_rerio/Zv3_assembly_information.shtml

# DOWNLOAD SEQUENCE (DONE, 2004-05-17, kate)
    ssh kksilo
    mkdir /cluster/store7/danRer1
    ln -s /cluster/store7/danRer1 /cluster/data
    cd /cluster/data/danRer1
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/README
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/Zv3.contigs.agp
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/Zv3.supercontigs.agp
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/Zv3.supercontigs.fa
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/Zv3.supercontigs.fa.tag

# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2004-05-24, hartera)
# CREATE chrM.agp (DONE, 2004-05-27, hartera)
# Add "chr" prefix to chrM.agp (DONE, 2004-07-06, hartera)
# Error reported by a user for danRer2, also a problem in danRer1:
# chrM.agp was space-delimited rather than tab-delimited, so correct this;
# NCBI defines the AGP format as tab-delimited. (DONE, 2005-04-25, hartera)
    ssh kksilo
    mkdir /cluster/data/danRer1/M
    cd /cluster/data/danRer1/M
    # Go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "Danio mitochondrion genome". That shows the gi number:
    # 8576324 for the accession, AC024175
    # Use that number in the Entrez linking interface to get the fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
    # Edit chrM.fa: make sure the header line says it is the
    # Danio rerio mitochondrion complete genome, and then replace the
    # header line with just ">chrM".

    # Make a "pseudo-contig" for processing chrM too:
    mkdir ./chrM_1
    sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
    mkdir ./lift
    echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
    echo "chrM_1" > ./lift/ordered.lst
    echo "0 M/chrM_1 16596 chrM 16596" > ./lift/ordered.lft

    # hgGoldGapGl and other programs require a .agp file, so create one
    # for chrM:
    cat << '_EOF_' > ./chrM.agp
M 1 16596 1 F AC024175.3 1 16596 +
'_EOF_'

    # Add "chr" prefix to M in chrM.agp (2004-07-06)
    perl -pi.bak -e 's/M/chrM/' ./M/chrM.agp
    # check the file, then remove the backup
    rm ./M/chrM.agp.bak

    # chrM.agp is space-delimited instead of tab-delimited,
    # so correct this (hartera, 2005-04-25)
    cd /cluster/data/danRer1/M
    # Edit chrM.agp and change the field delimiters from spaces to tabs.
    # Add the new file to the zipped agp files and get it pushed to the
    # downloads area - see the "MAKE DOWNLOADABLE SEQUENCE FILES" section
    # of this make doc.

# SPLIT AGP FILES BY CHROMOSOME (DONE, 2004-05-26, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    # There are 2 .agp files: one for supercontigs, and one for contigs
    # showing how they map onto supercontigs.
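    # Given the space-vs-tab problem seen with chrM.agp above, it is
    # worth sanity-checking the downloaded AGP files before splitting
    # them. A minimal sketch (assumes the standard NCBI AGP layout:
    # 9 tab-separated fields for fragment lines, 8 for gap lines,
    # contiguous coords; FNR/FILENAME are POSIX awk):
    cat << '_EOF_' > checkAgp.awk
$5 == "N" && NF != 8 {print FILENAME": bad gap line "FNR; next}
$5 != "N" && NF != 9 {print FILENAME": bad fragment line "FNR; next}
$1 == prevObj && $2 != prevEnd + 1 {print FILENAME": not contiguous at "FNR}
{prevObj = $1; prevEnd = $3}
'_EOF_'
    awk -F'\t' -f checkAgp.awk Zv3.contigs.agp Zv3.supercontigs.agp
    # no output means both files look clean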
    # Split up the agp files into one per chrom:
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 )
      mkdir $c
      perl -we "while(<>){if (/^$c\t/) {print;}}" \
        ./Zv3.contigs.agp > $c/chr$c.contigs.agp
      perl -we "while(<>){if (/^$c\t/) {print;}}" \
        ./Zv3.supercontigs.agp > $c/chr$c.supercontigs.agp
    end
    # From the agp files, get supercontigs and contigs for
    # ctg* as chrUn, NA* as chrNA and Finished* as chrFinished:
    foreach t ( ctg NA Finished )
      if ($t == "ctg") then
        set c = "Un"
      else
        set c = $t
      endif
      mkdir $c
      perl -we "while(<>){if (/^$t/) {print;} }" \
        ./Zv3.contigs.agp >> $c/chr$c.contigs.agp
      perl -we "while(<>){if (/^$t/) {print;} }" \
        ./Zv3.supercontigs.agp >> $c/chr$c.supercontigs.agp
    end

# BUILD CHROM-LEVEL SEQUENCE (DONE, 2004-05-27, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    # Sequence is already in upper case, so no need to change it
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 )
      echo "Processing ${c}"
      $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.supercontigs.agp $c \
        $c/chr$c.fa ./Zv3.supercontigs.fa
      echo "${c} - DONE"
    end
    # Need to change the number for each chromosome in the .agp and .fa
    # files to read "chrN" - this should have been done before processing
    # the original sequence and .agp files:
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 )
      echo "Processing ${c}"
      perl -pi -e 's/^>([0-9]+)/>chr$1/' $c/*.fa
      perl -pi -e 's/^([0-9]+)/chr$1/' $c/*.agp
      echo "${c} - DONE"
    end

# CREATE chrUn, chrNA AND chrFinished FASTA, AGP, GAP AND LIFT FILES FROM
# ctg*, NA* AND Finished* SUPERCONTIGS (DONE, 2004-05-27, hartera)
# ADD CORRECT FRAGMENT TYPE TO .agp FILES (DONE, 2004-07-07, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    foreach c ( Finished NA Un )
      awk '{print $1;}' $c/chr$c.supercontigs.agp > $c/chr$c.supercontigs.lst
      $HOME/bin/i386/faSomeRecords /cluster/data/danRer1/Zv3.supercontigs.fa \
        $c/chr$c.supercontigs.lst $c/chr$c.fa
    end
    # Check the FASTA files, then generate AGP and lift files
    # from the chromosome fastas:
    foreach c ( Finished NA Un )
      $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
      mv $c/chr$c.fa $c/chr$c.supercontigs.fa
      perl -pi -e "s/chrUn/chr$c/" $c/chr$c.*
      $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.agp \
        chr$c $c/chr$c.fa ./Zv3.supercontigs.fa
    end
    # chrFinished: scaffold gap size is 1000, total scaffolds: 209
    #              chrom size is 40167097
    # chrNA:       scaffold gap size is 1000, total scaffolds: 54798
    #              chrom size is 390413307
    # chrUn:       scaffold gap size is 1000, total scaffolds: 367113659 -> 1842 scaffolds
    #              chrom size is 367113659

    # Add the correct fragment type to the .agp files (2004-07-07):
    # "W" for all supercontigs on chrUn and chrNA, "F" on chrFinished
    ssh kksilo
    cd /cluster/data/danRer1
    foreach c (Un NA Finished)
      if ($c == "Un" || $c == "NA") then
        set f = "W"
      else
        set f = "F"
      endif
      perl -pi.bak -e "s/D/$f/;" $c/chr${c}.agp
    end
    # check the .agp files, then remove the backup files
    foreach c (Un NA Finished)
      rm ./$c/chr${c}.agp.bak
    end
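    # A quick way to confirm the substitution behaved as intended: each
    # chrom's agp should now carry a single fragment type (sketch):
    foreach c (Un NA Finished)
      echo chr$c
      awk '$5 != "N" {print $5}' $c/chr${c}.agp | sort | uniq -c
    end
    # expect only W for chrUn and chrNA, only F for chrFinished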
# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2004-05-27, hartera)
    # Check that the size of each chromosome .fa file is equal to the
    # last coord of the corresponding .agp:
    ssh hgwdev
    cd /cluster/data/danRer1
    foreach c ( Finished NA Un )
      foreach f ( $c/chr$c.agp )
        set agpLen = `tail -1 $f | awk '{print $3;}'`
        set g = $f:r
        set faLen = `faSize $g.fa | awk '{print $1;}'`
        if ($agpLen == $faLen) then
          echo "   OK: $f length = $g length = $faLen"
        else
          echo "ERROR: $f length = $agpLen, but $g length = $faLen"
        endif
      end
    end
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 )
      foreach f ( $c/chr$c.supercontigs.agp )
        set agpLen = `tail -1 $f | awk '{print $3;}'`
        set h = $f:r
        set g = $h:r
        echo "Getting size of $g.fa"
        set faLen = `faSize $g.fa | awk '{print $1;}'`
        if ($agpLen == $faLen) then
          echo "   OK: $f length = $g length = $faLen"
        else
          echo "ERROR: $f length = $agpLen, but $g length = $faLen"
        endif
      end
    end
    # All are OK, so all the fasta files are the correct size

# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2004-05-27, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 Finished NA Un )
      foreach agp ($c/chr$c.{,contigs.}agp)
        if (-e $agp) then
          set fa = $c/chr$c.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end

    # Create the list of chromosomes (DONE, 2004-05-27, hartera)
    ssh hgwdev
    cd /cluster/data/danRer1
    foreach f (*/*.agp)
      set chr = `echo $f:h | sed -e 's/^chr//'`
      echo $chr >> chrom
    end
    sort -n chrom | uniq > chrom.lst
    rm chrom

# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-05-27, hartera)
    # This used to hold scripts -- better to keep them inline here.
    # Now it should just hold lift file(s) and temporary scripts made by
    # copy-paste from this file.
    mkdir /cluster/data/danRer1/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/danRer1/bed

# CREATING DATABASE (DONE, 2004-05-27, hartera)
    # Create the database.
    # next machine
    ssh hgwdev
    echo 'create database danRer1' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
    echo 'drop database danRer1' | hgsql danRer1
    # Use df to make sure there is at least 5 gig free on /var/lib/mysql:
    df -h /var/lib/mysql
    # Before loading data:
    # Filesystem            Size  Used Avail Use% Mounted on
    # /dev/sdc1             1.8T  303G  1.4T  19% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-05-27, hartera)
    # next machine
    ssh hgwdev
    # the following command copies all the data from the table
    # grp in the database galGal2 to our new database danRer1:
    echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
      | hgsql danRer1
    # if you need to delete that table:  !!! WILL DELETE ALL grp data !!!
    echo 'drop table grp;' | hgsql danRer1

# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2004-05-28, hartera)
    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kksilo
    cd /cluster/data/danRer1
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ \
          -lift=$contig.lft -minGapSize=100
        cd ../..
      end
    end
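    # A small consistency check on the faSplit lift files (sketch;
    # assumes the usual five-column lift format: offset, oldName,
    # oldSize, newName, newSize -- the newSize field should be the same
    # on every line of a given contig's lift file):
    foreach d (*/chr*_?{,?})
      set ctg = $d:t
      set n = `awk '{print $5}' $d/$ctg.lft | sort -u | wc -l`
      if ($n != 1) echo "inconsistent sizes in $d/$ctg.lft"
    end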
    #- Make the run directory and job list:
    cd /cluster/data/danRer1
    cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/danRer1/$2
/bin/cp $2 /tmp/danRer1/$2/
cd /tmp/danRer1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec danio $2
popd
/bin/cp /tmp/danRer1/$2/$2.out ./
if (-e /tmp/danRer1/$2/$2.align) /bin/cp /tmp/danRer1/$2/$2.align ./
if (-e /tmp/danRer1/$2/$2.tbl) /bin/cp /tmp/danRer1/$2/$2.tbl ./
if (-e /tmp/danRer1/$2/$2.cat) /bin/cp /tmp/danRer1/$2/$2.cat ./
/bin/rm -fr /tmp/danRer1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMZebrafish
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
        set ctg = $d:t
        foreach f ( $d/${ctg}_?{,?}.fa )
          set f = $f:t
          echo /cluster/data/danRer1/jkStuff/RMZebrafish \
            /cluster/data/danRer1/$d $f \
            '{'check out line+ /cluster/data/danRer1/$d/$f.out'}' \
            >> RMRun/RMJobs
        end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/danRer1/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    # para time
    # Completed: 3623 of 3623 jobs
    # CPU time in finished jobs:  6514173s  108569.55m  1809.49h  75.40d  0.207 y
    # IO & Wait Time:               34466s     574.43m     9.57h   0.40d  0.001 y
    # Average job time:              1808s      30.13m     0.50h   0.02d
    # Longest job:                   2539s      42.32m     0.71h   0.03d
    # Submission to last job:       23551s     392.52m     6.54h   0.27d

    #- Lift up the 500KB chunk .out's to the 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/danRer1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
          > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/danRer1
    hgLoadOut danRer1 */chr*.fa.out

# MAKE LIFTALL.LFT (DONE, 2004-05-28, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    cat */lift/ordered.lft > jkStuff/liftAll.lft

# SIMPLE REPEAT [TRF] TRACK (DONE, 2004-06-07, hartera)
    # TRF runs pretty quickly now... it takes a few hours total runtime,
    # so instead of binrsyncing and para-running, just do this on the
    # local fileserver:
    ssh kksilo
    mkdir -p /cluster/data/danRer1/bed/simpleRepeat
    cd /cluster/data/danRer1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/danRer1/*/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
          >> jobs.csh
      end
    end
    chmod a+x jobs.csh
    csh -ef jobs.csh >&! jobs.log &
    # check on this with:
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # trf crashes on chr10_5, so remove it from the job list for now and
    # run the rest of the jobs as jobs1.csh.
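    # jobs1.csh is the original job list minus the crashing chunk;
    # something like this reproduces it (a sketch, not the recorded
    # command):
    grep -v chr10_5 jobs.csh > jobs1.csh
    chmod a+x jobs1.csh
    csh -ef jobs1.csh >&! jobs1.log &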
    # Contacted the authors of trf - they are going to send a new binary.
    # In the meantime, they ran chr10_5.fa through the fixed version of
    # trf. Need to change the filenames from chr10_5.fa.* to chr10_5.tf.*
    mkdir /cluster/data/danRer1/bed/simpleRepeat/test
    cd /cluster/data/danRer1/bed/simpleRepeat/test
    mkdir trf
    # Download and unzip the results.zip file in this directory
    cat << '_EOF_' > changeFileName.pl
#!/usr/bin/perl -w
use strict;
while (<>) {
    chomp;
    my $a = $_;
    if ($a =~ /^chr10_5/) {
        print "cp $a ";
        $a =~ s/chr10_5.fa/chr10_5.tf/;
        print "$a\n";
    }
}
'_EOF_'
    # change the filenames and copy the renamed files to the /tmp/
    # directory (newFiles is a list of cp commands, so run it)
    ls > fileList
    perl changeFileName.pl < fileList > newFiles
    sh newFiles
    cp chr10_5.tf* /tmp/
    # Edit ~/kent/src/hg/trfBig/trfBig.c to remove the calls to
    # trfSysCall so trf itself is not run but its output files are
    # post-processed; make and run trfBig:
    /cluster/home/hartera/bin/i386/trfBig -trf=/cluster/bin/i386/trf \
      /cluster/data/danRer1/10/chr10_5/chr10_5.fa /dev/null \
      -bedAt=trf/chr10_5.bed -tempDir=/tmp
    # copy the bed output to the directory of bed files for the other
    # sequences
    cp /cluster/data/danRer1/bed/simpleRepeat/test/trf/chr10_5.bed \
      /cluster/data/danRer1/bed/simpleRepeat/trf
    cd /cluster/data/danRer1/bed/simpleRepeat
    liftUp simpleRepeat.bed /cluster/data/danRer1/jkStuff/liftAll.lft warn \
      trf/*.bed

    # Load into the database:
    ssh hgwdev
    hgLoadBed danRer1 simpleRepeat \
      /cluster/data/danRer1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    # Loaded 582635 elements of size 16

# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-06-07, hartera)
    # After the simpleRepeat track has been built, make a filtered
    # version of the trf output: keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/danRer1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up the filtered trf output to chrom coords as well:
    cd /cluster/data/danRer1
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end

# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF
# (DONE, 2004-06-07, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    # Soft-mask (lower-case) the contig and chr .fa's,
    # then make hard-masked versions from the soft-masked:
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    # This warning is extremely rare -- if it indicates a problem, it's
    # only with the repeat annotation and doesn't affect the masking:
    # WARNING: negative rEnd: -54 chr5:3971603-3971634 (TCTG)n
    # WARNING: negative rEnd: -31 chrNA:145096456-145096484 (TTTG)n
    # WARNING: negative rEnd: -29 chrNA:206929524-206929579 (TCCA)n
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
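    # To spot-check that masking took effect, faSize's upper/lower
    # counts are handy (sketch; chr1 chosen arbitrarily):
    faSize 1/chr1.fa
    # the soft-masked fa should report a sizable "lower" count, and the
    # hard-masked copy should have no lower-case bases at all:
    faSize 1/chr1.fa.masked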
    # The same warnings appear here too, at contig level:
    # WARNING: negative rEnd: -31 chrNA_29:4620077-4620105 (TTTG)n
    # WARNING: negative rEnd: -29 chrNA_42:1194625-1194680 (TCCA)n
    # WARNING: negative rEnd: -54 chr5_1:3971603-3971634 (TCTG)n
    # sent these to Arian

    # Build nib files, using the soft masking in the fa:
    mkdir nib
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-06-08, hartera)
    # Make symbolic links from /gbdb/danRer1/nib to the real nibs:
    ssh hgwdev
    cd /cluster/data/danRer1
    mkdir -p /gbdb/danRer1/nib
    foreach f (/cluster/data/danRer1/nib/chr*.nib)
      ln -s $f /gbdb/danRer1/nib
    end
    # Load /gbdb/danRer1/nib paths into the database and save size info;
    # hgNibSeq creates the chromInfo table:
    hgNibSeq -preMadeNib danRer1 /gbdb/danRer1/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N danRer1 > chrom.sizes
    # take a look at chrom.sizes, should be 29 lines
    wc chrom.sizes
    # Make one big 2bit file as well, and make a link to it in
    # /gbdb/danRer1/nib because hgBlat looks there:
    faToTwoBit */chr*.fa danRer1.2bit
    ln -s /cluster/data/danRer1/danRer1.2bit /gbdb/danRer1/nib/

# MAKE GOLD AND GAP TRACKS (DONE, 2004-06-08, hartera)
# REMAKE GOLD AND GAP TRACKS (DONE, 2004-07-07, hartera)
    ssh hgwdev
    cd /cluster/data/danRer1
    # The gold and gap tracks are created from the chrN.agp files, and
    # for chroms 1-25 those are currently the contigs agps. Use
    # chrN.supercontigs.agp instead, to create an assembly scaffolds
    # track as for panTro1: move chrN.agp to a "contigs" dir and rename
    # chrN.supercontigs.agp to chrN.agp.
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
               21 22 23 24 25)
      mkdir ./$c/contigs
      cp ./$c/chr${c}.agp ./$c/contigs/
      mv ./$c/chr${c}.contigs.agp ./$c/contigs/
      mv ./$c/chr${c}.supercontigs.agp ./$c/chr${c}.agp
    end
    # check the track, then delete the chrN_supercontigs gold and gap
    # tables:
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
               21 22 23 24 25)
      hgsql -e "drop table chr${c}_supercontigs_gold;" danRer1
      hgsql -e "drop table chr${c}_supercontigs_gap;" danRer1
    end
    hgGoldGapGl -noGl -chromLst=chrom.lst danRer1 /cluster/data/danRer1 .
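    # As a sanity check, gold plus gap should exactly tile each
    # chromosome (sketch, on one chrom):
    featureBits -chrom=chr1 danRer1 gold
    featureBits -chrom=chr1 danRer1 gap
    # the two base totals should sum to the chr1 size in chromInfo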
2003", \ "/gbdb/danRer1/nib", "Zebrafish", "chr1:1000-4000", 1, \ 41, "Zebrafish", "Danio Rerio", \ "/gbdb/danRer1/html/description.html", 0, 0, \ "Sanger Centre, Danio rerio Sequencing Project Zv3");' \ | hgsql -h genome-testdb hgcentraltest echo 'insert into defaultDb (genome, name) \ values ("Zebrafish", "danRer1");' \ | hgsql -h genome-testdb hgcentraltest # Move Zebrafish so it is between Chicken and Fugu in the organism list echo 'update dbDb set orderKey = 38 where name = "danRer1";' \ | hgsql -h genome-testdb hgcentraltest # Change default position to show "tiggy-winkle hedgehog" gene # involved in signaling in development # (2004-07-16, hartera) echo 'update dbDb set defaultPos = "chr2:16,330,443-16,335,196" \ where name = "danRer1";' | hgsql -h genome-testdb hgcentraltest # error in scientific name: uppercase beginning of species name so correct # (2004-08-19, hartera) echo 'update dbDb set scientificName = "Danio rerio" where \ name = "danRer1";' | hgsql -h genome-testdb hgcentraltest # set active = 0 for danRer1 in dbDb on hgcentralbeta and hgcentral # (see section on ARCHIVING danRer1) (2006-05-02, hartera) # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-06-30, hartera) ssh hgwdev mkdir /cluster/data/danRer1/html cd /cluster/data/danRer1/html # make a symbolic link from /gbdb/danRer1/html to /cluster/data/danRer1/html ln -s /cluster/data/danRer1/html /gbdb/danRer1/html # Add a description page for zebrafish cd /cluster/data/danRer1/html cp /cluster/data/fr1/html/*.html . # Edit this for zebrafish # create a description.html page here mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer1 cd ~/kent/src/hg/makeDb/trackDb/ cvs add zebrafish cvs commit zebrafish cd zebrafish cvs add danRer1 cvs commit danRer1 # Add description page here too cp /cluster/data/danRer1/html/description.html \ $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/ chmod a+r \ $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/description.html cd $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/ # Check it in and copy (ideally using "make alpha" in trackDb) to # /gbdb/danRer1/html cvs add description.html cvs commit description.html # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, 2004-06-09, hartera) ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/danRer1/nib mkdir -p /iscratch/i/danRer1/nib cp -p /cluster/data/danRer1/nib/chr*.nib /iscratch/i/danRer1/nib # Pseudo-contig fa that have been repeat- and trf-masked: rm -rf /iscratch/i/danRer1/trfFa mkdir /iscratch/i/danRer1/trfFa foreach d (/cluster/data/danRer1/*/chr*_?{,?}) cp $d/$d:t.fa /iscratch/i/danRer1/trfFa end rm -rf /iscratch/i/danRer1/rmsk mkdir -p /iscratch/i/danRer1/rmsk cp -p /cluster/data/danRer1/*/chr*.fa.out /iscratch/i/danRer1/rmsk cp -p /cluster/data/danRer1/danRer1.2bit /iscratch/i/danRer1/ iSync # CREATE gc5Base wiggle TRACK (DONE, 2004-06-09, hartera) ssh kki mkdir /cluster/data/danRer1/bed/gc5Base cd /cluster/data/danRer1/bed/gc5Base # in the script below, the 'grep -w GC' selects the lines of # output from hgGcPercent that are real data and not just some # information from hgGcPercent. The awk computes the number # of bases that hgGcPercent claimed it measured, which is not # necessarily always 5 if it ran into gaps, and then the division # by 10.0 scales down the numbers from hgGcPercent to the range # [0-100]. Two columns come out of the awk print statement: # and which are fed into wigAsciiToBinary through # the pipe. 
# CREATE gc5Base wiggle TRACK (DONE, 2004-06-09, hartera)
    ssh kki
    mkdir /cluster/data/danRer1/bed/gc5Base
    cd /cluster/data/danRer1/bed/gc5Base
    # In the script below, the 'grep -w GC' selects the lines of output
    # from hgGcPercent that are real data and not just informational
    # messages. The awk computes the number of bases that hgGcPercent
    # claimed it measured, which is not necessarily always 5 if it ran
    # into gaps, and the division by 10.0 scales the hgGcPercent numbers
    # down to the range [0-100]. Two columns come out of the awk print
    # statement, <position> and <value>, which are fed into
    # wigAsciiToBinary through the pipe. The dataSpan is set at 5
    # because each value represents the measurement over five bases
    # beginning with <position>. The result files end up in ./wigData5.
    # A new script is used (from makeHg17.doc) which gets around the
    # problem that wigAsciiToBinary was calculating chromEnd to be
    # beyond the real chromosome end.
    mkdir wigData5 dataLimits5
    cat << '_EOF_' > kkRun.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 danRer1 \
    /iscratch/i/danRer1/nib | \
    grep -w GC | \
    awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
    wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
        -wibFile=wigData5/gc5Base_${chrom} \
        -name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x kkRun.sh
    ls /iscratch/i/danRer1/nib > nibList
    cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 nibList single gsub jobList
    para create jobList
    para try, check, push ... etc
    # para time
    # Completed: 29 of 29 jobs
    # CPU time in finished jobs:  2450s  40.84m  0.68h  0.03d  0.000 y
    # IO & Wait Time:                0s   0.00m  0.00h  0.00d  0.000 y
    # Average job time:             80s   1.33m  0.02h  0.00d
    # Longest job:                 506s   8.43m  0.14h  0.01d
    # Submission to last job:      650s  10.83m  0.18h  0.01d

    # load the .wig files back on hgwdev:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/danRer1/wib/gc5Base \
      danRer1 gc5Base wigData5/*.wig
    # and symlink the .wib files into /gbdb
    mkdir /gbdb/danRer1/wib/gc5Base
    ln -s `pwd`/wigData5/*.wib /gbdb/danRer1/wib/gc5Base

    # And then the zoomed data view:
    ssh kki
    cd /cluster/data/danRer1/bed/gc5Base
    mkdir wigData5_1K dataLimits5_1K
    cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 danRer1 \
    /iscratch/i/danRer1/nib | \
    grep -w GC | \
    awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
    wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
        -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
        -name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x kkRunZoom.sh
    cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 nibList single gsubZoom jobListZoom
    para create jobListZoom
    para try, check, push, ... etc.
    # para time
    # Completed: 29 of 29 jobs
    # CPU time in finished jobs:  2433s  40.55m  0.68h  0.03d  0.000 y
    # IO & Wait Time:                0s   0.00m  0.00h  0.00d  0.000 y
    # Average job time:             81s   1.36m  0.02h  0.00d
    # Longest job:                 522s   8.70m  0.14h  0.01d
    # Submission to last job:     2503s  41.72m  0.70h  0.03d

    # Then load these .wig files into the same table as above:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/danRer1/wib/gc5Base \
      -oldTable danRer1 gc5Base wigData5_1K/*.wig
    # and symlink these .wib files into /gbdb
    mkdir -p /gbdb/danRer1/wib/gc5Base
    ln -s `pwd`/wigData5_1K/*.wib /gbdb/danRer1/wib/gc5Base
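    # For a feel for what the pipeline sees, one can preview a few raw
    # hgGcPercent windows on a single chrom (sketch):
    hgGcPercent -chr=chr1 -doGaps -file=stdout -win=5 danRer1 \
      /iscratch/i/danRer1/nib | grep -w GC | head -3
    # each data line is <chrom> <start> <end> GC <per-mille GC>; the awk
    # step above turns that into a 1-based <position> and <value>/10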
# BLASTZ FOR HG17 (DONE, 2004-06-16, hartera)
    ssh kkr1u00
    # blastz requires lineage-specific repeats;
    # treat all repeats as lineage-specific:
    mkdir /iscratch/i/danRer1/linSpecRep.notInHuman
    foreach f (/iscratch/i/danRer1/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/danRer1/linSpecRep.notInHuman/$f:t:r:r.out.spec
    end
    mkdir /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
    foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
      cp -p $f \
        /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
    end
    iSync

    ssh kk
    mkdir -p /cluster/data/danRer1/bed/blastz.hg17.2004-06-08
    ln -s /cluster/data/danRer1/bed/blastz.hg17.2004-06-08 \
      /cluster/data/danRer1/bed/blastz.hg17
    cd /cluster/data/danRer1/bed/blastz.hg17
    # Set L=6000 and abridge repeats - these are the same parameters
    # used for hg16 and Fugu.
    cat << '_EOF_' > DEF
# zebrafish vs human (hg17)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Zebrafish
SEQ1_DIR=/iscratch/i/danRer1/nib/
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/danRer1/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Human (hg17)
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/danRer1/bed/blastz.hg17

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    # Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.danRer1-hg17.2004-06-08
    # Need the shell scripts from mm4 to do the cluster runs:
    cp /cluster/data/mm4/jkStuff/BlastZ*.sh /cluster/data/danRer1/jkStuff/
    # Edit BlastZ_run0.sh: on line 22, replace
    # /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/ --
    # the directory for the latest version of blastz-run.

    # prepare the first cluster run:
    ssh kk
    cd /cluster/data/danRer1/bed/blastz.hg17
    bash	# if a csh/tcsh user
    . ./DEF
    /cluster/data/danRer1/jkStuff/BlastZ_run0.sh
    cd run.0
    # check that the batch looks ok, then
    para try, check, push, check, ....
    # para time
    # Completed: 57970 of 57970 jobs
    # CPU time in finished jobs:  20265671s  337761.18m  5629.35h  234.56d  0.643 y
    # IO & Wait Time:               570125s    9502.09m   158.37h    6.60d  0.018 y
    # Average job time:                359s       5.99m     0.10h    0.00d
    # Longest job:                   10482s     174.70m     2.91h    0.12d
    # Submission to last job:        73680s    1228.00m    20.47h    0.85d
    # Took about 17 hours to run. Output is 6.3 Gigabytes - rather
    # large. Try processing it and check in the browser before adjusting
    # the parameters.
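    # (The job count is just target chunks x query chunks: ~1.46 Gb of
    # danRer1 cut per-chrom into 10 Mb chunks gives 170 pieces, and hg17
    # at ~3.1 Gb gives 341; 170 * 341 = 57970 blastz jobs. Those 170
    # target chunks are also why run.1 below has 170 jobs.)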
    # Second cluster run to convert the .out's to .lav's:
    cd /cluster/data/danRer1/bed/blastz.hg17
    bash	# if a csh/tcsh user
    . ./DEF
    /cluster/data/danRer1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
    # para time
    # Completed: 170 of 170 jobs
    # CPU time in finished jobs:   16047s   267.45m   4.46h  0.19d  0.001 y
    # IO & Wait Time:             146162s  2436.03m  40.60h  1.69d  0.005 y
    # Average job time:              954s    15.90m   0.27h  0.01d
    # Longest job:                  2179s    36.32m   0.61h  0.03d
    # Submission to last job:       2451s    40.85m   0.68h  0.03d

    # Third cluster run to convert the lav's to axt's:
    ssh kk
    cd /cluster/data/danRer1/bed/blastz.hg17
    mkdir axtChrom
    # a new run directory
    mkdir run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
  | lavToAxt stdin /iscratch/i/danRer1/nib \
    /iscratch/i/gs.18/build35/bothMaskedNibs stdout \
  | axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer1/bed/blastz.hg17/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    \ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList
    para create jobList
    para try, check, push, check,...
    # para time
    # Completed: 27 of 29 jobs
    # Crashed: 2 jobs
    # CPU time in finished jobs:   2063s   34.38m  0.57h  0.02d  0.000 y
    # IO & Wait Time:             19245s  320.75m  5.35h  0.22d  0.001 y
    # Average job time:             789s   13.15m  0.22h  0.01d
    # Longest job:                 9103s  151.72m  2.53h  0.11d
    # Submission to last job:     14332s  238.87m  3.98h  0.17d
    # chrNA and chrUn are the crashed jobs. These are very large and ran
    # out of memory. Re-run them separately on kolossus:
    ssh kk
    # put the nibs on bluearc
    mkdir -p /cluster/bluearc/danRer1/nib
    mkdir -p /cluster/bluearc/hg17/bothMaskedNibs
    cp /iscratch/i/danRer1/nib/* /cluster/bluearc/danRer1/nib
    cp /iscratch/i/gs.18/build35/bothMaskedNibs/* \
      /cluster/bluearc/hg17/bothMaskedNibs
    cat << '_EOF_' > do2.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
  | lavToAxt stdin /cluster/bluearc/danRer1/nib \
    /cluster/bluearc/hg17/bothMaskedNibs stdout \
  | axtSort stdin $2
'_EOF_'
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.hg17/run.2
    ./do2.csh ../lav/chrNA \
      /cluster/data/danRer1/bed/blastz.hg17/axtChrom/chrNA.axt
    ./do2.csh ../lav/chrUn \
      /cluster/data/danRer1/bed/blastz.hg17/axtChrom/chrUn.axt

    # translate the sorted axt files into psl:
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.hg17
    mkdir -p pslChrom
    set tbl = "blastzHg17"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load the database tables:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.hg17/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl -noTNameIx danRer1 $f
      echo "$f Done"
    end

    # Tuning the blastz parameters: the parameters above were used, but
    # it was thought that there were probably too many pileups, so the
    # blastz was repeated with L=10000 (as for hg16-galGal2) and with an
    # intermediate value of L=8000. L is the score threshold for gapped
    # alignments. Looking at alignments in the browser, L=10000 and
    # L=8000 were too stringent: alignments to coding regions were being
    # lost. L=6000 (as above) seemed to be the best choice; low-scoring
    # alignments could then be filtered at the chaining step instead to
    # reduce pileups. e.g. using the browser with the non-zebrafish
    # mRNAs track turned on:
    # chr1:24,547-28,249 lost an exon with everything but the L=6000
    # blastz; chr2:1,124,174-1,135,282 is worse, with an entire gene
    # being lost except with the L=6000 blastz.
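    # featureBits gives a quick number to back up the browser checks; a
    # sketch of the comparison (the L=8000 and L=10000 runs were only
    # examined in the browser, so the second table name here is
    # hypothetical):
    featureBits -chrom=chr1 danRer1 refGene:cds blastzHg17
    # featureBits -chrom=chr1 danRer1 refGene:cds blastzHg17L8000
    # a drop in the refGene:cds intersection means lost coding exons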
# RESCORE HG17 BLASTZ (DONE, 2004-06-21, hartera)
# USE HG17 BLASTZ WITH L=6000 SO AS NOT TO MISS TOO MUCH CDS ALIGNING
    # Low scores can occur with repeats abridged and using the HoxD55.q
    # matrix: PSU's restore_rpts program rescored alignments with the
    # default matrix instead of the BLASTZ_Q matrix. Rescore them here
    # so the chainer sees the higher scores:
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.hg17
    mkdir axtChrom.rescore
    foreach f (axtChrom/chr*.axt)
      axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
        $f axtChrom.rescore/$f:t
    end
    mv axtChrom axtChrom.orig
    mv axtChrom.rescore axtChrom
    # psl files and blastz tables will be the same regardless of score,
    # so no need to reload

# CHAIN HG17 BLASTZ (DONE, 2004-06-24, hartera)
    # Redo the chains with the rescored blastz Hg17.
    # Run axtChain on the little cluster:
    ssh kki
    cd /cluster/data/danRer1/bed/blastz.hg17
    mv axtChain axtChain.orig
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    # create 2 input lists, as chrNA and chrUn need to be processed
    # separately:
    ls -1S /cluster/data/danRer1/bed/blastz.hg17/axtChrom/*.axt \
      > input.lst
    grep "chrNA" input.lst > inputNAandUn.lst
    grep "chrUn" input.lst >> inputNAandUn.lst
    # remove chrNA and chrUn from input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    # Reuse the gap penalties from the hg16 vs chicken run (the fields
    # below are tab-separated):
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 \
  | axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
      -linearGap=../../chickenHumanTuned.gap \
      -minScore=10000 stdin \
      /iscratch/i/danRer1/nib \
      /iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # para time
    # Completed: 27 of 27 jobs
    # CPU time in finished jobs:  4572s  76.21m  1.27h  0.05d  0.000 y
    # IO & Wait Time:             1252s  20.86m  0.35h  0.01d  0.000 y
    # Average job time:            216s   3.60m  0.06h  0.00d
    # Longest job:                1092s  18.20m  0.30h  0.01d
    # Submission to last job:     3242s  54.03m  0.90h  0.04d

    cat << '_EOF_' > gsub2
#LOOP
doChain2 {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain2
#!/bin/csh
axtFilter $1 \
  | axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
      -linearGap=../../chickenHumanTuned.gap \
      -minScore=15000 stdin \
      /iscratch/i/danRer1/nib \
      /iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 inputNAandUn.lst single gsub2 jobList2
    para create jobList2
    para try, check, push, check...
    # para time
    # crashed (out of memory), so try again on kolossus:
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain/run1
    # use the sequences on bluearc for kolossus
    cat << '_EOF_' > doChain2
#!/bin/csh
axtFilter $1 \
  | axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
      -linearGap=../../chickenHumanTuned.gap \
      -minScore=15000 stdin \
      /cluster/bluearc/danRer1/nib \
      /cluster/bluearc/hg17/bothMaskedNibs $2 >& $3
'_EOF_'
    chmod +x doChain2
    doChain2 /cluster/data/danRer1/bed/blastz.hg17/axtChrom/chrNA.axt \
      chain/chrNA.chain out/chrNA.out
    doChain2 /cluster/data/danRer1/bed/blastz.hg17/axtChrom/chrUn.axt \
      chain/chrUn.chain out/chrUn.out

    # now on the cluster server, sort the chains
    ssh kksilo
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # take a look at the score distributions; try also with a larger
    # bin size:
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r >> hist.out
      textHistogram -binSize=10000 /tmp/score.$f:t:r >> hist.out
      echo "" >> hist.out
    end

    # Load the chains into the database
    # next machine
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain danRer1 ${c}_chainHg17 $i
      echo done $c
    end
    # The original chains (minScore=5000 on all chroms) were loaded as
    # chainHg17NoFilt for comparison:
    # featureBits -chrom=chr1 danRer1 chainHg17
    #   12359874 bases of 40488791 (30.527%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17
    #   147375 bases of 40488791 (0.364%) in intersection
    # featureBits -chrom=chr1 danRer1 chainHg17NoFilt
    #   13518074 bases of 40488791 (33.387%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17NoFilt
    #   149120 bases of 40488791 (0.368%) in intersection
    # featureBits -chrom=chrNA danRer1 chainHg17
    #   79678867 bases of 335615307 (23.741%) in intersection
    # featureBits -chrom=chrNA danRer1 refGene:cds chainHg17
    #   556727 bases of 335615307 (0.166%) in intersection
    # featureBits -chrom=chrNA danRer1 chainHg17NoFilt
    #   100579306 bases of 335615307 (29.969%) in intersection
    # featureBits -chrom=chrNA danRer1 refGene:cds chainHg17NoFilt
    #   622781 bases of 335615307 (0.186%) in intersection
    # featureBits -chrom=chrNA danRer1 refGene:cds
    #   741426 bases of 335615307 (0.221%) in intersection
    # Using minScore=10000 for all chroms but 15000 for chrUn and chrNA
    # reduces low-scoring chains without compromising CDS region
    # coverage too much.

# NET HG17 BLASTZ (DONE, 2004-07-07, hartera)
    ssh kksilo
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
        ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
        ../n1/$n /dev/null
    end
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 111640576, utime 625 s/100, stime 120

    # Add classification info using db tables:
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    # netClass looks for ancient repeats in one of the databases.
    # hg17 has this table - hand-curated by Arian - but it is for
    # human-rodent comparisons, so do not use it here; use the -noAr
    # option.
    mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInHuman
    mkdir -p /cluster/bluearc/hg17/linSpecRep.notInZebrafish
    cp /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/* \
      /cluster/bluearc/hg17/linSpecRep.notInZebrafish
    cp /iscratch/i/danRer1/linSpecRep.notInHuman/* \
      /cluster/bluearc/danRer1/linSpecRep.notInHuman
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    time netClass noClass.net danRer1 hg17 humanhg17.net \
      -tNewR=/cluster/bluearc/danRer1/linSpecRep.notInHuman \
      -qNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish -noAr
    # 85.530u 52.020s 5:15.86 43.5%   0+0k 0+0io 2072pf+0w

    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    netFilter -minGap=10 humanhg17.net | hgLoadNet danRer1 netHg17 stdin

# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2004-06-09, hartera)
    # Use -repMatch=460 (based on size -- for human we use 1024, and
    # the zebrafish genome is ~45% of the size of the human genome).
    ssh kkr1u00
    mkdir /cluster/data/danRer1/bed/ooc
    cd /cluster/data/danRer1/bed/ooc
    mkdir -p /cluster/bluearc/danRer1
    ls -1 /cluster/data/danRer1/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/danRer1/11.ooc -repMatch=460
    # Wrote 44155 overused 11-mers to /cluster/bluearc/danRer1/11.ooc
    # For 10.ooc, repMatch = 4096 for human, so use 1840
    blat nib.lst /dev/null /dev/null -tileSize=10 \
      -makeOoc=/cluster/bluearc/danRer1/10.ooc -repMatch=1840
    # Wrote 10767 overused 10-mers to /cluster/bluearc/danRer1/10.ooc
    # keep copies of the ooc files in this directory, and copy them to
    # iscratch:
    cp /cluster/bluearc/danRer1/*.ooc .
    cp -p /cluster/bluearc/danRer1/*.ooc /iscratch/i/danRer1/
    iSync
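    # (The repMatch values are just the human defaults scaled by the
    # relative genome size: 1024 * 0.45 ~= 460 for 11-mers, and
    # 4096 * 0.45 ~= 1840 for 10-mers.)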
# AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-07-01, hartera)
    ssh eieio
    cd /cluster/data/genbank
    # This is a new organism; edit the etc/genbank.conf file and add:
# danRer1 (zebrafish)
danRer1.genome = /iscratch/i/danRer1/nib/chr*.nib
danRer1.lift = /cluster/data/danRer1/jkStuff/liftAll.lft
danRer1.downloadDir = danRer1
    # The default includes native GenBank mRNAs and ESTs, GenBank xeno
    # mRNAs but no xeno ESTs, and native RefSeq mRNAs but not xeno
    # RefSeq.
    cvs commit -m "Added danRer1" etc/genbank.conf
    # Edit $HOME/kent/src/hg/makeDb/genbank/src/align/gbBlat
    # to add /iscratch/i/danRer1/11.ooc:
    #   Add line: DANRER_OOC=/iscratch/i/danRer1/11.ooc
    #   Add line: danRer*) oocOpt="-ooc=${DANRER_OOC}" ;;
    cvs diff src/align/gbBlat
    make
    cvs commit -m "Added 11.ooc for danRer1." src/align/gbBlat
    # Edit $HOME/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c to add
    # the new genome:
    #   Add: static char *danRerNames[] = {"Danio rerio", NULL};
    #   Add: {"danRer", danRerNames, NULL},
    #        to static struct dbToSpecies dbToSpeciesMap[]
    cvs commit -m "Added zebrafish, danRer1." lib/gbGenome.c
    # Install to /cluster/data/genbank:
    cd $HOME/kent/src/hg/makeDb/genbank
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, RefSeq mRNA only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial danRer1
    # Load the results for RefSeq mRNAs:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad danRer1
    # To start the next run, the results need to be moved out of the way:
    cd /cluster/data/genbank/work
    mv initial.danRer1 initial.danRer1.refseq.mrna

    # -initial run for GenBank mRNAs:
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial danRer1
    # Load the results for GenBank mRNAs:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 danRer1
    cd /cluster/data/genbank/work
    mv initial.danRer1 initial.danRer1.genbank.mrna

    # -initial run for ESTs:
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial danRer1
    # Load the results for GenBank ESTs:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 danRer1
    cd /cluster/data/genbank/work
    mv initial.danRer1 initial.danRer1.genbank.est

# XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk)
    # see makeXenTro1.doc and search for zb.danRer1
    # The results of this are also symlinked under danRer1/bed

# ENSEMBL GENES (DONE, 2004-06-22, hartera)
    mkdir /cluster/data/danRer1/bed/ensembl
    cd /cluster/data/danRer1/bed/ensembl
    # Get the Ensembl protein data from
    # http://www.ensembl.org/Danio_rerio/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Danio_rerio choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box.
    # Page 4) Choose GTF as the output and gzip compression. Hit export.
    # Save as ensemblGene.gtf.gz

    # Need to get lift files for each chrom from supercontig to chrom
    # level:
    ssh kksilo
    mkdir -p /cluster/data/danRer1/bed/ensembl/liftSupertoChrom
    cd /cluster/data/danRer1/bed/ensembl/liftSupertoChrom
    foreach c (NA Un Finished)
      set b = "/cluster/data/danRer1"
      awk '{print $6;}' $b/$c/chr$c.supercontigs.agp > chr$c.supercontigs.lst
      $HOME/bin/i386/faSomeRecords $b/Zv3.supercontigs.fa \
        ./chr$c.supercontigs.lst ./chr$c.fa
      echo $c " done"
    end
    # check the FASTA files, then generate AGP and lift files from the
    # chromosome fastas:
    foreach c (NA Un Finished)
      $HOME/bin/i386/scaffoldFaToAgp ./chr$c.fa
      sed -e "s/chrUn/${c}/;" chr$c.agp > chr$c.superToChrom.agp
      sed -e "s/chrUn/${c}/;" chr$c.lft > chr$c.superToChrom.lft
    end
    cat *.superToChrom.lft > liftUnSuperToChrom.lft

    # get the chrUn, Finished and NA records:
    cd /cluster/data/danRer1/bed/ensembl
    awk '$1 ~ /NA/ || $1 ~ /ctg/ || $1 ~ /Finished/' ensemblGene.gtf \
      > ensemblGenechrUns.gtf
    awk '$1 !~ /NA/ && $1 !~ /ctg/ && $1 !~ /Finished/' ensemblGene.gtf \
      > ensemblGenechroms.gtf
    liftUp -type=.gtf ensemblGenechrUns.lifted \
      ./liftSupertoChrom/liftUnSuperToChrom.lft warn ensemblGenechrUns.gtf
    # Got 113698 lifts in ./liftSupertoChrom/liftUnSuperToChrom.lft
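    # A quick lift sanity check (sketch): lifting rewrites coordinates
    # but should not drop lines, and afterwards every chrom name should
    # be one of the three virtual chroms:
    wc -l ensemblGenechrUns.gtf ensemblGenechrUns.lifted
    awk '$1 !~ /^(NA|Un|Finished)$/' ensemblGenechrUns.lifted | head
    # expect identical line counts and no output from the awk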
Add "chr" to front of each line cp ensemblGenechroms.gtf all.gtf cat ensemblGenechrUns.lifted >> all.gtf sed -e "s/^/chr/" all.gtf > ensGene.gtf # load into database ssh hgwdev cd /cluster/data/danRer2/bed/ensembl /cluster/bin/i386/ldHgGene danRer1 ensGene \ /cluster/data/danRer1/bed/ensembl/ensGene.gtf # Read 30783 transcripts in 554794 lines in 1 files # 30783 groups 28 seqs 1 sources 4 feature types # 30783 gene predictions # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql danRer1 < ~/kent/src/hg/lib/ensGtp.sql # remove header line from ensGtp.txt echo "load data local infile 'ensGtp.txt' into table ensGtp" \ | hgsql -N danRer1 # Get the ensembl peptide sequences from # http://www.ensembl.org/Danio_rerio/martview # Choose Danio Rerio as the organism # Follow this sequence through the pages: # Page 1) Choose the Ensembl Genes choice. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Sequences" tab. # Page 4) Choose Transcripts/Proteins and peptide Only as the output, # choose text/fasta and gzip compression, # name the file ensGeneDanRer1.pep.gz and then hit export. gunzip ensGeneDanRer1.pep.gz # delete * at end of some proteins cat ensGeneDanRer1.pep | sed 's/\*$//' > ensembl.pep ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep hgPepPred danRer1 generic ensPep fixPep_ensembl.pep # BLASTZ FUGU (Fr1) (DONE, 2004-06-13, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats but there are none # available between these two fish species to make empty files mkdir -p /iscratch/i/danRer1/linSpecRep.notInDanRer mkdir -p /iscratch/i/fugu/linSpecRep.notInFugu cd /cluster/data/danRer1 # for zebrafish foreach c (`cat chrom.lst`) cp /dev/null /iscratch/i/danRer1/linSpecRep.notInDanRer/chr${c}.out.spec end # for Fugu cp /dev/null /iscratch/i/fugu/linSpecRep.notInFugu/chrUn.out.spec iSync ssh kk mkdir -p /cluster/data/danRer1/bed/blastz.fr1.2004-06-13 ln -s /cluster/data/danRer1/bed/blastz.fr1.2004-06-13 \ /cluster/data/danRer1/bed/blastz.fr1 cd /cluster/data/danRer1/bed/blastz.fr1 cat << '_EOF_' > DEF # zebrafish (danRer1) vs. 
# BLASTZ FUGU (Fr1) (DONE, 2004-06-13, hartera)
    ssh kkr1u00
    # blastz requires lineage-specific repeats, but there are none
    # available between these two fish species, so make empty files:
    mkdir -p /iscratch/i/danRer1/linSpecRep.notInDanRer
    mkdir -p /iscratch/i/fugu/linSpecRep.notInFugu
    cd /cluster/data/danRer1
    # for zebrafish
    foreach c (`cat chrom.lst`)
      cp /dev/null /iscratch/i/danRer1/linSpecRep.notInDanRer/chr${c}.out.spec
    end
    # for Fugu
    cp /dev/null /iscratch/i/fugu/linSpecRep.notInFugu/chrUn.out.spec
    iSync

    ssh kk
    mkdir -p /cluster/data/danRer1/bed/blastz.fr1.2004-06-13
    ln -s /cluster/data/danRer1/bed/blastz.fr1.2004-06-13 \
      /cluster/data/danRer1/bed/blastz.fr1
    cd /cluster/data/danRer1/bed/blastz.fr1
    cat << '_EOF_' > DEF
# zebrafish (danRer1) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer1)
SEQ1_DIR=/iscratch/i/danRer1/nib
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species so these files are empty
SEQ1_SMSK=/iscratch/i/danRer1/linSpecRep.notInDanRer
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Fugu (fr1)
# soft-masked chrom nib
SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
SEQ2_RMSK=
SEQ2_SMSK=/iscratch/i/fugu/linSpecRep.notInFugu
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/danRer1/bed/blastz.fr1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    # save the DEF file in the current standard place
    chmod +x DEF
    cp DEF ~angie/hummus/DEF.danRer1-fr1.2004-06-13

    # set up the cluster run: source the DEF file
    bash
    . ./DEF
    /cluster/data/danRer1/jkStuff/BlastZ_run0.sh
    cd run.0
    # check that the batch looks ok, then
    para try, check, push, check, ....
    # para time
    # Completed: 5950 of 5950 jobs
    # CPU time in finished jobs:  1590351s  26505.86m  441.76h  18.41d  0.050 y
    # IO & Wait Time:               27786s    463.09m    7.72h   0.32d  0.001 y
    # Average job time:               272s      4.53m    0.08h   0.00d
    # Longest job:                    601s     10.02m    0.17h   0.01d
    # Submission to last job:        4298s     71.63m    1.19h   0.05d

    # Second cluster run to convert the .out's to .lav's:
    cd /cluster/data/danRer1/bed/blastz.fr1
    bash	# if a csh/tcsh user
    . ./DEF
    /cluster/data/danRer1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
    # para time
    # Completed: 170 of 170 jobs
    # CPU time in finished jobs:   1722s   28.69m  0.48h  0.02d  0.000 y
    # IO & Wait Time:             16055s  267.59m  4.46h  0.19d  0.001 y
    # Average job time:             105s    1.74m  0.03h  0.00d
    # Longest job:                  152s    2.53m  0.04h  0.00d
    # Submission to last job:       310s    5.17m  0.09h  0.00d

    # Third cluster run to convert the lav's to axt's:
    ssh kk
    cd /cluster/data/danRer1/bed/blastz.fr1
    mkdir axtChrom
    # a new run directory
    mkdir run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
  | lavToAxt stdin /iscratch/i/danRer1/nib \
    /cluster/bluearc/fugu/fr1/chromNib stdout \
  | axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer1/bed/blastz.fr1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    \ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList
    para create jobList
    para try, check, push, check,...
    # para time
    # Completed: 29 of 29 jobs
    # CPU time in finished jobs:   600s  10.00m  0.17h  0.01d  0.000 y
    # IO & Wait Time:             2782s  46.37m  0.77h  0.03d  0.000 y
    # Average job time:            117s   1.94m  0.03h  0.00d
    # Longest job:                 401s   6.68m  0.11h  0.00d
    # Submission to last job:      966s  16.10m  0.27h  0.01d

    # translate the sorted axt files into psl:
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.fr1
    mkdir -p pslChrom
    set tbl = "blastzFr1"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load the database tables:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.fr1/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl danRer1 $f
      echo "$f Done"
    end

# CHAIN FUGU (Fr1) BLASTZ (DONE, 2004-06-24, hartera)
    # Create chains using -minScore=5000 for axtChain on the chroms, and
    # -minScore=10000 for the messy chroms, chrUn and chrNA.
    # Run axtChain on the little cluster:
    ssh kki
    cd /cluster/data/danRer1/bed/blastz.fr1
    mkdir -p axtChain2/run1
    cd axtChain2/run1
    mkdir out chain
    # create 2 input lists, as chrNA and chrUn need to be processed
    # separately:
    ls -1S /cluster/data/danRer1/bed/blastz.fr1/axtChrom/*.axt \
      > input.lst
    grep "chrNA" input.lst > inputNAandUn.lst
    grep "chrUn" input.lst >> inputNAandUn.lst
    # remove chrNA and chrUn from input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -minScore=5000 $1 \
  /iscratch/i/danRer1/nib \
  /cluster/bluearc/fugu/fr1/chromNib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # para time
    # Completed: 27 of 27 jobs
    # CPU time in finished jobs:  356s   5.94m  0.10h  0.00d  0.000 y
    # IO & Wait Time:             128s   2.13m  0.04h  0.00d  0.000 y
    # Average job time:            18s   0.30m  0.00h  0.00d
    # Longest job:                 28s   0.47m  0.01h  0.00d
    # Submission to last job:     830s  13.83m  0.23h  0.01d

    cat << '_EOF_' > gsub2
#LOOP
doChain2 {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain2
#!/bin/csh
axtChain -minScore=10000 $1 \
  /iscratch/i/danRer1/nib \
  /cluster/bluearc/fugu/fr1/chromNib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x doChain2
    gensub2 inputNAandUn.lst single gsub2 jobList2
    para create jobList2
    para try, check, ...
    # para time
    # CPU time in finished jobs:  175s  2.92m  0.05h  0.00d  0.000 y
    # IO & Wait Time:              64s  1.06m  0.02h  0.00d  0.000 y
    # Average job time:           120s  1.99m  0.03h  0.00d
    # Longest job:                120s  2.00m  0.03h  0.00d
    # Submission to last job:     120s  2.00m  0.03h  0.00d

    # now on the cluster server, sort the chains
    ssh kksilo
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain2
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain

    # Load the chains into the database
    # next machine
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.fr1/
    # replace the original axtChain, which was made with the default
    # minScore (=1000):
    mv axtChain axtChain.orig
    mv axtChain2 axtChain
    # remove the old chain tables for Filt and Filt2 from the database
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain danRer1 ${c}_chainFr1 $i
      echo done $c
    end
    # chain tables for fr1:
    # chainFr1:      default minScore = 1000
    # chainFr1Filt1: minScore = 10000 for all chroms, except 15000 for
    #                Un and NA
    # chainFr1Filt2: minScore = 5000 for all chroms, except 10000 for
    #                Un and NA (chainFr1Filt2 is now chainFr1)
    # coverage is calculated as
    #   size(refGene:CDS & Fr1) / size(refGene:CDS)
    # and enrichment as
    #   (size(refGene:CDS & Fr1) / size(Fr1)) /
    #       (size(refGene:CDS) / genome size)
    # Chains:
    # featureBits -chrom=chr1 danRer1 chainFr1Link
    #   2170390 bases of 40488791 (5.360%) in intersection
    # featureBits -chrom=chr1 danRer1 chainFr1FiltLink
    #   1505825 bases of 40488791 (3.719%) in intersection
    # featureBits -chrom=chr1 danRer1 chainFr1Filt2Link
    #   1897128 bases of 40488791 (4.686%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1Link
    #   149280 bases of 40488791 (0.369%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1FiltLink
    #   135943 bases of 40488791 (0.336%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1Filt2Link
    #   145585 bases of 40488791 (0.360%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds
    #   165468 bases of 40488791 (0.409%) in intersection
    # featureBits danRer1 refGene:cds
    #   4852787 bases of 1459132082 (0.333%) in intersection
    #
    #                       Coverage   Enrichment
    # chainFr1Link:           90.2%      17.0X
    # chainFr1FiltLink:       82.2%      22.1X
    # chainFr1Filt2Link:      88.0%      18.8X
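    # Worked through for chainFr1Link on chr1 with the featureBits
    # numbers above (small differences are rounding):
    #   coverage   = 149280 / 165468 = 90.2%
    #   enrichment = (149280 / 2170390) / (165468 / 40488791)
    #              = 0.0688 / 0.0041 ~= 17X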
# NET FUGU (Fr1) BLASTZ (DONE, 2004-06-25, hartera)
    ssh kksilo
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
        ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
        ../n1/$n /dev/null
    end
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 97132544, utime 737 s/100, stime 115

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain
    # use the -noAr option - don't look for ancient repeats
    netClass -noAr noClass.net danRer1 fr1 fr1.net

    # Load the nets into the database:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain
    netFilter -minGap=10 fr1.net | hgLoadNet danRer1 netFr1 stdin

# EXTRACT AXT'S AND MAF'S FROM FUGU NET (DONE, 2004-06-25, hartera)
    ssh eieio
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain
    netSplit fr1.net fr1Net
    mkdir -p ../axtNet
    # make sorted axts from the net
    cat > axtNet.csh << 'EOF'
foreach f (fr1Net/chr*.net)
  set c = $f:t:r
  echo "axtNet on $c"
  netToAxt fr1Net/$c.net chain/$c.chain \
    /cluster/data/danRer1/nib /cluster/data/fr1/nib stdout \
    | axtSort stdin ../axtNet/$c.axt
  echo "Complete: $c.net -> $c.axt"
end
'EOF'
    csh axtNet.csh >&! axtNet.log &
    tail -100f axtNet.log

    ssh eieio
    cd /cluster/data/danRer1/bed/blastz.fr1
    cd axtNet
    mkdir ../mafNet
    cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
  set maf = $f:t:r.fr1.maf
  echo translating $f to $maf
  axtToMaf $f \
    /cluster/data/danRer1/chrom.sizes /cluster/data/fr1/chrom.sizes \
    ../mafNet/$maf -tPrefix=danRer1. -qPrefix=fr1.
end
'EOF'
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log

# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2004-07-01, hartera)
    mkdir /projects/compbio/data/microarray/affyZebrafish
    cd /projects/compbio/data/microarray/affyZebrafish
    # Download the Zebrafish genome array consensus sequences from
    # Affymetrix:
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=zebrafish
    unzip Zebrafish_consensus.zip
    sed -e 's/consensus://' Zebrafish_consensus \
      | sed -e 's/;/ /' > Zebrafish_consensus.fa
    # copy the sequences to a directory visible on kkr1u00
    cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
      /cluster/bluearc/affy/
    # Set up a cluster job to align the Zebrafish consensus sequences to
    # danRer1:
    ssh kkr1u00
    cd /cluster/data/danRer1/bed
    mkdir affyZebrafish.2004-06-30
    ln -s /cluster/data/danRer1/bed/affyZebrafish.2004-06-30 \
      /cluster/data/danRer1/bed/affyZebrafish
    cd affyZebrafish
    mkdir -p /iscratch/i/affy
    cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
    iSync
    ssh kk
    cd /cluster/data/danRer1/bed/affyZebrafish
    ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst
    ls -1 /iscratch/i/danRer1/trfFa/ > genome.lst
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/danRer1/11.ooc /iscratch/i/danRer1/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    para try, check, push ... etc.
    # para time
    # Completed: 310 of 310 jobs
    # CPU time in finished jobs:  14931s  248.85m  4.15h  0.17d  0.000 y
    # IO & Wait Time:              1028s   17.13m  0.29h  0.01d  0.000 y
    # Average job time:              51s    0.86m  0.01h  0.00d
    # Longest job:                   96s    1.60m  0.03h  0.00d
    # Submission to last job:       574s    9.57m  0.16h  0.01d
# do not use minCover since a lot of sequence is in Un, NA and Finished,
# so genes may be split up and it is good to see all alignments
pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.bak
mv affyZebrafish.psl.bak affyZebrafish.psl
pslCheck affyZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer1/bed/affyZebrafish
hgLoadPsl danRer1 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
    /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
    /gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=Zebrafish: danRer1 \
    /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/zebrafish/danRer1

# ADD CONTIGS TRACK, ctgPos2 (DONE, 2004-07-02, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from lift
ssh kksilo
cd /cluster/data/danRer1/bed
mkdir ctgPos2
cd ctgPos2
# make a .as file for ctgPos2, which is an extended ctgPos with
# a field for clone type. The field order must match the column order
# of the ctgPos2.tab file built below: name, size, chrom, chromStart,
# chromEnd, type.
cat << '_EOF_' > $HOME/kent/src/hg/lib/ctgPos2.as
table ctgPos2
"Where a contig is inside of a chromosome including contig type information."
    (
    string name;        "Name of contig"
    uint size;          "Size of contig"
    string chrom;       "Chromosome name"
    uint chromStart;    "Start in chromosome"
    uint chromEnd;      "End in chromosome"
    char[1] type;       "(W)GS contig, (F)inished, (P)redraft, (D)raft, (O)ther"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql ctgPos2.as ctgPos2
mv ctgPos2.h $HOME/kent/src/hg/inc
# do make to check it works and commit the .as, .sql, .c and .h files to CVS
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
           21 22 23 24 25)
    awk 'BEGIN {OFS="\t"} \
        {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
        /cluster/data/danRer1/$c/contigs/chr${c}.agp >> ctgPos2.tab
end
# need to print with chrom, chromStart, chromEnd first for liftUp to work
foreach c (ctg NA Finished)
    awk 'BEGIN {OFS="\t"} \
        {if (($1 ~ /^'"${c}"'[0-9]+$/) && $5 != "N" ) \
        print $1, $2-1, $3, $5, $6, $3-$2+1}' \
        /cluster/data/danRer1/Zv3.contigs.agp >> chrUns.tmp
end
# Use liftUp to change co-ordinates to those of virtual chroms
# make lift file from all 3 virtual chroms
foreach c (NA Un Finished)
    cat /cluster/data/danRer1/$c/chr${c}.lft >> chrAllUns.lft
end
liftUp -type=.bed chrUns.lifted chrAllUns.lft warn chrUns.tmp
# once we have lifted co-ordinates, parse them back into table format
awk 'BEGIN {OFS="\t"} {print $5, $6, $1, $2, $3, $4}' \
    chrUns.lifted > chrUns.tab
# add this to ctgPos2.tab
cat chrUns.tab >> ctgPos2.tab
ssh hgwdev
cd /cluster/data/danRer1/bed/ctgPos2
echo "drop table ctgPos2" | hgsql danRer1
hgsql danRer1 < ~/kent/src/hg/lib/ctgPos2.sql
echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
    | hgsql danRer1

# WZ EST ALIGNMENTS (DONE, 2004-07-09, hartera)
# WZ ESTs are compiled ESTs from WashU. They were provided by
# Anthony DiBiase from Leonard Zon's lab at the Children's Hospital, Harvard
# Contact: adibiase@enders.tch.harvard.edu
# http://zon.tchlab.org
ssh hgwdev
mkdir -p /cluster/data/danRer1/bed/ZonLab/wzESTs
cd /cluster/data/danRer1/bed/ZonLab/wzESTs
# put WZ ESTs in this directory as wzcontigs.txt -
# obtained from the Zon Lab, these are unmasked.
# There are 42857 ESTs in this file.
# Translate to upper case
tr '[a-z]' '[A-Z]' < wzcontigs.txt > wzcontigs.fa
ssh kkr1u00
mkdir -p /iscratch/i/danRer1/wzESTs
cd /cluster/data/danRer1/bed/ZonLab/wzESTs
faSplit sequence wzcontigs.fa 20 /iscratch/i/danRer1/wzESTs/wzcontigs
iSync
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/wzESTs
mkdir psl
ls -1 /iscratch/i/danRer1/wzESTs/*.fa > est.lst
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
# Redo without masking - masking is not used for native ESTs
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /iscratch/i/danRer1/11.ooc} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst est.lst gsub spec
para create spec
para try
para check
# para time
# Completed: 6200 of 6200 jobs
# CPU time in finished jobs: 50309s 838.48m 13.97h 0.58d 0.002 y
# IO & Wait Time: 19334s 322.23m 5.37h 0.22d 0.001 y
# Average job time: 11s 0.19m 0.00h 0.00d
# Submission to last job: 326s 5.43m 0.09h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create wz_ests.psl
pslSort dirs raw.psl tmp psl
# only use alignments that have at least
# 96% identity in the aligned region. use the parameters used by the
# automatic GenBank update for native ESTs
pslReps -minAli=0.96 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp wz_ests.psl /cluster/data/danRer1/jkStuff/liftAll.lft warn contig.psl
# Load EST alignments into database.
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/wzESTs
hgLoadPsl danRer1 wz_ests.psl
# Add WZ EST sequences
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/danRer1/wzESTs
ln -s \
    /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa \
    /gbdb/danRer1/wzESTs
hgLoadSeq danRer1 /gbdb/danRer1/wzESTs/wzcontigs.fa

# BACENDS (DONE, 2004-07-28, hartera)
# Added bacEndPairsBad table (DONE, 2004-08-02, hartera)
# EXTENDED bacCloneXRef TABLE TO INCLUDE MORE ALIASES, UNISTS ID AND CONTIG
# (DONE, 2005-04-25, hartera)
# RELOADED bacCloneXRef TABLE AFTER CORRECTING PROGRAM TO USE NULL
# IN ALL PLACES WHERE THERE IS NO UniSTS ID (DONE, 2005-04-27, hartera)
# ADDED SANGER STS NAME FIELD (sangerName) TO bacCloneAlias TABLE AND
# RELOADED DATA INCLUDING THIS NEW FIELD, ADDED SEARCHES FOR BAC CLONES
# ALIASES AND ASSOCIATED STS ALIASES (DONE, 2005-05-17, hartera)
# ADDED MORE SEARCHES, CORRECTED BUGS IN CREATING bacCloneXRef TABLE,
# ADDED MORE INDICES FOR bacCloneXRef AND REMOVED BAC INTERNAL NAMES
# FROM bacCloneAlias TABLE, MADE CORRECTED FILE OF SANGER AND UNISTS IDS
# AND RELOADED DATA FOR THESE TABLES (DONE, 2005-05-25, hartera)
# REMOVED INCORRECT ROW FROM bacCloneXRef TABLE (DONE, 2006-04-20, hartera)
# Provided by Anthony DiBiase, Yi Zhou and Leonard Zon at the Boston
# Children's Hospital. Collaborated with them on this track.
# Anthony DiBiase: adibiase@enders.tch.harvard.edu
# Yi Zhou: yzhou@enders.tch.harvard.edu
# BAC clone end sequences are from Robert Geisler's lab,
# Max Planck Institute for Developmental Biology, Tuebingen, Germany
ssh kksilo
cd /cluster/data/danRer1/bed/ZonLab
mkdir /cluster/data/danRer1/bed/ZonLab/bacends
# copy the BAC clone end sequence file to this directory
cp cloneEnd.txt ./bacends/bacends.fa
cd bacends
faSize bacends.fa
# 486978153 bases (39070196 N's 447907957 real) in 594614 sequences in
# 1 files Total size: mean 819.0 sd 230.3 min 0 (zKp108-H09.za)
# max 5403 (zC259G13.zb) median 796
# N count: mean 65.7 sd 154.7
mkdir /iscratch/i/danRer1/bacends
faSplit sequence bacends.fa 20 /iscratch/i/danRer1/bacends/bacends
ls -1S /iscratch/i/danRer1/bacends/*.fa > bacends.lst
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=/iscratch/i/danRer1/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
mkdir psl
gensub2 genome.lst bacends.lst template jobList
ssh kkr1u00
# iSync bacends to kilokluster
iSync
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/bacends
para create jobList
para try, check, push, check, ...
# para time
# Completed: 6200 of 6200 jobs
# CPU time in finished jobs: 4833892s 80564.87m 1342.75h 55.95d 0.153y
# IO & Wait Time: 31979s 532.98m 8.88h 0.37d 0.001y
# Average job time: 785s 13.08m 0.22h 0.01d
# Longest job: 2300s 38.33m 0.64h 0.03d
# Submission to last job: 14904s 248.40m 4.14h 0.17d
# back on kksilo, filter and lift results:
ssh kksilo
cd /cluster/data/danRer1/bed/ZonLab/bacends
pslCheck ./psl/*.psl >& pslCheck.log
# No problems reported by pslCheck
pslSort dirs raw.psl temp psl
# tried minCover=0.6 but only 55% of sequences were aligned;
# with minCover=0.4, 77% of sequences are aligned.
# No minCover aligns 90% of sequences, but with a large number
# of extra alignments - noise
pslReps -nearTop=0.02 -minCover=0.40 -minAli=0.85 -noIntrons raw.psl \
    bacEnds.psl /dev/null
liftUp bacEnds.lifted.psl /cluster/data/danRer1/jkStuff/liftAll.lft \
    warn bacEnds.psl
wc -l bacEnds.lifted.psl
# 3717153 bacEnds.lifted.psl
rm -r temp
# Need to add accession information and BAC end sequence pairs
mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
cd /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
grep ">" ../bacends.fa > bacEnds
# remove '>' at beginning of each line
perl -pi.bak -e 's/>//' bacEnds
# remove ^M char from end of lines
perl stripEndChar.pl < bacEnds > bacEnds.names
rm bacEnds bacEnds.bak
# Kerstin Jekosch at the Sanger Centre: kj2@sanger.ac.uk
# provided accession data for BAC end clones - zfish_accs.txt -
# and for BAC end pairs - BACEnd_accessions.txt - put these in bacends.1 dir
# write and use a perl script to split BAC ends into pairs and singles:
# bacEndPairs.txt and bacEndSingles.txt
# For pslPairs, the reverse primer (T7 or .z*) should be in the first column
# and the forward primer (SP6 or .y*) should be in the second column.
# There are several reads for some sequences and these have similar names.
# Reads for the same sequence should be in a comma separated list.
# Sequence read names can be translated to external clone names as found
# in NCBI's clone registry using the name without the suffix and the
# translation of prefixes as follows, after removal of dashes
# and extra zeros in the name:
# library          external prefix    internal prefix
# CHORI-211        CH211-             zC
# DanioKey         DKEY-              zK
# DanioKey Pilot   DKEYP-             zKp
# RZPD-71          RP71-              bZ
# BUSM1 (PAC)      BUSM1-             dZ
# e.g. zC001-A03.za becomes CH211-1A03
perl getBacEndInfo.pl bacEnds.names BACEnd_accessions.txt > bacs.log
# script not working so fix and zip again (2005-04-29, hartera)
gzip getBacEndInfo.pl
# 180280 bacEndPairs.txt - 571985 sequences
#  16020 bacEndSingles.txt - 22629 sequences
#  47048 bacEndAccs.aliases
# bacEndAccs.aliases contains sequence read names and their
# Genbank accessions.
# First process the BAC end alignments
mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/pairs
cd /cluster/data/danRer1/bed/ZonLab/bacends/pairs
set bacDir = /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
# these BAC clones vary in size from around 2800 bp to 626,000 bp
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
    -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
    -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndPairs.txt \
    all_bacends bacEnds
wc -l bacEnds.*
#    495 bacEnds.long
#  13046 bacEnds.mismatch
# 200235 bacEnds.orphan
#  72820 bacEnds.pairs
#      0 bacEnds.short
#    139 bacEnds.slop
# create header required by "rdb" tools
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
    > ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# make pairs bed file
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairs.bed
# also need to process bacEndSingles.txt into a database table.
# For singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as a dummy sequence pair. If the single is a forward
# sequence, put the dummy sequence in the second column; if the single is
# a reverse sequence, put it in the first column. Use a perl script
# (formatSingles.pl, run below) to do this; a sketch of the idea follows.
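# The actual formatSingles.pl is not reproduced in this doc. Purely as an
# illustration of the reformatting just described, an awk sketch might look
# like this - it assumes each line of bacEndSingles.txt is
# "<comma-separated reads><tab><clone name>"; the real script and its
# input layout may differ:
awk 'BEGIN {FS="\t"; OFS="\t"} { \
    if ($1 ~ /\.y/) print $1, "zJA11B12T7", $2; \
    else            print "zJA11B12T7", $1, $2; \
}' bacEndSingles.txt > bacEndSingles.sketch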
cd /cluster/data/danRer1/bed/ZonLab/bacends
set bacDir = /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
mkdir singles
cd singles
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
    $bacDir/bacEndSingles.format
# then run pslPairs on this formatted sequence
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
    -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
    -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndSingles.format \
    all_bacends bacEnds
wc -l bacEnds*
#     0 bacEnds.long
#     0 bacEnds.mismatch
# 14126 bacEnds.orphan
#     0 bacEnds.pairs
#     0 bacEnds.short
#     0 bacEnds.slop
# there are 14126 orphans here and 200235 from the pair analysis,
# so a total of 214361 orphans
cat bacEnds.orphan ../bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 214361 bacEnds.singles
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndSingles.bed
cp bacEndSingles.bed ../pairs
cd ../pairs
# all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
    bacEnds.orphan | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairsBad.bed
# add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans
# twice, so create a bed file, bacEndPairsBadNoOrphans.bed, without orphans
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
    | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairsBadNoOrphans.bed
extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \
    bacEndPairsBadNoOrphans.bed bacEndSingles.bed | \
    sorttbl tname tstart | headchg -del > bacEnds.load.psl
# load BAC end sequences into the seq table so alignments may be viewed
mkdir -p /gbdb/danRer1/bacends
ln -s /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa \
    /gbdb/danRer1/bacends/BACends.fa
hgLoadSeq danRer1 /gbdb/danRer1/bacends/BACends.fa
# when loaded into danRer1, Heather found there were rows where
# the alignments were the same but the lfNames were different. This is
# due to the presence of multiple reads for the same BAC end sequence.
# Sometimes they are slightly different lengths so the alignments are
# a little different. It would be good to consolidate all of these and
# just pick the best alignment. However, here only the identical rows
# were merged into one, with a list of all the lfNames with that alignment.
# load all alignments into a temporary database
ssh hgwdev
echo "create database bacs_rah;" | hgsql danRer1
cd /cluster/data/danRer1/bed/ZonLab/bacends/pairs
hgLoadBed bacs_rah bacEndPairs bacEndPairs.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 72272 elements of size 11
# create a bacEndSingles table like bacEndPairs
cp $HOME/kent/src/hg/lib/bacEndPairs.sql ../singles/bacEndSingles.sql
# edit to give correct table name
hgLoadBed bacs_rah bacEndSingles bacEndSingles.bed \
    -sqlTable=../singles/bacEndSingles.sql
# Loaded 203683 elements of size 11
# load all_bacends later
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed bacs_rah bacEndPairsBad bacEndPairsBad.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 203709 elements of size 11
# Need to consolidate similar rows for bacEndPairs and bacEndSingles - same
# name, different lfNames and same alignments (see the sketch below).
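# The consolidation below was done with removeReplicates.pl (script not
# included in this doc). As an illustration only, the core idea - collapse
# rows identical except for lfNames, joining the name lists - could be
# sketched in awk as follows; it assumes the 11-field bed used here with
# lfNames in the last column, and the real script also reconciled the
# lfCount/lfStarts/lfSizes fields (bedWithReps/bedNoReps are hypothetical
# file names):
sort bedWithReps.bed | awk 'BEGIN {FS="\t"; OFS="\t"} \
{ \
    key = $1 FS $2 FS $3 FS $4 FS $5 FS $6 FS $7 FS $8 FS $9 FS $10; \
    if (key == prev) { names = names "," $11 } \
    else { if (prev != "") print prev, names; prev = key; names = $11 } \
} \
END { if (prev != "") print prev, names }' > bedNoReps.bed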
mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/duplicates cd /cluster/data/danRer1/bed/ZonLab/bacends/duplicates hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndPairs order by name, chrom, chromStart;" bacs_rah > pairs.txt sort pairs.txt | uniq -c > pairs.txt.uniq wc -l pairs* # 72272 pairs.txt # 53992 pairs.txt.uniq # for replicate rows, find all the unique lfNames and put these # in one row with the relevant lfStarts, lfSizes and correct lfCount perl removeReplicates.pl pairs.txt.uniq bacEndPairs > pairsNoReps.bed # repeat for singles hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndSingles order by name, chrom, chromStart;" bacs_rah > singles.txt sort singles.txt | uniq -c > singles.txt.uniq wc -l singles* # 203683 singles.txt # 179170 singles.txt.uniq perl removeReplicates.pl singles.txt.uniq bacEndSingles \ > singlesNoReps.bed hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndPairsBad order by name, chrom, chromStart;" bacs_rah \ > badPairs.txt sort badPairs.txt | uniq -c > badPairs.txt.uniq wc -l badPairs* # 203709 badPairs.txt # 178304 badPairs.txt.uniq perl removeReplicates.pl badPairs.txt.uniq bacEndPairsBad \ > badPairsNoReps.bed # reload sequences into tables in danRer1 # bed files are already sorted by chrom and chromStart # so no need to re-sort ssh hgwdev cd /cluster/data/danRer1/bed/ZonLab/bacends/duplicates echo "drop table bacEndPairs;" | hgsql danRer1 echo "drop table bacEndPairsBad;" | hgsql danRer1 echo "drop table bacEndSingles;" | hgsql danRer1 echo "drop table all_bacends;" | hgsql danRer1 hgLoadBed danRer1 bacEndPairs pairsNoReps.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 53992 elements of size 11 hgLoadBed danRer1 bacEndPairsBad badPairsNoReps.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 178304 elements of size 11 # edit to give correct table name hgLoadBed danRer1 bacEndSingles singlesNoReps.bed \ -sqlTable=../singles/bacEndSingles.sql # Loaded 179170 elements of size 11 cd /cluster/data/danRer1/bed/ZonLab/bacends/pairs hgLoadPsl danRer1 -table=all_bacends bacEnds.load.psl # 2835212 record(s), 0 row(s) skipped, 706 warning(s) loading psl.tab # after merging rows with the same BAC name, the scoring is now # wrong in bacEndPairs, bacEndSingles and bacEndBadPairs. # Scores should be 1000 if there is 1 row for that name, else # 1500/number of rows for that sequence name - calculated by pslPairs. # Correct the scores. 
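# The rescoring rule is simple: 1000 for a name with one row, otherwise
# 1500 divided by the number of rows for that name. The fix below uses
# correctScores.pl (not included in this doc); as an illustration only,
# the same rule in awk - assuming the table dump has the name in column 4
# and the score in column 5, and that the .hits file holds "count name"
# lines from uniq -c:
awk 'NR == FNR {count[$2] = $1; next} \
    {OFS="\t"; $5 = (count[$4] == 1) ? 1000 : int(1500 / count[$4]); print}' \
    pairs.hits bacEndPairs.out > bacEndPairsGoodScores.sketch.bed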
ssh hgwdev
mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/scores
cd /cluster/data/danRer1/bed/ZonLab/bacends/scores
hgsql -N -e "select name from bacEndPairs;" danRer1 \
    | sort | uniq -c > pairs.hits
# download bacEndPairs table
hgsql -N -e "select * from bacEndPairs;" danRer1 > bacEndPairs.out
# use a perl script to change the scores from this file and write a new file
perl correctScores.pl bacEndPairs.out pairs.hits \
    > bacEndPairsGoodScores.bed
# same for singles
hgsql -N -e "select name from bacEndSingles;" danRer1 \
    | sort | uniq -c > singles.hits
# download bacEndSingles table
hgsql -N -e "select * from bacEndSingles;" danRer1 > bacEndSingles.out
perl correctScores.pl bacEndSingles.out singles.hits \
    > bacEndSinglesGoodScores.bed
# add bacEndPairsBad - (hartera, 2004-08-02)
# this was reloaded temporarily into bacs_rah
hgsql -N -e "select name from bacEndPairsBad;" bacs_rah \
    | sort | uniq -c > badPairs.hits
hgsql -N -e "select * from bacEndPairsBad;" bacs_rah > bacEndPairsBad.out
perl correctScores.pl bacEndPairsBad.out badPairs.hits \
    > bacEndPairsBadGoodScores.bed
# check scores are correct now
# reload BAC ends database tables
hgsql -e "drop table bacEndPairs;" danRer1
hgsql -e "drop table bacEndSingles;" danRer1
hgsql -e "drop table bacEndPairsBad;" danRer1
hgLoadBed danRer1 bacEndPairs bacEndPairsGoodScores.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 53992 elements of size 11
hgLoadBed danRer1 bacEndSingles bacEndSinglesGoodScores.bed \
    -sqlTable=../singles/bacEndSingles.sql
# Loaded 179170 elements of size 11
# 179170 record(s), 0 row(s) skipped, 23 warning(s) loading bed.tab
# featureBits danRer1 bacEndPairs gap
# 18271004 bases of 1459132082 (1.252%) in intersection
# featureBits danRer1 bacEndSingles gap
# 2718421 bases of 1459132082 (0.186%) in intersection
# featureBits danRer1 bacEndSingles bacEndPairs
# 158155304 bases of 1459132082 (10.839%) in intersection
# loaded bacEndPairsBad table (hartera, 2004-08-02)
hgLoadBed danRer1 bacEndPairsBad bacEndPairsBadGoodScores.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 178304 elements of size 11
# 178304 record(s), 0 row(s) skipped, 23 warning(s) loading bed.tab
# From getBacEndInfo.pl, created a bacEndAccs.aliases file.
# This has the format: alias, identification number, accession (trueName),
# where trueName is the accession provided by Sanger in
# BACEnd_accessions.txt and the aliases are the read names for sequences.
# Create a bacEndAlias table.
cat << '_EOF_' > $HOME/kent/src/hg/lib/bacEndAlias.as
table bacEndAlias
"BAC ends aliases and associated identification numbers"
    (
    string alias;    "BAC end read name"
    uint identNo;    "Identification number of BAC End"
    string acc;      "GenBank accession for the BAC End"
    )
'_EOF_'
# << this line makes emacs coloring happy
cd $HOME/kent/src/hg/lib
autoSql bacEndAlias.as bacEndAlias
cp bacEndAlias.h ../inc
# edit bacEndAlias.sql to add INDEX(identNo) to the indices
# add and commit to cvs bacEndAlias.as, .sql, .c and .h
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
echo "drop table bacEndAlias" | hgsql danRer1
hgsql danRer1 < $HOME/kent/src/hg/lib/bacEndAlias.sql
echo "load data local infile 'bacEndAccs.aliases' into table bacEndAlias"\
    | hgsql danRer1
# Add a xref table to give external clone registry names, internal names
# and Genbank accessions - this table is expanded later, see further down
# in the documentation.
cat << '_EOF_' > $HOME/kent/src/hg/lib/bacCloneXRef.as
table bacCloneXRef
"BAC clones external names, internal sequence names and Genbank accessions"
    (
    string extName;    "External name for BAC clone"
    string intName;    "Internal sequencing name for BAC clone"
    string acc;        "Genbank accession for the BAC End"
    )
'_EOF_'
# << this line makes emacs coloring happy
cd $HOME/kent/src/hg/lib
autoSql bacCloneXRef.as bacCloneXRef
cp bacCloneXRef.h ../inc
# edit bacCloneXRef.sql to add INDEX(extName) to the indices
# add and commit to cvs bacCloneXRef.as, .sql, .c and .h
cd /cluster/data/danRer1/bed/ZonLab/bacends
# use zfish_accs.txt provided by the Sanger Centre to create a list
# of BAC clone names, internal names and Genbank accessions
perl getBACInfo.pl zfish_accs.txt > bacCloneXRef.txt
# there are 35 clones with different prefixes to those in our track;
# these are in BACerror.log and are not loaded into the XRef table
echo "drop table bacCloneXRef" | hgsql danRer1
hgsql danRer1 < $HOME/kent/src/hg/lib/bacCloneXRef.sql
echo "load data local infile 'bacCloneXRef.txt' into table bacCloneXRef"\
    | hgsql danRer1
mv bacCloneXRef.txt ./bacends.1
# edit doLinkedFeaturesSeries in hgc.c to use this information
# so that links to clone and clone end accessions can be made on the
# details pages.

# ALIGNED BAC END SEQUENCES TO CHECK THAT ONES WITH SIMILAR NAMES
# ARE READS FROM THE SAME SEQUENCES.
# Also, the sequences ending in .z or .Z and .y or .Y were of unknown
# direction. Sanger thought that z may be forward, but aligning the
# sequences suggested that z is actually reverse, given that SP6
# is a forward sequence and T7 is reverse.
# Anthony DiBiase wrote a script, countns.pl, to filter out sequences with
# too many Ns; the threshold is 80% Ns
cat bacends.fa | perl countns.pl > bacends.percentNs80
grep '>' bacends.percentNs80 | sed -e 's/>//' > bacends.remove
perl removeNames.pl ./bacends.1/bacEnds.names bacends.remove \
    > bacends.noseqswithlotsNs
# get these good sequences with < 80% Ns
$HOME/bin/i386/faSomeRecords bacends.fa \
    bacends.noseqswithlotsNs bacends_good.fa
# blast
ssh kkr1u00
mkdir -p /iscratch/i/danRer1/bacends/clusterBlastdb
cp /cluster/data/danRer1/bed/ZonLab/bacends/bacends_good.fa \
    /iscratch/i/danRer1/bacends/clusterBlastdb
cd /iscratch/i/danRer1/bacends/clusterBlastdb
# do all against all blast
/scratch/blast/formatdb -i bacends_good.fa -t bacendsgood \
    -n bacendsgood -p F
mkdir -p /iscratch/i/danRer1/bacends/clusterBlast
faSplit sequence /cluster/data/danRer1/bed/ZonLab/bacends/bacends_good.fa \
    500 /iscratch/i/danRer1/bacends/clusterBlast/bacendsgood
iSync
# do kilokluster run
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/bacends
mkdir clusterBlast
cd clusterBlast
mkdir blastout
ls -1S /iscratch/i/danRer1/bacends/clusterBlast/*.fa > bacs.lst
ls -1S /iscratch/i/danRer1/bacends/clusterBlastdb/*.nsq \
    | sed "s/.nsq//" > bacsBlastdb.lst
# Make blast script
cat > blastSome << end
#!/bin/csh
setenv BLASTMAT /scratch/blast/data
/scratch/blast/blastall -p blastn -d \$1 -i \$2 -o \$3 -e 0.0
end
# << this line keeps emacs coloring happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > blastGsub
#LOOP
blastSome {check in line+ $(path1)} {check in line+ $(path2)} {check out line blastout/$(root2)_$(root1).out}
#ENDLOOP
'_EOF_'
gensub2 bacsBlastdb.lst bacs.lst blastGsub blastSpec
para create blastSpec
para try, check, push, ...
# convert the blast output to psl
mkdir psl
foreach f (blastout/*.out)
    /cluster/bin/i386/blastToPsl $f psl/$f:t:r.psl >>& blastToPsl.log
end
pslSort dirs raw.psl tmp psl
mv raw.psl clusterBacEnds.psl
awk '{print $10, $14, $9;}' clusterBacEnds.psl > clusterBacEnds.out
# look at blast results for e-value threshold of 0

# ADD ALIASES FOR BAC ENDS (DONE, 2005-04-25, hartera)
# 2005-04-25 (hartera) add more aliases for BAC ends and the UniSTS ID;
# add this to the bacCloneXRef table.
# Changes to bacAlias table and alias searches (DONE, 2005-05-17, hartera) # download the files of aliases from Sanger. URL for ftp site from # Sean Humphray (sjh@sanger.ac.uk) ssh kksilo mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/clonemarkers.02.12.04.txt wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/ctgnames.02.12.04.txt wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/markers.02.12.04.txt ssh hgwdev cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases hgsql -N -e 'select name from bacEndPairs;' danRer1 > bacEnds.names hgsql -N -e 'select name from bacEndSingles;' danRer1 >> bacEnds.names hgsql -N -e 'select name from bacEndPairsBad;' danRer1 >> bacEnds.names sort bacEnds.names | uniq > bacEnds.names.uniq # 172990 unique names # 594614 bacEnd sequences to start with # some of the bacEnd sequences are replicates so count lfNames from table hgsql -N -e 'select lfNames from bacEndPairs;' danRer1 > bacEnds.lfNames hgsql -N -e 'select lfNames from bacEndSingles;' danRer1 >> bacEnds.lfNames hgsql -N -e 'select lfNames from bacEndPairsBad;' danRer1 >> bacEnds.lfNames # need to remove "," and put one name per line perl -pi.bak -e 's/,/\n/g' bacEnds.lfNames sort bacEnds.lfNames | uniq > bacEnds.lfNames.uniq # 337751 bacEnds.lfNames.uniq # pslCheck has 2736 bad alignments # from psl file awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names # remove first few lines with no names sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq # 457560 bacEndsPsl.names.uniq # about 77% align - this is after filtering # Add an alias table for BAC clones # Replaced the name field with a Sanger Name field so that aliases for # that STS may be printed out in a table on the BAC Ends tracks # description pages (2005-05-04, hartera) cat << '_EOF_' > $HOME/kent/src/hg/lib/bacCloneAlias.as table bacCloneAlias "BAC clone associated STS aliases and Sanger STS names" ( string alias; "STS aliases associated with BAC clones" string sangerName; "Sanger STS name" ) '_EOF_' # << this line makes emacs coloring happy cd $HOME/kent/src/hg/lib autoSql bacCloneAlias.as bacCloneAlias cp bacCloneAlias.h ../inc # edit bacCloneAlias.sql to add INDEX(alias) to indices # edit bacCloneAlias.sql to add INDEX(sangerName) to indices # remove primary key, can not make a compound primary key by # adding sangerName column since not all the BAC clones have this # and it is not unique, more than one BAC clone can have the same # Sanger STS name. 
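# Once the bacCloneXRef table (defined next) is loaded, this one-to-many
# relationship can be checked directly; e.g. (illustration only) to list
# Sanger STS names shared by several clones:
hgsql -N danRer1 -e \
    'select sangerName, count(*) as n from bacCloneXRef where sangerName is not null group by sangerName having n > 1 order by n desc limit 10;'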
# Add a xref table to give external clone registry names, internal names,
# sanger name, the relationship between STS and BAC clone (method of
# finding the STS), UniSTS ID, chromosome(s) to which the BAC clone is
# mapped by BLAT, Genbank accession and STS primer sequences
cat << '_EOF_' > $HOME/kent/src/hg/lib/bacCloneXRef.as
table bacCloneXRef
"BAC clones and corresponding STS information"
    (
    string name;         "External name for BAC clone"
    string intName;      "Internal Sanger FPC name"
    string chroms;       "Chromosome(s) to which one or both BAC ends are mapped by BLAT"
    string genbank;      "Genbank accession for the BAC Clone"
    string sangerName;   "Sanger STS name"
    uint relationship;   "Relationship type - method of finding STS"
    string uniStsId;     "UniSTS ID(s) for STS"
    string leftPrimer;   "STS 5' primer sequence"
    string rightPrimer;  "STS 3' primer sequence"
    )
'_EOF_'
# << this line makes emacs coloring happy
cd $HOME/kent/src/hg/lib
autoSql bacCloneXRef.as bacCloneXRef
cp bacCloneXRef.h ../inc
# edit bacCloneXRef.sql to add INDEX(name) and remove the primary key.
# Add indices: INDEX(intName), INDEX(sangerName) for searches (2005-05-20)
# edit so that all fields except the relationship field can be null
# add and commit to cvs bacCloneXRef.as, .sql, .c and .h
ssh kksilo
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
wc -l *.02.12.04.txt
#  28305 clonemarkers.02.12.04.txt
# 167441 ctgnames.02.12.04.txt
#  12008 markers.02.12.04.txt
cp ../getBACInfo.pl getBACInfo2.pl
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
# create a list of the BAC clones in the tables with the chromosome each
# aligns to
hgsql -N -e 'select name, chrom from bacEndPairs;' danRer1 \
    > bacClones.namesandchrom
hgsql -N -e 'select name, chrom from bacEndSingles;' danRer1 \
    >> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
# use zfish_accs.txt provided by the Sanger Centre to create a list
# of BAC clone names, internal names and Genbank accessions
# extend getBACInfo.pl to add the extended information (getBACInfo2.pl)
perl getBACInfo2.pl zfish_accs.txt > bacCloneXRef.txt
# write a program to create the bacCloneAlias.tab and bacCloneXRef.tab
# files to load into the tables - zfishBacEndsandSts.c
# in $HOME/kent/src/hg/zfishBacEndsandSts
ssh kksilo
# use ssh kkstore01 to replace kksilo (2005-05-04)
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
# get a list of UniSTS IDs, using the aliases to search the alias file;
# print Sanger name, alias and UniSTS ID. use print_markers2.pl, a
# modified version of a program written by Tony DiBiase (Boston
# Children's Hospital)
# create find_markers3.pl, as the previous version was finding too
# many UniSTS IDs for some markers since it used grep (2005-05-23, hartera)
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile, keyed by alias
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>) {
    chomp;
    ($id, $al) = split /\t/;
    @alsArray = split(/;/, $al);
    foreach my $as (@alsArray) {
        push (@{$aliasHash{$as}}, $id);
    }
}
close ALIAS;
while (<MARKERS>) {
    my @idArray;
    ($f, $t, $m, $idArray[0]) = (0, 0, 0, 0);
    my @ids;
    chomp;
    ($a, $b, $aliases, @rest) = split /\|/;
    if ($verbose > 3) {
        printf "aliases $aliases \n";
    }
    @alias = split /;/, $aliases;
    ALIAS: foreach $s (@alias) {
        # only look up aliases that are not purely numeric
        if ($s =~ /[\D]+/) {
            if ($verbose > 5) {
                printf "this $s \n";
            }
            if (exists($aliasHash{$s})) {
                @idArray = @{$aliasHash{$s}};
            }
            if ($idArray[0]) {
                $f = 1;
                $t = $s;
                @ids = @idArray;
                if ($verbose) {
                    printf "this $s found $m \n";
                }
                last ALIAS;
            }
        }
    }
    if ($f) {
        my @sNames = split(/;/, $b);
        foreach my $sn (@sNames) {
            foreach my $i (@ids) {
                printf "$sn\t$i\n";
            }
        }
    }
}
close MARKERS;
'_EOF_'
chmod +x find_markers3.pl
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases \
    markers.02.12.04.txt > sangerandUniSTSId.txt
wc -l sangerandUniSTSId.txt
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. the FPC contig number) from the ctgnames
# file is not included in the tables, as these are dynamic and constantly
# changing with the assembly.
# made code changes so that zfishBacClonesandSts always uses NULL
# in the bacCloneXRef table if there is no UniSTS ID, and then
# reloaded the bacCloneXRef table with the resulting tab file
# (hartera, 2005-04-27)
# edit zfishBacClonesandSts to have the aliases and Sanger STS name
# in the bacCloneXRef.tab output file and re-run the program.
# this means that aliases may be listed by sangerName on the
# details page, and searches for aliases can still be made using
# a join to the XRef table to get the external name to find the
# position in the relevant bacEnds table.
# (hartera, 2005-05-12)
# Added another argument so that the output directory must be specified,
# and fixed a bug since not all the BAC clone names for each Sanger STS
# name were being stored. Changed zfishBacClonesandSts.c so that the
# internal (Sanger) BAC clone names are not added to the alias table.
# These can be found in the bacCloneXRef table: there are only
# 12987 internal BAC clone names that have Sanger names, and over
# 200,000 more internal names in the bacCloneXRef table. Re-ran the
# program and re-loaded the tables.
# Also use the updated version of the accessions file for BAC clones.
# (hartera, 2005-05-20)
# Received a new version of the zfish_accs.txt file:
# zfish_accs050605.txt (from Mario Caccamo at Sanger); move this to
# /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
# Created a merged file of accessions from this file and zfish_accs.txt -
# see makeDanRer1.doc; the new version is zfish_accsMerged.txt
# Re-ran zfishBacClonesandSts after increasing NUMALIASES, used for setting
# an array size, as not all the aliases were being stored. Also, the
# Sanger name and UniSTS ID file was re-created, as it had incorrect
# UniSTS IDs for some Sanger names (as many as 200), and a bug was fixed
# where the correct extName was not being added to the extNameHash in one
# case. (hartera, 2005-05-24)
set dir=/cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
cp $dir/zfish_accsMerged.txt .
mkdir -p /cluster/bluearc/danRer1/bacEnds/out
nice $HOME/bin/i386/zfishBacClonesandSts ctgnames.02.12.04.txt \
    clonemarkers.02.12.04.txt markers.02.12.04.txt \
    bacClones.namesandchrom.uniq zfish_accsMerged.txt \
    sangerandUniSTSId.txt /cluster/bluearc/danRer1/bacEnds/out \
    > zfishBacs.out
# output is in /cluster/bluearc/danRer1/bacEnds/out so copy it over
# sort alias file by sangerName
sort -k2 /cluster/bluearc/danRer1/bacEnds/out/bacAlias.tab \
    > bacAlias.sort.tab
cp /cluster/bluearc/danRer1/bacEnds/out/bacXRef.tab .
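# After the tables are (re)loaded below, an STS alias can be resolved to
# BAC clone positions with a join along these lines (illustration only;
# "Z1396" is a made-up alias):
hgsql -N danRer1 -e \
    'select p.chrom, p.chromStart, p.chromEnd, x.name from bacCloneAlias a, bacCloneXRef x, bacEndPairs p where a.alias = "Z1396" and x.sangerName = a.sangerName and p.name = x.name;'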
wc -l *.tab
#  55836 bacAlias.sort.tab
# 234056 bacXRef.tab
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
# Remove the bacCloneAlias table and add a new table with a sangerName
# field instead of the external name field.
# Re-load the data including the Sanger STS name for the associated STS
# markers. (2005-05-05, hartera)
# Re-load the data after the bug fix, using the new BAC clone accessions,
# without adding internal (Sanger) BAC clone names to the alias table, and
# using indices for name, intName and sangerName for searches.
# (2005-05-20, hartera)
# Reload the tables after using the new version of zfishBacClonesandSts
# with the corrected mapping of Sanger names to UniSTS IDs and the fix for
# the case where an incorrect extName was being stored for an internal
# name. (hartera, 2005-05-24)
hgsql danRer1 -e 'drop table bacCloneXRef'
hgsql danRer1 -e 'drop table bacCloneAlias'
hgsql danRer1 < $HOME/kent/src/hg/lib/bacCloneAlias.sql
hgsql danRer1 < $HOME/kent/src/hg/lib/bacCloneXRef.sql
nice hgsql danRer1 -e \
    'load data local infile "bacAlias.sort.tab" into table bacCloneAlias'
nice hgsql danRer1 -e \
    'load data local infile "bacXRef.tab" into table bacCloneXRef'
# checked the data in the tables to verify that everything was loaded
# correctly from the files; errors found were corrected - see the TEST
# section below (DONE, 2005-05-24, hartera)
# Remove incorrect row from bacCloneXRef table (hartera, 2006-04-19)
# There is one row where the name is "CH211-155M11__" and the
# intName is "zC155M11__", which is incorrect.
# There is a correct row for intName zC155M11 and zC155M11__, and
# CH211-155M11__ is not in the bacEnd{Singles,Pairs,PairsBad} tables and
# zC155M11__ is not an alias in bacEndAlias.
hgsql -e 'delete from bacCloneXRef where name = "CH211-155M11__";' danRer1
cd $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1
# edit trackDb.ra to add searches for the alias table
# added dontCheckXrefQueryFormat so that a table join can now be performed
# (2005-05-17, hartera)
# corrected termRegex for some bacCloneXRef searches so that they work
# correctly (bacPairsSangerSts and bacSinglesSangerSts)
# (2006-04-19, hartera)
# searchName bacPairsAliases
# searchTable bacEndPairs
# searchMethod exact
# searchType bed
# searchBoth 1
# xrefTable bacCloneAlias
# dontCheckXrefQueryFormat .
# xrefQuery select bacCloneXRef.name,alias from %s as bca, bacCloneXRef where alias like '%s' and bacCloneXRef.sangerName = bca.sangerName
# searchPriority 9

# searchName bacSinglesAliases
# searchTable bacEndSingles
# searchMethod exact
# searchType bed
# searchBoth 1
# xrefTable bacCloneAlias
# dontCheckXrefQueryFormat .
# xrefQuery select bacCloneXRef.name,alias from %s as bca, bacCloneXRef where alias like '%s' and bacCloneXRef.sangerName = bca.sangerName
# searchPriority 9

# add new searches to search the added sangerName field of the
# bacCloneXRef table (2005-05-17, hartera)
# searchName bacPairsSangerSts
# searchTable bacEndPairs
# searchMethod exact
# searchType bed
# xrefTable bacCloneXRef
# xrefQuery select name, sangerName from %s where sangerName like '%s'
# termRegex (et|st)ID[0-9]+\.[0-9]+
# searchPriority 10

# searchName bacSinglesSangerSts
# searchTable bacEndSingles
# searchMethod exact
# searchType bed
# xrefTable bacCloneXRef
# xrefQuery select name, sangerName from %s where sangerName like '%s'
# termRegex (et|st)ID[0-9]+\.[0-9]+
# searchPriority 9
# commit trackDb.ra to CVS
# make code changes to $HOME/kent/src/hg/hgc/hgc.c to display some of the
# alias and STS information from the bacCloneXRef and bacCloneAlias tables
# on the BAC ends tracks details pages. Update hgc.c to show the aliases
# associated with each Sanger STS name listed for a BAC clone rather than
# just a list of all aliases associated with that BAC clone
# (2005-05-17, hartera).
# add searches for the internal (Sanger) BAC clone name (2005-05-20, hartera):
# searchName bacPairsIntName
# searchTable bacEndPairs
# searchMethod exact
# searchType bed
# xrefTable bacCloneXRef
# xrefQuery select name,intName from %s where intName like '%s'
# termRegex (zC|zK|bZ|dZ|bY)[[:alnum:]]+
# searchPriority 8.5

# searchName bacSinglesIntName
# searchTable bacEndSingles
# searchMethod exact
# searchType bed
# xrefTable bacCloneXRef
# xrefQuery select name,intName from %s where intName like '%s'
# termRegex (zC|zK|bZ|dZ|bY)[[:alnum:]]+
# searchPriority 8.5

# BACENDS: TESTING FOR bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2005-05-24, hartera)
# ADDED TEST TO CHECK SANGER NAMES IN ALIAS AND XREF TABLES
# (DONE, 2005-05-26, hartera)
# The following tests were carried out to check that all the data
# in the bacCloneAlias and bacCloneXRef tables is correct.
ssh hgwdev
mkdir -p \
    /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases/testTables
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases/testTables
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
    ../markers.02.12.04.txt > sNameandaliases
# write a script to get one Sanger name and one alias on each line
chmod +x getSangerAndAlias.pl
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer1 \
    | sort | uniq > alias.db.sort
wc -l alias.db.sort
# 55836 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# some differences
awk '{print $1;}' sNameandaliases.sort | sort | uniq -c | sort -nr \
    > sNameandaliases.sNameCount
# the most is 216:
# 216 etID22623.3
# 154 stID49848.11
#  61 etID49717.20
#  58 etID35364.24
#  39 etID45086.3
# extend NUMALIASES from 50 to 250 in zfishBacClonesandSts.c - not all the
# aliases for some Sanger names were being stored.
# re-run the program, reload the tables and test again - now ok.
# Check that the Sanger STS names correspond in the bacCloneAlias and
# bacCloneXRef tables (2005-05-26, hartera)
# get Sanger names from the alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer1 \
    | sort | uniq > sName.alias.sort
wc -l sName.alias.sort
# 14940 sName.alias.sort
# get Sanger names from the xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
    is not null;' danRer1 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
# 15153 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing is unique to the alias table, so all Sanger names in the alias
# table are also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
# 213 sNamexRefNotAlias - 213 sangerNames in the xRef table that are not
# in the alias table
# get Sanger names from the clonemarkers file
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.02.12.04.txt | sort | uniq \
    > clonemarkers.sNames.sort
# get Sanger names from the markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.02.12.04.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to the markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
    > sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
# 213 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# no difference, so all the extra Sanger names in the xRef table are
# from the clonemarkers file; these have no aliases in the markers file,
# so they are not in the alias table, and this is all ok.
# Check that Sanger STS names and primers are associated correctly
# get sanger names and primers from the markers file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
    ../markers.02.12.04.txt > sNameandPrimers
# write a script to reformat and write one Sanger name per line
chmod +x getSangerandPrimers.pl
perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
sort sNameandPrimers.format > sNameandPrimers.format.sort
wc -l sNameandPrim*
# 12008 sNameandPrimers
# 14940 sNameandPrimers.format
# 14940 sNameandPrimers.format.sort
# get Sanger names and primers from the database
hgsql -N -e \
    'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
    where sangerName is not null and leftPrimer is not null and \
    rightPrimer is not null;' danRer1 | sort | uniq \
    > sNamesandprimers.fromdb.sort
wc -l sNamesandprimers.fromdb.sort
# 14940 sNamesandprimers.fromdb.sort
diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
# No difference so ok.
# Check that UniSTS IDs and Sanger names are associated correctly
# get Sanger names and UniSTS IDs from the database
hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
    uniStsId is not null;' danRer1 | sort | uniq > sNameUniSTS.fromdb.sort
wc -l sNameUniSTS.fromdb.sort
# 5456 sNameUniSTS.fromdb.sort
# Need to reformat sNameUniSTS.fromdb.sort
chmod +x formatUniSts.pl
perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
    > sNameUniSTS.fromdb.format.sort
# sort the file of Sanger names and UniSTS IDs generated by find_markers2.pl
sort ../sangerandUniSTSId.format > sangerandUniSTSId.format.sort
diff sangerandUniSTSId.format.sort sNameUniSTS.fromdb.format.sort \
    > sangerandUniSTSIdvsdb
# some differences were found in the original file.
# See how many UniSTS IDs there are for each Sanger name
awk '{print $1}' ../sangerandUniSTSId.format | sort | uniq -c | sort -nr \
    > sangerandUniSTSId.count
# 200 etID309730.9
#  84 etID138038.17
#  11 etID315128.1
#  11 etID315081.10
#  10 etID309656.22
#   9 etID22068.24
#   8 etID309667.22
# this is not right: the UniSTS IDs found for etID309730.9 are not correct.
# need to change the program that finds the UniSTS IDs for the Sanger names.
# rewrote find_markers3.pl and recreated this file as sangerandUniSTSId.txt
awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
    > sangerandUniSTSId.count2
# the most is now 3 UniSTS IDs:
# 3 etID9056.23
# 3 etID9042.2
# 3 etID8627.2
# 3 etID8281.9
# 3 etID11096.5
# 2 etID9986.14
# 2 etID9968.17
# re-check as above after reloading the data in the database - all ok now
sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort
diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \
    > sangerandUniSTSIdvsdb
# Check that the chrom mappings and external BAC clone names are correct
# get extNames and the chroms they map to from the database
hgsql -N -e 'select name, chroms from bacCloneXRef where \
    chroms is not null;' danRer1 | sort | uniq \
    > nameandchromsfromdb.sort
# reformat nameandchromsfromdb.sort
perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
    > nameandchromsfromdb.format.sort
# compare extNames and chroms from the db to those in the data file
diff ../bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
# no difference - all ok
# Check Genbank accessions and internal BAC clone names
hgsql -N -e 'select intName,genbank from bacCloneXRef where \
    genbank is not null;' danRer1 | sort | uniq \
    > intNamesandAccs.fromdb.sort
# this should be a subset of zfish_accsMerged.txt - not all BAC clones
# listed here appear in either our BAC ends tracks or the markers files.
sort ../zfish_accsMerged.txt > zfish_accsMerged.sort
comm -23 intNamesandAccs.fromdb.sort zfish_accsMerged.sort
# zC212E7 BX324109
# this accession is for zC212E4
comm -13 intNamesandAccs.fromdb.sort zfish_accsMerged.sort > onlyinzfishAccs
wc -l onlyinzfishAccs
# 88 onlyinzfishAccs
# zC212E7
# extNameHash had no entry for zC212E4, and when queried with zC212E7
# it returned CH211-212E4 as the extName, which is incorrect.
# fixed the bug in zfishBacClonesandSts.c, re-ran the program and reloaded
# the tables. re-checked as above: now only 87 are in onlyinzfishAccs, and
# all the internal names in the database are found in zfish_accsMerged.txt
# get all intNames that have NULL as the accession and check that none of
# these are in the zfish_accsMerged.txt file
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
    danRer1 | sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' zfish_accsMerged.sort | sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
    > indbNoAccsandAccs.out
# no names are common to both, so all accessions from
# zfish_accsMerged.txt are in the database for the internal names stored,
# where accessions are available.
# Test that Sanger STS names, internal names and external names are all
# correct.
# Test that Sanger STS names and internal BAC clone names are associated
# correctly
# get internal names and Sanger names from the data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNameandSanger.sort
# get internal names and Sanger names from the database
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
    where sangerName is not null;' danRer1 \
    | sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# no difference so ok
# Check the BAC clone internal name and relationship fields
# get internal names and relationships from the data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNameandRelation.sort
# get internal names and relationships from the database
hgsql -N -e 'select intName, relationship from bacCloneXRef \
    where relationship != 0;' danRer1 \
    | sort | uniq > intNameandrelation.fromdb.sort
# differences unique to the database
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.indbonly
# differences unique to the data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.incloneMarkersonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
    > intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names: the names match, and the
# only place these should differ is the second column, which should all
# be 3 in the data from the database only. This is because all the
# relationship entries that were blank in the clonemarkers file were
# changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct, so all ok
# all the differences should be that those that are blank in clonemarkers
# are 3 in the database
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from clonemarkers
# select the internal names with 0 in the relationship field from the db
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
    danRer1 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files, as expected, so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the # ctgnames file are in the database # get intName and extName from ctgnames file awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.02.12.04.txt \ | sort | uniq > intNameandextNamefromCtgNames.sort # get intName and extName from database hgsql -N -e 'select intName,name from bacCloneXRef;' danRer1 \ | sort | uniq > intNameandextName.fromdb.sort wc -l intNameandextName* # 218738 intNameandextName.fromdb.sort # 167441 intNameandextNamefromCtgNames.sort comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \ > intandextindbAndCtgNames wc -l intandextindbAndCtgNames # 167441 intandextindbAndCtgNames # there are 167441 name pairs common between the file and the database # and this is the same number of name pairs as in the data file diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort # no difference between those name pairs from the data file and those that # are common between the data file and the database so all internal and # external names from ctgNames file are in the database # get the list of extra ones from db comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \ > intandextNamesindbNotinCtgNames wc -l intandextNamesindbNotinCtgNames # 51297 intandextNamesindbNotinCtgNames # get list of internal names from the clonemarkers file awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt | sort | uniq \ > clonemarkers.intName.sort wc -l clonemarkers.intName.sort # 12987 clonemarkers.intName.sort # compare these intNames to those from the database not in the ctgnames file comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames # none of these clone markers internal names are in this list so they # must all be in the ctgnames file too. These extra internal names will be # translations of external names found in the list of mappings of BAC clones # to chroms. # Check that all the BAC clone external names from the list of chromosome # mappings and from the ctgnames file are in the database. 
# get all extNames from bacClones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
    extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.02.12.04.txt \
    >> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
# 386727 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
    > extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
# 218738 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer1 | sort | uniq \
    > extNames.fromdb.sort
wc -l extNames.fromdb.sort
# 218738 extNames.fromdb.sort
# find the extNames common to the data files and the database
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
    > extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
# 218738 extNames.fromdbandfiles
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from the files are in the db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from the ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.02.12.04.txt \
    > intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.02.12.04.txt \
    >> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
# 195746 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
    > intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
# 167441 intNames.ctgnamesandclonemarkers.sort
# get internal names from the database
hgsql -N -e 'select intName from bacCloneXRef;' danRer1 | sort | uniq \
    > intNames.fromdb.sort
wc -l intNames.fromdb.sort
# 218738 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames;
# all of the intNames from the files should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
    > intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
# 167441 intNames.fromdbandfiles
diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from the files are in the db
# Check that all translations between BAC clone external and internal
# names are correct.
# write a script to get the prefixes from internal and external names
chmod +x getNamePrefixes.pl
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer1 \
    | sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
    > extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good:
# BUSM1  dZ
# CH211  zC
# CH211  zc
# CH73   zH
# CT7    bP
# DKEY   zK
# DKEY   zk
# DKEYP  zKp
# RP71   bZ
# XX     bY
# zk is an internal name prefix for the external name prefix DKEY-. There
# is only one example where this is used (DKEY-81G7); it is in the
# ctgnames file and in the bacCloneXRef table, so that is ok.
# All data looks good in these tables now.
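# As an illustration of the internal-to-external translation checked above
# (not the code actually used - that logic lives in the perl scripts and
# zfishBacClonesandSts.c): drop the read suffix and dashes, strip the zeros
# immediately after the prefix, and map the prefix:
echo 'zC001-A03.za' \
    | sed -e 's/\..*$//' -e 's/-//g' \
          -e 's/^zKp0*/DKEYP-/' -e 's/^zK0*/DKEY-/' -e 's/^zC0*/CH211-/' \
          -e 's/^bZ0*/RP71-/' -e 's/^dZ0*/BUSM1-/'
# CH211-1A03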
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DANRER1 # (DONE, 2004-07-13, hartera) ssh hgwdev # DNA port is "0", trans prot port is "1" echo 'insert into blatServers values("danRer1", "blat8", "17779", "0", "0"); \ insert into blatServers values("danRer1", "blat8", "17778", "1", "0");' \ | hgsql -h genome-testdb hgcentraltest # if you need to delete those entries echo 'delete from blatServers where db="danRer1";' \ | hgsql -h genome-testdb hgcentraltest # to check the entries: echo 'select * from blatServers where db="danRer1";' \ | hgsql -h genome-testdb hgcentraltest # Remove Blat servers entries for danRer1 from blatServers table in # hgcentralbeta and hgcentral (see section on ARCHIVING danRer1) # (2006-05-02, hartera) # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-07-20, hartera) # CORRECTION MADE TO chrM.agp FILE - MADE TAB DELIMITED INSTEAD OF # SPACE DELIMITED SO REPLACE OLD AGP FILE IN DOWNLOADS # (DONE, 2005-04-25, hartera) # UPDATED README FILES WITH NEW SEQUENCE FTP FOR SANGER (2005-08-04,hartera) ssh kksilo cd /cluster/data/danRer1 #- Build the .zip files cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp # chrom RepeatMasker out files zip -j zip/chromOut.zip */chr*.fa.out # soft masked chrom fasta zip -j zip/chromFa.zip */chr*.fa # hard masked chrom fasta zip -j zip/chromFaMasked.zip */chr*.fa.masked # chrom TRF output files cd bed/simpleRepeat zip ../../zip/chromTrf.zip trfMask/chr*.bed cd ../.. # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=danRer1 -native GenBank mrna \ /cluster/data/danRer1/zip/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=danRer1 -xeno GenBank mrna \ /cluster/data/danRer1/zip/xenoMrna.fa # get native RefSeq mRNAs ./bin/i386/gbGetSeqs -db=danRer1 -native refseq mrna \ /cluster/data/danRer1/zip/refMrna.fa # get native GenBank ESTs ./bin/i386/gbGetSeqs -db=danRer1 -native GenBank est \ /cluster/data/danRer1/zip/est.fa cd /cluster/data/danRer1/zip # zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs zip -j mrna.zip mrna.fa zip -j xenoMrna.zip xenoMrna.fa zip -j refMrna.zip refMrna.fa zip -j est.zip est.fa '_EOF_' # << this line makes emacs coloring happy csh ./jkStuff/zipAll.csh |& tee ./jkStuff/zipAll.log cd zip #- Look at zipAll.log to make sure all file lists look reasonable. # Make upstream files and Copy the .zip files to # hgwdev:/usr/local/apache/... 
ssh hgwdev cd /cluster/data/danRer1/zip # make upstream files for zebrafish RefSeq featureBits danRer1 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa featureBits danRer1 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips mkdir -p $gp/chromosomes foreach f (../*/chr*.fa) zip -j $gp/chromosomes/$f:t.zip $f end cd $gp/bigZips md5sum *.zip > md5sum.txt cd $gp/chromosomes md5sum *.zip > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Remake just the chromAgp.zip with correct chrM.agp (2005-04-25,hartera) ssh kksilo cd /cluster/data/danRer1 rm /cluster/data/danRer1/zip/chromAgp.zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp ssh hgwdev cd /cluster/data/danRer1/zip #- Check zip file integrity: foreach f (chromAgp.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l chromAgp.zip.test # looks good set gp = /usr/local/apache/htdocs/goldenPath/danRer1 rm $gp/bigZips/chromAgp.zip rm $gp/bigZips/md5sum.txt cp -p chromAgp.zip $gp/bigZips # go to directory with zip files and remake the md5sum.txt file cd $gp/bigZips md5sum *.zip > md5sum.txt # updated bigZips and chromosomes README.txt files with new ftp site # for the assembly sequence at Sanger (2005-08-04, hartera): # now ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/ # MAKE VSHG17 DOWNLOADABLES (DONE, 2004-07-20, hartera) ssh hgwdev cd /cluster/data/danRer1/bed/blastz.hg17/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/vsHg17/axtChrom cp -p *.axt $gp/vsHg17/axtChrom cd $gp/vsHg17/axtChrom gzip *.axt md5sum *.gz > md5sum.txt cd /cluster/data/danRer1/bed/blastz.hg17/axtChain zip -j /cluster/data/danRer1/zip/hg17.chain.zip hg17.chain rm hg17.chain cp humanhg17.net hg17.net zip -j /cluster/data/danRer1/zip/hg17.net.zip hg17.net rm hg17.net cd $gp/vsHg17 mv /cluster/data/danRer1/zip/hg17*.zip . md5sum *.zip > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # MAKE VSFR1 DOWNLOADABLES (DONE, 2004-07-26, hartera) ssh hgwdev cd /cluster/data/danRer1/bed/blastz.fr1/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/vsFr1/axtChrom cp -p *.axt $gp/vsFr1/axtChrom cd $gp/vsFr1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # add the axtNet *.axt in blastz.fr1/axtNet cd /cluster/data/danRer1/bed/blastz.fr1/axtNet set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/vsFr1/axtNet cp -p *.axt $gp/vsFr1/axtNet cd $gp/vsFr1/axtNet gzip *.axt md5sum *.gz > md5sum.txt cd /cluster/data/danRer1/bed/blastz.fr1/axtChain cp all.chain fr1.chain zip -j /cluster/data/danRer1/zip/fr1.chain.zip fr1.chain rm fr1.chain zip -j /cluster/data/danRer1/zip/fr1.net.zip fr1.net cd $gp/vsFr1 mv /cluster/data/danRer1/zip/fr1*.zip . md5sum *.zip > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
# TBLASTN HG16 KNOWN GENES TRACK (DONE, 2004-07-23, hartera)
# Collaboration with Anthony DiBiase and Yi Zhou of Leonard Zon's lab at
# the Children's Hospital, Boston - part of the Zebrafish Genome Initiative
# Protein set created by Anthony DiBiase: adibiase@enders.tch.harvard.edu
# tblastn of proteins from hg16 knownGenePep table as query
# against a database of repeatmasked Zv3 zebrafish assembly
# Originally received tblastn output in html format and a
# blastHg16.psl file which has an extra accession column at
# the beginning - this was created from the html
# Use perl script to move accession to the end of each row
ssh hgwdev
# Create a new .as for an extended psl table with a
# query ID column for the accession
cat << '_EOF_' > $HOME/kent/src/hg/lib/pslWQueryID.as
table pslWQueryID
"Summary info about a patSpace alignment with a query ID addition"
    (
    uint matches; "Number of bases that match that aren't repeats"
    uint misMatches; "Number of bases that don't match"
    uint repMatches; "Number of bases that match but are part of repeats"
    uint nCount; "Number of 'N' bases"
    uint qNumInsert; "Number of inserts in query"
    uint qBaseInsert; "Number of bases inserted in query"
    uint tNumInsert; "Number of inserts in target"
    uint tBaseInsert; "Number of bases inserted in target"
    char[2] strand; "+ or - for query strand. For mouse second +/- for genomic strand"
    string qName; "Query sequence name"
    uint qSize; "Query sequence size"
    uint qStart; "Alignment start position in query"
    uint qEnd; "Alignment end position in query"
    string tName; "Target sequence name"
    uint tSize; "Target sequence size"
    uint tStart; "Alignment start position in target"
    uint tEnd; "Alignment end position in target"
    uint blockCount; "Number of blocks in alignment"
    uint[blockCount] blockSizes; "Size of each block"
    uint[blockCount] qStarts; "Start of each block in query."
    uint[blockCount] tStarts; "Start of each block in target."
    string queryID; "query ID field"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql pslWQueryID.as pslWQueryID
mv pslWQueryID.h ../inc
# edit pslWQueryID.sql to have:
# index(tName(32),tStart,tEnd) instead of PRIMARY KEY (matches)
# commit to CVS the .as, .sql, .h and .c files
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/blast
echo "drop table pslWQueryID" | hgsql danRer1
hgsql danRer1 < $HOME/kent/src/hg/lib/pslWQueryID.sql
# rename table to blastHg16KG
echo "alter table pslWQueryID rename blastHg16KG" | hgsql danRer1
# process data to add to the table
mkdir -p /cluster/data/danRer1/bed/ZonLab/blast
cd /cluster/data/danRer1/bed/ZonLab/blast
# put blastHg16.psl into this directory
perl formatPsl.pl < blastHg16.psl > blastHg16.format.tab
# use pslCheck to check the file after removing accessions
pslCheck blastHg16NoAcc.psl
# there are many errors: the psl file was not correctly formed.
# Easiest to re-run tblastn, get regular output, and use blastToPsl
# to get a correctly formatted psl file.
mkdir -p /cluster/data/danRer1/bed/ZonLab/tblastn
cd tblastn
# copy hg16_kg_proteinseq.fa here. This FASTA file was provided by
# Anthony DiBiase and it contains 40115 predicted protein sequences from
# hg16 known genes - from the knownGenePep table. There are 42026
# peptide sequences in hg16 knownGenePep. The FASTA sequences were
# obtained by merging annotation data from the kgXref table with the
# sequence and name found in the knownGenePep table.
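# For reference, a peptide FASTA of roughly this shape can be pulled
# straight from the hg16 tables. A minimal sketch, not the command
# Anthony used (his file also folded in kgXref annotation, which is
# omitted here):
#   hgsql -N -e "select name, seq from knownGenePep;" hg16 \
#       | awk '{print ">"$1"\n"$2}' > kgPep.fa
# The same select/awk pattern is used to load the knownGenePep
# sequences further below.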
# split known genes proteins into smaller files
ssh kkr1u00
mkdir -p /cluster/data/danRer1/bed/ZonLab/tblastn
cd tblastn
# copy hg16_kg_proteinseq.fa here
mkdir /iscratch/i/danRer1/hg16kgPep
faSplit sequence hg16_kg_proteinseq.fa 20 \
    /iscratch/i/danRer1/hg16kgPep/hg16kg
cd /iscratch/i/danRer1/
mkdir blastdb
cd blastdb
# make formatted database for each
foreach c (/iscratch/i/danRer1/trfFa/*.fa)
    set h = $c:t
    /scratch/blast/formatdb -i $c -t $h -n $h -p F
end
iSync
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/tblastn
mkdir blastout
ls -1S /iscratch/i/danRer1/hg16kgPep/*.fa > hg16kg.lst
ls -1S /iscratch/i/danRer1/blastdb/*.nsq | sed "s/.nsq//" > zv3.lst
# Make blast script
cat > blastSome << end
#!/bin/csh
setenv BLASTMAT /scratch/blast/data
/scratch/blast/blastall -p tblastn -d \$1 -i \$2 -o \$3 -e 0.01
end
# << this line keeps emacs coloring happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line+ $(path2)} {check out line blastout/$(root2)_$(root1).out}
#ENDLOOP
'_EOF_'
gensub2 zv3.lst hg16kg.lst blastGsub blastSpec
para create blastSpec
para try, check, push ... etc.
# use perl script to shorten the query names in the blast output
cat << '_EOF_' > shortenQueryName.pl
#!/usr/bin/perl -w
use strict;
my $found = "false"; # flag set when query line found
open(QUERY, ">>queries.log") || die "Can not create queries.log: $!";
while (<>) {
    if (/^(Query=\s?[A-Z]{1,2}[0-9]{4,})\|\w+/) {
        my $query = $1;
        print QUERY;   # log the original query line
        $found = "true";
        print "$query\n";
    }
    elsif ($found eq "true") {
        if (/^\s+\([0-9]+\s?letters\)/) {
            print;
        }
        else {
        }
        $found = "false";
    }
    else {
        print;
    }
}
'_EOF_'
foreach c (blastout/*.out)
    set t = $c:t
    set s = $t:r
    echo "Processing $c"
    perl shortenQueryName.pl < $c > blastout/${s}.format
end
foreach b (blastout/*.format)
    set t = $b:t
    set s = $t:r
    echo "Creating ${s}.psl"
    /cluster/bin/i386/blastToPsl $b psl/${s}.psl -scores=scores/${s}.scores \
        >>& blastToPsl.log
end
# need to filter so that only results with e-value <= 1e-100 are kept
# edit blastToPsl.c to add an e-value option so only results with
# e-values less than or equal to that value are kept for filtering
mkdir filteredBlast
cd filteredBlast
mkdir psl scores
foreach b (../blastout/*.format)
    set t = $b:t
    set s = $t:r
    echo "Creating ${s}.psl"
    $HOME/bin/i386/blastToPsl $b psl/${s}.psl -scores=scores/${s}.scores \
        >>& blastToPsl.log
end
mkdir -p /cluster/data/danRer1/bed/ZonLab/blast
pslSort dirs raw.psl tmp psl
liftUp -type=".psl" tblastnHg16KG.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn raw.psl
# add protein IDs to the qName
$HOME/bin/i386/protDat tblastnHg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/kg.mapNames tblastnHg16KGPep.psl
pslCheck tblastnHg16KGPep.psl
# psl is good
# load psl into database
hgLoadPsl danRer1 tblastnHg16KGPep.psl
# there are 173355 rows in the table
# need to do further filtering
# rename as tblastnHg16KGEval
hgsql -e "alter table tblastnHg16KGPep rename tblastnHg16KGEval;" danRer1
mv tblastnHg16KGPep.psl tblastnHg16KGEval.psl
foreach s (scores/*.scores)
    cat $s >> all.scores
end
sort -k2 all.scores > all.scores.sort
# filter by taking just the top 5 hits: for each protein query, take all
# the alignments associated with its top 5 hits to the whole zebrafish
# genome, using the scores directory created by blastToPsl after
# filtering at an e-value threshold of 1e-100.
# Heather wrote a Java program and used database tables to do this.
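# The same top-5 selection can also be sketched with sort/awk alone, with
# no database tables. Assumptions: all.scores has the columns of the
# blast table created below (qName in $2, tName in $5, score in $9) and,
# matching the ascending ORDER BY used in that approach, lower scores
# rank better:
#   sort -k2,2 -k9,9g all.scores \
#       | awk '$2 != q {q = $2; n = 0; split("", top)} \
#              !($5 in top) && n < 5 {top[$5] = 1; n++} \
#              $5 in top {print $2 "\t" $5}' \
#       | sort -u > qNametName.top5
# For each query this keeps the first 5 distinct targets seen in score
# order, then emits every (qName, tName) pair for those targets.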
# ScoreFilter.java is in ~kent/java/src/edu/ucsc/genome/qa/filter/
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/tblastn/filteredBlast/top5
hgsql -e "create database qa;" danRer1
cat << end > blast.sql
CREATE TABLE blast (
    strand char(2) not null,
    qName varchar(255) not null,
    qStart int unsigned not null,
    qEnd int unsigned not null,
    tName varchar(255) not null,
    tStart int unsigned not null,
    tEnd int unsigned not null,
    tSize int unsigned not null,
    score double not null
);
end
hgsql qa < blast.sql
echo "load data local infile 'all.scores' into table blast" \
    | hgsql qa
cat << end > blast2.sql
CREATE TABLE blast2 (
    strand char(2) not null,
    qName varchar(255) not null,
    qStart int unsigned not null,
    qEnd int unsigned not null,
    tName varchar(255) not null,
    tStart int unsigned not null,
    tEnd int unsigned not null,
    tSize int unsigned not null,
    score double not null
);
end
hgsql qa < blast2.sql
# load data into blast2 and sort
echo "insert into blast2 select * from blast order by qName, score, tName" \
    | hgsql qa
# create index
echo "create index qName on blast2(qName)" | hgsql qa
cat << end > blast3.sql
CREATE TABLE blast3 (
    strand char(2) not null,
    qName varchar(255) not null,
    qStart int unsigned not null,
    qEnd int unsigned not null,
    tName varchar(255) not null,
    tStart int unsigned not null,
    tEnd int unsigned not null,
    tSize int unsigned not null,
    score double not null
);
end
hgsql qa < blast3.sql
mkdir data
echo "select distinct(qName) from blast" | hgsql qa -N > data/idlist
wc -l data/idlist
# 1987 qNames
# then run the ScoreFilter program - this reads from blast2 and
# writes to blast3 and takes the list of qNames as input
source javaenv
javac ScoreFilter.java
java ScoreFilter qNames.blast.out
# Test case
echo "select score, tName from blast2 where qName = 'AB002324' limit 12;" \
    | hgsql qa
echo "select * from blast3 where qName = 'AB002324';" | hgsql qa
# get list of qNames and tNames from blast3 to query psl
echo "select qName, tName from blast3;" | hgsql qa -N \
    > data/qNametName.blast3.out
wc -l data/qNametName.blast3.out
# 5201 data/qNametName.blast3.out
# create psl table in qa
cat << end > pslBlast.sql
CREATE TABLE pslBlast (
    matches int unsigned not null,
    misMatches int unsigned not null,
    repMatches int unsigned not null,
    nCount int unsigned not null,
    qNumInsert int unsigned not null,
    qBaseInsert int not null,
    tNumInsert int unsigned not null,
    tBaseInsert int not null,
    strand char(2) not null,
    qName varchar(255) not null,
    qSize int unsigned not null,
    qStart int unsigned not null,
    qEnd int unsigned not null,
    tName varchar(255) not null,
    tSize int unsigned not null,
    tStart int unsigned not null,
    tEnd int unsigned not null,
    blockCount int unsigned not null,
    blockSizes longblob not null,
    qStarts longblob not null,
    tStarts longblob not null,
    index(tName(32),tStart,tEnd)
);
end
hgsql qa < pslBlast.sql
# cat together psl files
foreach s (../psl/*.psl)
    cat $s >> all.blast.psl
end
wc -l all.blast.psl
# 181405 all.blast.psl
# load psl data into table
echo "load data local infile 'all.blast.psl' into table pslBlast" \
    | hgsql qa
# check all data loaded, correct number of rows
# select all lines with qName and tName in data/qNametName.blast3.out
# from this pslBlast table
cat << 'end' > getSelectedPsl.pl
#!/usr/bin/perl -w
use strict;
while (<>) {
    chomp;
    my @names = split(/\t/);
    my ($qName, $tName) = @names;
    print STDERR "Querying pslBlast for query: $qName and target: $tName ...\n";
    system "hgsql -N -e 'select * from pslBlast where qName = \"$qName\" and tName = \"$tName\";' qa";
}
end
chmod +x getSelectedPsl.pl
perl getSelectedPsl.pl < data/qNametName.blast3.out > blastTop5.psl
# Need to remove long names
cat << 'end' > shortenNames.pl
#!/usr/bin/perl -w
use strict;
while (<>) {
    my $line = $_;
    chomp;
    my @row = split(/\t/);
    # qName is the 10th field in a psl file
    if ($row[9] =~ /\|/) {
        my @accs = split(/\|/, $row[9]);
        my $newAcc = $accs[0];
        foreach (my $i = 0; $i <= $#row; $i++) {
            if ($i == 9) {
                print "$newAcc\t";
            }
            elsif ($i == $#row) {
                print "$row[$i]\n";
            }
            else {
                print "$row[$i]\t";
            }
        }
    }
    else {
        print $line;
    }
}
end
chmod +x shortenNames.pl
perl shortenNames.pl < blastTop5.psl > blastTop5Format.psl
wc -l blastTop*
# 29091 blastTop5.psl
# 29091 blastTop5Format.psl
mkdir psl
mv blastTop5Format.psl ./psl
pslSort dirs blastTop5Sort.psl tmp psl
# lift up to chrom co-ordinates
liftUp -type=".psl" tblastnHg16KG.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn blastTop5Sort.psl
# add protein IDs to the qName
$HOME/bin/i386/protDat tblastnHg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/kg.mapNames tblastnHg16KGPep.psl
pslCheck tblastnHg16KGPep.psl
# psl is good
# load psl into database
hgLoadPsl danRer1 tblastnHg16KGPep.psl
# previous table of alignments filtered on 1e-100 only is tblastnHg16KGEval
# compare this to current tblastnHg16KGPep table with featureBits
# featureBits danRer1 tblastnHg16KGEval
# 2783971 bases of 1459132082 (0.191%) in intersection
# featureBits danRer1 tblastnHg16KGPep
# 2397991 bases of 1459132082 (0.164%) in intersection
# featureBits danRer1 refGene:cds tblastnHg16KGEval
# 246994 bases of 1459132082 (0.017%) in intersection
# featureBits danRer1 refGene:cds tblastnHg16KGPep
# 250847 bases of 1459132082 (0.017%) in intersection
# featureBits danRer1 tblastnHg16KGPepTop3
# 2223873 bases of 1459132082 (0.152%) in intersection
# featureBits danRer1 refGene:cds tblastnHg16KGPepTop3
# 236309 bases of 1459132082 (0.016%) in intersection
# load sequences into table. hg16_kg_proteinseq.fa has 2833 sequences
# less than in knownGenePep for hg16 - load all knownGenePep sequences
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/tblastn
echo "select * from knownGenePep;" | hgsql -N hg16 > knownGenePepHg16.out
# make FASTA file and add a "p" to accessions so hgc.c does not get
# confused with the Genbank sequences of the same name
awk '{ print ">"$1"p\n"$2}' knownGenePepHg16.out > knownGenePepHg16.fa
grep '>' knownGenePepHg16.fa | wc
# 42026 sequences
# Copy sequences to gbdb if they are not already there
mkdir -p /gbdb/hgFixed/tblastnKGHg16
ln -s /cluster/data/danRer1/bed/ZonLab/tblastn/knownGenePepHg16.fa \
    /gbdb/hgFixed/tblastnKGHg16
# load sequences
hgLoadSeq danRer1 /gbdb/hgFixed/tblastnKGHg16/knownGenePepHg16.fa
# try taking top 3 instead of top 5
ssh hgwdev
mkdir -p /cluster/data/danRer1/bed/ZonLab/tblastn/filteredBlast/top3
cd /cluster/data/danRer1/bed/ZonLab/tblastn/filteredBlast/top3
mkdir data
cp ../top5/data/idlist data/
cat << '_EOF_' > getTop3.csh
#!/bin/tcsh
set db = "qa"
set table = "blast3"
set acc = $1
hgsql -N -e 'select qName, tName from '$table' where qName = "'$acc'" order by score limit 3;' $db >> data/qNametName.blast3.out
'_EOF_'
# << this line makes emacs coloring happy
chmod +x getTop3.csh
cat << '_EOF_' > gsub
#LOOP
./getTop3.csh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 data/idlist single gsub jobList.csh
chmod +x jobList.csh
jobList.csh
perl ../top5/getSelectedPsl.pl < data/qNametName.blast3.out > blastTop3.psl
perl ../top5/shortenNames.pl < blastTop3.psl > blastTop3Format.psl
wc -l blastTop*
# 19823 blastTop3.psl
# 19823 blastTop3Format.psl
mkdir psl
mv blastTop3Format.psl ./psl
pslSort dirs blastTop3Sort.psl tmp psl
# lift up to chrom co-ordinates
liftUp -type=".psl" tblastnHg16KGTop3.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn blastTop3Sort.psl
# add protein IDs to the qName
$HOME/bin/i386/protDat tblastnHg16KGTop3.psl \
    /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/kg.mapNames \
    tblastnHg16KGPepTop3.psl
pslCheck tblastnHg16KGPepTop3.psl
# psl is good
# load psl into database
hgLoadPsl danRer1 tblastnHg16KGPepTop3.psl
# previous table of alignments filtered on 1e-100 only is tblastnHg16KGEval
# featureBits danRer1 tblastnHg16KGPepTop3
# 2223873 bases of 1459132082 (0.152%) in intersection
# Decided to release the Top 5 hits with tblastnHg16KGPep.
# This could be improved upon. Brian Raney's method below is better at
# finding exons and reduces noise in the alignments.
# At a later date, only one of these tracks should be displayed.
# small changes -- sort by chrom, chromStart, chromEnd; add tName index
mkdir /cluster/data/danRer1/bed/ZonLab/tblastn/aug12
cd /cluster/data/danRer1/bed/ZonLab/tblastn/aug12
hgsqldump danRer1 tblastnHg16KGPep --no-data > tblastnHg16KGPepSorted.sql
# edit the table name in tblastnHg16KGPepSorted.sql to be
# tblastnHg16KGPepSorted; the indices are also not needed in
# tblastnHg16KGPepSorted.sql
hgsql danRer1 < tblastnHg16KGPepSorted.sql
hgsql danRer1
insert into tblastnHg16KGPepSorted
    select * from tblastnHg16KGPep order by tName, tStart, tEnd;
rename table tblastnHg16KGPep to tblastnHg16KGPepNotSorted;
rename table tblastnHg16KGPepSorted to tblastnHg16KGPep;
# would prefer dual indices, but convention is single
create index bin on tblastnHg16KGPep(bin);
create index tStart on tblastnHg16KGPep(tStart);
create index qName on tblastnHg16KGPep(qName);
create index tEnd on tblastnHg16KGPep(tEnd);
create index tName on tblastnHg16KGPep(tName);

# Human Proteins - tBLASTn of hg16 knownGene (2004-07-27, braney)
# blastHg16KG track.
ssh kksilo
bash
mkdir -p /cluster/data/danRer1/bed/tblastn.hg16KG
cd /cluster/data/danRer1/bed/tblastn.hg16KG
ls -1S /iscratch/i/danRer1/blastdb/*.nsq | sed "s/\.nsq//" > fish.lst
mkdir kgfas
split -l 68 /cluster/data/hg16/bed/blat.hg16KG.2004-05-27/hg16KG.psl kgfas/kg
cd kgfas
for i in *
do
    pslxToFa $i $i.fa
    rm $i
done
ls -1S kgfas/*.fa > kg.lst
# use bluearc to hold our temp output
mkdir -p /cluster/bluearc/danRer1/bed/tblastn.hg16KG/blastOut
ln -s /cluster/bluearc/danRer1/bed/tblastn.hg16KG/blastOut .
for i in `cat kg.lst`
do
    mkdir blastOut/$i
done
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } {check out exists blastOut/$(root2)/q.$(root1).bscore} {check out exists blastOut/$(root2)/t.$(root1).psl } {check out exists blastOut/$(root2)/t.$(root1).bscore}
#ENDLOOP
'_EOF_'
gensub2 fish.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/danRer1/bed/tblastn.hg16KG
para create blastSpec
para push
cd blastOut
for i in kg??
do
    cat $i/q.*.psl > q.$i.psl
    cat $i/q.*.bscore > q.$i.bscore
    cat $i/t.*.psl > t.$i.psl
    cat $i/t.*.bscore > t.$i.bscore
    simpleChain -prot -maxGap=27000 -outPsl q.$i.psl c.$i.psl
    sort -rn c.$i.psl | pslUniq stdin stdout \
        | awk "((\$13 - \$12) / \$11) > 0.6 {print}" > u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
    > preblastHg16KG.psl
blatDir=/cluster/data/hg16/bed/blat.hg16KG.2004-05-27
protDat preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames \
    blastHg16KG.psl
ssh hgwdev
cd /cluster/data/danRer1/bed/tblastn.hg16KG/blastOut
hgLoadPsl danRer1 blastHg16KG.psl
#end Human Proteins

# RH MAP TRACK (DONE, 2004-07-26, hartera)
# RE-DO RH MAP TRACK WITH MORE STRINGENT POST-BLAT FILTERING
# (DONE, 2005-05-14, hartera)
# Data from Leonard Zon's lab at the Children's Hospital, Boston
# Radiation hybrid (RH) map data consists of 8707 genomic sequences:
# (1) 2835 BAC clone ends (Robert Geisler's lab, Max Planck Inst.,
#     Tuebingen, Germany)
# (2) 2015 NCBI ESTs (submitted from around the planet)
# (3) 171 RH shotgun sequences (Max Planck Inst. & Children's Hosp.
#     Boston)
# (4) 3686 WZ compiled ESTs from WashU
#
# Repeatmasked Zv3 supercontigs BLASTn 8707 RH map genomic sequences
ssh kkr1u00
mkdir -p /cluster/data/danRer1/bed/ZonLab/rhmap
cd /cluster/data/danRer1/bed/ZonLab/rhmap
mkdir -p /cluster/data/danRer1/bed/ZonLab/rhmap/seq
cd /cluster/data/danRer1/bed/ZonLab/rhmap/seq
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
# Put the unmasked sequence data in rhDevSeq dir
unzip rhSeqAll.zip
foreach f (rhSeqAll/*/*.txt)
    mv $f .
end
rm -r rhSeqAll
foreach s (*.txt)
    tr '[a-z]' '[A-Z]' < $s >> rhcontigs.fa
end
mkdir -p /iscratch/i/danRer1/rhmap
cd /cluster/data/danRer1/bed/ZonLab/rhmap/seq
# fasplit sequence rhcontigs.fa 20 /iscratch/i/danRer1/rhmap
# remove ^M at end of each line
perl stripEndChar.pl < rhcontigs.fa > rhcontigsNoEndChar.fa
# also remove long headers and just keep first and second accession
grep '>' rhcontigsNoEndChar.fa > rhcontigsFa.headers
cat << '_EOF_' > formatFastaHeader.pl
#!/usr/bin/perl -w
use strict;
while(<>) {
    if (/>/) {
        my @f = split(/\|/);
        print "$f[0].$f[1]\n";
    }
    elsif ($_ eq "") {
    }
    else {
        print;
    }
}
'_EOF_'
chmod +x formatFastaHeader.pl
perl formatFastaHeader.pl < rhcontigsNoEndChar.fa > rhcontigsFormat.fa
mv rhcontigsFormat.fa rhcontigs.fa
rm rhcontigsNoEndChar.fa
grep '>' rhcontigs.fa | wc -l
# 8690 sequences (~15500 for Affy zebrafish) so do not split
cp rhcontigs.fa /iscratch/i/danRer1/rhmap
iSync
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/rhmap
mkdir psl
ls -1S /iscratch/i/danRer1/rhmap/rhcontigs.fa > rhmap.lst
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
# try same parameters as for bacEnds
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/iscratch/i/danRer1/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 genome.lst rhmap.lst gsub spec
para create spec
para try
para check
para push, try ... etc.
# para time
# Completed: 310 of 310 jobs
# CPU time in finished jobs:       7256s     120.93m     2.02h    0.08d  0.000 y
# IO & Wait Time:                   987s      16.45m     0.27h    0.01d  0.000 y
# Average job time:                  27s       0.44m     0.01h    0.00d
# Longest job:                       38s       0.63m     0.01h    0.00d
# Submission to last job:           196s       3.27m     0.05h    0.00d
# Make & check the psl table
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create rhmap.psl
pslSort dirs raw.psl tmp psl
# only use alignments that have at least 80% identity in aligned region;
# these are the parameters used for STS markers.
# try more stringent filtering for these: minAli=0.96 and minCover=0.40
# as suggested by Jim (DONE, 2005-05-14, hartera)
# pslReps -nearTop=0.0001 -minAli=0.8 -noIntrons raw.psl \
#     contig.psl /dev/null
# Processed 646182 alignments
mv rhMap.psl rhMap.psl.old
mv contig.psl contig.psl.old
pslReps -nearTop=0.0001 -minAli=0.96 -minCover=0.40 raw.psl \
    contig.psl /dev/null
# Processed 646182 alignments
liftUp rhMap.psl /cluster/data/danRer1/jkStuff/liftAll.lft warn contig.psl
pslCheck rhMap.psl
# Summary of different alignment filters:
# nearTop   minAli   minCover   -noIntrons   #alignments   #aligned
# 0.0001    0.8      -          yes          16738         8380
# -         0.96     0.40       no           8711          7421
# 0.0001    0.96     0.40       no           7866          7421
# 0.002     0.96     0.40       no           7924          7421
# 0.0001    0.96     0.40       yes          7877          7421
# the best parameters seem to be nearTop=0.0001, minAli=0.96 and
# minCover=0.40.
# previously, the sequence with the most alignments had 1987. Now the
# sequence with the most alignments has 7, 26 have 3-7 alignments and
# 7014 have 1.
# For the previous parameters, there were > 900 sequences with many
# alignments as well as 7382 with 1 alignment.
# Load sequence alignments into database.
# Reload rhMap alignments - more stringent filtering (hartera, 2005-05-14)
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/rhmap
echo "drop table rhMap;" | hgsql danRer1
hgLoadPsl danRer1 rhMap.psl
hgsql -N -e "select qName from rhMap;" danRer1 | sort | uniq | wc
# 8380 out of 8690 sequences aligned (96%)
# after more stringent filtering, 7421 out of 8690 sequences aligned (85%)
# Add RH Map sequences
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/danRer1/rhmap
ln -s \
    /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
    /gbdb/danRer1/rhmap
cd /cluster/data/danRer1/bed/ZonLab/rhmap/seq
hgLoadSeq danRer1 /gbdb/danRer1/rhmap/rhcontigs.fa

# BLASTZ SWAP FOR mm5 vs danRer1 BLASTZ TO CREATE danRer1 vs mm5 BLASTZ
# USE RESCORED ALIGNMENTS (see makeMm5.doc)
# CONVERT AXTs TO PSL AND LOAD INTO DATABASE
# (DONE, 2004-08-05, hartera)
ssh kolossus
mkdir /cluster/data/danRer1/bed/blastz.mm5.swap
cd /cluster/data/danRer1/bed/blastz.mm5.swap
# use rescored axtChrom from blastzDanRer1 on mm5
set aliDir = /cluster/data/mm5/bed/blastz.danRer1
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
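# (axtSplitByTarget writes alignments in input order, so each per-chrom
# file comes out shuffled; axtSort below re-sorts by target position,
# which the psl conversion and chaining steps expect.)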
foreach f (unsorted/*.axt)
    echo sorting $f:t:r
    axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
# 3.2G  /cluster/data/mm5/bed/blastz.danRer1/axtChrom
# 3.2G  unsorted
# 3.2G  axtChrom
rm -r unsorted
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/danRer1/bed/blastz.mm5.swap
mkdir -p pslChrom
set tbl = "blastzMm5"
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo "Processing chr $c"
    /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/danRer1/bed/blastz.mm5.swap/pslChrom
foreach f (./*.psl)
    /cluster/bin/i386/hgLoadPsl danRer1 $f
    echo "$f Done"
end

# CHAIN MOUSE (mm5) BLASTZ (DONE, 2004-08-05, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/danRer1/bed/blastz.mm5.swap
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/danRer1/bed/blastz.mm5.swap/axtChrom/*.axt \
    > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from hg16 vs chicken run.
# NOTE: each ^V below marks a literal tab (typed as ctrl-v tab);
# the gap file is tab-delimited.
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
# create chains with only default filtering on score - minScore = 1000
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/danRer1/nib \
    /cluster/bluearc/scratch/mus/mm5/softNib $2 >& $3
'_EOF_'
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time # Completed: 29 of 29 jobs # CPU time in finished jobs: 2987s 49.78m 0.83h 0.03d 0.000 y # IO & Wait Time: 848s 14.14m 0.24h 0.01d 0.000 y # Average job time: 132s 2.20m 0.04h 0.00d # Longest job: 426s 7.10m 0.12h 0.00d # Submission to last job: 710s 11.83m 0.20h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end ssh hgwdev cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain/chain hgLoadChain danRer1 chr1_chainMm5 chr1.chain # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Link -enrichment # refGene:cds 0.449%, chainMm5Link 9.488%, both 0.364%, cover 81.22%, # enrich 8.56x # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17Link -enrichment # refGene:cds 0.449%, chainHg17Link 3.924%, both 0.358%, cover 79.76%, # enrich 20.33x # Try filtering for minScore = 5000 ssh kksilo cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm -r chain chainSplit chain all.chain ssh hgwdev cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain/chain hgLoadChain danRer1 chr1_chainMm5Filt5k chr1.chain # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Filt5kLink -enrichment # refGene:cds 0.449%, chainMm5Filt5kLink 7.948%, both 0.362%, cover 80.64%, # enrich 10.15x # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Filt10kLink -enrichment # refGene:cds 0.449%, chainMm5Filt10kLink 6.363%, both 0.357%, cover 79.66%, # enrich 12.52x # rows: chr1_chainMm5Link 303196 # chr1_chainMm5Filt5kLink 231508 # chr1_chainMm5Filt10kLink 147336 # filter chr1-25, M and Finished with minScore=5000 # and minScore=10000 for chrNA and chrUn which have a lot more # low scoring hits. # featureBits -chrom=chrNA danRer1 refGene:cds chainMm5Link -enrichment ssh kksilo cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain/chain rm chrNA.chain chrUn.chain # replace with those filtered on minScore=10000 cp ../filt10k/chain/chrUn.chain . cp ../filt10k/chain/chrNA.chain . cd .. rm -r filt10k gzip all.chain.unfiltered hgsql -e "drop table chr1_chainMm5Filt5k" danRer1 hgsql -e "drop table chr1_chainMm5Filt5kLink" danRer1 hgsql -e "drop table chr1_chainMm5Filt10k" danRer1 hgsql -e "drop table chr1_chainMm5Filt10kLink" danRer1 # load database tables ssh hgwdev cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain danRer1 ${c}_chainMm5 $i echo done $c end # NET MOUSE (mm5) BLASTZ (DONE, 2004-08-05, hartera) ssh kksilo cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
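# (In this chain -> net step: chainPreNet drops chains that have no
# chance of making it into the net, chainNet assigns the best chains
# hierarchically to target coordinates (the query-side net goes to
# /dev/null here), and netSyntenic below adds synteny statistics to
# the combined net.)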
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 205414400, utime 5596 s/100, stime 237 # Add classification info using db tables: cd /cluster/data/mm5/bed/blastz.danRer1/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \ /cluster/bluearc/mm5/linSpecRep.notInZebrafish cp /iscratch/i/danRer1/linSpecRep.notInMouse/* \ /cluster/bluearc/danRer1/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain time netClass noClass.net danRer1 mm5 mm5.net \ -tNewR=/cluster/bluearc/danRer1/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish -noAr # 112.390u 57.180s 4:17.17 65.9% 0+0k 0+0io 4565pf+0w netFilter -minGap=10 mm5.net | hgLoadNet danRer1 netMm5 stdin # featureBits danRer1 refGene:cds netMm5 -enrichment # refGene:cds 0.380%, netMm5 36.622%, both 0.331%, cover 87.02%, enrich 2.38x # CLEANUP BACENDS (DONE, 2004-08-10, hartera) ssh kksilo cd /cluster/data/danRer1/bed/ZonLab/bacends/clusterBlast/ nice rm batch batch.bak out *.lst blastToPsl.log para.results \ clusterBacEnds.psl clusterBacEnds.out cd zeroEval nice rm blastToPsl.log nice rm clusterBacEnds.psl nice zip cluster.zip * cd /cluster/data/danRer1/bed/ZonLab/bacends nice rm -r err nice rm bacends.remove bacends.percentNs80 bacends.noseqswithlotsNs \ bacends_good.fa batch batch.bak bed.tab BACerror.log psl.tab raw.psl \ seq.tab para.results nice rm -r psl nice gzip bacends.1/* duplicates/* pairs/* singles/* scores/* & # BLASTZ MM5 CLEANUP (DONE, 2004-08-10, hartera) ssh eieio cd /cluster/data/danRer1/bed/blastz.mm5.swap nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net & # unzipped all.chain.gz and mm5.net.gz to make VSMM5 downloadables # and then zip up again (hartera, 2004-09-10) # MAKE VSMM5 DOWNLOADABLES (DONE, 2004-09-10, hartera) ssh kksilo # zip chains and nets cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain gunzip all.chain.gz cp all.chain mm5.chain zip -j /cluster/data/danRer1/zip/mm5.chain.zip mm5.chain rm mm5.chain gunzip mm5.net.gz zip -j /cluster/data/danRer1/zip/mm5.net.zip mm5.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/vsMm5 cd $gp/vsMm5 mv /cluster/data/danRer1/zip/mm5*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChrom mkdir -p $gp/vsMm5/axtChrom nice cp -p *.axt $gp/vsMm5/axtChrom cd $gp/vsMm5/axtChrom nice gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
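# (Reading the featureBits -enrichment lines in the net section above:
# cover is both/primary and enrich is cover/secondary; e.g. for netMm5,
# 0.331%/0.380% gives the 87.02% cover, and 87.02%/36.622% gives the
# 2.38x enrichment.)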
# BLASTZ HG17 CLEANUP (DONE, 2004-08-10, hartera)
ssh eieio
cd /cluster/data/danRer1/bed/blastz.hg17
nice rm -rf raw &
nice rm -rf axtChrom.orig &
nice rm -rf axtChain.orig &
nice rm -rf lav &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &

# BLASTZ FR1 CLEANUP (DONE, 2004-08-10, hartera)
ssh eieio
cd /cluster/data/danRer1/bed/blastz.fr1
nice rm -rf raw &
nice rm -rf axtChain.orig &
nice rm -rf lav &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &

# REPEAT LIBRARY TEST (DONE, 2004-10-20, hartera)
# Get new repeat library
ssh kksilo
cd /cluster/data/danRer1
wget --timestamp \
    http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
# copy to libraries directory for RepeatMasker
cp Dr.repeats.020501 /cluster/bluearc/RepeatMasker/Libraries/danioRW.lib
# temporarily move danio.lib to danioRM.lib.old
# add danioRW.lib to danio.lib
# for now just mask 0.5Mb so use chr1_1_01.fa - unmasked
mkdir newRepeats
# faSize chr1_1_01.fa
# 495381 bases (62485 N's 432896 real) in 1 sequences in 1 files
cp ./1/chr1_1/chr1_1_01.fa ./newRepeats
# RepeatMasker script with input directory and name of .fa file
ssh kolossus
cd /cluster/data/danRer1/newRepeats
nice /cluster/data/danRer1/jkStuff/RMZebrafish \
    /cluster/data/danRer1/newRepeats chr1_1_01.fa
# can also use -lib option to use a custom library
cat << '_EOF_' > jkStuff/RMandRWZebrafish
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/danRer1/$2
/bin/cp $2 /tmp/danRer1/$2/
cd /tmp/danRer1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -lib danioRMandRW.lib $2
popd
/bin/cp /tmp/danRer1/$2/$2.out ./
if (-e /tmp/danRer1/$2/$2.align) /bin/cp /tmp/danRer1/$2/$2.align ./
if (-e /tmp/danRer1/$2/$2.tbl) /bin/cp /tmp/danRer1/$2/$2.tbl ./
if (-e /tmp/danRer1/$2/$2.cat) /bin/cp /tmp/danRer1/$2/$2.cat ./
/bin/rm -fr /tmp/danRer1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer1
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMandRWZebrafish
cp /dev/null newRepeats/test/RMJobs
echo /cluster/data/danRer1/jkStuff/RMandRWZebrafish \
    /cluster/data/danRer1/newRepeats/test chr1_1_01.fa \
    '{'check out line+ /cluster/data/danRer1/newRepeats/test/chr1_1_01.fa.out'}' \
    >> newRepeats/test/RMJobs
#- Do the run
ssh kk
cd /cluster/data/danRer1/newRepeats/test
para create RMJobs
para try
# para time
# Completed: 1 of 1 jobs
# as with previous output there are some print errors at the end of the
# run, but try using it for masking anyway
# already have simpleRepeats for chr1_1_01.fa
ssh kksilo
cd /cluster/data/danRer1/newRepeats/test
# lift up to contig level
set d = /cluster/data/danRer1/1/chr1_1
liftUp chr1_1.fa.out $d/chr1_1.lft warn chr1_1_01.fa.out > /dev/null
#- Lift pseudo-contigs to chromosome level
set d = /cluster/data/danRer1/1/
if (-e $d/lift/ordered.lft && ! -z $d/lift/ordered.lft) then
    liftUp chr1.fa.out $d/lift/ordered.lft warn chr1_1.fa.out \
        > /dev/null
endif
# the new repeats "Drxxxx" do not have a class type so add this in,
# otherwise maskOutFa can not process this.
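# (A RepeatMasker .out row normally has a repeat class/family column
# right after the repeat name; the awk below keeps fields 1-10, which
# end with the Dr* repeat name, inserts "DrRW" as the missing class,
# then prints the remaining fields so every row has the same column
# count for maskOutFa.)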
awk 'BEGIN {OFS="\t";} {if ($10 ~ /^Dr[0-9]+$/ ){ print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, "DrRW", $11, $12, $13, $14;} else {print;}}' chr1.fa.out > chr1.format.out
mv chr1.fa.out chr1.fa.out.orig
mv chr1.format.out chr1.fa.out
set trfChr=/cluster/data/danRer1/bed/simpleRepeat/trfMaskChrom
foreach f (./chr*.fa)
    echo "repeat- and trf-masking $f"
    maskOutFa -soft $f $f.out $f
    set chr = $f:t:r
    maskOutFa -softAdd $f $trfChr/chr1.bed $f
end
# blastz alignment of 0.5Mb region much improved - check with laj.
# removes many repeats

# MAKE UNIGENE ALIGNMENTS (DONE, 2004-11-01, hartera)
ssh kksilo
mkdir /cluster/data/danRer1/bed/unigene
cd /cluster/data/danRer1/bed/unigene
# Download zebrafish UniGene and run Chuck's preproc scripts:
wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Dr.info
wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Dr.seq.uniq.gz
wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Dr.data.gz
gunzip *.gz
# Chuck's script expects human (Hs) -- use Dr for zebrafish:
sed -e 's/Hs/Dr/g' /projects/cc/hg/sugnet/uniGene/countSeqsInCluster.pl \
    > ./countSeqsInCluster.pl
chmod a+x ./countSeqsInCluster.pl
perl ./countSeqsInCluster.pl Dr.data counts.tab
perl /projects/cc/hg/sugnet/uniGene/parseUnigene.pl Dr.seq.uniq \
    Dr.seq.uniq.simpleHeader.fa leftoverData.tab
# Put on iscratch for cluster run
ssh kkr1u00
mkdir /iscratch/i/danRer1/unigene
cp -p /cluster/data/danRer1/bed/unigene/Dr.seq.uniq.simpleHeader.fa \
    /iscratch/i/danRer1/unigene/
iSync
# align on big cluster
ssh kk
cd /cluster/data/danRer1/bed/unigene
mkdir psl
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
ls -1S /iscratch/i/danRer1/unigene/*.fa > uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/iscratch/i/danRer1/11.ooc {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst uniGene.lst template.sub jobList
para create jobList
para try, check, push, check
# para time
# Completed: 310 of 310 jobs
# CPU time in finished jobs:      12627s     210.44m     3.51h    0.15d  0.000 y
# IO & Wait Time:                  2304s      38.41m     0.64h    0.03d  0.000 y
# Average job time:                  48s       0.80m     0.01h    0.00d
# Longest job:                       89s       1.48m     0.02h    0.00d
# Submission to last job:           699s      11.65m     0.19h    0.01d
# postprocess
ssh kksilo
cd /cluster/data/danRer1/bed/unigene
pslSort dirs raw.psl tmp psl >& pslSort.log
liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
    | pslReps -minCover=0.2 -minAli=0.965 -nearTop=0.002 \
      stdin uniGene.lifted.pslReps.psl /dev/null
rm raw.psl
gzip Dr.seq.uniq Dr.data
# load into db
ssh hgwdev
cd /cluster/data/danRer1/bed/unigene
mkdir /gbdb/danRer1/unigene
ln -s /cluster/data/danRer1/bed/unigene/Dr.seq.uniq.simpleHeader.fa \
    /gbdb/danRer1/unigene/
hgLoadSeq danRer1 /gbdb/danRer1/unigene/Dr.seq.uniq.simpleHeader.fa
hgLoadPsl -fastLoad -table=uniGene_dr danRer1 uniGene.lifted.pslReps.psl

# STS Markers (in progress, 2004-08-09, hartera)
# download new files (2005-04-12, hartera)
# Check NCBI for latest update for STS markers in dbSTS.
# Our latest version is in store5 for space
ssh eieio
mkdir -p /cluster/store5/sts.2005-04-12
ln -s /cluster/store5/sts.2005-04-12 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.2005-04-12
# The dbSTS.sts does not have as many ids as in the version
# on the NCBI ftp site so get new version.
# update from NCBI
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# checked against sts.9: a number of new Danio rerio markers were added
mkdir -p /cluster/store5/UniSTS.2005-04-12
ln -s /cluster/store5/UniSTS.2005-04-12 /cluster/data/ncbi
cd /cluster/data/ncbi/UniSTS.2005-04-12
# sts and aliases files were last updated on 04/12/05 by NCBI
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
# these mapping panels files were last updated 03/16/05 by NCBI
wget --timestamp -r -l1 \
    ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/
mkdir danRer
mv ./ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/* \
    ./danRer
rm -r ftp.ncbi.nih.gov
# No difference between UniSTS and sts files so move dbSTS.fa to UniSTS dir
# convert dbSTS.fa file to easier reading format, and get accessions
cd /cluster/data/ncbi/UniSTS.2005-04-12
mv /cluster/data/ncbi/sts.2005-04-12/dbSTS.fa .
perl /cluster/bin/scripts/convertGbFaFile dbSTS.fa > dbSTS.convert.fa
grep ">" dbSTS.convert.fa | cut -f 2 -d ">" > dbSTS.acc
ssh kksilo
mkdir -p /cluster/store8/danRer1/STSmarkers
ln -s /cluster/store8/danRer1/STSmarkers \
    /cluster/data/danRer1/bed/STSmarkers
mkdir /cluster/data/danRer1/bed/STSmarkers/downloads
cd /cluster/data/danRer1/bed/STSmarkers/downloads
# use UniSTSParse.pl to get alias names
cp /cluster/data/mm5/bed/STSmarkers/downloads/UniSTSParse.pl .
cp /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.sts .
cp /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases .
# create a UniSTS_Danio_rerio.sts by grepping for lines with "Danio"
grep "Danio" UniSTS.sts > UniSTS_Danio_rerio.sts
perl ./UniSTSParse.pl UniSTS_Danio_rerio.sts UniSTS.aliases \
    > UniSTS_Danio_rerio_alias.0
cat UniSTS_Danio_rerio_alias.0 | sort -u | sort -n \
    > UniSTS_Danio_rerio.alias
# some UniSTS markers for zebrafish have only primers but no name and
# no accession
# Download ZFIN data for all markers for the six mapping panels
wget --timestamp http://zfin.org/data_transfer/Downloads/mappings.txt
# Download ZFIN marker mappings to Genbank accessions
wget --timestamp http://zfin.org/data_transfer/Downloads/genbank.txt
# Download file of old and new names for STS markers from ZFIN
wget --timestamp http://zfin.org/data_transfer/Downloads/marker_alias.txt
grep "STS" genetic_markers.txt > zfinIDSTS.txt
# make file of mapping data
foreach f (/cluster/data/ncbi/UniSTS.2004-10-06/danRer1/*.txt)
    cat $f >> danRer1STSMaps.txt
end
mv /cluster/data/ncbi/UniSTS.2004-10-06/danRer1/danRer1STSMaps.txt \
    /cluster/data/danRer1/bed/STSmarkers/downloads
# Need to download zebrafish ZFIN IDs mapped to GenBank
cd /cluster/data/danRer1/bed/STSmarkers/downloads
wget --timestamp http://zfin.org/data_transfer/Downloads/genbank.txt
# Download ZFIN IDs and mapping panels
wget --timestamp http://zfin.org/data_transfer/Downloads/mappings.txt
# Create new tables for STS so there are fewer columns in the Info table
# Download file of old and new names for STS markers
wget --timestamp http://zfin.org/data_transfer/Downloads/marker_alias.txt
# sort and uniq mappings.txt as there are duplicate lines
sort mappings.txt | uniq > mappings.uniq.txt
# make a .as file for stsMarkers which is similar to the hg17 stsInfo2
# table
# align primer sequences
# create a file of UniSTS IDs and the primer sequences
ssh eieio
cd /cluster/data/ncbi/UniSTS.2005-04-12/danRer
cp /cluster/data/danRer1/bed/STSmarkers/downloads/UniSTS_Danio_rerio* .
awk 'BEGIN {OFS="\t"} {print $1, $2, $3}' UniSTS_Danio_rerio.sts > zfish.primers
# strip out Ns and wobbles (K, S) as isPcr can't handle them.
# there is only 1 K, no Ns and no S. K is T or G so just switch it to T;
# if it is really a G at that position it will just get a mismatch
# (UniSTS ID 204670)
sed -e 's/K/T/' zfish.primers > zfish.primers.isPcr
# only one primer with length < 10, so keep. looks like isPcr can now
# handle shorter primers
mkdir -p /cluster/bluearc/danRer1/stsMarkers/primers
cd /cluster/bluearc/danRer1/stsMarkers/primers
split -l 2000 \
    /cluster/data/ncbi/UniSTS.2005-04-12/danRer/zfish.primers.isPcr primers_
# keep getting error where
# bad char in reverse seq: aacckgagcgggtcagct
# this is the complement of one of the reverse primers, M->K
# use findChars.pl to find primers with characters other than ACTG
cat << 'EOF' > findChars.pl
#!/usr/bin/perl -w
use strict;

# report primer lines containing characters other than ACTG
my @a = glob("/cluster/bluearc/danRer1/stsMarkers/primers/primers_*");
while (defined(my $nextName = shift(@a))) {
    print "Processing $nextName ...\n";
    open(FILE, $nextName) || die "Can not open $nextName: $!\n";
    while (<FILE>) {
        my $l = $_;
        chomp $l;
        if ($l =~ /[^ACTG0-9\t]/) {
            print "$l\n";
        }
    }
    close FILE;
}
'EOF'
# << for emacs
chmod +x findChars.pl
perl findChars.pl
# output is:
# Processing /cluster/bluearc/danRer1/stsMarkers/primers/primers_al ...
# 204670  TGCTGTCGRCATTTACTGGAAC  TGCTTTATCACGTACAGCTGA
# Processing /cluster/bluearc/danRer1/stsMarkers/primers/primers_an ...
# 238222  GTATAGTAGTGTGTATGAGCATCTGG  AGCTGACCCGCTCMGGTT
# since R is A or G and M is C or A,
# change R -> A and M -> C
perl -pi.bak -e 's/R/A/' \
    /cluster/bluearc/danRer1/stsMarkers/primers/primers_al
perl -pi.bak -e 's/M/C/' \
    /cluster/bluearc/danRer1/stsMarkers/primers/primers_an
rm /cluster/bluearc/danRer1/stsMarkers/primers/*.bak
ssh kk
cd /cluster/data/danRer1/bed/STSmarkers/
mkdir primers
cd primers
mkdir run
cd run
ls -1S /iscratch/i/danRer1/trfFa/*.fa > contigs.lst
ls -1S /cluster/bluearc/danRer1/stsMarkers/primers/primers_* > primers.lst
cat > template << 'EOF'
#LOOP
/cluster/home/hartera/bin/i386/isPcr -out=psl -minPerfect=5 -maxSize=5000 -tileSize=10 -ooc=/iscratch/i/danRer1/10.ooc -stepSize=5 $(path1) $(path2) {check out line /cluster/bluearc/danRer1/stsMarkers/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
# may need to add -mask=lower option
mkdir -p /cluster/bluearc/danRer1/stsMarkers/primers/out
gensub2 contigs.lst primers.lst template jobList
para create jobList
# 4340 jobs
para try, para check, para push, para check etc.
# Completed: 4340 of 4340 jobs
# CPU time in finished jobs:       6296s     104.93m     1.75h    0.07d  0.000 y
# IO & Wait Time:                 14235s     237.26m     3.95h    0.16d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              12s       0.20m     0.00h    0.00d
# Submission to last job:            95s       1.58m     0.03h    0.00d
# Check NCBI for latest update for STS markers in dbSTS.
# Our latest version is in store5 for space
ssh eieio
mkdir -p /cluster/store5/sts.2004-10-06
ln -s /cluster/store5/sts.2004-10-06 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.2004-10-06
# The dbSTS.sts does not have as many ids as in the version
# on the NCBI ftp site so get new version.
# update from NCBI
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# checked difference between these and sts.9 files - no Danio sequences
# added
mkdir -p /cluster/store5/UniSTS.2004-10-06
ln -s /cluster/store5/UniSTS.2004-10-06 /cluster/data/ncbi
cd /cluster/data/ncbi/UniSTS.2004-10-06
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamp -r -l1 \
    ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/
mkdir danRer1
cp ./ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/* \
    ./danRer1
rm -r ftp.ncbi.nih.gov
# No difference between UniSTS and sts files so move dbSTS.fa to UniSTS dir
# convert dbSTS.fa file to easier reading format, and get accessions
cd /cluster/data/ncbi/UniSTS.2004-10-06
mv /cluster/data/ncbi/sts.2004-10-06/dbSTS.fa .
perl /cluster/bin/scripts/convertGbFaFile dbSTS.fa > dbSTS.convert.fa
grep ">" dbSTS.convert.fa | cut -f 2 -d ">" > dbSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
# create stsInfo2.bed.prev and all.STS.fa.prev as empty files
cp /dev/null stsInfo2.bed.prev
cp /dev/null all.STS.fa.prev
updateStsInfo -verbose=1 -gb=dbSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
    dbSTS.sts dbSTS.aliases dbSTS.convert.fa new
# this creates a valid new.primers and new.fa file
mv new.primers all_primers
mv new.fa all.STS.fa
# above does not work well without previous information to add
# DONT USE THESE FILES - in a test dir
ssh kksilo
mkdir -p /cluster/store8/danRer1/STSmarkers
ln -s /cluster/store8/danRer1/STSmarkers \
    /cluster/data/danRer1/bed/STSmarkers
mkdir /cluster/data/danRer1/bed/STSmarkers/downloads
cd /cluster/data/danRer1/bed/STSmarkers/downloads
# use UniSTSParse.pl to get alias names
cp /cluster/data/mm5/bed/STSmarkers/downloads/UniSTSParse.pl .
cp /cluster/data/ncbi/UniSTS.2004-10-06/UniSTS.sts .
cp /cluster/data/ncbi/UniSTS.2004-10-06/UniSTS.aliases .
# create a UniSTS_Danio_rerio.sts by grepping for lines with "Danio"
grep "Danio" UniSTS.sts > UniSTS_Danio_rerio.sts
perl ./UniSTSParse.pl UniSTS_Danio_rerio.sts UniSTS.aliases \
    > UniSTS_Danio_rerio_alias.0
cat UniSTS_Danio_rerio_alias.0 | sort -u | sort -n \
    > UniSTS_Danio_rerio.alias
# Download ZFIN data for all markers
wget --timestamp http://zfin.org/data_transfer/Downloads/genetic_markers.txt
grep "STS" genetic_markers.txt > zfinIDSTS.txt
# make file of mapping data
foreach f (/cluster/data/ncbi/UniSTS.2004-10-06/danRer1/*.txt)
    cat $f >> danRer1STSMaps.txt
end
mv /cluster/data/ncbi/UniSTS.2004-10-06/danRer1/danRer1STSMaps.txt \
    /cluster/data/danRer1/bed/STSmarkers/downloads
# Need to download zebrafish ZFIN IDs mapped to GenBank
cd /cluster/data/danRer1/bed/STSmarkers/downloads
wget --timestamp http://zfin.org/data_transfer/Downloads/genbank.txt
# Download ZFIN IDs and mapping panels
wget --timestamp http://zfin.org/data_transfer/Downloads/mappings.txt
# Create new tables for STS so there are fewer columns in the Info table
# Download file of old and new names for STS markers
wget --timestamp http://zfin.org/data_transfer/Downloads/marker_alias.txt
# make a .as file for stsMarkers which is similar to the hg17 stsInfo2
# table
cat << '_EOF_' > $HOME/kent/src/hg/lib/stsMarkers.as
table stsMarkers
"Constant STS marker information - Oct 2004 revision"
    (
    uint identNo; "UCSC identification number"
    string name; "Official UCSC name"
    uint gbCount; "Number of related GenBank accessions"
    string[gbCount] genbank; "Related GenBank accessions"
    string OtherDb; "Name of other database of marker IDs"
    uint otherDbCount; "Number of related other database identifiers"
    string[otherDbCount] gdb; "Related other database identifiers"
    uint nameCount; "Number of alias names"
    string[nameCount] otherNames; "Alias names"
    uint dbSTSid; "ID number in UniSTS or dbSTS"
    string leftPrimer; "5' primer sequence"
    string rightPrimer; "3' primer sequence"
    string distance; "Length of STS sequence"
    string organism; "Organism for which STS discovered"
    uint sequence; "Whether the full sequence is available (1) or not (0) for STS"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql stsMarkers.as stsMarkers
mv stsMarkers.h $HOME/kent/src/hg/inc
cat << '_EOF_' > $HOME/kent/src/hg/lib/stsMapping.as
table stsMapping
"STS marker and its position on golden path"
    (
    string chrom; "Chromosome or 'unknown'"
    int chromStart; "Start position in chrom - negative 1 if unpositioned"
    uint chromEnd; "End position in chrom"
    string name; "Name of STS marker"
    uint score; "Score of a marker = 1000/(#placements) when placed uniquely, else 1500/(#placements) when placed in multiple locations"
    uint identNo; "Identification number of STS"
    string ctgAcc; "Contig accession number"
    string otherAcc; "Accession number of other contigs that the marker hits"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql stsMapping.as stsMapping
mv stsMapping.h $HOME/kent/src/hg/inc
cat << '_EOF_' > $HOME/kent/src/hg/lib/stsMapPos.as
table stsMapPos
"STS marker and its position on genetic maps"
    (
    uint identNo; "Identification number of STS"
    string mapName; "Name or source of genetic map"
    string name; "Name of marker in map"
    string chrom; "Chromosome in map"
    float pos; "Position in map"
    float LOD; "LOD score in map"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql stsMapPos.as stsMapPos
mv stsMapPos.h $HOME/kent/src/hg/inc
# stsAlias already exists in $HOME/kent/src/hg/lib/
# COMMIT THESE LATER
# do make to check it works and commit the .as, .sql, .c and .h files
# to CVS
# cluster run of zebrafish sts sequences
ssh kksilo
cd /cluster/data/danRer1/bed/STSmarkers/downloads
# use list of zebrafish UniSTS entry accessions
awk 'BEGIN {FS="\t"} {print $7;}' UniSTS_Danio_rerio.sts \
    | sed -e 's/;/\n/g' > danRer.accs
sort danRer.accs | uniq > danRer.uniq.accs
# 26894 lines; 26955 accessions; 17312 unique accessions
$HOME/bin/i386/faSomeRecords \
    /cluster/data/ncbi/UniSTS.2004-10-06/dbSTS.convert.fa \
    /cluster/data/danRer1/bed/STSmarkers/downloads/danRer.uniq.accs \
    /cluster/data/danRer1/bed/STSmarkers/downloads/danRerdbSTS.fa
# got 6045 sequences, 11267 not found
# sequences not found may all be in GenBank as ESTs; those found
# by searching were in GenBank
# Get sequences from GenBank
ssh kksilo
mkdir -p /cluster/data/danRer1/bed/STSmarkers/sequences
cd /cluster/data/danRer1/bed/STSmarkers/sequences
echo 'restrict org = "Danio rerio"' > sts.filt
gbToFaRa sts.filt danRer.fa danRer.ra /dev/null \
    /cluster/data/genbank/data/download/genbank.143.0/gbsts*.gz
# still does not have these missing records that are in dbEST
# try getting these sequences that are missing from the EST db
# make list of these missing accessions
grep '>' danRerdbSTS.fa | sed -e 's/>//' | sort > drSTS.accs.sort
# 6045 accs found here, 11266 missing; 17311 in danRer.uniq.acc2
comm -13 drSTS.accs.sort danRer.uniq.acc2 > notFound.accs
wc -l notFound.accs
# 11266
getRna hg17 notFound.accs notIndbSTSFASTA.fa > notInESTs.accs
# 11250 accs found so 16 to find still
awk '{print $1}' notInESTs.accs > accsToFind
# get these sequences from Batch Entrez at NCBI to batchseq.fa
perl /cluster/bin/scripts/convertGbFaFile batchseq.fa > batchseq.convert.fa
# cat sequences together
cat danRerdbSTS.fa notIndbSTSFASTA.fa batchseq.convert.fa \
    > allSTSMarkerSeqs.fa
# split sequences to map these to the genome
mkdir -p /cluster/bluearc/stsMarker/zebrafishMarker/split
cd /cluster/bluearc/stsMarker/zebrafishMarker/split
faSplit sequence \
    /cluster/data/danRer1/bed/STSmarkers/downloads/allSTSMarkerSeqs.fa 50 sts
# do the blat run
ssh kk
cd /cluster/data/danRer1/bed/STSmarkers
mkdir run
cd run
ls -1 /iscratch/i/danRer1/trfFa/*.fa > genome.lst
ls -1S /cluster/bluearc/stsMarker/zebrafishMarker/split/sts*.fa > sts.lst
mkdir -p /cluster/bluearc/danRer1/stsMarkers/sts/out
foreach f (`cat sts.lst`)
    set d = $f:t:r
    mkdir \
        /cluster/bluearc/danRer1/stsMarkers/sts/out/$d
end
# create alignments
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/cluster/bluearc/danRer1/11.ooc -stepSize=5 {check out line+ /cluster/bluearc/danRer1/stsMarkers/sts/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 genome.lst sts.lst template jobList
para create jobList
# 15190 jobs
para try, check, push .....
# para time
# Completed: 15190 of 15190 jobs
# CPU time in finished jobs:      61925s    1032.09m    17.20h    0.72d  0.002 y
# IO & Wait Time:                 51578s     859.63m    14.33h    0.60d  0.002 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest job:                       49s       0.82m     0.01h    0.00d
# Submission to last job:           187s       3.12m     0.05h    0.00d
ssh kolossus
cd /cluster/bluearc/danRer1/stsMarkers/sts
pslSort dirs raw.psl temp out/*
rm -rf temp
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \
    stsMarkers.psl /dev/null
# Processed 2574993 alignments
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.lifted.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn stsMarkers.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl
# creates .initial
/cluster/bin/scripts/findAccession2 -agp stsMarkers.lifted.psl.initial \
    /cluster/data/danRer1
sort -k 4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final
# determine how many were mapped
cut -f 4 stsMarkers.final | sort -n | uniq > stsMarkers.found
wc -l stsMarkers.found
# 12365 stsMarkers.found out of 17311
# retry with less stringent pslReps
pslReps -nearTop=0.0001 -minAli=0.8 -noIntrons raw.psl \
    stsMarkers2.psl /dev/null
# Processed 2574993 alignments
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers2.lifted.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn stsMarkers2.psl
/cluster/bin/scripts/extractPslInfo stsMarkers2.lifted.psl
# creates .initial
/cluster/bin/scripts/findAccession2 -agp stsMarkers2.lifted.psl.initial \
    /cluster/data/danRer1
sort -k 4n stsMarkers2.lifted.psl.initial.acc > stsMarkers2.final
# determine how many were mapped
cut -f 4 stsMarkers2.final | sort -n | uniq > stsMarkers2.found
wc -l stsMarkers2.found
# 16310 stsMarkers2.found out of 17311
# get mappings.txt from ZFIN
ssh kksilo
mkdir /cluster/data/danRer1/bed/STSmarkers/downloads/panels
cd /cluster/data/danRer1/bed/STSmarkers/downloads/panels
foreach m (GAT HS T51 LN54 MGH MOP)
    echo "# Map Name: ${m}" >> $m.txt
    grep $m ../mappings.txt >> $m.txt
end
cat *.txt >> zfin.maps
# 2005-06-13
# use formatZfishSts to bring all information together about STS markers
$HOME/bin/i386/formatZfishSts
# USAGE: formatZfishSts [-verbose=]
mkdir /cluster/data/danRer1/bed/STSmarkers/downloads/test
cd /cluster/data/danRer1/bed/STSmarkers/downloads/test
cat /cluster/data/ncbi/UniSTS.2005-04-12/danRer/*.txt >> UniSTSmaps.txt
awk 'BEGIN {FS="\t"} {print $2;}' UniSTSmaps.txt > UniSTSmaps.names
sort UniSTSmaps.names | uniq > UniSTSmaps.names.sort
# remove blank line at top
awk 'BEGIN {FS="\t"} {print $3;}' markers.format | sort | uniq \
    > UniSTS.names.sort
comm -12 UniSTS.names.sort UniSTSmaps.names.sort > UniSTSnamesvsmaps
comm -13 UniSTS.names.sort UniSTSmaps.names.sort > onlyinUniSTSmaps.names
awk 'BEGIN {FS="\t"} {print $4;}' markers.format | sort | uniq \
    > ZFIN.names.sort
comm -12 ZFIN.names.sort onlyinUniSTSmaps.names > ZFINnamesvsmaps
# 14350 ZFINnamesvsmaps
# 47197 onlyinUniSTSmaps.names
comm -13 ZFIN.names.sort onlyinUniSTSmaps.names > onlyinUniSTSmaps.names2
# 32847 onlyinUniSTSmaps.names2
awk 'BEGIN {FS="\t"} {print $6;}' markers.format > aliases.names
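# (On the comm bookkeeping above: comm -12 keeps names common to both
# sorted inputs and comm -13 those only in the second, so the two
# outputs partition onlyinUniSTSmaps.names cleanly:
# 14350 + 32847 = 47197.)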
    # get mappings.txt from ZFIN
    ssh kksilo
    mkdir /cluster/data/danRer1/bed/STSmarkers/downloads/panels
    cd /cluster/data/danRer1/bed/STSmarkers/downloads/panels
    foreach m (GAT HS T51 LN54 MGH MOP)
        echo "# Map Name: ${m}" >> $m.txt
        grep $m ../mappings.txt >> $m.txt
    end
    cat *.txt >> zfin.maps
    # (2005-06-13) use formatZfishSts to bring together all the information
    # about the STS markers
    $HOME/bin/i386/formatZfishSts
    # USAGE: formatZfishSts [-verbose=]
    mkdir /cluster/data/danRer1/bed/STSmarkers/downloads/test
    cd /cluster/data/danRer1/bed/STSmarkers/downloads/test
    cat /cluster/data/ncbi/UniSTS.2005-04-12/danRer/*.txt >> UniSTSmaps.txt
    awk 'BEGIN {FS="\t"} {print $2;}' UniSTSmaps.txt > UniSTSmaps.names
    sort UniSTSmaps.names | uniq > UniSTSmaps.names.sort
    # remove the blank line at the top
    awk 'BEGIN {FS="\t"} {print $3;}' markers.format | sort | uniq \
        > UniSTS.names.sort
    comm -12 UniSTS.names.sort UniSTSmaps.names.sort > UniSTSnamesvsmaps
    comm -13 UniSTS.names.sort UniSTSmaps.names.sort > onlyinUniSTSmaps.names
    awk 'BEGIN {FS="\t"} {print $4;}' markers.format | sort | uniq \
        > ZFIN.names.sort
    comm -12 ZFIN.names.sort onlyinUniSTSmaps.names > ZFINnamesvsmaps
    # 14350 ZFINnamesvsmaps
    # 47197 onlyinUniSTSmaps.names
    comm -13 ZFIN.names.sort onlyinUniSTSmaps.names > onlyinUniSTSmaps.names2
    # 32847 onlyinUniSTSmaps.names2
    awk 'BEGIN {FS="\t"} {print $6;}' markers.format > aliases.names

# ENABLE AUTO BUILD MGC GENES TRACK (DONE markd)
    - add the following lines to the danRer1 entry in
      /cluster/data/genbank/etc/genbank.conf:
        danRer1.mgcTables.default = full
        danRer1.mgcTables.mgc = all
    - wait one day for the nightly build to align and load them into the db
    - rebuild trackDb

# LIFTOVER CHAINS TO DANRER2 (DONE, 2005-07-20, hartera)
# Added path for makeLo scripts (DONE, 2005-07-28, hartera)
# CLEAN UP OF DIRECTORIES AND FILES (DONE, 2006-08-11, hartera)
    # Create liftOver chains for only chr1-25 because chrFinished, chrUn
    # and chrNA will cause too many false alignments.
    # run alignment
    # NOTE: the split of danRer2 to /iscratch/i is doc'ed in makeDanRer2.doc
    # make a nibDir with no chrUn and chrNA
    ssh kkr1u00
    mkdir /iscratch/i/danRer1/nibNoUnandNA
    cp /iscratch/i/danRer1/nib/* /iscratch/i/danRer1/nibNoUnandNA/
    cd /iscratch/i/danRer1/nibNoUnandNA/
    rm chrUn.nib chrNA.nib
    iSync
    mkdir /iscratch/i/danRer1/nibUnandNA
    cp /iscratch/i/danRer1/nib/chrUn.nib /iscratch/i/danRer1/nibUnandNA/
    cp /iscratch/i/danRer1/nib/chrNA.nib /iscratch/i/danRer1/nibUnandNA/
    iSync
    ssh kk
    cd /cluster/data/danRer1
    set dir = /cluster/bin/scripts
    # run 1-25, Finished and M for danRer1 vs 1-25 and M for danRer2
    $dir/makeLoChain-align danRer1 /iscratch/i/danRer1/nibNoUnandNA \
        danRer2 /iscratch/i/danRer2/liftOver/liftSplit/split
    # run Un and NA for danRer1 separately as these could take a long time
    $dir/makeLoChain-align danRer1 /iscratch/i/danRer1/nibUnandNA \
        danRer2 /iscratch/i/danRer2/liftOver/liftSplit/split
    cd /cluster/data/danRer1/bed/blat.danRer2.2005-05-24/run
    para try, check, push etc.
    # do this next:
    # ssh kkusr01
    # /cluster/bin/scripts/makeLoChain-lift.csh danRer1 danRer2
    # Created parasol job in bed/blat.danRer1.2005-05-18/run
    cd bed
    rm blat.danRer2
    ln -s blat.danRer2.2005-05-18 blat.danRer2
    cd blat.danRer2/run
    para try, check, push, check etc.
    # para time
    # Completed: 702 of 702 jobs
    # CPU time in finished jobs:  27720091s  462001.51m  7700.03h  320.83d  0.879 y
    # IO & Wait Time:                50264s     837.74m    13.96h    0.58d  0.002 y
    # Average job time:              39559s     659.32m    10.99h    0.46d
    # Longest running job:               0s       0.00m     0.00h    0.00d
    # Longest finished job:         156623s    2610.38m    43.51h    1.81d
    # Submission to last job:       541963s    9032.72m   150.55h    6.27d
    ssh kk
    cd /cluster/data/danRer1
    set dir = /cluster/bin/scripts
    $dir/makeLoChain-align danRer1 /iscratch/i/danRer1/nib \
        danRer2 /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
    # Created parasol job in bed/blat.danRer1.2005-05-31/run
    cd bed/blat.danRer2.2005-05-31/run
    para try, check, push, check etc.
    # Lift the results
    # the lift directory was defined in makeHg17.doc when the split was
    # performed; this expects data in bed/blat.hg17, so a symlink must
    # be there
    # use kolossus for speed
    ssh kolossus
    cd /cluster/data/danRer1/bed
    set dir = /cluster/bin/scripts
    # for the script to work, the directory must have today's date
    mv blat.danRer2.2005-05-18 blat.danRer2.2005-07-19
    cd /cluster/data/danRer1/bed/blat.danRer2.2005-07-19
    $dir/makeLoChain-lift danRer1 danRer2 \
        /iscratch/i/danRer2/liftOver/liftSplit/lift >&! lift.log &
    tail -100f lift.log
    cd ..
    mv blat.danRer2.2005-07-19 blat.danRer2.2005-05-18
    # do this manually as the script stops after chr1 - the same problem
    # was seen with mm5->mm6
    # when all the runs are finished, combine all the results into the raw
    # dir in blat.danRer2 and do the lift
    ssh kk
    cd /cluster/data/danRer1/bed/blat.danRer2/raw
    bash    # if not working in bash already
    for nib in `ls /cluster/data/danRer2/nib`; do
        chrom=${nib%.nib}
        echo $chrom
        liftUp -pslQ ../psl/${chrom}.psl \
            /iscratch/i/danRer2/liftOver/liftSplit/lift/${chrom}.lft \
            warn chr*_${chrom}.psl
        echo done $chrom
    done
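    # (illustration, not from the original run) for one chromosome the loop
    # above expands to:
    #   liftUp -pslQ ../psl/chr1.psl \
    #       /iscratch/i/danRer2/liftOver/liftSplit/lift/chr1.lft \
    #       warn chr*_chr1.psl
    # -pslQ lifts the query side of each psl (the split danRer2 pieces) back
    # to whole-chromosome danRer2 coordinates, leaving the danRer1 target
    # side untouched.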
    rm ../psl/chrUn.psl ../psl/chrNA.psl
    # Un and NA are empty psls as these were not run
    ssh kk9
    set dir = /cluster/bin/scripts
    cd /cluster/data/danRer1/bed
    ln -s blat.danRer2.2005-05-18 blat.danRer2.2005-07-19
    $dir/makeLoChain-chain danRer1 /cluster/data/danRer1/nib danRer2 \
        /cluster/data/danRer2/nib
    cd /cluster/data/danRer1/bed/blat.danRer2.2005-07-19/chainRun
    para try, check, push, check etc.
    # para time
    # Completed: 26 of 26 jobs
    # CPU time in finished jobs:       2501s      41.68m     0.69h    0.03d  0.000 y
    # IO & Wait Time:                  1773s      29.55m     0.49h    0.02d  0.000 y
    # Average job time:                 164s       2.74m     0.05h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:             210s       3.50m     0.06h    0.00d
    # Submission to last job:           504s       8.40m     0.14h    0.01d
    # make the alignment net; needs access to kkusr01
    ssh kolossus
    cd /cluster/data/danRer1/bed/blat.danRer2
    # remove chrFinished - this also has Ns between contigs and could
    # cause false alignments due to alignments bridging contigs
    rm /cluster/data/danRer1/bed/blat.danRer2/chainRaw/chrFinished.chain
    # run the programs as in /cluster/bin/scripts/makeLoChain-net
    # the script aborts if it is not run on the fileserver for this
    # directory, and there is no local login to kkusr01 at the moment
    # sort chains
    rm -fr chain
    mkdir chain
    chainMergeSort chainRaw/*.chain | chainSplit chain stdin
    rm ./chain/chrFinished.chain
    # create the net
    rm -fr net
    mkdir net
    cd chain
    foreach i (*.chain)
        echo $i:r
        chainNet $i /cluster/data/danRer1/chrom.sizes \
            /cluster/data/danRer2/chrom.sizes ../net/$i:r.net /dev/null
        echo done $i
    end
    rm -fr ../over
    mkdir ../over
    foreach i (*.chain)
        echo $i:r
        netChainSubset ../net/$i:r.net $i ../over/$i
    end
    cat ../over/*.chain > ../danRer1ToDanRer2.over.chain
    mkdir -p /cluster/data/danRer1/bed/bedOver
    cp ../danRer1ToDanRer2.over.chain /cluster/data/danRer1/bed/bedOver/

    # load into the database and copy to the download directory
    ssh hgwdev
    # makeLoChain-load does not make the database entry so do this manually
    # save to the download area
    mkdir -p /usr/local/apache/htdocs/goldenPath/danRer1/liftOver
    cd /usr/local/apache/htdocs/goldenPath/danRer1
    gzip -c /cluster/data/danRer1/bed/bedOver/danRer1ToDanRer2.over.chain \
        > liftOver/danRer1ToDanRer2.over.chain.gz
    # load into the database
    mkdir -p /gbdb/danRer1/liftOver
    rm -f /gbdb/danRer1/liftOver/danRer1ToDanRer2.over.chain
    ln -s /cluster/data/danRer1/bed/bedOver/danRer1ToDanRer2.over.chain \
        /gbdb/danRer1/liftOver
    # need to add an entry to hgcentraltest.liftOverChain
    echo 'insert into liftOverChain values("danRer1", "danRer2", "/gbdb/danRer1/liftOver/danRer1ToDanRer2.over.chain", "0.95", "0", "0", "N", "1", "N");' \
        | hgsql hgcentraltest
    # check the entry in the db
    echo 'select * from liftOverChain where fromDb = "danRer1";' \
        | hgsql hgcentraltest
    # add an entry to the pushQ in the liftOver subqueue - liftOverQueue
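    # To sanity-check the loaded chain, a few danRer1 regions can be lifted
    # with the liftOver command-line tool (a minimal sketch; test.bed is a
    # hypothetical file of danRer1 BED regions):
    #   liftOver test.bed \
    #       /cluster/data/danRer1/bed/bedOver/danRer1ToDanRer2.over.chain \
    #       test.danRer2.bed test.unmapped.bed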
    # clean up: remove the old directories for the blat to danRer2
    # (hartera, 2006-08-11)
    ssh kkstore03
    cd /cluster/data/danRer1/bed
    rm -r blat.danRer2.2005-05-24
    rm -r blat.danRer2.2005-05-31
    cd blat.danRer2.2005-05-18
    rm -r chainRaw chainRun raw run
    rm lift.log
    gzip chain/*.chain
    gzip net/*.net
    gzip psl/*.psl
    gzip over/*.chain
    gzip danRer1ToDanRer2.over.chain

#########################################################################
# ARCHIVING danRer1 (in progress, 2006-05-03, hartera)
    ssh hgwdev
    # remove the Blat servers for danRer1 on hgwbeta and the RR
    hgsql -h hgwbeta -e 'delete from blatServers where db = "danRer1";' \
        hgcentralbeta
    hgsql -h genome-centdb -e 'delete from blatServers where db = "danRer1";' \
        hgcentral
    # set active = 0 for danRer1 on hgwbeta and the RR
    hgsql -h hgwbeta -e 'update dbDb set active = 0 where name = "danRer1";' \
        hgcentralbeta
    hgsql -h genome-centdb -e \
        'update dbDb set active = 0 where name = "danRer1";' hgcentral
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
    hgsql -h genome-testdb hgcentraltest \
        -e "update dbDb set orderKey = 453 where name = 'danRer1';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports the `NCBI Clone Validation' section of the mgcGenes details page
    # genbank release 157.0 now contains misc_diff fields for MGC clones
    # reloading the mRNAs results in the gbMiscDiff table being created
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer1
##########################################################################
# CLEANUP (DONE, 2007-03-27, hartera)
    ssh kkstore04
    cd /cluster/data/danRer1/bed
    # quality scores were not used as there is no way to relate them to
    # the chroms
    du -sh quality
    # 9.7G    quality/
    nice rm -r quality
    mv /cluster/store8/danRer1/bed/blastzSelf.2004-10-06 \
        /cluster/data/danRer1/bed/
    # then re-make the symbolic link to this directory
    rm blastzSelf
    ln -s /cluster/data/danRer1/bed/blastzSelf.2004-10-06 \
        /cluster/data/danRer1/bed/blastzSelf
    # remove the older directory
    du -sh blastzSelf.2004-08-09
    # 21M     blastzSelf.2004-08-09
    rm -r blastzSelf.2004-08-09
    # the STS markers never made it into a track - messy data
    du -sh STSmarkers
    # 1.1G    STSmarkers/
    cd /cluster/store8/danRer1
    rm -r STSmarkers
    cd /cluster/data/danRer1/bed/
    rm STSmarkers
    du -sh ZonLab
    # 112G    ZonLab/
    cd /cluster/data/danRer1/bed/ZonLab
    mv rhSeqAll/rhSeqAll.zip .
    rm -r rhSeqAll
    # du -sh tblastn
    # 103G    tblastn
    cd tblastn
    nice rm -r tmp psl
    rm kgpephg16table.notinfile hg16kgproseqs* *.tab
    # remove the *.out files of blast output as we have the *.format files,
    # and gzip the *.format files
    foreach f (./blastout/*.out)
        nice rm -r $f
    end
    foreach f (./blastout/*.format)
        nice gzip $f
    end
    # BACENDS CLEANUP
    cd /cluster/data/danRer1/bed/bacends
    rm out log jobList *.lst bed.tab pslCheck.log pairsNoReps.bed template
    rm bacEndsPsl*
    # bacEnds.lifted.psl is just the lifted version of bacEnds.psl
    rm bacEnds.psl
    nice gzip bacEnds.lifted.psl
    rm singles/bacEnds*
    rm pairs/bacEnds* pairs/psl.tab.gz
    rm duplicates/bed.tab.gz duplicates/test.gz
    cd scores
    rm bed.tab.gz error.log.gz bacEndPairs.out* singles* pairs* badPairs.hits*
    cd ../cloneandStsAliases
    rm bacClones* bacEnds.* bacPrs.* *.sort markers* sanger* xref.* *.tab
    rm -r old test1 test2 testTables testUniSTS tmp1
    cd /cluster/data/danRer1/bed/
    du -sh ZonLab
    # 44G     ZonLab/
    # removed 59 GB
    # move the presentations out to another directory
    ssh hgwdev
    mv /cluster/data/danRer1/bed/ZonLab/presn/* \
        /cse/staff/hartera/presentations/
    rm -r /cluster/data/danRer1/bed/ZonLab/presn
##########################################################################
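# (addendum, a sketch - not part of the original cleanup) to audit what
# still occupies space under the bed directory after these removals:
#   du -sk /cluster/data/danRer1/bed/* | sort -n | tail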