# for emacs: -*- mode: sh; -*-

# Danio rerio (zebrafish) from Sanger, version Zv4 (released 6/30/04)

# Project website:
#   http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
#   http://www.sanger.ac.uk/Projects/D_rerio/Zv4_assembly_information.shtml

# NOTE: Error in scaffolds agp file. Notified Sanger and got new scaffolds
# agp and recreated FASTA files from this new one (2004-11-29).
# The previous agp file was missing a scaffold from the end of most
# chromosomes.
# There is also a chrUn set of scaffolds that are in the chunks agp file -
# these just have the identifier Zv4_scaffoldN instead of a chromosome
# number. They are scaffolds that correspond to FPC contigs but their
# position is unknown so they are not mapped to a chromosome.

# DOWNLOAD SEQUENCE (DONE, 2004-10-18, hartera)
# ERRORS IN SCAFFOLDS AGP SO GET NEW AGP FROM SANGER AND DOWNLOAD
# (hartera, 2004-11-29) from Mario Caccamo: mc2@sanger.ac.uk
    ssh kksilo
    mkdir /cluster/store8/danRer2
    ln -s /cluster/store8/danRer2 /cluster/data/danRer2
    cd /cluster/data/danRer2
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/README
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/stats
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.chunks.agp
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.scaffolds.agp
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.fasta
    # get new agp file and download to /cluster/data/danRer2
    # (hartera, 2004-11-29)
    # Remove all chrom directories to start processing with new agp file
    ssh kksilo
    cd /cluster/data/danRer2
    foreach c (`cat chrom.lst`)
        rm -r $c
    end

# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2004-10-18, hartera)
# DOWNLOAD SEQUENCE AND CREATE FILES AGAIN (hartera, 2004-11-30)
# ADD chrM.chunks.agp (DONE, 2004-12-06, hartera)
# Error reported by a user: chrM.agp is space delimited rather
# than tab delimited so correct this. NCBI defines the agp format as
# being tab delimited. (DONE, 2005-04-25, hartera)
    ssh kksilo
    mkdir -p /cluster/data/danRer2/M
    cd /cluster/data/danRer2/M
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "Danio mitochondrion genome". That shows the gi number:
    # 8576324 for the accession, AC024175
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
    # Edit chrM.fa: make sure the header line says it is the
    # Danio rerio mitochondrion complete genome, and then replace the
    # header line with just ">chrM".
    perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
    rm *.bak
    # Make a "pseudo-contig" for processing chrM too:
    mkdir ./chrM_1
    sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
    mkdir ./lift
    echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
    echo "chrM_1" > ./lift/ordered.lst
    echo "0 M/chrM_1 16596 chrM 16596" > ./lift/ordered.lft
    # create a .agp file for chrM as hgGoldGapGl and other
    # programs require a .agp file so create chrM.agp
cat << '_EOF_' > ./chrM.agp
chrM 1 16596 1 F AC024175.3 1 16596 +
'_EOF_'
    # Create a chrM.chunks.agp (hartera, 2004-12-06)
    cd /cluster/data/danRer2/M/agps
    awk 'BEGIN {OFS="\t"} \
        {print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' ../chrM.agp \
        > chrM.chunks.agp
    # chrM.agp is space delimited instead of tab delimited
    # so correct this (hartera, 2005-04-25)
    cd /cluster/data/danRer2/M
    # edit chrM.agp and change field delimiters from spaces to tabs.
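    # One way to make that edit non-interactively (a sketch; the file was
    # fixed by hand in an editor, and the run-of-spaces pattern assumes no
    # field itself contains a space):
    perl -pi.bak -e 's/ +/\t/g' chrM.agp
    rm chrM.agp.bak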
# add this new file to the zipped files of agps and get it pushed to the
# downloads area - see "MAKE DOWNLOADABLE SEQUENCE FILES" section of
# this make doc.

# Create list of chromosomes (DONE, 2004-10-18, hartera)
# Add "M" for mitochondrion chromosome (2004-10-25, hartera)
# Add "Un" for chrUn (2004-11-29, hartera)
    ssh kksilo
    cd /cluster/data/danRer2
    awk '{print $1;}' Zv4.scaffolds.agp | sort -n | uniq > chrom.lst
    # add NA - these are contigs in the chunks agp
    echo "NA" >> chrom.lst
    # add chrM
    echo "M" >> chrom.lst
    # add chrUn
    echo "Un" >> chrom.lst

# SPLIT AGP FILES BY CHROMOSOME (DONE, 2004-10-19, hartera)
# AGP USED TO CREATE FASTA WAS SCAFFOLDS AGP
# RE-DO SPLITTING AGP FILES AFTER GETTING NEW SCAFFOLDS AGP
# (hartera, 2004-11-29)
    ssh kksilo
    cd /cluster/data/danRer2
    # There are 2 .agp files: one for scaffolds (supercontigs on danRer1)
    # and one for chunks (contigs on danRer1) showing how they map on to
    # scaffolds.
    # add "chr" prefix in the agp files
    perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
    # for chromosomes:
    foreach c (`cat chrom.lst`)
        mkdir $c
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
            ./Zv4.chunks.agp > $c/chr$c.chunks.agp
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
            ./Zv4.scaffolds.agp > $c/chr$c.scaffolds.agp
    end
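    # (sketch) sanity check the split: together the per-chromosome
    # scaffolds agps should account for every chr-prefixed line in the
    # genome-wide file, so these two counts should match:
    cat */chr*.scaffolds.agp | wc -l
    grep -c "^chr" Zv4.scaffolds.agp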
# CREATE AGP AND FASTA FOR chrNA (DONE, 2004-10-25, hartera)
# REMAKE AGP WITH NEW SCAFFOLDS AGP FILE AND CREATE chrUn AGP
# (hartera, 2004-11-29)
    ssh kksilo
    cd /cluster/data/danRer2
    # for NA, make agp files
    grep "Zv4_NA" Zv4.chunks.agp > NA.chunks.agp
    # make a scaffolds agp for NA - use first 9 fields of chunks file
    # and remove ".1" from 6th field
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5, $6, $7, $8, $9}' \
        NA.chunks.agp | perl -pe 's/(Zv4_NA[0-9]+)\.1+/$1/' \
        > NA.scaffolds.agp
    # move agps to NA directory created above
    mv NA.scaffolds.agp NA.chunks.agp ./NA
    # from scaffolds agp, get names of scaffolds to extract from the
    # FASTA file for NA
    foreach c (NA)
        awk '{print $1;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
        $HOME/bin/i386/faSomeRecords /cluster/data/danRer2/Zv4.fasta \
            $c/chr$c.scaffolds.lst $c/chr$c.fa
    end
    # create agp with 1000 Ns between scaffolds as contig gaps for chrNA
    foreach c (NA)
        $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
        mv $c/chr$c.fa $c/chr$c.scaffolds.fa
        perl -pi -e "s/chrUn/chr$c/" $c/chr$c.*
    end
    # change the type to "W" in the agp for WGS
    perl -pi.bak -e "s/D/W/;" NA/chrNA.agp
    cd NA
    mv chrNA.agp chrNA.scaffolds.agp
    rm *.bak
    # use this chrNA.scaffolds.agp to create chrNA.chunks.agp
cat << '_EOF_' > /cluster/data/danRer2/jkStuff/createChunksAgp.pl
#!/usr/bin/perl -w
use strict;
# input is a list of scaffolds and chunks, and a file of chrN.scaffolds.agp
# with Ns.
my $accs = $ARGV[0];
my $scaf = $ARGV[1];

open (ACCS, $accs) || die "Can not open $accs: $!";
open (SCAFS, $scaf) || die "Can not open $scaf: $!";

my %accsHash;
while (<ACCS>) {
    chomp;
    my @fi = split(/\t/);
    $accsHash{$fi[0]} = $fi[1];
}
close ACCS;
while (my $line = <SCAFS>) {
    chomp $line;
    my @f = split(/\t/, $line);
    my $acc;
    if ($f[4] ne "N") {
        if (exists($accsHash{$f[5]})) {
            $acc = $accsHash{$f[5]};
        } else {
            print "$f[5] can not be found\n";
        }
        print "$f[0]\t$f[1]\t$f[2]\t$f[3]\t$f[4]\t$acc\t$f[6]\t$f[7]\t$f[8]";
        print "\t$f[5]\t$f[6]\t$f[7]";
    } else {
        print $line;
    }
    print "\n";
}
'_EOF_'
    chmod +x /cluster/data/danRer2/jkStuff/createChunksAgp.pl
    awk 'BEGIN {OFS="\t";} { if ($1 ~ /^Zv4_NA/) print $1,$6 }' \
        /cluster/data/danRer2/Zv4.chunks.agp > NA.accs
    perl ../jkStuff/createChunksAgp.pl NA.accs chrNA.scaffolds.agp \
        > chrNA.chunks.agp

# Also create agp for chrUn - these are scaffolds that map to FPC contigs
# but are unplaced on the chromosomes
    # for Un, make agp files
    ssh kksilo
    cd /cluster/data/danRer2
    mkdir Un
    # make a scaffolds agp for Un - use first 9 fields of chunks file
    # and remove ".1" from 6th field
    awk 'BEGIN {OFS="\t"} {if ($1 ~ /Zv4_scaffold/) \
        print $1, $2, $3, $4, $5, $6, $7, $8, $9}' \
        Zv4.chunks.agp | perl -pe 's/(Zv4_NA[0-9]+)\.1+/$1/' \
        > Un/Un.scaffolds.agp
    # from scaffolds agp, get names of scaffolds to extract from the
    # FASTA file for Un
    foreach c (Un)
        awk '{print $1;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
        $HOME/bin/i386/faSomeRecords /cluster/data/danRer2/Zv4.fasta \
            $c/chr$c.scaffolds.lst $c/chr$c.fa
    end
    # create agp with 1000 Ns between scaffolds as contig gaps for chrUn
    foreach c (Un)
        $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
        mv $c/chr$c.fa $c/chr$c.scaffolds.fa
    end
    # change the type to "W" in the agp for WGS
    perl -pi.bak -e "s/D/W/;" Un/chrUn.agp
    cd Un
    mv chrUn.agp chrUn.scaffolds.agp
    rm *.bak
    # create chunks agp for chrUn
    # this includes accessions so need to get them from Zv4.chunks.agp
    # modify perl script above to do this (2004-12-06, hartera)
    awk 'BEGIN {OFS="\t";} { if ($1 ~ /^Zv4_/) print $1,$6 }' \
        /cluster/data/danRer2/Zv4.chunks.agp > Un.accs
    perl ../jkStuff/createChunksAgp.pl Un.accs chrUn.scaffolds.agp \
        > chrUn.chunks.agp
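    # Note that createChunksAgp.pl prints its "... can not be found"
    # diagnostics to stdout, i.e. into the same stream as the agp rows,
    # so it is worth checking that none slipped into the output (a sketch;
    # expect 0 for both files):
    grep -c "can not be found" /cluster/data/danRer2/NA/chrNA.chunks.agp \
        /cluster/data/danRer2/Un/chrUn.chunks.agp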
# BUILD CHROM-LEVEL SEQUENCE (DONE, 2004-10-21, hartera)
# Move scaffolds files for NA into scaffolds directory (2004-11-22, hartera)
# RE-BUILD SEQUENCE WITH NEW AGPS FROM CORRECTED SCAFFOLDS AGP
# (2004-11-30, hartera)
    ssh kksilo
    cd /cluster/data/danRer2
    # Sequence is already in upper case so no need to change
    foreach c (`cat chrom.lst`)
        echo "Processing ${c}"
        $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp \
            chr$c $c/chr$c.fa ./Zv4.fasta
        echo "${c} - DONE"
    end
    # some Ns in sequence files are in lower case so change to upper case
    foreach c (`cat chrom.lst`)
        cat $c/chr${c}.fa | tr 'n' 'N' > $c/chr${c}.fa.tr
        if ($c == "Un") then
            perl -pi.bak -e 's/^>chrUN/>chrUn/' $c/chr${c}.fa.tr
        endif
        mv $c/chr${c}.fa.tr $c/chr${c}.fa
    end
    # move scaffolds agp to be chrom agp and clean up (2004-11-30)
    foreach c (`cat chrom.lst`)
        cd $c
        rm *.bak
        cp chr${c}.scaffolds.agp chr${c}.agp
        mkdir agps
        mv chr${c}.*.agp ./agps/
        cd ..
    end
    # move scaffolds files for NA and Un into scaffolds directory
    # (2004-11-22) and again (2004-11-30)
    foreach c (NA Un)
        mkdir -p /cluster/data/danRer2/$c/scaffolds
        cd /cluster/data/danRer2/$c
        mv chr$c.scaffolds.* ./scaffolds
        rm $c.*.agp
        cd ..
    end

# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2004-10-21, hartera)
# CHECKED THESE ARE OK (hartera, 2004-11-30)
# Check that the size of each chromosome .fa file is equal to the
# last coord of the .agp:
    ssh hgwdev
    cd /cluster/data/danRer2
    foreach c (`cat chrom.lst`)
      foreach f ( $c/chr$c.agp )
        set agpLen = `tail -1 $f | awk '{print $3;}'`
        set h = $f:r
        set g = $h:r
        echo "Getting size of $g.fa"
        set faLen = `faSize $g.fa | awk '{print $1;}'`
        if ($agpLen == $faLen) then
            echo "  OK: $f length = $g length = $faLen"
        else
            echo "ERROR: $f length = $agpLen, but $g length = $faLen"
        endif
      end
    end
    # all are OK so the FASTA files are the expected size

# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2004-10-25, hartera)
# RE-DONE (2004-11-30, hartera)
    ssh kksilo
    cd /cluster/data/danRer2
    foreach c (`cat chrom.lst`)
      foreach agp ($c/chr$c.agp)
        if (-e $agp) then
          set fa = $c/chr$c.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end

# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-10-25, hartera)
# This used to hold scripts -- better to keep them inline here.
# Now it should just hold lift file(s) and temporary scripts made by
# copy-paste from this file.
    mkdir /cluster/data/danRer2/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/danRer2/bed

# CREATING DATABASE (DONE, 2004-10-25, hartera)
# Create the database.
    # next machine
    ssh hgwdev
    echo 'create database danRer2' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
    echo 'drop database danRer2' | hgsql danRer2
    # Delete and re-create database as above (hartera, 2004-11-30)
    # Use df to make sure there is at least 10 gig free on /var/lib/mysql:
    df -h /var/lib/mysql
    # Before loading data:
    # Filesystem            Size  Used Avail Use% Mounted on
    # /dev/sdc1             1.8T  637G 1023G  39% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-10-25, hartera)
# RECREATE GRP TABLE (hartera, 2004-11-30)
    # next machine
    ssh hgwdev
    # the following command copies all the data from the table
    # grp in the database danRer1 to the new database danRer2
    echo "create table grp (PRIMARY KEY(NAME)) select * from danRer1.grp" \
      | hgsql danRer2
    # if you need to delete that table:  !!! WILL DELETE ALL grp data !!!
    echo 'drop table grp;' | hgsql danRer2

# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2004-10-26, hartera)
# There is a new repeat library at WUSTL that has new repeats for
# Danio rerio: Dr.repeats.020501
# Add a README about these repeats
    ssh kksilo
    cd /cluster/data/danRer2
    wget --timestamp \
      http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
    mv Dr.repeats.020501 /cluster/bluearc/RepeatMasker/Libraries/danioRW.lib
    # add danioRW.lib to danio.lib
    cd /cluster/bluearc/RepeatMasker/Libraries
    cp danio.lib danioRMandRW.lib
    cat danioRW.lib >> danioRMandRW.lib
    # add type as "Unknown" to this file as these repeats are not
    # classified
    perl -pi.bak -e 's/^(>Dr[0-9]+)/$1#Unknown/' danioRMandRW.lib
    # these new repeats are not classified by type so add "DrRW" as the
    # type later
    # Add a README about these repeats from WUSTL
    wget --timestamp \
      http://www.genetics.wustl.edu/fish_lab/repeats/Readme.txt

    #- Split contigs into 500kb chunks, at gaps if possible:
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ \
            -lift=$contig.lft -minGapSize=100
        cd ../..
      end
    end
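    # (sketch) spot-check one contig: faSplit -lift writes one lift row
    # per 500kb piece, so these two counts should agree:
    wc -l 1/chr1_1/chr1_1.lft
    ls -1 1/chr1_1/chr1_1_*.fa | wc -l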
    #- Make the run directory and job list:
    cd /cluster/data/danRer2
    # use RepeatMasker from January 2004
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/danRer2/$2
/bin/cp $2 /tmp/danRer2/$2/
cd /tmp/danRer2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -lib danioRMandRW.lib $2
popd
/bin/cp /tmp/danRer2/$2/$2.out ./
if (-e /tmp/danRer2/$2/$2.align) /bin/cp /tmp/danRer2/$2/$2.align ./
if (-e /tmp/danRer2/$2/$2.tbl) /bin/cp /tmp/danRer2/$2/$2.tbl ./
if (-e /tmp/danRer2/$2/$2.cat) /bin/cp /tmp/danRer2/$2/$2.cat ./
/bin/rm -fr /tmp/danRer2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2
'_EOF_'
    chmod +x jkStuff/RMZebrafish
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
        set ctg = $d:t
        foreach f ( $d/${ctg}_?{,?}.fa )
          set f = $f:t
          echo /cluster/data/danRer2/jkStuff/RMZebrafish \
               /cluster/data/danRer2/$d $f \
             '{'check out line+ /cluster/data/danRer2/$d/$f.out'}' \
            >> RMRun/RMJobs
        end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/danRer2/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    # para time
    # CPU time in finished jobs:  10326858s  172114.29m  2868.57h  119.52d  0.327 y
    # IO & Wait Time:                33702s     561.71m     9.36h    0.39d  0.001 y
    # Average job time:               3081s      51.35m     0.86h    0.04d
    # Longest job:                    4065s      67.75m     1.13h    0.05d
    # Submission to last job:        39673s     661.22m    11.02h    0.46d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/danRer2
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
          > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/danRer2
    hgLoadOut danRer2 */chr*.fa.out
# When trying to mask the sequence (see below) there are 60 instances of
# negative co-ordinates:
#     2 Dr000074
#    48 Dr000158
#     6 Dr000375
#     1 Dr000511
#     1 Dr000759
#     1 Dr000975
#     5 Dr001181
# Sent sample output from chr1_3_21 to Arian Smit and he suggested that
# Dr000158 is a Satellite. When this classification is added to the FASTA
# header, >Dr000158#Satellite, the negative co-ordinates disappear.
# If the classification is changed to "Unknown" then there are negative
# co-ordinates.
# Took a look at these 7 repeats above and found overlapping matches.
# Yi Zhou at Boston Children's Hospital looked at the repeats and split
# them up into smaller chunks: danioRMandRWsplit.lib
# Running RepeatMasker with this library removed a lot of negative
# co-ordinates but some new ones appeared. There are 7 instances of
# negative co-ordinates: TDR5, (TAGA)n, Dr000355, Dr001182.
# Dr001182 has two repeats, the second being an imperfect replicate of the
# first, so this was split into two repeats and RepeatMasker run again
# (11/18/04). This time there were TDR5, (TAGA)n, Dr000355, Dr000509 with
# negative repeats, but only 5 instances.
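    # (sketch) the per-repeat tallies above can be reproduced by pulling
    # the repeat name (column 10) out of .out records carrying a negative
    # repeat co-ordinate in columns 12-14:
    awk 'FNR > 3 && ($12 ~ /^-/ || $13 ~ /^-/ || $14 ~ /^-/) {print $10}' \
        */chr*.fa.out | sort | uniq -c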
# 11/13/04
# try RepeatMasker with the 7 repeats with negative co-ords split into
# smaller repeats:
    # get a list of repeat names without these, then get those sequences
    ssh kksilo
    cd /cluster/data/danRer2/bed/
    $HOME/bin/i386/faSomeRecords \
        /cluster/bluearc/RepeatMasker/Libraries/danioRMandRW.lib \
        rmandrw.txt danioRMandRWsplit.lib
    # splitreps is the list of split repeats
    cat splitreps >> danioRMandRWsplit.lib
    mv danioRMandRWsplit.lib /cluster/bluearc/RepeatMasker/Libraries/
    # then run RepeatMasker on problem repeat areas e.g. chr1_3_21.fa
    mkdir /cluster/data/danRer2/RMRun/testSplitReps
    cd /cluster/data/danRer2/RMRun/testSplitReps
    nice /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s \
        -lib danioRMandRWsplit.lib chr1_3_21.fa
    # works well so change all the classes to Unknown and re-run with
    # this library
    perl -pi.bak -e 's/^>(Dr[0-9a-z\-]+)#DrRW/>$1#Unknown/' \
        danioRMandRWsplit.lib
    # then run RepeatMasker as above with the new split library
    # (DONE, 2004-11-16)
    # edit RMZebrafish to use this library (danioRMandRWsplit.lib)
    # and run from the RMRun2 directory
    # need to remove the chrN.fa.masked files
    # then convert chrN.fa back to upper case
    ssh kksilo
    cd /cluster/data/danRer2
    foreach c (`cat chrom.lst`)
      cd $c
      echo "Processing $c ..."
      rm chr${c}.fa.masked
      cat chr${c}.fa | tr '[a-z]' '[A-Z]' > chr${c}.tmp
      perl -pi.bak -e 's/^>CHR([0-9A-Z]+)/>chr$1/' chr${c}.tmp
      mv chr${c}.tmp chr${c}.fa
      rm chr${c}.tmp.bak
      cd ..
    end
    # still get some negative co-ordinates when trying to mask.
    # Dr001182 is a repeat sequence which contains two instances of a
    # repeat, the second not being a perfect match to the first;
    # split these into two repeats and then re-run RepeatMasker.
    #
    # e-mailed Kerstin Jekosch at Sanger as it looks like they have used
    # this WUSTL repeat library to mask the Zv4 assembly for Ensembl.
    # Kerstin recommended downloading this new library from RepBase as it
    # has been properly formatted for RepBase version 10. In RepBase, it
    # is the Zebrafish Unclassified library and it consists of 958
    # unfinished consensus sequences of unclassified repeats extracted
    # from the library at
    # http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
    # which has a total of 1225 repeats. 267 repeats present in the
    # library have been replaced by a set of consensus sequences of
    # classified transposable elements that are reported in Repbase Update
    # and Repbase Reports.

# DOWNLOAD NEW VERSION OF THE WUSTL ZEBRAFISH REPEATS FROM REPBASE
# (DONE, 2004-11-18, hartera)
    # Go to http://www.girinst.org/server/RepBase/
    # and select zebunc (Zebrafish unclassified library),
    # click on repeatmaskerlibraries and then download
    # repeatmaskerlibrariesJuly2004.tar.gz
    gunzip repeatmaskerlibrariesJuly2004.tar.gz
    tar xvf repeatmaskerlibrariesJuly2004.tar
    perl -pi.bak -e 's/^>(Dr[0-9]+)/>$1#Unknown/' zebunc.ref
    cat danio.lib zebunc.ref >> danioandzebunc.lib

# REDO REPEATMASKER RUN AND LOAD NEW RESULTS (DONE, 2004-11-22, hartera)
# REDONE (hartera, 2004-12-01)
    # sequences already split into 500kb chunks - see above
    # use RepeatMasker open-3.0 version, Sep 2004; this was in
    # /cluster/bluearc/RepeatMasker.040909 and is now the default
    # new zebrafish library was downloaded from RepBase - zebunc.ref
    # copy to /cluster/bluearc/RepeatMasker.040909/Libraries
    # add "Unknown" as classification for these repeats
    perl -pi.bak -e 's/>(Dr[0-9]+)/>$1#Unknown \@danio [S:]/' zebunc.ref
    # add to RepeatMasker library
    mv RepeatMasker.lib RepeatMasker.lib.old
    cat RepeatMasker.lib.old zebunc.ref >> RepeatMasker.lib
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/danRer2/$2
/bin/cp $2 /tmp/danRer2/$2/
cd /tmp/danRer2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species danio $2
popd
/bin/cp /tmp/danRer2/$2/$2.out ./
if (-e /tmp/danRer2/$2/$2.align) /bin/cp /tmp/danRer2/$2/$2.align ./
if (-e /tmp/danRer2/$2/$2.tbl) /bin/cp /tmp/danRer2/$2/$2.tbl ./
if (-e /tmp/danRer2/$2/$2.cat) /bin/cp /tmp/danRer2/$2/$2.cat ./
/bin/rm -fr /tmp/danRer2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2
'_EOF_'
    chmod +x jkStuff/RMZebrafish
    rm -r RMRun
    mkdir -p RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
        set ctg = $d:t
        foreach f ( $d/${ctg}_?{,?}.fa )
          set f = $f:t
          echo /cluster/data/danRer2/jkStuff/RMZebrafish \
               /cluster/data/danRer2/$d $f \
             '{'check out line+ /cluster/data/danRer2/$d/$f.out'}' \
            >> RMRun/RMJobs
        end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/danRer2/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    # para time
    # Completed: 3899 of 3899 jobs
    # CPU time in finished jobs:  11636116s  193935.26m  3232.25h  134.68d  0.369 y
    # IO & Wait Time:                39078s     651.31m    10.86h    0.45d  0.001 y
    # Average job time:               2994s      49.91m     0.83h    0.03d
    # Longest job:                    4064s      67.73m     1.13h    0.05d
    # Submission to last job:        19022s     317.03m     5.28h    0.22d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/danRer2
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
          > /dev/null
      endif
      cd ..
    end
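    # (sketch) a quick completeness check for the run above: every 500kb
    # chunk should have produced a .out before the lift, so these two
    # counts should match:
    find . -name 'chr*_*_*.fa' | wc -l
    find . -name 'chr*_*_*.fa.out' | wc -l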
    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/danRer2
    hgLoadOut danRer2 */chr*.fa.out
    # Note: 23 records dropped due to repStart > repEnd (2004-12-01)
    # Processing 1/chr1.fa.out
    # bad rep range in: 1354 157 0 0 chr1 7313059 7313288 -54894735 - (0) (917) (917) 689 5 3
    # bad rep range in: 794 247 63 0 chr1 22624284 22624474 -39583549 - (0) (860) (860) 659 0 1 *
    # bad rep range in: 841 234 44 0 chr1 27730487 27730692 -34477331 - (0) (555) (555) 342 5 1
    # Processing 11/chr11.fa.out
    # bad rep range in: 2007 124 74 108 chr11 6853360 6853376 -30760002 + HATN10_DR DNA hAT 167 166 -289 2
    # Processing 13/chr13.fa.out
    # bad rep range in: 939 65 0 158 chr13 11182894 11182923 -26476056 + DIRS1_DR LTR DIRS1 6133 6132 0 1
    # Processing 14/chr14.fa.out
    # bad rep range in: 350 125 29 137 chr14 39592288 39592300 -20407524 + HAT1_DR DNA hAT 612 611 -2681 8
    # Processing 16/chr16.fa.out
    # bad rep range in: 311 225 0 136 chr16 38708823 38708835 -4054346 + Dr000294 Unknown Unknown 403 400 -549 6
    # Processing 18/chr18.fa.out
    # bad rep range in: 9780 128 42 58 chr18 12479604 12480762 -35227702 + Dr000158 Unknown Unknown 45 -2229 -3930 1
    # Processing 19/chr19.fa.out
    # bad rep range in: 249 169 0 167 chr19 25584255 25584266 -26439321 + Dr000331 Unknown Unknown 314 311 -844 3
    # Processing 2/chr2.fa.out
    # bad rep range in: 596 206 44 0 chr2 24978519 24978655 -27097055 - (1) (317) (317) 176 5 4
    # bad rep range in: 326 56 19 0 chr2 40153004 40153058 -11922652 - (129) (79) (79) 25 5 2
    # bad rep range in: 1454 56 0 0 chr2 50993722 50993901 -1081809 - (0) (1155) (1155) 977 5 8
    # Processing 3/chr3.fa.out
    # bad rep range in: 605 76 0 11 chr3 42820214 42820307 -1974931 - (6) (5062) (5062) 4971 5 3
    # Processing 4/chr4.fa.out
    # bad rep range in: 1072 143 21 100 chr4 920087 920366 -32359418 - (1) (540) (540) 284 5 1
    # bad rep range in: 330 194 109 29 chr4 1398685 1398823 -31880961 - (204) (375) (375) 227 5 14
    # bad rep range in: 978 212 58 90 chr4 24351201 24351580 -8928204 - (594) (553) (553) 187 5 1
    # Processing 5/chr5.fa.out
    # bad rep range in: 4380 113 49 44 chr5 4937637 4937659 -62846677 + TDR23 DNA DNA 889 888 -259 1
    # Processing 6/chr6.fa.out
    # bad rep range in: 649 14 0 0 chr6 7782737 7782809 -25429807 - (956) (419) (419) 348 5 1
    # Processing 7/chr7.fa.out
    # bad rep range in: 272 158 0 50 chr7 19882140 19882200 -42635991 - (800) (419) (419) 363 5 7
    # Processing NA/chrNA.fa.out
    # bad rep range in: 1068 181 113 122 chrNA 75025954 75025966 -243271514 + TDR18 DNA DNA 188 187 -385 5
    # bad rep range in: 493 109 3 153 chrNA 202800523 202800585 -115496895 + Dr000876 Unknown Unknown 1 -32 -157 1
    # bad rep range in: 444 271 4 164 chrNA 249521533 249521548 -68775932 + CR1-1_DR LINE L2 4833 4832 -490 9
    # Processing Un/chrUn.fa.out
    # bad rep range in: 237 149 0 152 chrUn 96855194 96855206 -76596190 + Dr000331 Unknown Unknown 312 311 -844 1

    # To display the new repeats which are classed as "Unknown", add this
    # class to $HOME/kent/src/hg/hgTracks/rmskTrack.c
    # in the repeatClassNames and repeatClasses arrays.
    # check coverage of repeats masked:
    # featureBits -chrom=chr1 danRer1 rmsk
    # 11589712 bases of 40488791 (28.624%) in intersection
    # featureBits -chrom=chr1 danRer2 rmsk
    # 26879295 bases of 61678023 (43.580%) in intersection

# MAKE LIFTALL.LFT (DONE, 2004-10-26, hartera)
# RE-DONE (hartera, 2004-12-01)
    ssh kksilo
    cd /cluster/data/danRer2
    cat */lift/ordered.lft > jkStuff/liftAll.lft
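    # (sketch) liftAll.lft should name each of the 28 chromosomes
    # (chr1-25, chrM, chrNA, chrUn) in its destination column:
    awk '{print $4;}' jkStuff/liftAll.lft | sort -u | wc -l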
# SIMPLE REPEAT [TRF] TRACK (DONE, 2004-10-26, hartera)
# RE-DONE (hartera, 2004-12-02)
    # TRF runs pretty quickly now... it takes a few hours total runtime,
    # so instead of binrsyncing and para-running, just do this on the
    # local fileserver
    ssh kksilo
    rm -r /cluster/data/danRer2/bed/simpleRepeat
    mkdir -p /cluster/data/danRer2/bed/simpleRepeat
    cd /cluster/data/danRer2/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/danRer2/*/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
          >> jobs.csh
      end
    end
    chmod a+x jobs.csh
    csh -ef jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # kksilo overloaded, so take chrNA_35 onwards as jobs2.csh and run
    # those on kolossus
    liftUp simpleRepeat.bed /cluster/data/danRer2/jkStuff/liftAll.lft warn \
      trf/*.bed
    # Load into database
    ssh hgwdev
    cd /cluster/data/danRer2/bed/simpleRepeat
    hgLoadBed danRer2 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql

# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-10-26, hartera)
# RE-DONE (2004-12-02, hartera)
    # After the simpleRepeat track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/danRer2/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/danRer2
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end
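    # (sketch) see what fraction of the trf annotations the period <= 12
    # filter keeps:
    cat bed/simpleRepeat/trf/chr*.bed | wc -l
    cat bed/simpleRepeat/trfMask/chr*.bed | wc -l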
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF
# (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-02, hartera)
# RE-MAKE MASKED CHR1 SEQUENCE AS IT WAS INCOMPLETELY MASKED
# (2005-06-06, hartera)
# Although there was an error in the RepeatMasker output for chr1, this
# loaded ok for the chr1_rmsk table so that is complete. It is just the
# actual sequence that was not masked properly.
    ssh kksilo
    cd /cluster/data/danRer2
    # Soft-mask (lower-case) the contig and chr .fa's,
    # then make hard-masked versions from the soft-masked.
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    # errors in the RepeatMasker output with negative co-ordinates
    # (60 instances) - mainly for specific sequences in the new zebrafish
    # library, so took a look at these together with Yi Zhou from Boston
    # Children's Hospital, who split these repeats into smaller sequences.
    # These were used to replace the original repeats in the repeat
    # library and RepeatMasker was re-run. Finally, it was found that the
    # newer RepeatMasker open-3.0.5 version reduced the negative
    # co-ordinates to 2 instances, and a cleaned-up version of the new
    # zebrafish library from RepBase was also used - see above.
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    # with the new version of RepeatMasker, there are 2 instances of
    # negative co-ordinates which can just be ignored:
    # WARNING: negative rEnd: -2229 chr18:12479605-12480762 Dr000158
    # WARNING: negative rEnd: -32 chrNA:202800524-202800585 Dr000876
    # with the new agp and chrom sequence, there is also an instance of a
    # "*" instead of a co-ordinate (2004-12-02):
    # repeat- and trf-masking 1/chr1.fa
    # invalid signed number: "*"

    # mask contigs also
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
    # same warnings here too:
    # WARNING: negative rEnd: -2229 chr18_3:2110746-2111903 Dr000158
    # WARNING: negative rEnd: -32 chrNA_41:1844381-1844442 Dr000876
    # repeat- and trf-masking contigs of chr1
    # invalid signed number: "*" (2004-12-02), and the co-ordinates for
    # chr18_3 are now:
    # WARNING: negative rEnd: -2229 chr18_3:1862490-1863647 Dr000158

    # Build nib files, using the soft masking in the fa
    mkdir nib
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end

    # Re-do masking for chr1.fa. A user noticed that the masking for this
    # chrom is very low compared to other chroms (2% vs almost 50%).
    # Re-running maskOutFa and debugging revealed that the error reported
    # above, invalid signed number: "*", causes the program to abort
    # so the chrom is incompletely masked. This row of the RepeatMasker
    # file (chr1.fa.out) is incorrect and the maskOutFa program fails when
    # it tries to load this line into a data structure.
    # Solution: remove the erroneous line (line 52575) and re-run
    # maskOutFa
    ssh kkstore01
    mkdir /cluster/data/danRer2/tmpMask
    cd /cluster/data/danRer2/tmpMask
    cp /cluster/data/danRer2/1/chr1.fa .
    cp /cluster/data/danRer2/1/chr1.fa.out .
    # make sure it is all upper case
    cat chr1.fa | tr '[a-z]' '[A-Z]' > chr1.tmp
    sed -e 's/>CHR1/>chr1/' chr1.tmp > chr1.fa
    # remove line 52575 from chr1.fa.out
    # re-run maskOutFa
    set trfCtg=/cluster/data/danRer2/bed/simpleRepeat/trfMask
    set trfChr=/cluster/data/danRer2/bed/simpleRepeat/trfMaskChrom
    foreach f (chr1.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    # then add chr1New as the FASTA header and copy to the directory
    # for chr1
    sed -e 's/>chr1/>chr1New/' chr1.fa > chr1New.fa
    sed -e 's/>chr1/>chr1New/' chr1.fa.masked > chr1New.fa.masked
    mv chr1.fa.out /cluster/data/danRer2/1/chr1New.fa.out
    mv chr1New.fa /cluster/data/danRer2/1/
    mv chr1New.fa.masked /cluster/data/danRer2/1/
    cd ..
    rm -r tmpMask
    cd 1
    faSize chr1.fa
    # 62208023 bases (3421437 N's 58786586 real 57507587 upper
    # 1278999 lower)
    # 2.1% are in lower case
    faSize chr1New.fa
    # 62208023 bases (3421437 N's 58786586 real 31874160 upper
    # 26912426 lower)
    # 43% masked as lower case
    faSize chr1.fa.masked
    # 62208023 bases (4700436 N's 57507587 real 57507587 upper 0 lower)
    # 7.6% are Ns
    faSize chr1New.fa.masked
    # 62208023 bases (30333863 N's 31874160 real 31874160 upper 0 lower)
    # 49% are Ns
    # just use this new masked sequence for downloads and replace the
    # masked chr1 sequences for downloads - see DOWNLOADABLE SEQUENCE
    # section below
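    # The "remove line 52575 from chr1.fa.out" step above was done by hand
    # in an editor; a non-interactive equivalent would be something like
    # (a sketch, assuming GNU sed):
    #   sed -i.bak '52575d' chr1.fa.out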
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-02, hartera)
# MOVE danRer2.2bit out of gbdb nib directory (2004-12-15, hartera)
    # Make symbolic links from /gbdb/danRer2/nib to the real nibs.
    ssh hgwdev
    cd /cluster/data/danRer2
    mkdir -p /gbdb/danRer2/nib
    foreach f (/cluster/data/danRer2/nib/chr*.nib)
      ln -s $f /gbdb/danRer2/nib
    end
    # Load /gbdb/danRer2/nib paths into database and save size info.
    # hgNibSeq creates the chromInfo table
    hgNibSeq -preMadeNib danRer2 /gbdb/danRer2/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N danRer2 \
      > chrom.sizes
    # take a look at chrom.sizes, should be 28 lines
    wc chrom.sizes
    # Make one big 2bit file as well, and make a link to it in
    # /gbdb/danRer2/nib because hgBlat looks there:
    faToTwoBit */chr*.fa danRer2.2bit
    ln -s /cluster/data/danRer2/danRer2.2bit /gbdb/danRer2/nib/
    # Move this link out of the nib directory (2004-12-15, hartera)
    rm /gbdb/danRer2/nib/danRer2.2bit
    ln -s /cluster/data/danRer2/danRer2.2bit /gbdb/danRer2/

# MAKE GOLD AND GAP TRACKS (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-03, hartera)
    ssh hgwdev
    cd /cluster/data/danRer2
    # the gold and gap tracks are created from the chrN.agp file, which is
    # the scaffolds or supercontigs agp
    # move the other agp files out of the way to an agps directory
    foreach c (`cat chrom.lst`)
      mkdir ./$c/agps
      mv ./$c/chr${c}.chunks.agp ./$c/agps/
      mv ./$c/chr${c}.scaffolds.agp ./$c/agps/
    end
    # move the scaffolds agp for NA to the agps directory
    mv ./NA/scaffolds/chrNA.scaffolds.agp ./NA/agps/
    # the gold and gap tracks are created from the chrN.agp file
    hgGoldGapGl -noGl -chromLst=chrom.lst danRer2 /cluster/data/danRer2 .

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DANRER2
# (DONE, 2004-11-22, hartera)
# Correct nibPath for danRer2 in dbDb table (2004-12-15, hartera)
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
    cd $HOME/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit the makefile to add danRer2 in all the right places and do
    make update
    make alpha
    cvs commit -m "Added danRer2." makefile
    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism, \
          defaultPos, active, orderKey, genome, scientificName, \
          htmlPath, hgNearOk, hgPbOk, sourceName) \
          values("danRer2", "June 2004", \
          "/gbdb/danRer1/nib", "Zebrafish", "chr2:16,330,443-16,335,196", 1, \
          37, "Zebrafish", "Danio rerio", \
          "/gbdb/danRer2/html/description.html", 0, 0, \
          "Sanger Centre, Danio rerio Sequencing Project Zv4");' \
      | hgsql -h genome-testdb hgcentraltest
    # set danRer2 to be the default assembly for Zebrafish
    echo 'update defaultDb set name = "danRer2" \
          where genome = "Zebrafish";' \
      | hgsql -h genome-testdb hgcentraltest
    # the dbDb table has danRer1 instead of danRer2 in the nib path;
    # it should point to the location of the danRer2.2bit file
    # (2004-12-15, hartera)
    echo 'update dbDb set nibPath = "/gbdb/danRer2" \
          where name = "danRer2";' | hgsql hgcentraltest

# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-11-22, hartera)
    ssh hgwdev
    mkdir /cluster/data/danRer2/html
    # make a symbolic link from /gbdb/danRer2/html to
    # /cluster/data/danRer2/html
    ln -s /cluster/data/danRer2/html /gbdb/danRer2/html
    # Add a description page for zebrafish
    cd /cluster/data/danRer2/html
    cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/description.html .
    # Edit this for zebrafish danRer2 to create a description.html page
    mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
    cd ~/kent/src/hg/makeDb/trackDb/zebrafish
    cvs add danRer2
    cvs commit danRer2
    # Add the description page here too
    cd danRer2
    cp /cluster/data/danRer2/html/description.html .
    cvs add description.html
    cvs commit -m "First draft of description page for danRer2." \
        description.html
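    # (sketch) to confirm the dbDb and defaultDb entries made above:
    echo 'select * from dbDb where name = "danRer2";' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'select * from defaultDb where genome = "Zebrafish";' \
      | hgsql -h genome-testdb hgcentraltest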
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-03, hartera)
# PUT MASKED SEQUENCE ON BLUEARC (DONE, 2004-12-22 and 2004-12-23, hartera)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/danRer2/nib
    mkdir -p /iscratch/i/danRer2/nib
    cp -p /cluster/data/danRer2/nib/chr*.nib /iscratch/i/danRer2/nib
    # Pseudo-contig fa's that have been repeat- and trf-masked:
    rm -rf /iscratch/i/danRer2/trfFa
    mkdir /iscratch/i/danRer2/trfFa
    foreach d (/cluster/data/danRer2/*/chr*_?{,?})
      cp -p $d/$d:t.fa /iscratch/i/danRer2/trfFa
    end
    rm -rf /iscratch/i/danRer2/rmsk
    mkdir -p /iscratch/i/danRer2/rmsk
    cp -p /cluster/data/danRer2/*/chr*.fa.out /iscratch/i/danRer2/rmsk
    cp -p /cluster/data/danRer2/danRer2.2bit /iscratch/i/danRer2/
    iSync
    # add to the bluearc
    ssh kksilo
    mkdir -p /cluster/bluearc/danRer2/nib
    cp -p /cluster/data/danRer2/nib/chr*.nib /cluster/bluearc/danRer2/nib
    mkdir -p /cluster/bluearc/danRer2/trfFa
    foreach d (/cluster/data/danRer2/*/chr*_?{,?})
      cp -p $d/$d:t.fa /cluster/bluearc/danRer2/trfFa
    end

# CREATE gc5Base WIGGLE TRACK (DONE, 2004-12-03, hartera)
    ssh kksilo
    mkdir -p /cluster/data/danRer2/bed/gc5Base
    cd /cluster/data/danRer2/bed/gc5Base
    # The number of bases that hgGcPercent claimed it measured is
    # calculated, which is not necessarily always 5 if it ran into gaps,
    # and then the division by 10.0 scales down the numbers from
    # hgGcPercent to the range [0-100]. wigEncode now replaces
    # wigAsciiToBinary and the previous processing step between these two
    # programs. The result file is *.wig. Each value represents the
    # measurement over five bases beginning with that position. wigEncode
    # also calculates the zoomed set of data.
    nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer2 \
        /iscratch/i/danRer2/nib | \
        wigEncode stdin gc5Base.wig gc5Base.wib
    # load the .wig file back on hgwdev:
    ssh hgwdev
    cd /cluster/data/danRer2/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/danRer2/wib/gc5Base \
        danRer2 gc5Base gc5Base.wig
    # and symlink the .wib file into /gbdb
    mkdir -p /gbdb/danRer2/wib/gc5Base
    ln -s `pwd`/gc5Base.wib /gbdb/danRer2/wib/gc5Base

# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2004-12-03, hartera)
    # Use -repMatch=512 (based on size -- for human we use 1024, and
    # the zebrafish genome is ~50% of the size of the human genome)
    ssh kkr1u00
    mkdir /cluster/data/danRer2/bed/ooc
    cd /cluster/data/danRer2/bed/ooc
    mkdir -p /cluster/bluearc/danRer2
    ls -1 /cluster/data/danRer2/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
        -makeOoc=/cluster/bluearc/danRer2/11.ooc -repMatch=512
    # Wrote 44008 overused 11-mers to /cluster/bluearc/danRer2/11.ooc
    # For 10.ooc, repMatch = 4096 for human, so use 2048
    blat nib.lst /dev/null /dev/null -tileSize=10 \
        -makeOoc=/cluster/bluearc/danRer2/10.ooc -repMatch=2048
    # Wrote 10639 overused 10-mers to /cluster/bluearc/danRer2/10.ooc
    # keep copies of the ooc files in this directory and copy them to
    # iscratch
    cp /cluster/bluearc/danRer2/*.ooc .
    cp -p /cluster/bluearc/danRer2/*.ooc /iscratch/i/danRer2/
    iSync
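    # For reference, downstream cluster jobs consume these ooc files in
    # blat commands of the form below (a sketch; query.fa and out.psl are
    # placeholder names - see the affy and RH map runs further down for
    # the real invocations):
    #   blat /iscratch/i/danRer2/trfFa/chr1_1.fa query.fa out.psl \
    #       -mask=lower -ooc=/iscratch/i/danRer2/11.ooc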
# ADD CONTIGS TRACK (DONE, 2004-12-06, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from lift
# RELOAD CONTIGS TRACK - NOT ALL CONTIGS HAD LOADED INTO DATABASE TABLE
# (DONE, 2005-06-15, hartera)
    ssh kksilo
    mkdir -p /cluster/data/danRer2/bed/ctgPos2
    cd /cluster/data/danRer2/bed/ctgPos2
    # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
    foreach c (`cat /cluster/data/danRer2/chrom.lst`)
      awk 'BEGIN {OFS="\t"} \
          {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
          /cluster/data/danRer2/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
    end
    ssh hgwdev
    cd /cluster/data/danRer2/bed/ctgPos2
    # Reload table (2005-06-16, hartera) - not all the contigs had loaded
    # into the table because the unique primary key for ctgPos2.sql
    # consisted of only the first 14 characters of the contig name.
    # However, contigs for danRer2 have names such as "Zv4_scaffold99.1"
    # and "Zv4_scaffold99.3" for which the first 14 characters are not
    # unique, so only the first one is loaded. Also, there are cases where
    # the contig has an accession and more than one contig shares the same
    # accession as its name, so names are not unique. Therefore, this was
    # changed in ctgPos2.sql so that the first 20 characters of the contig
    # name and the chromStart form the composite primary key:
    # PRIMARY KEY(contig(20),chromStart)
    echo "drop table ctgPos2" | hgsql danRer2
    hgsql danRer2 < ~/kent/src/hg/lib/ctgPos2.sql
    echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
      | hgsql danRer2
    # Add trackDb.ra entry and ctgPos2.html
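    # (sketch) confirm the reloaded table now holds every contig; the two
    # counts below should match:
    wc -l ctgPos2.tab
    hgsql -N -e "select count(*) from ctgPos2;" danRer2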
# ENSEMBL GENES (DONE, 2004-12-06, hartera)
    ssh hgwdev
    mkdir -p /cluster/data/danRer2/bed/ensembl
    cd /cluster/data/danRer2/bed/ensembl
    # Get the ensembl gene data from
    # http://www.ensembl.org/Danio_rerio/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Danio_rerio choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit
    #         next.
    # Page 3) Choose the "Structures" box.
    # Page 4) Choose GTF as the output. Choose gzip compression. Hit
    #         export. Save as ensemblGene.gtf.gz
    # The Ensembl gene predictions are mapped to chromosomes except for
    # chrNA and chrUn, so a lift file needs to be created to lift from
    # scaffold to chromosome co-ordinates.
    ssh kksilo
    mkdir -p /cluster/data/danRer2/bed/ensembl/liftSupertoChrom
    cd /cluster/data/danRer2/bed/ensembl/liftSupertoChrom
    # the lift files for scaffolds to chrom for NA and Un are already
    # created
    cp /cluster/data/danRer2/Un/chrUn.lft .
    cp /cluster/data/danRer2/NA/chrNA.lft .
    cat *.lft >> liftUnScaffoldToChrom.lft
    # get chrUn and chrNA records
    cd /cluster/data/danRer2/bed/ensembl
    awk '$1 ~ /^Zv4_NA[0-9]+/ || $1 ~ /^Zv4_scaffold[0-9]+/' \
        ensemblGene.gtf > ensemblGenechrUns.gtf
    # get records for all other chroms
    awk '$1 ~ /^[0-9]+/' ensemblGene.gtf > ensemblGenechroms.gtf
    liftUp -type=.gtf ensemblGenechrUns.lifted \
        ./liftSupertoChrom/liftUnScaffoldToChrom.lft warn \
        ensemblGenechrUns.gtf
    # Got 38882 lifts in ./liftSupertoChrom/liftUnScaffoldToChrom.lft
    sed -e "s/^/chr/" ensemblGenechroms.gtf > ensGene.gtf
    cat ensemblGenechrUns.lifted >> ensGene.gtf
    # check some of the lifted co-ordinates
    # load into database
    ssh hgwdev
    cd /cluster/data/danRer2/bed/ensembl
    /cluster/bin/i386/ldHgGene danRer2 ensGene \
        /cluster/data/danRer2/bed/ensembl/ensGene.gtf
    # Read 32062 transcripts in 592139 lines in 1 files
    # 32062 groups 27 seqs 1 sources 4 feature types
    # 32062 gene predictions
    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
    # hgKnownToSuper. Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check
    #         Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
    #         Choose Text, tab-separated as the output format. Result
    #         name: ensGtp.
    gunzip ensGtp.tsv.gz
    hgsql danRer2 < ~/kent/src/hg/lib/ensGtp.sql
    # remove the header line from ensGtp.tsv
    echo "load data local infile 'ensGtp.tsv' into table ensGtp" \
      | hgsql -N danRer2
    # Get the ensembl peptide sequences from
    # http://www.ensembl.org/Danio_rerio/martview
    # Choose Danio rerio as the organism.
    # Follow this sequence through the pages:
    # Page 1) Choose the Ensembl Genes choice. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit
    #         next.
    # Page 3) Choose the "Sequences" tab.
    # Page 4) Choose Transcripts/Proteins and peptide Only as the output,
    #         choose text/fasta and gzip compression,
    #         name the file ensemblPep.fa.gz and then hit export.
    gunzip ensemblPep.fa.gz
    hgPepPred danRer2 ensembl ensemblPep.fa
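    # (sketch) cross-check the Ensembl loads: ensGene and ensGtp should
    # both cover the 32062 transcripts reported by ldHgGene above:
    hgsql -N -e "select count(*) from ensGene;" danRer2
    hgsql -N -e "select count(*) from ensGtp;" danRer2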
# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2004-12-06, hartera)
    # sequences already downloaded for danRer1
    ssh hgwdev
    cd /projects/compbio/data/microarray/affyZebrafish
    cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
       /cluster/bluearc/affy/
    # Set up cluster job to align Zebrafish consensus sequences to danRer2
    ssh kkr1u00
    mkdir -p /cluster/data/danRer2/bed/affyZebrafish.2004-12-06
    ln -s /cluster/data/danRer2/bed/affyZebrafish.2004-12-06 \
          /cluster/data/danRer2/bed/affyZebrafish
    cd /cluster/data/danRer2/bed/affyZebrafish
    mkdir -p /iscratch/i/affy
    cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
    iSync
    ssh kk
    cd /cluster/data/danRer2/bed/affyZebrafish
    ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst
    ls -1 /iscratch/i/danRer2/trfFa/ > genome.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/danRer2/11.ooc /iscratch/i/danRer2/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 genome.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    para try, check, push ... etc.
    # para time
    # Completed: 299 of 299 jobs
    # CPU time in finished jobs:  4702s  78.37m  1.31h  0.05d  0.000 y
    # IO & Wait Time:             3077s  51.28m  0.85h  0.04d  0.000 y
    # Average job time:             26s   0.43m  0.01h  0.00d
    # Longest job:                 128s   2.13m  0.04h  0.00d
    # Submission to last job:      685s  11.42m  0.19h  0.01d
    ssh kksilo
    cd /cluster/data/danRer2/bed/affyZebrafish
    # Do sort, best in genome filter, and convert to chromosome
    # coordinates to create affyZebrafish.psl
    pslSort dirs raw.psl tmp psl
    # only use alignments that have at least 95% identity in the aligned
    # region.
    # do not use minCover since a lot of sequence is in Un, NA and
    # Finished chroms so genes may be split up, and it is good to see all
    # alignments
    pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl
    # shorten names in the psl file
    sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.bak
    mv affyZebrafish.psl.bak affyZebrafish.psl
    pslCheck affyZebrafish.psl
    # psl is good
    # load track into database
    ssh hgwdev
    cd /cluster/data/danRer2/bed/affyZebrafish
    hgLoadPsl danRer2 affyZebrafish.psl
    # Add consensus sequences for the Zebrafish chip.
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    ln -s \
      /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
      /gbdb/hgFixed/affyProbes
    hgLoadSeq -abbr=Zebrafish: danRer2 \
      /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
    # Clean up
    rm batch.bak contig.psl raw.psl
    # add entry to trackDb.ra in
    # ~kent/src/hg/makeDb/trackDb/zebrafish/danRer2

# RH MAP TRACK (DONE, 2004-12-07, hartera)
# create sym links to RH sequences in /gbdb/danRer2 (2004-12-17, hartera)
# RE-DO RH MAP TRACK WITH MORE STRINGENT POST-BLAT FILTERING
# (DONE, 2005-05-15, hartera)
    # Data from Leonard Zon's lab at the Children's Hospital, Boston.
    # Radiation hybrid (RH) map data consists of 8707 genomic sequences:
    # (1) 2835 BAC clone ends (Max Planck Inst., Robert Geisler's lab,
    #     Tuebingen, Germany)
    # (2) 2015 NCBI ESTs (submitted from around the planet)
    # (3) 171 RH shotgun sequences (Max Planck Inst. & Children's Hosp.
    #     Boston)
    # (4) 3686 WZ compiled ESTs from WashU
    ssh kkr1u00
    mkdir -p /cluster/data/danRer2/bed/ZonLab/rhMap
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    # sequences for the RH map are in
    # /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rh.fa
    # rhcontigs.fa is the sequences all in one file with formatted headers
    # copy to iscratch if not there already
    mkdir -p /iscratch/i/danRer2/rhmap
    cp -p /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
          /iscratch/i/danRer2/rhmap
    iSync
    # do the blat run to map RH map sequences to danRer2
    ssh kk
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    mkdir psl
    ls -1S /iscratch/i/danRer2/rhmap/rhcontigs.fa > rhmap.lst
    ls -1S /iscratch/i/danRer2/trfFa/*.fa > genome.lst
    # try the same parameters as for bacEnds
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/iscratch/i/danRer2/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 genome.lst rhmap.lst gsub spec
    para create spec
    para try
    para check
    para push, try ... etc.
    # para time
    # Completed: 299 of 299 jobs
    # CPU time in finished jobs:  7977s  132.94m  2.22h  0.09d  0.000 y
    # IO & Wait Time:             3114s   51.91m  0.87h  0.04d  0.000 y
    # Average job time:             37s    0.62m  0.01h  0.00d
    # Longest job:                  54s    0.90m  0.01h  0.00d
    # Submission to last job:      155s    2.58m  0.04h  0.00d
    ssh kksilo
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    # Make & check the psl table.
    # Do sort, best in genome filter, and convert to chromosome
    # coordinates to create rhMap.psl
    pslSort dirs raw.psl tmp psl
    # only use alignments that have at least 80% identity in the aligned
    # region; these are the parameters used for STS markers:
    # pslReps -nearTop=0.0001 -minAli=0.8 -noIntrons raw.psl \
    #     contig.psl /dev/null
    # Processed 667164 alignments
    # try more stringent filtering for these: minAli=0.96 and
    # minCover=0.40 was suggested by Jim (DONE, 2005-05-15, hartera) -
    # these parameters worked well for danRer1 - see makeDanRer1.doc
    mv rhMap.psl rhMap.psl.old
    mv contig.psl contig.psl.old
    pslReps -nearTop=0.0001 -minAli=0.96 -minCover=0.40 raw.psl \
        contig.psl /dev/null
    # Processed 667164 alignments
    liftUp rhMap.psl /cluster/data/danRer2/jkStuff/liftAll.lft warn \
        contig.psl
    pslCheck rhMap.psl
    # psl is ok.
    # with the previous parameters, the sequence with the most alignments
    # had 1128; over 1400 sequences had multiple alignments and 7064
    # sequences had 1 alignment. Now the most is 8: 78 sequences have 3-8
    # alignments, 820 have 2 and 6709 have 1 alignment.
    # the new filtered psl has 8609 alignments; there were 13821 for the
    # previous parameters (8492 sequences aligned, 98%)
    # Load sequence alignments into database.
    # Reload rhMap alignments - more stringent filtering
    # (hartera, 2005-05-15)
    ssh hgwdev
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    echo "drop table rhMap;" | hgsql danRer2
    hgLoadPsl danRer2 rhMap.psl
    hgsql -N -e "select qName from rhMap;" danRer2 | sort | uniq | wc
    # 7607 out of 8690 sequences aligned (88%)
    # Add RH map sequences.
    # Copy sequences to gbdb if they are not there already;
    # create the sym link in the /gbdb/danRer2 directory instead of
    # /gbdb/danRer1 (2004-12-17, hartera)
    mkdir -p /gbdb/danRer2/rhMap
    ln -s \
      /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
      /gbdb/danRer2/rhMap
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    hgLoadSeq danRer2 /gbdb/danRer1/rhmap/rhcontigs.fa
    # change the extFile entry to point to the danRer2 symlink
    echo 'update extFile set path = "/gbdb/danRer2/rhMap/rhcontigs.fa" \
          where id = 15513;' | hgsql danRer2
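    # (sketch) the extFile id used above (15513) is installation-specific;
    # it can be looked up instead of hard-coded:
    hgsql -N -e 'select id, path from extFile where path like "%rhcontigs%";' \
        danRer2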
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DANRER2
# (DONE, 2004-12-08, hartera)
    # hgcentraltest is now on hgwdev
    ssh hgwdev
    # DNA port is "0", trans prot port is "1"
    echo 'insert into blatServers values("danRer2", "blat14", "17789", "0", "0"); \
          insert into blatServers values("danRer2", "blat14", "17788", "1", "0");' \
      | hgsql hgcentraltest
    # if you need to delete those entries
    echo 'delete from blatServers where db="danRer2";' \
      | hgsql hgcentraltest
    # to check the entries:
    echo 'select * from blatServers where db="danRer2";' \
      | hgsql hgcentraltest

# AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-12-09, hartera)
    ssh eieio
    cd /cluster/data/genbank
    # This is a new assembly; edit the etc/genbank.conf file and add:
# danRer2 (zebrafish)
danRer2.genome = /iscratch/i/danRer2/nib/chr*.nib
danRer2.lift = /cluster/data/danRer2/jkStuff/liftAll.lft
danRer2.downloadDir = danRer2
    # Default includes native genbank mRNAs and ESTs,
    # genbank xeno mRNAs but no xeno ESTs, native RefSeq mRNAs but
    # not xeno RefSeq
    cvs commit -m "Added danRer2" etc/genbank.conf
    # No need to edit ~/kent/src/hg/makeDb/genbank/src/align/gbBlat
    # since the 11.ooc file for danRer1 will be good for both assemblies:
    # no new sequence was added, just an improvement on the mapping of
    # unmapped sequence to chromosomes.
    # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
    # danRer genome information
    # Install to /cluster/data/genbank:
    make install-server
    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, all sequences:
    nice bin/gbAlignStep -verbose=1 -initial danRer2
    # Load results for all sequences:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad danRer2
    # Results need to be moved out of the way:
    cd /cluster/data/genbank/work
    mv initial.danRer2 initial.danRer2.allseqs

# BLASTZ SWAP FOR hg17 vs. danRer2 BLASTZ TO CREATE danRer2 vs. hg17
# (DONE, 2004-12-09, hartera)
# USE RESCORED ALIGNMENTS (see makeHg17.doc)
    ssh kolossus
    mkdir -p /cluster/data/danRer2/bed/blastz.hg17.swap
    cd /cluster/data/danRer2/bed/blastz.hg17.swap
    # use rescored axtChrom from blastzDanRer2 on hg17
    set aliDir = /cluster/data/hg17/bed/blastz.danRer2
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
      | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
      | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
    # 1.1G    /cluster/data/hg17/bed/blastz.danRer2/axtChrom
    # 1.1G    unsorted
    # 1.1G    axtChrom
    rm -r unsorted
    # translate sorted axt files into psl
    cd /cluster/data/danRer2/bed/blastz.hg17.swap
    mkdir -p pslChrom
    set tbl = "blastzHg17"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load database tables
    ssh hgwdev
    cd /cluster/data/danRer2/bed/blastz.hg17.swap/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl danRer2 $f
      echo "$f Done"
    end

# CHAIN HUMAN (hg17) BLASTZ (DONE, 2004-12-10, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-21, hartera)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/danRer2/bed/blastz.hg17.swap
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    # Reuse gap penalties from hg16 vs chicken run (fields in this file
    # are whitespace-separated; the ^V's in the original transcript were
    # just the shell's quote prefix for typing literal tabs):
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/danRer2/nib \
    /iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # create input list and batch for cluster run
    ls -1S /cluster/data/danRer2/bed/blastz.hg17.swap/axtChrom/*.axt \
      > input.lst
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # para time
    # Completed: 28 of 28 jobs
    # CPU time in finished jobs:  2416s  40.27m  0.67h  0.03d  0.000 y
    # IO & Wait Time:              351s   5.85m  0.10h  0.00d  0.000 y
    # Average job time:             99s   1.65m  0.03h  0.00d
    # Longest job:                 144s   2.40m  0.04h  0.00d
    # Submission to last job:      524s   8.73m  0.15h  0.01d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # take a look at score distributions
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r >> hist5000.out
      textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
      echo ""
    end
    # only chrNA has a large number of chains with score <= 5000
    # (>200,000 chains); chrUn has about 90,000 chains with score <= 5000
    # apply filter with minScore=5000
    rm -r chain
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    chainSplit chain all.chain
    # remove repeats from chains and reload into database
    # (2004-12-21, hartera)
    mv chain chainRaw
    mkdir chain
    cd chainRaw
    foreach f (*.chain)
      set c = $f:r
      echo $c
      nice chainAntiRepeat /iscratch/i/danRer2/nib \
        /iscratch/i/gs.18/build35/bothMaskedNibs $f \
        ../chain/$c.chain
    end
    cd ..
    chainMergeSort ./chain/*.chain > all.chain.antirepeat
    chainSplit chainAR all.chain.antirepeat
    # load filtered chains and check
    ssh hgwdev
    cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain/chainAR
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain danRer2 ${c}_chainHg17 $i
      echo done $c
    end
    # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17 -enrichment
    # refGene:cds 0.512%, chainHg17 34.242%, both 0.444%, cover 86.63%,
    # enrich 2.53x
    # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17Link -enrichment
    # refGene:cds 0.512%, chainHg17Link 4.760%, both 0.399%, cover 77.80%,
    # enrich 16.3
    # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17Link -enrichment
    # refGene:cds 0.529%, chainHg17Link 3.924%, both 0.409%, cover 77.24%,
    # enrich 19.69x
    # number of rows for chr1:
    # chainHg17Link    185694
    # after using chainAntiRepeat:
    # chainHg17ARLink  176455
    # after chainAntiRepeat done on filtered chains:
    # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17ARLink \
    #     -enrichment
    # refGene:cds 0.512%, chainHg17ARLink 4.625%, both 0.398%,
    # cover 77.79%, enrich 16.82x

# NET HUMAN (hg17) BLASTZ (DONE, 2004-12-10, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat
# (DONE, 2004-12-21, hartera)
    ssh kksilo
    cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
    mkdir preNet
    cd chainAR
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
        ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
        ../n1/$n /dev/null
    end
    cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 126734336, utime 657 s/100, stime 69 # Add classification info using db tables: # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option # linSpecRep.notInHuman and linSpecRep.notInZebrafish exist # (see makeHg17.doc) ssh hgwdev cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain time netClass noClass.net danRer2 hg17 humanhg17.net \ -tNewR=/cluster/bluearc/danRer2/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish -noAr # 87.940u 51.110s 4:05.95 56.5% 0+0k 0+0io 631pf+0w # load net into database netFilter -minGap=10 humanhg17.net | hgLoadNet danRer2 netHg17 stdin # featureBits danRer1 refGene:cds netHg17 -enrichment # refGene:cds 0.462%, netHg17 27.868%, both 0.384%, cover 83.21%, enrich 2.99x # featureBits danRer2 refGene:cds netHg17 -enrichment # refGene:cds 0.468%, netHg17 30.608%, both 0.406%, cover 86.82%, enrich 2.84x # BLASTZ SWAP FOR mm5 vs danRer2 BLASTZ TO CREATE danRer2 vs mm5 BLASTZ # (DONE, 2004-12-13, hartera) ssh kolossus mkdir -p /cluster/data/danRer2/bed/blastz.mm5.swap cd /cluster/data/danRer2/bed/blastz.mm5.swap # use rescored axtChrom from blastzDanRer2 on mm5 set aliDir = /cluster/data/mm5/bed/blastz.danRer2 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 919M /cluster/data/mm5/bed/blastz.danRer2/axtChrom # 921M unsorted # 919M axtChrom rm -r unsorted # translate sorted axt files into psl ssh kolossus cd /cluster/data/danRer2/bed/blastz.mm5.swap mkdir -p pslChrom set tbl = "blastzMm5" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/danRer2/bed/blastz.mm5.swap/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl danRer2 $f echo "$f Done" end # CHAIN MOUSE (mm5) BLASTZ (DONE, 2004-12-13, hartera) # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS # AND DEGENERATE DNA (DONE, 2004-12-21, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/danRer2/bed/blastz.mm5.swap mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/danRer2/bed/blastz.mm5.swap/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Reuse gap penalties from hg16 vs chicken run. 
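# (About the gap file below: axtChain's -linearGap file is a small
# tab-separated table of piecewise-linear gap costs - 'position' gives
# gap-length breakpoints and qGap/tGap/bothGap the corresponding
# penalties. The ^V characters appear to mark where a quoted literal tab
# was typed into the heredoc.)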
cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy # create chains with only default filtering on score - minScore = 1000 cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap $1 \ /iscratch/i/danRer2/nib \ /cluster/bluearc/scratch/mus/mm5/softNib $2 >& $3 '_EOF_' chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 2405s 40.08m 0.67h 0.03d 0.000 y # IO & Wait Time: 447s 7.46m 0.12h 0.01d 0.000 y # Average job time: 102s 1.70m 0.03h 0.00d # Longest job: 147s 2.45m 0.04h 0.00d # Submission to last job: 523s 8.72m 0.15h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # only chrUn and chrNA have >100,000 chains with score <=5000 # filter on minScore=5000 mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k rm -r chain chainSplit chain all.chainFilt5k # remove repeats from chains and reload into database # (2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain mv chain chainRaw mkdir chain cd chainRaw foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /iscratch/i/danRer2/nib \ /cluster/bluearc/scratch/mus/mm5/softNib $f \ ../chain/$c.chain end cd .. 
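# (Optional sanity check - a sketch, not part of the original run:
# compare per-chrom chain counts before and after chainAntiRepeat)
#   grep -c '^chain ' chainRaw/chr1.chain chain/chr1.chain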
chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # load chains into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain danRer2 ${c}_chainMm5 $i echo done $c end # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5 -enrichment # refGene:cds 0.512%, chainMm5 34.341%, both 0.436%, cover 85.08%, enrich 2.48x # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment # refGene:cds 0.512%, chainMm5Link 4.588%, both 0.395%, cover 77.08%, # enrich 16.80x # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5 -enrichment # refGene:cds 0.529%, chainMm5 42.554%, both 0.459%, cover 86.70%, enrich 2.04x # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Link -enrichment # refGene:cds 0.529%, chainMm5Link 7.948%, both 0.412%, cover 77.80%, # enrich 9.79x # after chainAntiRepeat: # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment # refGene:cds 0.512%, chainMm5Link 4.499%, both 0.395%, cover 77.08%, # enrich 17.13x # before chainAntiRepeat: # chr1_chainMm5Link 167661 # after chainAntiRepeat: # chr1_chainMm5Link 162853 # NET MOUSE (mm5) BLASTZ (DONE, 2004-12-13, hartera) # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 119873536, utime 849 s/100, stime 81 # Add classification info using db tables: # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \ /cluster/bluearc/mm5/linSpecRep.notInZebrafish cp /iscratch/i/danRer2/linSpecRep.notInMouse/* \ /cluster/bluearc/danRer2/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain time netClass noClass.net danRer2 mm5 mouseMm5.net \ -tNewR=/cluster/bluearc/danRer2/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish -noAr # 86.210u 51.710s 4:02.29 56.9% 0+0k 0+0io 206pf+0w netFilter -minGap=10 mouseMm5.net | hgLoadNet danRer2 netMm5 stdin # featureBits danRer2 refGene:cds netMm5 -enrichment # refGene:cds 0.468%, netMm5 30.706%, both 0.404%, cover 86.40%, enrich 2.81x # featureBits danRer1 refGene:cds netMm5 -enrichment # refGene:cds 0.461%, netMm5 36.622%, both 0.395%, cover 85.70%, enrich 2.34x # add html and trackDb.ra entries # after chainAntiRepeat: # featureBits danRer2 refGene:cds netMm5 -enrichment # refGene:cds 0.468%, netMm5 30.565%, both 0.405%, cover 86.40%, enrich 2.83x # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-12-14, hartera) # CORRECTION MADE TO chrM.agp FILE - MADE TAB DELIMITED INSTEAD OF # SPACE DELIMITED SO REPLACE OLD AGP FILE IN DOWNLOADS # (DONE, 2005-04-25, hartera) # REPLACE chr1 WITH chr1New IN chromFa and chromFaMasked - chr1 was # incompletely masked due to an error in the RepeatMasker output for chr1. 
# See section on "MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF" # (2005-06-06, hartera) # UPDATED README FILES WITH NEW SEQUENCE FTP FOR SANGER (2005-08-04, hartera) ssh kksilo cd /cluster/data/danRer2 #- Build the .zip files cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp # chrom RepeatMasker out files zip -j zip/chromOut.zip */chr*.fa.out # soft masked chrom fasta zip -j zip/chromFa.zip */chr*.fa # hard masked chrom fasta zip -j zip/chromFaMasked.zip */chr*.fa.masked # chrom TRF output files cd bed/simpleRepeat zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed cd ../.. # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=danRer2 -native GenBank mrna \ /cluster/data/danRer2/zip/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=danRer2 -xeno GenBank mrna \ /cluster/data/danRer2/zip/xenoMrna.fa # get native RefSeq mRNAs ./bin/i386/gbGetSeqs -db=danRer2 -native refseq mrna \ /cluster/data/danRer2/zip/refMrna.fa # get native GenBank ESTs ./bin/i386/gbGetSeqs -db=danRer2 -native GenBank est \ /cluster/data/danRer2/zip/est.fa cd /cluster/data/danRer2/zip # zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs zip -j mrna.zip mrna.fa zip -j xenoMrna.zip xenoMrna.fa zip -j refMrna.zip refMrna.fa zip -j est.zip est.fa '_EOF_' # << this line makes emacs coloring happy csh ./jkStuff/zipAll.csh |& tee ./jkStuff/zipAll.log cd zip # remake just the agp files zip with corrected chrM.agp (2005-04-25, hartera) ssh kksilo cd /cluster/data/danRer2 rm /cluster/data/danRer2/zip/chromAgp.zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp #- Look at zipAll.log to make sure all file lists look reasonable. # Make upstream files and copy the .zip files to # hgwdev:/usr/local/apache/... 
ssh hgwdev cd /cluster/data/danRer2/zip # make upstream files for zebrafish RefSeq featureBits danRer2 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa featureBits danRer2 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test set gp = /usr/local/apache/htdocs/goldenPath/danRer2 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips mkdir -p $gp/chromosomes foreach f (../*/chr*.fa) zip -j $gp/chromosomes/$f:t.zip $f end cd $gp/bigZips md5sum *.zip > md5sum.txt cd $gp/chromosomes md5sum *.zip > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Remake just the chromAgp.zip with correct chrM.agp (2005-04-25, hartera) ssh kksilo cd /cluster/data/danRer2 rm /cluster/data/danRer2/zip/chromAgp.zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp ssh hgwdev cd /cluster/data/danRer2/zip #- Check zip file integrity: foreach f (chromAgp.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l chromAgp.zip.test # looks good set gp = /usr/local/apache/htdocs/goldenPath/danRer2 rm $gp/bigZips/chromAgp.zip rm $gp/bigZips/md5sum.txt cp -p chromAgp.zip $gp/bigZips # go to directory with zip files and remake the md5sum.txt file cd $gp/bigZips md5sum *.zip > md5sum.txt # Remake chr1 zips using the re-masked chr1New.fa (2005-06-06, hartera) ssh kkstore01 cd /cluster/data/danRer2 # move old chr1 files temporarily out of the way mkdir 1/tmp mv 1/chr1.fa.out 1/tmp mv 1/chr1.fa 1/tmp mv 1/chr1.fa.masked 1/tmp rm zip/chromOut.zip zip/chromFa.zip zip/chromFaMasked.zip # chrom RepeatMasker out files zip -j zip/chromOut.zip */chr*.fa.out # soft masked chrom fasta zip -j zip/chromFa.zip */chr*.fa # hard masked chrom fasta zip -j zip/chromFaMasked.zip */chr*.fa.masked cd /cluster/data/danRer2/zip #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test ssh hgwdev cd /cluster/data/danRer2/zip set gp = /usr/local/apache/htdocs/goldenPath/danRer2 # copy over just those modified zip files cp -p chromOut.zip $gp/bigZips cp -p chromFa.zip $gp/bigZips cp -p chromFaMasked.zip $gp/bigZips # remove chr1.fa.zip and zip chr1New rm $gp/chromosomes/chr1.fa.zip zip -j $gp/chromosomes/chr1New.fa.zip ../1/chr1New.fa # replace md5sum cd $gp/bigZips rm md5sum.txt md5sum *.zip > md5sum.txt cd $gp/chromosomes rm md5sum.txt md5sum *.zip > md5sum.txt # Update README.txt for each to state that a new masked chr1 was produced # updated bigZips and chromosomes README.txt files with new ftp site # for the assembly sequence at Sanger (2005-08-04, hartera): # now ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/ # MAKE VSHG17 DOWNLOADABLES (DONE, 2004-12-14, hartera) # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat # (DONE, 2004-12-21, hartera) ssh hgwdev cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer2 mkdir -p $gp/vsHg17/axtChrom cp -p *.axt $gp/vsHg17/axtChrom cd $gp/vsHg17/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # copy chains and net to downloads area # re-make chains and net downloadables (2004-12-21, hartera) rm $gp/vsHg17/human*.gz $gp/vsHg17/md5sum.txt cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/humanHg17.chain.gz gzip -c humanhg17.net > /cluster/data/danRer2/zip/humanHg17.net.gz cd $gp/vsHg17 mv /cluster/data/danRer2/zip/human*.gz . 
md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # MAKE VSMM5 DOWNLOADABLES (DONE, 2004-12-14, hartera) # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat # (DONE, 2004-12-21, hartera) ssh hgwdev cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer2 mkdir -p $gp/vsMm5/axtChrom cp -p *.axt $gp/vsMm5/axtChrom cd $gp/vsMm5/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # copy chains and nets to downloads area # re-make chains and net downloadables (2004-12-21, hartera) rm $gp/vsMm5/mouse*.gz $gp/vsMm5/md5sum.txt cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/mouseMm5.chain.gz gzip -c mouseMm5.net > /cluster/data/danRer2/zip/mouseMm5.net.gz cd $gp/vsMm5 mv /cluster/data/danRer2/zip/mouse*.gz . md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # BLASTZ HG17 CLEANUP (DONE, 2004-12-14, hartera) # RE-DONE (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.hg17.swap nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & # BLASTZ MM5 CLEANUP (DONE, 2004-12-14, hartera) # RE-DONE (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.mm5.swap nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & # BLASTZ FOR FUGU (fr1) (DONE, 2004-12-14, hartera) # Fugu is quite distant from zebrafish, more distant than human/chicken # so use the HoxD55.q matrix in future for blastz (2005-06-01, hartera) # Blastz requires lineage-specific repeats but there are none # for these two fish. ssh kk mkdir -p /cluster/data/danRer2/bed/blastz.fr1.2004-12-13 ln -s /cluster/data/danRer2/bed/blastz.fr1.2004-12-13 \ /cluster/data/danRer2/bed/blastz.fr1 cd /cluster/data/danRer2/bed/blastz.fr1 # use same parameters as for danRer1 vs fr1 cat << '_EOF_' > DEF # zebrafish (danRer2) vs. Fugu (fr1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified BLASTZ_ABRIDGE_REPEATS=0 # TARGET - zebrafish (danRer2) SEQ1_DIR=/iscratch/i/danRer2/nib SEQ1_RMSK= # lineage-specific repeats # we don't have that information for these species SEQ1_SMSK= SEQ1_FLAG= SEQ1_IN_CONTIGS=0 # 10 MB chunk for target SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - Fugu (fr1) # soft-masked chrom nib SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG= SEQ2_IN_CONTIGS=0 # 10 Mbase for query SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/danRer2/bed/blastz.fr1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.danRer2-fr1.2004-12-13 # setup cluster run # source the DEF file bash . 
./DEF /cluster/data/danRer2/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 6055 of 6055 jobs # CPU time in finished jobs: 1440485s 24008.08m 400.13h 16.67d 0.046 y # IO & Wait Time: 132074s 2201.24m 36.69h 1.53d 0.004 y # Average job time: 260s 4.33m 0.07h 0.00d # Longest job: 2054s 34.23m 0.57h 0.02d # Submission to last job: 4060s 67.67m 1.13h 0.05d ssh kki cd /cluster/data/danRer2/bed/blastz.fr1 bash # if a csh/tcsh user . ./DEF /cluster/data/danRer2/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 173 of 173 jobs # CPU time in finished jobs: 249s 4.16m 0.07h 0.00d 0.000 y # IO & Wait Time: 695s 11.58m 0.19h 0.01d 0.000 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest job: 16s 0.27m 0.00h 0.00d # Submission to last job: 296s 4.93m 0.08h 0.00d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/danRer2/bed/blastz.fr1 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /iscratch/i/danRer2/nib \ /cluster/bluearc/fugu/fr1/chromNib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer2/bed/blastz.fr1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 71s 1.19m 0.02h 0.00d 0.000 y # IO & Wait Time: 765s 12.75m 0.21h 0.01d 0.000 y # Average job time: 30s 0.50m 0.01h 0.00d # Longest job: 66s 1.10m 0.02h 0.00d # Submission to last job: 223s 3.72m 0.06h 0.00d # translate sorted axt files into psl ssh kolossus cd /cluster/data/danRer2/bed/blastz.fr1 mkdir -p pslChrom set tbl = "blastzFr1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl danRer2 $f echo "$f Done" end # featureBits -chrom=chr1 danRer2 refGene:cds blastzFr1 -enrichment # refGene:cds 0.512%, blastzFr1 10.880%, both 0.439%, cover 85.71%, enrich 7.88x # featureBits -chrom=chr1 danRer1 refGene:cds blastzFr1 -enrichment # refGene:cds 0.529%, blastzFr1 5.542%, both 0.463%, cover 87.50%, enrich 15.79x # CHAIN FUGU (fr1) BLASTZ (DONE, 2004-12-14, hartera) # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS # AND DEGENERATE DNA (DONE, 2004-12-21, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/danRer2/bed/blastz.fr1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/danRer2/bed/blastz.fr1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /iscratch/i/danRer2/nib \ /cluster/bluearc/fugu/fr1/chromNib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
# para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 402s 6.71m 0.11h 0.00d 0.000 y # IO & Wait Time: 276s 4.59m 0.08h 0.00d 0.000 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 56s 0.93m 0.02h 0.00d # Submission to last job: 101s 1.68m 0.03h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # filter with minScore=5000 mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k rm -r chain chainSplit chain all.chainFilt5k # remove repeats from chains and reload into database # (2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1/axtChain gunzip all.chainFilt5k.gz chainSplit chainRaw all.chainFilt5k mkdir chain cd chainRaw foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /iscratch/i/danRer2/nib \ /cluster/bluearc/fugu/fr1/chromNib $f \ ../chain/$c.chain end cd .. chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # Load chains into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain danRer2 ${c}_chainFr1 $i echo done $c end # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1 -enrichment # refGene:cds 0.512%, chainFr1 23.334%, both 0.451%, cover 88.07%, enrich 3.77x # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1Link -enrichment # refGene:cds 0.512%, chainFr1Link 7.794%, both 0.426%, cover 83.16%, enrich 10.67x # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1 -enrichment # refGene:cds 0.529%, chainFr1 19.003%, both 0.479%, cover 90.41%, enrich 4.76x # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1Link -enrichment # refGene:cds 0.529%, chainFr1Link 4.686%, both 0.450%, cover 85.11%, enrich 18.16x # after chainAntiRepeat: # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1Link -enrichment # refGene:cds 0.512%, chainFr1Link 7.726%, both 0.426%, cover 83.16%, # enrich 10.76x # NET FUGU (fr1) BLASTZ (DONE, 2004-12-14, hartera) # REMADE NET FOR FUGU (2004-12-15, hartera) # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21,hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1/axtChain mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 108306432, utime 690 s/100, stime 58 # Add classification info using db tables: ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/axtChain # use -noAr option - don't look for ancient repeats # redo net as danRer1 was used for netClass instead of danRer2 (2004-12-15) # and load new net into database netClass -noAr noClass.net danRer2 fr1 fuguFr1.net # Load the nets into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/axtChain netFilter -minGap=10 fuguFr1.net | hgLoadNet danRer2 netFr1 stdin # featureBits danRer2 refGene:cds netFr1 -enrichment # refGene:cds 0.468%, netFr1 19.389%, both 0.411%, cover 87.95%, enrich 4.54x # featureBits danRer1 refGene:cds netFr1 -enrichment # refGene:cds 0.461%, netFr1 17.530%, both 0.391%, cover 84.78%, enrich 4.84x # after chainAntiRepeat: # featureBits danRer2 refGene:cds netFr1 -enrichment # refGene:cds 0.468%, netFr1 19.372%, both 0.412%, cover 87.97%, enrich 4.54x # MAKE VSFR1 DOWNLOADABLES (DONE, 2004-12-15, hartera) # REPLACE FR1 NET WITH NEW VERSION IN DOWNLOADS (2004-12-15, hartera) # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat # (DONE, 2004-12-21, hartera) ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer2 mkdir -p $gp/vsFr1/axtChrom cp -p *.axt $gp/vsFr1/axtChrom cd $gp/vsFr1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # copy chains and nets to downloads area # re-make chains and net downloadables (2004-12-21, hartera) rm $gp/vsFr1/fugu*.gz $gp/vsFr1/md5sum.txt cd /cluster/data/danRer2/bed/blastz.fr1/axtChain gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/fuguFr1.chain.gz gzip -c fuguFr1.net > /cluster/data/danRer2/zip/fuguFr1.net.gz cd $gp/vsFr1 mv /cluster/data/danRer2/zip/fugu*.gz . md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # BLASTZ FR1 CLEANUP (DONE, 2004-12-15, hartera) # RE-DONE (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1 nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/all.chainFilt5k axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & # BLASTZ FOR TETRAODON (tetNig1) (DONE, 2004-12-21, hartera) # Tetraodon is quite distant from zebrafish, more distant than human/chicken # so use the HoxD55.q matrix in future for blastz (2005-06-01, hartera) # blastz requires lineage-specific repeats but there are none # available between these two fish species ssh kk mkdir -p /cluster/data/danRer2/bed/blastz.tetNig1.2004-12-21 ln -s /cluster/data/danRer2/bed/blastz.tetNig1.2004-12-21 \ /cluster/data/danRer2/bed/blastz.tetNig1 cd /cluster/data/danRer2/bed/blastz.tetNig1 # use tetraodon sequence in contigs for dynamic masking - see below # for dynamic masking: M=50 cat << '_EOF_' > DEF # zebrafish (danRer2) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin # use BLASTZ_M=50 in blastz-run1 ALIGN=blastz-run1 BLASTZ=blastz BLASTZ_H=2500 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified BLASTZ_ABRIDGE_REPEATS=0 # TARGET - zebrafish (danRer2) SEQ1_DIR=/iscratch/i/danRer2/nib SEQ1_RMSK= # lineage-specific repeats # we don't have that information for these species SEQ1_SMSK= SEQ1_FLAG= SEQ1_IN_CONTIGS=0 # 500 kb chunk for target SEQ1_CHUNK=500000 SEQ1_LAP=500 # QUERY - Tetraodon (tetNig1) # soft-masked sequence in contigs SEQ2_DIR=/iscratch/i/tetNig1/contigs SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG= SEQ2_IN_CONTIGS=1 SEQ2_CHUNK= SEQ2_LAP= BASE=/cluster/data/danRer2/bed/blastz.tetNig1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.danRer2-tetNig1.2004-12-21 # setup cluster run # source the DEF file bash . ./DEF /cluster/data/danRer2/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 3193 of 3193 jobs # CPU time in finished jobs: 4460310s 74338.50m 1238.97h 51.62d 0.141 y # IO & Wait Time: 41176s 686.27m 11.44h 0.48d 0.001 y # Average job time: 1410s 23.50m 0.39h 0.02d # Longest job: 2398s 39.97m 0.67h 0.03d # Submission to last job: 12372s 206.20m 3.44h 0.14d ssh kki cd /cluster/data/danRer2/bed/blastz.tetNig1 bash # if a csh/tcsh user . ./DEF /cluster/data/danRer2/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 3193 of 3193 jobs # CPU time in finished jobs: 327s 5.46m 0.09h 0.00d 0.000 y # IO & Wait Time: 8483s 141.38m 2.36h 0.10d 0.000 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest job: 9s 0.15m 0.00h 0.00d # Submission to last job: 560s 9.33m 0.16h 0.01d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/danRer2/bed/blastz.tetNig1 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt -fa stdin /iscratch/i/danRer2/nib \ /iscratch/i/tetNig1/contigs/tetNig1Contigs.fa stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... 
# para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 224s 3.74m 0.06h 0.00d 0.000 y # IO & Wait Time: 1240s 20.66m 0.34h 0.01d 0.000 y # Average job time: 52s 0.87m 0.01h 0.00d # Longest job: 148s 2.47m 0.04h 0.00d # Submission to last job: 157s 2.62m 0.04h 0.00d # lift up query sequences #- Lift up the 500KB chunk .axt's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/danRer2/bed/blastz.tetNig1 foreach d (axtChrom/chr*.axt) echo $d liftUp -axtQ ${d}.out.axt \ /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft warn $d \ > /dev/null end #- Lift pseudo-contigs to chromosome level # check this is working correctly set dr = "/cluster/data/danRer2" foreach c (`cat ${dr}/chrom.lst`) echo lifting $c liftUp -axtQ ./axtChrom/chr${c}.out2.axt \ /cluster/data/tetNig1/jkStuff/liftAll.lft warn \ ./axtChrom/chr${c}.axt.out.axt > /dev/null end cd axtChrom foreach c (`cat ${dr}/chrom.lst`) mv chr${c}.axt chr${c}.axt.old mv chr${c}.out2.axt chr${c}.axt rm chr${c}.axt.out.axt end # translate sorted axt files into psl ssh kolossus cd /cluster/data/danRer2/bed/blastz.tetNig1 mkdir -p pslChrom # need to copy S2.len for whole tetNig1 genome - S2contigs.len cp /cluster/data/tetNig1/bed/blastzSelf/S1.len S2contigs.len set tbl = "blastzTetNig1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f \ S1.len S2contigs.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/danRer2/bed/blastz.tetNig1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl danRer2 $f echo "$f Done" end # H=2000 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1 -enrichment # refGene:cds 0.512%, blastzTetNig1 22.057%, both 0.435%, cover 84.94%, # enrich 3.85x # H=2000 and L=8000 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1L8k -enrichment # refGene:cds 0.512%, blastzTetNig1L8k 16.326%, both 0.401%, cover 78.28%, # enrich 4.80x # use query in contigs in one file and use dynamic masking with M=100 # should mask out more sequence in query, tetraodon has no specific repeats # library so it is not fully repeat masked. H=2500, M=100 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1Contigs -enrichment # refGene:cds 0.512%, blastzTetNig1Contigs 13.052%, both 0.427%, cover 83.43%, # enrich 6.39x # contigs used as above but H=2000 #featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1ContigsH2k -enrichment # refGene:cds 0.512%, blastzTetNig1ContigsH2k 17.517%, both 0.433%, # cover 84.51%, enrich 4.82x # contigs used with H=2500, L=6000 # refGene:cds 0.512%, blastzTetNig1ContigsL6k 11.380%, both 0.415%, # cover 80.94%, enrich 7.11x # use M=50 and H=2500 #featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1ContigsM50 -enrichment # refGene:cds 0.512%, blastzTetNig1ContigsM50 12.643%, both 0.427%, # cover 83.42%, enrich 6.60x # Using contigs and dynamic masking improves the enrichment and makes little # difference to coverage. The blastz track % is lower since more sequence # is being masked and, since coverage is not much different, it is probably # being masked in non-coding regions. # at chr1:6,128,109-6,135,734 there are a lot of short alignments in low # complexity regions that are removed using L=6000. 
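# (How to read the featureBits -enrichment lines above: cover is
# both/refGene:cds and enrich is cover/track%. Worked example with the
# M50 row: 0.427/0.512 = 83.4% cover, and 83.42/12.643 = 6.60x enrich,
# matching the reported values.)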
# Use contigs with H=2500, keep lower scoring alignments until after chaining # Number of rows: # blastzTetNig1 2360350 # blastzTetNig1L8k 1177874 # blastzTetNig1Contigs 725076 # blastzTetNig1ContigsH2k 825927 # blastzTetNig1ContigsL6k 538149 # blastzTetNig1ContigsM50 479228 # Using M=50 reduces the number of alignments but coverage is the same as # for using M=100 so use the dynamic masking with M=50 and H=2500 # CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-12-21, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/danRer2/bed/blastz.tetNig1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /iscratch/i/danRer2/nib \ /iscratch/i/tetNig1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 446s 7.43m 0.12h 0.01d 0.000 y # IO & Wait Time: 355s 5.92m 0.10h 0.00d 0.000 y # Average job time: 29s 0.48m 0.01h 0.00d # Longest job: 98s 1.63m 0.03h 0.00d # Submission to last job: 98s 1.63m 0.03h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain mkdir chain # remove repeats from chains cd run1/chain foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /iscratch/i/danRer2/nib \ /iscratch/i/tetNig1/nib $f \ ../../chain/$c.chain end cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # filter with minScore=5000 mv all.chain.antirepeat all.chainAR.unfiltered chainFilter -minScore=5000 all.chainAR.unfiltered > all.chainARFilt5k rm -r chain chainSplit chainAR all.chainARFilt5k # before filtering, only chr1 had more than 100,000 chains with a score # of 5000 or less # Load chains into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain danRer2 ${c}_chainTetNig1 $i echo done $c end # featureBits -chrom=chr1 danRer2 refGene:cds chainTetNig1Link -enrichment # refGene:cds 0.512%, chainTetNig1Link 11.005%, both 0.416%, cover 81.22%, # enrich 7.38x # NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2contigs.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len \ ../../S2contigs.len ../n1/$n /dev/null end cd .. 
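# (This is the same chain -> net pipeline used for hg17 and mm5 above;
# the only difference is S2contigs.len on the query side, since the
# tetNig1 query was aligned as contigs rather than chromosome nibs.)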
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 102502400, utime 670 s/100, stime 69 # Add classification info using db tables: ssh hgwdev cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain # use -noAr option - don't look for ancient repeats netClass -noAr noClass.net danRer2 tetNig1 tetraTetNig1.net # Load the nets into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain netFilter -minGap=10 tetraTetNig1.net | hgLoadNet danRer2 netTetNig1 stdin # featureBits danRer2 refGene:cds netTetNig1 -enrichment # refGene:cds 0.468%, netTetNig1 19.068%, both 0.410%, cover 87.50%, # enrich 4.59x # BACENDS (DONE, 2005-05-20, hartera) # Removed incorrect row from bacCloneXRef table (DONE, 2006-04-19, hartera) # BAC end data provided by Anthony DiBiase, Yi Zhou and Leonard Zon at the # Boston Children's Hospital. Collaborated with them on this track. # Anthony DiBiase: adibiase@enders.tch.harvard.edu # Yi Zhou: yzhou@enders.tch.harvard.edu # BAC clone end sequences are from Robert Geisler's lab, # Max Planck Institute for Developmental Biology, Tuebingen, Germany # REMAKE AND RELOAD bacCloneAlias AND bacCloneXRef TABLES AFTER FIXING # BUGS IN THE zfishBacClonesandSts PROGRAM TO STORE ALL THE ALIASES FOR EACH # SANGER STS NAME AND TO STORE CORRECT INTERNAL AND EXTERNAL NAMES IN ALL # CASES. DATA FILE OF UniSTS IDS AND SANGER STS NAMES WAS ALSO RE-CREATED # AS THIS WAS INCORRECT. (DONE, 2005-05-24, hartera) # ALSO DO TESTING OF THE DATA IN THESE TABLES TO MAKE SURE IT IS CORRECT - SEE # SECTION ON TESTING BELOW. ssh kksilo mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends cd /cluster/data/danRer2/bed/ZonLab/bacends # copy BAC ends sequence from previous assembly cp /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa . faSize bacends.fa # 486978153 bases (39070196 N's 447907957 real 447907957 upper 0 lower) # in 594614 sequences in 1 files # Total size: mean 819.0 sd 230.3 min 0 (zKp108-H09.za) max 5403 (zC259G13.zb) median 796 # N count: mean 65.7 sd 154.7 # U count: mean 753.3 sd 302.6 # L count: mean 0.0 sd 0.0 ssh kkr1u00 cd /cluster/data/danRer2/bed/ZonLab/bacends mkdir -p /iscratch/i/danRer2/bacends # if not split already, split up sequence for cluster runs faSplit sequence bacends.fa 20 /iscratch/i/danRer2/bacends/bacends # iSync bacends to kilokluster /cluster/bin/iSync ssh kk cd /cluster/data/danRer2/bed/ZonLab/bacends ls -1S /iscratch/i/danRer2/bacends/*.fa > bacends.lst ls -1S /iscratch/i/danRer2/trfFa/*.fa > genome.lst cat << '_EOF_' > template #LOOP /cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=/iscratch/i/danRer2/10.ooc {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy mkdir psl gensub2 genome.lst bacends.lst template jobList para create jobList para try, check, push, check, ... # para time # Completed: 5980 of 5980 jobs # CPU time in finished jobs: 5317533s 88625.55m 1477.09h 61.55d 0.169 y # IO & Wait Time: 34446s 574.10m 9.57h 0.40d 0.001 y # Average job time: 895s 14.92m 0.25h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 9892s 164.87m 2.75h 0.11d # Submission to last job: 12309s 205.15m 3.42h 0.14d # back on kksilo (now kkstore01), filter and lift results: ssh kksilo cd /cluster/data/danRer2/bed/ZonLab/bacends pslSort dirs raw.psl temp psl # took 9 hours # With minCover=0.4, 78% of sequences are aligned # No minCover, aligns 90% of sequences but with a large number # of extra alignments - a lot of noise even compared to using minCover of # 0.05. 
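# (Sketch of how one row of the filter comparison below can be
# regenerated from raw.psl - the output filename is illustrative only:
#   nice pslReps -nearTop=0.02 -minCover=0.10 -minAli=0.92 -noIntrons \
#       raw.psl cov10ali92.psl /dev/null
#   awk '{print $10;}' cov10ali92.psl | sort | uniq | wc -l  # sequences aligned
#   wc -l cov10ali92.psl  # alignments, minus any psl header lines )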
# Total BAC ends sequences: 594614 # minCover minAli #alignments #sequences aligned %total sequences aligned # 0 0.85 13969946 535465 90 # 0.05 0.85 8140775 531299 89 # 0.10 0.75 5099823 519238 87 # 0.10 0.85 5098714 519084 87 # 0.10 0.90 5096205 518616 87 # 0.10 0.92 4984170 515024 87 # 0.10 0.95 4264695 498380 84 # 0.20 0.85 4515869 501415 84 # 0.40 0.85 3806487 464067 78 # Using minCover of 0.10 seems good, there are 55017 extra BAC ends # being aligned with an extra 1292227 alignments compared to 0.40, this # is an average of 23.5 alignments per extra sequence compared to 8 alignments # per sequence on average for minCover of 0.40. # Using minAli of 0.90 for minCover 0.10 gives 468 less sequences # aligned and 2509 less alignments. Using minAli of 0.75 increases # the number of alignments by about 1100 compared to using 0.85 and # sequences aligned only increased by about 150. Using minAli of 0.95 # for minCover of 0.10 gives 834,019 less alignments with 20704 less # sequences being aligned than for using minAli of 0.85. # for minAli=0.92 and minCover=0.10, there are 114,544 less alignments # than for minAli=0.85 and 4060 less sequences are aligned. # try minAli=0.85 and minCover=0.40 and also minAli=0.92 and # minCover=0.10 and compare. More alignments are gained - see below # but minCover=0.10 means there are a lot of alignments where < 40% of the # BAC end aligns and this does not seem reliable so stick to minCover=0.4 # Stats for minCover=0.10, minAli=0.92: # Pairs: there are 205360 orphans and 97458 pairs, 1014 long, 19738 mismatch # and 239 slop # Singles: 17829 bacEnds.orphan and 205360 from pair analysis so a total of # 223189 orphans, 7828 more than for danRer1 and more than for using # minCover=0.40 and minAli=0.85. # for minCov=0.10, minAli=0.92: # 90450 pairs.txt # 69699 pairs.txt.uniq - about 4500 more than for minCover=0.40 and minAli=0.85 # 202661 singles.txt # 178337 singles.txt.uniq - about 5399 more than for minCover=0.40,minAli=0.85 # Below, used same parameters as for danRer1: minCover=0.40 and minAli=0.85 # all files below and tables are derived from the alignments using # these parameters for pslReps filtering. # location of store8 has changed to kkstore01 instead of kksilo so now # login there (2005-04-28) ssh kkstore01 cd /cluster/data/danRer2/bed/ZonLab/bacends nice pslReps -nearTop=0.02 -minCover=0.40 -minAli=0.85 -noIntrons raw.psl \ bacEnds.psl /dev/null liftUp bacEnds.lifted.psl /cluster/data/danRer2/jkStuff/liftAll.lft \ warn bacEnds.psl # Got 299 lifts in /cluster/data/danRer2/jkStuff/liftAll.lft wc -l bacEnds.lifted.psl # 3806457 bacEnds.lifted.psl # 4984175 (for minCov=0.10, minAli=0.92) rm -r temp pslCheck bacEnds.lifted.psl >& pslCheck.log # 536 problems with overlapping exons reported by pslCheck awk '{print $10;}' bacEnds.lifted.psl | sort | uniq > bacEnds.names.uniq # edit to remove header lines wc -l bacEnds.names.uniq # 464067 bacEnds.names.uniq grep '>' bacends.fa | wc -l # 594614 bacends.fa # 78% of BAC ends have aligned # Need to add accession information and BAC end sequence pairs mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 grep ">" ../bacends.fa > bacEnds # remove '>' at beginning of each line perl -pi.bak -e 's/>//' bacEnds # remove ^M char from end of lines ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 # copy over necessary files from danRer1 set dr1Bacs = "/cluster/data/danRer1/bed/ZonLab/bacends" cp $dr1Bacs/bacends.1/stripEndChar.pl.gz . 
cp $dr1Bacs/bacends.1/BACEnd_accessions.txt.gz . cp $dr1Bacs/bacends.1/zfish_accs.txt . cp $dr1Bacs/bacends.1/getBacEndInfo.pl.gz . gunzip *.gz ssh kkstore01 cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 perl stripEndChar.pl < bacEnds > allBacEnds.names rm bacEnds bacEnds.bak # Kerstin Jekosch at the Sanger Centre: kj2@sanger.ac.uk # provided accession data for BAC End clones - zfish_accs.txt # and for BAC End pairs - BACEnd_accessions.txt - put these in bacends.1 dir # write and use perl script to split BAC Ends into pairs and singles # bacEndPairs.txt and bacEndSingles.txt # For pslPairs, the reverse primer (T7 or .z*) should be in the first column # and the forward primer (SP6 or .y*) should be in the second column # There are several reads for some sequences and these have similar names. # Reads for the same sequence should be in a comma separated list. # Sequence read names can be translated to external clone names as found # in NCBI's clone registry using the name without the suffix and the # translation of prefixes as follows after removal of dashes # and extra zeros in the name: # library external prefix internal prefix # CHORI-211 CH211- zC # DanioKey DKEY- zK # DanioKey Pilot DKEYP- zKp # RZPD-71 RP71- bZ # BUSM1 (PAC) BUSM1- dZ # e.g. zC001-A03.za becomes CH211-1A03 # make table of accessions for BAC ends to load into bacEndAlias table # find all suffixes used for sequence names cat << '_EOF_' > getSuffixes.pl #!/usr/bin/perl -w use strict; while (<STDIN>) { chomp; my $l = $_; if ($l =~ /^[t1_]*[z|Z|b|d][C|K|Z]p?[0-9]+\-?[a-zA-Z]+[0-9]+\.?([A-Z0-9a-z]+)/) { my $suffix = $1; print "$suffix\n"; } } '_EOF_' chmod +x getSuffixes.pl perl getSuffixes.pl < allBacEnds.names > allBacEnds.suffixes sort allBacEnds.suffixes | uniq > allBacEnds.suffixes.uniq wc -l allBacEnds.suffixes.uniq # 22 different suffixes # SP6, SP6A,SP6W,T7,T7A,T7W,Za,p1kSP6,p1kSP6w,q1kT7,q1kT7w,sp6,t7,ya # yb,yc,yd,yt,za,zb,zc,zd # get suffixes used in BAC ends accessions file perl getSuffixes.pl < BACEnd_accessions.txt > BACEnd.names.suffixes sort BACEnd.names.suffixes | uniq # SP6, SP6A, T7 and T7A - all these endings occur in list of BAC ends # in FASTA file but if the ending is something different, need to add "A" # to SP6 and T7 and check this ending also in the BAC end accessions list # edit getBacEndInfo.pl to make sure it includes all these suffixes # it also treats zK32A7SP6 and zK32A7SP6A as different perl getBacEndInfo.pl allBacEnds.names BACEnd_accessions.txt > bacEnds.log wc -l *.txt # 31317 BACEnd_accessions.txt # 180280 bacEndPairs.txt # 16020 bacEndSingles.txt # 594614 direction.txt wc -l bacEndAccs.aliases # 47558 bacEndAccs.aliases # bacEndAccs.aliases contains sequence read names and their # Genbank accessions. 
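# (Sketch of the cross-check described next; it assumes the read names
# sit in comma-separated lists in the first two columns of
# bacEndPairs.txt and bacEndSingles.txt - the exact column layout is an
# assumption here:
#   awk '{print $1"\n"$2}' bacEndPairs.txt bacEndSingles.txt \
#       | tr ',' '\n' | grep -v '^$' | sort | uniq > namesInLists.txt
#   sort allBacEnds.names | uniq | comm -23 - namesInLists.txt | wc -l
# 0 expected, i.e. no read name is missing from the two lists)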
# checked that all names from allBacEnds.names appear in either # bacEndPairs.txt or bacEndSingles.txt # First process BAC end alignments ssh kkstore01 mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/pairs cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs set bacDir = /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 # these bacEnds vary in size from around 2800 bp to 626,000 bp ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \ -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndPairs.txt \ all_bacends bacEnds wc -l bacEnds.* # There are 1411 more orphans and 18147 more pairs than for danRer1 # 980 bacEnds.long # 18083 bacEnds.mismatch # 201646 bacEnds.orphan # 90967 bacEnds.pairs # 0 bacEnds.short # 190 bacEnds.slop # create header required by "rdb" tools ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\ > ../header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header # make pairs bed file cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed # also need to process bacEndSingles.txt into a database table # for singles in bacEndSingles.txt, create a dummy file where they # are given zJA11B12T7 as dummy sequence pair. If the single is a forward # sequence, put the dummy sequence in the second column, if the single is # a reverse sequence put it in the first column. Use a perl script to do this. ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/bacends set bacDir = /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 mkdir singles cd singles cp /cluster/data/danRer1/bed/ZonLab/bacends/singles/formatSingles.pl.gz . gunzip formatSingles.pl.gz perl formatSingles.pl $bacDir/bacEndSingles.txt > \ $bacDir/bacEndSingles.format # then run pslPairs on this formatted sequence ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \ -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndSingles.format \ all_bacends bacEnds wc -l bacEnds* # 0 bacEnds.long # 0 bacEnds.mismatch # 15853 bacEnds.orphan # 0 bacEnds.pairs # 0 bacEnds.short # 0 bacEnds.slop # there are 15853 orphans here (14126 in danRer1) and 201646 from # pair analysis, so a total of 217499 orphans (2138 more than danRer1) cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles wc -l bacEnds.singles # 217499 bacEnds.singles # make singles bed file cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndSingles.bed cp bacEndSingles.bed ../pairs cd ../pairs # all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad # since orphans are already in bacEndSingles, do not add these cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBad.bed # add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans # twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBadNoOrphans.bed extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \ bacEndPairsBadNoOrphans.bed bacEndSingles.bed | \ sorttbl tname tstart | headchg -del > bacEnds.load.psl # 
There are rows where the alignments were the same but the lfNames are # different. This is due to the presence of multiple reads for the # same BAC end sequence. Sometimes they are slightly different lengths # so the alignments are a little different. It would be good to consolidate # all of these. Firstly, the identical rows were merged into one with a # list of all the lfNames corresponding to that alignment. # load all alignments into temporary database ssh hgwdev echo "create database bacsDr2_rah;" | hgsql danRer2 cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs hgLoadBed bacsDr2_rah bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 84405 elements of size 11 # Loaded 72272 elements of size 11 (danRer1) # create a bacEndSingles table like bacEndPairs if not created already cp $HOME/kent/src/hg/lib/bacEndPairs.sql ../singles/bacEndSingles.sql # edit to give correct table name # or copy over table definition from danRer1 cp /cluster/data/danRer1/bed/ZonLab/bacends/singles/bacEndSingles.sql.gz \ ../singles/ gunzip ../singles/bacEndSingles.sql.gz hgLoadBed bacsDr2_rah bacEndSingles bacEndSingles.bed \ -sqlTable=../singles/bacEndSingles.sql # Loaded 196492 elements of size 11 # Loaded 203683 elements of size 11 (danRer1) # load all_bacends later # NOTE - this track isn't pushed to RR, just used for assembly QA # Use bacEndPairsBadNoOrphans.bed as orphans are in the singles bed file hgLoadBed bacsDr2_rah bacEndPairsBad bacEndPairsBadNoOrphans.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 18544 elements of size 11 # Need to consolidate similar rows for bacEndPairs and bacEndSingles - same # name, different lfNames and same alignments. mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/duplicates cd /cluster/data/danRer2/bed/ZonLab/bacends/duplicates hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndPairs order by name, chrom, chromStart;" \ bacsDr2_rah > pairs.txt sort pairs.txt | uniq -c > pairs.txt.uniq wc -l pairs* # 84405 pairs.txt # 65105 pairs.txt.uniq # for danRer1: # 72272 pairs.txt # 53992 pairs.txt.uniq # for replicate rows, find all the unique lfNames and put these # in one row with the relevant lfStarts, lfSizes and correct lfCount cp \ /cluster/data/danRer1/bed/ZonLab/bacends/duplicates/removeReplicates.pl.gz . gunzip removeReplicates.pl.gz # modify script so that the database to use is supplied as an argument nice perl removeReplicates.pl pairs.txt.uniq bacEndPairs bacsDr2_rah \ > pairsNoReps.bed # repeat for singles hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndSingles order by name, chrom, chromStart;" bacsDr2_rah \ > singles.txt sort singles.txt | uniq -c > singles.txt.uniq wc -l singles* # 196492 singles.txt # 173025 singles.txt.uniq # 203683 singles.txt (danRer1) # 179170 singles.txt.uniq (danRer1) nice perl removeReplicates.pl singles.txt.uniq bacEndSingles bacsDr2_rah \ > singlesNoReps.bed # Using badPairs with no orphans hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndPairsBad order by name, chrom, chromStart;" bacsDr2_rah \ > badPairs.txt sort badPairs.txt | uniq -c > badPairs.txt.uniq wc -l badPairs* # 18544 badPairs.txt # 13803 badPairs.txt.uniq nice perl removeReplicates.pl badPairs.txt.uniq bacEndPairsBad bacsDr2_rah \ > badPairsNoReps.bed # sort by chrom, chromStart - done already # Wrote perl script to choose 2 BAC ends to represent each BAC clone. 
# since there are often more than one read for each BAC end in this set, # 2 were chosen for each BAC pair or 1 for the singles. This was based on # the ones that had the largest region aligned (using lfSizes). ssh kkstore01 cd /cluster/data/danRer2/bed/ZonLab/bacends/duplicates # run perl script: input bed file, pairs or singles, name of output file perl pickLfNames.pl pairsNoReps.bed pairs pairs2lfNames.bed mv error.log log.pairs # this log.pairs lists the 30 cases where alignments for a BAC clone use # a different pair of sequence reads for the ends. These were checked out # and, in most cases, the alignments are almost identical or overlap for # the most part so it is ok to have these extra alignments missing. # CH211-69C14: zC069-C14.ya hits almost 100,000 bp downstream # of the .yb version. It is slightly longer and it has a good hit by BLAT # also to chrom 15. This is the alignment that is missing of the two for # this clone. # run script for singles: perl pickLfNames.pl singlesNoReps.bed singles singles1lfName.bed mv error.log log.singles # Singles: there are 80 cases where the alignments for the BAC clone use # a different sequence for the reads. badPairs: there are 13 cases. # These were not included in the new bed file and they were checked out - # the BAC clone names for these are in the log file. # For Singles: In all but 3 cases, the alignments were almost # identical or mostly overlapped the alignments for the other sequence # read chosen to represent that BAC end in the bed file. # CH211-98J20, CH211-98O21 and CH211-98G4 have alignments for other # sequence reads for the same end represented but the alignments are to # different chromosomes (see singlesNoReps.bed). perl pickLfNames.pl badPairsNoReps.bed pairs badPairs2lfNames.bed mv error.log log.badPairs # for each of these new bed files, checks were made that there are # only 2 BAC ends per alignment for pairs and 1 for singles. # For each pair, there should only be 2 ends which can appear either # way round depending on the orientation and there should be 1 end for # the beginning (suffix T7, t7 or z) and one end for the end # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g. # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite # orientation. For singles, there should be a single BAC end for each # alignment and for each BAC clone, a sequence for either or both types # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate # alignments. # Finally overlaps in BAC clone names were checked. All BAC clones # represented in each of the pairs, badPairs and singles bed files are # unique to that file. Between all three bed files, 173188 BAC clones # have alignments. ssh hgwdev # NOTE: using sort and uniq on hgwdev produces tab delimited output # after merging rows with the same BAC name, the scoring is now # wrong in the bed files. # Scores should be 1000 if there is 1 row for that name, else # 1500/number of rows for that sequence name - calculated by pslPairs. # Correct the scores. mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/scores cd /cluster/data/danRer2/bed/ZonLab/bacends/scores # copy over correctScores.pl script from danRer1 cp /cluster/data/danRer1/bed/ZonLab/bacends/scores/correctScores.pl.gz . cp /cluster/data/danRer1/bed/ZonLab/bacends/scores/checkScores.pl.gz . gunzip *.gz # modify correctScores.pl to correctScores2.pl with extra argument to # specify if there is a bin field. 
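# (The scoring rule restated as a sketch - correctScores2.pl is assumed
# to apply this per clone name, bed column 4, to whole bed lines; the
# integer truncation here only approximates the rounding that happens on
# database load:
#   awk '{n[$4]++} END {for (c in n) \
#       printf "%s\t%d\n", c, (n[c]==1 ? 1000 : 1500/n[c])}' \
#       ../duplicates/pairs2lfNames.bed )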
awk '{print $4}' ../duplicates/pairs2lfNames.bed \ | sort | uniq -c > pairs.hits perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits noBin \ > bacEndPairsGoodScores.bed # same for singles awk '{print $4}' ../duplicates/singles1lfName.bed \ | sort | uniq -c > singles.hits perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits noBin \ > bacEndSinglesGoodScores.bed # and for badPairs awk '{print $4}' ../duplicates/badPairs2lfNames.bed \ | sort | uniq -c > badPairs.hits perl correctScores2.pl ../duplicates/badPairs2lfNames.bed badPairs.hits \ noBin > bacEndPairsBadGoodScores.bed # check that the scores are correct now: awk 'BEGIN {OFS = "\t"} {print $4, $5}' bacEndPairsGoodScores.bed \ | sort | uniq -c > pairs.count perl checkScores.pl < pairs.count # all the BAC clones should be in good.txt and none in bad.txt # wc -l should give same number of lines in good.txt as in pairs.hits # check all the GoodScores bed files in this manner - all are correct # for singles, 3 end up in bad.txt but it is a rounding error, # there are 7 alignments for each of these: CH211-210N9, CH211-30H16 # and DKEY-31H18. Score is: 214.285714285714 # this will be rounded to 214 when loading the database table # Now load database tables: hgLoadBed danRer2 bacEndPairs bacEndPairsGoodScores.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 65075 elements of size 11 hgLoadBed danRer2 bacEndSingles bacEndSinglesGoodScores.bed \ -sqlTable=../singles/bacEndSingles.sql # Loaded 172945 elements of size 11 # load of bacEndSingles did not go as planned: 172945 record(s), # 0 row(s) skipped, 29 warning(s) loading bed.tab # warnings are unknown but all of bed file loaded and the number # of warnings is small so ignore hgLoadBed danRer2 bacEndPairsBad bacEndPairsBadGoodScores.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 13790 elements of size 11 # load BAC end sequences into seq table so alignments may be viewed # symlink to bacends.fa sequences in danRer1 mkdir -p /gbdb/danRer2/bacends ln -s /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa \ /gbdb/danRer2/bacends/BACends.fa hgLoadSeq danRer2 /gbdb/danRer2/bacends/BACends.fa ssh kkstore01 cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs # for all_bacends table, just load the alignments for those sequences # represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables # bacEnds.load.psl is the file of alignments # get all the names of sequences foreach f (../scores/*GoodScores.bed) echo $f awk '{print $11;}' $f >> allBacEnds.names end wc -l allBacEnds.names # 251810 allBacEnds.names # this is the total number of lines in the *GoodScores.bed files perl -pi.bak -e 's/,/\n/g' allBacEnds.names sort allBacEnds.names | uniq > allBacEnds.names.uniq # write script to go through and pick alignments cat > getAlignments.pl << 'EOF' #!/usr/bin/perl -w use strict; my $bacAligns = $ARGV[0]; # psl alignments of BAC ends my $bacs = $ARGV[1]; # list of BAC ends for which to select alignments open(ALIGNS, $bacAligns) || die "Can not open $bacAligns: $!\n"; open(BACS, $bacs) || die "Can not open $bacs:$!\n"; my %alignsHash; # read alignments and store while (<ALIGNS>) { my $l = $_; my @fields = split(/\t/, $l); # hash key is BAC end name push (@{$alignsHash{$fields[9]}}, $l); } close ALIGNS; while (<BACS>) { chomp; my $end = $_; if (exists($alignsHash{$end})) { my @als = @{$alignsHash{$end}}; foreach my $a (@als) { print $a; } } } close BACS; 'EOF' chmod +x getAlignments.pl # get alignments for just the BAC ends that are in the database tables 
perl getAlignments.pl bacEnds.load.psl allBacEnds.names.uniq \
    > bacEnds.load.inTables.psl
# check that alignments are present for all the BAC ends in the list
# alignments for all BAC ends in allBacEnds.names.uniq are there
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
# load all_bacends table
hgLoadPsl danRer2 -table=all_bacends bacEnds.load.inTables.psl
# load of all_bacends did not go as planned: 2615155 record(s), 0 row(s)
# skipped, 81 warning(s) loading psl.tab
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
# make bacEndAlias table with Genbank accessions for ends
# need to run getBacEndInfo.pl for the BAC end names in the
# BAC tables.
# in the pairs directory, there is the allBacEnds.names.uniq file
# so use this.
mkdir bacEndAccs
cd bacEndAccs
perl ../getBacEndInfo.pl ../../pairs/allBacEnds.names.uniq \
    ../BACEnd_accessions.txt > bacEndAccs.aliases.log
mv bacEndAccs.aliases bacEndAccs.aliases.intables
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1/bacEndAccs
echo "drop table bacEndAlias" | hgsql danRer2
hgsql danRer2 < $HOME/kent/src/hg/lib/bacEndAlias.sql
echo "load data local infile 'bacEndAccs.aliases.intables' into table \
    bacEndAlias" | hgsql danRer2
# clonemarkers.02.12.04.txt, ctgnames.02.12.04.txt and markers.02.12.04.txt
# already downloaded from Sanger for danRer1 (see makeDanRer1.doc)
ssh kkstore01
mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
set dr1=/cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
set dr2=/cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
ln -s $dr1/clonemarkers.02.12.04.txt $dr2
ln -s $dr1/ctgnames.02.12.04.txt $dr2
ln -s $dr1/markers.02.12.04.txt $dr2
ln -s $dr1/zfish_accs.txt $dr2
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
hgsql -N -e 'select name from bacEndPairs;' danRer2 > bacEnds.names
hgsql -N -e 'select name from bacEndSingles;' danRer2 >> bacEnds.names
hgsql -N -e 'select name from bacEndPairsBad;' danRer2 >> bacEnds.names
sort bacEnds.names | uniq > bacEnds.names.uniq
# 173188 bacEnds.names.uniq
# 594614 bacEnd sequences to start with
# some of the bacEnd sequences are replicates so count lfNames from table
hgsql -N -e 'select lfNames from bacEndPairs;' danRer2 > bacEnds.lfNames
hgsql -N -e 'select lfNames from bacEndSingles;' danRer2 >> bacEnds.lfNames
hgsql -N -e 'select lfNames from bacEndPairsBad;' danRer2 >> bacEnds.lfNames
perl -pi.bak -e 's/,/\n/g' bacEnds.lfNames
sort bacEnds.lfNames | uniq > bacEnds.lfNames.uniq
# 303946 bacEnds.lfNames.uniq
# pslCheck reports 536 bad alignments
# get the names from the psl file
awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
# edit to remove first few lines with no names
sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq
# 464067 bacEndsPsl.names.uniq
# about 78% align - after filtering
# Add an alias table for BAC clones
# bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
# Add an xref table to give external clone registry names, internal names,
# Sanger name, relationship between STS and BAC clone (method of finding
# STS), UniSTS ID, chromosome(s) to which the BAC clone is mapped by BLAT,
# Genbank accession and STS primer sequences
# bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
wc -l *.02.12.04.txt
#  28305 clonemarkers.02.12.04.txt
# 167441 ctgnames.02.12.04.txt
#  12008 markers.02.12.04.txt
# create list of BAC clones in the tables with the chromosome aligned to
hgsql -N -e 'select name, chrom from bacEndPairs;' danRer2 \
    > bacClones.namesandchrom
hgsql -N -e 'select name, chrom from bacEndSingles;' danRer2 \
    >> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
# use zfish_accs.txt provided by the Sanger Centre to create a list
# of BAC clone names, internal names and Genbank accessions
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
# get list of UniSTS IDs using aliases to search alias file
# print Sanger name, alias and UniSTS ID; based on print_markers2.pl,
# a modified version of a program written by Tony DiBiase (Boston
# Children's Hospital)
# create find_markers3.pl as the previous version was finding too
# many UniSTS IDs for some markers because it used grep
# (2005-05-24, hartera)
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>) {
   chomp;
   ($id, $al) = split /\t/;
   @alsArray = split(/;/, $al);
   foreach my $as (@alsArray) {
      push (@{$aliasHash{$as} }, $id);
   }
}
close ALIAS;
while (<MARKERS>) {
   my @idArray;
   ($f, $t, $m, $idArray[0]) = (0, 0, 0, 0);
   my @ids;
   chomp;
   ($a, $b, $aliases, @rest) = split /\|/;
   if ($verbose > 3) { printf "aliases $aliases \n"; }
   @alias = split /;/, $aliases;
   ALIAS: foreach $s (@alias) {
      if ($s =~ /[\D]+/) {
         if ($verbose > 5) { printf "this $s \n"; }
         if (exists($aliasHash{$s})) {
            @idArray = @{$aliasHash{$s}};
         }
         if ($idArray[0]) {
            $f = 1;
            $t = $s;
            @ids = @idArray;
            if ($verbose) { printf "this $s found $m \n"; }
            last ALIAS;
         }
      }
   }
   if ($f) {
      my @sNames = split(/;/, $b);
      foreach my $sn (@sNames) {
         foreach my $i (@ids) {
            printf "$sn\t$i\n";
         }
      }
   }
}
close MARKERS;
'_EOF_'
chmod +x find_markers3.pl
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases \
    markers.02.12.04.txt > sangerandUniSTSId.txt
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. FPC contig number) from ctgnames file is
# not included in the tables as these are dynamic and constantly
# changing with the assembly.
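# As a quick sanity check (illustrative only, not part of the original
# run), confirm the field layout find_markers3.pl assumes: markers lines
# are pipe-delimited with Sanger names in field 2, aliases in field 3 and
# primers in fields 4-5, so at least 5 fields per line; UniSTS.aliases
# lines are tab-delimited ID/alias pairs, so 2 fields per line.
awk 'BEGIN {FS="|"} {print NF}' markers.02.12.04.txt | sort -n | uniq -c
awk 'BEGIN {FS="\t"} {print NF}' \
    /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases | sort -n | uniq -c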
# Received a new version of the zfish_accs.txt file:
# zfish_accs050605.txt (from Mario Caccamo at Sanger), move this to
# /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
awk 'BEGIN {OFS="\t"} {print $1, $2}' zfish_accs.txt | sort \
    > zfishOld.accs.sort
awk 'BEGIN {OFS="\t"} {print $1, $2}' zfish_accs050605.txt \
    | sort > zfishNew.accs.sort
comm -13 zfishOld.accs.sort zfishNew.accs.sort > zfishNewAccs.only
comm -23 zfishOld.accs.sort zfishNew.accs.sort > zfishOldAccs.only
# 1974 zfishNewAccs.only
# 303 zfishOldAccs.only
# this has 1974 new BAC clone and accession pairs, and 303 of the previous
# BAC internal name and accession pairs are missing compared to
# zfish_accs.txt
awk '{print $1};' zfishOldAccs.only | sort > zfishOld.namesonly.sort
awk '{print $1};' zfishNewAccs.only | sort > zfishNew.namesonly.sort
# also uniq and check no name has more than one accession - each BAC
# name only appears once in the file
comm -12 zfishOld.namesonly.sort zfishNew.namesonly.sort \
    > accs.OldandNew.common
wc -l accs.OldandNew.common
# 24 accs.OldandNew.common
# therefore 24 BAC names have new accessions in the new file
# (a join-based sketch for listing these appears at the end of this section)
# zfishBacClonesandSts.c only reads the first two fields of each line
# in the zfish_accs.txt file so create a new file with just these 2 fields.
# Take the 303 BAC clones and accessions from zfish_accs.txt that are
# not in zfish_accs050605.txt and add them to the 9000 accessions in
# zfishNew.accs.sort
cat zfishNew.accs.sort zfishOldAccs.only > zfish_accsMerged.txt
# 9303 zfish_accsMerged.txt - BAC clone internal names and accessions
# use zfishBacClonesandSts to create tab files for loading into
# bacCloneAlias and bacCloneXRef tables
mkdir -p /cluster/bluearc/danRer2/bacEnds/out
# added code so that output directory now must be specified
# fixed bug in code so that all aliases are now stored for each Sanger
# STS name and the correct BAC clone external name is stored for internal
# names in all cases.
# remake tab files using the new file mapping Sanger names and UniSTS IDs
# and with the corrected program (2005-05-24, hartera)
nice $HOME/bin/i386/zfishBacClonesandSts ctgnames.02.12.04.txt \
    clonemarkers.02.12.04.txt markers.02.12.04.txt \
    bacClones.namesandchrom.uniq zfish_accsMerged.txt \
    sangerandUniSTSId.txt \
    /cluster/bluearc/danRer2/bacEnds/out > zfishBacs.out
# output is in /cluster/bluearc/danRer2/bacEnds/out so copy over
# sort alias tab file by sangerName
sort -k2 /cluster/bluearc/danRer2/bacEnds/out/bacAlias.tab \
    > bacAlias.sort.tab
cp /cluster/bluearc/danRer2/bacEnds/out/bacXRef.tab .
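# An equivalent way to list the 24 clones whose accession changed (a
# sketch for illustration; this was not run as part of the build): join
# the old and new name/accession pairs on clone name and keep rows where
# the accessions differ.
join zfishOld.accs.sort zfishNew.accs.sort \
    | awk '$2 != $3 {print $1, $2, $3}' > accsChanged.txt
wc -l accsChanged.txt
# should be 24, matching accs.OldandNew.common above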
wc -l *.tab # 55836 bacAlias.sort.tab # 233418 bacXRef.tab # there are 234056 entries for danRer1 bacXRef.tab - the difference # reflects differences in BAC ends mapped to the assemblies ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases # Remove and reload tables after bug fixes in the zfishBacClonesandSts # program and the new UniSTS IDs and Sanger names mapping file which # were used to create these tab files (2005-05-25, hartera) hgsql danRer2 -e 'drop table bacCloneXRef' hgsql danRer2 -e 'drop table bacCloneAlias' hgsql danRer2 < $HOME/kent/src/hg/lib/bacCloneAlias.sql hgsql danRer2 < $HOME/kent/src/hg/lib/bacCloneXRef.sql nice hgsql danRer2 -e \ 'load data local infile "bacAlias.sort.tab" into table bacCloneAlias' nice hgsql danRer2 -e \ 'load data local infile "bacXRef.tab" into table bacCloneXRef' cd $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer2 # checked data in tables to check that everything was correctly loaded # from the files and errors were corrected - see TEST section below # edit trackDb.ra to add bacEnds tracks and searches for the bacEndPairs # and bacEndSingles tracks as for danRer1. copy over html from danRer1 # for bacEndPairs and bacEndSingles tracks. Edit to include description # of filtering alignments so that there is only one pair of sequence reads # for each BAC pair and only one sequence read for each BAC single alignment. # Remove incorrect row from bacCloneXRef table (hartera, 2006-04-19) # There is one row where the name is "CH211-155M11__" and the # intName is "zC155M11__" which is incorrect. # There is a correct row for intName zC155M11 and zC155M11__ and # CH211-155M11__ is not in bacEnd{Singles,Pairs,PairsBad} tables and # zC155M11__ is not an alias in bacEndAlias. hgsql -e 'delete from bacCloneXRef where name = "CH211-155M11__";' danRer2 # corrected termRegex for some bacCloneXRef searches so that they work # correctly (bacPairsSangerSts and bacSinglesSangerSts) # (2006-04-19, hartera) # BACENDS: TESTING FOR bacCloneAlias and bacCloneXRef TABLES # (DONE, 2005-05-25, hartera) # ADDED TEST TO CHECK SANGERNAMES IN ALIAS AND XREF TABLES # (DONE, 2005-05-26, hartera) # The following tests were carried out to check that all the data # in the bacCloneAlias and bacCloneXRef tables is correct. 
mkdir -p \
    /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
    ../markers.02.12.04.txt > sNameandaliases
# write script to get one Sanger name and one alias on each line
# (a rough equivalent is sketched at the end of this section)
chmod +x getSangerAndAlias.pl
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
# get Sanger names and aliases from database
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer2 \
    | sort | uniq > alias.db.sort
wc -l alias.db.sort
# 55836 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# No difference between data file and data from database so ok
# Check Sanger STS names correspond in the bacCloneAlias and bacCloneXRef
# tables (2005-05-26, hartera)
# get Sanger names from alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer2 \
    | sort | uniq > sName.alias.sort
wc -l sName.alias.sort
# 14940 sName.alias.sort
# get Sanger names from xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
    is not null;' danRer2 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
# 15153 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing in alias file only so all Sanger names in the alias table are
# also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
# 213 sNamexRefNotAlias - 213 sangerNames in the xRef table that are not
# in the alias table
# get Sanger names from clonemarkers file
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.02.12.04.txt | sort | uniq \
    > clonemarkers.sNames.sort
# get Sanger names from markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.02.12.04.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
    > sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
# 213 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# no difference so all the extra Sanger names in the xRef table are
# from the clonemarkers file; these have no aliases in the markers file,
# so they are not in the alias table, and this is all ok.
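# getSangerAndAlias.pl itself is not reproduced in this doc; a rough awk
# equivalent of what it must do, assuming input lines of
# "sangerName<TAB>alias1;alias2;...", is:
awk 'BEGIN {FS="\t"; OFS="\t"} \
    {n = split($2, a, ";"); \
     for (i = 1; i <= n; i++) if (a[i] != "") print $1, a[i];}' \
    sNameandaliases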
# Check that Sanger STS names and primers are associated correctly cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables # get sanger names and primers from markers file awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \ ../markers.02.12.04.txt > sNameandPrimers # write script to reformat and write with one Sanger name per line chmod +x getSangerandPrimers.pl perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format sort sNameandPrimers.format > sNameandPrimers.format.sort wc -l sNameandPrim* # 12008 sNameandPrimers # 14940 sNameandPrimers.format # 14940 sNameandPrimers.format.sort # get Sanger names and primers from database hgsql -N -e \ 'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \ where sangerName is not null and leftPrimer is not null and \ rightPrimer is not null;' danRer2 | sort | uniq \ > sNamesandprimers.fromdb.sort wc -l sNamesandprimers.fromdb.sort # 14940 sNamesandprimers.fromdb.sort diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort # No difference so ok # Check that UniSTS IDs and Sanger STS names are associated correctly # get Sanger names and UniSTS IDs from the database hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \ uniStsId is not null;' danRer2 | sort | uniq > sNameUniSTS.fromdb.sort wc -l sNameUniSTS.fromdb.sort # 5456 sNameUniSTS.fromdb.sort # Need to reformat the sNameUniSTS.fromdb.sort chmod +x formatUniSts.pl perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \ > sNameUniSTS.fromdb.format.sort # get Sanger names from data file and see how many UniSTS IDs there are # for each name awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \ > sangerandUniSTSId.count # the most is 3 UniSTS IDs # 3 etID9056.23 # 3 etID9042.2 # 3 etID8627.2 # 3 etID8281.9 # 3 etID11096.5 # 2 etID9986.14 # 2 etID9968.17 sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \ > sangerandUniSTSIdvsdb # No difference between data from original file and that in database so ok # Check that chrom mappings and external BAC clone names are correct # get extNames and chroms they map to from the database hgsql -N -e 'select name, chroms from bacCloneXRef where \ chroms is not null;' danRer2 | sort | uniq \ > nameandchromsfromdb.sort # reformat nameandchromsfromdb.sort perl formatUniSts.pl < nameandchromsfromdb.sort | sort \ > nameandchromsfromdb.format.sort # compare extNames and chroms from db to those in data file diff ../bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort # no difference - all ok # Check Genbank accessions and internal BAC clone names hgsql -N -e 'select intName,genbank from bacCloneXRef where \ genbank is not null;' danRer2 | sort | uniq \ > intNamesandAccs.fromdb.sort # this should be a subset of zfish_accsMerged.txt - not all BAC clones # listed here appear in either our BAC ends tracks or the markers files. 
sort ../zfish_accsMerged.txt > zfish_accsMerged.sort
comm -23 intNamesandAccs.fromdb.sort zfish_accsMerged.sort
# there is nothing in the database that is not in zfish_accsMerged.sort
comm -13 intNamesandAccs.fromdb.sort zfish_accsMerged.sort > onlyinzfishAccs
wc -l onlyinzfishAccs
# 87 onlyinzfishAccs
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
    danRer2 | sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' zfish_accsMerged.sort | sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
    > indbNoAccsandAccs.out
# none of these names are common to both so all accessions from
# zfish_accsMerged.txt are in the database for the internal names stored
# where there are accessions available.
# Test Sanger STS names, internal names and external names are all correct
# Test Sanger STS name and internal BAC clone names are associated correctly
# get internal names and Sanger names from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNameandSanger.sort
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
    where sangerName is not null;' danRer2 \
    | sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# No difference between data from file and that from database so ok
# Check BAC clone internal name and relationship fields
# get internal names and relationships from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNameandRelation.sort
# get internal names and relationships from database
hgsql -N -e 'select intName, relationship from bacCloneXRef \
    where relationship != 0;' danRer2 \
    | sort | uniq > intNameandrelation.fromdb.sort
# differences unique to database file
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.indbonly
# differences unique to data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.incloneMarkersonly
wc -l intNameRelation*
# 4345 intNameRelation.incloneMarkersonly
# 4345 intNameRelation.indbonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
    > intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names with relationship fields.
# Since the names match, the only place these files should differ is the
# second column, which should always be 3 in the data from the database.
# This is because all the relationship entries that were blank in the
# clonemarkers file were changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct so all ok
# all the differences are that those that are blank in clonemarkers
# are 3 in the database.
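# To make the blank-to-3 rule concrete (illustrative only; this was not
# run during the build), the transform can be applied directly to the
# data file and the result compared with the database dump:
awk 'BEGIN {FS="|"; OFS="\t"} {r = ($3 == "") ? 3 : $3; print $1, r}' \
    ../clonemarkers.02.12.04.txt | sort | uniq \
    > intNameandRelation.asdb.sort
diff intNameandRelation.asdb.sort intNameandrelation.fromdb.sort
# should show no difference if the load behaved as described above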
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from cloneMarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
    danRer2 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files as expected so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.02.12.04.txt \
    | sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer2 \
    | sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
# 218100 intNameandextName.fromdb.sort
# 167441 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
# 167441 intandextindbAndCtgNames
# there are 167441 name pairs common between the file and the database
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database so all internal and
# external names from the ctgNames file are in the database
# get the list of extra ones from db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
# 50659 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt | sort | uniq \
    > clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
# 12987 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clone markers internal names are in this list so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC clones
# to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from bacClones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
    extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.02.12.04.txt \
    >> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
# 384421 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
    > extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
# 218100 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer2 | sort | uniq \
    > extNames.fromdb.sort
wc -l extNames.fromdb.sort
# 218100 extNames.fromdb.sort
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
    > extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
# 218100 extNames.fromdbandfiles
# find extNames in common from data files and database
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from files are in db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.02.12.04.txt \
    > intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.02.12.04.txt \
    >> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
# 195746 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
    > intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
# 167441 intNames.ctgnamesandclonemarkers.sort
# get internal names from database
hgsql -N -e 'select intName from bacCloneXRef;' danRer2 | sort | uniq \
    > intNames.fromdb.sort
wc -l intNames.fromdb.sort
# 218100 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames
# all of the intNames from the file should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
    > intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
# 167441 intNames.fromdbandfiles
diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from files are in db
# Check that all translations are correct between BAC clone
# external and internal names.
# write script to get the prefixes from internal and external names
# (a rough equivalent is sketched at the end of this section)
chmod +x getNamePrefixes.pl
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer2 \
    | sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
    > extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good
# BUSM1 dZ
# CH211 zC
# CH211 zc
# CH73 zH
# CT7 bP
# DKEY zK
# DKEY zk
# DKEYP zKp
# RP71 bZ
# XX bY
# zk is an internal name prefix for the external name prefix, DKEY-. There
# is only one example where this is used (DKEY-81G7); this is in the
# ctgnames file and in the bacCloneXRef table so that is ok.
# All data looks good in these tables now.
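# getNamePrefixes.pl is also not shown in this doc; a rough awk
# equivalent, assuming the external prefix is everything before the
# first "-" and the internal prefix is the leading run of letters
# before the first digit:
awk 'BEGIN {OFS="\t"} \
    {e = $1; i = $2; sub(/-.*$/, "", e); sub(/[0-9].*$/, "", i); \
     print e, i}' extandintNames.fromdb.sort | sort | uniq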
# Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/zebrafish_image.jpg' -O zebrafish_image.jpg # no crop images for this one, make the thumbnail directly: convert -geometry 300x200 -quality 80 -sharpen 0 -normalize \ zebrafish_image.jpg Danio_rerio.jpg cp -p Danio_rerio.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # EXTRACT AXTs and MAFs FROM FUGU (fr1) NET (DONE, 2004-12-22, hartera) # GZIP AXTs and MAFs (DONE, 2005-05-16, hartera) ssh kksilo # create axts cd /cluster/data/danRer2/bed/blastz.fr1/axtChain gunzip fuguFr1.net.gz gunzip chainAR/*.chain.gz netSplit fuguFr1.net fuguFr1Net mkdir -p ../axtNet cat > axtNet.csh << 'EOF' foreach f (fuguFr1Net/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt fuguFr1Net/$c.net chainAR/$c.chain \ /cluster/data/danRer2/nib \ /cluster/data/fr1/nib ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end 'EOF' chmod +x axtNet.csh csh axtNet.csh >&! axtNet.log & tail -100f axtNet.log # sort axts before making mafs - must be sorted for multiz cd /cluster/data/danRer2/bed/blastz.fr1 mv axtNet axtNet.unsorted mkdir axtNet foreach f (axtNet.unsorted/*.axt) set c = $f:t:r echo "Sorting $c" axtSort $f axtNet/$c.axt end # create maf ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1 cd axtNet mkdir ../mafNet cat > makeMaf.csh << 'EOF' foreach f (chr*.axt) set maf = $f:t:r.fr1.maf echo translating $f to $maf axtToMaf $f \ /cluster/data/danRer2/chrom.sizes /cluster/data/fr1/chrom.sizes \ ../mafNet/$maf -tPrefix=danRer2. -qPrefix=fr1. end 'EOF' chmod +x makeMaf.csh csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log cd /cluster/data/danRer2/bed/blastz.fr1/axtChain nice gzip fuguFr1.net chainAR/*.chain & # gzip axts and mafs (hartera, 2005-05-16) ssh kkstore01 cd /cluster/data/danRer2/bed/blastz.fr1 nice gzip axtNet/*.axt mafNet/*.maf & # EXTRACT AXTs and MAFs FROM TETRAODON (tetNig1) NET (DONE, 2004-12-22, hartera) # GZIP AXTs and MAFs AND CHAINS (DONE, 2005-05-16, hartera) ssh kksilo # create axts cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain netSplit tetraTetNig1.net tetraTetNig1Net mkdir -p ../axtNet cat > axtNet.csh << 'EOF' foreach f (tetraTetNig1Net/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt tetraTetNig1Net/$c.net chainAR/$c.chain \ /cluster/data/danRer2/nib \ /cluster/data/tetNig1/nib ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end 'EOF' chmod +x axtNet.csh csh axtNet.csh >&! axtNet.log & tail -100f axtNet.log # sort axts before making mafs - must be sorted for multiz cd /cluster/data/danRer2/bed/blastz.tetNig1 mv axtNet axtNet.unsorted mkdir axtNet foreach f (axtNet.unsorted/*.axt) set c = $f:t:r echo "Sorting $c" axtSort $f axtNet/$c.axt end # create maf ssh kksilo cd /cluster/data/danRer2/bed/blastz.tetNig1 cd axtNet mkdir ../mafNet cat > makeMaf.csh << 'EOF' foreach f (chr*.axt) set maf = $f:t:r.tetNig1.maf echo translating $f to $maf axtToMaf $f \ /cluster/data/danRer2/chrom.sizes /cluster/data/tetNig1/chrom.sizes \ ../mafNet/$maf -tPrefix=danRer2. -qPrefix=tetNig1. end 'EOF' chmod +x makeMaf.csh csh makeMaf.csh >&! 
makeMaf.log & tail -100f makeMaf.log # gzip axts and mafs (hartera, 2005-05-16) ssh kkstore01 cd /cluster/data/danRer2/bed/blastz.tetNig1 nice gzip axtNet/*.axt mafNet/*.maf & # gzip chains again cd axtChain nice gzip chainAR/*.chain & # TIGR GENE INDEX (DONE, 2004-12-22, hartera) # Data from rsultana@tigr.org (Razvan Sultana at TIGR) ssh kksilo mkdir -p /cluster/data/danRer2/bed/tigr cd /cluster/data/danRer2/bed/tigr wget ftp://ftp.tigr.org/pub/data/tgi/Danio_rerio/TGI_track_danRer2_12-2004.tgz tar xvzf TGI*.tgz foreach f (*g_gallus*) set f1 = `echo $f | sed -e 's/g_gallus/chicken/g'` mv $f $f1 end foreach f (*drosoph*) set f1 = `echo $f | sed -e 's/drosoph/Dmelano/g'` mv $f $f1 end foreach o (Dmelano chicken elegans human mouse rat zfish) echo $o setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/danRer2/bed/tigr hgsql danRer2 -e "drop table tigrGeneIndex" nice ldHgGene -exon=TC danRer2 tigrGeneIndex *.gff # Read 114013 transcripts in 395310 lines in 196 files # 114013 groups 28 seqs 1 sources 1 feature types # 114013 gene predictions hgsql danRer2 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql danRer2 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords danRer2 tigrGeneIndex /cluster/bin/scripts/runGeneCheck /cluster/data/danRer2/bed/tigr # 135739 badUtrSplice # 114013 noCDS - fixed these in table as above # 30191 gap gzip *.gff *TCs # make changes in doTigrGeneIndex function in hgc to add original species # name back when constructing URL to TIGR web site. # 3-WAY MULTIZ MULTIPLE ALIGNMENT # (zebrafish danRer2, fugu fr1 and tetraodon tetNig1) # (DONE, 2004-12-23, hartera) # use v.8 of multiz (see makeHg17.doc for 8-WAY alignment) ssh kksilo set multizDir = multiz.2004-12-22 set workingDir = /cluster/bluearc/danRer2/$multizDir ln -s $workingDir /cluster/bluearc/danRer2/multiz3way mkdir -p $workingDir mkdir -p /cluster/data/danRer2/bed/$multizDir ln -s /cluster/data/danRer2/bed/$multizDir \ /cluster/data/danRer2/bed/multiz3way cd /cluster/data/danRer2/bed/multiz3way # wrapper script for multiz # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) # NOTE: next time, modify script so it only needs one arg -- saves the # multiple dirname in a file for use by the next run cat << 'EOF' > doMultiz.csh #!/bin/csh -fe mkdir -p $3:h /cluster/bin/penn/multiz $1 $2 - > $3 'EOF' # << for emacs cat << 'EOF' > gsub #LOOP ../doMultiz.csh {check in line /cluster/bluearc/danRer2/multiz3way/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/danRer2/multiz3way/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/danRer2/multiz3way/$(root1)$(dir1)/$(root2).maf} #ENDLOOP 'EOF' # << for emacs chmod +x doMultiz.csh ssh kksilo set workingDir = /cluster/bluearc/danRer2/multiz3way # copy mafs to bluearc for tetraodon (tetNig1) mkdir $workingDir/tetNig1 cd /cluster/data/danRer2/bed/blastz.tetNig1/mafNet foreach f (*.maf) set c = $f:t:r set g = $c:t:r echo $g cp $f $workingDir/tetNig1/${g}.maf end cd /cluster/data/danRer2/bed/multiz3way ls $workingDir/tetNig1/*.maf > chrom.lst # fugu (fr1) mkdir $workingDir/fr1 cd /cluster/data/danRer2/bed/blastz.fr1/mafNet foreach f (*.maf) set c = $f:t:r set g = $c:t:r echo $g cp $f $workingDir/fr1/${g}.maf end # one multiz - add in fugu to zebrafish/tetraodon ssh kki set workingDir = /cluster/bluearc/danRer2/multiz3way cd /cluster/data/danRer2/bed/multiz3way mkdir run.fr1 cd run.fr1 echo "fr1/tetNig1" > species.lst gensub2 species.lst ../chrom.lst ../gsub 
jobList para create jobList # 28 jobs in batch para try, check, push, check ... # para time # CPU time in finished jobs: 337s 5.62m 0.09h 0.00d 0.000 y # IO & Wait Time: 153s 2.55m 0.04h 0.00d 0.000 y # Average job time: 18s 0.29m 0.00h 0.00d # Longest job: 66s 1.10m 0.02h 0.00d # Submission to last job: 134s 2.23m 0.04h 0.00d cd .. # copy 3-way mafs to build directory ssh kksilo set workingDir = /cluster/bluearc/danRer2/multiz3way ln -s $workingDir/tetNig1fr1 $workingDir/maf cd /cluster/data/danRer2/bed/multiz3way mkdir maf cp $workingDir/maf/*.maf maf # WZ EST CLUSTERS ALIGNMENTS (DONE, 2004-12-23, hartera) # WZ ESTs are compiled ESTs from WashU. They were provided by # Anthony DiBiase from Leonard Zon's lab at the Children's Hospital, Harvard # Contact: adibiase@enders.tch.harvard.edu # http://zon.tchlab.org ssh kksilo mkdir -p /cluster/data/danRer2/bed/ZonLab/wzESTs cd /cluster/data/danRer2/bed/ZonLab/wzESTs # put WZ ESTs in this directory as wzcontigs.txt - # obtained from the Zon Lab, these are unmasked. # There are 42857 ESTs in this file. # Translated to upper case for danRer1 cp /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa . cd /cluster/data/danRer2/bed/ZonLab/wzESTs mkdir -p /cluster/bluearc/danRer2/wzESTs faSplit sequence wzcontigs.fa 20 /cluster/bluearc/danRer2/wzESTs/wzcontigs ssh kki cd /cluster/data/danRer2/bed/ZonLab/wzESTs mkdir psl ls -1 /cluster/bluearc/danRer2/wzESTs/*.fa > est.lst ls -1S /cluster/bluearc/danRer2/trfFa/*.fa > genome.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /iscratch/i/danRer2/11.ooc} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst est.lst gsub spec para create spec para try, check, push, try .... # para time # Completed: 5980 of 5980 jobs # CPU time in finished jobs: 17176s 286.27m 4.77h 0.20d 0.001 y # IO & Wait Time: 22870s 381.16m 6.35h 0.26d 0.001 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest job: 49s 0.82m 0.01h 0.00d # Submission to last job: 2890s 48.17m 0.80h 0.03d # Do sort, best in genome filter, and convert to chromosome coordinates # to create wzEsts.psl ssh kksilo cd /cluster/data/danRer2/bed/ZonLab/wzESTs pslSort dirs raw.psl tmp psl # only use alignments that have at least # 96% identity in aligned region. use parameters used by auto # GenBank update for native ESTs pslReps -minAli=0.96 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp wz_ests.psl /cluster/data/danRer2/jkStuff/liftAll.lft warn contig.psl # check psl pslCheck wz_ests.psl >& pslCheck.log # looks good # Load EST alignments into database. ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/wzESTs hgLoadPsl danRer2 wz_ests.psl # Add WZ EST sequences # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/danRer2/wzESTs ln -s \ /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa \ /gbdb/danRer2/wzESTs hgLoadSeq danRer2 /gbdb/danRer2/wzESTs/wzcontigs.fa # MAKE Human Proteins track (hg17 DONE 2004-12-15 braney) ssh kksilo mkdir -p /cluster/data/danRer2/blastDb cd /cluster/data/danRer2/blastDb cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list ln -s `cat list` . for i in *.fa do formatdb -i $i -p F done rm *.log *.fa list cd .. 
for i in `cat blastDb/chrom.list`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft rm blastDb/chrom.list mkdir /cluster/data/danRer2/scaffoldBlastDb cd /cluster/data/danRer2/scaffoldBlastDb cat ../Un/scaffolds/*.fa ../NA/scaffolds/*.fa | faSplit sequence stdin 500 scaf for i in *.fa do formatdb -i $i -p F done rm *.log *.fa ssh kkr1u00 mkdir -p /iscratch/i/danRer2/blastDb cd /cluster/data/danRer2/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/danRer2/blastDb ; echo $i; done mkdir -p /iscratch/i/danRer2/scaffoldBlastDb cd /cluster/data/danRer2/scaffoldBlastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/danRer2/scaffoldBlastDb ; echo $i; done cd (iSync) > sync.out mkdir -p /cluster/data/danRer2/bed/tblastn.hg17KG cd /cluster/data/danRer2/bed/tblastn.hg17KG echo /iscratch/i/danRer2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst echo /iscratch/i/danRer2/scaffoldBlastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > scaffold.lst # back to kksilo exit # we want around 250000 jobs cd /cluster/data/danRer2/bed/tblastn.hg17KG calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\) # 42156/(250000/3899) = 657.464976 mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa split -l 657 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # I ended up doing the scaffolds separately from the chopped up chrom segments # but next time I wouldn't gensub2 scaffold.lst kg.lst blastGsub scaffoldBlastSpec ssh kk cd /cluster/data/danRer2/bed/tblastn.hg17KG para create blastSpec para push # Completed: 253435 of 253435 jobs # CPU time in finished jobs: 58003988s 966733.13m 16112.22h 671.34d 1.839 y # IO & Wait Time: 13196517s 219941.96m 3665.70h 152.74d 0.418 y # Average job time: 281s 4.68m 0.08h 0.00d # Longest job: 6238s 103.97m 1.73h 0.07d # Submission to last job: 174503s 2908.38m 48.47h 2.02d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? 
> chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/danRer2/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 1287 of 1287 jobs
# CPU time in finished jobs:      72236s    1203.94m    20.07h    0.84d  0.002 y
# IO & Wait Time:                 22886s     381.43m     6.36h    0.26d  0.001 y
# Average job time:                  74s       1.23m     0.02h    0.00d
# Longest job:                     3374s      56.23m     0.94h    0.04d
# Submission to last job:          5982s      99.70m     1.66h    0.07d
exit
# back to kksilo
cd /cluster/data/danRer2/bed/tblastn.hg17KG/blastOut
# again some weirdness because I did the NA and Un scaffolds separately
for i in kg??
do
    cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > cs60.$i.psl
    sort -rn cs60.$i.psl | pslUniq stdin us.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" cs60.$i.psl > ms60.$i.psl
    echo $i
done
for i in kg??
do
    cat $i/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
    > /cluster/data/danRer2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
# lift the chrUn and chrNA scaffolds
ssh hgwdev
cd /cluster/data/danRer2/bed/tblastn.hg17KG
hgLoadPsl danRer2 blastHg17KG.psl
# back to kksilo
rm -rf blastOut
# End tblastn
# ACCESSIONS FOR CONTIGS (WORKING 2005-03-17 kate)
# Used for Assembly track details, and also ENCODE AGP's
cd /cluster/data/danRer2/bed
mkdir -p contigAcc
cd contigAcc
# extract WGS contig to accession mapping from Entrez Nucleotide summaries
# To get the summary file, access the Genbank page for the project
# by searching:
#    genus[ORGN] AND WGS[KYWD]
# At the end of the page, click on the list of accessions for the contigs.
# Select summary format, and send to file.
# output to file zfish.acc
grep scaffold zfish.acc | wc -l
# 21333
# edit hg/encode/regionAgp/contigAccession.pl to recognize zfish format
# and install in /cluster/bin/scripts
contigAccession zfish.acc > contigAcc.tab
wc -l contigAcc.tab
# 21333
grep Zv4 /cluster/data/danRer2/?{,?}/*.agp | wc -l
# 21333
hgsql danRer2 < $HOME/kent/src/hg/lib/contigAcc.sql
echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
    hgsql danRer2
hgsql danRer2 -e "SELECT COUNT(*) FROM contigAcc"
# 21333
# KNOWN GENES TRACK (in progress, 2005-02-14, hartera)
# Found number of loci for Ensembl and mRNA by clustering
cd $HOME/kent/src/hg/near/hgClusterGenes
hgClusterGenes -noProt danRer2 ensGene ensCluster ensCanonical
# creates 23675 clusters, there are 32062 entries in ensGene
# remove extra tables created by this program
echo 'drop table ensCluster;' | hgsql danRer2
echo 'drop table ensCanonical;' | hgsql danRer2
cd $HOME/kent/src/hg/geneBounds/clusterRna
clusterRna danRer2 rna.bed est.bed -noEst
wc -l rna.bed
# 10,383 clusters, RefGene has 8397 accessions so that is about right
rm *.bed
# Therefore Ensembl should be the basis for the Known Genes tracks
# since it represents the most loci.
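# The counts behind this decision can be re-checked directly from the
# database (illustrative queries; table names as used elsewhere in this
# doc, expected numbers from the clustering runs above):
hgsql -N -e 'select count(*) from ensGene;' danRer2
# 32062 transcripts, clustering into 23675 loci
hgsql -N -e 'select count(distinct name) from refGene;' danRer2
# 8397 RefSeq accessions, in line with the 10,383 mRNA clusters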
# move Ensembl to be the first track on the browser display and
# with default visibility of pack
# edit zebrafish/danRer2/trackDb.ra
# track ensGene
# shortLabel Ensembl Genes
# longLabel Ensembl Gene Predictions
# group genes
# priority 32.8
# visibility pack
# color 150,0,0
# type genePred ensPep
# url http://www.ensembl.org/perl/transview?transcript=$$
# hgGene on
cd $HOME/kent/src/hg/hgGene
# edit hgGene.c to add special case of ensGene table for Zebrafish
# from kent/src/hg/near/makeNear.doc
ssh hgwdev
cd $HOME/kent/src/hg/near/hgNear/hgNearData
mkdir Zebrafish
cvs add Zebrafish
cd Zebrafish
cp ../C_elegans/genome.ra .
# edit this to be specific for zebrafish and commit to CVS
cvs add genome.ra
cvs commit genome.ra
cd $HOME/kent/src/hg/near/hgClusterGenes
# Cluster together various alt splice forms
hgClusterGenes -noProt danRer2 ensGene ensIsoforms ensCanonical
# Got 23675 clusters, from 32062 genes in 28 chromosomes
# need to add genome.ra in hgGeneData
cd $HOME/kent/src/hg/hgGene/hgGeneData
mkdir Zebrafish
cd Zebrafish
cp ../C_elegans/genome.ra .
# edit genome.ra and commit to CVS
cp ../C_elegans/links.ra .
# edit links.ra and commit to CVS
# download protein accessions from Ensembl
# go to http://www.ensembl.org/Multi/martview
# select Ensembl 29 and Danio rerio genes (ZFISH 4)
# go to Features and select Ensembl Gene ID from Ensembl Attributes,
# then from External References, select UniProt/Swiss-Prot AC,
# UniProt AC and RefSeq DNA accession
# copy to this directory and unzip
gunzip dr2EnsemblUniProt.gz
wc -l dr2EnsemblUniProt
# 34115 dr2EnsemblUniProt
awk 'BEGIN{FS="\t"} {print $3;}' dr2EnsemblUniProt > ensembl.uniprot
sort ensembl.uniprot | uniq > ensembl.uniprot.uniq
# 4361
# download uniProt AC, UniProt/SPTREMBL ID, UniProt/Swiss-Prot ID
# remove blank lines from ensembl.uniprot
sed -e '/^$/d' ensembl.uniprot > ensembl.uniprot.accsonly
# there are 6171 UniProt accessions so some Ensembl IDs share accessions
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
# EXTRACT AXTs and MAFs FROM MOUSE (mm6) NET - already done, see
# /cluster/data/danRer2/bed/blastz.mm6.swap/mafNet and makeMm6.doc
# EXTRACT AXTs and MAFs FROM OPOSSUM (monDom1) NET
# (DONE, 2005-05-13, hartera) - see makeMonDom1.doc
# EXTRACT AXTs AND MAFs FROM HUMAN NET (DONE, 2005-05-15, hartera)
ssh kkstore01
# create axts
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
gunzip humanhg17.net.gz
gunzip chainAR/*.chain.gz
netSplit humanhg17.net humanHg17Net
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (humanHg17Net/chr*.net)
    set c = $f:t:r
    echo "axtNet on $c"
    netToAxt humanHg17Net/$c.net chainAR/$c.chain \
        /cluster/data/danRer2/nib \
        /cluster/data/hg17/nib ../axtNet/$c.axt
    echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/danRer2/bed/blastz.hg17.swap
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
    set c = $f:t:r
    echo "Sorting $c"
    axtSort $f axtNet/$c.axt
end
# create maf
ssh kkstore01
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
    set maf = $f:t:r.hg17.maf
    echo translating $f to $maf
    axtToMaf $f \
        /cluster/data/danRer2/chrom.sizes /cluster/data/hg17/chrom.sizes \
        ../mafNet/$maf -tPrefix=danRer2. -qPrefix=hg17.
end
'EOF'
chmod +x makeMaf.csh
csh makeMaf.csh >&!
makeMaf.log &
tail -100f makeMaf.log
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
nice gzip humanhg17.net chainAR/*.chain &
cd /cluster/data/danRer2/bed/blastz.hg17.swap
nice gzip axtNet/*.axt mafNet/*.maf &
# PREP FOR LIFTOVER CHAINS FOR THIS ASSEMBLY (DONE, 2005-05-15, hartera)
# split into 3k chunks
ssh kkr1u00
cd /cluster/data/danRer2
set liftDir = /iscratch/i/danRer2/liftOver/liftSplit
mkdir -p $liftDir
cd $liftDir
mkdir -p split lift
cat > split.csh << 'EOF'
set liftDir = /iscratch/i/danRer2/liftOver/liftSplit
cd /cluster/data/danRer2
foreach n (`ls ?{,?}/*.fa`)
    set d = $n:h
    set c = $n:t:r
    echo $c
    faSplit -lift=$liftDir/lift/$c.lft size \
        /cluster/data/danRer2/$d/$c.fa -oneFile 3000 $liftDir/split/$c
end
'EOF'
# << for emacs
chmod +x split.csh
csh split.csh >&! split.log &
tail -100f split.log
mkdir /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
mv /iscratch/i/danRer2/liftOver/liftSplit/split/chrUn.fa \
    /iscratch/i/danRer2/liftOver/liftSplit/split/chrNA.fa \
    /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
iSync
# MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2005-05-16, hartera)
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p $gp/vsTetNig1/axtChrom
cp -p *.axt $gp/vsTetNig1/axtChrom
cd $gp/vsTetNig1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and net to downloads area
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
gzip -c all.chainARFilt5k > /cluster/data/danRer2/zip/tetraTetNig1.chain.gz
gzip -c tetraTetNig1.net > /cluster/data/danRer2/zip/tetraTetNig1.net.gz
cd $gp/vsTetNig1
mv /cluster/data/danRer2/zip/tetra*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# 6-WAY VAR_MULTIZ ALIGNMENTS (DONE, 2005-05-20, hartera)
# Species: zebrafish(danRer2), human (hg17), mouse(mm6), opossum(monDom1),
# fugu(fr1) and tetraodon(tetNig1)
# Tetraodon and Fugu are quite distant from zebrafish, more distant than
# human/chicken so use the HoxD55.q matrix in future for blastz with these
# species. Here the pairwise alignments of zebrafish with Tetraodon and Fugu
# were done using the default matrix for blastz. (2005-06-01, hartera)
# Prepared tree image for Conservation track details page (2005-06-07, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer2/bed/multiz6way
cd /cluster/data/danRer2/bed/multiz6way
mkdir mafLinks
# set up directories for links to mafs for each pairwise alignment
mkdir mafLinks/hg17
mkdir mafLinks/mm6
mkdir mafLinks/monDom1
mkdir mafLinks/fr1
mkdir mafLinks/tetNig1
set dir=/cluster/data/danRer2/bed
set dirMonDom=/cluster/data/monDom1/bed
# need to make all the links so that they are just chrN.maf.gz
# without the species db in the middle, e.g.
chrN.hg17.maf.gz ln -s $dir/blastz.mm6.swap/mafNet/*.maf.gz ./mafLinks/mm6 echo "$dir/blastz.hg17.swap" > dir.lst echo "$dirMonDom/blastz.monDom1.to.danRer2" >> dir.lst echo "$dir/blastz.fr1" >> dir.lst echo "$dir/blastz.tetNig1" >> dir.lst foreach d (`cat dir.lst`) foreach m ($d/mafNet/*.maf.gz) foreach c (`cat /cluster/data/danRer2/chrom.lst`) foreach sp (hg17 monDom1 fr1 tetNig1) set maf = ${d}/mafNet/chr${c}.${sp}.maf.gz if ($m == $maf) then set s = $sp echo $s endif end ln -s $d/mafNet/chr${c}.${s}.maf.gz ./mafLinks/$s/chr${c}.maf.gz end end end # Copy MAFS to Iservers for kluster run ssh kkr1u00 mkdir /iscratch/i/danRer2/multiz6way cd /iscratch/i/danRer2/multiz6way rsync -a --copy-links --progress \ /cluster/data/danRer2/bed/multiz6way/mafLinks/ . # 321 Mb of data - takes seconds mkdir penn cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn cp -p /cluster/bin/penn/maf_project penn /cluster/bin/iSync # Progressive alignment up the tree w/o stager, # using multiz.v10 (var_multiz) # Method: align internal subtrees (using 0 flag to var_multiz) # Then, align these to human (using 1 flag to var_multiz) # NOTE: must use maf_project after each multiz run, in order # to order output. Single-cov guaranteed by use of net MAF's, # so it is not necessary to run single_cov2. # make output dir and run dir ssh kki cd /cluster/data/danRer2/bed/multiz6way mkdir -p maf mkdir -p run cd run # create scripts to run var_multiz on cluster cat > oneMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 set multi = /scratch/danRer2/multiz6way.$c set pairs = /iscratch/i/danRer2/multiz6way # special mode -- # with 1 arg, cleanup if ($#argv == 1) then rm -fr $multi exit endif # special mode -- # with 3 args, saves an alignment file if ($#argv == 3) then cp $multi/$2/$c.maf $3 exit endif set s1 = $2 set s2 = $3 set flag = $4 # locate input files -- in pairwise dir, or multiple dir set d1 = $multi set d2 = $multi if (-d $pairs/$s1) then set d1 = $pairs set f1 = $d1/$s1/$c.maf.gz set t1 = /tmp/$s1.$c.maf zcat $f1 > $t1 else set f1 = $d1/$s1/$c.maf set t1 = /tmp/$s1.$c.maf cp -p $f1 $t1 endif if (-d $pairs/$s2) then set d2 = $pairs set f2 = $d2/$s2/$c.maf.gz set t2 = /tmp/$s2.$c.maf zcat $f2 > $t2 else set f2 = $d2/$s2/$c.maf set t2 = /tmp/$s2.$c.maf cp -p $f2 $t2 endif # write to output dir set out = $multi/${s1}${s2} mkdir -p $out # check for empty input file if (-s $t1 && -s $t2) then echo "Aligning $f1 $f2 $flag" /iscratch/i/danRer2/multiz6way/penn/multiz $t1 $t2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \ $out/$c.tmp.maf echo "Ordering $c.maf" /iscratch/i/danRer2/multiz6way/penn/maf_project $out/$c.tmp.maf danRer2.$c > $out/$c.maf rm -f $t1 $t2 else if (-s $t1) then cp -p $t1 $out/$c.maf rm -f $t1 else if (-s $t2) then cp -p $t2 $out/$c.maf rm -f $t2 endif 'EOF' # << keep emacs coloring happy chmod +x oneMultiz.csh # Copy this script to iscratch ssh kkr1u00 cd /iscratch/i/danRer2/multiz6way/penn cp -p /cluster/data/danRer2/bed/multiz6way/run/oneMultiz.csh . /cluster/bin/iSync # back to run the job ssh kki cd /cluster/data/danRer2/bed/multiz6way/run # This tree.nh was used in the distant past for early versions # of phastCons. Now, this is merely a convenient reference to the # tree under construction. This is also used to draw a graphic # tree as species.nh, see below. 
cat << '_EOF_' > tree.nh
(((hg17,mm6),monDom1),((tetNig1,fr1),danRer2))
'_EOF_'
# << this line keeps emacs coloring happy
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
# multiple alignment steps:
set c = $1
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c hg17 mm6 0
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c monDom1 hg17mm6 0
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1 fr1 1
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1fr1 monDom1hg17mm6 1
# get final alignment file
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1fr1monDom1hg17mm6 /cluster/data/danRer2/bed/multiz6way/maf/$c.maf
#cleanup
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c
'EOF'
# << keep emacs coloring happy
chmod +x allMultiz.csh
cat << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/danRer2/bed/multiz6way/maf/$(root1).maf}
#ENDLOOP
'EOF'
cd /cluster/data/danRer2/bed/multiz6way/run
awk '{print $1}' ../../../chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para try; para check
para push
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs:       5903s      98.39m     1.64h    0.07d  0.000 y
# IO & Wait Time:                   227s       3.78m     0.06h    0.00d  0.000 y
# Average job time:                 219s       3.65m     0.06h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1328s      22.13m     0.37h    0.02d
# Submission to last job:          2098s      34.97m     0.58h    0.02d
# do not filter mafs as this only removes a small fraction of alignments -
# better to keep them all. Check for single column alignments (these
# just have a single base for each species in the alignment); if
# present, need to do a glueing step.
ssh hgwdev
# create tab file for chr1.maf
cd /cluster/data/danRer2/bed/multiz6way
hgLoadMaf danRer2 multiz6way -test=./maf/chr1.maf
awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $5, $6, $4-$3;}' \
    multiz6way.tab > chr1.tab.lengths
sort -n -k6 chr1.tab.lengths > chr1.tab.lengths.sort
# chr1 - 10 single column alignments
# also looked at chrUn - 14 single column alignments
# there are also a small number of 2,3,4 column alignments.
# This is ok as it is such a small number.
# use mafAddIRows to 'chain' the alignments
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
mkdir mafAddIRows
foreach m (./maf/*.maf)
    set n=$m:t
    echo "Processing $n ..."
mafAddIRows $m /cluster/data/danRer2/danRer2.2bit ./mafAddIRows/$n end # cat maf files into one file for those alignments without iRows, # do not use alignments with iRows in browser at the moment catDir maf > multiz6way.maf # 978 Mb of data # -rw-rw-r-- 1 hartera protein 977796776 May 20 12:06 multiz6way.maf # Load into database ssh hgwdev cd /cluster/data/danRer2/bed/multiz6way mkdir /gbdb/danRer2/multiz6way ln -s /cluster/data/danRer2/bed/multiz6way/multiz6way.maf \ /gbdb/danRer2/multiz6way hgLoadMaf danRer2 multiz6way # Loaded 1230321 mafs in 1 files from /gbdb/danRer2/multiz6way # took about 2 minutes to load hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 danRer2 \ multiz6waySummary multiz6way.maf # Processed 2565228 components in 1230321 mafs from multiz6way.maf # took about 2 minutes to load # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql danRer2 -e "alter table multiz6waySummary drop index chrom_2" hgsql danRer2 -e "alter table multiz6waySummary drop index chrom_3" # create tree image - like tree.nh but with common names, added zebrafish # to tree (2005-06-07, hartera): cat << '_EOF_' > species6.nh (((human,mouse),opossum),((tetraodon,fugu),zebrafish)) '_EOF_' /cluster/bin/phast/draw_tree -b -s species6.nh > species6.ps # used GIMP to reduce the amount of whitespace to make it # smaller, then save as jpg display species6.jpg # use the Transform->Chop and then Direction->horizontal or vertical # to reduce the tree size while keeping label sizes the same cp species6.jpg /usr/local/apache/htdocs/images/phylo/DanRer2_6way.jpg # change permissions for display chmod +r /usr/local/apache/htdocs/images/phylo/DanRer2_6way.jpg # add all.joiner entry for danRer2 multiz6way (2005-05-31, hartera) # add trackDb.ra entry # track multiz6way # shortLabel 6-Way Conservation # longLabel 6-Way Vertebrate Multiz Alignment & Conservation # group compGeno # priority 109 # visibility hide # color 0, 10, 100 # altColor 0,90,10 # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # #wiggle phastCons6way # yLineOnOff Off # autoScale Off # summary multiz6waySummary # speciesGroups mammal vertebrate # sGroup_mammal hg17 mm6 monDom1 # sGroup_vertebrate fr1 tetNig1 # treeImage phylo/DanRer2_6way.jpg # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 6-WAY ALIGNMENT # (DONE, 2005-06-01, hartera) # Tetraodon and Fugu are quite distant from zebrafish, more distant than # human/chicken so use the HoxD55.q matrix in future for blastz with these # species. Here the pairwise alignments of zebrafish with Tetraodon and Fugu # that were used for the 6-way multiz were done using the default matrix # for blastz. (2005-06-01, hartera) ssh kkstore01 mkdir /cluster/data/danRer2/bed/multiz6way/cons cd /cluster/data/danRer2/bed/multiz6way/cons # create a starting-tree.mod based on chr5 (67Mb - largest chrom) # chr5 is the largest chrom apart from NA and Un /cluster/bin/phast/msa_split ../maf/chr5.maf \ --refseq ../../../5/chr5.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # takes about 2 minutes /cluster/bin/phast/phyloFit -i SS s1.*.ss \ --tree "((tetNig1,fr1),((mm6,hg17),monDom1))" \ --out-root starting-tree # only takes about 5 minutes rm s1.*.ss # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. 
# Get the genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally estimated
# tree model, as above, you can get this from the BACKGROUND line
# in the .mod file, e.g.:
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.309863 0.189473 0.189412 0.311252
# This implies a GC content of 0.1895 + 0.1894 = 0.379
# If you do *not* have a global tree model and you do not know
# your GC content, you can get it directly from the MAFs with
# a command like:
# /cluster/bin/phast/msa_view \
#    --aggregate danRer2,tetNig1,fr1,mm6,hg17,monDom1 -i MAF \
#    --summary-only /cluster/data/danRer2/bed/multiz6way/maf/chr*.maf \
#    > maf_summary.txt
# this gives a similar GC content of 0.4026

# break up the genome-wide MAFs into pieces
ssh kkstore01
mkdir -p /cluster/bluearc/danRer2/cons/ss
cd /cluster/bluearc/danRer2/cons/ss
bash
for C in `awk '{print $1}' /cluster/data/danRer2/chrom.sizes`
do
    if [ -s /cluster/data/danRer2/bed/multiz6way/maf/${C}.maf ]; then
        echo msa_split $C
        chrN=${C/chr/}
        /cluster/bin/phast/msa_split \
            /cluster/data/danRer2/bed/multiz6way/maf/${C}.maf \
            --refseq /cluster/data/danRer2/${chrN}/${C}.fa \
            --in-format MAF --windows 1000000,0 --between-blocks 5000 \
            --out-format SS -I 1000 --out-root ${C}
    fi
done
# took 15 minutes to run

# Create a random list of 50 1 Mb regions (do not use NA and Un)
ls -l | grep -v "NA" | grep -v "Un" > fileList
cat fileList | awk '$5 > 4000000 {print $9;}' | \
    randomLines stdin 50 ../randomSs
rm fileList

# Set up parasol directory to calculate trees on these 50 regions
ssh kk9
mkdir -p /cluster/bluearc/danRer2/cons/treeRun1
cd /cluster/bluearc/danRer2/cons/treeRun1
mkdir tree log
# Now set up a cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment and then
# combined across fragments. When tuning, come back to here to
# recalculate.
# Create a little script that calls phastCons with the right arguments
cat > makeTree << '_EOF_'
/cluster/bin/phast/phastCons ../ss/$1.ss \
/cluster/data/danRer2/bed/multiz6way/cons/starting-tree.mod \
--gc 0.403 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-lengths 12 --target-coverage 0.17 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# emacs happy
chmod a+x makeTree
# The --gc 0.403 value is the GC content from msa_view, as this is
# calculated from the whole genome. Also, notice the target coverage
# of 0.17: here we are aiming for 65% coverage of coding regions by
# conserved elements.
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree $(root1)
#ENDLOOP
'_EOF_'
# happy emacs
# Make cluster job and run it
gensub2 ../randomSs single template jobList
para create jobList
para try/push/check/etc.
# para time
# Completed: 50 of 50 jobs
# CPU time in finished jobs:    4013s    66.89m   1.11h   0.05d  0.000 y
# IO & Wait Time:                139s     2.31m   0.04h   0.00d  0.000 y
# Average job time:               83s     1.38m   0.02h   0.00d
# Longest running job:             0s     0.00m   0.00h   0.00d
# Longest finished job:          135s     2.25m   0.04h   0.00d
# Submission to last job:       5260s    87.67m   1.46h   0.06d

# Now combine the parameter estimates. The .mod files can be averaged
# with phyloBoot. This must be done separately for the conserved
# and nonconserved models.
ls tree/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
    --output-average ../ave.cons.mod > cons_summary.txt
ls tree/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
    --output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
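# As a cross-check of the --gc value used above, a one-liner sketch
# (assuming the BACKGROUND: line format shown earlier) to read the GC
# content implied by the averaged nonconserved model:
awk '/^BACKGROUND:/ {printf "GC content: %.4f\n", $3 + $4}' ave.noncons.mod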
cp -p ave.*.mod /cluster/data/danRer2/bed/multiz6way/cons
# Measure the entropy of the averaged models:
#   consEntropy <target-coverage> <expected-lengths> ave.cons.mod ave.noncons.mod
# (consEntropy never stops when given the --NH 9.78 argument, so omit it)
# The target entropy should be L_min*H = 9.8 bits; the expected length
# that produces this entropy is the one to use for phastCons.
/cluster/bin/phast/consEntropy 0.17 12 \
    ave.cons.mod ave.noncons.mod
# The above steps were repeated with the target coverage and expected
# lengths parameters set as below (a loop sketch for scripting these
# repeats follows the list):

# -target-coverage=0.17 -expected-lengths=12
# Transition parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=0.768727 bits/site
# Expected min. length: L_min=13.932145 sites
# Expected max. length: L_max=8.869545 sites
# Total entropy: L_min*H=10.710017 bits

# -target-coverage=0.20 -expected-lengths=10
# Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
# Relative entropy: H=0.799627 bits/site
# Expected min. length: L_min=12.358880 sites
# Expected max. length: L_max=7.593192 sites
# Total entropy: L_min*H=9.882496 bits

# -target-coverage=0.20 -expected-lengths=9
# Transition parameters: gamma=0.200000, omega=9.000000, mu=0.111111, nu=0.027778
# Relative entropy: H=0.805459 bits/site
# Expected min. length: L_min=12.022440 sites
# Expected max. length: L_max=7.111297 sites
# Total entropy: L_min*H=9.683580 bits

# -target-coverage=0.25 -expected-lengths=10
# Transition parameters: gamma=0.250000, omega=10.000000, mu=0.100000, nu=0.033333
# Relative entropy: H=0.843206 bits/site
# Expected min. length: L_min=10.846871 sites
# Expected max. length: L_max=6.973477 sites
# Total entropy: L_min*H=9.146148 bits

# -target-coverage=0.35 -expected-lengths=12
# Transition parameters: gamma=0.350000, omega=12.000000, mu=0.083333, nu=0.044872
# Relative entropy: H=0.917684 bits/site
# Expected min. length: L_min=9.169807 sites
# Expected max. length: L_max=6.687135 sites
# Total entropy: L_min*H=8.414989 bits

# -target-coverage=0.35 -expected-lengths=14
# Transition parameters: gamma=0.350000, omega=14.000000, mu=0.071429, nu=0.038462
# Relative entropy: H=0.903639 bits/site
# Expected min. length: L_min=9.778765 sites
# Expected max. length: L_max=7.300778 sites
# Total entropy: L_min*H=8.836478 bits

# -target-coverage=0.35 -expected-lengths=18
# Transition parameters: gamma=0.350000, omega=18.000000, mu=0.055556, nu=0.029915
# Relative entropy: H=0.881986 bits/site
# Expected min. length: L_min=10.798324 sites
# Expected max. length: L_max=8.320602 sites
# Total entropy: L_min*H=9.523968 bits

# -target-coverage=0.35 -expected-lengths=19
# Transition parameters: gamma=0.350000, omega=19.000000, mu=0.052632, nu=0.028340
# Relative entropy: H=0.877572 bits/site
# Expected min. length: L_min=11.021340 sites
# Expected max. length: L_max=8.542773 sites
# Total entropy: L_min*H=9.672025 bits

# -target-coverage=0.32 -expected-lengths=20
# Transition parameters: gamma=0.320000, omega=20.000000, mu=0.050000, nu=0.023529
# Relative entropy: H=0.853466 bits/site
# Expected min. length: L_min=11.824480 sites
# Expected max. length: L_max=9.084203 sites
# Total entropy: L_min*H=10.091797 bits

#### !!! THE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED !!! ####
# -target-coverage=0.32 -expected-lengths=18
# Transition parameters: gamma=0.320000, omega=18.000000, mu=0.055556, nu=0.026144
# Relative entropy: H=0.860401 bits/site
# Expected min. length: L_min=11.402969 sites
# Expected max. length: L_max=8.648243 sites
# Total entropy: L_min*H=9.811131 bits
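# The loop sketch referenced above (hypothetical helper, not part of the
# original run) to script the consEntropy repeats over candidate
# target-coverage/expected-lengths pairs:
foreach p ("0.17 12" "0.20 10" "0.20 9" "0.25 10" "0.35 19" "0.32 18")
    echo "== target-coverage expected-lengths: $p =="
    /cluster/bin/phast/consEntropy $p ave.cons.mod ave.noncons.mod
end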
# Need to iterate to get the right coverage and parameters: run
# phastCons below with the parameters chosen above and check the
# coverage of coding regions by the most conserved elements.

# Create cluster dir to do the main phastCons run
ssh kk9
mkdir /cluster/bluearc/danRer2/cons/consRun1
cd /cluster/bluearc/danRer2/cons/consRun1
mkdir ppRaw bed
# Create script to run phastCons with the right parameters.
# This job is I/O intensive in its output files. To make it
# cluster safe, it would be better to do this work somewhere over
# in /tmp/... and copy the final result back. kk9 can do this
# run, but kk cannot.
# Run the whole genome; this goes fast in this case.
cat > doPhast << '_EOF_'
mkdir -p ppRaw/$2
/cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
--expected-lengths 12 --target-coverage 0.17 --quiet --seqname $2 \
--idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \
ppRaw/$2/$1.pp
'_EOF_'
# emacs happy
chmod a+x doPhast
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast $(file1) $(root1)
#ENDLOOP
'_EOF_'
# happy emacs
# Create parasol batch and run it for the whole genome
ls -1 ../ss | sed 's/.ss//' > in.lst
gensub2 in.lst single template jobList
para create jobList
para try/check/push/etc.
# para time
# Completed: 1604 of 1604 jobs
# CPU time in finished jobs:   11005s   183.42m   3.06h   0.13d  0.000 y
# IO & Wait Time:              30912s   515.19m   8.59h   0.36d  0.001 y
# Average job time:               26s     0.44m   0.01h   0.00d
# Longest running job:             0s     0.00m   0.00h   0.00d
# Longest finished job:           55s     0.92m   0.02h   0.00d
# Submission to last job:        608s    10.13m   0.17h   0.01d

# combine predictions and transform scores to be in the 0-1000 interval;
# this uses a lot of memory, so run on kolossus:
ssh kolossus
cd /cluster/bluearc/danRer2/cons/consRun1
catDir bed | awk ' \
    {printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
    /cluster/bin/scripts/lodToBedScore /dev/stdin > \
    /cluster/data/danRer2/bed/multiz6way/mostConserved.bed

# Figure out how much is actually covered by the bed files as follows:
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
foreach f ( `cat /cluster/data/danRer2/chrom.lst` )
    cat $f/chr${f}.fa >> allChroms.fa
end
faSize allChroms.fa
# 1589273282 bases (101846391 N's 1487426891 real 813899141 upper
# 673527750 lower) in 28 sequences in 1 files
# Total size: mean 56759760.1 sd 58536773.4 min 16596 (chrM)
# max 318297480 (chrNA) median 42763181
awk '{sum+=$3-$2} \
    END{printf "%% %.2f = 100.0*%d/1487426891\n",100.0*sum/1487426891,sum}' \
    mostConserved.bed
# Coverage of the non-N genome (1487426891 bases, from faSize on all
# chroms) for each parameter setting tried:
# -target-coverage 0.17, length 12: % 1.99 = 100.0*29631918/1487426891
# -target-coverage 0.20, length 10: % 1.96 = 100.0*29183170/1487426891
# -target-coverage 0.25, length 10: % 2.11 = 100.0*31391360/1487426891
# -target-coverage 0.35, length 19: % 2.69 = 100.0*39940068/1487426891
# -target-coverage 0.32, length 18: % 2.60 = 100.0*38730791/1487426891

# check with featureBits
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
# get an OR of refGene and mgcGenes CDS regions
featureBits danRer2 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 9745850 bases of 1560497282 (0.625%) in intersection
featureBits danRer2 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# Results for each parameter setting tried (a loop sketch over the
# candidate element sets follows this list):
# -target-coverage=0.17 -expected-lengths=12
#   refSeqOrMgcCds.bed 0.625%, mostConserved.bed 1.899%, both 0.350%,
#   cover 56.07%, enrich 29.53x
# -target-coverage=0.20 -expected-lengths=10
#   refSeqOrMgcCds.bed 0.625%, mostConserved.bed 1.870%, both 0.347%,
#   cover 55.54%, enrich 29.70x
# -target-coverage=0.25 -expected-lengths=10
#   refSeqOrMgcCds.bed 0.625%, mostConserved4.bed 2.012%, both 0.364%,
#   cover 58.35%, enrich 29.01x
# -target-coverage=0.35 -expected-lengths=19
#   refSeqOrMgcCds.bed 0.625%, mostConserved5.bed 2.559%, both 0.431%,
#   cover 68.98%, enrich 26.95x
# -target-coverage=0.32 -expected-lengths=18
#   refSeqOrMgcCds.bed 0.625%, mostConserved6.bed 2.482%, both 0.423%,
#   cover 67.75%, enrich 27.30x
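# The loop sketch referenced above (assumes the numbered
# mostConserved*.bed candidate files named in the results were kept
# side by side in this directory):
foreach b (mostConserved*.bed)
    echo $b
    featureBits danRer2 refSeqOrMgcCds.bed $b -enrichment
end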
# looking at ensGene:
# featureBits danRer2 ensGene:cds mostConserved.bed -enrichment
# ensGene:cds 2.135%, mostConserved.bed 2.482%, both 1.210%,
# cover 56.67%, enrich 22.83x
# So use the result with -target-coverage=0.32 -expected-lengths=18:
# total entropy is 9.81, which is good as we are aiming for 9.8 and for
# 65% coverage of coding regions by most conserved elements; here the
# coverage is 67.8%.

# Load most conserved track into database
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
hgLoadBed danRer2 phastConsElements mostConserved.bed
# Loaded 393654 elements of size 5
# less than 1 minute load time
featureBits danRer2 -enrichment refSeqOrMgcCds.bed phastConsElements
# refSeqOrMgcCds.bed 0.625%, phastConsElements 2.482%, both 0.423%,
# cover 67.75%, enrich 27.30x
featureBits danRer2 mgcGenes:cds phastConsElements -enrichment
# mgcGenes:cds 0.486%, phastConsElements 2.482%, both 0.336%,
# cover 69.06%, enrich 27.82x
featureBits danRer2 refGene:cds phastConsElements -enrichment
# refGene:cds 0.597%, phastConsElements 2.482%, both 0.400%,
# cover 66.97%, enrich 26.98x

# Create merged posterior probability file and wiggle track data files
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun1
# Interesting sort here on the chrom name and position: first convert
# all . and - characters to the special strings x/ and x_/ to get a
# consistent delimiter of / for all fields to be sorted; then sort on
# the chrom name and the start position; after the sort, convert the
# special strings x_/ and x/ back to - and . respectively. This gets
# everything in order by chrom name and chrom start. (A tiny
# illustration of the delimiter trick follows this step.)
find ./ppRaw -type f | sed -e "s#\.#x/#g; s#-#x_/#g" | \
    sort -t"/" -k4,4 -k6,6n | sed -e "s#x_/#-#g; s#x/#.#g" | xargs cat | \
    wigEncode stdin phastCons6way.wig phastCons6way.wib
# about 3 minutes for the above
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun1
cp -p phastCons6way.wi? /cluster/data/danRer2/bed/multiz6way/cons
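# A tiny illustration of the delimiter trick above, run on a
# hypothetical file name (not part of the original build):
echo "./ppRaw/chr1/chr1.1-1000000.pp" | sed -e "s#\.#x/#g; s#-#x_/#g"
# gives: x//ppRaw/chr1/chr1x/1x_/1000000x/pp
# With / as the only delimiter, field 4 (chrom) and field 6 (start
# coordinate) become clean keys for sort -t"/" -k4,4 -k6,6n.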
# Load gbdb and database with wiggle
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way/cons
mkdir -p /gbdb/danRer2/wib
ln -s `pwd`/phastCons6way.wib /gbdb/danRer2/wib/phastCons6way.wib
# use this if the table needs to be reloaded:
hgsql -e 'drop table phastCons6way;' danRer2
# load table
hgLoadWiggle danRer2 phastCons6way phastCons6way.wig

# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way/cons
bash
time hgWiggle -doHistogram \
    -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
    -db=danRer2 phastCons6way > histogram.data 2>&1
# real 3m1.185s  user 2m19.440s  sys 0m11.850s

# create plot of histogram:
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer2 Histogram phastCons6 track"
set xlabel " phastCons6 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
gnuplot histo.gp > histo.png
display histo.png &

# Add the line "wiggle phastCons6way" to the trackDb.ra entry for
# multiz6way to display the wiggle for the conservation track.
# Copy over the html for multiz and edit.

# PHASTCONS SCORES DOWNLOADABLES (DONE, 2005-05-31, hartera)
# prepare compressed copy of ascii data values for downloads
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun1
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons6wayScores
ls ppRaw | while read D
do
    out=${TOP}/phastCons6wayScores/${D}.gz
    echo -n "$out ... "
    cd ${TOP}/ppRaw/${D}
    gzip -c `ls *.pp | sed -e "s#-#.x-x.#g;" | \
        sort -t"." -k1,1 -k2,2n | sed -e "s#.x-x.#-#g;"` > ${out}
    echo "done"
done
'_EOF_'
# happy emacs
chmod +x gzipAscii.sh
time ./gzipAscii.sh
# 104.769u 8.789s 3:00.18 63.0%   0+0k 0+0io 11pf+0w
# 182 Mb of data

# copy scores for downloads
ssh kkstore01
mkdir /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores
cd /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores
rsync -a --progress \
    /cluster/bluearc/danRer2/cons/consRun1/phastCons6wayScores/ .
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/danRer2/phastCons6wayScores
cd /usr/local/apache/htdocs/goldenPath/danRer2/phastCons6wayScores
ln -s /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores/*.gz .
md5sum *.gz > md5sum.txt
# copy over and edit README.txt
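# A quick integrity check on the compressed score files before
# announcing the downloads (a sketch, not part of the original run):
foreach f (*.gz)
    gzip -t $f || echo "corrupt: $f"
end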
# MULTIZ 6-WAY DOWNLOADABLES (DONE, 2005-05-31, hartera)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p multiz6way
cd multiz6way
foreach f (/cluster/data/danRer2/bed/multiz6way/maf/*.maf)
    set c = $f:r:t
    echo $c
    nice gzip -c $f > $c.maf.gz
end
# copy README and edit

# Create upstream files for download
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
echo danRer2 tetNig1 fr1 mm6 hg17 monDom1 > org.txt
# mafFrags is quick for these
foreach i (1000 2000 5000)
    echo "making upstream$i.maf"
    nice featureBits danRer2 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
    rm up.bad
    nice mafFrags danRer2 multiz6way up.bed upstream$i.maf -orgs=org.txt
    rm up.bed
end
# took less than 3 minutes
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
nice gzip upstream{1000,2000,5000}.maf
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer2
mv /cluster/data/danRer2/bed/multiz6way/upstream*.maf.gz multiz6way
cd multiz6way
md5sum *.gz > md5sum.txt

###########################################################################
# LIFTOVER CHAINS TO DANRER3 (DONE, 2006-04-18, hartera)
# The split (using makeLoChain-split) of danRer3 is doc'ed in
# makeDanRer3.doc. Do what makeLoChain-split says to do next
# (start the blat alignment):
ssh kk
mkdir -p /cluster/data/danRer2/bed/liftOver
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-align danRer2 /iscratch/i/danRer2/nib danRer3 \
    /iscratch/i/danRer3/split10k \
    /iscratch/i/danRer3/danRer3_11.ooc >&! align.log &
# Took about 4 minutes.
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer2/bed/blat.danRer3.2006-04-17/run
para try, check, push, check, ...
para time >&! run.time
# Completed: 784 of 784 jobs
# CPU time in finished jobs: 4022758s  67045.97m  1117.43h  46.56d  0.128 y
# IO & Wait Time:              71872s   1197.86m    19.96h   0.83d  0.002 y
# Average job time:             5223s     87.05m     1.45h   0.06d
# Longest running job:             0s      0.00m     0.00h   0.00d
# Longest finished job:        42935s    715.58m    11.93h   0.50d
# Submission to last job:      42935s    715.58m    11.93h   0.50d

# lift alignments
ssh kkr1u00
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-lift danRer2 danRer3 >&! lift.log &
# Took about 10 minutes to run.

# chain alignments
ssh kki
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-chain danRer2 /iscratch/i/danRer2/nib \
    danRer3 /iscratch/i/danRer3/nib >&! chain.log &
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer2/bed/blat.danRer3.2006-04-17/chainRun
para try, check, push ...
para time >&! run.time
# Completed: 28 of 28 jobs
# CPU time in finished jobs:    2697s    44.95m   0.75h   0.03d  0.000 y
# IO & Wait Time:                548s     9.13m   0.15h   0.01d  0.000 y
# Average job time:              116s     1.93m   0.03h   0.00d
# Longest running job:             0s     0.00m   0.00h   0.00d
# Longest finished job:          439s     7.32m   0.12h   0.01d
# Submission to last job:        439s     7.32m   0.12h   0.01d

# net alignment chains
ssh kkstore04
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-net danRer2 danRer3 >&! net.log &
# Took about 20 minutes to run.

# load reference to over.chain into database table,
# and create symlinks in /gbdb and the download area
ssh hgwdev
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-load danRer2 danRer3 >&! load.log &
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
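# A command-line sketch of that spot check (the over.chain path assumes
# the usual makeLoChain-load naming convention; the test region is
# hypothetical):
awk 'BEGIN {OFS="\t"; print "chr1", 1000000, 1001000}' > test.bed
liftOver test.bed \
    /gbdb/danRer2/liftOver/danRer2ToDanRer3.over.chain.gz \
    test.danRer3.bed test.unmapped.bed
cat test.danRer3.bed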
###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM DANRER3 (DONE, 2006-04-25, hartera)
# followed the instructions used in makePanTro2.doc
ssh kkr1u00
cd /cluster/data/danRer2/bed
mkdir -p liftOver
cd liftOver
makeLoChain-split danRer2 /cluster/data/danRer2/nib >&! split.log &
# Took about 30 minutes to run.

#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
hgsql -h genome-testdb hgcentraltest \
    -e "update dbDb set orderKey = 452 where name = 'danRer2';"

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports the `NCBI Clone Validation' section of the mgcGenes details page.
# GenBank release 157.0 now contains misc_diff fields for MGC clones;
# reloading the mRNAs results in the gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer2

##########################################################################
# BACENDS CLEANUP (DONE, 2007-03-27, hartera)
ssh kkstore04
cd /cluster/data/danRer2/bed/
du -sh ZonLab
# 150G    ZonLab/
cd ZonLab
# Remove old BAC ENDS directories
nice rm -r bacends2 bacends3
cd bacends
rm bacEndsCov* bacEndsMinCov* bl2seq.out blast.out diff bacEnds \
    bacEnds.names.uniq jobList
nice rm -r psl err minCov10MinAli92
# bacEnds.lifted.psl is the lifted version of bacEnds.psl, so remove the latter
rm bacEnds.psl batch batch.bak *.lst para.results names.test template *.log
cd bacends.1
rm accs* bacEndAccs.aliases *.log allBacEnds* zfishaccs* diffaccs bacEnds \
    BACEnd*
rm -r test test2
nice gzip *.psl
cd ../pairs
# bacEndSingles.bed is already in the singles directory
rm bacEnds.* bed.tab bacEndSingles.bed psl.tab bacEndSingles.sql *.bak
rm -r old test
cd ../singles
rm bacEnds.*
rm -r old
cd ../scores
rm *.counts *.count *.hits good.txt error.log bed.tab bad.txt
rm -r test
cd ../duplicates
rm -r old2 test test1
rm error* sing *.counts singles.log.may10 log* out* bed.tab err
rm list list.aligns singles* *.uniq *.tmp *.tmp2 100names* pairs.* \
    pairsTest*
rm -r badPrsTest pairsTest
cd ../cloneandStsAliases
rm *.bak dr1* dr2* xref.names.uniq zfishNew* zfishOld* *.tab accs.*
mv testTables/*.pl .
rm -r test testTables
cd /cluster/data/danRer2/bed/
du -sh ZonLab
# 46G     ZonLab/
# removed 104 GB
##########################################################################