# for emacs: -*- mode: sh; -*-

# Danio rerio (zebrafish) from Sanger, version Zv6 (released March 2006)

# Project website:
#     http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
#     http://www.sanger.ac.uk/Projects/D_rerio/
#     ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_assembly_information.shtml

# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-----------+-------------------------+
# | tableName | type                    |
# +-----------+-------------------------+
# | refGene   | genePred refPep refMrna |
# | mgcGenes  | genePred                |
# | genscan   | genePred genscanPep     |
# +-----------+-------------------------+

###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2006-03-29, hartera)
# CHANGED NAME OF SCAFFOLDS AGP FILE (DONE, 2006-04-13, hartera)
    ssh kkstore01
    mkdir /cluster/store8/danRer4
    ln -s /cluster/store8/danRer4 /cluster/data/danRer4
    cd /cluster/data/danRer4
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/README
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6.chunks.agp
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6.scaffold.agp
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_scaffolds.fa
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_scaffolds.stats
    # keep agp file name consistent with Zv5 (hartera, 2006-04-13)
    mv Zv6.scaffold.agp Zv6.scaffolds.agp

###########################################################################
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2006-03-29, hartera)
# ADDED CHUNKS AGP FILE (DONE, 2006-04-13, hartera)
    ssh kkstore01
    mkdir -p /cluster/data/danRer4/M
    cd /cluster/data/danRer4/M
    # go to http://www.ncbi.nih.gov/ and search the Nucleotide database for
    # "Danio mitochondrion genome". That shows the gi number:
    # 8576324 for the accession, AC024175
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
    # Edit chrM.fa: make sure the header line says it is the
    # Danio rerio mitochondrion complete genome, and then replace the
    # header line with just ">chrM".
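    # For example, the downloaded record can be eyeballed before editing
    # (an optional check, a sketch rather than part of the original log;
    # the single record should be 16596 bases, matching the chrM.agp
    # created below):
    grep '>' chrM.fa
    faSize chrM.fa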
    perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
    rm *.bak
    # Make a "pseudo-contig" for processing chrM too:
    mkdir ./chrM_1
    sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
    mkdir ./lift
    echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
    echo "chrM_1" > ./lift/ordered.lst
    # make sure this is tab delimited:
    echo "0\tM/chrM_1\t16596\tchrM\t16596" > ./lift/ordered.lft
    # create a .agp file for chrM as hgGoldGapGl and other
    # programs require a .agp file so create chrM.agp
    echo "chrM\t1\t16596\t1\tF\tAC024175.3\t1\t16596\t+" \
        > chrM.agp
    # Create a chrM.chunks.agp (hartera, 2006-04-13)
    mkdir -p /cluster/data/danRer4/M/agps
    cd /cluster/data/danRer4/M/agps
    awk 'BEGIN {OFS="\t"} \
        {print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' \
        ../chrM.agp > chrM.chunks.agp
    # make sure that all above *.agp files are tab delimited

###########################################################################
# CREATE LIST OF CHROMOSOMES (DONE, 2006-04-12, hartera)
# Change names of random chroms to chrNA_random and chrUn_random
# (DONE, hartera, 2006-04-21)
    ssh kkstore01
    cd /cluster/data/danRer4
    awk '{if ($1 !~ /Zv6/) print $1;}' Zv6.scaffolds.agp \
        | sort -n | uniq > chrom.lst
    cp chrom.lst chrom1to25.lst
    # add chrM, chrUn and chrNA
    echo "M" >> chrom.lst
    echo "NA" >> chrom.lst
    echo "Un" >> chrom.lst
    # Change names of random chroms to reflect that they are unordered
    # collections of scaffolds
    perl -pi.bak -e 's/NA/NA_random/' chrom.lst
    perl -pi.bak -e 's/Un/Un_random/' chrom.lst
    rm *.bak

###########################################################################
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2006-04-12, hartera)
    ssh kkstore01
    cd /cluster/data/danRer4
    # This used to hold scripts -- better to keep them inline here
    # Now it should just hold lift file(s) and
    # temporary scripts made by copy-paste from this file.
    mkdir /cluster/data/danRer4/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/danRer4/bed

###########################################################################
# CHECK AGP FILES AND FASTA SIZE CONSISTENCY (DONE, 2006-04-13, hartera)
#
    ssh kkstore01
    cd /cluster/data/danRer4
    mkdir -p /cluster/data/danRer4/scaffolds
    cd /cluster/data/danRer4/scaffolds
    faSize detailed=on ../Zv6_scaffolds.fa > Zv6.scaffolds.sizes
    # Check that these sizes correspond to the sizes in the scaffolds agp file
    # use script compareSizes2.pl
cat << '_EOF_' > ../jkStuff/compareSizes2.pl
#!/usr/bin/perl -w
use strict;

my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];

open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";

my ($l, @f, $name, $size, %scafsHash);
while (<FILE>) {
   $l = $_;
   @f = split(/\t/, $l);
   $name = $f[0];
   $size = $f[1];
   $scafsHash{$name} = $size;
}
close FILE;
while (<AGP>) {
   my ($line, @fi, $scaf, $end);
   $line = $_;
   if ($line =~ /Zv/) {
      @fi = split(/\t/, $line);
      $scaf = $fi[5];
      $end = $fi[7];
      if (exists($scafsHash{$scaf})) {
         if ($scafsHash{$scaf} == $end) {
            print OUT "$scaf - ok\n";
         }
         else {
            print OUT "$scaf - different size to sequence\n";
         }
      }
      else {
         print OUT "$scaf - does not exist in list of sizes\n";
      }
   }
}
close AGP;
close OUT;
'_EOF_'
    # << happy emacs
    chmod +x ../jkStuff/compareSizes2.pl
    perl /cluster/data/danRer4/jkStuff/compareSizes2.pl \
        Zv6.scaffolds.sizes ../Zv6.scaffolds.agp
    grep different log.txt
    grep not log.txt
    # these are all consistent with the sequence sizes
    # check that the co-ordinates in the agp files are consistent:
    # field 2 is the start position, field 3 is the end and field 8 is the size
    # so check that this is
    # consistent.
    cd /cluster/data/danRer4
    awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' Zv6.scaffolds.agp \
        > Zv6.scaffolds.coordCheck
    # this file is empty so they are ok. do the same for the chunks.agp file
    awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' Zv6.chunks.agp \
        > Zv6.chunks.coordCheck
    # this file is empty so ok
    # check that the difference between 7th and 8th fields is the same as the
    # difference between 11th and 12th fields.
    awk '{if ($5 != "N" && (($8 - $7) != ($12 - $11))) print $6;}' \
        Zv6.chunks.agp > Zv6.chunks.coordCheck2
    # these are all ok
    rm Zv6.*.coord*
cat << '_EOF_' > ./jkStuff/checkSizesInAgps.pl
#!/usr/bin/perl -w
use strict;

my ($ch, $sc, %scafsHash);
$sc = $ARGV[0];   # scaffolds agp
$ch = $ARGV[1];   # chunks or contigs agp

open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";

while (<SCAFS>) {
   my ($l, @f, $name, $e);
   $l = $_;
   @f = split(/\t/, $l);
   if ($f[5] =~ /^Zv/) {
      $name = $f[5];
      $e = $f[2];
      $scafsHash{$name} = $e;
   }
}
close SCAFS;

my $scaf = "";
my $prev = "";
my $prevEnd = 0;
while (<CHUNKS>) {
   my ($line, @fi);
   $line = $_;
   @fi = split(/\t/, $line);
   # if it is not a gap line
   if ($fi[4] ne "N") {
      $scaf = $fi[9];
      if (($scaf ne $prev) && ($prev ne "")) {
         checkCoords($prev, $prevEnd);
      }
      $prev = $scaf;
      $prevEnd = $fi[2];
   }
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;

sub checkCoords {
   my ($name, $end) = @_;
   if (exists($scafsHash{$prev})) {
      if ($scafsHash{$prev} != $prevEnd) {
         my $ed = $scafsHash{$prev};
         print "Scaffold $prev is not consistent between agps\n";
      }
      else {
         my $ed = $scafsHash{$prev};
         print "Scaffold $prev - ok\n";
      }
   }
}
'_EOF_'
    # << happy emacs
    chmod +x ./jkStuff/checkSizesInAgps.pl
    cd scaffolds
    perl /cluster/data/danRer4/jkStuff/checkSizesInAgps.pl \
        Zv6.scaffolds.agp Zv6.chunks.agp > Zv6.scafsvschunks
    grep "not consistent" Zv6.scafsvschunks
    # no lines where inconsistency was reported
    wc -l Zv6.scafsvschunks
    # 6653 Zv6.scafsvschunks
    grep "Zv6" Zv6.scaffolds.agp | wc -l
    # 6653
    # so all the scaffolds were checked and were ok.
    cd ..
    rm -r scaffolds

###########################################################################
# SPLIT AGP FILES BY CHROMOSOME (DONE, 2006-04-13, hartera)
# GENOME FASTA FROM SANGER WAS CREATED USING SCAFFOLDS AGP
    ssh kkstore01
    cd /cluster/data/danRer4
    # There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and
    # then one for chunks (contigs on danRer1) showing how they map on to
    # scaffolds.
    # get list of scaffolds from FASTA file and check these are in agp
    grep '>' Zv6_scaffolds.fa | sed -e 's/>//' | sort | uniq > Zv6FaScafs.lst
    # get list of scaffolds from agp - do not print from gap lines
    awk '{if ($7 !~ /contig/) print $6;}' Zv6.scaffolds.agp \
        | sort | uniq > Zv6AgpScafs.lst
    diff Zv6FaScafs.lst Zv6AgpScafs.lst
    # no difference so all scaffolds are in the FASTA file
    # add "chr" prefix for the agp files
    perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
    # for chromosomes 1 to 25, create 2 agps for each chrom, one for scaffolds
    # and one for chunks:
    foreach c (`cat chrom1to25.lst`)
      echo "Processing $c ..."
mkdir $c perl -we "while(<>){if (/^chr$c\t/) {print;}}" \ ./Zv6.chunks.agp \ > $c/chr$c.chunks.agp perl -we "while(<>){if (/^chr$c\t/) {print;}}" \ ./Zv6.scaffolds.agp \ > $c/chr$c.scaffolds.agp end ########################################################################### # CREATE AGP FILES FOR chrNA AND chrUn (DONE, 2006-04-13, hartera) # RECREATE AGP FILES WITH chrNA and chrUn RENAMED AS chrNA_random # AND chrUn_random (DONE, 2006-04-21, hartera) # NOTE: IN THIS ASSEMBLY AND IN FUTURE, NAME chrNA AND chrUn AS # chrNA_random AND chrUn_random TO REFLECT THAT THEY ARE UNORDERED # COLLECTIONS OF SCAFFOLDS. ssh kkstore01 # chrNA_random consists of WGS contigs that could not be related to any # FPC contig and the scaffolds and contigs are named Zv5_NAN in the # first field of the agp files where the second N is an number. cd /cluster/data/danRer4 mkdir ./NA_random awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.chunks.agp \ > ./NA_random/NA_random.chunks.agp awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.scaffolds.agp \ > ./NA_random/NA_random.scaffolds.agp # change the first field to "chrNA_random" then can use agpToFa to process perl -pi.bak -e 's/Zv6_NA[0-9]+/chrNA_random/' ./NA_random/*.agp wc -l ./NA_random/NA_random.scaffolds.agp # 2898 ./NA_random/NA_random.scaffolds.agp # check files and remove backup files # these are not sorted numerically by scaffold number rm ./NA_random/*.bak # then process chrUn_random - this is made from scaffolds and # contigs where the name is Zv6_scaffoldN in the first field of the # agp files. These scaffolds and contigs are unmapped to chromosomes # in the agp file. chrUn_random is made up of WGS scaffolds that mapped to # FPC contigs, but the chromosome is unknown. mkdir ./Un_random awk '{if ($1 ~ /Zv6_scaffold/) print;}' Zv6.chunks.agp \ > ./Un_random/Un_random.chunks.agp awk '{if ($1 ~ /Zv6_scaffold/) print;}' Zv6.scaffolds.agp \ > ./Un_random/Un_random.scaffolds.agp # change the first field to "chrUn_random" then can use agpToFa to process perl -pi.bak -e 's/Zv6_scaffold[0-9]+/chrUn_random/' ./Un_random/*.agp wc -l ./Un_random/Un_random.scaffolds.agp # 68 ./Un_random/Un_random.scaffolds.agp # check files and remove backup files rm ./Un_random/*.bak # get FASTA file of sequences for NA_random and Un_random and create agp with # Ns between scaffolds # from scaffolds agp, get name of scaffolds to be retrieved from the # FASTA file to make the NA_random and Un_random chromosomes. cd /cluster/data/danRer4 foreach c (NA_random Un_random) awk '{print $6;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst $HOME/bin/i386/faSomeRecords /cluster/data/danRer4/Zv6_scaffolds.fa \ $c/chr$c.scaffolds.lst $c/chr$c.fa end # check that all scaffolds in the list are in the FASTA file for # NA_random and Un_random. # made a change to scaffoldFaToAgp.c so that the the number of Ns to be # inserted between scaffolds can be specified as an option. # There are less and smaller random scaffolds than before so use 50,000 Ns # between scaffolds as for the human random chromosomes. foreach c (NA_random Un_random) $HOME/bin/i386/scaffoldFaToAgp -scaffoldGapSize=50000 $c/chr$c.fa mv $c/chr$c.fa $c/chr$c.scaffolds.fa end # change chrUn to chrNA_random for NA_random, change chrUn to chrUn_random # forUn_random. Change D to W for NA_random and Un_random.. 
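    # (For reference: in AGP component-type codes "D" is draft HTG and "W"
    # is WGS contig; the agps written by scaffoldFaToAgp above carry "D",
    # so the sed below switches the type to "W" for these whole-genome-
    # shotgun scaffolds.)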
sed -e 's/chrUn/chrNA_random/' ./NA_random/chrNA_random.agp \ | sed -e 's/D/W/' > ./NA_random/chrNA_random.scaffolds.agp # the scaffolds agp for chrNA_random is now sorted numerically by # scaffold number sed -e 's/chrUn/chrUn_random/' ./Un_random/chrUn_random.agp \ | sed -e 's/D/W/' > ./Un_random/chrUn_random.scaffolds.agp # edit ./NA_random/chrNA_random.scaffolds.agp and # ./Un_random/chrUn_random.scaffolds.agp and remove last line as this # just adds an extra 50000 Ns at the # end of the sequence. rm ./NA_random/chrNA_random.agp ./Un_random/chrUn_random.agp cat << '_EOF_' > ./jkStuff/createAgpWithGaps.pl #!/usr/bin/perl use strict; # This script takes a chunks agp and inserts Ns between scaffolds for # the chunks (contigs) agp file. Could also insert Ns between scaffolds # for scaffolds agp. my ($chrom, $numN, $name, $prev, $st, $end, $prevEnd, $id); my $chrom = $ARGV[0]; # chromosome name my $numN = $ARGV[1]; # number of Ns to be inserted my $type = $ARGV[2]; # contigs or scaffolds $prev = ""; $st = 1; $prevEnd = 0; $id = 0; while () { my $l = $_; my @f = split(/\t/, $l); if ($type eq "contigs") { $name = $f[9]; } else { $name = $f[5] } my $currSt = $f[1]; my $currEnd = $f[2]; my $size = $currEnd - $currSt; $id++; $st = $prevEnd + 1; $end = $st + $size; if (($prev ne "") && ($prev ne $name)) { $st = $prevEnd + 1; $end = ($st + $numN) - 1; print "$chrom\t$st\t$end\t$id\tN\t$numN\tcontig\tno\n"; $prevEnd = $end; $id++; } $st = $prevEnd + 1; $end = $st + $size; print "$chrom\t$st\t$end\t$id\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$f[8]"; if ($type eq "contigs") { print "\t$f[9]\t$f[10]\t$f[11]"; } $prevEnd = $end; $prev = $name; } '_EOF_' chmod +x ./jkStuff/createAgpWithGaps.pl cd /cluster/data/danRer4/NA_random # for NA_random, sort the chunks.agp by contig number perl -pi.bak -e 's/Zv6_NA//' NA_random.chunks.agp sort -k6,6n NA_random.chunks.agp > NA_random.chunks2.agp # then put back Zv6_NA perl -pi.bak -e 's/([0-9]+\.[0-9]+)/Zv6_NA$1/' NA_random.chunks2.agp mv NA_random.chunks2.agp NA_random.chunks.agp # Un_random.chunks.agp is already sorted by scaffold number cd /cluster/data/danRer4 foreach c (NA_random Un_random) cd $c perl /cluster/data/danRer4/jkStuff/createAgpWithGaps.pl \ chr${c} 50000 contigs < ${c}.chunks.agp > chr${c}.chunks.agp cd .. end # check co-ordinates # field 2 is the start position, field 3 is the end and field 8 is the size # so check that this is consistent in scaffolds and chunks agp. # check that the difference between 7th and 8th fields is the same as the # difference between 11th and 12th fields for chunks agp. cd /cluster/data/danRer4 foreach c (NA_random Un_random) awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' \ $c/chr${c}.scaffolds.agp > $c/chr${c}.scaffolds.coordCheck awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' \ $c/chr${c}.chunks.agp > $c/chr${c}.chunks.coordCheck awk '{if ($5 != "N" && (($8 - $7) != ($12 - $11))) print $6;}' \ $c/chr${c}.chunks.agp > $c/chr${c}.chunks.coordCheck2 end # check the outputs are empty wc -l NA_random/*.coord* wc -l Un_random/*.coord* rm NA_random/*.coord* Un_random/*.coord* # check that the scaffolds and chunks agp files are consistent with # each other. 
cat << '_EOF_' > ./jkStuff/checkSizesInAgps.pl #!/usr/bin/perl -w use strict; my ($ch, $sc, %scafsHash); $sc = $ARGV[0]; # scaffolds agp $ch = $ARGV[1]; # chunks or contigs agp open(SCAFS, $sc) || die "Can not open $sc: $!\n"; open(CHUNKS, $ch) || die "Can not open $ch: $!\n"; while () { my ($l, @f, $name, $e); $l = $_; @f = split(/\t/, $l); if ($f[5] =~ /^Zv/) { $name = $f[5]; $e = $f[2]; $scafsHash{$name} = $e; } } close SCAFS; my $scaf = ""; my $prev = ""; my $prevEnd = 0; while () { my ($line, @fi); $line = $_; @fi = split(/\t/, $line); # if it is not a gap line if ($fi[4] ne "N") { $scaf = $fi[9]; if (($scaf ne $prev) && ($prev ne "")) { checkCoords($prev, $prevEnd); } $prev = $scaf; $prevEnd = $fi[2]; } } # check last entry in file checkCoords($prev, $prevEnd); close CHUNKS; sub checkCoords { my ($name, $end) = @_; if (exists($scafsHash{$prev})) { if ($scafsHash{$prev} != $prevEnd) { my $ed = $scafsHash{$prev}; print "Scaffold $prev is not consistent between agps\n"; } else { my $ed = $scafsHash{$prev}; print "Scaffold $prev - ok\n"; } } } '_EOF_' # << happy emacs chmod +x jkStuff/checkSizesInAgps.pl foreach c (NA_random Un_random) perl /cluster/data/danRer4/jkStuff/checkSizesInAgps.pl \ $c/chr${c}.scaffolds.agp $c/chr${c}.chunks.agp \ > $c/${c}.scafsvschunks end foreach c (NA_random Un_random) grep "not consistent" $c/${c}.scafsvschunks end wc -l NA_random/NA_random.scafsvschunks wc -l Un_random/Un_random.scafsvschunks # no lines were inconsistency was reported rm NA_random/NA_random.scafsvschunks Un_random/Un_random.scafsvschunks # clean up foreach c (NA_random Un_random) rm $c/${c}.scaffolds.agp $c/${c}.chunks.agp $c/chr${c}.scaffolds.fa \ $c/chr${c}.scaffolds.lst $c/*.bak end '_EOF_' ########################################################################### # BUILD CHROM-LEVEL SEQUENCE (DONE, 2006-04-13, hartera) # REPEAT THIS FOR chrNA_random AND chrUn_random (DONE, 2006-04-21, hartera) ssh kkstore01 cd /cluster/data/danRer4 # Ignore warnings about chrM files not existing - this chrom has # already been processed - see mitochondrion section above. # Sequence is already in upper case so no need to change foreach c (`cat chrom.lst`) echo "Processing ${c}" $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \ $c/chr$c.fa ./Zv6_scaffolds.fa echo "${c} - DONE" end # move scaffolds agp to be chrom agp and clean up foreach c (`cat chrom.lst`) cd $c cp chr${c}.scaffolds.agp chr${c}.agp mkdir -p agps mv chr${c}.*.agp ./agps/ cd .. end # Repeat just for chrNA_random and chrUn_random (2006-04-21, hartera) foreach c (NA_random Un_random) echo "Processing ${c}" $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \ $c/chr$c.fa ./Zv6_scaffolds.fa echo "${c} - DONE" end # move scaffolds agp to be chrom agp and clean up foreach c (NA_random Un_random) cd $c cp chr${c}.scaffolds.agp chr${c}.agp mkdir -p agps mv chr${c}.*.agp ./agps/ cd .. end ########################################################################## # CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2006-04-14, hartera) # RE-CHECK THESE AFTER CREATING chrNA_random AND chrUn_random SEQUENCE FILES # (DONE, 2006-04-20, hartera) # Check that the size of each chromosome .fa file is equal to the last # co-ordinate of the corresponding agp file. 
ssh hgwdev cd /cluster/data/danRer4 foreach c (`cat chrom.lst`) foreach f ( $c/chr$c.agp ) set agpLen = `tail -1 $f | awk '{print $3;}'` set h = $f:r set g = $h:r echo "Getting size of $g.fa" set faLen = `faSize $g.fa | awk '{print $1;}'` if ($agpLen == $faLen) then echo " OK: $f length = $g length = $faLen" else echo "ERROR: $f length = $agpLen, but $g length = $faLen" endif end end # all are the OK so FASTA files are the expected size ########################################################################### # CREATING DATABASE (DONE, 2006-04-14, hartera) # Create the database. # next machine ssh hgwdev echo 'create database danRer4' | hgsql '' # if you need to delete that database: !!! WILL DELETE EVERYTHING !!! echo 'drop database danRer4' | hgsql danRer4 # Use df to make sure there is at least 10 gig free on df -h /var/lib/mysql # Before loading data: # Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 1.5T 173G 90% /var/lib/mysql ########################################################################### # CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2006-04-14, hartera) # next machine ssh hgwdev # the following command copies all the data from the table # grp in the database mm8 to the new database danRer4. Use one of the # newest databases to copy from to make sure that the groupings are # up to date. echo "create table grp (PRIMARY KEY(NAME)) select * from mm8.grp" \ | hgsql danRer4 # if you need to delete that table: !!! WILL DELETE ALL grp data !!! echo 'drop table grp;' | hgsql danRer4 ########################################################################### # MAKE HGCENTRALTEST ENTRY FOR DANRER4 (DONE, 2006-04-14, hartera) # CHANGE DATE FORMAT ON HGCENTRALTEST ENTRY (DONE, 2006-04-21, hartera) # Make entry into dbDb and defaultDb so test browser knows about it. ssh hgwdev # Add dbDb and defaultDb entries: echo 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ values("danRer4", "March 2006", \ "/gbdb/danRer4", "Zebrafish", "chr2:15,906,734-15,926,406", 1, \ 37, "Zebrafish", "Danio rerio", \ "/gbdb/danRer4/html/description.html", 0, 0, \ "Sanger Centre, Danio rerio Sequencing Project Zv6");' \ | hgsql -h genome-testdb hgcentraltest # reformat the date (2006-04-21, hartera) echo 'update dbDb set description = "Mar. 2006" where name = "danRer4";' \ | hgsql -h genome-testdb hgcentraltest # Create /gbdb directory for danRer4 mkdir /gbdb/danRer4 # SET AS DEFAULT LATER WHEN READY FOR RELEASE # set danRer4 to be the default assembly for Zebrafish echo 'update defaultDb set name = "danRer4" \ where genome = "Zebrafish";' \ | hgsql -h genome-testdb hgcentraltest ########################################################################### # BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS # (DONE, 2006-04-14, hartera) # RE-DONE JUST FOR chrNA_random AND chrUn_random (DONE, 2006-04-20, hartera) ssh kkstore01 cd /cluster/data/danRer4 foreach c (`cat chrom.lst`) foreach agp ($c/chr$c.agp) if (-e $agp) then set fa = $c/chr$c.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 endif end end # Repeat just for chrNA_random and chrUn_random (2006-04-21, hartera) ssh kkstore01 cd /cluster/data/danRer4 foreach c (NA_random Un_random) foreach agp ($c/chr$c.agp) if (-e $agp) then set fa = $c/chr$c.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . 
-nSize=5000000 endif end end ########################################################################### # MAKE LIFTALL.LFT (DONE, 2006-04-14, hartera) # REMAKE LIFTALL.LFT WITH chrNA_random AND chrUn_random # (DONE, 2006-04-21, hartera) ssh kkstore01 cd /cluster/data/danRer4 rm jkStuff/liftAll.lft foreach c (`cat chrom.lst`) cat $c/lift/ordered.lft >> jkStuff/liftAll.lft end ########################################################################### # MAKE TRACKDB ENTRY FOR DANRER4 (DONE, 2006-04-14, hartera) # Should add this later when adding gold/gap tracks. Angie created a # temporary chromInfo table otherwise make update/alpha causes an error # (2006-04-17) # Make trackDb table so browser knows what tracks to expect. ssh hgwdev mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer4 cd ~/kent/src/hg/makeDb/trackDb/zebrafish cvs add danRer4 cvs commit danRer4 cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add danRer4 in all the right places and do make update DBS=danRer4 make alpha DBS=danRer4 cvs commit -m "Added danRer4." makefile ########################################################################### # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2006-04-14, hartera) ssh hgwdev mkdir /cluster/data/danRer4/html # make a symbolic link from /gbdb/danRer4/html to /cluster/data/danRer4/html ln -s /cluster/data/danRer4/html /gbdb/danRer4/html # Add a description page for zebrafish cd /cluster/data/danRer4/html cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer3/description.html . # Edit this for zebrafish danRer4 # create a description.html page here cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer4 # Add description page here too cp /cluster/data/danRer4/html/description.html . cvs add description.html cvs commit -m "First draft of description page for danRer4." \ description.html cd ~/kent/src/hg/makeDb/trackDb make update DBS=danRer4 make alpha DBS=danRer4 ########################################################################### # SIMPLE REPEAT [TRF] TRACK (DONE, 2006-04-14, hartera) # RE-RUN FOR chrNA AND chrUn RENAMED AS chrNA_random AND chrUn_random # AND RELOAD THE TABLE (DONE, 2006-04-21, hartera) # MADE A NOTE IN THE HISTORY TABLE TO EXPLAIN WHY THE simpleRepeats TABLE # WAS RELOADED (DONE, 2006-04-22, hartera) # TRF can be run in parallel with RepeatMasker on the file server # since it doesn't require masked input sequence. # Run this on the kilokluster. Need to mask contig and chromosome # sequences so run trf using contig sequences. # First copy over contig sequences to iscratch and then rsync to cluster. ssh kkr1u00 rm -r /iscratch/i/danRer4/contigsNoMask mkdir -p /iscratch/i/danRer4/contigsNoMask cd /cluster/data/danRer4 foreach d (/cluster/data/danRer4/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) echo "Copyig $f ..." cp $f /iscratch/i/danRer4/contigsNoMask/ end end ls /iscratch/i/danRer4/contigsNoMask/*.fa | wc -l # 317 sequence files # rsync to cluster machines foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/ end ssh kki mkdir -p /cluster/data/danRer4/bed/simpleRepeat cd /cluster/data/danRer4/bed/simpleRepeat mkdir trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set path1 = $1 set inputFN = $1:t set outpath = $2 set outputFN = $2:t mkdir -p /tmp/$outputFN cp $path1 /tmp/$outputFN pushd . 
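# Work on a node-local /tmp copy of the input, run Tandem Repeats Finder via
# trfBig (which splits large sequences into pieces small enough for trf and
# writes bed output via -bedAt), then copy the bed back and clean up /tmp.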
cd /tmp/$outputFN /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp popd rm -f $outpath cp -p /tmp/$outputFN/$outputFN $outpath rm -fr /tmp/$outputFN/* rmdir --ignore-fail-on-non-empty /tmp/$outputFN '_EOF_' # << keep emacs coloring happy chmod +x runTrf cat << '_EOF_' > gsub #LOOP ./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # << keep emacs coloring happy ls -1S /iscratch/i/danRer4/contigsNoMask/chr*.fa > genome.lst gensub2 genome.lst single gsub jobList # 317 jobs para create jobList para try, check, push, check etc... para time # Completed: 317 of 317 jobs # CPU time in finished jobs: 25083s 418.05m 6.97h 0.29d 0.001 y # IO & Wait Time: 933s 15.55m 0.26h 0.01d 0.000 y # Average job time: 82s 1.37m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2732s 45.53m 0.76h 0.03d # Submission to last job: 4604s 76.73m 1.28h 0.05d # Re-do only for chrNA_random and chrUn_random (2006-04-21, hartera) ssh kki cd /cluster/data/danRer4/bed/simpleRepeat rm trf/chrNA*.bed rm trf/chrUn*.bed rm simpleRepeat.bed mkdir -p randomsRun/trf cd randomsRun cp ../runTrf . cp ../gsub . ls -1S /iscratch/i/danRer4/contigsNoMask/chr*_random*.fa > genome.lst gensub2 genome.lst single gsub jobList para create jobList # 46 jobs para try, check, push, check etc... para time # Completed: 46 of 46 jobs # CPU time in finished jobs: 1904s 31.73m 0.53h 0.02d 0.000 y # IO & Wait Time: 103s 1.72m 0.03h 0.00d 0.000 y # Average job time: 44s 0.73m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 241s 4.02m 0.07h 0.00d # Submission to last job: 269s 4.48m 0.07h 0.00d cp ./trf/*.bed /cluster/data/danRer4/bed/simpleRepeat/trf/ # lift up to chrom level cd /cluster/data/danRer4/bed/simpleRepeat rm simpleRepeat.bed liftUp simpleRepeat.bed /cluster/data/danRer4/jkStuff/liftAll.lft warn \ trf/*.bed # Reload into the database ssh hgwdev cd /cluster/data/danRer4/bed/simpleRepeat hgsql -e 'drop table simpleRepeat;' danRer4 hgLoadBed danRer4 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 759659 elements of size 16 # Make a note in the history table to explain why the simpleRepeats # table was reloaded (2006-04-22, hartera) hgsql -e 'update history set errata = \ "Dropped table for reloading after changing names of random chroms." 
        where ix = 2;' danRer4

###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
    ssh hgwdev
    cd /cluster/data/danRer4/bed
    mkdir microsat
    cd microsat
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
        ../simpleRepeat/simpleRepeat.bed > microsat.bed
    /cluster/bin/i386/hgLoadBed danRer4 microsat microsat.bed

###########################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2005-06-14, hartera)
# RE-DO AFTER RENAMING RANDOM CHROMS AS chrNA_random AND chrUn_random
# (DONE, 2006-04-21, hartera)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore01
    cd /cluster/data/danRer4/bed/simpleRepeat
    rm -r trfMask
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/danRer4
    rm -r ./bed/simpleRepeat/trfMaskChrom
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end

###########################################################################
# GET ADDITIONAL ZEBRAFISH REPBASE LIBRARY FOR REPEATMASKER AND ADD TO
# DANIO LIBRARY FOR REPEATMASKER (DONE, 2006-04-14, hartera)
    # Go to http://www.girinst.org/server/RepBase/RepBase11.02.fasta
    # (03-15-2006) and download zebunc.ref.txt containing unclassified zebrafish
    # repeats.
    # Need username and password. Copy to /cluster/bluearc/RepeatMasker/Libraries/
    ssh hgwdev
    cd /cluster/bluearc/RepeatMasker/Libraries
    # This is /cluster/bluearc/RepeatMasker060320/Libraries
    # Do a dummy run of RepeatMasker with the -species option. This creates
    # a zebrafish-specific library from the EMBL format RepBase library.
    # Then the zebunc.ref unclassified repeats can be added to this library.
    /cluster/bluearc/RepeatMasker/RepeatMasker -spec danio /dev/null
    # RepeatMasker version development-$Id: RepeatMasker,v 1.13 2006/03/21
    # This creates a specieslib in Libraries/20060315/danio
    # Format the zebunc.ref library:
    # Sequence is upper case, change to lower case like the specieslib
    cat zebunc.ref.txt | tr '[A-Z]' '[a-z]' > zebunc.ref.format
    perl -pi.bak -e 's/>dr([0-9]+)/>Dr$1#Unknown/' zebunc.ref.format
    grep '>' zebunc.ref.format | wc -l
    # 958
    cd /cluster/bluearc/RepeatMasker/Libraries/20060315/danio
    grep '>' specieslib | wc -l
    # 219
    mv specieslib danio.lib
    cat danio.lib ../../zebunc.ref.format > specieslib
    grep '>' specieslib | wc -l
    # 1177
    rm danio.lib
    # make a copy in Libraries directory in case this directory of libraries
    # is removed.
cp specieslib /cluster/bluearc/RepeatMasker/Libraries/danio.lib ########################################################################### # SPLIT SEQUENCE FOR REPEATMASKER RUN (DONE, 2006-04-14, hartera) # SPLIT SEQUENCE AGAIN JUST FOR chrNA_random AND chrUn_random AFTER RENAMING # THESE RANDOM CHROMS (DONE, 2006-04-21, hartera) ssh kkstore01 cd /cluster/data/danRer4 # break up into 500 kb sized chunks at gaps if possible # for RepeatMasker runs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end # took about 3 minutes. # split just for chrNA_random and chrUn_random (2006-04-21, hartera) cd /cluster/data/danRer4 foreach c (NA_random Un_random) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end ########################################################################### # REPEATMASKER RUN (DONE, 2006-04-21, hartera) # Originally run 2006-04-14. There was one sequence chr16_4_10.fa that # failed with a division by zero error. Sent this as a test case with the # danio library to Robert Hubley who fixed the bug and sent a new # version of ProcessRepeats. Checked this into CVS for # /cluster/bluearc/RepeatMasker on 2006-04-19. # When a new library is added for this version of RepeatMasker, need to # check in /cluster/bluearc/RepeatMasker/Libraries for a directory made # up of a date e.g. 20060315 here and inside this are species directories # for which RepeatMasker has already been run. In this directory it creates # a specieslib of the danio repeats. If this exists, this is used for the # RepeatMasker run for that species. Check that this contains the # unclassified Zebrafish repeats with IDs beginning with Dr. This library # with these repeats should have been created in the section above: # Use sequence split into 500 kb chunks. ssh kkstore01 cd /cluster/data/danRer4 mkdir RMRun # Record RM version used: ls -l /cluster/bluearc/RepeatMasker # lrwxrwxrwx 1 angie protein 18 Mar 20 16:50 /cluster/bluearc/RepeatMasker -> RepeatMasker060320 # March 20 2006 (open-3-1-5) version of RepeatMasker # get RM database version grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl \ > RMdatabase.version # RELEASE 20060315 cd /cluster/data/danRer4 cat << '_EOF_' > jkStuff/RMZebrafish #!/bin/csh -fe cd $1 pushd . 
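# $1 is the pseudo-contig directory and $2 the 500kb fa chunk to mask.
# Stage the chunk in a node-local /tmp directory so RepeatMasker I/O stays
# off the fileserver, then copy the .out (and .align/.tbl/.cat when present)
# back beside the input and remove the temporary directory.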
/bin/mkdir -p /tmp/danRer4/$2 /bin/cp $2 /tmp/danRer4/$2/ cd /tmp/danRer4/$2 /cluster/bluearc/RepeatMasker060320/RepeatMasker -ali -s -species danio $2 popd /bin/cp /tmp/danRer4/$2/$2.out ./ if (-e /tmp/danRer4/$2/$2.align) /bin/cp /tmp/danRer4/$2/$2.align ./ if (-e /tmp/danRer4/$2/$2.tbl) /bin/cp /tmp/danRer4/$2/$2.tbl ./ if (-e /tmp/danRer4/$2/$2.cat) /bin/cp /tmp/danRer4/$2/$2.cat ./ /bin/rm -fr /tmp/danRer4/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer4/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer4 '_EOF_' # << emacs chmod +x jkStuff/RMZebrafish # move old files out the way and re-run on 2006-04-19 cd /cluster/data/danRer4 mkdir RMOutOld foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig foreach c ($d/$contig*.fa.*) set t=$c:t mv $c /cluster/data/danRer4/RMOutOld/$t.bak end end cp /dev/null RMRun/RMJobs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/danRer4/jkStuff/RMZebrafish \ /cluster/data/danRer4/$d $f \ '{'check out line+ /cluster/data/danRer4/$d/$f.out'}' \ >> RMRun/RMJobs end end end # Do the run again with new version of ProcessRepeats used # for RepeatMasker. ssh pk cd /cluster/data/danRer4/RMRun para create RMJobs # 4382 jobs written to batch para try, check, push, check ... etc. para time # Completed: 4382 of 4382 jobs # CPU time in finished jobs: 11745656s 195760.94m 3262.68h 135.95d 0.372 y # IO & Wait Time: 18953s 315.88m 5.26h 0.22d 0.001 y # Average job time: 2685s 44.75m 0.75h 0.03d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3878s 64.63m 1.08h 0.04d # Submission to last job: 41887s 698.12m 11.64h 0.48d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore01 cd /cluster/data/danRer4 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif cd .. end # Re-run for just chrNA_random and chrUn_random (start on 2006-04-21) ssh kkstore01 mkdir /cluster/data/danRer4/RMRun/randomsRun cd /cluster/data/danRer4 cp /dev/null RMRun/randomsRun/RMJobs foreach c (NA_random Un_random) foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/danRer4/jkStuff/RMZebrafish \ /cluster/data/danRer4/$d $f \ '{'check out line+ /cluster/data/danRer4/$d/$f.out'}' \ >> RMRun/randomsRun/RMJobs end end end # Do the run again for chrNA_random and chrUn_random. ssh pk cd /cluster/data/danRer4/RMRun/randomsRun para create RMJobs # 468 jobs written to batch para try, check, push, check ... etc. 
para time # Completed: 468 of 468 jobs # CPU time in finished jobs: 551863s 9197.71m 153.30h 6.39d 0.017 y # IO & Wait Time: 2217s 36.96m 0.62h 0.03d 0.000 y # Average job time: 1184s 19.73m 0.33h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3836s 63.93m 1.07h 0.04d # Submission to last job: 9086s 151.43m 2.52h 0.11d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore01 cd /cluster/data/danRer4 foreach c (NA_random Un_random) foreach d (${c}/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end end #- Lift pseudo-contigs to chromosome level foreach c (NA_random Un_random) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif cd .. end # Load tables #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/danRer4 hgLoadOut danRer4 */chr*.fa.out -verbose=2 >& load.log # bad rep range [5031, 4990] line 51895 of 14/chr14.fa.out # bad rep range [4559, 4558] line 59431 of 16/chr16.fa.out # bad rep range [1202, 1201] line 131633 of 16/chr16.fa.out # bad rep range [280, 252] line 93608 of 17/chr17.fa.out # bad rep range [429, 272] line 43230 of 22/chr22.fa.out # bad rep range [262, 261] line 167346 of 3/chr3.fa.out # bad rep range [889, 888] line 28495 of 5/chr5.fa.out # bad rep range [349, 348] line 113404 of 5/chr5.fa.out # bad rep range [1133, 1132] line 200654 of 5/chr5.fa.out # bad rep range [965, 920] line 3567 of 8/chr8.fa.out # bad rep range [292, 291] line 6354 of NA_random/chrNA_random.fa.out # note: 11 records dropped due to repStart > repEnd # Not too many errors so just ignore, but send examples to Arian Smit # and Robert Hubley. # check coverage of repeats masked featureBits -chrom=chr1 danRer3 rmsk # 25822888 bases of 55500710 (46.527%) in intersection featureBits -chrom=chr1 danRer4 rmsk # 32880041 bases of 70589895 (46.579%) in intersection ########################################################################### # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF AND BUILD NIB FILES # (DONE, 2006-04-22, hartera) # MASK PSEUDO-CONTIGS AS NOT DONE BEFORE (DONE, 2006-05-27, hartera) ssh kkstore01 cd /cluster/data/danRer4 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. 
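    # maskOutFa -soft lower-cases bases covered by the RepeatMasker .out;
    # -softAdd adds the filtered TRF bed intervals to that soft masking;
    # passing the literal word "hard" as the mask file then converts all
    # lower-case bases to N for the .masked copies.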
set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom # for the chromosomes: foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # check percent sequence masked faSize /cluster/data/danRer4/1/chr1.fa # 70589895 bases (904883 N's 69685012 real 36751306 upper # 32933706 lower) in 1 sequences in 1 files faSize /cluster/data/danRer3/1/chr1.fa # 55805710 bases (1047706 N's 54758004 real 28887275 upper # 25870729 lower) in 1 sequences in 1 files # 47% of danRer4 chr1.fa is in lower case so masked # Build nib files, using the soft masking in the fa mkdir nib foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end ls ./nib/* | wc # 28 # for the contigs (2006-05-27, hartera) ssh kkstore04 cd /cluster/data/danRer4 set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) echo "repeat- and trf-masking contigs of chr$c" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end ########################################################################### # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION AND CREATE 2BIT FILE # (DONE, 2006-04-23, hartera) # CHANGE FILENAME TO 2BIT FILE IN CHROMINFO AND REMOVE NIB DIR IN /gbdb # (DONE, 2006-05-24, hartera) # Make symbolic links from /gbdb/danRer4/nib to the real nibs ssh hgwdev cd /cluster/data/danRer4 mkdir -p /gbdb/danRer4/nib foreach f (/cluster/data/danRer4/nib/chr*.nib) ln -s $f /gbdb/danRer4/nib end # Load /gbdb/danRer4/nib paths into database and save size info # hgNibSeq creates chromInfo table hgNibSeq -preMadeNib danRer4 /gbdb/danRer4/nib */chr*.fa echo "select chrom,size from chromInfo" | hgsql -N danRer4 > chrom.sizes # take a look at chrom.sizes, should be 28 lines wc chrom.sizes # 28 56 422 chrom.sizes # Make one big 2bit file as well, and make a link to it in # /gbdb/danRer4 because hgBlat looks there: faToTwoBit */chr*.fa danRer4.2bit # check the 2bit file twoBitInfo danRer4.2bit 2bit.tab diff 2bit.tab chrom.sizes # should be the same and they are so ok. rm 2bit.tab # add link to this 2bit file from gbdb danRer4 directory ln -s /cluster/data/danRer4/danRer4.2bit /gbdb/danRer4/ # (hartera, 2006-05-24) # change chromInfo table to have 2bit file for filename hgsql -e 'update chromInfo set fileName = "/gbdb/danRer4/danRer4.2bit";' \ danRer4 # then remove nib directory in /gbdb/danRer4 as do not need both nibs # and 2 bit file which is in /gbdb/danRer4. rm -r /gbdb/danRer4/nib ########################################################################### # MAKE GOLD AND GAP TRACKS (DONE, 2006-04-23, hartera) ssh hgwdev cd /cluster/data/danRer4 # the gold and gap tracks are created from the chrN.agp file and this is # the scaffolds or supercontigs agp hgGoldGapGl -noGl -chromLst=chrom.lst danRer4 /cluster/data/danRer4 . 
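    # As a cross-check (example only, not from the original log), the gap
    # total implied by the chrom-level agp files can be summed and compared
    # with the featureBits gap numbers below:
    awk '$5 == "N" {total += $3 - $2 + 1} END {print total}' \
        /cluster/data/danRer4/*/chr*.agp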
# featureBits danRer4 gold # 1626093931 bases of 1626093931 (100.000%) in intersection # featureBits danRer3 gold # 1630323462 bases of 1630323462 (100.000%) in intersection # featureBits danRer4 gap # 148566200 bases of 1626093931 (9.136%) in intersection # featureBits danRer3 gap # 13709500 bases of 1630323462 (0.841%) in intersection # there are larger gaps now in chrNA and chrUn so compare just chr1 # featureBits -chrom=chr1 danRer4 gap # 16000 bases of 70573895 (0.023%) in intersection # featureBits -chrom=chr1 danRer3 gap # 305000 bases of 55500710 (0.550%) in intersection # without random or chrUn chroms: # featureBits -noRandom danRer4 gap # 366200 bases of 1546950119 (0.024%) in intersection # featureBits -noRandom danRer3 gap # 6240000 bases of 1200146216 (0.520%) in intersection # Add trackDb.ra entries for gold and gap tracks and also create # gap.html and gold.html pages. ########################################################################### # PUT MASKED SEQUENCE OUT ON iSERVERS AND THE SAN FOR CLUSTER RUNS # (DONE, 2006-04-23, hartera) # TRFFA SEQUENCED WAS NOT MASKED SO ADD MASKED SEQUENCE TO iSERVERS AND # THE SAN FOR CLUSTER RUNS (DONE, 2006-05-30, hartera) ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/danRer4/nib mkdir -p /iscratch/i/danRer4/nib cp -p /cluster/data/danRer4/nib/chr*.nib /iscratch/i/danRer4/nib # Pseudo-contig fa that have been repeat- and trf-masked: # Add these pseudo-contigs that have been repeat- and trf-masked # and rsync again. (2006-05-30, hartera) rm -rf /iscratch/i/danRer4/trfFa mkdir /iscratch/i/danRer4/trfFa foreach d (/cluster/data/danRer4/*/chr*_?{,?}) cp -p $d/$d:t.fa /iscratch/i/danRer4/trfFa end rm -rf /iscratch/i/danRer4/rmsk mkdir -p /iscratch/i/danRer4/rmsk cp -p /cluster/data/danRer4/*/chr*.fa.out /iscratch/i/danRer4/rmsk cp -p /cluster/data/danRer4/danRer4.2bit /iscratch/i/danRer4/ # rsync files - faster than using iSync # rsync again - still can not rsync to kkr2u00 (hartera, 2006-05-30) foreach R (2 3 4 5 6 7 8) echo "rsync for kkr${R}u00 ..." rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/ end # error rsyncing to kkr2u00: # connect to host kkr2u00 port 22: No route to host # then add the same sequence files to the san ssh kkstore01 # Chrom-level mixed nibs that have been repeat- and trf-masked: mkdir -p /san/sanvol1/scratch/danRer4/nib rm -rf /san/sanvol1/scratch/danRer4/nib cp -p /cluster/data/danRer4/nib/chr*.nib /san/sanvol1/scratch/danRer4/nib cp /cluster/data/danRer4/danRer4.2bit /san/sanvol1/scratch/danRer4 # Pseudo-contig fa that have been repeat- and trf-masked: # Add these pseudo-contigs again (2006-05-30, hartera) ssh kkstore04 rm -rf /san/sanvol1/scratch/danRer4/trfFa mkdir /san/sanvol1/scratch/danRer4/trfFa foreach d (/cluster/data/danRer4/*/chr*_?{,?}) cp -p $d/$d:t.fa /san/sanvol1/scratch/danRer4/trfFa end ########################################################################### # ADD CONTIGS TRACK (DONE, 2006-04-23, hartera) # make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from # chunks (contigs) agp files. 
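    # (agp coordinates are 1-based and fully closed while browser tables are
    # 0-based half-open, hence the "$2-1" for chromStart in the awk below.)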
ssh kkstore01 mkdir -p /cluster/data/danRer4/bed/ctgPos2 cd /cluster/data/danRer4/bed/ctgPos2 # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc foreach c (`cat /cluster/data/danRer4/chrom.lst`) awk 'BEGIN {OFS="\t"} \ {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \ /cluster/data/danRer4/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab end # load the ctgPos2 table ssh hgwdev cd /cluster/data/danRer4/bed/ctgPos2 # use hgLoadSqlTab as it gives more error messages than using # "load data local infile ...". /cluster/bin/i386/hgLoadSqlTab danRer4 ctgPos2 \ ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab # create trackDb.ra entry and html page for ctgPos2 track. # add search for the track and make sure the termRegex will handle # contigs named "Zv6_scaffoldN.N" where N is an integer and all the # contig accessions in the *.chunks.agp files. ########################################################################### # CREATE gc5Base WIGGLE TRACK (DONE, 2006-04-23, hartera) ssh kkstore01 mkdir -p /cluster/data/danRer4/bed/gc5Base cd /cluster/data/danRer4/bed/gc5Base nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer4 \ /cluster/data/danRer4 | wigEncode stdin gc5Base.wig gc5Base.wib # Calculating gcPercent with window size 5 # Using twoBit: /cluster/data/danRer4/danRer4.2bit # File stdout created # Converted stdin, upper limit 100.00, lower limit 0.00 # runs for about 7 minutes # load database with the .wig file and add .wib file to /gbdb/danRer4 ssh hgwdev cd /cluster/data/danRer4/bed/gc5Base mkdir /gbdb/danRer4/wib ln -s `pwd`/gc5Base.wib /gbdb/danRer4/wib time hgLoadWiggle -pathPrefix=/gbdb/danRer4/wib danRer4 gc5Base gc5Base.wig # 17 second load time # verify index is correct: hgsql danRer4 -e "show index from gc5Base;" # should see good numbers in Cardinality column ########################################################################### # MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2005-04-24, hartera) # Use -repMatch=512 (based on size -- for human we use 1024, and # the zebrafish genome is ~50% of the size of the human genome ssh kkr1u00 mkdir /cluster/data/danRer4/bed/ooc cd /cluster/data/danRer4/bed/ooc mkdir -p /san/sanvol1/scratch/danRer4 ls -1 /cluster/data/danRer4/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc -repMatch=512 # Wrote 50424 overused 11-mers to /cluster/bluearc/danRer4/11.ooc # For 10.ooc, repMatch = 4096 for human, so use 2048 blat nib.lst /dev/null /dev/null -tileSize=10 \ -makeOoc=/san/sanvol1/scratch/danRer4/danRer4_10.ooc -repMatch=2048 # Wrote 12231 overused 10-mers to /cluster/bluearc/danRer4/10.ooc # keep copies of ooc files in this directory and copy to iscratch cp /san/sanvol1/scratch/danRer4/*.ooc . cp -p /san/sanvol1/scratch/danRer4/*.ooc /iscratch/i/danRer4/ # rsync to iServers foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/danRer4/*.ooc \ kkr${R}u00:/iscratch/i/danRer4/ end ########################################################################### # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer4 (DONE, 2006-04-27, hartera) ssh hgwdev # DNA port is "0", trans prot port is "1" echo 'insert into blatServers values("danRer4", "blat17", "17788", "1", "0"); insert into blatServers values("danRer4", "blat17", "17789", "0", "1");' \ | hgsql hgcentraltest # this enables blat and isPcr, isPcr is enabled by loading blat server # with tilesize=5 (ask for this when request blat servers from # cluster admin). 
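    # Once the servers are up, a spot check of the DNA server could look
    # like this (test.fa is a hypothetical query file, and this assumes the
    # server was loaded from /gbdb/danRer4/danRer4.2bit; example only):
    gfClient blat17 17789 /gbdb/danRer4 test.fa test.psl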
# if you need to delete those entries echo 'delete from blatServers where db="danRer4";' | hgsql hgcentraltest ########################################################################### # AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2006-04-24, hartera) # UPDATED (2006-09-28) - see separate section, UPDATE AFFY ZEBRAFISH TRACK. # NOTE: Jim recommends that, in the future, all AFFY blat alignments should drop # -mask=lower for blat and drop -minIdentity=95 to -minIdentity=90 as the # higher minIdentity is causing alignments to be dropped that should not be. # e.g. blat -fine -minIdentity=90 -ooc=11.ooc # $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} # pslReps can be used to handle filtering at a later step. Blat's minIdentity # seems to be more severe than that for pslReps as it takes insertions and # deletions into account. # CHECKED ALIGNMENTS USING MASKED TRFFA AND RESULTS ARE THE SAME # (DONE, 2006-05-30, hartera) # array chip sequences already downloaded for danRer1 ssh hgwdev # need to copy sequences to the bluearc first to transfer to the iServers cd /projects/compbio/data/microarray/affyZebrafish mkdir -p /cluster/bluearc/affy cp -p \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /cluster/bluearc/affy/ # Set up cluster job to align Zebrafish consensus sequences to danRer3 ssh kkr1u00 mkdir -p /cluster/data/danRer4/bed/affyZebrafish.2006-04-24 ln -s /cluster/data/danRer4/bed/affyZebrafish.2006-04-24 \ /cluster/data/danRer4/bed/affyZebrafish cd /cluster/data/danRer4/bed/affyZebrafish mkdir -p /iscratch/i/affy cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/affy/*.fa \ kkr${R}u00:/iscratch/i/affy/ end # small cluster run to align sequences ssh kki cd /cluster/data/danRer4/bed/affyZebrafish ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst ls -1 /iscratch/i/danRer4/trfFa/chr[0-9M]*.fa > genome.lst # for output: mkdir -p psl echo '#LOOP\n/cluster/bin/i386/blat -fine -minIdentity=90 -ooc=/iscratch/i/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst affy.lst template.sub para.spec para create para.spec para try, check, push, check .... etc. # para time # Completed: 271 of 271 jobs # CPU time in finished jobs: 15331s 255.51m 4.26h 0.18d 0.000 y # IO & Wait Time: 737s 12.29m 0.20h 0.01d 0.000 y # Average job time: 59s 0.99m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 101s 1.68m 0.03h 0.00d # Submission to last job: 1557s 25.95m 0.43h 0.02d # do pslSort and liftUp ssh kkstore04 cd /cluster/data/danRer4/bed/affyZebrafish # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyZebrafish.psl pslSort dirs raw.psl tmp psl # only use alignments that have at least 95% identity in aligned region. # try minCover as now there is less sequence in chrUn and chrNA # so less likely that genes are split up. grep '>' /cluster/bluearc/affy/Zebrafish_consensus.fa | wc -l # 15502 pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null # see how many sequences are aligned: awk '{print $10;}' contig.psl > contigAligned tail +6 contigAligned | sort | uniq -c | sort -nr > contigAligned.count wc -l contigAligned.count # 14819 contigAligned.count tail +6 contig.psl | wc -l # 21486 # 96% of sequences are aligned. The sequence with the most alignments # aligns 177 times, then the next is 105, then 86, 85, 69, 69, 54, 54 etc. 
# for danRer3, 14335 were aligned (92% aligned). The sequence with # the most alignments aligned 96 times, then 31, 27, 22, 20, 19 times. # also 854 sequences aligned for danRer4 that did not align for danRer3. # 370 were aligned in danRer3 but not for danRer4. # USED THESE pslReps PARAMETERS: pslReps -minCover=0.30 -minAli=0.95 -nearTop=0.005 \ raw.psl contig2.psl /dev/null # see how many sequences are aligned: awk '{print $10;}' contig2.psl > contig2Aligned tail +6 contig2Aligned | sort | uniq -c | sort -nr > contig2Aligned.count wc -l contig2Aligned.count # 14528 contig2Aligned.count tail +6 contig2.psl | wc -l # 18744 # danRer3 has 21196 total alignments and 14335 sequences aligned. # 94% of sequences are aligned. # 785 sequences were aligned for danRer4 using minCover but not for # danRer3 after using pslReps. 592 sequences were aligned for danRer3 # but not for danRer4 using minCover after using pslReps. # the sequence with the most alignments aligns 105 times, then 85, 69, # 54, 50, 47, 44, 37, 26, 31, 29: # No. of alignments Sequence Name # 105 Zebrafish:Dr.15955.1.A1_at # 85 Zebrafish:Dr.20178.1.A1_at # 69 Zebrafish:Dr.885.1.S1_at # 54 Zebrafish:Dr.15958.1.S1_at # 50 Zebrafish:Dr.25427.1.A1_at # 47 Zebrafish:Dr.16470.1.A1_at # 44 Zebrafish:Dr.490.1.S1_at # 37 Zebrafish:Dr.7806.1.A1_at # 36 Zebrafish:Dr.19.1.A1_at # 31 Zebrafish:Dr.2825.1.A1_at # 29 Zebrafish:Dr.19556.1.A1_at # aligning with the -mask=lower option doesn't make a difference to the # number of alignments and sequences aligned. # there are 291 extra sequences that align when minCover option is # not used. Only 7 of these have 22 or more alignments. # 86 Zebrafish:Dr.24316.1.S1_at # 69 Zebrafish:Dr.14452.1.A1_at # 39 Zebrafish:Dr.12372.1.S1_at # 26 Zebrafish:Dr.18296.2.S1_a_at # 23 Zebrafish:Dr.7519.1.A1_at # 22 Zebrafish:Dr.8680.1.S1_at # 22 Zebrafish:Dr.22175.1.S1_at # clean up rm contig* # use pslReps without the minCover option as it does allow quite a lot # more alignments and the number of total alignments/number of sequences # aligned is still close to that for danRer3. Using nearTop=0.001 does # decrease the number of alignments but also means that some good # alignments are lost. pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl # shorten names in psl file: sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp mv affyZebrafish.psl.tmp affyZebrafish.psl pslCheck affyZebrafish.psl # co-ordinates are ok. psl is good. # load track into database ssh hgwdev cd /cluster/data/danRer4/bed/affyZebrafish hgLoadPsl danRer4 affyZebrafish.psl # Add consensus sequences for Zebrafish chip # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/affyProbes ln -s \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /gbdb/hgFixed/affyProbes hgLoadSeq -abbr=Zebrafish: danRer4 \ /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa # Clean up rm batch.bak contig.psl raw.psl # trackDb.ra entry and html are already there in trackDb/zebrafish/ ########################################################################### # CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY AND # ADD CHROM SIZES FOR BLASTZ CLUSTER RUNS (DONE, 2006-04-24, hartera) # There are no lineage-specific repeats for zebrafish and other species # so use all repeats. 
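    # (With BLASTZ_ABRIDGE_REPEATS=1, blastz uses the per-chrom *.out.spec
    # files under the SEQ*_SMSK directories to strip annotated repeats from
    # the sequence before aligning and to restore coordinates afterwards;
    # lacking a true fish lineage-specific repeat set, the full RepeatMasker
    # output is copied here instead.)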
ssh pk mkdir -p /san/sanvol1/scratch/danRer4/linSpecRep.notInOthers foreach f (/cluster/data/danRer4/*/chr*.fa.out) cp -p $f \ /san/sanvol1/scratch/danRer4/linSpecRep.notInOthers/$f:t:r:r.out.spec end cp -p /cluster/data/danRer4/chrom.sizes \ /san/sanvol1/scratch/danRer4/ ########################################################################### # BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR # HUMAN (hg18) (DONE, 2006-04-24 - 2006-04-25, hartera) # LOAD BLASTZ PSLS INTO DATABASE AND CHECK FOR HUMAN CONTAMINATION # (DONE, 2006-05-11, hartera) ssh pk # Blastz uses lineage-specific repeats. There are none for human # and zebrafish so use all repeats. # There is a lineage-specific repeats directory for zebrafish (see # section on CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS # DIRECTORY. lineage-specific repeats for hg18 already made - see # makeHg18.doc (BLASTZ ZEBRAFISH section). mkdir -p /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 cd /cluster/data/danRer4/bed ln -s blastz.hg18.2006-04-24 blastz.hg18 cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 # only 5% of the danRer4 genome is now in the random unordered chroms # so not running only scaffolds for these chroms - run as virtual chroms # and use same parameters as for danRer2. cat << 'EOF' > DEF # danRer4 zebrafish target, human hg18 query export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # use parameters suggested for human-fish evolutionary distance # recommended in doBlastzChainNet.pl help # (previously used for hg16-fr1, danRer1-mm5) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # TARGET: zebrafish (danRer4) # Use all chroms, including both randoms (chrNA_random and chrUn_random) SEQ1_DIR=/san/sanvol1/scratch/danRer4/nib SEQ1_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: human (hg18) - single chunk big enough to run each chrom by itself # Use all chroms, including all randoms SEQ2_DIR=/san/sanvol1/scratch/hg18/nib SEQ2_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len SEQ2_SMSK=/san/sanvol1/scratch/hg18/linSpecRep.notInOthers SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/danRer4/bed/blastz.hg18.2006-04-24 TMPDIR=/scratch/tmp 'EOF' # << happy emacs chmod +x DEF nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& doBlastz.log & # Start: Mon Apr 24 19:20 Stop: Tues Apr 25 05:42 # Did not finish: # netChains: looks like previous stage was not successful # (can't find [danRer4.hg18.]all.chain[.gz]). # This file is there so run again. Continue chainMerge step so remove # all.chain file and chain directory. # NOTE: can leave these files and continue from the net step and it # will work. cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 rm ./axtChain/*.all.chain.gz rm -r ./axtChain/chain nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue chainMerge `pwd`/DEF >& chainMerge.log & # Took about 10 minutes. 
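    # To eyeball the result before any loading (example only): the merged
    # chain file is score-sorted, so the strongest alignments appear first.
    zcat axtChain/danRer4.hg18.all.chain.gz | grep '^chain' | head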
# Check results with featureBits and compare to those # for danRer3 and danRer2: ssh hgwdev featureBits danRer4 chainHg18Link # 64196991 bases of 1626093931 (3.948%) in intersection featureBits danRer3 chainHg18Link # 69559338 bases of 1630323462 (4.267%) in intersection featureBits danRer2 chainHg17Link # 70046373 bases of 1560497282 (4.489%) in intersection # After Genbank tracks are loaded, (hartera, 2006-04-27) featureBits -chrom=chr1 danRer4 refGene:cds chainHg18Link -enrichment # refGene:cds 0.732%, chainHg18Link 4.140%, both 0.558%, cover 76.19%, # enrich 18.40x featureBits -chrom=chr1 danRer3 refGene:cds chainHg18Link -enrichment # refGene:cds 0.769%, chainHg18Link 4.124%, both 0.604%, cover 78.49%, # enrich 19.03x featureBits -chrom=chr1 danRer4 refGene:cds netHg18 -enrichment # refGene:cds 0.732%, netHg18 31.154%, both 0.624%, cover 85.21%, # enrich 2.73x featureBits -chrom=chr1 danRer3 refGene:cds netHg18 -enrichment # refGene:cds 0.774%, netHg18 35.434%, both 0.679%, cover 87.73%, # enrich 2.48x # Similar coverage and enrichment as for hg18 chains and net on danRer3. # do the swap for Blastz chains over to human (hg18) and create net, # axtNet, mafNet, liftOver and Downloads. see also makeHg18.doc for # featureBits on these alignments. ssh pk cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 15 minutes. # Load Blastz results into database (DONE, 2006-05-11, hartera) ssh kkstore04 cd /cluster/data/danRer4/bed/blastz.hg18/pslParts # cat together Blastz for each chrom mkdir pslChrom foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo "Processing $c ..." foreach p (chr${c}.nib*) zcat $p >> ./pslChrom/chr${c}_blastzHg18.psl end end # load Blastz psls into the database ssh hgwdev cd /cluster/data/danRer4/bed/blastz.hg18/pslParts/pslChrom foreach f (*.psl) /cluster/bin/i386/hgLoadPsl danRer4 $f echo "$f Done" end # Then determine how much sequence has 100% identity to human with a # stretch of at least 300 bp. Human contamination was also found in # danRer1 and a user reported it more recently. foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo "chr$c" >> humanContamination.txt hgsql -e "select count(*) from chr${c}_blastzHg18 where matches >= 300 and misMatches = 0;" danRer4 >> humanContamination.txt end # There are 4 on chr11 that fit this criteria (same if decrease to regions # of >= 200 bp with 100% ID). hgsql -e \ 'select * from chr11_blastzHg18 where matches >= 300 and mismatches = 0;' \ danRer4 > chr11HumanSeq # only 2 of these also have no query inserts and 1 of the others only has # a 1 base insert: regions are of size 303, 310 and 367 bp. 
# The region of 330 bp has a 45 bp insert on the query side - see below
#bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts
#588 303 0 0 0 0 0 0 0 - chr4 191273063 69879746 69880049 chr11 52342180 502145 502448 1 303, 121393014, 502145,
#588 330 0 0 0 1 45 0 0 - chr4 191273063 69879319 69879694 chr11 52342180 502545 502875 2 1,329, 121393369,121393415, 502545,502546,
#588 310 0 0 0 0 0 0 0 - chr4 191273063 69878956 69879266 chr11 52342180 502928 503238 1 310, 121393797, 502928,
#588 667 0 0 0 1 1 0 0 - chr4 191273063 69878268 69878936 chr11 52342180 503258 503925 2 453,214, 121394127,121394581, 503258,503711,
###########################################################################
# BLASTZ/CHAIN/NET PREP (DONE 4/25/06 angie)
ssh kkstore04
cd /cluster/data/danRer4
cp -p danRer4.2bit /san/sanvol1/scratch/danRer4/
# Create a 2bit file for danRer4 with all chroms (1-25 and M) and the
# scaffolds for NA and Un:
awk '$1 == $6 {print $1;}' Zv6.scaffolds.agp \
| faSomeRecords Zv6_scaffolds.fa stdin stdout \
| faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa stdin \
  /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit
twoBitInfo /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit \
  /san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes
# Make a lift file for scaffolds --> {chrUn, chrNA}:
mkdir /cluster/data/danRer4/liftSupertoChrom
cd /cluster/data/danRer4/liftSupertoChrom
/cluster/bin/scripts/agpToLift \
  < ../NA_random/agps/chrNA_random.scaffolds.agp \
  > chrNA_random.lft
/cluster/bin/scripts/agpToLift \
  < ../Un_random/agps/chrUn_random.scaffolds.agp \
  > chrUn_random.lft
cat chr*.lft > liftNAandUnScaffoldsToChrom.lft
cp -p liftNAandUnScaffoldsToChrom.lft /san/sanvol1/scratch/danRer4/
# Distribute on /iscratch/i too (danRer4.2bit is already there):
ssh kkr1u00
cd /iscratch/i/danRer4
cp -p /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit .
twoBitInfo danRer4ChrUnNAScafs.2bit chromsUnNAScafs.sizes
cp -p \
  /cluster/data/danRer4/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft .
iSync
###########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/26/06 angie)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
cd /cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
cat << '_EOF_' > DEF
# zebrafish vs.
frog BLASTZ=/cluster/bin/penn/i386/blastz # Use same params as used for danRer1-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Zebrafish danRer4 SEQ1_DIR=/iscratch/i/danRer4/danRer4.2bit SEQ1_CTGDIR=/iscratch/i/danRer4/danRer4ChrUnNAScafs.2bit SEQ1_LIFT=/iscratch/i/danRer4/liftNAandUnScaffoldsToChrom.lft SEQ1_LEN=/cluster/data/danRer4/chrom.sizes SEQ1_CTGLEN=/iscratch/i/danRer4/chromsUnNAScafs.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/cluster/bluearc/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25 '_EOF_' # << emacs # kkstore04 can't see /iscratch so use an iServer as fileServer: doBlastzChainNet.pl -blastzOutRoot=/cluster/bluearc/danRer4XenTro2 \ -bigClusterHub=kk -fileServer=kkr8u00 -workhorse=kkr8u00 \ -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro2.2006-04-25 /cluster/data/danRer4/bed/blastz.xenTro2 ########################################################################### # CREATE LIFT FILES FOR RANDOM CHROMOSOMES' SCAFFOLDS # (DONE, 2006-04-25, hartera) # scaffolds lift files created by scaffoldFaToAgp when agp files created # for chrNA_random and chrUn_random. remove last line as this is an extra # gap line that was removed from the agp. ssh kkstore01 cd /cluster/data/danRer4 foreach c (NA_random Un_random) mkdir -p /cluster/data/danRer4/$c/tmp end # NA_random doesn't have .lft and .gap files from scaffoldFaToAgp so # recreate. It had no tmp dir with the NA_random.scaffolds.agp. awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.scaffolds.agp \ > ./NA_random/tmp/NA_random.scaffolds.agp # change the first field to "chrNA_random" then can use agpToFa to process perl -pi.bak -e 's/Zv6_NA[0-9]+/chrNA_random/' ./NA_random/tmp/*.agp wc -l ./NA_random/tmp/NA_random.scaffolds.agp # 2898 ./NA_random/tmp/NA_random.scaffolds.agp cd /cluster/data/danRer4 foreach c (NA_random) awk '{print $6;}' $c/tmp/$c.scaffolds.agp > $c/tmp/chr$c.scaffolds.lst $HOME/bin/i386/faSomeRecords /cluster/data/danRer4/Zv6_scaffolds.fa \ $c/tmp/chr$c.scaffolds.lst $c/tmp/chr$c.fa end cd /cluster/data/danRer4/NA_random/tmp scaffoldFaToAgp -scaffoldGapSize=50000 chrNA_random.fa # change chrUn to chrNA_random for NA_random, change chrUn to chrUn_random # forUn_random. Change D to W for NA_random and Un_random.. sed -e 's/chrUn/chrNA_random/' chrNA_random.agp \ | sed -e 's/D/W/' > chrNA_random.scaffolds.agp mv chrNA_random.fa chrNA_random.scaffolds.fa # also move the Un_random .lft and .gap files to Un_random/tmp mv ./Un_random/chrUn_random.lft ./Un_random/tmp/chrUn_random.lft mv ./Un_random/chrUn_random.gap ./Un_random/tmp/chrUn_random.gap # for chrNA_random and chrUn_random: remove last line as this is an extra # gap line that was removed from the chrN_random.agp. Add these # scaffold lift files to liftAll.lft. Also need to change the last # field so that the correct total number of bases is being shown in the # last column. 
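# (For reference, a liftUp .lft line has five fields: offset, scaffold name,
# scaffold size, chrom name, chrom size -- the "last field" being corrected
# below is the chrom size. Not part of the original build, but once the loop
# below has run, the fix can be spot-checked against chrom.sizes, e.g. for
# chrNA_random; both commands should print the same single number.)
awk '$1 == "chrNA_random" {print $2}' /cluster/data/danRer4/chrom.sizes
awk '{print $5}' NA_random/tmp/chrNA_random.scaffolds.lft | sort -u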
cd /cluster/data/danRer4 foreach c (NA_random Un_random) head -n -1 $c/tmp/chr${c}.lft > $c/tmp/chr${c}.scaffolds.lft perl -pi.bak -e "s/chrUn/chr${c}/" $c/tmp/chr${c}.scaffolds.lft if ($c == "NA_random") then perl -pi.bak -e 's/208064280/208014280/' \ $c/tmp/chrNA_random.scaffolds.lft else perl -pi.bak -e 's/19379532/19329532/' \ $c/tmp/chrUn_random.scaffolds.lft endif cat $c/tmp/chr${c}.scaffolds.lft >> ./jkStuff/liftAll.lft rm $c/tmp/chr${c}.lft $c/tmp/chr${c}.gap *.bak end ########################################################################### # AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN # (DONE, 2006-04-25 - 2006-04-26, hartera) ssh hgwdev cd ~kent/src/hg/makeDb/genbank cvs update -d -P etc # edit etc/genbank.conf to add danRer4 and commit this to CVS. # danRer4 (zebrafish) # Lift file partitions unplaced sequence pseudo-chroms danRer4.serverGenome = /cluster/data/danRer4/danRer4.2bit danRer4.clusterGenome = /iscratch/i/danRer4/danRer4.2bit danRer4.ooc = /iscratch/i/danRer4/danRer4_11.ooc danRer4.align.unplacedChroms = chrNA_random chrUn_random danRer4.lift = /cluster/data/danRer4/jkStuff/liftAll.lft danRer4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} danRer4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} danRer4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} danRer4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} danRer4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} danRer4.downloadDir = danRer4 danRer4.mgcTables.default = full danRer4.mgcTables.mgc = all # end of section added to etc/genbank.conf cvs commit -m "Added danRer4." etc/genbank.conf # update /cluster/data/genbank/ make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # danRer genome information ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial danRer4 & # Start: Tues Apr 25 12:53 Finish: Wed Apr 26 08:38 # logFile: var/build/logs/2006.04.25-12:53:39.danRer4.initalign.log # check log file tail -f var/build/logs/2006.04.25-12:53:39.danRer4.initalign.log # check it has finished (last line in log file): # kkstore02 2006.04.26-08:38:36 danRer4.initalign: finish # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad danRer4 & # logFile: var/dbload/hgwdev/logs/2006.04.26-15:45:19.dbload.log # check it is finished: hgwdev 2006.04.26-17:48:07 dbload: finish # Took about 2 hours. ########################################################################### # SPLIT UP ZEBRAFISH MASKED SEQUENCE FROM chrUn AND chrNA INTO SCAFFOLDS # ADD SOFT-MASKED SCAFFOLDS TO ISERVERS AND THE SAN FOR CLUSTER RUNS # (DONE, 2006-04-27, hartera) ssh kkstore01 cd /cluster/data/danRer4 # for chrNA_random and chrUn_random, get soft-masked sequence. foreach c (NA_random Un_random) cd $c mkdir scaffoldsSoftMask awk 'BEGIN {FS="\t"}{if ($5 != "N") \ print "faFrag -mixed chr'${c}'.fa",$2-1, $3, $6".fa";}' chr${c}.agp \ >> ./scaffoldsSoftMask/faFragSoftMask.csh cd .. end # change permissions run scripts to get sequences foreach d (NA_random Un_random) chmod +x $d/scaffoldsSoftMask/faFragSoftMask.csh end # wrapper shell script to run script to get the soft-masked scaffolds cat << '_EOF_' > jkStuff/getMaskedScaffolds.csh #!/bin/csh foreach c (NA_random Un_random) set dir=/cluster/data/danRer4 echo "Processing $c" cd $dir/$c/scaffoldsSoftMask cp ../chr${c}.fa . 
echo "Getting soft-masked sequences ..." nice faFragSoftMask.csh >& faFrag.log end '_EOF_' chmod +x jkStuff/getMaskedScaffolds.csh nice ./jkStuff/getMaskedScaffolds.csh & # Took about 2.5 hours. # check a few sequences that they are correct # add name of scaffold to sequence fasta and cat together foreach c (NA_random Un_random) set dir = /cluster/data/danRer4 cd $dir/$c/scaffoldsSoftMask foreach f (Zv*) set g=$f:r set sc=scaffold${c}.fa perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:_]+/>$g/" $f cat $f >> $sc rm *.bak end cp scaffold* $dir/$c/ end grep '>' NA_random/scaffoldNA_random.fa | wc -l # 2898 grep '>' Un_random/scaffoldUn_random.fa | wc -l # 68 # check sizes of final FASTA file with all sequences. check a few # sequence files to see that they are correct - ok cd /cluster/data/danRer4 cat << '_EOF_' > ./jkStuff/checkFastaSizes.csh #!/bin/csh -fe set scafName=$1 set agpLen=$2 set pref=`echo $scafName | cut -c1-2` if ($pref == "Zv") then set g=/cluster/data/danRer4/*/scaffoldsSoftMask/${scafName}.fa set h=$g:t echo "Getting size of $h" set faLen = `faSize $g | awk '{print $1;}'` if ($agpLen == $faLen) then echo " OK: apg length = $h length = $faLen" else echo "ERROR: length = $agpLen, but $h length = $faLen" endif endif '_EOF_' # << happy emacs chmod +x ./jkStuff/checkFastaSizes.csh # use bash as doing a cat in C shell seems to split the line up by space bash for c in NA_random Un_random do echo "Processing $c scaffolds ..."; cat $c/chr${c}.agp | while read line; do scaf=`echo $line | cut -d " " -f6`; size=`echo $line | cut -d " " -f8`; nice ./jkStuff/checkFastaSizes.csh $scaf $size >> checkFastaSizes.log; done done exit # back to C shell grep "ERROR:" checkFastaSizes.log | wc -l # No errors so all are the OK so FASTA files are the expected size # Add soft-masked scaffolds to the Iservers and the san for cluster runs ssh kkr1u00 cd /cluster/data/danRer4 mkdir /iscratch/i/danRer4/scaffoldsSoftMask foreach c (NA_random Un_random) foreach f (/cluster/data/danRer4/$c/scaffoldsSoftMask/Zv*.fa) cp -p $f /iscratch/i/danRer4/scaffoldsSoftMask end cp -p /cluster/data/danRer4/$c/scaffold${c}.fa /iscratch/i/danRer4 end ls /iscratch/i/danRer4/scaffoldsSoftMask/ | wc # 2966 # all files are there # rsync to cluster machines foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/ end ssh pk mkdir -p /san/sanvol1/scratch/danRer4/scaffoldsSoftMask foreach c (NA_random Un_random) foreach f (/cluster/data/danRer4/$c/scaffoldsSoftMask/Zv*.fa) rsync -a --progress $f /san/sanvol1/scratch/danRer4/scaffoldsSoftMask/ end rsync -a --progress /cluster/data/danRer4/${c}/scaffold${c}.fa \ /san/sanvol1/scratch/danRer4/ end foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa) echo $f >> files.log end wc -l files.log # 2966 files.log rm files.log # All files have transferred. 
########################################################################### ## SWAP MM8 blastz result (DONE - 2006-04-28 - Hiram) # ADD SYMBOLIC LINK TO SWAP DIR (DONE, 2006-05-04, hartera) # RE-MAKE MM8 CHAINS AND NET SWAP WITH DANRER4 RANDOM CHROMS # (DONE, 2006-05-24, hartera) ADDED LINK TO SWAP DIR (2006-05-27, hartera) ssh pk cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 # blastz parameters used in blastz alignment of danRer4 on mm8: # BLASTZ_ABRIDGE_REPEATS=1 # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF > swap.out 2>&1 & ssh hgwdev cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 time nice -n +19 featureBits danRer4 chainMm8Link \ > fb.danRer4.chainDanRer4Link 2>&1 & cat fb.danRer4.chainDanRer4Link # 60721886 bases of 1626093931 (3.734%) in intersection # Add symbolic link to new swap directory (2006-05-27, hartera) ssh kkstore04 cd /cluster/data/danRer4/bed ln -s blastz.mm8.swap blastz.mm8 ########################################################################### # MONDOM4 BLASTZ TESTS USING LINEAGE-SPECIFIC REPEATS OR DYNAMIC MASKING # AND SWAP (DONE, 2006-04-28, hartera) # used no lineage specific-repeats and M=50 for dynamic masking featureBits danRer4 chainMonDom4 # 541863023 bases of 1626093931 (33.323%) in intersection featureBits danRer4 chainMonDom4NoDyMsk # 534445657 bases of 1626093931 (32.867%) in intersection featureBits monDom4 chainDanRer4 # 856404995 bases of 3501643220 (24.457%) in intersection featureBits monDom4 chainDanRer4NoDyMsk # 812142533 bases of 3501643220 (23.193%) in intersection featureBits -chrom=chr1 danRer4 refGene:cds chainMonDom4Link -enrichment # refGene:cds 0.732%, chainMonDom4Link 5.573%, both 0.550%, cover 75.20%, # enrich 13.49x featureBits -chrom=chr1 danRer4 refGene:cds chainMonDom4NoDyMskLink -enrichment # refGene:cds 0.732%, chainMonDom4NoDyMskLink 4.083%, both 0.550%, # cover 75.15%, enrich 18.40x featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4Link -enrichment # refGene:cds 0.001%, chainDanRer4Link 2.448%, both 0.000%, # cover 55.63%, enrich 22.73x featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4NoDyMskLink -enrichment # refGene:cds 0.001%, chainDanRer4NoDyMskLink 1.807%, both 0.000%, # cover 43.85%, enrich 24.27x # There are only 36 RefSeq genes for monDom4 so results are misleading. # Try mrna and xenoRefGene table. 
# for mrna tables, not much difference: featureBits -chrom=chr1 monDom4 mrna chainDanRer4Link -enrichment # mrna 0.004%, chainDanRer4Link 2.448%, both 0.002%, cover 54.59%, # enrich 22.30x featureBits -chrom=chr1 monDom4 mrna chainDanRer4NoDyMskLink -enrichment # mrna 0.004%, chainDanRer4NoDyMskLink 1.807%, both 0.002%, # cover 52.67%, enrich 29.15x featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4Link -enrichment # xenoRefGene:cds 0.820%, chainDanRer4Link 2.448%, both 0.655%, # cover 79.88%, enrich 32.63x featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4NoDyMskLink -enrichment # xenoRefGene:cds 0.820%, chainDanRer4NoDyMskLink 1.807%, both 0.661%, # cover 80.63%, enrich 44.63x # For the nets: featureBits -chrom=chr1 danRer4 refGene:cds netMonDom4 -enrichment # refGene:cds 0.732%, netMonDom4 31.056%, both 0.612%, # cover 83.58%, enrich 2.69x featureBits -chrom=chr1 danRer4 refGene:cds netMonDom4NoDyMsk -enrichment # refGene:cds 0.732%, netMonDom4NoDyMsk 31.002%, both 0.617%, # cover 84.31%, enrich 2.72x featureBits -chrom=chr1 monDom4 refGene:cds netDanRer4 -enrichment # refGene:cds 0.001%, netDanRer4 25.224%, both 0.000%, # cover 66.95%, enrich 2.65x featureBits -chrom=chr1 monDom4 refGene:cds netDanRer4NoDyMsk -enrichment # refGene:cds 0.001%, netDanRer4NoDyMsk 24.539%, both 0.000%, # cover 49.19%, enrich 2.00x # rows in tables for chr1 # Assembly Table Number of rows # danRer4 chainMonDom4 36931 # danRer4 chainMonDom4Link 426659 # danRer4 chainMonDom4NoDyMsk 34363 # danRer4 chainMonDom4NoDyMskLink 361572 # monDom4 chainDanRer4 170759 # monDom4 chainDanRer4Link 2552995 # monDom4 chainDanRer4NoDyMsk 139797 # monDom4 chainDanRer4NoDyMskLink 1806858 # all chroms: # danRer4 netMonDom4 399531 # danRer4 netMonDom4NoDyMsk 346482 # monDom4 netDanRer4 395881 # monDom4 netDanRer4NoDyMsk 321288 # Use lineage-specific repeats and no dynamic masking, seem to get # better enrichment and coverage compared to gene CDS regions and also # there are less chains being produced. ########################################################################### # BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR # OPOSSUM (monDom4) (DONE, 2006-04-28 - 2006-04-29, hartera) ssh hgwdev # Remove all test chain and net tables and start again foreach c (`cat chrom.lst`) hgsql -e "drop table chr${c}_chainMonDom4;" danRer4 hgsql -e "drop table chr${c}_chainMonDom4Link;" danRer4 hgsql -e "drop table chr${c}_chainMonDom4NoDyMsk;" danRer4 hgsql -e "drop table chr${c}_chainMonDom4NoDyMskLink;" danRer4 end hgsql -e "drop table netMonDom4;" danRer4 hgsql -e "drop table netMonDom4NoDyMsk;" danRer4 # remove downloads rm -r /usr/local/apache/htdocs/goldenPath/danRer4/vsMonDom4 rm \ /usr/local/apache/htdocs/goldenPath/danRer4/liftOver/danRer4ToMonDom4.over.chain.gz rm /cluster/data/danRer4/bed/liftOver/danRer4ToMonDom4.over.chain.gz # remove old Blastz swap rm -r /cluster/data/danRer4/bed/blastz.monDom4.swap # remove link to old blastz directory rm -r /cluster/data/danRer4/bed/blastz.monDom4 # see makeMonDom4.doc for removal of test tables and download files # and swap directory on monDom4. ssh pk # Blastz uses lineage-specific repeats. There are none for human # and zebrafish so use all repeats. # There is a lineage-specific repeats directory for zebrafish (see # section on CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS # DIRECTORY. lineage-specific repeats for monDom4 made and also nibs - see # makeMonDom4.doc. Need nib files when running Blastz with # lineage-specific repeats. 
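# (Not part of the original build: a pre-flight glance that the masked inputs
# named in the DEF below are in place on the san; the paths are the same ones
# the DEF uses.)
ls /san/sanvol1/scratch/danRer4/nib | wc -l
ls /san/sanvol1/scratch/danRer4/linSpecRep.notInOthers | wc -l
ls /san/sanvol1/scratch/monDom4/nib | wc -l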
mkdir -p /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 cd /cluster/data/danRer4/bed ln -s blastz.monDom4.2006-04-28 blastz.monDom4 cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 # only 5% of the danRer4 genome is now in the random unordered chroms # so not running only scaffolds for these chroms - run as virtual chroms # and use same parameters as for danRer2 but use all repeats as # lineage-specific as monDom4 is now mapped to chroms. cat << 'EOF' > DEF # danRer4 zebrafish target, opossum monDom4 query export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # use parameters suggested for human-fish evolutionary distance # recommended in doBlastzChainNet.pl help. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # TARGET: zebrafish (danRer4) # Use all chroms, including both randoms (chrNA_random and chrUn_random) SEQ1_DIR=/san/sanvol1/scratch/danRer4/nib SEQ1_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: opossum (monDom4) SEQ2_DIR=/san/sanvol1/scratch/monDom4/nib SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ2_SMSK=/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers SEQ2_CHUNK=50000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 TMPDIR=/scratch/tmp 'EOF' # << happy emacs chmod +x DEF nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& doBlastz.log & # Start: Fri Apr 28 13:27 Finish: Apr 29 01:28 # Stopped after making and merging chains: # netChains: looks like previous stage was not successful # (can't find [danRer4.monDom4.]all.chain[.gz]). # Start again with net step and continue: cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue net `pwd`/DEF >& net.log & # Took about 15 minutes to finish. # Do swap to get danRer4 alignments on monDom4: # see also makeMonDom4.doc cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 15 minutes. ########################################################################### # BLASTZ FOR FUGU (fr1) (DONE, 2006-04-28 - 2006-04-29, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS # No lineage-specific repeats for this species pair. fr1 is in scaffolds # so not so easy to use repeats with this run anyway. There is a 2bit # file of scaffolds on the Iservers. # Run this with dynamic masking instead. # copy masked fr1 scaffolds 2 bit file to the san - see makeFr1.doc # size of scaffolds FASTA file: ssh kkr1u00 faSize /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa # 329140338 bases ssh pk mkdir /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 cd /cluster/data/danRer4/bed ln -s blastz.fr1.2006-04-28 blastz.fr1 cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 # use parameters for fr1 in makeDanRer2.doc. Using scaffolds makes this run # slower so it is best to have the scaffolds in the query. Use HoxD55.q # matrix as Fugu is quite distant from zebrafish. 
Blastz uses # lineage-specfic repeats but there are none for these two species. # Use soft-masked scaffolds and dynamic masking. cat << '_EOF_' > DEF # zebrafish (danRer4) vs. Fugu (fr1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - zebrafish (danRer4) SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes # 0.5 Mb chunk for target with 5 kb overlap SEQ1_LIMIT=30 SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY - Fugu (fr1) SEQ2_DIR=/san/sanvol1/scratch/fr1/fr1.2bit # soft-masked scaffolds in 2bit format SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/UnScaffolds/fr1UnScaffolds.2bit SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/UnScaffolds/scaffolds.sizes # large enough chunk to do whole genome at once SEQ2_CHUNK=500000000 SEQ2_LAP=0 BASE=/cluster/data/danRer4/bed/blastz.fr1.2006-04-28 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& doBlastz.log & # Start: Fri Apr 28 18:54 Finish: Apr 29 06:35 # Stopped after making and merging chains: # netChains: looks like previous stage was not successful # (can't find [danRer4.fr1.]all.chain[.gz]). # Start again with net step and continue: cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue net `pwd`/DEF >& net.log & # Took about an hour to finish. # check coverage: featureBits danRer4 chainFr1Link # 139280554 bases of 1626093931 (8.565%) in intersection featureBits danRer3 chainFr1Link # 137698495 bases of 1630323462 (8.446%) in intersection featureBits -chrom=chr1 danRer4 refGene:cds chainFr1Link -enrichment # refGene:cds 0.732%, chainFr1Link 8.464%, both 0.660%, # cover 90.18%, enrich 10.66x featureBits -chrom=chr1 danRer3 refGene:cds chainFr1Link -enrichment # refGene:cds 0.774%, chainFr1Link 8.364%, both 0.713%, # cover 92.09%, enrich 11.01x featureBits -chrom=chr1 danRer4 refGene:cds netFr1 -enrichment # refGene:cds 0.732%, netFr1 52.712%, both 0.710%, # cover 96.97%, enrich 1.84x featureBits -chrom=chr1 danRer3 refGene:cds netFr1 -enrichment # refGene:cds 0.774%, netFr1 58.353%, both 0.759%, # cover 97.95%, enrich 1.68x # Do the Blastz swap to get danRer4 alignments on fr1 # see also makeFr1.doc for featureBits on these alignments. ssh pk cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 30 minutes. ########################################################################### # BLASTZ FOR TETRAODON (tetNig1) (DONE, 2006-04-29 - 2006-04-30, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS # No lineage-specific repeats for this species pair. # Tetraodon also has no species-specific repeats in the RepeatMasker # library so run this using dynamic masking instead as for danRer2 and # danRer3. 
# The tetraodon 2bit file of chroms and scaffolds # (tetNig1ChromsRandomScafs.2bit) - this contains sequences for chroms # and for scaffolds of random chroms. ssh pk mkdir /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 cd /cluster/data/danRer4/bed ln -s blastz.tetNig1.2006-04-29 blastz.tetNig1 cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 # use parameters for tetNig1 in makeDanRer3.doc. Using scaffolds makes this run # slower so it is best to have the scaffolds in the query. Use HoxD55.q # matrix as tetraodon is quite distant from zebrafish. Blastz uses # lineage-specfic repeats but there are none for these two species. # Use soft-masked scaffolds and dynamic masking. cat << '_EOF_' > DEF # zebrafish (danRer4) vs. tetraodon (tetNig1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2500 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - zebrafish (danRer4) SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes # 0.5 Mb chunk for target with 5 kb overlap SEQ1_LIMIT=30 SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY - Tetraodon (tetNig1) SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit # soft-masked chroms and random scaffolds in 2bit format SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes # large enough chunk to do whole genome at once SEQ2_CHUNK=1000000000 SEQ2_LAP=0 BASE=/cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& doBlastz.log & # Start: Sat Apr 29 18:10 Finish: Apr 29 22:41 # Stopped after making and merging chains: # netChains: looks like previous stage was not successful # (can't find [danRer4.tetNig1.]all.chain[.gz]). However, this file # is there so start again with net step and continue: cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue net `pwd`/DEF >& net.log & # Took about 20 minutes to finish. # check coverage compared to danRer3: featureBits danRer4 chainTetNig1Link # 119439512 bases of 1626093931 (7.345%) in intersection featureBits danRer3 chainTetNig1Link # 109205244 bases of 1630323462 (6.698%) in intersection featureBits -chrom=chr1 danRer4 refGene:cds chainTetNig1Link -enrichment # refGene:cds 0.732%, chainTetNig1Link 7.536%, both 0.645%, # cover 88.08%, enrich 11.69x featureBits -chrom=chr1 danRer3 refGene:cds chainTetNig1Link -enrichment # refGene:cds 0.774%, chainTetNig1Link 6.821%, both 0.692%, # cover 89.34%, enrich 13.10x featureBits -chrom=chr1 danRer4 refGene:cds netTetNig1 -enrichment # refGene:cds 0.732%, netTetNig1 55.116%, both 0.705%, # cover 96.33%, enrich 1.75x featureBits -chrom=chr1 danRer3 refGene:cds netTetNig1 -enrichment # refGene:cds 0.774%, netTetNig1 61.540%, both 0.753%, # cover 97.24%, enrich 1.58x # Similar coverage as for tetNig1 chains and nets on zebrafish danRer3. 
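# (For the record, in featureBits -enrichment output "cover" is the fraction
# of the first track covered by the second, e.g. 0.645/0.732 = 88.1% above,
# and "enrich" is that coverage divided by the fraction of the chromosome in
# the second track, e.g. 88.08/7.536 = 11.7x -- i.e. how much more densely the
# chains hit coding bases than average bases.)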
# Do the Blastz swap to get danRer4 alignments on tetNig1 # see also makeTetNig1.doc for featureBits for these alignments on tetNig1. ssh pk cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 22 minutes to run. ########################################################################### # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2006-05-01, hartera) # RE-MAKE DOWNLOADS FOR AGP, SOFT AND HARD MASKED CHROMS, REPEATMASKER OUT # BECAUSE THEY DID NOT INCLUDE NA_RANDOM AND UN_RANDOM # (DONE, 2007-03-29, hartera) # NOTE THAT zipAll.csh MUST BE ALTERED ACCORDINGLY IN FUTURE. ssh kkstore01 cd /cluster/data/danRer4 #- Build the .tar.gz and *.gz files for bigZips cat << '_EOF_' > jkStuff/zipAll.csh rm -rf bigZips mkdir bigZips tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked # soft masked chrNA and chrUn scaffolds tar cvzf bigZips/scaffoldRandomsFa.tar.gz NA_random/scaffoldNA_random.fa \ Un_random/scaffoldUn_random.fa cd bed/simpleRepeat tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed cd ../.. # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=danRer4 -native GenBank mrna \ /cluster/data/danRer4/bigZips/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=danRer4 -xeno GenBank mrna \ /cluster/data/danRer4/bigZips/xenoMrna.fa # get native RefSeq mRNAs ./bin/i386/gbGetSeqs -db=danRer4 -native refseq mrna \ /cluster/data/danRer4/bigZips/refMrna.fa # get native GenBank ESTs ./bin/i386/gbGetSeqs -db=danRer4 -native GenBank est \ /cluster/data/danRer4/bigZips/est.fa # gzip the Genbank sequences and create upstream sequence files for RefSeq. cd /cluster/data/danRer4/bigZips gzip *.fa '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/zipAll.csh csh -ef ./jkStuff/zipAll.csh >& zipAll.log & # Took about 35 minutes. #- Look at zipAll.log to make sure all file lists look reasonable. # Make upstream files for zebrafish RefSeq and Copy the .gz files to # hgwdev:/usr/local/apache/... ssh hgwdev cd /cluster/data/danRer4/bigZips foreach I (1000 2000 5000) featureBits danRer4 refGene:upstream:${I} -fa=stdout \ | gzip -c > upstream${I}.fa.gz echo "upstream${I} done" end set gp = /usr/local/apache/htdocs/goldenPath/danRer4 mkdir -p $gp/bigZips cp -p *.gz $gp/bigZips mkdir -p $gp/chromosomes # Add individual chromosomes and file of scaffolds for each random chrom # to chromosomes downloads directory. foreach f (../*/chr*.fa) cp $f $gp/chromosomes end foreach c (NA_random Un_random) cd /cluster/data/danRer4/$c cp scaffold${c}.fa $gp/chromosomes end # create md5sum for bigZips cd $gp/bigZips md5sum *.gz > md5sum.txt # gzip each chrom or scaffolds for chrom separately in chromosomes dir cd $gp/chromosomes foreach f (*.fa) gzip $f end # create md5sum for chromosomes md5sum *.gz > md5sum.txt # Take a look at bigZips/* and chromosomes/* # copy README.txt's from danRer3 and update # RE-MAKE DOWNLOADS FOR AGP, SOFT AND HARD MASKED CHROMS, REPEATMASKER OUT # BECAUSE THEY DID NOT INCLUDE NA_RANDOM AND UN_RANDOM # (DONE, 2007-03-29, hartera) # NOTE THAT zipAll.csh MUST BE ALTERED ACCORDINGLY IN FUTURE. 
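# (Note on the globs used in zip2.csh below: "?{,?}{,_random}" expands to
# directory names of one or two characters optionally followed by "_random",
# so it picks up 1-25, M, NA_random and Un_random -- the randoms being what
# the original chromAgp/chromOut/chromFa/chromFaMasked tarballs missed.)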
ssh kkstore04 cd /cluster/data/danRer4 #- Rebuild the .tar.gz (agp, files for bigZips cat << '_EOF_' > jkStuff/zip2.csh rm -r bigZips/chromAgp.tar.gz rm -r bigZips/chromOut.tar.gz rm -r bigZips/chromFa.tar.gz rm -r bigZips/chromFaMasked.tar.gz tar cvzf bigZips/chromAgp.tar.gz ?{,?}{,_random}/chr*.agp tar cvzf bigZips/chromOut.tar.gz ?{,?}{,_random}/chr*.fa.out tar cvzf bigZips/chromFa.tar.gz ?{,?}{,_random}/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}{,_random}/chr*.fa.masked '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/zip2.csh csh -ef ./jkStuff/zip2.csh >& zip2.log & # Took about 10 minutes # Links to these files already exist from the # /usr/local/apache/htdocs/goldenpath/danRer4/bigZips directory. # Recreate the md5sum there to include these new files. cd /usr/local/apache/htdocs/goldenpath/danRer4/bigZips rm md5sum.txt md5sum *.gz > md5sum.txt ########################################################################### # HUMAN (hg18) PROTEINS TRACK FOR hg18 (DONE, 2006-04-28 - 2006-05-03, hartera) ssh kkstore01 bash # if not using bash shell already # make Blast database for non-random chrom sequences mkdir -p /cluster/data/danRer4/blastDb cd /cluster/data/danRer4/blastDb cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA_random/d" \ | sed "/Un_random/d" > chrom.list for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list ln -s `cat list` . for i in *.fa do /projects/compbio/bin/i686/formatdb -i $i -p F done rm *.log *.fa list cd /cluster/data/danRer4 for i in `cat blastDb/chrom.list`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft rm blastDb/chrom.list # Now make Blast database for random scaffolds sequences. mkdir /cluster/data/danRer4/scaffoldBlastDb cd /cluster/data/danRer4/scaffoldBlastDb # Take file of all scaffolds for NA_random and Un_random and cat together cat ../NA_random/scaffoldNA_random.fa ../Un_random/scaffoldUn_random.fa \ > allRandomScafs.fasta grep '>' allRandomScafs.fasta | wc -l # 2966 faSplit sequence allRandomScafs.fasta 500 scaf rm allRandomScafs.fasta for i in *.fa do /projects/compbio/bin/i686/formatdb -i $i -p F done rm *.log *.fa # combine databases for chroms and random chroms mkdir -p /san/sanvol1/scratch/danRer4/comboBlastDb cd /cluster/data/danRer4/blastDb for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer4/comboBlastDb; done cd /cluster/data/danRer4/scaffoldBlastDb for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer4/comboBlastDb; done mkdir -p /cluster/data/danRer4/bed/tblastn.hg18KG cd /cluster/data/danRer4/bed/tblastn.hg18KG echo /san/sanvol1/scratch/danRer4/comboBlastDb/*.nsq \ | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 4377 query.lst # we want around 250000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(250000/4377) = 643.016316 mkdir -p /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa split -l 643 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl \ /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice /cluster/home/braney/bin/x86_64/pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done exit # back to tcsh cd /cluster/data/danRer4/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer4/jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/danRer4/jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # then run the Blast cluster jobs ssh kk cd /cluster/data/danRer4/bed/tblastn.hg18KG para create blastSpec para try, check, push, check etc. # pushed 100,000 jobs at a time so need to do para push again later para time # Completed: 253866 of 253866 jobs # CPU time in finished jobs: 52410110s 873501.83m 14558.36h 606.60d 1.662 y # IO & Wait Time: 5508786s 91813.10m 1530.22h 63.76d 0.175 y # Average job time: 228s 3.80m 0.06h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2162s 36.03m 0.60h 0.03d # Submission to last job: 147825s 2463.75m 41.06h 1.71d # Took a while as had to repush some crashed jobs. ssh kkstore01 cd /cluster/data/danRer4/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS \ /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/danRer4/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 58 of 58 jobs # CPU time in finished jobs: 759034s 12650.56m 210.84h 8.79d 0.024 y # IO & Wait Time: 217724s 3628.74m 60.48h 2.52d 0.007 y # Average job time: 16841s 280.68m 4.68h 0.19d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 208828s 3480.47m 58.01h 2.42d # Submission to last job: 208891s 3481.52m 58.03h 2.42d ssh kkstore01 cd /cluster/data/danRer4/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done liftUp -nohead -type=.psl stdout \ /cluster/data/danRer4/jkStuff/liftAll.lft carry u.*.psl m60* | \ sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \ > /cluster/data/danRer4/bed/tblastn.hg18KG/blastHg18KG.psl pslCheck blastHg18KG.psl # this is ok. 
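# (For the record: in the awk filters above, psl field 11 is qSize, 12 is
# qStart, 13 is qEnd and 1 is matches, so the c60 files keep chained
# alignments spanning more than 60% of the query protein and the m60 files
# additionally require matches/qSize > 0.60.)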
# load table ssh hgwdev cd /cluster/data/danRer4/bed/tblastn.hg18KG hgLoadPsl danRer4 blastHg18KG.psl # check coverage featureBits danRer4 blastHg18KG # 21159392 bases of 1626093931 (1.301%) in intersection featureBits danRer3 blastHg17KG # 21063005 bases of 1630323462 (1.292%) in intersection featureBits -chrom=chr1 danRer4 refGene:cds blastHg18KG -enrichment # refGene:cds 0.732%, blastHg18KG 1.333%, both 0.428%, cover 58.43%, # enrich 43.83x featureBits -chrom=chr1 danRer3 refGene:cds blastHg17KG -enrichment # refGene:cds 0.774%, blastHg17KG 1.370%, both 0.450%, cover 58.05%, # enrich 42.38x # Similar coverage compared to refGene CDS as for hg17 proteins on danRer3. # back to kkstore04 to clean up ssh kkstore04 rm -rf /cluster/data/danRer4/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut # add trackDb.ra entry and html to ~/kent/src/hg/makeDb/trackDb/trackDb.ra # also added the blastHg18KG.html here. # blastKGPep04 and blastKGRef04 tables required on hg18 - these have # been created - see makeHg18.doc. update of hgc.c, hgTrackUi.c and # hgTracks.c was required - done by Brian. ########################################################################### # MULTIZ7WAY ALIGNMENTS FOR CONSERVATION TRACK # (DONE, 2006-05-04 - 2006-05-10, hartera) # RE-MAKE WITH DANRER4 RANDOMS FOR MM8 AND ADDED FRAMES TABLE AND # MULTIZ7WAY DOWNLOADS (DONE, 2006-05-28 - 2005-05-29, hartera) # for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18. ssh kkstore04 mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28 cd /cluster/data/danRer4/bed cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 # copy MAFs to a cluster-friendly server rm -r /san/sanvol1/scratch/danRer4/mafNet mkdir /san/sanvol1/scratch/danRer4/mafNet foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) echo $s rsync -av /cluster/data/danRer4/bed/blastz.$s/mafNet/* \ /san/sanvol1/scratch/danRer4/mafNet/$s/ end # prune the hg17 17way tree to just these 7 and update db names: /cluster/bin/phast/tree_doctor \ --prune-all-but=mouse_mm8,human_hg18,monodelphis_monDom4,xenopus_xenTro1,tetraodon_tetNig1,fugu_fr1,zebrafish_danRer3 \ --rename="xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \ /cluster/data/hg18/bed/multiz17way/17way.nh > 7way.nh # carefully edit so that danRer4 is first. copy first to new file cp 7way.nh 7way_zfishFirst.nh # /cluster/bin/phast/draw_tree 7way_zfishFirst.nh > 7way.ps # also made the ps file for the 7way.nh and compared to make sure # that the tree with zebrafish at the top looks correct. 
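# (Not part of the original build: a simple presence check that the pruned,
# renamed tree still names all seven assemblies; each grep should print 1
# since the tree file is a single line.)
foreach db (danRer4 fr1 tetNig1 xenTro2 monDom4 mm8 hg18)
  grep -c $db 7way_zfishFirst.nh
end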
/cluster/bin/phast/all_dists 7way_zfishFirst.nh > 7way.distances grep danRer4 7way.distances | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt cat distances.txt # 1.4749 tetraodon_tetNig1 # 1.5154 fugu_fr1 # 1.7480 human_hg18 # 1.7782 monodelphis_monDom4 # 1.8771 xenopus_xenTro2 # 2.1058 mouse_mm8 # the order in the browser display will be by tree topology, # not by distance, so they will be: # danRer4 # 1.5154 fugu_fr1 # 1.4749 tetraodon_tetNig1 # 1.8771 xenopus_xenTro2 # 1.7782 monodelphis_monDom4 # 2.1058 mouse_mm8 # 1.7480 human_hg18 # create species list and stripped down tree for autoMZ sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' \ 7way_zfishFirst.nh > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst ssh pk cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = danRer4 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs awk '{print $1}' /cluster/data/danRer4/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList para try, check, push, check etc. ... para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 7022s 117.03m 1.95h 0.08d 0.000 y IO & Wait Time: 142s 2.37m 0.04h 0.00d 0.000 y Average job time: 256s 4.26m 0.07h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 368s 6.13m 0.10h 0.00d Submission to last job: 705s 11.75m 0.20h 0.01d # Make .jpg for tree and install in htdocs/images/phylo/... don't forget # to request a push of that file. The treeImage setting in trackDb.ra # is phylo/danRer4_7way.jpg (relative to htdocs/images). ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 cat << '_EOF_' > species7.nh ((zebrafish,(Fugu,Tetraodon)),(X. tropicalis,(opossum,(mouse,human)))) '_EOF_' /cluster/bin/phast/draw_tree species7.nh > species7way.ps # ask Bob to resize image for Browser track description page and convert # to JPEG and rename as danRer4_7way.jpg # Build maf annotation and load dataabase ssh kolossus mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-28/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! 
-e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end echo date > jobs.csh # do smaller jobs first: foreach f (`ls -1rS ../../maf/*.maf`) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /cluster/data/danRer4/danRer4.2bit ../maf/`basename $f` \ >> jobs.csh echo "echo $f" >> jobs.csh end echo date >> jobs.csh csh -efx jobs.csh >&! jobs.log & tail -f jobs.log # Took 27 minutes to run. # Load anno/maf ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf mkdir -p /gbdb/danRer4/multiz7way/anno/maf ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf/*.maf \ /gbdb/danRer4/multiz7way/anno/maf # delete old files from extFile table hgsql -e 'delete from extFile where path like "%multiz7way/anno/maf%";' \ danRer4 cat > loadMaf.csh << 'EOF' date nice hgLoadMaf -pathPrefix=/gbdb/danRer4/multiz7way/anno/maf danRer4 multiz7way date 'EOF' # << emacs csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log # Took about 1 minute. # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kkr7u00 cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf cat *.maf \ | nice hgLoadMafSummary danRer4 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz7waySummary stdin # Created 820403 summary blocks from 4245668 components and # 2120803 mafs from stdin ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz7waySummary.sql time nice hgLoadSqlTab danRer4 multiz7waySummary \ /tmp/multiz7waySummary.sql multiz7waySummary.tab # 0.000u 0.000s 2:05.26 0.0% 0+0k 0+0io 209pf+0w rm *.tab /tmp/multiz7waySummary.sql # zip mafs: ssh kkstore04 cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf cat > zipMafs.csh << 'EOF' date foreach f (chr*.maf) set c = $f:r echo $c nice gzip -c $f > $c.maf.gz end date 'EOF' time csh -efx zipMafs.csh >&! zip.log # 219.706u 1.939s 3:41.75 99.9% 0+0k 0+0io 0pf+0w rm *.maf # add Frames table: mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/frames cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/frames # The following is adapted from MarkD's Makefile used for mm7... # and used in makeRn4.doc. #------------------------------------------------------------------------ # get the genes for all genomes # using mrna for danRer4 # using knownGene for mm8 hg18 # using mgcGenes for xenTro2 # using ensGene for fr1 # no genes for monDom4 and tetNig1 # targetDb = danRer4 # queryDbs = mm8 hg18 xenTro2 fr1 (to build frames for) # genePreds; (must keep only the first 10 columns for knownGene) # mRNAs with CDS. 
single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: danRer4 mkdir genes foreach queryDb (danRer4) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ ${queryDb} > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \ stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using knownGene for mm8 hg18 # using mgcGenes for xenTro2 # using enesGene for fr1 foreach queryDb (mm8 hg18 xenTro2 fr1) if ($queryDb == "xenTro2") then set geneTbl = mgcGenes else if ($queryDb == "fr1") then set geneTbl = ensGene else set geneTbl = knownGene endif hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end #------------------------------------------------------------------------ # create frames set clusterDir = /cluster/bluearc/danRer4/multiz7wayFrames set multizDir = /cluster/data/danRer4/bed/multiz7way.2006-05-28 set mafDir = $multizDir/maf set geneDir = $multizDir/frames/genes set clusterMafDir = ${clusterDir}/maf set clusterGeneDir = ${clusterDir}/genes set clusterFramesDir = ${clusterDir}/mafFrames.kki # copy mafs to cluster storage mkdir $clusterDir ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/" # copy genes to cluster storage ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/" # run cluster jobs set tmpExt = `mktemp temp.XXXXXX` set paraDir = $multizDir/frames/para.${tmpExt} mkdir mafFrames $paraDir rm -f $paraDir/jobList mkdir ${clusterFramesDir} foreach queryDb (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-28/species.lst`) mkdir ${clusterFramesDir}/${queryDb} foreach c (`awk '{print $1;}' /cluster/data/danRer4/chrom.sizes`) if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} danRer4 \ ${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/$c.maf.gz \ ${clusterFramesDir}/${queryDb}/$c.mafFrames \ >> $paraDir/jobList endif end end rm -f $tmpExt ssh -x kki "cd ${paraDir} && para make jobList && para time" # Completed: 140 of 140 jobs # CPU time in finished jobs: 255s 4.25m 0.07h 0.00d 0.000 y # IO & Wait Time: 360s 6.00m 0.10h 0.00d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 8s 0.13m 0.00h 0.00d # Submission to last job: 55s 0.92m 0.02h 0.00d # combine results from cluster foreach queryDb (`cat ../species.lst`) echo $queryDb ssh -x kolossus "cat ${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz" end #------------------------------------------------------------------------ # load the database hgLoadMafFrames danRer4 multiz7wayFrames mafFrames/*.mafFrames.gz #------------------------------------------------------------------------ # clean up rm -rf ${clusterDir} ### # rebuild frames to get bug fix, using 1-pass maf methodology # (2006-06-09 markd) 
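# (Note, not in the original log: the commands below replace the
# per-chromosome mkMafFrames.pl cluster run above with a single
# genePredToMafFrames pass over all the multiz mafs, pairing each assembly in
# species.lst with its genes/<db>.gp.gz file; monDom4 and tetNig1 are left
# out because, as noted above, they have no gene set to build frames from.)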
ssh kkstore04 cd /cluster/data/danRer4/bed/multiz7way/frames mv mafFrames/ mafFrames.old nice tcsh # easy way to get process niced (zcat ../maf/*.maf.gz | time genePredToMafFrames danRer4 stdin stdout danRer4 genes/danRer4.gp.gz fr1 genes/fr1.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz xenTro2 genes/xenTro2.gp.gz | gzip >multiz7way.mafFrames.gz)>&log& ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way/frames hgLoadMafFrames danRer4 multiz7wayFrames multiz7way.mafFrames.gz >&log& # end of multiz7way frames and load cd /cluster/data/danRer4/bed ln -s multiz7way.2006-05-28 /cluster/data/danRer4/bed/multiz7way # create and add the tree image for the description page # Make .jpg for tree and install in htdocs/images/phylo/... don't forget # to request a push of that file. The treeImage setting in trackDb.ra # is phylo/danRer4_7way.jpg (relative to htdocs/images). ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 cat << '_EOF_' > species7.nh ((zebrafish,(Fugu,Tetraodon)),(X. tropicalis,(opossum,(mouse,human)))) '_EOF_' /cluster/bin/phast/draw_tree species7.nh > species7way.ps # ask Bob to resize image for Browser track description page and convert # to JPEG and rename as danRer4_7way.jpg ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/danRer4_7way.jpg \ /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg # change permissions for display if not already readable to all chmod +r /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg # check for all.joiner entry for 7-way - it is there already. # add html and trackDb.ra entry for danRer4: # track multiz7way # shortLabel Conservation # longLabel Vertebrate Multiz Alignment & Conservation # group compGeno # priority 104 # visibility pack # color 0, 10, 100 # altColor 0,90,10 # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons7way # pairwiseHeight 12 # spanList 1 # yLineOnOff Off # autoScale Off # windowingFunction mean # summary multiz7waySummary # frames multiz7wayFrames # irows on # speciesGroups vertebrate mammal # sGroup_vertebrate fr1 tetNig1 xenTro2 # sGroup_mammal monDom4 mm8 hg18 # treeImage phylo/danRer4_7way.jpg ########################################################################### # MAF DOWNLOADS FOR MULTIZ7WAY (DONE, 2006-05-29, hartera) # GZIPPED UPSTREAM FILES AND ADDED TO DOWNLOADS AND RE-MADE md5sum.txt # (DONE, 2006-06-02, hartera) ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 mkdir mafDownloads cd mafDownloads # upstream mafs cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits danRer4 refGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags danRer4 multiz7way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' time csh mafFrags.csh >&! 
mafFrags.log & tail -f mafFrags.log # 57.823u 105.238s 4:13.15 64.4% 0+0k 0+0io 2pf+0w # add maf downloads for annotated mafs ssh kkstore04 cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads cat > downloads.csh << 'EOF' date foreach f (../anno/maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt date 'EOF' # 446.734u 5.629s 7:38.09 98.7% 0+0k 0+0io 2pf+0w ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/danRer4/multiz7way mkdir $dir ln -s \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads/{*.gz,md5sum.txt} \ $dir cp /usr/local/apache/htdocs/goldenPath/danRer3/multiz5way/README.txt $dir # edit README.txt # gzip the upstream maf downloads and remake md5sum.txt # (2006-06-02, hartera) ssh kkstore04 cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads foreach f (upstream*.maf) nice gzip -c $f > $f.maf.gz end rm md5sum.txt md5sum *.gz > md5sum.txt ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/danRer4/multiz7way rm $dir/md5sum.txt ln -s \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads/{upstream*.gz,md5sum.txt} $dir ########################################################################### # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 7-WAY ALIGNMENT # (DONE, 2006-05-17 - 2006-05-24, hartera) # REMAKE CONSERVATION TRACK USING MULTIZ 7-WAY INCLUDING DANRER4 RANDOM CHROMS # FOR MM8 ALIGNMENTS (DONE, 2006-05-29, hartera) ssh kkstore04 # Need unzipped maf files for this. cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf foreach f (*.maf.gz) echo $f gunzip -c $f > $f:r end mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons # create a starting-tree.mod based on chr14 (92 Mb) # chr14 is the largest chrom apart from chrNA_random /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr14.maf \ --refseq ../../../14/chr14.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "`cat ../tree-commas.nh`" \ --out-root starting-tree # took less than a minute rm s1.*ss # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.305239 0.194225 0.194292 0.306244 # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.389 is the GC content. This is used in the -gc argument below. # If you do *not* have a global tree model and you do not know your # GC content, you can get it directly from the MAFs with a command # like: /cluster/bin/phast/$MACHTYPE/msa_view \ --aggregate danRer4,tetNig1,fr1,xenTro2,monDom4,mm8,hg18 -i MAF \ -S /cluster/data/danRer4/bed/multiz7way/maf/chr*.maf > maf_summary.txt # This gives a GC content of 0.426 so use this as it is from mafs for # the whole genome. 
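# (For reference: the 0.426 figure is the value passed as --gc to the
# phastCons parameter-estimation jobs below; if the alignments change it can
# be recomputed from the aggregate base frequencies as above, by summing the
# C and G fractions.)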
# break up the genome-wide MAFs into pieces on the san filesystem ssh pk set WINDOWS=/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/ss mkdir -p $WINDOWS cd $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf set WINDOWS=/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c set N = `echo $c | sed -e 's/chr//'` /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/data/danRer4/$N/$c.fa \ -o SS -w 10000000,0 -I 1000 -B 5000 -r $c/$c echo "Done" >> $c.done 'EOF' # << emacs chmod +x doSplit.csh rm -f jobList foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo "doSplit.csh chr${c} {check out line+ $WINDOWS/chr$c.done}" >> jobList end para create jobList para push, check etc. para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 831s 13.86m 0.23h 0.01d 0.000 y # IO & Wait Time: 634s 10.56m 0.18h 0.01d 0.000 y # Average job time: 52s 0.87m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 118s 1.97m 0.03h 0.00d # Submission to last job: 118s 1.97m 0.03h 0.00d # Create a random list of 50 1 mb regions (do not use chrNA and chrUn) ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons mkdir -p $dir cd $dir # now set up cluster job to estimate model parameters. Parameters # will be estimated separately for each alignment fragment then # will be combined across fragments. Tuning this loop should come # back to here to recalculate. Tuning target-coverage and expected-length. # Create little script that calls phastCons with right arguments cat > makeTree.csh << 'EOF' #!/bin/csh -fe set C = $1:h set treeRun = $2 set cov = $3 set len = $4 set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons mkdir -p $dir/$treeRun/log/${C} $dir/$treeRun/tree/${C} /cluster/bin/phast/x86_64/phastCons $dir/ss/$1 \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons/starting-tree.mod \ --gc 0.426 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length $len --target-coverage $cov \ --quiet --log $dir/$treeRun/log/$1 --estimate-trees $dir/$treeRun/tree/$1 'EOF' # << emacs chmod a+x makeTree.csh # Make sure that the correct GC content is substituted in here. Notice # the target coverage of 0.17. Here we are going to aim # for 65% coverage of coding regions by conserved elements. # Create gensub file # need to add cov and len parameters cat > template << '_EOF_' #LOOP makeTree.csh $(path1) $(path2) #ENDLOOP '_EOF_' # happy emacs # Make cluster job and run it to try out a few parameters close # to those used for danRer3 and danRer2 phastCons runs. echo "treeRun1 0.17 12" > tree.lst echo "treeRun2 0.32 18" >> tree.lst echo "treeRun3 0.32 20" >> tree.lst echo "treeRun4 0.35 18" >> tree.lst gensub2 randomSs.list tree.lst template jobList para create jobList para try,check,push,check etc. # para time # Completed: 200 of 200 jobs # CPU time in finished jobs: 68652s 1144.20m 19.07h 0.79d 0.002 y # IO & Wait Time: 2521s 42.02m 0.70h 0.03d 0.000 y # Average job time: 356s 5.93m 0.10h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 629s 10.48m 0.17h 0.01d # Submission to last job: 2356s 39.27m 0.65h 0.03d # Now combine parameter estimates. 
We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons foreach d ($dir/treeRun*) cd $d ls tree/chr*/*.cons.mod > cons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls tree/chr*/*.noncons.mod > noncons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt end # measuring entropy # consEntropy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument # target entropy should be L_min*H=9.8 bits, (between 9.5 to 10.5 is ok) # the expected length that produces this entropy is the one # to use for phastCons. # foreach treeRun, set the appropriate coverage and length # file: treeRunN cov len # use awk to split up cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons cp tree.lst entropy.csh perl -pi.bak -e 's/^(treeRun[0-9]+)\s*([0-9\.]+)\s*([0-9]+)/echo \"Coverage = $2 Length = $3\"\ncd $1\n\/cluster\/bin\/phast\/x86_64\/consEntropy $2 $3 ave.cons.mod ave.noncons.mod\ncd \.\./' entropy.csh chmod +x entropy.csh entropy.csh >& entropy.out # entropy.out #Coverage = 0.17 Length = 12 #Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068 #Relative entropy: H=0.857449 bits/site #Expected min. length: L_min=12.298748 sites #Expected max. length: L_max=8.165741 sites #Phylogenetic information threshold: PIT=L_min*H=10.545544 bits #### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED #### # These are the same as for danRer2 and give the targeted L_min*H value. # This is from treeRun2. #Coverage = 0.32 Length = 18 #Transition parameters:gamma=0.320000,omega=18.000000, mu=0.055556, nu=0.026144 #Relative entropy: H=0.818130 bits/site #Expected min. length: L_min=12.025818 sites #Expected max. length: L_max=9.281106 sites #Phylogenetic information threshold: PIT=L_min*H=9.838688 bits ### #Coverage = 0.32 Length = 20 #Transition parameters:gamma=0.320000,omega=20.000000, mu=0.050000, nu=0.023529 #Relative entropy: H=0.795926 bits/site #Expected min. length: L_min=12.724131 sites #Expected max. length: L_max=9.927736 sites #Phylogenetic information threshold: PIT=L_min*H=10.127467 bits #Coverage = 0.35 Length = 18 #Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915 #Relative entropy: H=0.827604 bits/site #Expected min. length: L_min=11.542637 sites #Expected max. length: L_max=9.061627 sites #Phylogenetic information threshold: PIT=L_min*H=9.552732 bits # need to iterate and get the right coverage and parameters # try running phastCons below with parameters used above and check the # coverage of coding regions by the most conserved elements # Create cluster dir to do main phastCons run ssh pk mkdir -p \ /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun cp -p ../treeRun2/ave.*.mod . 
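    # (Optional sanity check, not in the original log: each phyloBoot-averaged
    # model copied here should still carry a BACKGROUND line and a TREE line with
    # the 7-species tree; a quick look before launching the genome-wide run:)
    egrep 'BACKGROUND|TREE' ave.cons.mod ave.noncons.mod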
cp -p ../treeRun2/ave.*.mod \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ # Use the expected length and target coverage determined above and # the corresponding average conserved and nonconserved models cat > doPhast.csh << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 18 --target-coverage 0.32 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.*.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP doPhast.csh $(root1) $(file1) #ENDLOOP '_EOF_' # happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. para time # Completed: 191 of 191 jobs # CPU time in finished jobs: 4660s 77.67m 1.29h 0.05d 0.000 y # IO & Wait Time: 2927s 48.78m 0.81h 0.03d 0.000 y # Average job time: 40s 0.66m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 83s 1.38m 0.02h 0.00d # Submission to last job: 2246s 37.43m 0.62h 0.03d # combine predictions and transform scores to be in 0-1000 interval ssh kkstore04 cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. 
Warning: the sort column # will depend on how deep you are in the dir find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons # Figure out how much is actually covered by the mostConserved data as so: cd /cluster/data/danRer4 faSize */chr*.fa # 1774660131 bases (175779328 N's 1598880803 real 816338509 upper # 782542294 lower) in 28 sequences in 28 files # Total size: mean 63380719.0 sd 33877121.9 min 16596 (chrM) # max 208014280 (chrNA_random) median 59765243 # The non-N size is 1598880803 bases cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons awk '{sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/1598880803\n",100.0*sum/1598880803,sum}' \ mostConserved.bed -target-coverage 0.32: % 3.18 = 100.0*50871950/1598880803 length=18 # want to aim for 65% coverage of coding regions ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way/phastCons # get an or of refGene and mgcGenes CDS regions featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed # 11770580 bases of 1626093931 (0.724%) in intersection featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 3.128%, both 0.463%, # cover 63.94%, enrich 20.44x # for danRer3: featureBits danRer3 refSeqOrMgcCdsDanRer3.bed \ /cluster/data/danRer3/bed/multiz5way/mostConserved.bed -enrichment # refSeqOrMgcCdsDanRer3.bed 0.714%, # /cluster/data/danRer3/bed/multiz5way/mostConserved.bed 2.998%, # both 0.474%, cover 66.40%, enrich 22.14x # so use this result for -target-coverage=0.32 -expected-lengths=18 # with L_min*H entropy (PIT) value of 9.84 (aiming for around 9.8) and # 63.9% coverage of coding regions with most conserved elements # (aiming for about 65%) # Load most conserved track into database ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons hgsql -e 'drop table phastConsElements;' danRer4 hgLoadBed danRer4 phastConsElements mostConserved.bed # Loaded 676058 elements of size 5 featureBits danRer4 mgcGenes:cds phastConsElements -enrichment # mgcGenes:cds 0.560%, phastConsElements 3.128%, both 0.366%, # cover 65.36%, enrich 20.89x # Create merged posterier probability file and wiggle track data files # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode ssh kkstore04 cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode stdin phastCons7way.wig phastCons7way.wib # takes a few minutes ls -l phastCons* # -rw-rw-r-- 1 hartera protein 255524779 May 29 19:49 phastCons7way.wib # -rw-rw-r-- 1 hartera protein 61525690 May 29 19:49 phastCons7way.wig cp -p phastCons7way.wi? /cluster/data/danRer4/bed/multiz7way/phastCons # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons mkdir -p /gbdb/danRer4/wib rm /gbdb/danRer4/wib/phastCons7way.wib ln -s `pwd`/phastCons7way.wib /gbdb/danRer4/wib/phastCons7way.wib # use this if need to reload table hgsql -e 'drop table phastCons7way;' danRer4 # load table hgLoadWiggle danRer4 phastCons7way phastCons7way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons bash time hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=danRer4 phastCons7way > histogram.data 2>&1 # real 0m30.234s # user 0m23.721s # sys 0m3.234s # create plot of histogram: cat << '_EOF_' > histo.gp set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Zebrafish danRer4 Histogram phastCons7 track" set xlabel " phastCons7 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # happy emacs gnuplot histo.gp > histo.png display histo.png & # add line: wiggle phastCons7way to trackDb.ra for multiz7way to display the # wiggle for the conservation track. # check all.joiner for entries for phastCons7way and phastConsElements7way -ok # copy over html for multiz and edit. ########################################################################### # PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE, 2006-05-30, hartera) # prepare compressed copy of ascii data values for downloads ssh kolossus cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 mkdir phastConsDownloads cd phastConsDownloads cat > downloads.csh << 'EOF' date cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun/ppRaw foreach chr (`awk '{print $1}' /cluster/data/danRer4/chrom.sizes`) echo $chr cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \ | nice gzip -c \ > /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads/$chr.gz end date 'EOF' # << emacs csh -efx downloads.csh >&! downloads.log & tail -f downloads.log # Took ~5 minutes. md5sum *.gz > md5sum.txt ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads set dir = /usr/local/apache/htdocs/goldenPath/danRer4/phastCons7wayScores mkdir $dir ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads/{*.gz,md5sum.txt} $dir # copy over and edit README.txt cd $dir cp \ /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores/README.txt . # Clean up after phastCons run. ssh kkstore04 rm /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons/*.tab rm -r /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons ########################################################################### # CREATED RECIPROCAL BEST NETS AND MAF NETS FOR ALL SPECIES WITH PAIRWISE # ALIGNMENTS USED FOR MULTIZ MULTIPLE ALIGNMENT # (DONE, 2006-05-12 - 2006-05-15 , hartera) # for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18. ssh kolossus mkdir /cluster/data/danRer4/bed/rBestRunForMultiz/ cd /cluster/data/danRer4/bed/rBestRunForMultiz # need to re-run chainNet and keep first output (target-referenced, # target-centric nets) and second output that we usually /dev/null # (query-referenced, target-centric nets). 
cat > rBestNet.csh << 'EOF' #!/bin/csh -ef foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) echo "Creating Reciprocal Best Net for $s..." set binDir=/cluster/home/hartera/bin/i386 set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain cd $dir # Run chainNet again, this time keeping the second output: chainPreNet danRer4.$s.all.chain.gz /cluster/data/danRer4/chrom.sizes \ /cluster/data/$s/chrom.sizes stdout \ | $binDir/chainNet stdin /cluster/data/danRer4/chrom.sizes \ /cluster/data/$s/chrom.sizes /dev/null stdout | \ netSyntenic stdin $dir/$s.danRer4_ref.net # get the other species chains from the other species-referenced # (but danRer4-centric) net: chainSwap danRer4.$s.all.chain.gz $s.danRer4.all.chain netChainSubset -verbose=0 $s.danRer4_ref.net \ $s.danRer4.all.chain stdout \ | chainSort stdin $s.danRer4_ref.subset.chain # Net those (sorted) danRer4 chains, and keep both outputs, to get # reciprocal best nets referenced to both species: chainPreNet $s.danRer4_ref.subset.chain \ /cluster/data/$s/chrom.sizes /cluster/data/danRer4/chrom.sizes stdout \ | $binDir/chainNet stdin /cluster/data/$s/chrom.sizes \ /cluster/data/danRer4/chrom.sizes tmp1 tmp2 netSyntenic tmp1 $s.danRer4.rbest.net netSyntenic tmp2 danRer4.$s.rbest.net rm tmp1 tmp2 nice gzip *.rbest.net end 'EOF' chmod +x rBestNet.csh nice rBestNet.csh >& rBestNet.log & # Took about 11 minutes to complete. # Then make axtNet and mafNet cat > makeMafRBestNet.csh << 'EOF' #!/bin/csh -ef foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) echo "Creating mafs for $s ..." set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain set seqDir=/san/sanvol1/scratch cd $dir # extract reciprocal best chains from the zebrafish-other species rbest.net echo "Get reciprocal best chains for best zebrafish-$s" netChainSubset danRer4.$s.rbest.net.gz danRer4.$s.all.chain.gz \ danRer4.$s.rbest.chain # need to make sure this is sorted and assign unique chain IDs chainSort danRer4.$s.rbest.chain stdout | chainMergeSort stdin \ > danRer4.$s.rbest.newids.chain # need to re-net with new ids chainNet danRer4.$s.rbest.newids.chain /cluster/data/danRer4/chrom.sizes \ /cluster/data/$s/chrom.sizes danRer4.$s.rbest.newids.net /dev/null # split reciprocal best chains and net chainSplit rBestChain danRer4.$s.rbest.newids.chain netSplit danRer4.$s.rbest.newids.net rBestNet mkdir ../axtRBestNet # make axtNet for reciprocal best echo "Making axtRBestNet for $s ..." foreach f (rBestNet/*.net) netToAxt $f rBestChain/$f:t:r.chain \ $seqDir/danRer4/danRer4.2bit $seqDir/$s/$s.2bit stdout \ | axtSort stdin stdout \ | gzip -c > ../axtRBestNet/$f:t:r.danRer4.$s.net.axt.gz end # make mafNet for reciprocal best cd .. mkdir mafRBestNet echo "Making mafRBestNet for $s ..." foreach f (axtRBestNet/*.danRer4.$s.net.axt.gz) axtToMaf -tPrefix=danRer4. -qPrefix=$s. $f \ /cluster/data/danRer4/chrom.sizes /cluster/data/$s/chrom.sizes stdout \ | gzip -c > mafRBestNet/$f:t:r:r:r:r:r.maf.gz end end 'EOF' chmod +x makeMafRBestNet.csh nice makeMafRBestNet.csh >& mafRBestNet.log & # Took about an hour. # NOTE: Must use chainSort and chainMergeSort to reassign unique IDs # to the chains extracted from the rbest.net and then re-net the chains # with the new IDs otherwise netToAxt crashes due to duplicate chain IDs. # Now do the multiple alignment using reciprocal best mafNets as input # for multiz. # Load up nets and chains from rBestChain and rBestNet ssh hgwdev cd /cluster/data/danRer4/bed/rBestRunForMultiz # Nets from Reciprocal Best have no type field or repeat/gap stats so need # to add these.
cat > loadRBest.csh << 'EOF' #!/bin/csh -ef foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain if ($s == "tetNig1") then set g = TetNig1 else if ($s == "fr1") then set g = Fr1 else if ($s == "xenTro2") then set g = XenTro2 else if ($s == "monDom4") then set g = MonDom4 else if ($s == "mm8") then set g = Mm8 else if ($s == "hg18") then set g = Hg18 endif # load chains echo "Loading chains for $s ..." cd $dir/rBestChain foreach f (*.chain) set c = $f:r hgLoadChain danRer4 ${c}_chainRBest${g} $f end # load nets cd $dir echo "Loading nets for $s ..." # add type field netSyntenic danRer4.${s}.rbest.newids.net noClassRBest.net # add gap/repeat stats to net file using database tables netClass -verbose=0 -noAr noClassRBest.net danRer4 $s \ danRer4.${s}.rbest.withClass.net netFilter -minGap=10 danRer4.${s}.rbest.withClass.net \ | hgLoadNet -verbose=0 danRer4 netRBest${g} stdin end 'EOF' << emacs chmod +x loadRBest.csh nohup nice loadRBest.csh >& loadRBest.log & ########################################################################### # MULTIZ7WAY ALIGNMENTS FOR CONSERVATION TRACK - USING RECIPROCAL BEST NETS # (DONE, 2006-05-18 - 2006-05-24, hartera) # for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18. ssh kkstore04 mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18 cd /cluster/data/danRer4/bed/multiz7way.2006-05-18 # copy MAFs to a cluster-friendly server # use bluearc as the san is down mkdir /cluster/bluearc/danRer4/mafRBestNet foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) echo $s rsync -av /cluster/data/danRer4/bed/blastz.$s/mafRBestNet/* \ /cluster/bluearc/danRer4/mafRBestNet/$s/ end # prune the hg17 17way tree to just these 7 and update db names: /cluster/bin/phast/tree_doctor \ --prune-all-but=mouse_mm8,human_hg18,monodelphis_monDom4,xenopus_xenTro1,tetraodon_tetNig1,fugu_fr1,zebrafish_danRer3 \ --rename="xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \ /cluster/data/hg18/bed/multiz17way/17way.nh > 7way.nh # carefully edit so that danRer4 is first. copy first to new file cp 7way.nh 7way_zfishFirst.nh # DO THIS LATER AND CREATE FROM TREE WITHOUT DISTANCES /cluster/bin/phast/draw_tree 7way_zfishFirst.nh > 7way.ps # also made the ps file for the 7way.nh and compared to make sure # that the tree with zebrafish at the top looks correct. 
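    # (For orientation only, a sketch: after the pruning and the manual edit the
    # zebrafish-first tree should have the topology below, with danRer4 as the
    # first leaf; the branch lengths come from the pruned 17-way tree and are
    # not reproduced here.)
    # ((danRer4,(fr1,tetNig1)),(xenTro2,(monDom4,(mm8,hg18))))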
/cluster/bin/phast/all_dists 7way_zfishFirst.nh > 7way.distances grep danRer4 7way.distances | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt cat distances.txt # 1.4749 tetraodon_tetNig1 # 1.5154 fugu_fr1 # 1.7480 human_hg18 # 1.7782 monodelphis_monDom4 # 1.8771 xenopus_xenTro2 # 2.1058 mouse_mm8 # the order in the browser display will be by tree topology, # not by distance, so they will be: # danRer4 # 1.5154 fugu_fr1 # 1.4749 tetraodon_tetNig1 # 1.8771 xenopus_xenTro2 # 1.7782 monodelphis_monDom4 # 2.1058 mouse_mm8 # 1.7480 human_hg18 # create species list and stripped down tree for autoMZ sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' \ 7way_zfishFirst.nh > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst cp tree-commas.nh 7way.nh ssh pk cd /cluster/data/danRer4/bed/multiz7way.2006-05-18 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = danRer4 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /cluster/bluearc/$db/mafRBestNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/danRer4/bed/multiz7way.2006-05-18/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs awk '{print $1}' /cluster/data/danRer4/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList para try, check, push, check etc. ... # Took less than 10 minutes to run # Make .jpg for tree and install in htdocs/images/phylo/... don't forget # to request a push of that file. The treeImage setting in trackDb.ra # is phylo/danRer4_7way.jpg (relative to htdocs/images). # ssh hgwdev # DO LATER # cd /cluster/data/danRer4/bed/multiz7way.2006-05-04 # pstopnm -stdout 7way.ps | pnmtojpeg > danRer4_7way.jpg # ask Bob to resize image for Browser track description page. # Build maf annotation and load database ssh kolossus mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-18/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! -e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end echo date > jobs.csh # do smaller jobs first: foreach f (`ls -1rS ../../maf/*.maf`) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /cluster/data/danRer4/danRer4.2bit ../maf/`basename $f` \ >> jobs.csh echo "echo $f" >> jobs.csh end echo date >> jobs.csh csh -efx jobs.csh >&! 
jobs.log & tail -f jobs.log # Load anno/maf ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf mkdir -p /gbdb/danRer4/multiz7wayRBest/anno/maf ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf/*.maf \ /gbdb/danRer4/multiz7wayRBest/anno/maf # Reload as not working correctly. hgsql -e 'drop table multiz7wayRBest;' danRer4 hgsql -e 'delete from extFile where path like "%multiz7wayRBest%";' \ danRer4 cat > loadMaf.csh << 'EOF' date nice hgLoadMaf -pathPrefix=/gbdb/danRer4/multiz7wayRBest/anno/maf danRer4 multiz7wayRBest date 'EOF' # << emacs csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kkr7u00 cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf cat *.maf \ | nice hgLoadMafSummary danRer4 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz7wayRBestSummary stdin # Created 526386 summary blocks from 1972659 components and 1105457 mafs # from stdin ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf sed -e 's/mafSummary/multiz7wayRBestSummary/' \ ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz7wayRBestSummary.sql time nice hgLoadSqlTab danRer4 multiz7wayRBestSummary \ /tmp/multiz7wayRBestSummary.sql multiz7wayRBestSummary.tab # 0.000u 0.000s 0:07.56 0.0% 0+0k 0+0io 4pf+0w rm *.tab /tmp/multiz7wayRBestSummary.sql # ln -s multiz7way.2006-05-18 /cluster/data/danRer4/bed/multiz7way # ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-18/danRer4_7way.jpg \ # /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg # change permissions for display if not already readable to all # chmod +r /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg # check for all.joiner entry for 7-way - it is there already. # add trackDb.ra entry for danRer4: ########################################################################### # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 7-WAY ALIGNMENT USING MAFS # FROM RECIPROCAL BEST NET (DONE, 2006-05-19 - 2005-05-24, hartera) ssh kkstore04 mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons # create a starting-tree.mod based on chr14 (92 Mb) # chr14 is the largest chrom apart from chrNA_random /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr14.maf \ --refseq ../../../14/chr14.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "`cat ../tree-commas.nh`" \ --out-root starting-tree # took less than a minute rm s1.*ss # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.309665 0.189697 0.189720 0.310918 # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.379 is the GC content. This is used in the -gc argument below. # If you do *not* have a global tree model and you do not know your # GC content, you can get it directly from the MAFs with a command # like: /cluster/bin/phast/$MACHTYPE/msa_view \ --aggregate danRer4,tetNig1,fr1,xenTro2,monDom4,mm8,hg18 -i MAF \ -S /cluster/data/danRer4/bed/multiz7way/maf/chr*.maf > maf_summary.txt # This gives a GC content of 0.426 so use this as it is from mafs for # the whole genome. 
# break up the genome-wide MAFs into pieces on the san filesystem ssh pk # should use a directory on the san but it is down and para create is # not working on kk. set WINDOWS=/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/ss mkdir -p $WINDOWS cd $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/danRer4/bed/multiz7way.2006-05-18/maf set WINDOWS=/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c set N = `echo $c | sed -e 's/chr//'` /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/data/danRer4/$N/$c.fa \ -o SS -w 10000000,0 -I 1000 -B 5000 -r $c/$c echo "Done" >> $c.done 'EOF' # << emacs chmod +x doSplit.csh rm -f jobList foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo "doSplit.csh chr${c} {check out line+ $WINDOWS/chr$c.done}" >> jobList end para create jobList para push, check etc. para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 847s 14.12m 0.24h 0.01d 0.000 y # IO & Wait Time: 9741s 162.35m 2.71h 0.11d 0.000 y # Average job time: 378s 6.30m 0.11h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 539s 8.98m 0.15h 0.01d # Submission to last job: 581s 9.68m 0.16h 0.01d # Create a random list of 50 1 mb regions (do not use chrNA and chrUn) ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons mkdir -p $dir cd $dir # now set up cluster job to estimate model parameters. Parameters # will be estimated separately for each alignment fragment then # will be combined across fragments. Tuning this loop should come # back to here to recalculate. Tuning target-coverage and expected-length. # Create little script that calls phastCons with right arguments cat > makeTree.csh << 'EOF' #!/bin/csh -fe set C = $1:h set treeRun = $2 set cov = $3 set len = $4 set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons mkdir -p $dir/$treeRun/log/${C} $dir/$treeRun/tree/${C} /cluster/bin/phast/x86_64/phastCons $dir/ss/$1 \ /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons/starting-tree.mod \ --gc 0.426 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length $len --target-coverage $cov \ --quiet --log $dir/$treeRun/log/$1 --estimate-trees $dir/$treeRun/tree/$1 'EOF' # << emacs chmod a+x makeTree.csh # Make sure that the correct GC content is substituted in here. Notice # the target coverage of 0.17. Here we are going to aim # for 65% coverage of coding regions by conserved elements. # Create gensub file # need to add cov and len parameters cat > template << '_EOF_' #LOOP makeTree.csh $(path1) $(path2) #ENDLOOP '_EOF_' # happy emacs # Make cluster job and run it echo "treeRun1 0.17 12" > tree.lst echo "treeRun2 0.32 18" >> tree.lst echo "treeRun3 0.32 20" >> tree.lst echo "treeRun4 0.35 18" >> tree.lst gensub2 randomSs.list tree.lst template jobList para create jobList para try,check,push,check etc. # para time # Completed: 200 of 200 jobs # CPU time in finished jobs: 45500s 758.33m 12.64h 0.53d 0.001 y # IO & Wait Time: 31478s 524.64m 8.74h 0.36d 0.001 y # Average job time: 385s 6.41m 0.11h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 622s 10.37m 0.17h 0.01d # Submission to last job: 821s 13.68m 0.23h 0.01d # try again, mkdir test2. 
if aim for about 5% coverage and for chr1 on # hg18, netDanRer4 covers about 31% of bases then 0.05/0.30 = 0.156 # want length of about 20 bp to influence the model towards detecting # shorter conserved regions such as TFBSs. cd test2 echo "treeRun5 0.156 20" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test3 echo "treeRun6 0.156 15" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test4 # increase coverage and compensate a bit by lowering the expected length echo "treeRun7 0.25 8" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test5 echo "treeRun8 0.35 12" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test6 echo "treeRun9 0.5 20" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test7 echo "treeRun10 0.5 24" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test8 echo "treeRun11 0.45 22" > tree.lst echo "treeRun12 0.5 26" >> tree.lst echo "treeRun13 0.5 28" >> tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test9 echo "treeRun14 0.45 24" > tree.lst echo "treeRun15 0.45 20" >> tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test10 echo "treeRun16 0.40 24" > tree.lst echo "treeRun17 0.40 20" >> tree.lst echo "treeRun18 0.42 20" >> tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test11 echo "treeRun19 0.38 24" > tree.lst echo "treeRun20 0.38 22" >> tree.lst echo "treeRun21 0.38 20" >> tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList # Now combine parameter estimates. We can average the .mod files # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons foreach d ($dir/treeRun*) cd $d ls tree/chr*/*.cons.mod > cons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls tree/chr*/*.noncons.mod > noncons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt end # measuring entropy # consEntropy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument # target entropy should be L_min*H=9.8 bits, (between 9.5 to 10.5 is ok) # the expected length that produces this entropy is the one # to use for phastCons. # foreach treeRun, set the appropriate coverage and length # file: treeRunN cov len # use awk to split up cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons cp tree.lst entropy.csh perl -pi.bak -e 's/^(treeRun[0-9]+)\s*([0-9\.]+)\s*([0-9]+)/echo \"Coverage = $2 Length = $3\"\ncd $1\n\/cluster\/bin\/phast\/x86_64\/consEntropy $2 $3 ave.cons.mod ave.noncons.mod\ncd \.\./' entropy.csh chmod +x entropy.csh entropy.csh >& entropy.out # entropy.out #Coverage = 0.17 Length = 12 #Transition parameters:gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 #Relative entropy: H=0.782279 bits/site #Expected min. length: L_min=13.655129 sites #Expected max. length: L_max=8.801144 sites #Phylogenetic information threshold: PIT=L_min*H=10.682123 bits #Coverage = 0.32 Length = 18 #Transition parameters:gamma=0.320000, omega=18.000000, mu=0.055556, nu=0.026144 #Relative entropy: H=0.757117 bits/site #Expected min. 
length: L_min=13.055080 sites #Expected max. length: L_max=9.912578 sites #Phylogenetic information threshold: PIT=L_min*H=9.884225 bits #Coverage = 0.32 Length = 20 #Transition parameters:gamma=0.320000, omega=20.000000, mu=0.050000, nu=0.023529 #Relative entropy: H=0.736191 bits/site #Expected min. length: L_min=13.815340 sites #Expected max. length: L_max=10.615242 sites #Phylogenetic information threshold: PIT=L_min*H=10.170732 bits #Coverage = 0.35 Length = 18 #Transition parameters:gamma=0.350000, omega=18.000000, mu=0.055556, nu=0.029915 #Relative entropy: H=0.768872 bits/site #Expected min. length: L_min=12.471015 sites #Expected max. length: L_max=9.642561 sites #Phylogenetic information threshold: PIT=L_min*H=9.588610 bits #Coverage = 0.156 Length = 20 #Transition parameters:gamma=0.156000, omega=20.000000, mu=0.050000, nu=0.009242 #Relative entropy: H=0.676147 bits/site #Expected min. length: L_min=17.857722 sites #Expected max. length: L_max=12.694666 sites #Phylogenetic information threshold: PIT=L_min*H=12.074436 bits #Coverage = 0.156 Length = 15 #Transition parameters:gamma=0.156000, omega=15.000000, mu=0.066667, nu=0.012322 #Relative entropy: H=0.726430 bits/site #Expected min. length: L_min=15.713919 sites #Transition parameters: gamma=0.250000, omega=8.000000, mu=0.125000, nu=0.041667 #Relative entropy: H=0.950194 bits/site #Expected min. length: L_min=8.951612 sites #Expected max. length: L_max=5.560228 sites #Phylogenetic information threshold: PIT=L_min*H=8.505767 bits #Coverage = 0.5 Length = 20 #Transition parameters:gamma=0.500000, omega=20.000000, mu=0.050000, nu=0.050000 #Relative entropy: H=0.817081 bits/site #Expected min. length: L_min=10.397809 sites #Expected max. length: L_max=9.006386 sites #Phylogenetic information threshold: PIT=L_min*H=8.495855 bits # Coverage = 0.5 Length = 24 #Transition parameters:gamma=0.500000, omega=24.000000, mu=0.041667, nu=0.041667 #Relative entropy: H=0.772807 bits/site #Expected min. length: L_min=11.706841 sites #Expected max. length: L_max=10.170845 sites #Phylogenetic information threshold: PIT=L_min*H=9.047124 bits # Coverage = 0.5 Length = 26 #Transition parameters:gamma=0.500000,omega=26.000000, mu=0.038462, nu=0.038462 #Relative entropy: H=0.755159 bits/site #Expected min. length: L_min=12.299010 sites #Expected max. length: L_max=10.697444 sites #Phylogenetic information threshold: PIT=L_min*H=9.287712 bits #Coverage = 0.5 Length = 28 #Transition parameters:gamma=0.500000,omega=28.000000, mu=0.035714, nu=0.035714 #Relative entropy: H=0.739661 bits/site #Expected min. length: L_min=12.856932 sites #Expected max. length: L_max=11.193931 sites #Phylogenetic information threshold: PIT=L_min*H=9.509775 bits ########USED THESE PARAMETERS################## #Coverage = 0.45 Length = 24 #Transition parameters:gamma=0.450000, omega=24.000000, mu=0.041667, nu=0.034091 #Relative entropy: H=0.749572 bits/site #Expected min. length: L_min=12.663020 sites #Expected max. length: L_max=10.634682 sites #Phylogenetic information threshold: PIT=L_min*H=9.491841 bits #Coverage = 0.40 Length = 24 #Transition parameters:gamma=0.400000, omega=24.000000, mu=0.041667, nu=0.027778 #Relative entropy: H=0.730161 bits/site #Expected min. length: L_min=13.607002 sites #Expected max. length: L_max=11.092981 sites #Phylogenetic information threshold: PIT=L_min*H=9.935307 bits #Coverage = 0.38 Length = 20 #Transition parameters:gamma=0.380000, omega=20.000000, mu=0.050000, nu=0.030645 #Relative entropy: H=0.758676 bits/site #Expected min. 
length: L_min=12.652818 sites #Expected max. length: L_max=10.063048 sites #Phylogenetic information threshold: PIT=L_min*H=9.599385 bits #Coverage = 0.38 Length = 24 #Transition parameters:gamma=0.380000, omega=24.000000, mu=0.041667, nu=0.025538 #Relative entropy: H=0.723105 bits/site #Expected min. length: L_min=13.987286 sites #Expected max. length: L_max=11.279443 sites #Phylogenetic information threshold: PIT=L_min*H=10.114270 bits # Create cluster dir to do main phastCons run ssh pk mkdir -p \ /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun cp -p ../treeRun1/ave.*.mod . cp -p ../treeRun1/ave.*.mod \ /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ # Use the expected length and target coverage determined above and # the corresponding average conserved and nonconserved models cat > doPhast.csh << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 18 --target-coverage 0.32 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.*.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP doPhast.csh $(root1) $(file1) #ENDLOOP '_EOF_' # happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. para time # Completed: 191 of 191 jobs # CPU time in finished jobs: 4421s 73.69m 1.23h 0.05d 0.000 y # IO & Wait Time: 121036s 2017.26m 33.62h 1.40d 0.004 y # Average job time: 657s 10.95m 0.18h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 726s 12.10m 0.20h 0.01d # Submission to last job: 874s 14.57m 0.24h 0.01d # combine predictions and transform scores to be in 0-1000 interval ssh kkstore04 cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. 
Warning: the sort column # will depend on how deep you are in the dir find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed cp -p mostConserved.bed \ /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons # Figure out how much is actually covered by the mostConserved data as so: cd /cluster/data/danRer4 faSize */chr*.fa # 1774660131 bases (175779328 N's 1598880803 real 816338509 upper # 782542294 lower) in 28 sequences in 28 files # Total size: mean 63380719.0 sd 33877121.9 min 16596 (chrM) # max 208014280 (chrNA_random) median 59765243 # The non-N size is 1598880803 bases cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons awk '{sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/1598880803\n",100.0*sum/1598880803,sum}' \ mostConserved.bed -target-coverage 0.17: % 1.51 = 100.0*24186350/1598880803 length=12 -target-coverage 0.156: % 1.44 = 100.0*22973222/1598880803 length=20 -target-coverage 0.156: % 1.32 = 100.0*21177329/1598880803 length=15 -target-coverage 0.25: % 1.32 = 100.0*21104503/1598880803 length=8 -target-coverage 0.32: % 1.88 = 100.0*30014509/1598880803 length=20 -target-coverage 0.5: % 3.00 = 100.0*47931076/1598880803 length=20 -target-coverage 0.5: % 2.95 = 100.0*47170018/1598880803 length=24 -target-coverage 0.5: % 2.24 = 100.0*35801661/1598880803 length=28 -target-coverage 0.45: % 2.50 = 100.0*39965003/1598880803 length=24 -target-coverage 0.40: % 2.22 = 100.0*35436744/1598880803 length=24 -target-coverage 0.38: % 2.12 = 100.0*33911465/1598880803 length=20 -target-coverage 0.38: % 2.13 = 100.0*33986115/1598880803 length=24 # want to aim for 65% coverage of coding regions ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons # get an or of refGene and mgcGenes CDS regions featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed # 11753378 bases of 1626093931 (0.723%) in intersection # featureBits danRer3 refGene:cds mgcGenes:cds -or \ # -bed=refSeqOrMgcCdsDanRer3.bed # 11633092 bases of 1630323462 (0.714%) in intersection featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.487%, both 0.332%, # cover 45.97%, enrich 30.90x # for length = 12 and cov = 0.17 PIT=10.7 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.846%, both 0.388%, # cover 53.74%, enrich 29.12x # for length = 20 and cov = 0.156 PIT=12.1 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.413%, both 0.333%, # cover 46.04%, enrich 32.59x # for length = 15 and cov = 0.156 PIT=11.4 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.302%, both 0.313%, # cover 43.36%, enrich 33.30x # decrease length and increase coverage to compensate # for length = 8 and cov = 0.25 PIT=8.5, PIT is too low # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.298%, both 0.304%, # cover 42.06%, enrich 32.40x # try length = 20 and cov = 0.32 PIT=10.8 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.846%, both 0.388%, # cover 53.74%, enrich 29.12x # length = 20 and cov = 0.5 PIT=8.5 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 2.948%, both 0.459%, # cover 63.53%, enrich 21.55x # coverage good, need to increase the PIT value so increase the length.
# length = 24 and cov = 0.5 PIT=9.05 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 2.901%, both 0.458%, # cover 63.35%, enrich 21.84x # length = 28 and cov = 0.5 PIT=9.5 # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.202%, both 0.431%, # cover 59.57%, enrich 27.06x # length = 24 and cov = 0.45 PIT=9.5 featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed # 11770580 bases of 1626093931 (0.724%) in intersection featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.458%, both 0.438%, # cover 60.57% enrich 24.64x # length = 20 and cov = 0.38 PIT=9.6 # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.085%, both 0.411%, # cover 56.76%, enrich 27.22x # length = 24 and cov = 0.38 PIT=10.1 # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.090%, both 0.413%, # cover 57.07%, enrich 27.30x # with L_min*H entropy (PIT) value of 9.84 (aiming for around 9.8) and # 53.3% coverage of coding regions with most conserved elements # (aiming for about 65%) # use consRun14 length = 24 cov=0.45 # Load most conserved track into database ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons hgLoadBed danRer4 phastConsRBestElements mostConserved.bed # Loaded elements of size 5 featureBits danRer4 mgcGenes:cds phastConsRBestElements -enrichment # mgcGenes:cds 0.560%, phastConsRBestElements 2.458%, both 0.349%, # cover 62.23%, enrich 25.32x # Create merged posterier probability file and wiggle track data files # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode ssh kkstore04 cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun14 find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode stdin phastConsRBest7way.wig phastConsRBest7way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # takes a few minutes ls -l phastCons* #-rw-rw-r-- 1 hartera protein 133817339 May 24 22:48 phastConsRBest7way.wib #-rw-rw-r-- 1 hartera protein 36947021 May 24 22:48 phastConsRBest7way.wig cp -p phastConsRBest7way.wi? \ /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons mkdir -p /gbdb/danRer4/wib ln -s `pwd`/phastConsRBest7way.wib /gbdb/danRer4/wib/phastConsRBest7way.wib # use this if need to reload table hgsql -e 'drop table phastConsRBest7way;' danRer4 # load table hgLoadWiggle danRer4 phastConsRBest7way phastConsRBest7way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-04/phastCons bash time hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=danRer4 phastCons7way > histogram.data 2>&1 # real 2m33.069s # user 1m58.310s # sys 0m16.170s # create plot of histogram: cat << '_EOF_' > histo.gp set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Zebrafish danRer4 Histogram phastCons7 track" set xlabel " phastCons7 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # happy emacs gnuplot histo.gp > histo.png display histo.png & # add line: wiggle phastCons7way to trackDb.ra for multiz7way to display the # wiggle for the conservation track. # check all.joiner for entries for phastCons7way and phastConsElements7way -ok # copy over html for multiz and edit. ########################################################################### # BACENDS TRACK (DONE, 2006-08-25, hartera) # Obtain these from the NCBI Trace archive ssh kolossus mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/sequences cd /cluster/data/danRer4/bed/bacEnds/ ln -s /san/sanvol1/scratch/danRer4/bacEnds/sequences . cd sequences # go to NCBI Trace Archive # http://www.ncbi.nlm.nih.gov/Traces/trace.cgi? cat << '_EOF_' > query_tracedb #!/usr/bin/perl -w use strict; use LWP::UserAgent; use HTTP::Request::Common 'POST'; $ENV{'LANG'}='C'; $ENV{'LC_ALL'}='C'; my $query = join ' ', @ARGV; $query = 'help' if $query =~ /^(\-h|\-\-help|\-)$/; $query = join('', ) if ! $query; my $req = POST 'http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=raw', [query=>$query]; my $res = LWP::UserAgent->new->request($req, sub { print $_[0] }); die "Couldn't connect to TRACE server\n" if ! $res->is_success; '_EOF_' chmod +x query_tracedb # ./query_tracedb usage # command to see the help screen with usage examples # count number of entries for zebrafish query_tracedb "query count species_code='DANIO RERIO' AND trace_type_code = 'CLONEEND'" # 473060 # 428904 (08-16-06) # Therefore this is 11 files of 40000 results each. # so get from ftp site: cat << '_EOF_' > getZfishSeqs.csh #!/bin/csh -fe foreach n (0 1 2 3 4 5 6 7 8 9 10) echo "Fetching page $n ..." (echo -n "retrieve_tgz all 0b"; query_tracedb "query page_size 40000 page_number $n binary species_code='DANIO RERIO' AND trace_type_code = 'CLONEEND'") | query_tracedb > data${n}.tgz end '_EOF_' chmod +x getZfishSeqs.csh mkdir -p downloads cp query_tracedb getZfishSeqs.csh ./downloads cd downloads nohup nice getZfishSeqs.csh >& zfishSeqs.log & # Took 5 hours 14 minutes. 
## Start: Wed May 10 09:57 Finished: 14:51 # Start: May 2 21:43 Finish: May 3 03:08 ssh kkstore04 # unzip and untar the downloads cd /cluster/data/danRer4/bed/bacEnds/sequences/downloads gunzip *.tgz cat << '_EOF_' > unTarBacs.csh #!/bin/csh -fe foreach t (0 1 2 3 4 5 6 7 8 9 10 11) tar xvf data${t}.tar end '_EOF_' chmod +x unTarBacs.csh nohup unTarBacs.csh >& unTarBacs.log & cat << '_EOF_' > catBacs.csh #!/bin/csh -fe foreach d (2006*) echo "Processing $d" nice cat ${d}/TRACEINFO.xml >> allTraceInfo.xml end '_EOF_' chmod +x catBacs.csh nice catBacs.csh >& catBacs.log & # The last archive obtained is empty so try downloading from the ftp site # to be sure to get everything. # get BAC end sequences from NCBI Trace archive ftp site: ssh kkstore04 mkdir /cluster/data/danRer4/bed/bacEnds/sequences2 mkdir /cluster/bluearc/danRer4/bacEndsDownloads cd /cluster/data/danRer4/bed/bacEnds/sequences2 ln -s /cluster/bluearc/danRer4/bacEndsDownloads cd /cluster/data/danRer4/bed/bacEnds/sequences2/bacEndsDownloads # get index page and ftp for the trace server wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/TraceDB/danio_rerio/ # grab just the ftp link for each file. grep "anc" index.html > ancillary.lst perl -pi.bak -e 's/.+[a-zA-Z]+.+/$1/' ancillary.lst rm *.bak # this contains just the ftp link for each file to get the ancillary # information files. cat << '_EOF_' > getFtpFiles.csh #!/bin/csh -fe set s=$1 foreach f (`cat "${s}"`) echo $f nice wget --timestamping $f end '_EOF_' chmod +x getFtpFiles.csh nohup nice getFtpFiles.csh ancillary.lst >& ancillary.log & # Took about 25 minutes. grep "fasta" index.html > otherFiles.lst grep "mate_pairs" index.html >> otherFiles.lst grep "xml" index.html >> otherFiles.lst perl -pi.bak -e 's/.+[a-zA-Z]+.+/$1/' otherFiles.lst rm *.bak mkdir otherFiles cd otherFiles cp ../otherFiles.lst . # then get these files by ftp nice ../getFtpFiles.csh otherFiles.lst >& otherFiles.log & # Took about 6 hours and 50 minutes. # There are 181 files as expected. foreach f (*.gz) nice gunzip $f end cd .. cat ./otherFiles/fasta* > danRerBacEnds.fa # Took about 20 minutes grep '>' danRerBacEnds.fa | wc -l # 14566448 cat ./otherFiles/xml* > danRer.xml # Took 4 hours and 40 minutes. # find out which have CLONEEND information in them cat << '_EOF_' > findCloneEnds.csh #!/bin/csh -fe foreach f (otherFiles/xml.*) echo $f >> cloneEndsXml.txt grep CLONEEND $f >> cloneEndsXml.txt end '_EOF_' chmod +x findCloneEnds.csh nice findCloneEnds.csh & # Took 1.5 hours # CLONEEND is only in xml.danio_rerio.024 and xml.danio_rerio.033 cd /cluster/data/danRer4/bed/bacEnds/sequences2/bacEndsDownloads cat otherFiles/xml.danio_rerio.024 otherFiles/xml.danio_rerio.033 \ > cloneEnds.xml # cleanup xml files rm otherFiles/xml.* # get list of libraries: grep "LIBRARY_ID" cloneEnds.xml | sort | uniq > libraries.xml.txt grep "TRACE_NAME" cloneEnds.xml | wc -l # 985980 grep "TRACE_NAME" cloneEnds.xml | sort | uniq -c > traceName.xml.count # Hard to tell which are the BAC clone end sequences. These ftp files # contain a mixture of sequences from different sources # Try downloading sequences from Sanger instead. Not all of the sequences # may have been submitted to NCBI anyway yet.
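    # (Optional, not in the original log: the same LIBRARY_ID grep with counts
    # gives a quick view of which libraries dominate the clone-end records in
    # cloneEnds.xml, useful before deciding to switch to the Sanger download:)
    grep "LIBRARY_ID" cloneEnds.xml | sort | uniq -c | sort -nr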
ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds mkdir -p /san/sanvol1/danRer4/bacEnds/ensemblSeqs ln -s /san/sanvol1/danRer4/bacEnds/ensemblSeqs cd ensemblSeqs wget --timestamping \ ftp://ftp.ensembl.org/pub/traces/danio_rerio/fasta/ # gets index.html page # get list of cloneEnd FASTA files grep cloneEnd index.html > cloneEndsFile perl -pi.bak -e 's/.+[a-zA-Z]+.+/$1/' cloneEndsFile rm *.bak foreach f (`cat cloneEndsFile`) echo $f wget --timestamping $f end # then do the same to get the trace info xml files: wget --timestamping \ ftp://ftp.ensembl.org/pub/traces/danio_rerio/traceinfo/ grep cloneEnd index.html > cloneEndsXmlFile perl -pi.bak -e 's/.+[a-zA-Z]+.+/$1/' cloneEndsXmlFile rm *.bak foreach f (`cat cloneEndsXmlFile`) echo $f wget --timestamping $f end gunzip *.gz # check for multiple occurrences of same sequence ID grep trace_name *.xml | sort | uniq -c | sort -nr > traceNames.count # top of list has count of 1 so the end names are unique. grep clone_id *.xml | sort | uniq -c | sort -nr > cloneIds.count # top of list has count of 4. All those clone IDs that appear 3 or 4 times # do so in the CHORI-1073 library - this is the fosmid library. # move CHORI-1073 out of the way mkdir fosmids mv sanger-zfish-CHORI-1073-cloneEnd* ./fosmids # FASTA files have clone end names as sequence names # concatenate the 18 fasta files cat *.fasta > Zv6BacEnds.fa grep '>' Zv6BacEnds.fa | wc -l # 694170 # Zv5 had 729101 but these were not unique reads for each sequence. faSize Zv6BacEnds.fa >& Zv6.faSize.txt # there are 31 sequence names with no sequence. awk '{print $10}' Zv6.faSize.txt > cloneEnds.noSeq # remove extra lines at end of file # list of FASTA files that they are in. grep -f cloneEnds.noSeq *.fasta > cloneEnds.noSeq.files # sent this list of sequence names and files to Kerstin Howe # at Sanger: kj2@sanger.ac.uk . Sanger said that these are just missing # sequences due to poor quality. # invalid FASTA file format # remove these from FASTA file: grep -v -f cloneEnds.noSeq Zv6BacEnds.fa > tmp.fa grep '>' tmp.fa | wc -l # 694139 mv tmp.fa Zv6BacEnds.fa faSize Zv6BacEnds.fa # 728424771 bases (11822219 N's 716602552 real 716602552 upper 0 lower) in # 694139 sequences in 1 files # Total size: mean 1049.4 sd 277.3 min 4 (zKp108D7.za) max 5403 (zC259G13.zb) # median 982 # N count: mean 17.0 sd 42.1 # U count: mean 1032.4 sd 265.3 # L count: mean 0.0 sd 0.0 # Blat these BAC ends vs the danRer4 genome assembly. Gaps between # scaffolds in the NA_random and Un_random chroms are 50,000 so # alignments of BAC ends across adjacent scaffolds are unlikely, # but alignments done separately just in case: ssh pk mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/sequences cd /cluster/data/danRer4/bed/bacEnds/ensemblSeqs cp Zv6BacEnds.fa /san/sanvol1/scratch/danRer4/bacEnds/sequences mkdir -p /cluster/data/danRer4/bed/bacEnds/chromsRun cd /cluster/data/danRer4/bed/bacEnds/chromsRun ls -1S /san/sanvol1/scratch/danRer4/bacEnds/sequences/Zv6BacEnds.fa \ > bacends.lst ls -1S /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > seqs.lst # create out dir mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl # use Blat parameters as for mm5 and hg17 cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc {check out line+ /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 seqs.lst bacends.lst template jobList para create jobList para try, check, push, check, ... 
# para time # Completed: 271 of 271 jobs # CPU time in finished jobs: 1063126s 17718.77m 295.31h 12.30d 0.034 y # IO & Wait Time: 2531s 42.18m 0.70h 0.03d 0.000 y # Average job time: 3932s 65.54m 1.09h 0.05d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 9404s 156.73m 2.61h 0.11d # Submission to last job: 9891s 164.85m 2.75h 0.11d # Repeat for random chroms, but use separate scaffolds: mkdir -p /cluster/data/danRer4/bed/bacEnds/randomsRun cd /cluster/data/danRer4/bed/bacEnds/randomsRun ls -1S /san/sanvol1/scratch/danRer4/bacEnds/sequences/Zv6BacEnds.fa \ > bacends.lst foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/Zv6*.fa) ls -1S $f >> seqs.lst end # create out dir mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/randomsPsl # use Blat parameters as for mm5 and hg17 cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc {check out line+ /san/sanvol1/scratch/danRer4/bacEnds/randomsPsl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 seqs.lst bacends.lst template jobList para create jobList para try, check, push, check, ... # para time # Completed: 2966 of 2966 jobs # CPU time in finished jobs: 240259s 4004.31m 66.74h 2.78d 0.008 y # IO & Wait Time: 84042s 1400.71m 23.35h 0.97d 0.003 y # Average job time: 109s 1.82m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 997s 16.62m 0.28h 0.01d # Submission to last job: 11925s 198.75m 3.31h 0.14d # lift chrom alignments and randoms alignments and then merge and filter. ssh kolossus cd /cluster/data/danRer4/bed/bacEnds/ nice pslSort dirs rawChroms.psl tmp \ /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl >& chromSort.log # Took 2 hours # very large output so do the randoms on the san cd /san/sanvol1/scratch/danRer4/bacEnds/ nice pslSort dirs rawRandoms.psl tmp randomsPsl >& randomsSort.log # Took 12 minutes # move the rawChroms.psl over to the san mv /cluster/data/danRer4/bed/bacEnds/rawChroms.psl \ /san/sanvol1/scratch/danRer4/bacEnds/ cd /san/sanvol1/scratch/danRer4/bacEnds/ # for danRer3, hg18 etc.: pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ rawChroms.psl bacEndsChroms.psl /dev/null # Took about 1 hour. pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ rawRandoms.psl bacEndsRandoms.psl /dev/null # Took 2 minutes. # merge files. There is a single liftOver file that works for both the # pseudocontigs and the scaffolds. # remove header for bacEndsRandoms.psl tail +6 bacEndsRandoms.psl > tmp.psl cat bacEndsChroms.psl tmp.psl > bacEndsNoLift.psl # liftUp file to chrom coordinates. liftUp bacEnds.psl \ /cluster/data/danRer4/jkStuff/liftAll.lft warn bacEndsNoLift.psl # Took 2 minutes # REPROCESS BACENDS - see section at end (2006-10-06 - 2006-10-11, hartera) # Now put together the pairs information: ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds mv /san/sanvol1/danRer4/bacEnds/bacEnds.psl . # cat together the xml files of BAC clone end information cat ensemblSeqs/*.xml > danRerBacEnds.xml # get mate-pair information from xml, forward is SP6, reverse is T7 # edit getBacInfo.pl used for canFam1 and adapt for use with zebrafish # BAC ends. Not all entries in the xml file have clone_id or trace_end # but sometimes they have trace_direction instead of trace_end. 
# correct directions:
cat << '_EOF_' > getZfishBacInfo.pl
#!/usr/bin/perl -w
use strict;
my ($file, $outFile, $name, $clone, $library, $dir);
$file = $ARGV[0];
$outFile = $ARGV[1];
open (FILE, $file) || die "Can not open $file : $!\n";
open (OUT, ">$outFile") || die "Can not create $outFile : $!\n";
open (STDERR, ">error.log") || die "Can not create error.log : $!\n";
# translation of Sanger internal sequence name prefixes to external
# library prefixes
my %cloneHash = qw { zC CH211- zK DKEY- zKp DKEYP- bZ RP71- dZ BUSM1- CHORI73_ CH73- };
$name = "";
$clone = "";
$dir = "";
# read the trace info xml one line at a time; each <trace> record holds the
# trace name, clone id, library id and the end (or direction) fields
while (<FILE>) {
    chomp;
    my $l = $_;
    if ($l =~ /<trace_name>([A-Za-z0-9\_\.]+)<\/trace_name>/) {
        $name = $1;
    }
    elsif ($l =~ /<clone_id>([A-Z0-9]+\-[0-9A-Z]+)<\/clone_id>/) {
        $clone = $1;
    }
    elsif ($l =~ /<library_id>([A-Z0-9a-z\s]+\-?[0-9A-Z]*)<\/library_id>/) {
        $library = $1;
        if ($library eq "Daniokey Pilot") {
            $library = "DKEYP";
        }
    }
    elsif ($l =~ /<trace_end>(F|R)<\/trace_end>/) {
        $dir = $1;
    }
    elsif ($l =~ /<trace_direction>(F|R)<\/trace_direction>/) {
        $dir = $1;
    }
    # find end of record and print out end information
    if ($l =~ /^\s+<\/trace>/) {
        printInfo($name, $clone, $library, $dir);
        $name = $clone = $dir = $library = "";
    }
}
close FILE;
close OUT;
close STDERR;

sub printInfo {
    my ($name, $clone, $lib, $d) = @_;
    # if no clone name read from file then create from trace name
    if ($clone eq "") {
        foreach my $c (keys(%cloneHash)) {
            if ($name =~ /$c/) {
                if (exists($cloneHash{$c})) {
                    my $prefix = $cloneHash{$c};
                    $clone = $name;
                    # change to clone name
                    $clone =~ s/$c/$prefix/;
                    # remove suffix
                    $clone =~ s/\.[a-z]+|SP6|T7//;
                }
            }
        }
    }
    # convert forward or reverse direction to T7 or SP6
    if ($d ne "") {
        if ($d eq "F") {
            $d = "T7";
        }
        elsif ($d eq "R") {
            $d = "SP6";
        }
    }
    else {
        print STDERR "No direction for $name found\n";
    }
    # print clone end information
    print OUT "$clone\t$name\t0\t$lib\t0\t$d\n";
}
'_EOF_'
# << for emacs
chmod +x getZfishBacInfo.pl
perl getZfishBacInfo.pl danRerBacEnds.xml bacEndInfo.txt
# check all the names are there
grep '>' ./ensemblSeqs/Zv6BacEnds.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# no difference so all clone ends in the FASTA file are also
# in the xml file.
rm *.bak *.sort names
# create mate-pair information
cp /cluster/bin/scripts/convertBacEndPairInfo convertZfishBacEndInfo
# comment out line 43 as this removes the suffix after a . from the
# trace names. In this case, we need to keep those.
# line 43: ($acc, $ver) = split(/\./,$acc);
# NOTE: the wrong script was used here (an old version, without the edits
# above); the BAC ends are reprocessed in the REPROCESS BAC ENDS section
# below.
./convertZfishBacEndInfo bacEndInfo.txt
# creates pairs and singles files
# 312901 pairs and 35479 singles
# looks like pairs were made for both DKEY-32B21A and DKEY-32B21
# need to find singles that could be used in pairs.
awk '{print $2}' bacEndSingles.txt > singles.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' singles.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' singles.names
sort singles.names | uniq -c | sort -nr > singles.names.count
# 209 have 2 ends for the BAC clone.
# some are duplicates of the same end e.g. .ya and .yb but these
# have the same BAC clone name.
head -209 singles.names.count | awk '{print $2}' > singles.withPairs.names awk '{print $2}' bacEndPairs.txt > pairs.names perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' pairs.names perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' pairs.names mkdir -p /cluster/data/danRer4/bed/bacEnds/pairs cd /cluster/data/danRer4/bed/bacEnds/pairs set dir = /cluster/data/danRer4/bed/bacEnds # use parameters from REDO of danRer3 BAC ends /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose $dir/bacEnds.psl $dir/bacEndPairs.txt all_bacends bacEnds wc -l * # 1714 bacEnds.long # 14889 bacEnds.mismatch # 109213 bacEnds.orphan # 105294 bacEnds.pairs # 347 bacEnds.short # 782 bacEnds.slop # create header required by "rdb" tools echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \ > ../header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header # edit header to make sure \t is/become tab character cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed # create bad BAC ends set cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBad.bed # Also create a bad BAC ends set with no orphans since orphans are # already added to the singles track and do not want to add these orphans # twice when extracting PSL. Use this bacEndPairsBadNoOrphans.bed # file when extracting PSLs for adding to the all_bacends table. cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBadNoOrphans.bed # To create singles set: # also need to process bacEndSingles.txt into a database table # for singles in bacEndSingles.txt, create a dummy file where they # are given zJA11B12T7 as dummy sequence pair. If the single is a forward # sequence, put the dummy sequence in the second column, if the single is # a reverse sequence put in first column. use a perl script to do this. cd /cluster/data/danRer4/bed/bacends set bacDir = /cluster/data/danRer4/bed/bacEnds mkdir singles cd singles cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl . perl formatSingles.pl $bacDir/bacEndSingles.txt > \ $bacDir/bacEndSingles.format # then run pslPairs on this formatted file /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \ all_bacends bacEnds wc -l bacEnds.* # 0 bacEnds.long # 0 bacEnds.mismatch # 22036 bacEnds.orphan # 0 bacEnds.pairs # 0 bacEnds.short # 0 bacEnds.slop cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles wc -l bacEnds.singles # 131249 bacEnds.singles # Of these, 109213 are from pair analysis and 22036 from singles. # For danRer3: there are 11439 orphans from singles and 242235 from # pair analysis so a total of 253674 orphans so this has improved. # Although for danRer3, some of these could be replicate reads for the # same BAC clone end. # make singles bed file cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndSingles.bed # check if there are any overlapping alignments that can be removed. 
cd /cluster/data/danRer4/bed/bacEnds mkdir -p duplicates/overlapRun cd duplicates/overlapRun sort -k1,2 /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed \ > bacEndPairs.lfs wc -l *.lfs # 104546 bacEndPairs.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndPairs.lfs bacEndPairs.bed \ -name -minOverlap=0.999 -notBlocks # Loaded 104546 elements of size 11 # only 5 lines removed sort -k1,2 /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed \ > bacEndSingles.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndSingles.lfs bacEndSingles.bed \ -name -minOverlap=0.999 -notBlocks # Loaded 125695 elements of size 11 # No lines removed. sort -k1,2 \ /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairsBadNoOrphans.bed \ > bacEndPairsBadNoOrphans.lfs wc -l *.lfs # 17611 bacEndPairsBadNoOrphans.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndPairsBadNoOrphans.lfs \ bacEndPairsBadNoOrphans.bed -name -minOverlap=0.999 -notBlocks # Loaded 17611 elements of size 11 # Saving 17608 records to bacEndPairsBadNoOrphans.bed # Only 3 alignments were removed. # Therefore no point in doing using these files. Use the original bed # files for pairs and singles. No further processing of BED files is # needed as they have not been changed in any way. # Remove duplicates directory. rm -r /cluster/data/danRer4/bed/bacEnds/duplicates # use new extract program that extracts PSLs using name and position: ssh kkstore04 set bacDir=/cluster/data/danRer4/bed/bacEnds cd $bacDir/pairs nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndPairs.bed bacPairs.psl # for this, use bacEndPairsBadNoOrphans since pairs orphans are already # included in bacEndSingles nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndPairsBadNoOrphans.bed bacPairsBadNoOrphans.psl # then for singles cd $bacDir/singles nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndSingles.bed bacSingles.psl cd $bacDir cat pairs/*.psl singles/bacSingles.psl > allBacends.load.psl # try old program and compare extractPslLoad -noBin bacEnds.psl pairs/bacEndPairs.bed \ pairs/bacEndPairsBadNoOrphans.bed singles/bacEndSingles.bed \ | sorttbl tname tstart | headchg -del > bacEnds.load.psl wc -l *.load.psl # 364457 allBacends.load.psl # 4568907 bacEnds.load.psl # Much reduced by using only BAC end alignments that are in BED files. # load into database ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/pairs hgLoadBed danRer4 bacEndPairs bacEndPairs.bed -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 104546 elements of size 11 # note - this next track isn't pushed to RR, just used for assembly QA hgLoadBed danRer4 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 121728 elements of size 11 cd /cluster/data/danRer4/bed/bacEnds/singles cp /cluster/data/danRer3/bed/bacends/singles/bacEndSingles.sql . hgLoadBed danRer4 bacEndSingles bacEndSingles.bed -notItemRgb \ -sqlTable=bacEndSingles.sql # Loaded 125695 elements of size 11 cd /cluster/data/danRer4/bed/bacEnds hgLoadPsl danRer4 -table=all_bacends allBacends.load.psl # All alignments were loaded into the table - no problems. 
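# (optional sanity check, not part of the original build) confirm that the
# loaded row counts match the bed files loaded above:
foreach t (bacEndPairs bacEndPairsBad bacEndSingles)
  echo $t
  hgsql -N -e "select count(*) from ${t};" danRer4
end
wc -l pairs/bacEndPairs.bed pairs/bacEndPairsBad.bed \
  singles/bacEndSingles.bed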
# load BAC end sequences into seq table so alignments may be viewed # symlink to FASTA sequence file in ncbi directory # move BAC ends to the ncbi directory mkdir -p /cluster/data/ncbi/bacends/zebrafish/bacends.1 # remove some files cd ensemblSeqs rm tmp clone* index.html cd /cluster/data/danRer4/bed/bacEnds mv /cluster/data/danRer4/bed/bacEnds/ensemblSeqs/* \ /cluster/data/ncbi/bacends/zebrafish/bacends.1 rm -r ensemblSeqs mkdir -p /gbdb/danRer4/bacends ln -s /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \ /gbdb/danRer4/bacends/Zv6BacEnds.fa hgLoadSeq danRer4 /gbdb/danRer4/bacends/Zv6BacEnds.fa # check trackDb.ra entry and description # cleanup: ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds/ rm -r sequences rm -r /san/sanvol1/scratch/danRer4/bacEnds/sequences rm -r sequences2 rm changes.txt bacEnds.load.psl *.log du -sh /cluster/data/danRer4/bed/bacEnds # 2.4G /cluster/data/danRer4/bed/bacEnds gzip *.psl *.txt danRerBacEnds.xml du -sh /cluster/data/danRer4/bed/bacEnds # 599M /cluster/data/danRer4/bed/bacEnds # (hartera, 2006-10-02) # NOTE: Some BAC clones have duplicate reads and these end in the # suffixes SP6A, T7A, SP6W and T7W. There is a corresponding read name # without the W or A suffix. The names of the BAC clones # are also suffixed with A or W for these reads. e.g There is a BAC # clone called DKEY-32M8. DKEY-32M8A is the same one sequenced with # different read ending in SP6A and T7A. The BAC ends names are # zK32M8SP6A and zK32M8T7A. # Check if there are any cases where both the version without the W or A # suffix and the version with it are in the same track: ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds mkdir duplicates cd duplicates # found that there are some alignments in all_bacends where there # is SP6W, SP6A, T7W, T7A suffixes for BAC ends. These are duplicate # reads, there is a corresponding read name without the W or A suffix. # Suffix Alignments Unique Names # SP6W 179 153 # SP6A 254 245 # T7W 53 48 # T7A 247 238 hgsql -e 'select count(*) from bacEndPairs where lfNames like "%SP6A%";' \ danRer4 # 126 were found hgsql -e \ 'select count(distinct(name)) from bacEndPairs where lfNames like "%SP6A%";' \ danRer4 # 122 with distinct names hgsql -N -e \ 'select name, lfNames from bacEndPairs where lfNames like "%SP6A%";' \ danRer4 | sort > names.SP6A.txt awk '{print $1}' names.SP6A.txt | sed -e 's/A$//' > names.SP6.txt hgsql -N -e \ 'select name, lfNames from bacEndPairs where lfNames not like "%SP6A%";' \ danRer4 | sort > pairs.nameswithoutA.txt grep -w -f names.SP6.txt pairs.nameswithoutA.txt | sort | uniq \ > pairs.withAandwithout.txt # there are 23 BAC clones in the bacEndPairs table where there are # entries for both the clone names ending in A and that without the A. hgsql -N -e 'select name, lfNames from bacEndSingles where (lfNames like "%SP6A%") or (lfNames like "%SP6W%") or (lfNames like "%T7A%") or (lfNames like "%T7W%");' danRer4 | sort | uniq > singles.names.sort awk '{print $1}' singles.names.sort | sed -e 's/A$//' | sed -e 's/W$//' \ > names.SP6andT7.txt wc -l names.SP6andT7.txt # 372 names.SP6andT7.txt sort names.SP6andT7.txt | uniq > names.SP6andT7.uniq wc -l names.SP6andT7.uniq # 309 names.SP6andT7.uniq # Some may have both names ending in W and in A or could those # where the SP6 and T7 end are both present. 
hgsql -N -e 'select name, lfNames from bacEndSingles;' danRer4 \ > singles.names.txt grep -w -f names.SP6andT7.uniq singles.names.txt | sort | uniq \ > singles.withAorWandwithout.txt wc -l singles.withAorWandwithout.txt # 212 singles.withAandwithout.txt ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds/duplicates/tmp/singles # Check to see if any pairs can be made that do not have the same # suffix: A, W or without. Only for cases where there is not a pair # already. awk '{print $2}' singles.names.sort | sort | uniq > bacEnds.namesAorW.sort # also add the BAC ends for those with the same name but withour A or W awk '{print $2}' singles.withAorWandwithout.txt | sort | uniq \ > singles.withAorWandwithout.ends cat bacEnds.namesAorW.sort singles.withAorWandwithout.ends \ | sort | uniq > bacEnds.namesAorWorwithout.sort # make pairs where there is none with the same ending already. If an end # has W and/or A suffix and/or no suffix, use just one and discard others. # use a script to do this. wc -l *.txt # 93 diffSuffix.txt # 69 sameSuffix.txt # 212 singles.withAorWandwithout.txt # 92 singlesEnds.txt # changed program to do second pass using the extra ends. # 76 diffSuffix.txt # 78 extraEnds.txt # 39 extraEnds2.txt # 86 sameSuffix.txt # 92 singlesEnds.txt # /cluster/data/danRer4/bed/bacEnds/duplicates/tmp/singles/test2 # now check to see if any of the BACs represented by singles or pairs # are already in the original file created. # extraEnds2.txt are those to be removed # diffSuffix.txt, sameSuffix.txt and singlesEnds.txt should all # be checked against the entries in the bacEndPairs table since # these are sequences that already passed all the criteria for # being in the BAC end pairs track. mkdir /cluster/data/danRer4/bed/bacEnds/duplicates/remove cd /cluster/data/danRer4/bed/bacEnds/duplicates/remove cp /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed . # those that have the same suffix will have already been paired. It is # the ones that are different that should be put into the pairs file # and those that are singles should go into the singles file before # processing the BAC ends. # first remove the 23 that are duplicated in the bacEndPairs table. cp ../pairs.withAandwithout.txt . # cd /cluster/data/danRer4/bed/bacEnds/duplicates/remove awk '{print $1"A"}' pairs.withAandwithout.txt > bacsToRemove.txt # remove these from the BAC end pairs file grep -wv -f bacsToRemove.txt bacEndPairs.bed > bacEndPairsRemBacA.bed wc -l *.bed # 104546 bacEndPairs.bed # 104523 bacEndPairsRemBacA.bed # then find out if there are any BACs with more than one set of pairs # in each of the lists: sameSuffix.txt and diffSuffix.txt cp ../*Suffix.txt . # the first column has the stem of the BAC end names without the # SP6 or T7 part of the suffix. 
awk '{print $1;}' sameSuffix.txt | sort | uniq -c | sort -nr \ > sameSuff.count # no duplicates within the file awk '{print $1;}' diffSuffix.txt | sort | uniq -c | sort -nr \ > diffSuff.count # no duplicates within the file cat sameSuffix.txt diffSuffix.txt > allSuff.txt awk '{print $1;}' allSuff.txt | sort | uniq -c | sort -nr \ > allSuff.count # no duplicates between files rm *.count # then check if any of these are represented in the pairs table: # All of these BAC end names begin with zK, these are DKEY- BAC clones # translate names in column 1 to BAC clone names awk '{print $1}' allSuff.txt | sed -e 's/zK/DKEY\-/' | sort \ > allSuff.BACclones.txt grep -w -f allSuff.BACclones.txt bacEndPairsRemBacA.bed \ > newPairsDupsInPairsBed.txt # only one is found: DKEY-32B21: zK32B21T7,zK32B21SP6 awk '{print $4}' newPairsDupsInPairsBed.txt \ > newPairsDupsInPairsBed.name grep "zK32B21" *.txt # found in sameSuffix.txt so delete from this file and from allSuff.txt grep -wv "zK32B21" sameSuffix.txt > sameSuffix2.txt grep -wv "zK32B21" allSuff.txt > allSuff2.txt # in this case the zK32B21T7A alignment is much better than the # zK32B21T7 alignment, also zK32B21SP6A is better than the zK32B21SP6 # alignment therefore it should be replaced with the SP6A and T7A # versions. cp /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed . grep "zK32B21" bacEndSingles.bed # then repeat this for the singles and see if any of those already # have pairs in the bacEndPairsRemBacA.bed file. cp ../singlesEnds.txt . cp ../extraEnds2.txt . # all these ends begin with "zK" so from "DKEY-" library. # get BAC end prefixes and conver to DKEY BAC clone names. awk '{print $1}' singlesEnds.txt | sed -e 's/zK/DKEY\-/' | sort \ > singles.BACclones.txt grep -w -f singles.BACclones.txt bacEndPairsRemBacA.bed \ > singlesInPairsBed.txt wc -l singlesInPairsBed.txt # 40 singlesInPairsBed.txt # get those names from the clone name in bacEndPairsRemBacA.bed awk '{print $4}' singlesInPairsBed.txt | sed -e 's/DKEY\-/zK/' \ | sort | uniq > singlesDupsInPairs.txt wc -l singlesDupsInPairs.txt # 37 singlesDupsInPairs.txt # All of these versions are in Genbank. cat newPairsDupsInPairsBed.name singlesDupsInPairs.txt \ | sed -e 's/zK/DKEY\-/' > allDupsInPairs.txt # BEST WAY FORWARD IS TO START AGAIN WITH PROCESSING THE BAC ENDS AND # PROCESS DUPLICATES AS FOR danRer3. ############################################################################## # REPROCESS BAC ENDS TO DEAL WITH DUPLICATES AND REDO BACENDS TRACKS # (2006-10-06 - 2006-10-11, hartera) # The bacEnds.psl from the first BACENDS TRACK section is used so all # processing is the same up to that point. # Now put together the pairs information: ssh kkstore04 # move old bacends dir out the way mv /cluster/data/danRer4/bed/bacEnds /cluster/data/danRer4/bed/bacEndsOld mkdir /cluster/data/danRer4/bed/bacEnds cd /cluster/data/danRer4/bed/bacEnds # mv /cluster/data/danRer4/bed/bacEndsOld/bacEnds.psl . # cat together the xml files of BAC clone end information cat ensemblSeqs/*.xml > danRerBacEnds.xml # get mate-pair information from xml, # in convertBacEndInfo, forward is T7, reverse is SP6. Use this # although before used the other way round. Arbitrary really as long # as use the same in the same library. CHORI73 library has it the opposite # way round to above. # edit getBacInfo.pl used for canFam1 and adapt for use with zebrafish # BAC ends. Not all entries in the xml file have clone_id or trace_end # but sometimes they have trace_direction instead of trace_end. 
# correct directions:
cat << '_EOF_' > getZfishBacInfo.pl
#!/usr/bin/perl -w
use strict;
my ($file, $outFile, $name, $clone, $library, $dir);
$file = $ARGV[0];
$outFile = $ARGV[1];
open (FILE, $file) || die "Can not open $file : $!\n";
open (OUT, ">$outFile") || die "Can not create $outFile : $!\n";
open (STDERR, ">error.log") || die "Can not create error.log : $!\n";
# translation of Sanger internal sequence name prefixes to external
# library prefixes
my %cloneHash = qw { zC CH211- zK DKEY- zKp DKEYP- bZ RP71- dZ BUSM1- CHORI73_ CH73- };
$name = "";
$clone = "";
$dir = "";
# read the trace info xml one line at a time; each <trace> record holds the
# trace name, clone id, library id and the end (or direction) fields
while (<FILE>) {
    chomp;
    my $l = $_;
    if ($l =~ /<trace_name>([A-Za-z0-9\_\.]+)<\/trace_name>/) {
        $name = $1;
    }
    elsif ($l =~ /<clone_id>([A-Z0-9]+\-[0-9A-Z]+)<\/clone_id>/) {
        $clone = $1;
    }
    elsif ($l =~ /<library_id>([A-Z0-9a-z\s]+\-?[0-9A-Z]*)<\/library_id>/) {
        $library = $1;
        if ($library eq "Daniokey Pilot") {
            $library = "DKEYP";
        }
    }
    elsif ($l =~ /<trace_end>(F|R)<\/trace_end>/) {
        $dir = $1;
    }
    elsif ($l =~ /<trace_direction>(F|R)<\/trace_direction>/) {
        $dir = $1;
    }
    # find end of record and print out end information
    if ($l =~ /^\s+<\/trace>/) {
        printInfo($name, $clone, $library, $dir);
        $name = $clone = $dir = $library = "";
    }
}
close FILE;
close OUT;
close STDERR;

sub printInfo {
    my ($name, $clone, $lib, $d) = @_;
    # if no clone name read from file then create from trace name
    if ($clone eq "") {
        foreach my $c (keys(%cloneHash)) {
            if ($name =~ /$c/) {
                if (exists($cloneHash{$c})) {
                    my $prefix = $cloneHash{$c};
                    $clone = $name;
                    # change to clone name
                    $clone =~ s/$c/$prefix/;
                    # remove suffix
                    $clone =~ s/\.[a-z]+|SP6|T7//;
                }
            }
        }
    }
    # convert forward or reverse direction to T7 or SP6
    if ($d ne "") {
        if ($d eq "F") {
            $d = "T7";
        }
        elsif ($d eq "R") {
            $d = "SP6";
        }
    }
    else {
        print STDERR "No direction for $name found\n";
    }
    # print clone end information
    print OUT "$clone\t$name\t0\t$lib\t0\t$d\n";
}
'_EOF_'
# << for emacs
chmod +x getZfishBacInfo.pl
perl getZfishBacInfo.pl danRerBacEnds.xml bacEndInfo.txt
# check all the names are there
grep '>' ./ensemblSeqs/Zv6BacEnds.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# no difference so all clone ends in the FASTA file are also
# in the xml file.
rm *.bak *.sort names
# create mate-pair information
# convertBacEndPairInfo does not handle replicate end names; these need to
# go into a comma-separated list in the pairs and singles files, so edit the
# script to do this and to parse the bacEndInfo.txt file.
cp /cluster/bin/scripts/convertBacEndPairInfo convertZfishBacEndInfo
# comment out line 43 as this removes the suffix after a . from the
# trace names. In this case, we need to keep those.
# line 43: ($acc, $ver) = split(/\./,$acc); cat << 'EOF' > convertZfishBacEndInfo #!/usr/local/bin/perl # File: convertBacEndPairZfishInfo # Date: 10/2006 # Description: Converts bacends.cl_acc_gi_len_primer format file to # bacEnds.pair file used for creating BAC End Pairs tracks # Usage message if ($#ARGV < 0) { print stderr "USAGE: convertBacEndPairInfo \n"; exit(1); } $file = shift(@ARGV); open(FILE, "$file") || die("Could not open $file\n"); $pair = $single = 0; # Read in and record end info print stderr "Reading in end info\n"; while ($line = ) { chomp($line); ($clone, $acc, $gi, $center, $length, $end) = split('\t',$line); # ($acc, $ver) = split(/\./,$acc); $end =~ tr/a-z/A-Z/; $found{$clone} = 1; $clone{$acc} = $clone; $printa{$acc} = 0; $print{$clone} = 0; $end{$acc} = $end; if (&isForward($end)) { # print "Adding $acc for $clone as $end \n"; $t7{$clone} .= "$acc,"; # print "The entry for $clone is $t7{$clone}\n"; } elsif (&isReverse($end)) { $sp6{$clone} .= "$acc,"; } elsif ($end) { print stderr "End $end for $acc / $clone\n"; } } close(OUT); # Print out pairs open(OUT, ">bacEndPairs.txt"); print stderr "Writing out pair info\n"; foreach $clone (keys %found) { if ($t7{$clone} && $sp6{$clone}) { print OUT "$t7{$clone}\t$sp6{$clone}\t$clone\n"; $print{$clone} = 1; @acc = split(/\,/,$t7{$clone}); for ($i = 0; $i <= $#acc; $i++) { $printa{$acc[$i]} = 1; } @acc = split(/\,/,$sp6{$clone}); for ($i = 0; $i <= $#acc; $i++) { $printa{$acc[$i]} = 1; } $pair++; } } close(OUT); # Print out singletons print stderr "Writing out singleton info\n"; open(OUT, ">bacEndSingles.txt"); %sp6Singles; %t7Singles; foreach $acc (keys %printa) { $clone = $clone{$acc}; # if not printed already then add to a new hash for singles if (!$printa{$acc}) { if (&isForward($end{$acc})) { $t7Singles{$clone} .= "$acc,"; } elsif (&isReverse($end{$acc})) { $sp6Singles{$clone} .="$acc,"; } else { print stderr "$acc has unknown end\n"; } } } # then print out the singles: foreach $cl (keys %t7Singles) { print OUT "$t7Singles{$cl}\t$cl\tT7\n"; $single++; } foreach $cl (keys %sp6Singles) { print OUT "$sp6Singles{$cl}\t$cl\tSP6\n"; $single++; } close(OUT); print stderr "$pair pairs and $single singles\n"; sub isForward { $end = shift(@_); if (($end =~ /FORWARD/) || ($end =~ /^T7/) || ($end eq "F") || ($end eq "M13-21") || ($end eq "1") || ($end =~ /^TK/) || ($end =~ /^EC1/) || ($end =~ /^RM1/)) { return 1; } else { return 0; } } sub isReverse { if (($end =~ /REVERSE/) || ($end =~ /^SP6/) || ($end eq "R") || ($end =~ /^TJ/)) { return 1; } else { return 0; } } 'EOF' # remove all W and A suffixes from the end of bacEndInfo.txt clone names cp bacEndInfo.txt bacEndInfo2.txt perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' bacEndInfo2.txt perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' bacEndInfo2.txt ./convertZfishBacEndInfo bacEndInfo2.txt # creates pairs and singles files # 312850 pairs and 34935 singles mkdir -p /cluster/data/danRer4/bed/bacEnds/pairs cd /cluster/data/danRer4/bed/bacEnds/pairs set dir = /cluster/data/danRer4/bed/bacEnds # use parameters from REDO of danRer3 BAC ends /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose $dir/bacEnds.psl $dir/bacEndPairs.txt all_bacends bacEnds wc -l * # 2724 bacEnds.long # 22959 bacEnds.mismatch # 179405 bacEnds.orphan # 156241 bacEnds.pairs # 565 bacEnds.short # 1196 bacEnds.slop # create header required by "rdb" tools echo 
'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \ > ../header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header # edit header to make sure \t is/become tab character cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed # create bad BAC ends set cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBad.bed # Also create a bad BAC ends set with no orphans since orphans are # already added to the singles track and do not want to add these orphans # twice when extracting PSL. Use this bacEndPairsBadNoOrphans.bed # file when extracting PSLs for adding to the all_bacends table. cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBadNoOrphans.bed # To create singles set: # also need to process bacEndSingles.txt into a database table # for singles in bacEndSingles.txt, create a dummy file where they # are given zJA11B12T7 as dummy sequence pair. If the single is a forward # sequence, put the dummy sequence in the second column, if the single is # a reverse sequence put in first column. use a perl script to do this. cd /cluster/data/danRer4/bed/bacEnds set bacDir = /cluster/data/danRer4/bed/bacEnds mkdir singles cd singles cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl . perl formatSingles.pl $bacDir/bacEndSingles.txt > \ $bacDir/bacEndSingles.format # then run pslPairs on this formatted file /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \ all_bacends bacEnds wc -l bacEnds.* # 0 bacEnds.long # 0 bacEnds.mismatch # 23398 bacEnds.orphan # 0 bacEnds.pairs # 0 bacEnds.short # 0 bacEnds.slop cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles wc -l bacEnds.singles # 202803 bacEnds.singles # Of these, 179405 are from pair analysis and 23398 from singles. # For danRer3: there are 11439 orphans from singles and 242235 from # pair analysis so a total of 253674 orphans so this has improved. # Although for danRer3, some of these could be more replicate reads for the # same BAC clone end. # make singles bed file cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndSingles.bed # check if there are any overlapping alignments that can be removed. cd /cluster/data/danRer4/bed/bacEnds mkdir -p duplicates/overlapRun cd duplicates/overlapRun sort -k1,2 /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed \ > bacEndPairs.lfs wc -l *.lfs # 154732 bacEndPairs.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndPairs.lfs bacEndPairs.bed \ -name -minOverlap=0.999 -notBlocks # Loaded 154732 elements of size 11 # Took about 2.5 hours. 
wc -l bacEndPairs* # 154634 bacEndPairs.bed # 154732 bacEndPairs.lfs sort -k1,2 /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed \ > bacEndSingles.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndSingles.lfs bacEndSingles.bed \ -name -minOverlap=0.999 -notBlocks # Loaded 187638 elements of size 11 # Took about 4.5 hours sort -k1,2 \ /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairsBadNoOrphans.bed \ > bacEndPairsBadNoOrphans.lfs wc -l *.lfs # 27301 bacEndPairsBadNoOrphans.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndPairsBadNoOrphans.lfs \ bacEndPairsBadNoOrphans.bed -name -minOverlap=0.999 -notBlocks # Loaded 27301 elements of size 11 # Took 5 minutes # check the numbers of lines are correct foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles) awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \ | sort | uniq -c | sort -nr > ${f}.uniqCount end wc -l * # 154634 bacEndPairs.bed # 154732 bacEndPairs.lfs # 154656 bacEndPairs.uniqCount # 27282 bacEndPairsBadNoOrphans.bed # 27301 bacEndPairsBadNoOrphans.lfs # 27293 bacEndPairsBadNoOrphans.uniqCount # 187601 bacEndSingles.bed # 187638 bacEndSingles.lfs # 187624 bacEndSingles.uniqCount # different numbers for unique count since some of these alignments # were not identical but very close to identical (>0.999 overlap) rm *.uniqCount cd /cluster/data/danRer4/bed/bacEnds/duplicates mv ./overlapRun/* . rm -r overlapRun # copy perl script used for danRer3 to choose 2 BAC ends to represent # each BAC clone since there are often more than one read for each BAC end # in this set, 2 were chosen for each BAC pair or 1 for the singles. This # was based on the ones that had the largest region aligned (using lfSizes). cp /cluster/data/danRer3/bed/bacends/duplicatesNew/pickLfNamesv2.pl . # need to sort by chrom, chromStart foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles) sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed end # run perl script: input bed file, pairs or singles, name of output file perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed mv error.log log.pairs perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed mv error.log log.singles perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \ badPairs2lfNames.bed mv error.log log.badPairs wc -l log* # 1 log.badPairs # 3 log.pairs # 13 log.singles # In future, could pick which set of alignments to pick based on the # Blat score computed by pslScore. # For badPairs, CH211-115F14 has 2 sets of pairs: zC115F14.zb,zC115F14.ya # has a longer region between ends than for zC115F14.za,zC115F14.ya. # so the latter was removed. # for Pairs, CH211-74D17: the alignment with zC74D17.zb,zC74D17.yb was # removed but there is also one with zC74D17.zb,zC74D17.yb to the same # region that was retained so remove this one as zC74D17.zb,zC74D17.ya # covers a longer region. # CH211-98O15 has zC98O15.ya,zC98O15.za aligning to chr3 and # zC98O15.yb,zC98O15.zb align to chr17. There is no similarity between # zC98O15.ya and zC98O15.yb by bl2seq. # CH211-98E22 has zC98E22.ya,zC98E22.za aligning to chr3 and # zC98E22.yb,zC98E22.zb aligning to chr14. zC98E22.ya and zC98E22.yb # has no similarity by bl2seq. # For singles, there are 13 will alignments to more than 1 read for a # BAC end: # CH211-66E17: remove zC66E17.za as it has more mismatches and inserts. # CH211-74O5: remove zC74O5.ya as is has more mismatches. # CH211-42B4: remove zC42B4.yb as it has a shorter alignment. Not much # difference in mismatches or inserts between this and zC42B4.ya. 
# CH211-98O3: zC98O3.yb aligns to chr13 and zC98O3.ya aligns to chr16 and # they have no similarity to each other. # CH211-89J7: remove zC89J7.zb as it has more mismatches and inserts. # CH211-97A18: remove zC97A18.yb has more mismatches and inserts. # CH211-48O20: zC48O20.zb aligns to chr22 and zC48O20.za aligns twice # to chr16. No similarity by bl2seq. # CH211-60H17: remove zC60H17.ya as it has a more mismatches. # CH211-189J23: remove zC189J23.yb as it has a large tNumInsert. # CH211-124G12: remove zC124G12.za as it has more mismatches and inserts. # CH211-60P6: remove zC60P6.ya as it has more inserts. # CH211-42A6: remove zC42A6.za as it has more inserts. # CH211-69K2: remove zC69K22.za as it has more inserts. # Reported discrepancies to Mario Caccamo at Sanger (mc2@sanger.ac.uk) # Here is his reply: # This looks like a clone swap problem where names where associated to # the wrong clones. All the examples you mention below are from # projects sequenced at Max Planck (Germany). # CH211-98O15 - the right place for this one is in chr3. This clone is # currently assigned to ctg247 in chr3. # CH211-98O3 - should go to chr14 (there is a problem in Zv6 most # probably). This clone is assigned to ctg3009. The b ends are correct. # CH211-48O20 - unfortunately this clone is not fingerprinted so I don't # have any independent information to confirm the right placement. # So for pairs, # CH211-98O15: retain zC98O15.ya,zC98O15.za aligning to chr3 # CH211-98O3: retain zC98O3.yb and zC98O3.zb (should go to chr14) # NOTE: For some singles, the lfStart does not equal the chromStart. # Also chromStart - chromEnd should equal lfSizes. # pslPairs has added min/2 to the end or subtracted min/2 from the start # depending on whether it is a left or a right BAC end and the # alignment orientation. min used here was 25000. # That is ok. This is what gives the display where the aligning block is # shown with a line with arrows on it showing the direction. ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds/duplicates # create remove lists for each set of alignments. cat << 'EOF' > pairsToRemove zC74D17.zb,zC74D17.yb zC98O15.yb,zC98O15.zb zC98E22.ya,zC98E22.za zC98E22.yb,zC98E22.zb 'EOF' cat << 'EOF' > singlesToRemove zC66E17.za zC74O5.ya zC42B4.yb zC98O3.ya zC89J7.zb zC97A18.yb zC48O20.zb zC48O20.za zC60H17.ya zC189J23.yb zC124G12.za zC60P6.ya zC42A6.za 'EOF' mv pairs2lfNames.bed pairs2lfNamesOld.bed mv singles1lfName.bed singles1lfNameOld.bed # recreate these files removing alignments for ends in lists above grep -wv -f pairsToRemove bacEndPairsSort.bed > pairs2lfNames.bed grep -wv -f singlesToRemove bacEndSinglesSort.bed > singles1lfName.bed # for each of these new bed files, checks were made that there are # only 2 BAC ends per alignments for pairs and 1 for singles. # For each pair, there should only be 2 ends which can appear either # way round depending on the orientation and there should be 1 end for # the beginning (suffix T7, t7 or z) and one end for the end # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g. # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite # orientation. For singles, there should be a single BAC end for each # alignment and for each BAC clone, a sequence for either or both types # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate # alignments. e.g. 
wc -l pairs2lfNames.bed # 154632 pairs2lfNames.bed grep ',' pairs2lfNames.bed | wc -l # 154632 # should be the same number, every line should have a comma # should be twice the number of above, just 2 end names per line awk '{print $11}' pairs2lfNames.bed | sort | uniq > pairs.ends wc -l pairs.ends # 147668 pairs.ends sed -e 's/,/\n/g' pairs.ends > pairs.ends2 wc -l pairs.ends2 # 295336 pairs.ends2 # should be twice the number of above, just 2 end names per lines so # correct. perl -pi.bak -e \ 's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' pairs.ends sort pairs.ends | uniq > pairs.ends.uniq # check that these have the right combination of ends - one forward and # one reverse. all ok. # repeat for badPairs and singles # badPairs: wc -l badPairs2lfNames.bed # 27281 badPairs2lfNames.bed grep ',' badPairs2lfNames.bed | wc -l # 27281 # should be the same number, every line should have a comma # should be twice the number of above, just 2 end names per line awk '{print $11}' badPairs2lfNames.bed | sort | uniq > badPairs.ends wc -l badPairs.ends # 25795 badPairs.ends sed -e 's/,/\n/g' badPairs.ends > badPairs.ends2 wc -l badPairs.ends2 # 51590 badPairs.ends2 # should be twice the number of above, just 2 end names per lines so # correct. perl -pi.bak -e \ 's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' badPairs.ends sort badPairs.ends | uniq > badPairs.ends.uniq # check that these have the right combination of ends - one forward and # one reverse. all ok. # for singles wc -l singles1lfName.bed # 187587 singles1lfName.bed grep ',' singles1lfName.bed | wc -l # 0 # should be 0 as there should only be one BAC end name per line. awk '{print $11}' singles1lfName.bed | sort | uniq > singles.ends wc -l singles.ends # 172981 singles.ends # some singles have more than 1 alignment so appear more than once. perl -pi.bak -e \ 's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1/g' singles.ends sort singles.ends | uniq > singles.ends.uniq # check that these have the the right suffixes for the BAC ends. all ok. # clean up rm *.bak *.ends *.ends2 *.uniq # Finally overlaps in BAC clone names were checked. All BAC clones # represented in each of the pairs, badPairs and singles bed files are # unique to that file. Between all three bed files, 302606 BAC clones # have alignments. foreach f (pairs2lfNames.bed badPairs2lfNames.bed singles1lfName.bed) awk '{print $4}' $f | sort | uniq > ${f}.names end wc -l *.names # 25421 badPairs2lfNames.bed.names # 147501 pairs2lfNames.bed.names # 129684 singles1lfName.bed.names # 302606 total comm -12 pairs2lfNames.bed.names badPairs2lfNames.bed.names comm -12 pairs2lfNames.bed.names singles1lfName.bed.names comm -12 badPairs2lfNames.bed.names singles1lfName.bed.names # None of these files should have any BAC clone names in common and # they do not so they are ok. # NOTE: using sort and uniq on hgwdev produces tab delimited output # after merging rows with the same BAC name, the scoring is now # wrong in the bed files. # Scores should be 1000 if there is 1 row for that name, else # 1500/number of rows for that sequence name - calculated by pslPairs. # Correct the scores. 
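# For illustration only (correctScores2.pl copied from danRer3 is what is
# actually run below): the scoring rule amounts to score = 1000 when a clone
# name has a single row in the bed file, otherwise 1500/numberOfRows. A rough
# awk equivalent, reading the bed file twice (file names are hypothetical):
#   awk 'BEGIN {FS=OFS="\t"} NR==FNR {n[$4]++; next} \
#        {$5 = (n[$4] == 1) ? 1000 : 1500/n[$4]; print}' in.bed in.bed > out.bed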
ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/bacEnds/scoresAndCoords cd /cluster/data/danRer4/bed/bacEnds/scoresAndCoords # copy over correctScores2.pl and checkscores.pl scripts from danRer3 and # scripts were edited so that hits file is split on space,not on tabs cp \ /cluster/data/danRer3/bed/bacends/scoresAndCoords/correctScores2.pl . cp \ /cluster/data/danRer3/bed/bacends/scoresAndCoords/checkScores.pl . awk '{print $4}' ../duplicates/pairs2lfNames.bed \ | sort | uniq -c > pairs.hits perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits \ noBin > bacEndPairsGoodScores.bed # same for singles awk '{print $4}' ../duplicates/singles1lfName.bed \ | sort | uniq -c > singles.hits perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits \ noBin > bacEndSinglesGoodScores.bed # and for badPairs awk '{print $4}' ../duplicates/badPairs2lfNames.bed \ | sort | uniq -c > badPairs.hits perl correctScores2.pl ../duplicates/badPairs2lfNames.bed \ badPairs.hits noBin > bacEndPairsBadGoodScores.bed # check that the scores are now correct awk '{print $4, $5}' bacEndPairsGoodScores.bed \ | sort | uniq -c > pairs.count perl checkScores.pl < pairs.count # all the BAC clones should be in good.txt and none in bad.txt # wc -l should give same number of lines in good.txt as in pairs.hits # and therefore bad.txt should be empty. # repeat for other bed files awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \ | sort | uniq -c > badPairs.count perl checkScores.pl < badPairs.count awk '{print $4, $5}' bacEndSinglesGoodScores.bed \ | sort | uniq -c > singles.count perl checkScores.pl < singles.count # for the singles, 6 ended up in bad.txt because their scores are # 214.285714285714 which is correct for 7 alignments. Rounding the score # caused the discrepancy. # round these values otherwise get a loading error when loading database: perl -pi.bak -e 's/214\.285714285714/214/' bacEndSinglesGoodScores.bed # clean up rm error.log *.txt *.count *.hits ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/scoresAndCoords # copy over table definition from danRer3 cp /cluster/data/danRer3/bed/bacends/singles/bacEndSingles.sql \ ../singles # Now load database tables: hgsql -e 'drop table bacEndPairs;' danRer4 hgLoadBed danRer4 bacEndPairs bacEndPairsGoodScores.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb # Loaded 154632 elements of size 11 hgsql -e 'drop table bacEndSingles;' danRer4 hgLoadBed danRer4 bacEndSingles bacEndSinglesGoodScores.bed \ -sqlTable=../singles/bacEndSingles.sql -notItemRgb # Loaded 187587 elements of size 11 hgsql -e 'drop table bacEndPairsBad;' danRer4 hgLoadBed danRer4 bacEndPairsBad bacEndPairsBadGoodScores.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb # Loaded 27281 elements of size 11 # clean up rm *.tab *.bak error.log # The Zv6 BAC end sequences are already in /gbdb/danRer4/bacends/ and # they have been loaded into the seq table - this is from the first section # on BACENDS tracks. No need to repeat this here. 
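# (optional check, not part of the original build) the seq table entries
# should point at the BAC end FASTA via the extFile table; assuming the
# standard seq/extFile schema (seq.extFile -> extFile.id, extFile.path):
#   hgsql -N -e 'select count(*) from seq s, extFile e where s.extFile = e.id and e.path like "%Zv6BacEnds.fa%";' danRer4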
# loaded BAC end sequences into seq table so alignments may be viewed # moved BAC ends to the ncbi directory previously # symlink to FASTA sequence file in ncbi directory mkdir -p /gbdb/danRer4/bacends ln -s /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \ /gbdb/danRer4/bacends/Zv6BacEnds.fa hgLoadSeq danRer4 /gbdb/danRer4/bacends/Zv6BacEnds.fa # use new extract program that extracts PSLs using name and position: ssh kkstore04 set bacDir=/cluster/data/danRer4/bed/bacEnds cd $bacDir/scoresAndCoords nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndPairsGoodScores.bed bacPairs.psl # for this, use bacEndPairsGoodScores.bed which was derived from # bacEndPairsBadNoOrphans since pairs orphans are already # included in bacEndSingles nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndPairsBadGoodScores.bed bacPairsBad.psl # then for singles nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndSinglesGoodScores.bed bacSingles.psl cd $bacDir cat $bacDir/scoresAndCoords/*.psl > allBacends.load.psl wc -l *.load.psl # 542725 allBacends.load.psl # load PSL file into database ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/ hgsql -e 'drop table all_bacends;' danRer4 hgLoadPsl danRer4 -table=all_bacends allBacends.load.psl # All alignments were loaded into the table - no problems. # check trackDb.ra entry and modify description. # Moved the searches up to the top level zebrafish trackDb.ra file # in trackDb/zebrafish/ since the searches are common to all zebrafish # assemblies. Deleted searches from each assembly trackDb.ra. ########################################################################### # CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES # (bacEndAlias, bacCloneAlias and bacCloneXRef) # (DONE, 2006-09-29 - 2006-10-27, hartera) # Process data and create bacEndAlias table ssh kkstore04 # create a list of BAC end names and their accessions # Downloaded BAC ends accessions from SRS # SRS at Sanger is no longer available. # Go to http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?CMD=search&DB=nucgss # This is dbGSS at NCBI: GSS is Genomic Sequence Survey # Search: Danio rerio[Organism] AND BAC # There are 159020 entries. This is the same as for the BACEndAccs.txt # for danRer3 in: /cluster/data/danRer3/bed/bacends/bacends.1 # getBacEndInfo.pl and extToIntNames.pl was used to create # BACEnd_accessions.txt. Use this from danRer3 to load table. cd /cluster/data/danRer4/bed/bacEnds cp /cluster/data/danRer3/bed/bacends/bacends.1/BACEnd_accessions.txt . grep '>' /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \ | sed -e 's/>//' > allBacEnds.names # copy over getBacEndInfov2.pl - this produces the bacEndAccs.aliases file cp /cluster/data/danRer3/bed/bacends/bacends.1/getBacEndInfov2.pl . 
# edit to remove section that creates pairs and singles files # and rename to getBacEndAliases.pl cat << 'EOF' > getBacEndAliases.pl #!/usr/bin/perl -w use strict; my $file = $ARGV[0]; # list of BAC end sequence read Sanger names my $file2 = $ARGV[1]; # list of BAC ends and GenBank accessions # translation for sequence prefixes from Sanger internal names to external names my %cloneHash = qw { zC CH211- ZC CH211- zK DKEY- zKp DKEYP- bZ RP71- dZ BUSM1- CHORI73_ CH73- }; # need to get bacends into pairs and singles # find duplicates also # Get and store BAC ends and accessions my %bacEnds; open (BACENDS, $file2) || die "Can not open $file2: $!"; while () { chomp; my ($be, $a) = split(/\t/); print "bac end $be and acc is $a\n"; $bacEnds{$be} = $a; } close BACENDS; my %bacs; my %bacAccs; open(FILE, $file) || die "Can not open $file: $!"; open(STDERR, ">bacs.log") || die "Can not create bacs.log: $!"; open(OUT, ">direction.txt") || die "Can not create direction.txt:$!"; open(ACCS, ">bacEndAccs.aliases") || die "Can not create bacEndsAccs.aliases: $!"; while () { chomp; my $seqName = $_; print "seqName is $seqName here\n"; $seqName =~ /^([CHORI73]*[|z|Z|b|d]?[C|K|Z|_]p?)([0-9]+[A-Z][0-9]+)\.?[pq1k]*(SP6|T7|ASP6|AT7|SP6W|T7W|y|z|Z)/; my $prefix = $1; my $rest = $2; print "prefix is $prefix, rest is $rest\n"; my $dir = $3; print STDERR "dir is $dir\n"; print OUT "$dir\n"; my $direction; # forward or reverse direction if (($dir =~ /SP6/) || ($dir =~ /T7/) ) { $direction = $dir; } # reverse direction (as in convertZfishBacEndInfo) elsif ($dir =~ /(sp6)/i || $dir =~ /y/i) { $direction = "SP6"; } # forward direction (as in convertZfishBacEndInfo) elsif ($dir =~ /(t7)/i ||$dir =~ /z/i) { $direction = "T7"; } else { print STDERR "seqName is $seqName - direction not found\n"; } print "dir is $dir and direction is $direction\n"; my $extName = ""; my $intName = $prefix.$rest; print "prefix is $prefix\n"; my $mid = ""; $mid = $rest; $mid =~ s/\-//; $mid =~ tr/a-z/A-Z/; print "after trans, mid is $mid here\n"; if ($mid =~ /^([A-Z]*)0*([0-9]+[A-Z]+)0*([0-9]+$)/) { print "matched mid $mid here\n"; my $new = $1.$2.$3; $mid = $new; print "new mid is $mid\n"; } if (exists ($cloneHash{$prefix})) { my $extPrefix = $cloneHash{$prefix}; $extName = $extPrefix.$mid; print "External name is $extName\n"; } else { $extName = ""; } # need to get duplicate clones, if switch to lower case and remove # . 
and - and use as key to bacs hash # add the internal and external name for BAC to hash my $fullName = $seqName; # my $intNameStem = $intName; my $upDir = $dir; $dir =~ tr/a-z/A-Z/; # preserve prefix and change middle part of name to upper case my $upperIntName = $prefix.$mid; my $upperFullName = $prefix.$mid.$dir; print "upper internal name is $upperIntName here\n"; # my $newFullName = ""; print "internal name is $intName, altered seq name is $upperIntName\n"; print "full name is now $upperFullName\n"; if (exists($bacEnds{$upperFullName})) { my $ac = $bacEnds{$upperFullName}; print "seq is $upperFullName; acc is $ac\n"; $bacs{$upperIntName}->{$upperFullName}->{acc} = $ac; } push (@{$bacs{$upperIntName}->{$upperFullName}->{seqs} }, $seqName); $bacs{$upperIntName}->{$upperFullName}->{extName} = $extName; $bacs{$upperIntName}->{$upperFullName}->{direction} = $direction; if (exists($bacAccs{$upperIntName}) ){ my $bacAcc = $bacAccs{$upperIntName}; print "bacacc is $bacAcc\n"; $bacs{$upperIntName}->{$upperFullName}->{bacAcc} = $bacAcc; } if (exists($bacEnds{$upperFullName} )) { my $bacEndAcc = $bacEnds{$upperFullName}; print "bacendacc is $bacEndAcc\n"; $bacs{$upperIntName}->{$upperFullName}->{bacEndAcc} = $bacEndAcc; } } close FILE; # print accessions for BacEnds with BAC end aliases my $count = 0; print "printing accessions.\n"; foreach my $a (keys(%bacs)) { print "$a is bac end from bacEnds hash\n"; foreach my $f (keys %{ $bacs{$a} } ) { if (exists($bacs{$a}->{$f}->{acc} ) ) { my $acc = $bacs{$a}->{$f}->{acc}; my @ids = @{$bacs{$a}->{$f}->{seqs} }; foreach my $i (@ids) { $count++; print ACCS "$i\t$count\t$acc\n"; } } } } 'EOF' chmod +x getBacEndAliases.pl perl getBacEndAliases.pl allBacEnds.names BACEnd_accessions.txt \ > bacEnds.log wc -l bacEndAccs.aliases # 159370 bacEndAccs.aliases # clean up rm *.log direction.txt # Only the DKEY- library clone ends have accessions in Genbank # load this alias table and accessions for clone ends ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds # Carry on and process this file into the bacEndAlias table. hgLoadSqlTab danRer4 bacEndAlias ~/kent/src/hg/lib/bacEndAlias.sql \ bacEndAccs.aliases # Loaded successfully. # Get the latest versions of the clonemarkers, contig names and markers # files from Sanger: Provided by Mario Caccamo (mc2@sanger.ac.uk) # at the Sanger Institute. ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases # Problem with the markers file - generated incorrectly. Contacted # Sanger to ask for new markers file on 10/12/06 and new set of files # were put up for ftp on 10/26/06. Another problem with markers file # was found - there was a number in the second field instead of the # sanger sts name which is an ID beginning with "et" or "st". Notified' # Sanger and new files put out for ftp on 10/27/06. 
wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/clonemarkers.27.10.06.txt wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/ctgnames.27.10.06.txt wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/markers.27.10.06.txt wc -l *27.10.06.txt # 32612 clonemarkers.27.10.06.txt # 168828 ctgnames.27.10.06.txt # 12407 markers.27.10.06.txt # get list of BAC end names, lfNames foreach f (../scoresAndCoords/*.bed) echo $f awk '{print $11;}' $f >> allBacEnds.names end wc -l allBacEnds.names # 369500 allBacEnds.names # this is the total number of lines in the *.bed files perl -pi.bak -e 's/,/\n/g' allBacEnds.names sort allBacEnds.names | uniq > allBacEnds.names.uniq # get list of BAC clone names foreach f (bacEndPairs bacEndPairsBad bacEndSingles) awk '{print $4}' \ /cluster/data/danRer4/bed/bacEnds/scoresAndCoords/${f}GoodScores.bed \ >> bacs.names end sort bacs.names | uniq > bacs.names.uniq wc -l *.uniq # 518827 allBacEnds.names.uniq # 302606 bacs.names.uniq # from psl file awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names # remove first few lines with no names tail +6 bacEndsPsl.names | sort | uniq > bacEndsPsl.names.uniq wc -l bacEndsPsl.names.uniq # 549034 bacEndsPsl.names.uniq # this is all the BAC ends that originally had alignments # Add an alias table for BAC clones # bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc # Add a xref table to give external clone registry names, internal names # sanger name, relationship between STS and BAC clone (method of finding # STS), UniSTS ID, chromosomes(s) to which BAC clone is mapped by BLAT, # Genbank accession and STS primer sequences # bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc set dir=/cluster/data/danRer4/bed/bacEnds/ awk 'BEGIN {OFS="\t"}{print $4, $1}' \ $dir/scoresAndCoords/bacEndPairsGoodScores.bed > bacClones.namesandchrom awk 'BEGIN {OFS="\t"}{print $4, $1}' \ $dir/scoresAndCoords/bacEndSinglesGoodScores.bed >> bacClones.namesandchrom sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq wc -l bacClones.namesandchrom.uniq # 306079 bacClones.namesandchrom.uniq # so created a list of names and chroms for BAC clones only in pairs # and singles, exclude bad Pairs since this track is not shown on RR. # use a list of internal names,Genbank accessions, and BAC clone names # use BACClonesIdsandAccs.txt. 
# get list of UniSTS IDs using aliases to search alias file # print Sanger name, alias and UniSTS ID, use find_markers3.pl cat << '_EOF_' > find_markers3.pl # example: # perl find_markers.pl UniSTS.aliases markers.02.12.04.txt use strict; my $verbose = 0; my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest); my $aliasFile = $ARGV[0]; my $markersFile = $ARGV[1]; open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n"; open(MARKERS, $markersFile) || die "Can not open $markersFile\n"; # store aliases from aliasFile my ($id, $al, @alsArray, %aliasHash); while () { chomp; ($id, $al) = split /\t/; @alsArray = split(/;/, $al); foreach my $as (@alsArray) { push (@{$aliasHash{$as} }, $id); } } close ALIAS; while () { my @idArray; ($f, $t, $m, $idArray[0]) = 0; my @ids; chomp; ($a, $b, $aliases, @rest) = split /\|/; if ($verbose > 3) { printf "aliases $aliases \n"; } @alias = split /;/, $aliases; ALIAS: foreach $s (@alias) { if ($s =~ /[\D]+/) { if ($verbose > 5) { printf "this $s \n"; } if (exists($aliasHash{$s})) { @idArray = @{$aliasHash{$s}}; } if ($idArray[0]) { $f = 1; $t = $s; @ids = @idArray; if ($verbose) { printf "this $s found $m \n"; } last ALIAS; } } } if ($f) { my @sNames = split(/;/, $b); foreach my $sn (@sNames) { foreach my $i (@ids) { printf "$sn\t$i\n"; } } } } close MARKERS; '_EOF_' chmod +x find_markers3.pl # download latest version of UniSTS (2006-10-26) ssh kkstore02 mkdir -p /cluster/store5/sts.2006-10-26 ln -s /cluster/store5/sts.2006-10-26 /cluster/data/ncbi cd /cluster/data/ncbi/sts.2006-10-26 wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz mkdir -p /cluster/store5/UniSTS.2006-10-26 ln -s /cluster/store5/UniSTS.2006-10-26 /cluster/data/ncbi cd /cluster/data/ncbi/UniSTS.2006-10-26 wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases wget --timestamp -r l1 \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/ mv /cluster/data/ncbi/UniSTS.2006-10-26/ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio /cluster/data/ncbi/UniSTS.2006-10-26 rm -r /cluster/data/ncbi/UniSTS.2006-10-26/ftp.ncbi.nih.gov # then back to danRer4 BAC ends tables: ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases # change internal names in files to have CHORI73 instead of zH to # keep names the same as those used in the BAC end tables. perl -pi.bak -e 's/zH([0-9]+)/CHORI73_$1/' *.27.10.06.txt perl find_markers3.pl /cluster/data/ncbi/UniSTS.2006-10-26/UniSTS.aliases \ markers.27.10.06.txt > sangerandUniSTSId.txt # Need to sort and uniq this file since the UniSTS IDs are being # replicated for each instance of the sanger name in field 2 of the # markers file. In some cases the sanger name is replicated. sort sangerandUniSTSId.txt | uniq > sangerandUniSTSId.uniq # No need to reformat this for zfishBacClonesandSts # FPC contig information (i.e. FPC contig number) from ctgnames file is # not included in the tables as these are dynamic and constantly # changing with the assembly. # bacs.names.uniq has the list of BACS in this track # Get accessions for BAC clones from Genbank (as for danRer3) # go to http://www.ncbi.nlm.nih.gov # 1) select "Nucleotide" as the search database. # 2) Search string: # Danio rerio[ORGN] AND clone[TITL] NOT survey[TITL] # Including only those with BAC in the record seems to exclude some of the # BAC clones as well as other types of sequence so this "BAC" was not # used in the search. 
# Those sequences with "genomic survey" in the title appear to be
# BAC clone end accessions. Here, we want only BAC clone accessions.
# 3) There are 1148560 sequences (2006-10-27). Select File from the Send To
#    pulldown menu and name the file "BACClones.gbAccs.txt".
# use the script from danRer3 to parse out the clone ID and the accession:
cat << '_EOF_' > getAccsandIdsFromGb.pl
#!/usr/bin/perl -w
use strict;
my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-",
                     "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw {
    CH211- zC
    DKEY- zK
    DKEYP- zKp
    RP71- bZ
    BUSM1- dZ
    CH73- CHORI73_
};
my $found = "FALSE";
my $acc = "";
my $id = "";
while (<STDIN>) {
    my ($l, @f, $intId, $extPref, $intPref);
    $intPref = "";
    $extPref = "";
    chomp;
    $l = $_;
    if ($l =~ /^[0-9]+:\s+([A-Z]+[0-9]{3,})/) {
        $acc = "";
        $acc = $1;
        $found = "FALSE";
    }
    elsif ($l =~ /clone/) {
        $id = "";
        # check for clone name in this line
        foreach my $p (@clonePrefixes) {
            if ($l =~ /clone:?\s?($p[0-9-A-Za-z]+)/) {
                $id = $1;
                # translate to upper case
                $id =~ tr/a-z/A-Z/;
                $extPref = $p;
                $found = "TRUE";
            }
        }
    }
    if ($found eq "TRUE") {
        if (exists($cloneHash{$extPref})) {
            $intPref = $cloneHash{$extPref};
        }
        $intId = $id;
        # translate this to internal ID
        $intId =~ s/$extPref/$intPref/;
        print "$intId\t$acc\t$id\n";
        $found = "FALSE";
    }
}
'_EOF_'
chmod +x getAccsandIdsFromGb.pl
nice perl getAccsandIdsFromGb.pl < BACClones.gbAccs.txt \
    > BACClonesIdsandAccs.txt &
# Took about 1 minute
# compare the BAC clones for which accessions were found to those
# for danRer3:
awk '{print $3}' BACClonesIdsandAccs.txt | sort | uniq \
    > clonesWithAccs.dr4
awk '{print $3}' \
    /cluster/data/danRer3/bed/bacends/bacends.1/BACClonesIdsandAccs.txt \
    | sort | uniq > clonesWithAccs.dr3
comm -13 clonesWithAccs.dr4 clonesWithAccs.dr3
# DKEY-188F22
# DKEY-30O13
# Checked these by searching for each one in the Nucleotide database
# at GenBank. DKEY-30O13 only has accessions for the
# end sequences. DKEY-188F22 has an accession for the BAC clone: AP007256.
# For some reason this was not found by the search.
# Add this to the list:
echo "zK188F22\tAP007256\tDKEY-188F22" >> BACClonesIdsandAccs.txt
# use zfishBacClonesandSts to create tab files for loading into the
# bacCloneAlias and bacCloneXRef tables
# make output directory
mkdir out
# Asked Sanger for another version of the file with the Sanger sts aliases
# instead of these numbers in the second field of the markers file
# (2006-10-26). Received new file (2006-10-27).
# Increased NUMSANGER from 5 to 40 and MAXSANGER from 50 to 60
# because there are multiple occurrences of Sanger names in the second
# field of the markers file and this can be quite a long list.
# The clonemarkers file now has 0 for relationship where before it was blank.
# Change this to blank again otherwise it is processed incorrectly.
perl -pi.bak -e 's/\|0/\|/' clonemarkers.27.10.06.txt
nice $HOME/bin/x86_64/zfishBacClonesandSts ctgnames.27.10.06.txt \
    clonemarkers.27.10.06.txt markers.27.10.06.txt \
    bacClones.namesandchrom.uniq BACClonesIdsandAccs.txt \
    sangerandUniSTSId.uniq ./out > ./out/zfishBacs.out &
# output is in the out directory so copy over
cp ./out/*.tab .
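# Optional check, not in the original build: before sorting and loading,
# confirm that each output file from zfishBacClonesandSts is consistently
# tab-delimited; every row of a given .tab file should report the same
# field count (no assumption is made here about what that count is).
foreach t (*.tab)
    echo $t
    awk 'BEGIN {FS="\t"} {print NF}' $t | sort -n | uniq -c
end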
# sort alias tab file by sangerName wc -l *.tab # 120211 bacAlias.tab # 507274 bacXRef.tab # make sure there are no replicate lines: # also sort alias tab file by sangerName sort bacAlias.tab | uniq | sort -k2 > bacAlias.sort.tab.uniq sort bacXRef.tab | uniq > bacXRef.tab.uniq wc -l bac*.tab.uniq # 58758 bacAlias.sort.tab.uniq # 353042 bacXRef.tab.uniq ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases hgsql -e 'drop table bacCloneAlias;' danRer4 hgsql -e 'drop table bacCloneXRef;' danRer4 hgLoadSqlTab danRer4 bacCloneAlias \ $HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab.uniq hgLoadSqlTab danRer4 bacCloneXRef \ $HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab.uniq ########################################################################### # BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES # (DONE, 2006-10-27, hartera) # The following tests were carried out to check that all the data # in the bacCloneAlias and bacCloneXRef tables is correct. ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases mkdir -p testTables cd testTables # copy scripts over from danRer3: cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/getName*.pl . cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/getSanger*.pl . cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/formatUniSts.pl . # scripts were created for danRer2 - see danRer2.txt # Check that the correct aliases are associated with their Sanger STS names awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \ ../markers.27.10.06.txt > sNameandaliases # use script to get one Sanger name and one alias on each line perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format sort sNameandaliases.format | uniq > sNameandaliases.sort # get Sanger names and aliases from database hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer4 \ | sort | uniq > alias.db.sort wc -l alias.db.sort # 58758 alias.db.sort diff sNameandaliases.sort alias.db.sort # No difference between data file and data from database so ok # Check Sanger STS names correspond in bacAlias and bacCloneXRef tables # get Sanger names from alias table hgsql -N -e 'select sangerName from bacCloneAlias;' danRer4 \ | sort | uniq > sName.alias.sort wc -l sName.alias.sort # 15595 sName.alias.sort # get Sanger names from xRef table hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \ is not null;' danRer4 | sort | uniq > sName.xRef.sort wc -l sName.xRef.sort # 15946 sName.xRef.sort comm -23 sName.alias.sort sName.xRef.sort # nothing unique to alias file so all Sanger names in the alias table are # also in the xRef table comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias wc -l sNamexRefNotAlias # 351 sNamexRefNotAlias awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.27.10.06.txt | sort | uniq \ > clonemarkers.sNames.sort # get Sanger names from markers file awk 'BEGIN {FS="|"}{print $2}' ../markers.27.10.06.txt > markers.sNames # remove semi-colons and sort sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort # sanger names unique to markers file comm -13 clonemarkers.sNames.sort markers.sNames.sort # there are none comm -23 clonemarkers.sNames.sort markers.sNames.sort \ > sNames.clonemarkersOnly wc -l sNames.clonemarkersOnly # 351 sNames.clonemarkersOnly diff sNames.clonemarkersOnly sNamexRefNotAlias # No difference so all the extra Sanger Names in the xRef # table are from the clonemarkers file and these have no aliases in # the markers file so they are not in the alias table so this is all 
ok. # Check that Sanger STS names and primers are associated correctly cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases/testTables # get sanger names and primers from markers file awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \ ../markers.27.10.06.txt > sNameandPrimers # use script to reformat and write with one Sanger name per line chmod +x getSangerandPrimers.pl perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format # Need to sort and uniq due to multiple occurrences of the same # Sanger name in some lines of the markers file. sort sNameandPrimers.format | uniq > sNameandPrimers.format.sort wc -l sNameandPrim* # 12407 sNameandPrimers # 32098 sNameandPrimers.format # 15595 sNameandPrimers.format.sort # get Sanger names and primers from database hgsql -N -e \ 'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \ where sangerName is not null and leftPrimer is not null and \ rightPrimer is not null;' danRer4 | sort | uniq \ > sNamesandprimers.fromdb.sort wc -l sNamesandprimers.fromdb.sort # 15595 sNamesandprimers.fromdb.sort diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort # No difference so ok. # Check that UniSTS IDs and Sanger STS names are associated correctly # get Sanger names and UniSTS IDs from the database hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \ uniStsId is not null;' danRer4 | sort | uniq > sNameUniSTS.fromdb.sort wc -l sNameUniSTS.fromdb.sort # 5699 sNameUniSTS.fromdb.sort # Need to reformat the sNameUniSTS.fromdb.sort chmod +x formatUniSts.pl perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \ > sNameUniSTS.fromdb.format.sort # get Sanger names from data file and see how many UniSTS IDs there are # for each name awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \ > sangerandUniSTSId.count # the most is 160 - this is high due to replicate occurrences of sanger # STS names (sangerName) in the markers file. Replicates are removed # during processing. # 160 etID9511.14 # 132 etID8743.18 # 124 etID9682.15 # 124 etID9681.15 # 96 etID10372.18 # 84 etID8170.14 # 76 etID10495.5 # 66 etID9328.14 # 56 etID9708.3 # use uniq'd file used to create database tables. sort ../sangerandUniSTSId.uniq > sangerandUniSTSId.txt.sort diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort # No difference between data from original file and that in database so ok # Check that chrom mappings and external BAC clone names are correct # get extNames and chroms they map to from the database hgsql -N -e 'select name, chroms from bacCloneXRef where \ chroms is not null;' danRer4 | sort | uniq \ > nameandchromsfromdb.sort # reformat nameandchromsfromdb.sort perl formatUniSts.pl < nameandchromsfromdb.sort | sort \ > nameandchromsfromdb.format.sort # compare extNames and chroms from db to those in data file cp ../bacClones.namesandchrom . sort -u bacClones.namesandchrom > bacClones.namesandchrom.uniq diff bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort # no difference - all ok # Check Genbank accessions and internal BAC clone names hgsql -N -e 'select intName,genbank from bacCloneXRef where \ genbank is not null;' danRer4 | sort | uniq \ > intNamesandAccs.fromdb.sort # this should be a subset of zfish_accsMerged.txt - not all BAC clones # listed here appear in either our BAC ends tracks or the markers files. 
awk 'BEGIN {OFS="\t"} {print $1,$2}' ../BACClonesIdsandAccs.txt \ | sort -u > BACClonesIntandAccs.sort comm -23 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort # there is nothing in the database that is not in BACClonesIntandAccs.sort comm -13 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort \ > onlyinzfishAccs wc -l onlyinzfishAccs # 86 onlyinzfishAccs hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \ danRer4 | sort | uniq > intNamesNoAcc.fromdb.sort awk '{print $1;}' BACClonesIntandAccs.sort > intNames.withAccs.sort comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \ > indbNoAccsandAccs.out # none of these names are common to both so all accessions from # BACClonesIdsandAccs.txt are in the database for the internal names stored # where there are accessions available. # Test Sanger STS names, internal names and external names are all correct # Test Sanger STS name and internal BAC clone names are associated correctly # get internal names and Sanger names from data file awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.27.10.06.txt \ | sort | uniq > intNameandSanger.sort hgsql -N -e 'select intName, sangerName from bacCloneXRef \ where sangerName is not null;' danRer4 \ | sort | uniq > intNameandSanger.fromdb.sort diff intNameandSanger.sort intNameandSanger.fromdb.sort # No difference between data from file and that from database so ok # Check BAC clone internal name and relationship fields # get internal names and relationships from data file awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.27.10.06.txt \ | sort | uniq > intNameandRelation.sort # get internal names and relationships from database, some internal names # may have different relationships associated with each internal name # and Sanger sts name pair hgsql -N -e 'select intName, relationship from bacCloneXRef \ where relationship != 0;' danRer4 \ | sort | uniq > intNameandrelation.fromdb.sort # differences unique to database file comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \ > intNameRelation.indbonly # differences unique to data file comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \ > intNameRelation.incloneMarkersonly wc -l intNameRelation* # 5051 intNameRelation.incloneMarkersonly # 5051 intNameRelation.indbonly awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names awk '{print $1}' intNameRelation.incloneMarkersonly \ > intNameRelation.incloneMarkersonly.names diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names # there is no difference in the internal names with relationship fields # no difference in names and the only places these should differ is that # the second column should all be 3 in the data from the database only. # this is because all the relationship entries that were blank were # in the clonemarkers file were changed to 3 when entered into the database. awk '{print $2}' intNameRelation.indbonly | sort | uniq # 3 - correct so all ok # all the differences should be that those that are blank in clonemarkers # are 3 in the database. 
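# Optional spot check, not part of the original build: look at the overall
# distribution of relationship codes actually loaded. Per the notes above
# and below, blank relationships from the clonemarkers file become 3 in the
# table and internal names with no marker at all get 0.
hgsql -N -e 'select relationship, count(*) from bacCloneXRef group by relationship;' danRer4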
# Check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from clonemarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
    danRer4 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.10.06.txt \
    | sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files, as expected, so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check that all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from the ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.27.10.06.txt \
    | sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from the database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer4 \
    | sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
# 334890 intNameandextName.fromdb.sort
# 168828 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
# 168828 intandextindbAndCtgNames
# there are 168828 name pairs in common between the file and the database
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database, so all internal and
# external names from the ctgnames file are in the database
# get the list of extra ones from the db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
# 166062 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.10.06.txt | sort | uniq \
    > clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
# 14460 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clonemarkers internal names are in this list so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC clones
# to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from baclones.namesandchrom.uniq and from ctgnames awk '{print $1}' ../bacClones.namesandchrom.uniq > \ extNames.ctgnamesandbacClones awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.27.10.06.txt \ >> extNames.ctgnamesandbacClones wc -l extNames.ctgnamesandbacClones # 474907 extNames.ctgnamesandbacClones sort extNames.ctgnamesandbacClones | uniq \ > extNames.ctgnamesandbacClones.sort wc -l extNames.ctgnamesandbacClones.sort # 334890 extNames.ctgnamesandbacClones.sort # get extNames from the database hgsql -N -e 'select name from bacCloneXRef;' danRer4 | sort | uniq \ > extNames.fromdb.sort wc -l extNames.fromdb.sort # 334890 extNames.fromdb.sort comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \ > extNames.fromdbandfiles wc -l extNames.fromdbandfiles # 334890 extNames.fromdbandfiles # find extNames in common from data files and database diff extNames.fromdb.sort extNames.fromdbandfiles # no difference, all extNames from files are in db # Check that all BAC clone internal names from the ctgnames and clonemarkers # files are in the database # get internal names from ctgnames and clonemarkers files awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.27.10.06.txt \ > intNames.ctgnamesandclonemarkers awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.27.10.06.txt \ >> intNames.ctgnamesandclonemarkers wc -l intNames.ctgnamesandclonemarkers # 201440 intNames.ctgnamesandclonemarkers sort intNames.ctgnamesandclonemarkers | uniq \ > intNames.ctgnamesandclonemarkers.sort wc -l intNames.ctgnamesandclonemarkers.sort # 168828 intNames.ctgnamesandclonemarkers.sort # get internal names from database hgsql -N -e 'select intName from bacCloneXRef;' danRer4 | sort | uniq \ > intNames.fromdb.sort wc -l intNames.fromdb.sort # 334890 intNames.fromdb.sort # some of these intNames are derived from the corresponding extNames # all of the intNames from the file should be in the db comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \ > intNames.fromdbandfiles wc -l intNames.fromdbandfiles # 168828 intNames.fromdbandfiles comm -13 intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort comm -23 intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort # no difference, all intNames from files are in db # Check that all translations are correct between BAC clone # external and internal names. # write script to get the prefixes from internal and external names chmod +x getNamePrefixes.pl hgsql -N -e 'select name, intName from bacCloneXRef;' danRer4 \ | sort | uniq > extandintNames.fromdb.sort perl getNamePrefixes.pl < extandintNames.fromdb.sort \ > extandintNames.prefixes sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq # these all look good # BUSM1 dZ # CH211 zC # CH211 zc # CH73 CHORI # CT7 bP # DKEY zK # DKEY zk # DKEYP zKp # RP71 bZ # XX bY # zk is a internal name prefix for the external name prefix, DKEY-. There # is only one example where this is used (DKEY-81G7) and this in the # ctgnames file and is in the bacCloneXRef table so that is ok. # All data looks good in these tables now. ########################################################################### # SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM OTHER DANRER ASSEMBLIES # (DONE, 2006-06-27, hartera) # ADD TO SAN FOR PK RUNS (DONE, 2006-05-30, hartera) ssh kkr3u00 # change script to do this and only rsync to 4,5,6,7, and 8 as # kkr1u00 and kkr2u00 are down. 
cd /cluster/data/danRer4/bed mkdir -p liftOver cd liftOver # commented out lines in local copy that makes the script abort if # kkr1u00 not used. can not connect to kkr1u00 at the moment. ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh danRer4 \ /cluster/data/danRer4/nib >&! split.log & # rsync didn't work properly so do manually foreach R (4 5 6 7 8) rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/ end ssh kk # add split10k to san for pk runs (2006-05-30, hartera) rsync -a --progress /iscratch/i/danRer4/split10k \ /san/sanvol1/scratch/danRer4/ ########################################################################### # LIFTOVER CHAINS TO DANRER3 (DONE, 2006-05-30 = 2006-05-31, hartera) # Split (using makeLoChain-split) of danRer3 is doc'ed in makeDanRer3.doc # Do what makeLoChain-split says to do next (start blat alignment) # Took too long on kk. Try pk. Scripts only run on kk so run manually. ssh pk mkdir -p /cluster/data/danRer4/bed/liftOver cd /cluster/data/danRer4/bed/liftOver cat << '_EOF_' > align.csh #!/bin/csh -fe set oldAssembly = $1 set oldNibDir = $2 set newAssembly = $3 set newSplitDir = $4 set ooc = $5 if ("$ooc" != "") then set ooc = '-ooc='$ooc endif set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d` echo "Setting up blat in $blatDir" rm -fr $blatDir mkdir $blatDir cd $blatDir mkdir raw psl run cd run echo '#LOOP' > gsub echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \ '-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \ >> gsub echo '#ENDLOOP' >> gsub # target ls -1S $oldNibDir/*.{nib,2bit} > old.lst # query ls -1S $newSplitDir/*.{nib,fa} > new.lst gensub2 old.lst new.lst gsub spec /parasol/bin/para create spec echo "" echo "First two lines of para spec:" head -2 spec echo "" echo "DO THIS NEXT:" echo " cd $blatDir/run" echo " para try, check, push, check, ..." echo "" exit 0 '_EOF_' # << emacs chmod +x align.csh align.csh danRer4 /san/sanvol1/scratch/danRer4/nib danRer3 \ /san/sanvol1/scratch/danRer3/split10k \ /san/sanvol1/scratch/danRer3/danRer3_11.ooc >&! align.log & # Took a few seconds. # Do what its output says to do next (start cluster job) cd /cluster/data/danRer4/bed/blat.danRer3.2006-05-30/run para try, check, push, check, ... para time >&! run.time # Completed: 784 of 784 jobs # CPU time in finished jobs: 1482693s 24711.54m 411.86h 17.16d 0.047 y # IO & Wait Time: 2873s 47.89m 0.80h 0.03d 0.000 y # Average job time: 1895s 31.58m 0.53h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 11350s 189.17m 3.15h 0.13d # Submission to last job: 13914s 231.90m 3.87h 0.16d ssh pk cd /cluster/data/danRer4/bed/liftOver cat << '_EOF_' > lift.csh #!/bin/csh -ef set oldAssembly = $1 set newAssembly = $2 set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly set blatDir = `ls -td $prefix.20* | head -1` echo "using dir $blatDir" if ( ! 
-e $blatDir/raw ) then echo "Can't find $blatDir/raw" endif if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then echo "Can't find any .lft files in $newLiftDir" exit 1 endif cd $blatDir/raw foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`) echo $chr liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl end set execDir = $0:h echo "" echo "DO THIS NEXT:" echo " ssh pk" echo " $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>" echo "" exit 0 '_EOF_' # << emacs chmod +x lift.csh lift.csh danRer4 danRer3 >&! lift.log & # makeLoChain-chain can be run on pk. chain alignments makeLoChain-chain danRer4 /san/sanvol1/scratch/danRer4/nib \ danRer3 /san/sanvol1/scratch/danRer3/nib >&! chain.log & cd /cluster/data/danRer4/bed/blat.danRer3.2006-05-30/chainRun para try, check, push, check, ... para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 4030s 67.16m 1.12h 0.05d 0.000 y # IO & Wait Time: 939s 15.66m 0.26h 0.01d 0.000 y # Average job time: 177s 2.96m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 797s 13.28m 0.22h 0.01d # Submission to last job: 953s 15.88m 0.26h 0.01d # net alignment chains ssh kkstore04 cd /cluster/data/danRer4/bed/liftOver makeLoChain-net danRer4 danRer3 >&! net.log & # load reference to over.chain into database table, # and create symlinks /gbdb and download area ssh hgwdev cd /cluster/data/danRer4/bed/liftOver makeLoChain-load danRer4 danRer3 >&! load.log & # clean up rm *.log # add md5sum.txt to include this new liftOver file cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver md5sum *.gz > md5sum.txt # copy README.txt from another liftOver directory. # test by converting a region using the "convert" link on # the browser, and comparing to blat of the same region ########################################################################### # PRODUCING GENSCAN PREDICTIONS (DONE, 2006-05-27, hartera) # Use scaffolds for random chroms to avoid getting false predictions # spanning scaffolds in chrNA_random and chrUn_random. ssh kkstore04 cd /cluster/data/danRer4 # already have a file of soft-masked scaffolds for chrNA_random and # chrUn_random. Use this to create hard-masked scaffolds FASTA file # for Genscan run. foreach c (NA_random Un_random) cd /cluster/data/danRer4/$c mkdir scaffoldsHardMask echo "Hard-masking scaffolds for $c ..." cd scaffoldsSoftMask foreach f (*.fa) maskOutFa $f hard ../scaffoldsHardMask/${f}.masked end end ssh hgwdev mkdir /cluster/data/danRer4/bed/genscan cd /cluster/data/danRer4/bed/genscan cvs co hg3rdParty/genscanlinux ssh pk cd /cluster/data/danRer4/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null genome.list foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo $c if (($c == "NA_random") || ($c == "Un_random")) then foreach s (/cluster/data/danRer4/${c}/scaffoldsHardMask/Zv6_*.fa.masked) egrep '[ACGT]' $s > /dev/null if ($status == 0) echo $s >> genome.list end else foreach f ( `ls -1S /cluster/data/danRer4/$c/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end endif end wc -l genome.list # 3237 genome.list # Create template file, gsub, for gensub2. 
For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check ... etc. para time # Completed: 3236 of 3237 jobs # Crashed: 1 jobs # CPU time in finished jobs: 46601s 776.69m 12.94h 0.54d 0.001 y # IO & Wait Time: 10409s 173.48m 2.89h 0.12d 0.000 y # Average job time: 18s 0.29m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 363s 6.05m 0.10h 0.00d # Submission to last job: 445s 7.42m 0.12h 0.01d # If there are crashes, diagnose with "para problems" / "para crashed". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". para problems > problems nice /cluster/bin/x86_64/gsBig /cluster/data/danRer4/8/chr8_5/chr8_5.fa.masked gtf/chr8_5.fa.gtf -trans=pep/chr8_5.fa.pep -subopt=subopt/chr8_5.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 >& chr8_5.fa.log & # Took about 5 minutes to run # check log and then remove it rm chr8_5.fa.log ssh kkstore04 cd /cluster/data/danRer4/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/danRer4/bed/genscan ldHgGene danRer4 genscan genscan.gtf # Read 44534 transcripts in 325488 lines in 1 files # 44534 groups 28 seqs 1 sources 1 feature types # 44534 gene predictions hgPepPred danRer4 generic genscanPep genscan.pep hgLoadBed danRer4 genscanSubopt genscanSubopt.bed # Loaded 332782 elements of size 6 # compare to other assemblies: featureBits danRer4 genscan # 64448019 bases of 1626093931 (3.963%) in intersection featureBits rn4 genscan # 54781052 bases of 2571531505 (2.130%) in intersection featureBits monDom4 genscan # 45991425 bases of 3501643220 (1.313%) in intersection featureBits tetNig1 genscan # 30459626 bases of 342403326 (8.896%) in intersection featureBits -chrom=chr1 refGene genscan -enrichment # refGene 1.129%, genscan 4.195%, both 0.653%, cover 57.80%, enrich 13.78x # check CDS only featureBits -chrom=chr1 danRer4 refGene:cds genscan:cds -enrichment # refGene:cds 0.746%, genscan:cds 4.195%, both 0.631%, cover 84.52%, # enrich 20.15x ########################################################################### # BLASTZ/CHAIN/NET GALGAL3 (DONE 5/30/06 angie) ssh pk mkdir /cluster/data/danRer4/bed/blastz.galGal3.2006-05-30 cd /cluster/data/danRer4/bed/blastz.galGal3.2006-05-30 cat << '_EOF_' > DEF # zebrafish vs. 
chicken BLASTZ=/cluster/bin/penn/i386/blastz # Use same params as used for danRer1-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Zebrafish danRer4 SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit SEQ1_CTGDIR=/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit SEQ1_LIFT=/san/sanvol1/scratch/danRer4/liftNAandUnScaffoldsToChrom.lft SEQ1_LEN=/cluster/data/danRer4/chrom.sizes SEQ1_CTGLEN=/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Chicken galGal3 - single chunk big enough to run while chrom SEQ2_DIR=/san/sanvol1/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/danRer4/bed/blastz.galGal3.2006-05-30 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/scratch/danRer4GalGal3 \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.galGal3.2006-05-30 /cluster/data/danRer4/bed/blastz.galGal3 ########################################################################### # CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO # AFFY ZEBRAFISH ALIGNMENTS (DONE, 2006-06-10, hartera) # UPDATE ARRAY DATA TRACK AFTER PROCESSING ARRAY DATA DIFFERENTLY AND # RELOADING INTO hgFixed (see hgFixed.txt for details). # (DONE, 2006-10-20, hartera) # UPDATE ARRAY DATA TRACK AFTER REPROCESSING ARRAY DATA TO ANTILOG THE LOG2 # VALUES FROM NORMALISATION TO GET THE ABSOLUTE VALUES AND # RELOADING INTO hgFixed (see hgFixed.txt for details). # (DONE, 2007-01-08, hartera) # RE-ORDERED DISPLAY IN TRACK - see ZON LAB WILD TYPE MICROARRAY DATA section # in danRer3.txt make doc. (DONE, hartera, 2007-04-09) # Array data is for whole embryos of five wild type zebrafish strains. # Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's # Hospital Boston. Contact: adibiase@enders.tch.harvard.edu ssh hgwdev mkdir /cluster/data/danRer4/bed/ZonLab/wtArray cd /cluster/data/danRer4/bed/ZonLab/wtArray # use AllRatio table for mapping. There are not many arrays in this # dataset so using AllRatio will allow the selection of All Arrays # from the track controls on the track description page. Also set up the # Zebrafish microarrayGroups.ra so that the Medians of replicates or # Means of replicates can also be selected for display. # Create mapped data in zebrafishZonWT.bed. rm zebrafishZonWT.bed hgsql -e 'drop table affyZonWildType;' danRer4 hgMapMicroarray zebrafishZonWT.bed hgFixed.zebrafishZonWTAllRatio \ /cluster/data/danRer4/bed/affyZebrafish/affyZebrafish.psl # Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTMedian # Mapped 14952, multiply-mapped 3867, missed 0, unmapped 665 hgLoadBed danRer4 affyZonWildType zebrafishZonWT.bed # Loaded 18819 elements of size 15 # add trackDb.ra entry at trackDb/zebrafish level # look at range of scores: hgsql -N -e 'select expScores from zebrafishZonWTAllRatio;' hgFixed \ > ratioExps.out perl -pi.bak -e 's/,/\n/g' ratioExps.out sort ratioExps.out | uniq -c > ratioExps.uniq.count textHistogram -binSize=0.5 -real -maxBinCount=40 -minVal=-10 \ ratioExps.out > expRatios.hist # Most values are between -3 and +2. 
# Therefore use the following trackDb entry: # track affyZonWildType # shortLabel Wild Type Array # longLabel Zon Lab Expression data for Wild Type Zebrafish strains # group regulation # priority 80 # visibility hide # type expRatio # expScale 2.0 # expStep 0.2 # groupings affyZonWildTypeGroups # The .ra file in /usr/local/apache/cgi-bin/hgCgiData/Zebrafish # (from ~/kent/src/hg/makeDb/hgCgiData/Zebrafish in the source tree) # which is microarrayGroups.ra defines how the array data is # displayed and also grouped for the Medians and Means of Replicates. # It also defines the labels for the track controls for showing # All Arrays, Arrays Grouped By Replicate Means or # Arrays Grouped By Replicate Medians. This is in the description field. # RE-ORDERED DISPLAY IN TRACK - see danRer3.txt make doc # (hartera, 2007-04-09) # 14 somites and 15 somites should come before 36 hpf # 14-19 somites stage is 16-19h. # from hgFixed.zebrafishZonWTAllExps # for AB, 0-8 should go after 14, # for TL, 16-22 should go after 24 # for TU, 25-27 should go after 32 # re-order accordingly in the config file: # ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra ########################################################################### # HUMAN ORTHOLOGS ADDED TO AFFY ZEBRAFISH TRACK DETAILS # (DONE, 2006-06-08, hartera) # Human orthologs were mapped to Affy Zebrafish probes by # Tony DiBiase (adibiase@enders.tch.harvard.edu) from Len Zon's group # at Children's Hospital, Boston. They map to human hg16. ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs cd /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs sed -e 's/"//g' cumuList.gedi.2005oct12.txt > hg16Orthologs.txt awk \ 'BEGIN {FS="\t"} {OFS="\t"} {if ($2 == $1) print $1,"",""; else print;}' \ hg16Orthologs.txt > hg16Orthologs.tab # create a table definition for this set: cat << 'EOF' > orthologs.sql # Link together an item with an ortholog CREATE TABLE affyToHg16Orthologs ( name varchar(255) not null, # Item ID geneSymbol longblob not null, # Gene Symbol of ortholog description longblob not null, # Description of ortholog # Indices INDEX(name(20)), INDEX(geneSymbol(20)) ); 'EOF' # load table ssh hgwdev cd /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs hgsql -e 'drop table affyToHg16Orthologs;' danRer4 hgLoadSqlTab danRer4 affyToHg16Orthologs orthologs.sql hg16Orthologs.tab # edit hgc.c to use this table on affyZebrafish details page and add # a search to use the human ortholog gene symbol in a search: # affyZebrafishHg16Ortholog, put in trackDb/zebrafish/trackDb.ra ########################################################################### # SWAP rn4 BLASTZ CHAIN/NET (DONE, 2006-06-19, hartera) # See also makeRn4.doc ssh pk cd /cluster/data/rn4/bed/blastzDanRer4.2006-06-19 # blastz parameters used in blastz alignment of danRer4 on mm8: # BLASTZ_ABRIDGE_REPEATS=0 # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& swap.log & ssh hgwdev featureBits danRer4 chainRn4Link # 68978593 bases of 1626093931 (4.242%) in intersection featureBits danRer4 refGene:cds chainRn4Link -chrom=chr1 -enrichment # refGene:cds 0.746%, chainRn4Link 4.333%, both 0.564%, # cover 75.55%, enrich 17.43x featureBits danRer3 refGene:cds chainRn4Link -chrom=chr1 -enrichment # refGene:cds 0.786%, chainRn4Link 4.320%, both 0.604%, # cover 
#   76.87%, enrich 17.80x
featureBits danRer4 refGene:cds netRn4 -chrom=chr1 -enrichment
# refGene:cds 0.746%, netRn4 29.601%, both 0.623%, cover 83.49%, enrich 2.82x
featureBits danRer3 refGene:cds netRn4 -chrom=chr1 -enrichment
# refGene:cds 0.786%, netRn4 33.103%, both 0.671%, cover 85.33%, enrich 2.58x
# Add symbolic link to new swap directory
ssh kkstore04
cd /cluster/data/danRer4/bed
ln -s blastz.rn4.swap blastz.rn4
# Check README.txt for downloads.
#######################################################################
# VEGA GENES (DONE, 2006-08-14 - 2006-08-25, hartera)
# ADD DESCRIPTIONS FOR VEGA GENES (DONE, 2006-09-25 - 2006-09-26, hartera)
# Data provided by Kerstin Howe from Sanger: kj2@sanger.ac.uk
# and also Mario Caccamo: mc2@sanger.ac.uk
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega
cd /cluster/data/danRer4/bed/vega
wget --timestamping \
    ftp://ftp.sanger.ac.uk/pub/kj2/gff/vega_in_ensembl.gff
wget --timestamping \
    ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/genes_for_tom_new.txt
# checked the list of genes found in vega_in_ensembl.gff but not in
# genes_for_tom_new.txt against this file
grep -f genesWithNoInfo.txt genes_for_tom_20060725.txt
# got a list of 20 that were not in this file: genesWithNoInfo2.txt
# e-mailed Kerstin at Sanger and got the information for these 20 genes:
# moreInfo.txt
# Need to rewrite this file using tabs.
# Checked the format for VEGA genes in hg17. Includes an alternate name.
cd /cluster/data/hg17/bed/vega30
# to look at human VEGA
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
awk '{if ((($9 ~ /^ID=OTTDART/) && ($9 ~ /Parent=OTTDARG/)) || \
    (($9 ~ /^ID=OTTDART/) && ($9 ~ /Parent=OTTDART/))) print $9;}' \
    vega_in_ensembl.gff | sort | uniq > vegaIDs.txt
perl -pi.bak -e 's/ID=//' vegaIDs.txt
# list of transcript ID and corresponding gene ID for Vega
perl -pi.bak -e 's/;Parent=/\t/' vegaIDs.txt
perl -pi.bak -e 's/;Note=Only//' vegaIDs.txt
# write a script to reformat the GFF3 file to GFF format.
# Some exon and CDS items belong to more than one transcript ID so these
# lines can just be duplicated. Those items that are labelled as mRNA or
# gene can be ignored and not added to the GFF file. Some of these lines
# have an extra comment, e.g. Note="...". These will be ignored anyway as
# they are on the lines with mRNA or gene in them so they will not be in
# the final GFF file.
cat << '_EOF_' > formatGff3ToGff.pl
#!/usr/bin/perl -w
use strict;
my (%idsHash, $gffFile, $idsFile);
$gffFile = $ARGV[0];
open(GFF, $gffFile) || die "Can not open $gffFile\n";
while (<GFF>) {
    my ($line, @f, $t, @trans, $r, $chr);
    $line = $_;
    if ($line !~ /^#/) {
        @f = split(/\t/, $line);
        $chr = "chr" . $f[0];
        if (($f[2] ne "gene") && ($f[2] ne "mRNA")) {
            $f[8] =~ /Parent=(OTTDART[0-9]+[A-Z0-9,]+)/;
            $t = $1;
            @trans = split(/,/, $t);
            foreach $r (@trans) {
                print "$chr\t$f[1]\t$f[2]\t$f[3]\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$r\n";
            }
        }
    }
    else {
        # print lines beginning with "#"
        print $line;
    }
}
close GFF;
'_EOF_'
chmod +x formatGff3ToGff.pl
# Use the script to convert the GFF3 file to GFF format in order to load it
# using ldHgGene
perl formatGff3ToGff.pl vega_in_ensembl.gff > vega.gff
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file. Create an info file. Remove the .KNOWN, .NOVEL or
# .PUTATIVE suffix from the method column and add it as a separate
# confidence column.
# check the number of items on each line: there are 4 or 6.
# Some genes have more than one clone ID in a comma-separated list
# so create two files for loading into two tables.
# Found that some of the clone ID fields have comma-separated lists
# and for OTTDARG00000006367, there are 30. Therefore create two info
# tables where one is just for clone IDs.
# NOTE: in future, make sure each row of the vegaInfoZfish.txt output has
# 8 fields. The pseudogene entries are missing an entry in the
# confidence field so this should be an empty field.
cat << '_EOF_' > formatVegaInfo.pl
#!/usr/bin/perl -w
use strict;
# format Vega additional information into one file for the vegaInfoZfish table
# and another for the vegaToCloneIdZfish table which contains the
# geneId and cloneId for each gene since there are multiple clone IDs for
# some of the genes.
my ($idsFile, $infoFile, $outFile1, $outFile2, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
$outFile1 = $ARGV[2];
$outFile2 = $ARGV[3];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
open (OUT1, ">$outFile1") || die "Can not create $outFile1: $!\n";
open (OUT2, ">$outFile2") || die "Can not create $outFile2: $!\n";
open (STDERR, ">info.log") || die "Can not create info.log: $!\n";
while (<IDS>) {
    my ($line, @f);
    chomp;
    $line = $_;
    @f = split(/\t/, $line);
    $idsHash{$f[1]} = $f[0];
}
close IDS;
while (<INFO>) {
    my ($line,@fi,$id,$gene,$trans,@transIds, $tr,@clones, $c,@t, $method, $conf);
    chomp;
    $gene = "";
    $line = $_;
    @fi = split(/\t/, $line);
    $id = $gene = $fi[1];
    # get all the transcript IDs for a gene
    while (exists($idsHash{$id})) {
        $trans = $idsHash{$id};
        push(@transIds, $trans);
        $id = $trans;
    }
    # push clone IDs into an array:
    @clones = split(/,/, $fi[2]);
    @t = split(/\./, $fi[3]);
    $method = $t[0];
    if ($#t > 0) {
        $conf = $t[1];
    }
    elsif ($#t == 0) {
        $conf = "";
    }
    else {
        print STDERR "Should be 4 or 6 items per row, found $#fi \n";
    }
    foreach $tr (@transIds) {
        print OUT1 "$tr\t$fi[1]\t$fi[0]";
        if ($#fi == 5) {
            print OUT1 "\t$fi[4]\t$fi[5]\t$method\t\t$conf\n";
        }
        elsif ($#fi == 3) {
            print OUT1 "\t\t\t$method\t\t$conf\n";
        }
        # print out clone IDs for each transcript
        foreach $c (@clones) {
            print OUT2 "$tr\t$c\n";
        }
    }
    if($gene && !exists($idsHash{$gene})) {
        print STDERR "$gene\n";
    }
}
close IDS;
close INFO;
close OUT1;
close OUT2;
close STDERR;
'_EOF_'
chmod +x formatVegaInfo.pl
wc -l genes_for_tom_new.txt
# 4822 genes_for_tom_new.txt
awk '{print $2}' genes_for_tom_new.txt | sort | uniq > genesWithInfo.txt
awk '{if ($2 ~ /OTTDARG/) print $2;}' vegaIDs.txt \
    | sort | uniq > genesFromGff.txt
wc -l genesFromGff.txt
# 4947 genesFromGff.txt
comm -12 genesWithInfo.txt genesFromGff.txt | wc -l
# 4033
comm -13 genesWithInfo.txt genesFromGff.txt | wc -l
# 914
comm -13 genesWithInfo.txt genesFromGff.txt > genesWithNoInfo.txt
# sent this list to Sanger to ask about getting additional information
# for these genes.
comm -23 genesWithInfo.txt genesFromGff.txt | wc -l
# 789
# got another file from Sanger that should contain the information
# for the 914 genes missing information above:
# ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/060725/genes_for_tom_20060725.txt
# check if this contains all of the list missing before
sort genesWithNoInfo.txt | uniq > genesWithNoInfo.sort
awk '{print $2}' genes_for_tom_20060725.txt | sort | uniq > genes.txt
comm -13 genes.txt genesWithNoInfo.sort > genesWithNoInfo2.txt
# there are 20 of these. Sent these to Sanger and received
# information for these. Copied and pasted these from e-mail into
# moreInfo.txt.
Write script to reformat: addTabs.pl perl addTabs.pl < moreInfo.txt > geneInfo3.txt grep -f genesWithInfo.txt genes_for_tom_20060725.txt > tmp wc -l tmp # 4738 wc -l genesWithInfo.txt # 4822 genesWithInfo.txt # Not all of these are in genes_for_tom_20060725.txt so merge all the # info files and uniq: cat genes_for_tom_new.txt genes_for_tom_20060725.txt geneInfo3.txt \ | sort | uniq > allGeneInfo.txt awk '{print $2}' allGeneInfo.txt | sort | uniq -c | sort -nr > count # counts gene names - often occur twice but with more information in # one case than the other. Seems like newer file has most information for # each gene. grep -f genesFromGff.txt genes_for_tom_20060725.txt > info1.txt # then list genes in info1.txt comm -13 genesInInfo1.sort genesFromGff.txt > genes1 wc -l genes1 # 55 genes1 grep -f genes1 genes_for_tom_new.txt > info2.txt awk '{print $2}' info2.txt | sort | uniq > genesInInfo2.txt comm -13 genesInInfo2.sort genes1 > genes2 wc -l genes2 # 20 genes2 # genes2 is list of genes not found in either file. Should be 20 left. awk '{print $2}' geneInfo3.txt | sort | uniq > genes3 comm -12 genes2 genes3 | wc -l # 20 - so these are the same 20 that are in geneInfo3.txt # These are in geneInfo3.txt. cat all these files together cat info1.txt info2.txt geneInfo3.txt > allGeneInfo2.txt # Recreate the tab file for loading into the vegaInfoZfish table: rm vegaInfoZfish.txt # Use new version that prints out one row for each accession in field 3. perl formatVegaInfo.pl vegaIDs.txt allGeneInfo2.txt vegaInfoZfish.txt \ vegaToCloneId.txt # info.log contains genes for which are not in the gff file of VEGA # and this is empty as it should be. wc -l vegaInfoZfish.txt # 6606 vegaInfoZfish.txt wc -l vegaToCloneId.txt # 7245 vegaToCloneId.txt awk '{print $1}' vegaInfoZfish.txt | sort | uniq -c | sort -nr > out2 # transcripts only have 1 entry awk '{print $2}' vegaInfoZfish.txt | sort | uniq > infogenes.txt comm -13 infogenes.txt genesFromGff.txt # There are no genes in the GFF file that are not in vegaInfoZfish.txt # Then remake the pseudogenes track from this. # Next step is to find which transcripts are pseudogenes. grep pseudogene vegaInfoZfish.txt | sort | uniq | wc -l # There are only 51 in the info file, and all of these are in the GFF # file. Anyway, this is too sparse for a separate track, but # a subtrack could be created. # Get transcript IDs for pseudogenes. grep pseudogene vegaInfoZfish.txt | awk '{print $1}' > pseudogenes.ids grep -f pseudogenes.ids vega.gff > vegaPseudoGene.gff awk '{print $9}' vegaPseudoGene.gff |sort | uniq | wc -l # 51 grep -v -f pseudogenes.ids vega.gff > vegaGene.gff wc -l vega*ff # 98170 vega.gff # 97999 vegaGene.gff # 171 vegaPseudoGene.gff # load gff files: ssh hgwdev cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaGene;' danRer4 hgsql -e 'drop table vegaPseudoGene;' danRer4 ldHgGene danRer4 vegaGene vegaGene.gff # Read 6555 transcripts in 88104 lines in 1 files # 6555 groups 25 seqs 1 sources 2 feature types # 6555 gene predictions ldHgGene danRer4 vegaPseudoGene vegaPseudoGene.gff # Read 51 transcripts in 171 lines in 1 files # 51 groups 9 seqs 1 sources 1 feature types # 51 gene predictions # Then create SQL table for adding the zebrafish-specific information # Add clone_id to a separate table instead of this one. 
cat << '_EOF_' > ~/kent/src/hg/lib/vegaInfoZfish.as table vegaInfoZfish "Vega Genes track additional information" ( string transcriptId; "Vega transcript ID" string geneId; "Vega gene ID (OTTER ID)" string sangerName; "Sanger gene name" string zfinId; "ZFIN ID" string zfinSymbol; "ZFIN gene symbol" string method; "GTF method field" string geneDesc; "Vega gene description" string confidence; "Status (KNOWN, NOVEL, PUTATIVE, PREDICTED)" ) '_EOF_' cd ~/kent/src/hg/lib/ autoSql vegaInfoZfish.as vegaInfoZfish mv vegaInfoZfish.h ../inc/ # commit vegaInfoZfish{.h,.c,.as,.sql} files to CVS # add INDEX(geneId) to vegaInfoZfish.sql # Need to change geneDesc to longblob type because some descriptions # are long (2006-09-26, hartera) cd ~/kent/src/hg/lib perl -pi.bak -e 's/geneDesc varchar\(255\)/geneDesc longblob/' \ vegaInfoZfish.sql # create a second table for the cloneId accessions since there # are multiple ids for some VEGA genes. Otherwise, there would be # a comma separated list in this field or many rows repeated but just # different in the cloneId field. Associate transcript ID to clone IDs. grep ',' allGeneInfo2.txt | wc -l # 378 cat << '_EOF_' > ~/kent/src/hg/lib/vegaToCloneId.as table vegaToCloneId "Vega Genes track cloneId information" ( string transcriptId; "Vega transcript ID" string cloneId; "clone ID" ) '_EOF_' cd ~/kent/src/hg/lib/ autoSql vegaToCloneId.as vegaToCloneId # replace PRIMARY KEY(transcriptId) with Indices on geneId and cloneId: perl -pi.bak -e \ 's/PRIMARY KEY\(transcriptId\)/INDEX\(transcriptId\),\nINDEX\(cloneId\)/' \ vegaToCloneId.sql rm *.bak # mv vegaInfoZfish.h ../inc/ cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaInfoZfish;' danRer4 hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \ vegaInfoZfish.txt hgsql -e 'drop table vegaToCloneId;' danRer4 hgLoadSqlTab danRer4 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \ vegaToCloneId.txt # Add code to hgc.c so that this works for Zebrafish and creates the # relevant links. Add searches by vega transcript ID, ZFIN ID and # clone ID. Add a Vega zebrafish-specific description to # trackDb/zebrafish. The Pseudogenes are a subtrack of the Genes track # because it is too sparse to show as a separate track. # Added entry in zebrafish/trackDb.ra to create these tracks as subtracks of # a Vega Genes track. # track vegaGeneZfish # compositeTrack on # shortLabel Vega Genes # longLabel Vega Annotations # group genes # priority 37 # visibility hide # chromosomes chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chr23,chr24,chr25 # type genePred # url http://vega.sanger.ac.uk/Danio_rerio/geneview?transcript=$$ # track vegaGene # subTrack vegaGeneZfish # shortLabel Vega Genes # longLabel Vega Gene Annotations # priority 1 # color 0,100,180 # track vegaPseudoGene # subTrack vegaGeneZfish # shortLabel Vega Pseudogenes # longLabel Vega Annotated Pseudogenes # priority 2 # color 155,0,125 # ADD Descriptions for Vega Genes (2006-09-25 - 2006-09-26, hartera) # Looked into using description from BioMart for VEGA genes but easier # to get them all directly from Sanger. 
# Kerstin sent a list of descriptions: for_rachel.txt
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega/description
# copy the file here and rename it
cd /cluster/data/danRer4/bed/vega/description
mv for_rachel.txt vegaDesc.txt
# get the list of VEGA gene IDs in vegaInfoZfish
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -e 'select distinct(geneId) from vegaInfoZfish;' danRer4 | sort \
    > geneIdsFromInfo.sort
# get a sorted list of gene IDs from the description file:
awk '{print $2;}' vegaDesc.txt | sort | uniq > vegaDesc.ids.sort
wc -l *.sort
comm -12 geneIdsFromInfo.sort vegaDesc.ids.sort | wc
# 4892
comm -23 geneIdsFromInfo.sort vegaDesc.ids.sort > genesNoDesc.txt
# 55 with no description. Sent this list to Sanger and got the
# descriptions for these too: descriptions_for_Rachel.txt
awk '{print $1}' descriptions_for_Rachel.txt | sort | uniq \
    > geneIds.newDesc.sort
comm -12 genesNoDesc.txt geneIds.newDesc.sort | wc
# 55 gene names in common
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/description
cat vegaDesc.txt descriptions_for_Rachel.txt > vegaAllDesc.txt
wc -l vegaAllDesc.txt
# 6440 vegaAllDesc.txt
# clean up
rm genesNoDesc.txt geneIds* vegaDesc.ids.sort
# Then add these descriptions to the vegaInfoZfish table
cat << 'EOF' > addDesc.pl
#!/usr/bin/perl -w
use strict;
my ($infoFile, $descFile, %descHash);
$infoFile = $ARGV[0];   # vegaInfoZfish.txt file
$descFile = $ARGV[1];   # file of descriptions
open(INFO, $infoFile) || die "Can not open $infoFile : $!\n";
open(DESC, $descFile) || die "Can not open $descFile : $!\n";
while (<DESC>) {
    my($line, @f, $id, $desc);
    chomp;
    $line = $_;
    @f = split(/\t/, $line);
    if ($#f > 0 && $f[1] =~ /^OTTDARG/) {
        $id = $f[1];
        $desc = $f[2];
    }
    elsif ($f[0] =~ /^(OTTDARG[0-9]+)\s*(.+)/) {
        # some lines have just the id and description with only a space between
        $id = $1;
        $desc = $2;
    }
    else {
        print "OTTDARG ID not found \n";
    }
    $descHash{$id} = $desc;
}
close DESC;
while (<INFO>) {
    my ($li, @fi, $de, $i, $last);
    $de = "";
    chomp;
    $li = $_;
    @fi = split(/\t/, $li);
    if ($fi[1] =~ /OTTDARG/) {
        if (exists($descHash{$fi[1]})) {
            $de = $descHash{$fi[1]};
        }
        else {
            print "There is no description for $fi[1] available.\n";
        }
    }
    $last = $#fi;
    for ($i = 0; $i <= 5; $i++ ) {
        print "$fi[$i]\t";
    }
    print "$de\t";
    if ($last == 5) {
        # the empty confidence field was dropped by the split, so add an
        # extra tab to keep 8 fields per row
        print "\t\n";
    }
    else {
        print "$fi[$last]\n";
    }
}
close INFO;
'EOF'
chmod +x addDesc.pl
# add the new descriptions to the vegaInfoZfish.txt file
perl addDesc.pl ../vegaInfoZfish.txt vegaAllDesc.txt \
    > vegaInfoZfishWithDesc.txt
# Reload the vegaInfoZfish table
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
# 105 warnings when loading the table
# remove "\N" from the desc field
perl -pi.bak -e 's/\\N//' vegaInfoZfishWithDesc.txt
# this removed 3 warnings
# after dumping the contents of the table and diffing with the input
# file, found that the pseudogenes are missing the confidence field
# and so there is a tab missing from the file. Modified addDesc.pl to
# add the extra tab when only 7 tabbed fields instead of 8 are found
# in a row.
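# Optional check, not in the original build: as noted above, every row of
# the final info file should now have 8 tab-separated fields (the
# confidence field may be empty, but its tab must be present).
awk 'BEGIN {FS="\t"} {print NF}' vegaInfoZfishWithDesc.txt | sort -n | uniq -c
# expect a single count line with NF = 8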
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
    vegaInfoZfishWithDesc.txt
# Try loading the GTF format file (2006-10-19)
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/new
wget ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/new
ldHgGene -bin -genePredExt danRer4 vegaNew vega.gtf
# Error: Read 6371 transcripts in 88275 lines in 1 files
# 6371 groups 25 seqs 4 sources 2 feature types
# invalid gffGroup detected on line: chr22 NOVEL exon 6782575 6783240
#   0.000000 - . gene_id "si:rp71-1i20.2"; transcript_id "si:rp71-1i20.2-001";
# GFF/GTF group si:rp71-1i20.2-001 on chr22+, this line is on chr22-, all group
# members must be on same seq and strand
# transcript_id is not unique. otter_transcript_id is unique so switch these.
cp vega.gtf vegaNew.gtf
# ldHgGene groups by transcript_id so use the OTTER IDs instead
perl -pi.bak -e 's/transcript_id/other_transcript_id/' vegaNew.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaNew.gtf
ldHgGene -bin -genePredExt danRer4 vegaNew vegaNew.gtf
# worked ok
# Added this as a vegaGeneNew subtrack for Vega Genes
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/new
# find genes that have the same transcript IDs for different OTTER gene_ids
awk 'BEGIN {FS="\t"} {print $9}' vega.gtf > vegaAttributes
awk 'BEGIN {FS=";"} {print $2, $5}' vegaAttributes \
    > vegaAttrib.transIdandotterId
sort vegaAttrib.transIdandotterId | uniq \
    > vegaAttrib.transIdandotterId.uniq
awk '{print $2}' vegaAttrib.transIdandotterId.uniq | sed -e 's/\s//' \
    | sort | uniq -c | sort -nr > vegaAttrib.transId.count
# 88 of these transcripts have more than one entry in the gtf file. Need
# to check if they have different OTTER gene ids in each case.
head -88 vegaAttrib.transId.count | awk '{print $2}' > transIds.morethan1
grep -w -f transIds.morethan1 vegaAttrib.transIdandotterId.uniq \
    > transIdswithDiffOtterGeneIds.txt
awk '{print $2}' transIdswithDiffOtterGeneIds.txt | sort | uniq \
    > transIds.diffOtterGeneIds.txt
# sent transIdswithDiffOtterGeneIds.txt to Kerstin at Sanger: a list
# of transcript IDs with different instances of OTTER gene ids.
# WAITING NOW FOR VEGA GENE UPDATE (2006-10-19)
# Received e-mail from Ian Sealy at Sanger (is1@sanger.ac.uk) that the
# Vega gene update is ready in gtf format (2006-11-02)
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
mkdir update
cd update
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/update
ldHgGene -bin -genePredExt danRer4 vegaUpdate vega.gtf
# Read 6823 transcripts in 93253 lines in 1 files
# 6823 groups 25 seqs 4 sources 2 feature types
# invalid gffGroup detected on line: chr22 PUTATIVE exon 6790927 6791256
#   0.000000 - . gene_id "si:rp71-1i20.2"; transcript_id "RP71-1I20.1-001";
# GFF/GTF group RP71-1I20.1-001 on chr22+, this line is on chr22-, all group
# members must be on same seq and strand
# Still has non-unique transcript IDs - need to wait for the next release
# of VEGA genes and Ensembl for this to be fixed.
# Received new update of VEGA from Ian Sealy (is1@sanger.ac.uk) on
# 2007-02-14.
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
# Load into the database (2007-03-09)
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
ldHgGene -bin -genePredExt danRer4 vega vega.gtf
# invalid gffGroup detected on line: chr4 NOVEL exon 35259893 35259994
#   0.000000 + .
gene_id "sinup"; transcript_id "siah2l-001"; GFF/GTF group siah2l-001 on chr21-, this line is on chr4+, all group members must be on same seq and strand # still get duplicate transcript IDs on different chromosome. # Below is what Kerston Howe (kj2@sanger.ac.uk) advised on these cases: # "this will continue to happen as long as the map still changes. The # gene in question was annotated on two adjacent clones which were # apparently then broken up and assigned to different chromosomes. # Usually, this is not too alarming (just delete those cases, please)" # Find other such cases: awk 'BEGIN{OFS="\t"} {print $1, $12}' vega.gtf > vegachromAndId.txt sort vegachromAndId.txt | uniq > vegachromAndId.uniq awk '{print $2}' vegachromAndId.uniq | sort | uniq -c | sort -nr \ > vegaIds.count # These transcript IDs all appear twice on different chromosomes. There could # be cases where there are transcripts that are duplicated on the same # chromosome. # 2 "taf6-001"; # 2 "siah2l-001"; # 2 "rasgrf2-001"; # 2 "lmx1b-001"; # 2 "fvt1-001"; # 2 "ckmt2-002"; # 2 "ckmt2-001"; # 2 "accn2c-001"; # There are some cases where the gene is on the same chrom but different # strands. awk 'BEGIN{OFS="\t"} {print $1, $7, $12}' vega.gtf \ | sort | uniq > vegachromStrandAndId.uniq awk '{print $1, $3}' vegachromStrandAndId.uniq | sort | uniq -c \ | sort -nr > vegaIdsAndChroms.count # These occur twice on different strands of the same chromosome: # 2 chr19 "DKEY-264N13.5-001"; # 2 chr14 "stx5a-001"; # Remove these from the GTF file as suggested by Kerstin Howe (Sanger) head -8 vegaIds.count | awk '{print $2}' > transcriptIds.remove head -2 vegaIdsAndChroms.count | awk '{print $3}' >> transcriptIds.remove grep -v -f transcriptIds.remove vega.gtf > vega2.gtf # reload into danRer4 database hgsql -e 'drop table vegaUpdate;' danRer4 ldHgGene -bin -genePredExt danRer4 vegaUpdate vega2.gtf # successfully loads now. # ldHgGene groups by transcript Id so use OTTER IDS instead sed -e 's/transcript_id/other_transcript_id/' vega.gtf > vegaFormat.gtf perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaFormat.gtf # Now it loads ok without removing duplicate transcript IDs: ldHgGene -bin -genePredExt danRer4 vegaFormat vegaFormat.gtf # Read 8817 transcripts in 119707 lines in 1 files # 8817 groups 29 seqs 4 sources 2 feature types # 8817 gene predictions hgsql -N -e 'select distinct(name2) from vegaFormat;' danRer4 > name2 # Extra information obtained from Sanger: ssh kkstore04 cd /cluster/data/danRer4/bed/vega wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/061111/noH/genes_for_tom.txt sort name2 > name2.sort awk '{print $1}' genes_for_tom.txt | sort | uniq > genesfortom.symbs.sort comm -23 name2.sort genesfortom.symbs.sort > vegaGtfOnly wc -l vegaGtfOnly # 4021 awk '{print $6}' genes_for_tom.txt | sort | uniq > genesfortom.altsymb.sort comm -23 vegaGtfOnly genesfortom.altsymb.sort # rest of symbols found as alternate symbols in column 6 of this file # subtract this from original list comm -13 vegaGtfOnly name2.sort > genesincol1 # Also received descriptions file and additional information from Sanger. # Now the track can be updated since the vega.gtf file loads into the # database see VEGA UPDATE section below. 
####################################################################### # VEGA UPDATE (DONE, 2007-03-26 - 2007-03-28, hartera) # Data provided by Kerstin Howe from Sanger: kj2@sanger.ac.uk # and also Ian Sealy: is1@sanger.ac.uk # GTF file sent on 2007-02-14 # Updated formatVegaInfo.pl script as not all transcripts were being included # in the vegaInfoZfish and the vegaToCloneId tables so all tables were # re-made (DONE, 2007-04-06, hartera) ssh kkstore04 mkdir /cluster/data/danRer4/bed/vega.2007-02-14 cd /cluster/data/danRer4/bed/vega ln -s /cluster/data/danRer4/bed/vega.2007-02-14 \ /cluster/data/danRer4/bed/vega wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/061111/noH/genes_for_tom.txt # list of gene descriptions by Kerstin Howe (2007-03-12) mv for_rachel.txt vegaDescriptions.txt mv genes_for_tom.txt vegaInformation.txt # vegaInfo is transcriptId, otterId, geneId, method and geneDesc # Get otter transcript ID and otter gene ID: awk 'BEGIN{OFS="\t"} \ {if (($17 ~ /otter_gene_id/) && ($19 ~ /otter_transcript_id/)) \ print $20, $18;}' vega.gtf \ > vegaIDs.txt perl -pi.bak -e 's/;//g' vegaIDs.txt perl -pi.bak -e 's/\"//g' vegaIDs.txt # list of transcript ID and corresponding gene ID for Vega sort vegaIDs.txt | uniq > vegaIDs.uniq # then use the info file to grab those genes that are pseudogenes, get the # transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes # to a separate file. Create an info file. Remove the .NOVEL or .PUTATIVE # or .KNOWN or .NOVEL from the method column and add as a separate # confidence column. # check number of items on each line: there are 4 or 6. # Some genes have more than one clone ID in a comma separated list # so create two files for loading into two tables. # Found that some of the clone ID fields have comma separated lists # and for OTTDARG00000006367, there are 30. Therefore create two info # tables where one is just for clone IDs. # NOTE: in future, make sure each row of vegaInfoZfish.txt output has # 8 fields. The pseudogene entries are missing an entry in the # confidence field so this should be an empty field. # Updated formatVegaInfo.pl as not getting all transcript IDs in the # vegaInfoZfish table (hartera, 2007-04-06) cat << '_EOF_' > formatVegaInfo.pl #!/usr/bin/perl -w use strict; # format Vega additional information into one file for vegaInfoZfish table # and another for the vegaToCloneIdZfish table which contains the # geneId and cloneId for each gene since there are multiple clone IDs for # some of the genes. 
my ($idsFile, $infoFile, $outFile1, $outFile2, %idsHash);
$idsFile = $ARGV[0];  # list of Transcript IDs and Gene IDs
$infoFile = $ARGV[1]; # information file for Vega genes
$outFile1 = $ARGV[2]; # output1 is the formatted file of Vega info for table
$outFile2 = $ARGV[3]; # output2 is a vega to clone ID conversion table

open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
open (OUT1, ">$outFile1") || die "Can not create $outFile1: $!\n";
open (OUT2, ">$outFile2") || die "Can not create $outFile2: $!\n";
open (STDERR, ">info.log") || die "Can not create info.log: $!\n";

while (<IDS>) {
   my ($line, @f);
   chomp;
   $line = $_;
   @f = split(/\t/, $line);
   # hash is keyed by gene ID but there could be more than one transcript
   # associated with a gene ID so need to create an array for the hash
   push @{$idsHash{$f[1]}}, $f[0];
}
close IDS;
while (<INFO>) {
   my ($line,@fi,$id,$gene,@transIds, $tr,@clones, $c,@t, $method, $conf);
   chomp;
   $gene = "";
   $line = $_;
   @fi = split(/\t/, $line);
   $id = $gene = $fi[1];
   # get all the transcript IDs for a gene
   if (exists($idsHash{$id})) {
      @transIds = @{$idsHash{$id}};
   }
   # push clone IDs into an array:
   @clones = split(/,/, $fi[2]);
   @t = split(/\./, $fi[3]);
   $method = $t[0];
   if ($#t > 0) {
      $conf = $t[1];
   }
   elsif ($#t == 0) {
      $conf = "";
   }
   else {
      print STDERR "Should be 4 or 6 items per row, found $#fi \n";
   }
   foreach $tr (@transIds) {
      print OUT1 "$tr\t$fi[1]\t$fi[0]";
      if ($#fi == 5) {
         print OUT1 "\t$fi[4]\t$fi[5]\t$method\t\t$conf\n";
      }
      elsif ($#fi == 3) {
         print OUT1 "\t\t\t$method\t\t$conf\n";
      }
      # print out clone IDs for each transcript
      foreach $c (@clones) {
         print OUT2 "$tr\t$c\n";
      }
   }
   if ($gene && !exists($idsHash{$gene})) {
      print STDERR "$gene\n";
   }
}
close INFO;
close OUT1;
close OUT2;
close STDERR;
'_EOF_'
    chmod +x formatVegaInfo.pl
    wc -l vegaInformation.txt
    # 7169 vegaInformation.txt
    awk '{print $2}' vegaInformation.txt | sort | uniq > genesWithInfo.txt
    awk '{if ($2 ~ /OTTDARG/) print $2;}' vegaIDs.uniq \
        | sort | uniq > genesFromGtf.txt
    # Number of genes with info AND in gtf file:
    wc -l genesFromGtf.txt
    # 6171 genesFromGtf.txt
    comm -12 genesWithInfo.txt genesFromGtf.txt | wc -l
    # 6171
    # Number of genes with no info:
    comm -13 genesWithInfo.txt genesFromGtf.txt | wc -l
    # 0
    # Use perl script above to extract vegaInfo table information.
    # Re-did this with updated perl script to get all transcript IDs
    # (hartera, 2007-04-07)
    perl formatVegaInfo.pl vegaIDs.uniq vegaInformation.txt \
         vegaInfoZfish.txt vegaToCloneId.txt
    # info.log lists genes from the information file that are not in the
    # gff file of VEGA, and this is empty as it should be.
    wc -l vegaInfoZfish.txt
    # 8817 vegaInfoZfish.txt
    wc -l vegaToCloneId.txt
    # 9698 vegaToCloneId.txt
    # The vegaToCloneId.txt file is also larger than before as it now
    # has all the transcript IDs (hartera, 2007-04-05).
    awk '{print $1}' vegaInfoZfish.txt | sort | uniq -c | sort -nr > out2
    # transcripts only have 1 entry
    awk '{print $2}' vegaInfoZfish.txt | sort | uniq > infogenes.txt
    comm -13 infogenes.txt genesFromGtf.txt
    # There are no genes in the GFF file that are not in vegaInfoZfish.txt
    # However, there are genes in the information file that do not have
    # transcripts represented that are in the GFF file.
    # Then remake the pseudogenes track from this.
    # Next step is to find which transcripts are pseudogenes.
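    # Before moving on to the pseudogenes, the NOTE above about field counts
    # can be checked quickly; a sketch, assuming (as stated above) that every
    # row of vegaInfoZfish.txt should have 8 tab-separated fields, so any row
    # printed here needs fixing (e.g. a missing empty confidence field for
    # pseudogenes):
    awk 'BEGIN{FS="\t"} NF != 8 {print FNR": "NF" fields"}' vegaInfoZfish.txt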
grep pseudogene vegaInfoZfish.txt | sort | uniq | wc -l # Once vegaInfoZfish.txt updated, found 53 pseudogenes so need to update # the pseudogene track # There are only 53 in the info file, and all of these are in the GFF # file. Anyway, this is too sparse for a separate track, but # a subtrack could be created. # Get transcript IDs for pseudogenes. grep pseudogene vegaInfoZfish.txt | awk '{print $1}' > pseudogenes.ids grep -w -f pseudogenes.ids vega.gtf > vegaPseudoGene.gtf awk '{print $20}' vegaPseudoGene.gtf | sort | uniq | wc -l # 53 # Need to remake the vegGene table: grep -vw -f pseudogenes.ids vega.gtf > vegaGene.gtf wc -l vega*gtf # 119707 vega.gtf # 119529 vegaGene.gtf # 178 vegaPseudoGene.gtf # Need to relabel IDs to get the name to be the otter transcript ID # and name 2 to be the transcript_id (needs to be labeled as gene_id) # Also, relabel the otter_transcript_id to be transcript_id as ldHgGene # groups the rows by this ID. sed -e 's/gene_id/tmp_id/' vegaGene.gtf > vegaGeneFormat.gtf perl -pi.bak -e 's/transcript_id/gene_id/' vegaGeneFormat.gtf perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaGeneFormat.gtf # Do the same for the pseudogene GTF files: sed -e 's/gene_id/tmp_id/' vegaPseudoGene.gtf > vegaPseudoGeneFormat.gtf perl -pi.bak -e 's/transcript_id/gene_id/' vegaPseudoGeneFormat.gtf perl -pi.bak -e 's/otter_transcript_id/transcript_id/' \ vegaPseudoGeneFormat.gtf rm *.bak # load GTF files for Vega genes and pseudogenes: # Reloaded all tables after updating as above (2007-04-06, hartera) ssh hgwdev cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaGene;' danRer4 hgsql -e 'drop table vegaPseudoGene;' danRer4 ldHgGene -bin -genePredExt danRer4 vegaGene vegaGeneFormat.gtf # Read 8764 transcripts in 119529 lines in 1 files # 8764 groups 29 seqs 3 sources 2 feature types # 8764 gene predictions ldHgGene -bin -genePredExt danRer4 vegaPseudoGene vegaPseudoGeneFormat.gtf # Read 53 transcripts in 178 lines in 1 files # 53 groups 11 seqs 1 sources 1 feature types # 53 gene predictions hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer4 \ | sort | uniq hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer4 \ | sort | uniq # vegaGene includes several scaffolds so need to lift file to chrom # level for these and reload vegaGene. vegaPseudoGene has no scaffolds. # scaffolds in vegaGene: # chrZv6_scaffold3697 # chrZv6_scaffold3723 # chrZv6_scaffold3731 # chrZv6_scaffold3734 # These are all on the chrUn_random virtual chrom ssh kkstore04 cd /cluster/data/danRer4/bed/vega sed -e 's/chrZv6_scaffold/Zv6_scaffold/g' vegaGeneFormat.gtf \ > vegaGeneFormat2.gtf liftUp vegaGeneFormatLifted.gtf \ /cluster/data/danRer4/jkStuff/liftAll.lft carry vegaGeneFormat2.gtf # Reload vegaGene table: ssh hgwdev cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaGene;' danRer4 ldHgGene -bin -genePredExt danRer4 vegaGene vegaGeneFormatLifted.gtf # Read 8764 transcripts in 119529 lines in 1 files # 8764 groups 26 seqs 3 sources 2 feature types # 8764 gene predictions # Vega information tables: # mySQL table definition and autosql-generated files created previously # for zebrafish-specific information (vegaInfoZfish) in the VEGA GENES # section above. # Add clone_id to a separate table instead of this one. # created a second table for the cloneId accessions since there # are multiple ids for some VEGA genes. Otherwise, there would be # a comma separated list in this field or many rows repeated but just # different in the cloneId field. 
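    # Once both tables are loaded (below), the clone IDs for a transcript can
    # be pulled back with a join; a sketch, assuming the shared key column is
    # named transcriptId in both tables and the accession column in the clone
    # table is cloneId (check the .sql files for the real column names):
    hgsql -N -e 'select i.transcriptId, group_concat(c.cloneId) \
        from vegaInfoZfish i, vegaToCloneId c \
        where i.transcriptId = c.transcriptId \
        group by i.transcriptId limit 10;' danRer4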
Associate transcript ID to clone IDs. # see VEGA GENES section # load these tables: cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaInfoZfish;' danRer4 hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \ vegaInfoZfish.txt hgsql -e 'drop table vegaToCloneId;' danRer4 hgLoadSqlTab danRer4 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \ vegaToCloneId.txt # Add code to hgc.c so that this works for Zebrafish and creates the # relevant links. Add searches by vega transcript ID, ZFIN ID and # clone ID. trackDb entry added as in VEGA GENES section above. # Added track handler to hgTracks.c for vegaGeneZfish so that the # transcript names from the name2 column of the genePred table is # used for the item name displayed in the track. # Add a Vega zebrafish-specific html description to trackDb/zebrafish. # The Pseudogenes are a subtrack of the Genes track # because it is too sparse to show as a separate track. # ADD Descriptions for Vega Genes # Looked into using description from BioMart for VEGA genes but easier # to get them all directly from Sanger. Kerstin sent a list of # descriptions: for_rachel.txt # Add these again to updated tables (2007-04-06, hartera) ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/vega/description # copy file here and rename cd /cluster/data/danRer4/bed/vega/description mv ../vegaDescriptions.txt . # get list of VEGA gene IDs in vegaInfoZfish ssh hgwdev cd /cluster/data/danRer4/bed/vega/description hgsql -N -e 'select distinct(geneId) from vegaInfoZfish;' danRer4 | sort \ > geneIdsFromInfo.sort # get sorted list of gene IDs from description file: awk '{print $1;}' vegaDescriptions.txt | sort | uniq > vegaDesc.ids.sort wc -l *.sort # 6171 geneIdsFromInfo.sort # 14150 vegaDesc.ids.sort comm -12 geneIdsFromInfo.sort vegaDesc.ids.sort | wc # 6168 comm -23 geneIdsFromInfo.sort vegaDesc.ids.sort > genesNoDesc.txt # There are 3 with no description # OTTDARG00000004654 # OTTDARG00000018757 # OTTDARG00000018760 # Searched for these three at # http://vega.sanger.ac.uk/Danio_rerio/index.html # and found that these three do not have a description. 
# add them to the descriptions list
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/description
# add the three with no description to the descriptions list
cat vegaDescriptions.txt genesNoDesc.txt > vegaAll.txt
# remove header
tail +2 vegaAll.txt | sort | uniq > vegaAllDesc.txt
wc -l vegaAll*
# 23058 vegaAll.txt
# 15460 vegaAllDesc.txt
# clean up
rm genesNoDesc.txt geneIds* vegaDesc.ids.sort
# Then add these to the vegaInfoZfish table
cat << 'EOF' > addDesc.pl
#!/usr/bin/perl -w
use strict;
my ($infoFile, $descFile, %descHash);
$infoFile = $ARGV[0]; # vegaInfoZfish.txt file
$descFile = $ARGV[1]; # file of descriptions
open(INFO, $infoFile) || die "Can not open $infoFile : $!\n";
open(DESC, $descFile) || die "Can not open $descFile : $!\n";
open(ERROR, ">error.log") || die "Can not create error.log : $!\n";
open(OUT, ">out.txt") || die "Can not create out.txt: $!\n";
while (<DESC>) {
   my($line, @f, $id, $desc);
   chomp;
   $line = $_;
   @f = split(/\t/, $line);
   if ($f[0] =~ /^OTTDARG/) {
      $id = $f[0];
      $desc = $f[1];
   }
   else {
      print ERROR "OTTDARG ID is not found on a line of the descriptions file.\n";
   }
   $descHash{$id} = $desc;
}
close DESC;
while (<INFO>) {
   my ($li, @fi, $de, $i, $last);
   $de = "";
   chomp;
   $li = $_;
   @fi = split(/\t/, $li);
   if ($fi[1] =~ /OTTDARG/) {
      if (exists($descHash{$fi[1]})) {
         $de = $descHash{$fi[1]};
      }
      else {
         print ERROR "There is no description for $fi[1] available.\n";
      }
   }
   $last = $#fi;
   for ($i = 0; $i <= 5; $i++ ) {
      print OUT "$fi[$i]\t";
   }
   print OUT "$de\t";
   if ($last == 5) {
      # if there are only 5 fields, the last one is missing so add extra tab
      print OUT "\t\n";
   }
   else {
      print OUT "$fi[$last]\n";
   }
}
close INFO;
close OUT;
close ERROR;
'EOF'
chmod +x addDesc.pl
# add new descriptions to vegaInfoZfish.txt file
perl addDesc.pl ../vegaInfoZfish.txt vegaAllDesc.txt
# check output in out.txt then rename
mv out.txt vegaInfoZfishWithDesc.txt
rm error.log # empty
# Reload vegaInfoZfish table
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
    vegaInfoZfishWithDesc.txt
# No errors loading
# Already added code to hgc.c so that this works for zebrafish and creates
# the relevant links. Add searches by vega transcript ID, ZFIN ID and
# clone ID. trackDb entry added as in VEGA GENES section above.
# Added track handler to hgTracks.c for vegaGeneZfish so that the
# transcript names from the name2 column of the genePred table are
# used for the item name displayed in the track.
# Add a Vega zebrafish-specific html description to trackDb/zebrafish.
# The Pseudogenes are a subtrack of the Genes track because it is too sparse
# to show as a separate track.
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
cd /cluster/data/danRer4/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -np http://ardor.wustl.edu/jeltje/zebrafish/chr_gtf
wget -nv -r -np http://ardor.wustl.edu/jeltje/zebrafish/chr_ptx
# clean up and rename downloaded directories:
mv ardor.wustl.edu/jeltje/zebrafish/chr_gtf .
mv ardor.wustl.edu/jeltje/zebrafish/chr_ptx .
rm -rf ardor.wustl.edu
rm chr_*/index.html*
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt danRer4 nscanGene chr_gtf/chr*.gtf.gz # load protein, add .1 suffix to match transcript id hgPepPred -suffix=.1 danRer4 generic nscanPep chr_ptx/chr*.fa.gz rm *.tab # update trackDb; need a danRer4-specific page to describe informants zebrafish/danRer4/nscanGene.html (copy from mm8 and edit) zebrafish/danRer4/trackDb.ra # changed search regex to termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9] ####################################################################### # UPDATE AFFY ZEBRAFISH TRACK USING BLAT WITHOUT -mask OPTION AND # USING -repeats OPTION AND DIFFERENT FILTERING TO REMOVE SHORT # ALIGNMENTS (DONE, 2006-09-27 - 2006-09-28, hartera) # With the previous version of this track, QA found a number of short # alignments of <= 30 bp and there are a number in the <= 50bp range. # These do not seem to be meaningful so filtering was changed to try to # remove these alignments while retaining meaningful alignments. # pslCDnaFilter was used with the same settings as used for the # Genbank EST alignments for zebrafish. # Also use -minIdentity=90 for Blat instead of -minIdentity=95 since as the # higher minIdentity is causing alignments to be dropped that should not be. # Blat's minIdentity seems to be more severe than that for pslReps or # pslCDnaFilter as it takes insertions and deletions into account. # These are Jim's recommendations. # NOTE: Also added alignments for NA_random and Un_random, these had not # been done for the original affyZebrafish track but should have been. # Array chip sequences already downloaded for danRer1 ssh hgwdev cd /projects/compbio/data/microarray/affyZebrafish mkdir -p /san/sanvol1/scratch/affy cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /san/sanvol1/scratch/affy/ # Set up cluster job to align Zebrafish consensus sequences to danRer3 mkdir -p /cluster/data/danRer4/bed/affyZebrafish.2006-09-27 # remove old link and create new one rm /cluster/data/danRer4/bed/affyZebrafish ln -s /cluster/data/danRer4/bed/affyZebrafish.2006-09-27 \ /cluster/data/danRer4/bed/affyZebrafish # Align sequences on the pitakluster. Scaffolds were aligned for NA_random # and Un_random and lifted to chrom level afterwards. Chroms 1-25 and M # were aligned as ~5 Mb chunks. ssh pk cd /cluster/data/danRer4/bed/affyZebrafish ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa) ls -1 $f >> genome.lst end wc -l genome.lst # 3237 genome.lst # for output: mkdir -p /san/sanvol1/scratch/danRer4/affy/psl # use -repeats option to report matches to repeat bases separately # to other matches in the PSL output. echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer4/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst affy.lst template.sub para.spec para create para.spec para try, check, push ... etc. 
para time # Completed: 3237 of 3237 jobs #CPU time in finished jobs: 19319s 321.98m 5.37h 0.22d 0.001 y #IO & Wait Time: 9297s 154.95m 2.58h 0.11d 0.000 y #Average job time: 9s 0.15m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 98s 1.63m 0.03h 0.00d #Submission to last job: 3135s 52.25m 0.87h 0.04d # need to do pslSort and lift up ssh pk cd /san/sanvol1/scratch/danRer4/affy # Do sort, liftUp and then best in genome filter. # only use alignments that have at least # 95% identity in aligned region. # Previously did not use minCover since a lot of sequence is in # Un and NA so genes may be split up so good to see all alignments. # However, found a number of short alignments of <= 50 bp. These are # not meaningful so maybe need to use minCover. If increased too much, # then hits on poor parts of the assembly will be missed. # use pslCDnaFilter with the same parameters as used for zebrafish # Genbank EST alignments. pslSort dirs raw.psl tmp psl pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 15272 828202 #drop minNonRepSize: 2763 741674 # drop minIdent: 2656 39188 # drop minCover: 2550 10784 # weird over: 359 1439 # kept weird: 277 347 # drop localBest: 2830 17737 # kept: 14952 18819 # Kept 97.9% of alignments. There are 15502 Affy sequences originally # aligned so there are now 96.5% remaining. # lift up the coordinates to chrom level liftUp affyZebrafish.psl \ /cluster/data/danRer4/jkStuff/liftAll.lft warn contig.psl # Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft # Lifting contig.psl # rsync these psl files rsync -a --progress /san/sanvol1/scratch/danRer4/affy/*.psl \ /cluster/data/danRer4/bed/affyZebrafish/ ssh kkstore04 cd /cluster/data/danRer4/bed/affyZebrafish # shorten names in psl file sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp mv affyZebrafish.psl.tmp affyZebrafish.psl pslCheck affyZebrafish.psl # psl is good # load track into database ssh hgwdev cd /cluster/data/danRer4/bed/affyZebrafish hgsql -e 'drop table affyZebrafish;' danRer4 hgLoadPsl danRer4 affyZebrafish.psl # Add consensus sequences for Zebrafish chip # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/affyProbes ln -s \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /gbdb/hgFixed/affyProbes # these sequences were loaded previously so no need to reload. hgLoadSeq -abbr=Zebrafish: danRer3 \ /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa # Clean up rm batch.bak contig.psl raw.psl # check number of short alignments: hgsql -e \ 'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer4 # 7 # for previous filtered set, there were 1272 alignments of <= 50 bp so # this has improved. hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer4 # 14952 # Previously, there were 14819 so more sequences have aligned but less # short alignments are retained. Many of the short alignments may also # have longer alignments to different regions of the genome that are good. ######################################################################### # COMPUGEN ZEBRAFISH OLIGOS TRACK (in progress, 2006-10-20, hartera) # Align the zebrafish oligos from Compugen used to create the arrays # used by GIS to study expression at different developmental stages. ssh hgwdev mkdir -p /projects/compbio/data/microarray/compugen/zebrafish # save Compugen oligos FASTA file here. 
obtained from # Sinnakaruppan Mathavan at the # Genome Institute of Singapore (GIS). # Permission was obtained from Compugen to display the sequences # along with a disclaimer. see README.txt cd /projects/compbio/data/microarray/compugen/zebrafish unzip Zebrafish\ Oligos_Compugen_XEBLIB96_pov_070704.zip # this gives an Excel file, XEBLIB96_pov_070704.xls # save as a tab separated text file using Excel: XEBLIB96_pov_070704.txt # Remove quotation marks sed -e 's/"//g' XEBLIB96_pov_070704.txt > GISArray.txt # also remove other unwanted characters, ^@, which is ASCII for NULL tr -d '\0' < GISArray.txt > GISArray.format.txt awk 'BEGIN{FS="\t"} {if ($2 !~ /Serial/ && ($2 != "")) print ">"$2"\n"$4}' \ GISArray.format.txt > GISZfishArray.fa grep '>' GISZfishArray.fa | wc -l # 16399 # align sequences to the zebrafish genome on pk mkdir -p /san/sanvol1/scratch/compugen cp /projects/compbio/data/microarray/compugen/zebrafish/GISZfishArray.fa \ /san/sanvol1/scratch/compugen/ # Set up cluster job to align Zebrafish consensus sequences to danRer4 mkdir -p /cluster/data/danRer4/bed/compugenZebrafish.2006-11-03 ln -s /cluster/data/danRer4/bed/compugenZebrafish.2006-11-03 \ /cluster/data/danRer4/bed/compugenZebrafish # Align sequences on the pitakluster. Scaffolds were aligned for NA_random # and Un_random and lifted to chrom level afterwards. Chroms 1-25 and M # were aligned as ~5 Mb chunks. ssh pk cd /cluster/data/danRer4/bed/compugenZebrafish ls -1 /san/sanvol1/scratch/compugen/GISZfishArray.fa > oligos.lst ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa) ls -1 $f >> genome.lst end wc -l genome.lst # 3237 genome.lst # oligos are 65 bp in length. # for output: mkdir -p /san/sanvol1/scratch/danRer4/compugen/psl # use -repeats option to report matches to repeat bases separately # to other matches in the PSL output. echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer4/compugen/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst oligos.lst template.sub para.spec para create para.spec para try, check, push ... etc. para time # Completed: 3237 of 3237 jobs # CPU time in finished jobs: 1948s 32.46m 0.54h 0.02d 0.000 y # IO & Wait Time: 11145s 185.75m 3.10h 0.13d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 428s 7.13m 0.12h 0.00d # Submission to last job: 621s 10.35m 0.17h 0.01d # need to do pslSort and lift up ssh pk cd /san/sanvol1/scratch/danRer4/compugen # Do sort, liftUp and then best in genome filter. # only use alignments that have at least # 95% identity in aligned region. # Previously did not use minCover since a lot of sequence is in # Un and NA so genes may be split up so good to see all alignments. # However, found a number of short alignments of <= 50 bp. These are # not meaningful so maybe need to use minCover. If increased too much, # then hits on poor parts of the assembly will be missed. # use pslCDnaFilter with the same parameters as used for zebrafish # Genbank EST alignments. pslSort dirs raw.psl tmp psl pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl # for Compugen: # Dropping minCover to 0.10 doesn't make a difference. 
Decreasing the minId to # 0.92 increases the number of sequences aligned and does not increase # the number of alignments for sequences with the most alignments. # Removed the minimum non repeat filter does significantly increase the # number of alignments for some sequences. 145 CGENZEB_456015402_0 79 CGENZEB_456008445_0 72 CGENZEB_456015991_0 53 CGENZEB_456012678_0 46 CGENZEB_456004521_0 # Total sequences: 16399 seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 8 16 drop localBest: 1288 7040 kept: 14632 19420 # 89.2% are kept. # minCov = 0.10 minNonRepSize = 8 # seqs aligns seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 8 16 drop localBest: 1288 7040 kept: 14632 19420 # 89.2% are kept. # minCov=0.10 minNonRepSize = 10 seqs aligns total: 15544 102554 drop minNonRepSize: 1015 72795 drop minIdent: 811 3462 weird over: 13 48 kept weird: 8 16 drop localBest: 1278 6901 kept: 14616 19396 # 89.1% kept. # minNonRepSize = 0 seqs aligns total: 15544 102554 drop minIdent: 1344 23893 weird over: 42 271 kept weird: 24 44 drop localBest: 1772 49794 kept: 15338 28867 # 93.8% kept from total # but there are large numbers of alignments for some probes: # 62 CGENZEB_456005547_0 603 CGENZEB_456005221_0 454 CGENZEB_456010007_0 409 CGENZEB_456014900_0 372 CGENZEB_456009900_0 # try increase identity but low minReps pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.97 -minCover=0.15 raw.psl \ contigMinRep8minId97.psl # seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 1982 8772 weird over: 9 29 kept weird: 7 14 drop localBest: 766 2915 kept: 13715 18322 # this has improved highest number of hits a lot but this is similar to # that achieved with higher identity too # but only kept 80% of seqeuences. 145 CGENZEB_456015402_0 79 CGENZEB_456008445_0 72 CGENZEB_456015991_0 53 CGENZEB_456012678_0 46 CGENZEB_456004521_0 # lower minCov: pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.08 raw.psl \ contigMinCov8.psl # seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 8 16 drop localBest: 1288 7040 kept: 14632 19420 # 89.2%, now nearBest = 0.1% pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.10 raw.psl \ contigMinCov10NearBest1percent.psl # seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 7 15 drop localBest: 1350 7451 kept: 14632 19009 # same number of sequences aligning but less overall alignments: # 115 CGENZEB_456015402_0 # 71 CGENZEB_456015991_0 # 71 CGENZEB_456008445_0 # 46 CGENZEB_456004521_0 # 38 CGENZEB_456008610_0 # CGENZEB_456012678_0 now went down to 1. 
# 89.2% aligned # use minCover = 0.40 pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.40 raw.psl \ contig.psl seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 7 15 drop localBest: 1350 7451 kept: 14632 19009 # little difference using minCover=0.60 cd /san/sanvol1/scratch/danRer4/compugen rm contig* # Use these parameters: pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.40 raw.psl \ contig.psl # use minCover = 0.40 seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 7 15 drop localBest: 1350 7451 kept: 14632 19009 # use minCover=0.60 seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 drop minCover: 198 507 weird over: 9 39 kept weird: 4 12 drop localBest: 1285 7009 kept: 14588 18944 # lift up the coordinates to chrom level liftUp compugenZebrafish.psl \ /cluster/data/danRer4/jkStuff/liftAll.lft warn contig.psl # Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft # Lifting contig.psl # rsync these psl files rsync -a --progress /san/sanvol1/scratch/danRer4/compugen/*.psl \ /cluster/data/danRer4/bed/compugenZebrafish ssh kkstore04 cd /cluster/data/danRer4/bed/compugenZebrafish # shorten names in psl file pslCheck compugenZebrafish.psl # psl is good # load track into database ssh hgwdev cd /cluster/data/danRer4/bed/compugenZebrafish hgsql -e 'drop table compugenZebrafish;' danRer4 hgLoadPsl danRer4 compugenZebrafish.psl # Add entry in trackDb/zebrafish/trackDb.ra and a search for hgFindSpec # Add a description page. # Need to add disclaimer for sequences. # Add consensus sequences for Zebrafish chip # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/compugenProbes ln -s \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /gbdb/hgFixed/affyProbes # these sequences were loaded previously so no need to reload. hgLoadSeq -abbr=Zebrafish: danRer3 \ /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa # Clean up rm batch.bak contig.psl raw.psl # check number of short alignments: hgsql -e \ 'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer4 # 7 # for previous filtered set, there were 1272 alignments of <= 50 bp so # this has improved. hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer4 # 14952 # Previously, there were 14819 so more sequences have aligned but less # short alignments are retained. Many of the short alignments may also # have longer alignments to different regions of the genome that are good. ######################################################################### # ENSEMBL GENES TRACKS FOR ENSEMBL VERSION 42 # ENSEMBL GENES (PROTEIN-CODING) AND ENSEMBL NON-CODING GENES # (DONE, 2007-01-08 - 2007-01-09 hartera) # Obtained from BioMart at Ensembl (The Wellcome Trust Sanger Institute) # Starting downloading Ensembl v41 genes (2006-12-13) # get "unexpected end of file" error with the peptide download. # Notified Ensembl (2006-12-15). # Ensembl helpdesk say that the files sometimes get terminated early # for large downloads so try using this link to BioMart instead: # http://www.biomart.org/biomart/martview # Repeat above using this link. This has Ensembl42 though so e-mailed # Ensembl to ask if they are releasing Ensembl42 soon (2006-12-18) # Ensembl was updated to v42 in Dec. 
2006 so use this new data set # (2007-01-08): ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/ensembl42 cd /cluster/data/danRer4/bed/ensembl42 # Get the Ensembl gene data from BioMart at: # http://www.biomart.org/biomart/martview # Follow this sequence through the pages: (NOTE: this interface has changed # significantly since danRer3). Ensembl version is 42 (Dec 2006). # 1) The Dataset link in the left panel is selected. Select the # Ensembl dataset (v42 here) and the Danio_rerio choice (ZFISH6 here). # 2) Click on the Attributes link in the left panel. # 3) Select Structures. Click on the + next to GENE to expand it # and check the boxes for the Ensembl Gene ID and Ensembl # Transcript ID. # 4) Clicking on the "Count" link on the top black menu shows that there # are 28,508 / 28,508 Genes selected in Danio rerio genes (ZFISH6) # 5) Click on the "Results" link on the top black menu and then select GFF # as the format and select to export all results to a # "Compressed web file (notify by e-mail)" and hit the "Go" button and # enter e-mail address as requested. # When results are ready, you will receive an e-mail with a link to # download the results, save as ensemblGene42.gff.gz # Save as and move file to # /cluster/data/danRer4/bed/ensembl42 gunzip ensemblGene42.gff.gz # file unzips ok. # Repeat above but at step 3, selec the Features Attribute # select Ensembl Transcript ID and Biotype under the GENE section. # Select "Text, tab separated" as the output format and gzip # compression. Biotype gives information to separate the genes into # protein-coding and RNA genes and pseudogenes. # For step 5, select CSV as the output and then select to export all # results to a "Compressed web file (notify by e-mail)" and hit the # "Go" button and enter e-mail address as requested. # Save as ensemblGene42Biotype.tsv.gz and move to # /cluster/data/danRer4/bed/ensembl42 gunzip ensemblGene42Biotype.tsv.gz # file unzips ok. # The Ensembl gene predictions are mapped to chromosomes except for # chrNA and chrUn. Use lift files for scaffolds to these chroms. # get chrUn and chrNA Ensembl records. ssh kkstore04 cd /cluster/data/danRer4/bed/ensembl42 # need to lift up the NA and Un scaffolds to chroms liftUp -type=.gtf ensemblGene42.lifted \ /cluster/data/danRer4/jkStuff/liftAll.lft carry ensemblGene42.gff # Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft # Lifting ensemblGene42.gff wc -l ensemblGene42* # 807871 ensemblGene42.gff # 807871 ensemblGene42.lifted # 39626 ensemblGene42Biotype.tsv # check there are no scaffolds left in lifted file: grep Zv6_NA ensemblGene42.lifted grep Zv6_scaffold ensemblGene42.lifted # there are none so ok. # add chr at beginning of each line. NA and Un already have "chr" # prefix so then remove the extra one. sed -e "s/^/chr/" ensemblGene42.lifted | sed -e "s/chrchr/chr/" \ > ensGene42.gff # check file sizes -ok and some of the lifted co-ordinates # Also remove the suffix that denotes the transcript version number. # This is not in the ensGtp or ensPep tables. perl -pi.bak -e 's/\.[0-9]+//'g ensGene42.gff # Next split up the gff into a protein-coding gene set and a RNA gene and # pseudogene set and load into different tracks. 
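    # The split below greps for the biotype string; the same two ID lists
    # could also be made in one awk pass over the biotype file (a sketch,
    # assuming column 1 is the transcript ID, column 2 the biotype, and a
    # one-line header; the output names here are placeholders, not the files
    # used below):
    awk 'BEGIN{FS="\t"}
         NR > 1 {
             if ($2 == "protein_coding") { print $1 > "coding.ids" }
             else { print $1 > "noncoding.ids" }
         }' ensemblGene42Biotype.tsv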
# get transcript IDs only for protein coding transcripts
grep "protein_coding" ensemblGene42Biotype.tsv | awk '{print $1}' \
    > ensGene42ProteinCoding.ids
# skip header line and grab everything else from the file
tail +2 ensemblGene42Biotype.tsv | grep -v "protein_coding" \
    | awk '{print $1}' > ensGene42NonCoding.ids
wc -l ensGene42*ids
# 3560 ensGene42NonCoding.ids
# 36065 ensGene42ProteinCoding.ids
# 39625 total
wc -l ensemblGene42Biotype.tsv
# 39626 ensemblGene42Biotype.tsv
# extra line is the header line
# then get only the protein-coding transcripts from the GFF file
# write a script to do this as grep is slow
cat << 'EOF' > getIds.pl
#!/usr/bin/perl -w
use strict;
my ($in, $file, %ids);
$in = $ARGV[0];    # list of ids
$file = $ARGV[1];  # GFF file or other data file
open(IN, $in) || die "Can not open $in :$!\n";
open (FILE, $file) || die "Can not open $file :$!\n";
open (FOUND, ">found.log") || die "Can not create found.log: $!\n";
while (<IN>) {
   chomp;
   my $l = $_;
   $ids{$l} = 1;
}
close(IN);
# read GFF file or other data file and check whether transcript ID is in
# the hash before printing out that line.
while (<FILE>) {
   my ($line, $transId);
   $line = $_;
   $transId = "";
   if ($line =~ /(ENSDART[0-9]+)/) {
      $transId = $1;
   }
   if (exists($ids{$transId})) {
      print $line;
      print FOUND "$transId\n";
   }
}
close(FILE);
close(FOUND);
'EOF'
chmod +x getIds.pl
perl getIds.pl ensGene42ProteinCoding.ids ensGene42.gff \
    > ensGene42ProteinCoding.gff
# uniq found.log and check against input ids
sort found.log | uniq > foundProtein.uniq
sort ensGene42ProteinCoding.ids > ens42ProteinIds.sort
comm -13 foundProtein.uniq ens42ProteinIds.sort
# All ids were found in the gff file
perl getIds.pl ensGene42NonCoding.ids ensGene42.gff \
    > ensGene42NonCoding.gff
sort found.log | uniq > foundNonCoding.uniq
sort ensGene42NonCoding.ids > ens42NonCodingIds.sort
comm -13 foundNonCoding.uniq ens42NonCodingIds.sort
# All ids were found in the gff file
rm *.sort *.uniq *.bak found.log
wc -l ensGene42*.gff
# 807871 ensGene42.gff
# 3695 ensGene42NonCoding.gff
# 804176 ensGene42ProteinCoding.gff
# load into database
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
hgsql -e 'drop table ensGene;' danRer4
hgsql -e 'drop table ensGeneNonCoding;' danRer4
/cluster/bin/x86_64/ldHgGene danRer4 ensGene ensGene42ProteinCoding.gff
# Read 36065 transcripts in 804176 lines in 1 files
# 36065 groups 27 seqs 1 sources 4 feature types
# 36065 gene predictions
/cluster/bin/x86_64/ldHgGene danRer4 ensGeneNonCoding ensGene42NonCoding.gff
# Read 3560 transcripts in 3695 lines in 1 files
# 3560 groups 27 seqs 1 sources 1 feature types
# 3560 gene predictions
# The only difference between Ensembl v42 and v41 for zebrafish is two
# extra gene predictions in the non-coding category in v42.
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use BioMart to create it as above, except:
# Step 3) Choose "Features". Expand the GENE section and under
# "Ensembl Attributes", check boxes for Ensembl Gene ID,
# Ensembl Transcript ID, Ensembl Peptide ID.
# After clicking on the Results link in the top black menu,
# choose CSV as the output format and export all results to a
# "Compressed web file (notify by e-mail)", hit the
# "Go" button and enter an e-mail address as requested.
# Result name: ensembl42Gtp.tsv.gz
ssh kkstore04
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensembl42Gtp.tsv.gz
# separate the IDs for protein-coding genes and the rest (RNA genes and
# pseudogenes).
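    # Before reordering the columns (next step), the layout of the BioMart
    # export can be confirmed quickly, since the column order has changed
    # between releases; a small check (a sketch; head -1 shows either a
    # header line or the first record, and the field counts show whether
    # every row carries all three IDs):
    head -1 ensembl42Gtp.tsv
    awk 'BEGIN{FS="\t"} {print NF}' ensembl42Gtp.tsv | sort | uniq -c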
# transcript ID and gene ID are in different columns than before so switch # Gene ID should be in first column and Transcrip ID in the second column. awk 'BEGIN {FS="\t"} {OFS="\t"} {print $2,$1,$3}' ensembl42Gtp.tsv \ > ens42GtpFormat.tsv perl getIds.pl ensGene42ProteinCoding.ids ens42GtpFormat.tsv \ > ensGtpProteinCoding.txt # uniq found.log and check against input ids sort found.log | uniq > foundProtein.uniq sort ensGene42ProteinCoding.ids > ens42ProteinIds.sort comm -13 foundProtein.uniq ens42ProteinIds.sort perl getIds.pl ensGene42NonCoding.ids ens42GtpFormat.tsv \ > ensGtpNonCoding.txt # uniq found.log and check against input ids sort found.log | uniq > foundNonCoding.uniq sort ensGene42NonCoding.ids > ens42NonCodingIds.sort comm -13 foundNonCoding.uniq ens42NonCodingIds.sort # All ids were found in the gff file rm *.sort *.uniq *.bak found.log wc -l ensGtp*.txt # 3560 ensGtpNonCoding.txt # 36065 ensGtpProteinCoding.txt # The non-coding set have only gene ids and transcript ids and # no protein ids. # Load database ssh hgwdev cd /cluster/data/danRer4/bed/ensembl42/ hgsql -e 'drop table ensGtp;' danRer4 # load ensGtp for protein-coding genes hgLoadSqlTab danRer4 ensGtp ~/kent/src/hg/lib/ensGtp.sql \ ensGtpProteinCoding.txt # only load IDs for the protein coding genes. The non-coding genes # have no protein ID. # Get the ensembl peptide sequences from # http://www.biomart.org/biomart/martview # Follow this sequence: # 1) Choose the Ensembl Genes 42 as the database and then # Danio Rerio genese (ZFISH6) as the dataset. # 2) Click on the Attributes link in the left panel. Select sequences. # 3) Expand the SEQUENCES section and choose Peptide as type of sequence # to export and then expand the Header Information section and select # Ensembl Gene ID from Gene Attributes and # Ensembl Transcript ID and Ensembl Peptide ID from # Transcript Attributes # 4) Click on the Filters link in the left panel and expand the GENE # section. Select the Gene type box and then select protein_coding as # these are the only genes with an associated protein sequence. # 5) Click on the Results link in the top black menu bar and # choose FASTA for the output and export all results to # Compressed file (notify by e-mail). # save the file as ensembl42Pep.fasta.gz and move to # /cluster/data/danRer4/bed/ensembl42 # Got results URL by e-mail but BioMart seems to be currently inaccessible ssh kkstore04 cd /cluster/data/danRer4/bed/ensembl42 gunzip ensembl42Pep.fasta.gz grep '>' ensembl42Pep.fasta | wc -l # 36048 grep '>' ensembl42Pep.fasta > headers awk 'BEGIN {FS="|"} {print $2;}' headers > pepTranscript.ids sort pepTranscript.ids | uniq > pepTranscript.ids.sort sort ensGene42ProteinCoding.ids | uniq > proteinCoding.ids.sort comm -13 proteinCoding.ids.sort pepTranscript.ids.sort # no difference comm -23 proteinCoding.ids.sort pepTranscript.ids.sort > noPep # There are 17 of these. # found some of them on the Ensembl zebrafish Genome Browser and found # the peptide sequences. E-mailed Ensembl's helpdesk to ask how to get # peptide sequences for these 17 transcript IDs (2007-01-09). # Then downloaded peptide sequences for just this set of 17, but only got # 16 of them. To do this, follow the instructions as above for the # obtaining the peptide sequences but on the Filters page, expand the GENE # section and check the box for ID list limit and select # Ensembl Transcript ID(s) and paste in the list. 
Name output file # otherIDs.fasta.gz gunzip otherIDs.fasta.gz grep '>' otherIDs.fasta > headers2 awk 'BEGIN {FS="|"} {print $2;}' headers2 > otherPepTranscript.ids sort otherPepTranscript.ids | uniq > otherPepTranscript.ids.sort comm -13 noPep otherPepTranscript.ids.sort # no difference comm -23 noPep otherPepTranscript.ids.sort # ENSDART00000049311 # Repeat above procedure to query for peptide sequence for just this # transcript ID and name file: otherIDs2.fasta.gz # E-mailed helpdesk@ensembl.org to report all these problems (2007-01-09) gunzip otherIDs2.fasta.gz # Concatenate all sequences: cat ensembl42Pep.fasta otherIDs.fasta otherIDs2.fasta > ens42Pep.fasta grep '>' ens42Pep.fasta | wc # 36065 grep '>' ens42Pep.fasta > all.headers awk 'BEGIN {FS="|"} {print $2;}' all.headers | sort | uniq > allTxIds.sort comm -13 proteinCoding.ids.sort allTxIds.sort # no difference comm -23 proteinCoding.ids.sort allTxIds.sort # no difference so got all protein sequences for the protein-coding # trancsript IDs now. # load into database ssh hgwdev cd /cluster/data/danRer4/bed/ensembl42 hgsql -e 'drop table ensPep;' danRer4 hgPepPred danRer4 ensembl ensembl42Pep.fasta # edit trackDb/zebrafish/danRer4 to have an ensGene entry with the # archive date for Enembl v42 which is used for creating stable archive # links for the transcript ID and protein ID to make sure that these # always connect to the correct version of Ensembl Genes. # added track handler to hgTracks.c for ensGeneNonCoding and added # code to hgc.c to handle creating the correct stable archive link for # a particular version of Ensembl. # trackDb/zebrafish/danRer4/trackDb.ra entries for ensGene and # ensGeneNonCoding include these lines for creating the correct URLs: # url http://dec2006.archive.ensembl.org/Danio_rerio/transview?transcript=$$ # urlName gene # archive dec2006 # Add Biotype and External Gene ID to the Ensembl Non-Coding genes table # These can be retrieved from BioMart using the method as above for # Biotype but also selecting the External Gene ID. Click on the Filter # link on the left panel and expand the GENE section and check the box # for Gene Type and select all types except for protein_coding. # Select TSV as the output and Compressed file (*.gz) as the format. 
# save as ensNonCoding.biotype.txt.gz
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensNonCoding.biotype.txt.gz
tail +2 ensNonCoding.biotype.txt > ensNonCoding.biotype.tab
cat << 'EOF' > ensBiotype.sql
CREATE TABLE ensBiotype (
    transcriptId varchar(255) not null,
    biotype varchar(255) not null,
    extGeneId varchar(255) not null
);
'EOF'
hgLoadSqlTab danRer4 ensBiotype ensBiotype.sql ensNonCoding.biotype.tab
# Add extra fields to the ensGeneNonCoding genePred table:
hgsql -e \
  'alter table ensGeneNonCoding add biotype varchar(255) NOT NULL;' \
  danRer4
hgsql -e \
  'alter table ensGeneNonCoding add extGeneId varchar(255) NOT NULL;' \
  danRer4
# Add index to the extGeneId column:
hgsql -e 'alter table ensGeneNonCoding add index(extGeneId);' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding;' danRer4
# 3560
hgsql -e 'update ensGeneNonCoding set biotype = "";' danRer4
hgsql -e 'update ensGeneNonCoding set extGeneId = "";' danRer4
# Now populate these columns with data from the ensBiotype table
hgsql -e 'select count(*) from ensGeneNonCoding as g, ensBiotype as b \
    where g.name = b.transcriptId;' danRer4
# 3560
hgsql -e 'update ensGeneNonCoding as g, ensBiotype as b \
    set g.biotype = b.biotype where g.name = b.transcriptId;' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding where biotype != "";' \
    danRer4
# 3560
# then set the External Gene ID:
hgsql -e 'update ensGeneNonCoding as g, ensBiotype as b \
    set g.extGeneId = b.extGeneId where g.name = b.transcriptId;' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding where extGeneId != "";' \
    danRer4
# 3393
# This is correct since 167 rows in the ensNonCoding.biotype.tab have no
# external Gene ID:
awk '{if ($3 == "") print;}' ensNonCoding.biotype.tab | wc -l
# 167
# 3393 + 167 = 3560
# Now check code in hgc.c for handling the details page for this track.
#########################################################################
# RADIATION HYBRID (RH) MAP TRACK (DONE, 2007-01-12 - 2007-01-23, hartera)
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
# Latest RH map sequences and primers received on 2006-10-03 from
# Anhua (Peter) Song - asong@enders.tch.harvard.edu
# Changed the name of rhMapInfo table and related files to rhMapZfishInfo
# to make the name more zebrafish-specific (2007-02-08, hartera)
# Remake track as one of the primer sequences was in the sequence for
# 1942C.INSERTMUT and also changed another marker name to remove a forward
# slash. Remade rhMapZfishInfo table and removed spaces from primer sequences.
# (2007-02-14, hartera) # Collected stats on RH map alignments for Yi Zhou (DONE, 2007-03-28, hartera) ssh kkstore04 mkdir /cluster/data/danRer4/bed/ZonLab/rhMap-2006-10-03 cd /cluster/data/danRer4/bed/ZonLab ln -s rhMap-2006-10-03 rhMap cd rhMap # download data files from e-mail: # rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip unzip rhSequenceSubmit100306.zip unzip rhSequenceSubmitSeq100306.zip dos2unix rhSequenceSubmit100306.txt dos2unix rhSequenceSubmitSeq100306.txt # Sequences are in rhSequenceSubmitSeq100306.txt and primers and other # information are in rhSequenceSubmi100306.txt grep '>' rhSequenceSubmitSeq100306.txt | wc -l # 11514 wc -l rhSequenceSubmit100306.txt # 13438 rhSequenceSubmit100306.txt grep '>' rhSequenceSubmitSeq100306.txt > rhMap.names # remove '>' from names and grab first field perl -pi.bak -e 's/>//' rhMap.names awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \ > rhMap.namesOnly.sort awk 'BEGIN {FS="|"} {print $1;}' rhSequenceSubmit100306.txt | sort | uniq \ > rhMapPrimers.namesOnly.sort wc -l *.sort # 11514 rhMap.namesOnly.sort # 13436 rhMapPrimers.namesOnly.sort (after removing blank line) # get a list of headers from the FASTA file grep '>' rhSequenceSubmitSeq100306.txt > rhMap.headers awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq # BAC_END # EST # GENE # SSLP # STS # There are 5 types of sequence here. awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq #BACends #Custom #Insertion_Mutant #Insertion_Mutants #MGH #NCBI #Sanger SG #Sequencing_Project #ThisseClone #Thisse_Clone #other_zfEst #wu_zfEst #wz awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq # CHBG # MPIEB # Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone; # So there are 11 different sources. # There are 2 sequences with problem primers. E-mailed Peter Song about # these and he suggested to delete thoser primers: # >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.| # >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A| # edit rhMap022306.fa and rhMapPrimers022306.txt and delete these primers. # need to reformat FASTA headers so they are in the format: # NAME.SOURCE.TYPE.ORIGIN # Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone # so change these to have the same name. Also shorten Sanger SG to # Shotgun. sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmitSeq100306.txt \ | sed -e 's/Insertion_Mutant/InsertMut/' \ | sed -e 's/Sanger SG/Shotgun/' \ | sed -e 's/ThisseClone/Thisse/' \ | sed -e 's/Thisse_Clone/Thisse/' \ | sed -e 's/Sequencing_Project/Seqproj/' > rhMap100306.fa # Do the same for the primers and information file: sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmit100306.txt \ | sed -e 's/Insertion_Mutant/InsertMut/' \ | sed -e 's/Sanger SG/Shotgun/' \ | sed -e 's/ThisseClone/Thisse/' \ | sed -e 's/Thisse_Clone/Thisse/' \ | sed -e 's/Sequencing_Project/Seqproj/' > rhMapPrimers100306.txt # edit these files to remove the extra newline char after the first primer # for 1942c and then change "/" in FJ34C05.Y1/FJ56G09.Y1.WU_ZFEST to # an underscore (2007-02-14, hartera) perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \ rhMap100306.fa perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \ rhMapPrimers100306.txt # use a script to reformat the names for the FASTA headers to the format # >NAME.SOURCE where name is the first field separated by "|" and source # is the 9th field. The source is used to make the name unique. 
Some # of these names are BAC ends that occur in the BAC ends track so there # are name clashes in the seq table if the names are not made unique. # Also make the name upper case as for those for the danRer1 and danRer2 # RH map and remove base numbering on each sequence line of FASTA file. cat << '_EOF_' > rhFix #!/usr/bin/awk -f #>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG| /^>/ { split(toupper($0), a, "\\|"); print a[1]"."a[9]; next; } /^[0-9]+ / { $0 = $2; } { print $0; } '_EOF_' # << keep emacs coloring happy chmod +x rhFix rhFix rhMap100306.fa > rhMap.fa # Blat sequences vs danRer4 genome ssh pk mkdir -p /cluster/data/danRer4/bed/ZonLab/rhMap/blatRun cd /cluster/data/danRer4/bed/ZonLab/rhMap # put the rhMap sequences on the san mkdir -p /san/sanvol1/scratch/danRer4/rhMap cp rhMap.fa /san/sanvol1/scratch/danRer4/rhMap/ # do blat run to align RH map sequences to danRer4 and and use # chrNA_random and chrUn_random separated into scaffolds. cd blatRun ls -1S /san/sanvol1/scratch/danRer4/rhMap/rhMap.fa > rhMap.lst ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa) ls -1 $f >> genome.lst end wc -l genome.lst # 3237 genome.lst # for output: mkdir -p /san/sanvol1/scratch/danRer4/rhMap/psl # use -repeats option to report matches to repeat bases separately # to other matches in the PSL output. echo '#LOOP\n/cluster/bin/x86_64/blat -repeats=lower -minIdentity=80 -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer4/rhMap/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst rhMap.lst template.sub para.spec para create para.spec para try, check, push ... etc. para time # Completed: 3237 of 3237 jobs #CPU time in finished jobs: 4787s 79.78m 1.33h 0.06d 0.000 y #IO & Wait Time: 8080s 134.67m 2.24h 0.09d 0.000 y #Average job time: 4s 0.07m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 18s 0.30m 0.01h 0.00d #Submission to last job: 752s 12.53m 0.21h 0.01d # need to do pslSort and lift up ssh pk cd /san/sanvol1/scratch/danRer4/rhMap # Do sort, liftUp and then best in genome filter. # only use alignments that have at least # 95% identity in aligned region. # Previously did not use minCover since a lot of sequence is in # Un and NA so genes may be split up so good to see all alignments. # However, found a number of short alignments of <= 50 bp. These are # not meaningful so maybe need to use minCover. If increased too much, # then hits on poor parts of the assembly will be missed. # use pslCDnaFilter with the same parameters as used for zebrafish # Genbank EST alignments. 
pslSort dirs raw.psl tmp psl pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 # drop minNonRepSize: 3068 1286657 # drop minIdent: 3442 104586 # drop minCover: 2838 205568 # weird over: 163 1124 # kept weird: 107 172 # drop localBest: 3011 17130 # kept: 11121 14216 # 11514 # The percentage aligned is 11121/11514 = 96.6% # Number of alignments for markers with most alignments after filtering: # 35 BZ83M20.Z.BACENDS # 17 ZKP63A5.YA.BACENDS # 17 ZKP117C9.YA.BACENDS # 16 ZK30E10.SP6.BACENDS # 15 ZC133H17.ZA.BACENDS # 12 Z13442.MGH # 11 ZK105J10.T7.BACENDS # 10 ZC261G9.ZAF.BACENDS # 10 ZC261G9.ZA.BACENDS # 9 ZK19H9.SP6.BACENDS # 9 Z4910.MGH # 9 FJ07G09.X1.WU_ZFEST # 8 ZK4I5.T7.BACENDS # 8 ZC27I3.ZA.BACENDS pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.92 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 # drop minNonRepSize: 3068 1286657 # drop minIdent: 2740 60578 # drop minCover: 3083 223430 # weird over: 318 3132 # kept weird: 154 249 # drop localBest: 3480 43022 # kept: 11212 14470 # Percentage aligned is 11212/11514 = 97.4% pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=10 \ -ignoreNs -bestOverlap -minId=0.92 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 #drop minNonRepSize: 3026 1258275 # drop minIdent: 2902 72521 # drop minCover: 3256 231002 # weird over: 344 3365 # kept weird: 157 252 # drop localBest: 3604 51799 # kept: 11228 14560 # There isn't much difference 11228/11514 = 97.5% awk '{print $10}' contig.psl | sort | uniq -c | sort -nr # Top numbers of hits: # 35 BZ83M20.Z.BACENDS # 17 ZKP63A5.YA.BACENDS # 17 ZKP117C9.YA.BACENDS # 16 ZK30E10.SP6.BACENDS # 15 ZC133H17.ZA.BACENDS # 13 FJ07G09.X1.WU_ZFEST # 12 Z13442.MGH # 11 ZK105J10.T7.BACENDS # 10 ZC261G9.ZAF.BACENDS # 10 ZC261G9.ZA.BACENDS # 9 ZK19H9.SP6.BACENDS # 9 Z4910.MGH # 9 Z3157.MGH # 8 ZK4I5.T7.BACENDS pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.90 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 # drop minNonRepSize: 3068 1286657 # drop minIdent: 2306 34000 # drop minCover: 3166 230461 # weird over: 388 5030 # kept weird: 168 270 # drop localBest: 3647 62505 # kept: 11232 14534 # Percent sequences aligned: 11232/11514 = 97.6% pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.90 -minCover=0.20 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 # drop minNonRepSize: 3068 1286657 # drop minIdent: 2306 34000 # drop minCover: 3418 245102 # weird over: 343 4235 # kept weird: 159 252 # drop localBest: 3206 48291 # kept: 11189 14107 # Percent sequences aligned: 11189/11514 = 97.2% pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.80 -minCover=0.20 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 #drop minNonRepSize: 3068 1286657 # drop minIdent: 1 2 # drop minCover: 3599 256955 # weird over: 414 8594 # kept weird: 173 270 # drop localBest: 3410 70389 # kept: 11205 14154 # Percent sequences aligned: 11205/11514 = 97.3% # 35 BZ83M20.Z.BACENDS # 17 ZKP63A5.YA.BACENDS # 17 ZKP117C9.YA.BACENDS # 16 ZK30E10.SP6.BACENDS # 15 ZC133H17.ZA.BACENDS # 13 FJ07G09.X1.WU_ZFEST # 11 ZK105J10.T7.BACENDS # 10 
ZC261G9.ZAF.BACENDS # 10 ZC261G9.ZA.BACENDS # 9 ZK19H9.SP6.BACENDS # 9 Z4910.MGH # 8 ZK4I5.T7.BACENDS # 8 ZC27I3.ZA.BACENDS # 8 Z7243.MGH # 8 Z3157.MGH pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.80 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 #drop minNonRepSize: 3068 1286657 # drop minIdent: 1 2 # drop minCover: 3322 238087 # weird over: 470 9995 # kept weird: 181 288 # drop localBest: 3876 88821 # kept: 11246 14590 # Percent sequences aligned: 11246/11514 = 97.7% # Use lower minId and higher minCover (0.20) as for the BAC ends and for # the RH map on other zebrafish assemblies. pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.85 -minCover=0.20 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 #drop minNonRepSize: 3068 1286657 # drop minIdent: 775 3806 # drop minCover: 3552 255528 # weird over: 403 7578 # kept weird: 171 268 # drop localBest: 3358 68020 # kept: 11203 14146 # 97.3% (11203/11514) of sequences are aligned using these filter criteria # Loaded these sequences as below and then checked the rhMap track in the # danRer4 Genome Browser to see if there are any pileups. # there is one big pileup on chr24 that is in the same region as # that was found for danRer3 after using liftOver: # i.e. chr13:8,112,962-8,113,055 on danRer3 which lifts over to # chr24:8,191,404-8,191,497 on danRer4 and there is also a pileup # of RH map sequences here. If you look at Z33743, it has 3 alignments # to chr23, chr24 and chrNA_random. The chr23 alignment is the best and # this is where its primers map to. If a higher threshold is taken # for min coverage in the filtering, this may be avoided. Checked all the # whole chromosome views in the Browser and chr24 is the only one that # appears to have this large pileup. 
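    # Pileups can also be screened for outside the Browser; a sketch that
    # bins alignment starts into 100 kb windows on the chrom-level psl
    # (rhMap.psl, created below) and reports the densest windows
    # (psl column 14 is tName, column 16 is tStart):
    awk '{print $14 "\t" int($16/100000)}' rhMap.psl \
        | sort | uniq -c | sort -nr | head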
    # try increasing the minCover parameter:
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.25 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       3754    271241
    # weird over:           358      6379
    # kept weird:           157       252
    # drop localBest:      2916     52769
    # kept:               11100     13684
    # Percent sequences aligned: 11100/11514 = 96.4%
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.30 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       3929    283124
    # weird over:           310      5451
    # kept weird:           145       236
    # drop localBest:      2549     41325
    # kept:               10938     13245
    # Percent sequences aligned: 10938/11514 = 95.0%
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.40 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       4293    298517
    # weird over:           245      4052
    # kept weird:           128       211
    # drop localBest:      2079     26658
    # kept:               10489     12519
    # Percent sequences aligned: 10489/11514 = 91.1%
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.35 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       4119    292022
    # weird over:           274      4640
    # kept weird:           137       227
    # drop localBest:      2279     32801
    # kept:               10724     12871
    # Percent sequences aligned: 10724/11514 = 93.1%
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.32 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       4001    287002
    # weird over:           296      5113
    # kept weird:           144       235
    # drop localBest:      2437     37599
    # kept:               10862     13093
    # Percent sequences aligned: 10862/11514 = 94.3%
    rm contig*
    # Final parameters: use minCover=0.33
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.33 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       4045    288763
    # weird over:           287      4946
    # kept weird:           142       233
    # drop localBest:      2375     35906
    # kept:               10818     13025
    # Percent sequences aligned: 10818/11514 = 94.0%
    # This is a compromise between reducing the number of sequences piling
    # up on chr24 and not losing all alignments for too many sequences.
    cd /cluster/data/danRer4/bed/ZonLab/rhMap
    # lift up to genome level coordinates
    rm rhMap*psl
    liftUp rhMap.psl \
        /cluster/data/danRer4/jkStuff/liftAll.lft warn \
        /san/sanvol1/scratch/danRer4/rhMap/contig.psl
    # Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
    pslCheck rhMap.psl
    # psl looks ok
    # cleanup
    rm *.bak rhMap.headers rhMap.names *.sort headers.new
    # Load sequence alignments into the database
    ssh hgwdev
    cd /cluster/data/danRer4/bed/ZonLab/rhMap
    # drop test tables and reload final psl file
    # drop old rhMap table
    hgsql -e 'drop table rhMap;' danRer4
    hgLoadPsl danRer4 rhMap.psl
    # Copy sequences to gbdb if they are not already there.
    mkdir -p /gbdb/danRer4/rhMap
    # remove old sequences
    rm /gbdb/danRer4/rhMap/rhMap20061003.fa
    ln -s \
        /cluster/data/danRer4/bed/ZonLab/rhMap/rhMap.fa \
        /gbdb/danRer4/rhMap/rhMap20061003.fa
    # then add sequences to database:
    # remove old sequences (2007-02-14, hartera)
    hgsql -e 'select * from extFile where path like "%rhMap%";' danRer4
    # +--------+------------------+--------------------------------------+---------+
    # | id     | name             | path                                 | size    |
    # +--------+------------------+--------------------------------------+---------+
    # | 709793 | rhMap20061003.fa | /gbdb/danRer4/rhMap/rhMap20061003.fa | 7456887 |
    # +--------+------------------+--------------------------------------+---------+
    hgsql -e 'select count(*) from seq where extFile = 709793;' danRer4
    # 11514
    hgsql -e 'delete from seq where extFile = 709793;' danRer4
    hgsql -e 'delete from extFile where id = 709793;' danRer4
    # then reload the new sequence file
    hgLoadSeq danRer4 /gbdb/danRer4/rhMap/rhMap20061003.fa
    # loaded successfully
    # Check in the Browser and see if there are many pileups.
    # Much reduced now on chr24. Took 10 random sequences in the pileup from
    # minCover=0.20 and found that 7 of them still align to danRer4 with
    # minCover=0.33, and 2 of those that do not also have primers that do not
    # map using the hgPcr tool.
    # Add trackDb entry and also an rhMap.html for trackDb/zebrafish/danRer4;
    # also add the search specs for hgFindSpec to trackDb.ra.
    # Add table of related information for the RH map details pages:
    # Check that all the headers from rhMap.headers are also in the primers
    # file, which seems to contain the same headers as the FASTA file
    # as well as additional markers.
    # Remake the rhMapZfishInfo table too (hartera, 2007-02-14) so that the
    # newline is removed from the 1942C.INSERTMUT line and the underscore is
    # added to the FJ34C05.Y1_FJ56G09.Y1.WU_ZFEST ID in place of "/".
    ssh kkstore04
    cd /cluster/data/danRer4/bed/ZonLab/rhMap/
    grep '>' rhMap100306.fa > rhMap.headers
    perl -pi.bak -e 's/>//' rhMap.headers
    sort rhMap.headers > rhMap.headers.sort
    sort rhMapPrimers100306.txt > rhMapPrimers.sort
    wc -l *.sort
    # 11514 rhMap.headers.sort
    # 13437 rhMapPrimers.sort
    comm -12 rhMap.headers.sort rhMapPrimers.sort | wc -l
    # 11514 in common
    # so all FASTA headers from rhMap100306.fa are in the primers file
    # Get headers again from the rhMap.fa file as the names of the sources have
    # been changed. Parse out information from the headers to add to an
    # rhMapInfo table so that this information can be displayed on the details
    # page for the RH map markers.
    # Fields used from each "|"-separated header: 1 - name, 2 - linkage group
    # (chrom), 3 - position number on the RH map for that linkage group,
    # 4 - distance (in cR) from the top of the linkage group, 5 - type of
    # marker (SSLP, BAC_END, EST, GENE, STS), 9 - source, 10 - institute that
    # mapped the marker, 11 - 5' forward primer, 12 - 3' reverse primer.
    # Sort headers by linkage group and by position
    grep '>' rhMap100306.fa > rhMap.headers2
    # then use the rhMap.headers2 file to extract the marker information
    # and to reformat the names from the FASTA headers to the format
    # >NAME.SOURCE, where name is the first "|"-separated field and source
    # is the 9th field, so that names in the rhMap and rhMapInfo tables are
    # the same. The source is used to make the name unique.
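    # As a worked example of that reformatting (using the sample header quoted
    # in the getRhInfo script below), a FASTA header such as
    #   >z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
    # becomes the upper-cased, tab-separated rhMapInfo row:
    #   Z1396.MGH  LG14  418  5707  SSLP  MGH  MPIEB  ATCCTTCAGCCACTCCTTCA  TGGAACCTGAAAAACACACG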
    cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
    sub(/>/,"",$0);
    split(toupper($0), a, "\\|");
    print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
    next;
}
'_EOF_'
    # << keep emacs coloring happy
    chmod +x getRhInfo
    getRhInfo rhMap.headers2 > rhMapInfo.tab
    # Sort headers by linkage group (LG) and by position
    sort -k 2,2 -k 3,3n rhMapInfo.tab > rhMapInfoSorted.tab
    wc -l rhMapInfoSorted.tab
    # 11514 rhMapInfoSorted.tab
    # Need to add ZFIN IDs - data received on 2006-06-23:
    # rhSeqWithZdbNameToRachel.zip
    unzip rhSeqWithZdbNameToRachel.zip
    tail +3 rhSeqWithZdbNameToRachel.txt \
        | awk 'BEGIN {OFS= "\t"} {print $1, $7}' \
        | sort | uniq > rhSeqZfinIds.txt
    # translate names to upper case
    cat rhSeqZfinIds.txt | tr '[a-z]' '[A-Z]' > rhSeqZfinIds.format.txt
    # then map these marker names and ZFIN IDs to markers in
    # rhMapInfoSorted.tab. Also remove spaces - some of the primer sequences
    # have spaces (hartera, 2007-02-14)
    cat << 'EOF' > mapZfinIds.pl
#!/usr/bin/perl -w
use strict;
my ($zf, $rh, %zfinIds);
$zf = $ARGV[0];   # file of ZFIN IDs and marker names
$rh = $ARGV[1];   # rhMapInfo.tab file
open(ZFIN, $zf) || die "Can not open $zf :$!\n";
open(RH, $rh) || die "Can not open $rh : $!\n";
while (<ZFIN>) {
   my ($line, @fi);
   chomp;
   $line = $_;
   @fi = split(/\t/, $line);
   # store ZFIN ID in hash keyed by marker name
   $zfinIds{$fi[1]} = $fi[0];
}
close ZFIN;
# read in the markers from rhMapInfo file
while (<RH>) {
   my ($li, @f, $marker, @m, $mName, $j, $i);
   $mName = "";
   $zf = "";
   chomp;
   $li = $_;
   @f = split(/\t/, $li);
   $marker = $f[0];
   # split by "." and remove the extension after the last "."
   @m = split(/\./, $marker);
   $mName = $m[0];
   if (($mName ne "") && (exists($zfinIds{$mName}))) {
      $zf = $zfinIds{$mName};
   }
   for ($j = 1; $j < $#m; $j++) {
      $mName = $mName . "." . $m[$j];
   }
   if (($mName ne "") && (exists($zfinIds{$mName}))) {
      $zf = $zfinIds{$mName};
   }
   print "$f[0]\t$zf";
   # print other fields and remove spaces
   for ($i = 1; $i <= $#f; $i++) {
      $f[$i] =~ s/\s//g;
      print "\t$f[$i]";
   }
   if ($#f == 6) {
      print "\t\t";
   }
   print "\n";
}
'EOF'
    chmod +x mapZfinIds.pl
    perl mapZfinIds.pl rhSeqZfinIds.format.txt rhMapInfoSorted.tab \
        > rhMapInfoWithZfinIds.tab
    # There are 1867 markers with no ZFIN ID
    wc -l rhMapInfo*
    # 11514 rhMapInfo.tab
    # 11514 rhMapInfoSorted.tab
    # 11514 rhMapInfoWithZfinIds.tab
    # When loading, found that 1942.C has only 1 primer. Problem with
    # rhMapPrimers100306.txt: there was a newline between the primers in this
    # file, so remove it there and in rhMap100306.fa and then process it again
    # (now this was done at an earlier step, 2007-02-14).
    # Create a table with RH map item information including type, source,
    # origin and primer sequences.
    # Already created rhMapInfo.sql, rhMapInfo.c and rhMapInfo.h files
    # using autoSql - see danRer3.txt. None of the assemblies with the RH
    # map on the RR have this rhMapInfo table so it can be redefined.
    # Load these into a table called rhMapInfo2 - this is rhMapInfo
    # with an extra column for the ZFIN ID.
    # Use autoSql to create a .sql file.
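    # Before loading, a quick sanity check on the merged file (a sketch; the
    # 1867 figure comes from the note above). The ZFIN ID is the second
    # tab-separated column, left empty when no ID was found:
    wc -l rhMapInfoWithZfinIds.tab
    # expect 11514
    awk -F'\t' '$2 == ""' rhMapInfoWithZfinIds.tab | wc -l
    # expect 1867 markers with no ZFIN ID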
    ssh hgwdev
    # rename the information table and make it zebrafish specific
    # (2007-02-08, hartera)
    cat << 'EOF' > ~/kent/src/hg/lib/rhMapZfishInfo.as
table rhMapZfishInfo
"Zebrafish Radiation Hybrid map information"
    (
    string name;        "Name of Radiation Hybrid (RH) map marker"
    string zfinId;      "ZFIN ID for the marker"
    string linkageGp;   "Linkage group to which the marker was mapped"
    uint position;      "Position number in RH map for this linkage group"
    uint distance;      "Distance from the top of linkage group (cR)"
    string markerType;  "Type of marker"
    string source;      "Source of marker"
    string mapSite;     "Institution that mapped the marker"
    string leftPrimer;  "Forward primer sequence"
    string rightPrimer; "Reverse primer sequence"
    )
'EOF'
    # << happy emacs
    # create .sql, .c and .h files using autoSql
    cd ~/kent/src/hg/lib
    autoSql rhMapZfishInfo.as rhMapZfishInfo
    mv rhMapZfishInfo.h ../inc
    # edit rhMapZfishInfo.sql and add an index (INDEX(zfinId)).
    # commit these files (*.as, *.sql, *.c and *.h) to CVS, replacing
    # the original rhMapInfo* files.
    # make changes to hgc so that it prints the ZFIN ID in addition to the
    # other rhMapZfishInfo fields.
    # reload table with new name (2007-02-08, hartera):
    cd /cluster/data/danRer4/bed/ZonLab/rhMap
    hgsql -e 'drop table rhMapInfo;' danRer4
    # reloaded the rhMapZfishInfo table (2007-02-08, hartera)
    hgsql -e 'drop table rhMapZfishInfo;' danRer4
    hgLoadSqlTab danRer4 rhMapZfishInfo ~/kent/src/hg/lib/rhMapZfishInfo.sql \
        rhMapInfoWithZfinIds.tab
    # add code to hgc.c to print the ZFIN ID, if available, on the details page
    # together with the other marker-related information.
    # added track to trackDb.ra in trackDb/zebrafish/danRer4 with a URL for
    # the ZFIN IDs to link to the relevant page at http://www.zfin.org
    # and added an html page for the track.
    # Added the rhMapZfishInfo.h file to the makefile in src/hg/lib
    # and replaced rhMapInfo with rhMapZfishInfo in src/hg/hgc/hgc.c

    # RH MAP STATISTICS
    # Get some stats for Yi Zhou at Harvard (2007-03-20 & 2007-03-28)
    # Of the 11514 markers with sequence information, 10818 aligned (94%)
    # using a filter requiring 85% sequence identity, with all portions of all
    # alignments for a sequence within 0.5% of the identity of the best
    # alignment for each portion of the marker. At least 0.33 of the query
    # sequence must be aligned and at least 16 bases must lie outside repeat
    # regions.
    cd /cluster/data/danRer4/bed/ZonLab/rhMap
    mkdir stats
    cd stats
    hgsql -e 'select count(distinct(qName)) from rhMap;' danRer4
    # 10818
    hgsql -N -e 'select qName from rhMap;' danRer4 | sort | uniq -c \
        | sort -nr > qNames.count
    # send this list too
    # 1701 markers have 2 or more BLAT alignments that pass the filter.
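    # The "2 or more alignments" figure can be read straight from qNames.count
    # (a sketch; uniq -c puts the alignment count in column 1):
    awk '$1 >= 2' qNames.count | wc -l
    # expect 1701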
    hgsql -N -e 'select name, linkageGp from rhMapZfishInfo;' danRer4 \
        > markers.linkageGroups
    hgsql -N -e 'select qName, tName from rhMap;' danRer4 > rhMap.align.chroms
    ssh kkstore04
    cd /cluster/data/danRer4/bed/ZonLab/rhMap/stats
    sed -e 's/LG/chr/' markers.linkageGroups > markers.rhMap.chroms
    # some marker names contain "LG"
    awk '{print $1}' markers.linkageGroups | grep "LG"
    # there are 18 and all begin with "TLG"
    sed -e 's/Tchr/TLG/' markers.rhMap.chroms > markers.rhMap.chroms2
    sort markers.rhMap.chroms2 | uniq > markers.rhMap.chroms.sort
    wc -l markers.rhMap.chroms*
    # 11514 markers.rhMap.chroms
    # 11514 markers.rhMap.chroms.sort
    # 11514 markers.rhMap.chroms2
    # same when uniqued
    sort rhMap.align.chroms | uniq > rhMap.align.chroms.sort
    wc -l rhMap.align*
    # 13025 rhMap.align.chroms
    # 11344 rhMap.align.chroms.sort
    # Find how well the RH map and Zv6 agree in terms of chromosome
    # assignment, given that the linkage group number is the same as the
    # chromosome number.
    comm -23 rhMap.align.chroms.sort markers.rhMap.chroms.sort \
        > diffChromInGenome
    # need to find just those in rhMap.align.chroms.sort that are
    # in rhMap.
    awk '{print $1}' rhMap.align.chroms.sort | sort | uniq > rhMap.align.names
    foreach n (`cat rhMap.align.names`)
        echo $n
        grep -w $n markers.rhMap.chroms.sort >> markers.rhMap.chroms.aligned
    end
    # 10818 in markers.rhMap.chroms.aligned
    # 10818 rhMap.align.names
    # then compare this list to the ones that are aligned to the genome
    comm -13 rhMap.align.chroms.sort markers.rhMap.chroms.aligned \
        > diffChromInRHMap
    wc -l diffChromInRHMap
    # 1392 diffChromInRHMap
    # These are the markers that have a different chromosome (linkage group)
    # assigned in the RH map from that found by BLAT alignment of the marker
    # sequence to the genome. This list shows the linkage group (chr)
    # assignments in the RH map; next, generate a list of where these markers
    # align in the genome. (Markers with at least one alignment to the same
    # chrom as in the linkage map may also be aligning to other chroms.)
    awk '{print $1}' diffChromInRHMap > diffChromInRHMap.names
    foreach n (`cat diffChromInRHMap.names`)
        echo $n
        grep -w $n rhMap.align.chroms.sort >> rhMap.genomeAlign.diffInRHmap
    end
    wc -l rhMap.genomeAlign.diffInRHmap
    # 1562 rhMap.genomeAlign.diffInRHmap
    # This is the list of markers that differ in chrom between the RH map
    # and the genome alignment, together with the chroms to which each marker
    # is aligned by BLAT. There are more lines in this file than markers
    # because some markers align more than once to the genome and so appear
    # more than once.
    # Therefore, of the 10818 markers aligned, 1392 (12.9%) align to a
    # different chromosome.
    # Some of these may be aligning to chrUn_random or chrNA_random.
    grep random rhMap.genomeAlign.diffInRHmap | awk '{print $1}' \
        | sort | uniq > diffInRHmap.alignedToRandom
    wc -l diffInRHmap.alignedToRandom
    # 142 diffInRHmap.alignedToRandom
    # Of the markers with different chroms in the genome alignment and the
    # linkage map, 142 (1.3% of 10818) align to chrUn_random or chrNA_random,
    # so the sequence containing these markers has not yet been placed on a
    # chromosome.
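    # The two csh foreach loops above run one grep per marker. An equivalent
    # way to build the same per-marker subsets (a sketch, not how it was done
    # here) is to join on the marker name; this assumes both inputs are sorted
    # on the first field, as the sort/uniq steps above produce, and note that
    # join writes space-separated output rather than tabs:
    # markers with at least one alignment, joined to their RH map chrom
    join rhMap.align.names markers.rhMap.chroms.sort \
        > markers.rhMap.chroms.aligned
    # genome alignments for the markers whose RH map chrom disagrees
    join diffChromInRHMap.names rhMap.align.chroms.sort \
        > rhMap.genomeAlign.diffInRHmap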
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
    hgsql -h genome-testdb hgcentraltest \
        -e "update dbDb set orderKey = 450 where name = 'danRer4';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in the gbMiscDiff table being created.
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer4
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-01-19,20 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
    cd /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
    cat << '_EOF_' > DEF
# Zebrafish vs. Medaka
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer4, no randoms or Un in this sequence
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    time doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/danRer4OryLat1 > do.log 2>&1 &
    # real    556m6.806s
    cat fb.danRer4.chainOryLat1Link.txt
    # 209746583 bases of 1626093931 (12.899%) in intersection
    cd /cluster/data/danRer4/bed
    ln -s blastz.oryLat1.2007-01-19 blastz.oryLat1

    ## swap to oryLat1 - also in oryLat1.txt
    mkdir /cluster/data/oryLat1/bed/blastz.swap.danRer4
    cd /cluster/data/oryLat1/bed/blastz.swap.danRer4
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    cat fb.oryLat1.chainDanRer4Link.txt
    # 156014546 bases of 700386597 (22.275%) in intersection
    cd /cluster/data/oryLat1/bed
    ln -s blastz.swap.danRer4 blastz.danRer4
#########################################################################
# BLASTZ/CHAIN/NET fr2 (DONE - 2007-01-29 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
    cd /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
    cat << '_EOF_' > DEF
# Zebrafish vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer4, no randoms or Un in this sequence
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/danRer4/bed/blastz.fr2.2007-01-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/danRer4Fr2 > do.log 2>&1 &
    ## recover from pk kluster problems and finish blastz job
    time doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=cat -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/danRer4Fr2 > cat.log 2>&1 &
    ## recover from kki kluster problems and finish chain job
    time doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=chainMerge -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/danRer4Fr2 > chainMerge.log 2>&1 &
    # real    554m13.214s

    ## swap
    mkdir /cluster/data/fr2/bed/blastz.danRer4.swap
    cd /cluster/data/fr2/bed/blastz.danRer4.swap
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/danRer4/bed/blastz.fr2.2007-01-29/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    # running 2007-01-30 - 16:35
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/danRer4/bed/blastz.fr2.2007-01-29/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &

    ssh hgwdev
    cd /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
    time nice -n +19 featureBits danRer4 chainFr2Link \
        > fb.danRer4.chainFr2Link.txt 2>&1
    # 138918185 bases of 1626093931 (8.543%) in intersection
    time nice -n +19 featureBits fr2 chainDanRer4Link \
        > fb.fr2.chainDanRer4Link.txt 2>&1
    # 80963231 bases of 393312790 (20.585%) in intersection
    # ASZ (3-22-2007) this process failed to create four tables, so I created
    # them and left them empty (as discussed with Hiram).
    CREATE TABLE `danRer4`.`chrUn_random_chainFr2` (
        `bin` smallint(5) unsigned NOT NULL default '0',
        `score` double NOT NULL default '0',
        `tName` varchar(255) NOT NULL default '',
        `tSize` int(10) unsigned NOT NULL default '0',
        `tStart` int(10) unsigned NOT NULL default '0',
        `tEnd` int(10) unsigned NOT NULL default '0',
        `qName` varchar(255) NOT NULL default '',
        `qSize` int(10) unsigned NOT NULL default '0',
        `qStrand` char(1) NOT NULL default '',
        `qStart` int(10) unsigned NOT NULL default '0',
        `qEnd` int(10) unsigned NOT NULL default '0',
        `id` int(10) unsigned NOT NULL default '0',
        KEY `bin` (`bin`),
        KEY `id` (`id`)
    ) TYPE=MYISAM;

    CREATE TABLE `danRer4`.`chrUn_random_chainFr2Link` (
        `bin` smallint(5) unsigned NOT NULL default '0',
        `tName` varchar(255) NOT NULL default '',
        `tStart` int(10) unsigned NOT NULL default '0',
        `tEnd` int(10) unsigned NOT NULL default '0',
        `qStart` int(10) unsigned NOT NULL default '0',
        `chainId` int(10) unsigned NOT NULL default '0',
        KEY `bin` (`bin`),
        KEY `chainId` (`chainId`)
    ) TYPE=MYISAM;

    CREATE TABLE `danRer4`.`chrNA_random_chainFr2` (
        `bin` smallint(5) unsigned NOT NULL default '0',
        `score` double NOT NULL default '0',
        `tName` varchar(255) NOT NULL default '',
        `tSize` int(10) unsigned NOT NULL default '0',
        `tStart` int(10) unsigned NOT NULL default '0',
        `tEnd` int(10) unsigned NOT NULL default '0',
        `qName` varchar(255) NOT NULL default '',
        `qSize` int(10) unsigned NOT NULL default '0',
        `qStrand` char(1) NOT NULL default '',
        `qStart` int(10) unsigned NOT NULL default '0',
        `qEnd` int(10) unsigned NOT NULL default '0',
        `id` int(10) unsigned NOT NULL default '0',
        KEY `bin` (`bin`),
        KEY `id` (`id`)
    ) TYPE=MYISAM;

    CREATE TABLE `danRer4`.`chrNA_random_chainFr2Link` (
        `bin` smallint(5) unsigned NOT NULL default '0',
        `tName` varchar(255) NOT NULL default '',
        `tStart` int(10) unsigned NOT NULL default '0',
        `tEnd` int(10) unsigned NOT NULL default '0',
        `qStart` int(10) unsigned NOT NULL default '0',
        `chainId` int(10) unsigned NOT NULL default '0',
        KEY `bin` (`bin`),
        KEY `chainId` (`chainId`)
    ) TYPE=MYISAM;

###########################################################################
# CREATE LIFTOVER FROM danRer4 TO danRer5
# (DONE, 2007-09-21 - 2007-09-22, hartera)
    ssh kkstore04
    mkdir /cluster/data/danRer4/bed/blat.danRer5
    cd /cluster/data/danRer4/bed/blat.danRer5
    time nice doSameSpeciesLiftOver.pl danRer4 danRer5 \
        -bigClusterHub pk \
        -ooc /san/sanvol1/scratch/danRer4/danRer4_11.ooc \
        -buildDir=/cluster/data/danRer4/bed/blat.danRer5 >& do.log &
    # 0.337u 0.208s 4:58:26.59 0.0%  0+0k 0+0io 28pf+0w
    # Remove symbolic link to liftOver chains and copy over the file
    rm ../liftOver/danRer4ToDanRer5.over.chain.gz
    cp -p danRer4ToDanRer5.over.chain.gz ../liftOver
    # a link in /usr/local/apache/htdocs/goldenPath/danRer5/liftOver has
    # already been made to this file and md5sum.txt needs to be updated
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
    md5sum *.gz > md5sum.txt
    md5sum *.gz > ../../goldenPath/liftOver/md5sum.txt
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
    ln -s /cluster/data/danRer5/bed/liftOver/danRer4ToDanRer5.over.chain.gz .
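    # A quick spot check of the new chain file (a sketch, not part of the
    # original log; rhMapSample.bed is a hypothetical small BED of danRer4
    # positions, e.g. a few rhMap intervals). liftOver should map most of
    # them and report the rest in the unmapped file:
    liftOver rhMapSample.bed \
        /cluster/data/danRer4/bed/liftOver/danRer4ToDanRer5.over.chain.gz \
        rhMapSample.danRer5.bed rhMapSample.unmapped.bed
    wc -l rhMapSample.danRer5.bed rhMapSample.unmapped.bed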
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
    # received predictions from Sam Gross
    cd /cluster/data/danRer4/bed/contrastGene/
    wget http://www.stanford.edu/~ssgross/contrast.danRer4.bed
    # this is a custom track, not a pure BED
    tail +2 contrast.danRer4.bed | hgLoadBed -tab danRer4 contrastGene stdin
    # verify
    # load track db (ra and contrastGene.html are global)
    # request push of contrastGene
###########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
    update genbank.conf:
        danRer4.upstreamGeneTbl = refGene
        danRer4.upstreamMaf = multiz7way /hive/data/genomes/danRer4/bed/multiz7way/species.lst
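    # Quick checks (a sketch, not part of the original log): confirm the
    # contrastGene load and that the upstream settings above point at
    # existing inputs.
    hgsql -e 'select count(*) from contrastGene;' danRer4
    featureBits danRer4 contrastGene
    hgsql -e 'select count(*) from refGene;' danRer4
    ls -l /hive/data/genomes/danRer4/bed/multiz7way/species.lst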