# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on the # Dog (Canis familiaris) May 2005 release. # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | nscanGene | genePred nscanPep | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ # CREATE BUILD DIRECTORY (DONE 6/1/05 angie) ssh kkstore01 mkdir /cluster/store9/canFam2 ln -s /cluster/store9/canFam2 /cluster/data/canFam2 # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 6/1/05 angie) mkdir /cluster/data/canFam2/M cd /cluster/data/canFam2/M # go to http://www.ncbi.nih.gov/ and search Nucleotide for # "canis familiaris mitochondrion genome". That shows the gi number: # 17737322 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=17737322&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Canis familiaris mitochondrion complete genome, and then replace the # header line with just ">chrM". # MAKE JKSTUFF AND BED DIRECTORIES (DONE 6/1/05 angie) # This used to hold scripts -- better to keep them inline in the .doc # so they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/canFam2/jkStuff # This is where most tracks will be built: mkdir /cluster/data/canFam2/bed # DOWNLOAD AGP, FASTA & QUAL (DONE 6/2/05 angie) ssh kkstore01 mkdir /cluster/data/canFam2/broad cd /cluster/data/canFam2/broad ftp ftp.broad.mit.edu prompt bin cd pub/assemblies/mammals/canFam2 mget contigs.bases.gz AAEX02.full_AGP Dog2.0.agp supercontigs mget assembly.agp assembly.format contigs.quals.gz supercontigs.summary bye # Jean Chang is going to remove AAEX02.full_AGP to avoid confusion, # so make sure we can tell people how to quickly regenerate it from # Dog2.0.agp: perl -wpe 'if (/contig_(\d+)/) { \ $a = sprintf "AAEX020%05d", $1+1; s/contig_\d+/$a/; }' \ Dog2.0.agp > /tmp/1.agp diff /tmp/1.agp AAEX02.full_AGP | wc -l #0 # Meanwhile, the AGP has chr01, chr02 instead of chr1, chr2... yecch. 
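# (A hedged side note for the chrM.fa header edit described above: that edit
# was done by hand, but assuming the downloaded FASTA holds a single sequence
# with its header on line 1, something like this would do the same thing.
# These commands were not part of the original build, so they are shown
# commented out:)
#   grep '^>' /cluster/data/canFam2/M/chrM.fa
#     (verify it says Canis familiaris mitochondrion, complete genome)
#   perl -i -wpe 's/^>.*/>chrM/ if $. == 1;' /cluster/data/canFam2/M/chrM.fa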
# Substitute those out: sed -e 's/^chr0/chr/' Dog2.0.agp > UCSC_Dog2.0.agp # BUILD CHROM FA (DONE 6/28/05 angie) ssh kkstore01 cd /cluster/data/canFam2/broad awk '{print $1;}' UCSC_Dog2.0.agp | uniq > ../chrom.lst nice gunzip contigs.bases.gz foreach chr (`cat ../chrom.lst`) set c = `echo $chr | sed -e 's/chr//'` mkdir ../$c awk '$1 == "'$chr'" {print;}' UCSC_Dog2.0.agp > ../$c/$chr.agp agpToFa -simpleMultiMixed ../$c/$chr.agp $chr ../$c/$chr.fa contigs.bases end faSize ../*/chr*.fa #2531673953 bases (146677410 N's 2384996543 real 2384996543 upper 0 lower) in 41 sequences in 41 files #Total size: mean 61748145.2 sd 25246010.8 min 16727 (chrM) max 126883977 (chrX) median 61280721 #N count: mean 3577497.8 sd 1401887.8 #U count: mean 58170647.4 sd 24664411.9 #L count: mean 0.0 sd 0.0 # checkAgpAndFa prints out way too much info -- keep the end/stderr only: cd /cluster/data/canFam2 foreach agp (?{,?}/chr*.agp) set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 end nice gzip broad/contigs.bases echo "chrM" >> chrom.lst # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 6/28/05 angie) ssh kkstore01 cd /cluster/data/canFam2 foreach agp (?{,?}/chr*.agp) set fa = $agp:r.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak nice splitFaIntoContigs $agp $fa . -nSize=5000000 end # No _random's in this assembly, so no need to clean up after them. # Make a "pseudo-contig" for processing chrM too: mkdir M/chrM_1 sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa mkdir M/lift echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst echo "chrM_1" > M/lift/ordered.lst set msize = `faSize M/chrM.fa | awk '{print $1;}'` echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft foreach f ( ?{,?}/chr*.fa.bak ) nice faCmp $f $f:r end # MAKE LIFTALL.LFT (DONE 6/28/05 angie) ssh kkstore01 cd /cluster/data/canFam2 cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # CREATING DATABASE (DONE 6/28/05 angie) ssh hgwdev hgsql '' -e 'create database canFam2' # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql df -h /var/lib/mysql #/dev/sdc1 1.8T 940G 720G 57% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE 6/28/05 angie) ssh hgwdev hgsql canFam2 -e \ "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 6/28/05 angie) # Make .2bit, unmasked until RepeatMasker and TRF steps are done. # Do this now so we can load up RepeatMasker and run featureBits; # can also load up other tables that don't depend on masking. ssh kkstore01 cd /cluster/data/canFam2 nice faToTwoBit ?{,?}/chr*.fa canFam2.2bit mkdir bed/chromInfo twoBitInfo canFam2.2bit stdout \ | awk '{print $1 "\t" $2 "\t/gbdb/canFam2/canFam2.2bit";}' \ > bed/chromInfo/chromInfo.tab # Make symbolic links from /gbdb/canFam2/ to the real .2bit. ssh hgwdev mkdir /gbdb/canFam2 ln -s /cluster/data/canFam2/canFam2.2bit /gbdb/canFam2/ # Load /gbdb/canFam2/canFam2.2bit paths into database and save size info. cd /cluster/data/canFam2 hgsql canFam2 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql canFam2 -e 'load data local infile \ "/cluster/data/canFam2/bed/chromInfo/chromInfo.tab" \ into table chromInfo;' echo "select chrom,size from chromInfo" | hgsql -N canFam2 > chrom.sizes # take a look at chrom.sizes, should be 41 lines wc chrom.sizes # 41 82 603 chrom.sizes # GOLD AND GAP TRACKS (DONE 6/28/05 angie) ssh hgwdev cd /cluster/data/canFam2 hgGoldGapGl -noGl -chromLst=chrom.lst canFam2 /cluster/data/canFam2 . 
# featureBits fails if there's no chrM_gap, so make one: # echo "create table chrM_gap like chr1_gap" | hgsql canFam2 # oops, that won't work until v4.1, so do this for the time being: hgsql canFam2 -e "create table chrM_gap select * from chr1_gap where 0=1" # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CANFAM2 (DONE 8/1/05 angie) ssh hgwdev cd $HOME/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add canFam2 in all the right places and do make update cvs commit makefile mkdir -p dog/canFam2 cvs add dog dog/canFam2 cvs ci -m "trackDb dir for dog genome(s)" dog/canFam2 # Do this in a clean (up-to-date, no edits) tree: make alpha # Add dbDb entry (not a new organism so defaultDb and genomeClade already # have entries): hgsql -h genome-testdb hgcentraltest \ -e 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ values("canFam2", "May 2005", \ "/gbdb/canFam2/nib", "Dog", "chr14:11072309-11078928", 1, \ 18, "Dog", "Canis familiaris", \ "/gbdb/canFam2/html/description.html", 0, 0, \ "Broad Institute v. 2.0");' # REPEAT MASKING (DONE braney/angie 7-10-05) #- Split contigs into 500kb chunks, at gaps if possible: ssh kkstore01 cd /cluster/data/canFam2 foreach chr (`cat chrom.lst`) set c = `echo $chr | sed -e 's/chr//'` foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/canFam2 cat << '_EOF_' > jkStuff/RMDog #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/canFam2/$2 /bin/cp $2 /tmp/canFam2/$2/ cd /tmp/canFam2/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec dog $2 popd /bin/cp /tmp/canFam2/$2/$2.out ./ if (-e /tmp/canFam2/$2/$2.align) /bin/cp /tmp/canFam2/$2/$2.align ./ if (-e /tmp/canFam2/$2/$2.tbl) /bin/cp /tmp/canFam2/$2/$2.tbl ./ if (-e /tmp/canFam2/$2/$2.cat) /bin/cp /tmp/canFam2/$2/$2.cat ./ /bin/rm -fr /tmp/canFam2/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/canFam2/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/canFam2 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMDog mkdir RMRun cp /dev/null RMRun/RMJobs foreach chr (`cat chrom.lst`) set c = `echo $chr | sed -e 's/chr//'` foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/canFam2/jkStuff/RMDog \ /cluster/data/canFam2/$d $f \ '{'check out line+ /cluster/data/canFam2/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh kk cd /cluster/data/canFam2/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 6149 of 6149 jobs # CPU time in finished jobs: 32138805s 535646.75m 8927.45h 371.98d 1.019 y # IO & Wait Time: 346449s 5774.15m 96.24h 4.01d 0.011 y # Average job time: 5283s 88.05m 1.47h 0.06d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 8642s 144.03m 2.40h 0.10d # Submission to last job: 147094s 2451.57m 40.86h 1.70d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore01 cd /cluster/data/canFam2 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c set dir=`echo $c | sed "s/chr//" ` cd $dir liftUp $c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` > /dev/null cd .. 
end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/canFam2 hgLoadOut canFam2 */chr*.fa.out # VERIFY REPEATMASKER RESULTS (DONE 2005-07-11 braney) # Eyeball some repeat annotations in the browser, compare to lib seqs. # Run featureBits on canFam2 and on a comparable genome build, and compare: ssh hgwdev featureBits canFam2 rmsk # 968054174 bases of 2384996543 (40.589%) in intersection #canFam1 is #896773874 bases of 2359845093 (38.001%) in intersection # SIMPLE REPEATS (TRF) (DONE 2005-07-11 braney) # PARTIALLY REDONE 2005-12-02 angie -- See partial redo notes below... ssh kkstore01 mkdir /cluster/data/canFam2/bed/simpleRepeat cd /cluster/data/canFam2/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/canFam2/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" >> jobs.csh end end csh -ef jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l endsInLf trf/* # When job is done do: liftUp simpleRepeat.bed /cluster/data/canFam2/jkStuff/liftAll.lft warn trf/*.bed # Load into the database: ssh hgwdev hgLoadBed canFam2 simpleRepeat \ /cluster/data/canFam2/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # load of simpleRepeat did not go as planned: 637318 record(s), 0 row(s) skipped, 1172 warning(s) loading bed.tab featureBits canFam2 simpleRepeat # 52855902 bases of 2384996543 (2.216%) in intersection # canFam1 is #36509895 bases of 2359845093 (1.547%) in intersection # 2005-12-02 Error with simpleRepeats found during QA. 1622 entries have what # appears to be a parsing error. The "sequence" value is a float number of # format X.XX instead of an ATCG sequence. Lines removed from active tables. # Copy of original data on hgwdev.canFam2.simpleRepeat_1202problem # Partial redo 12/2/05 -- Jen found some lines with invalid values in # the database. All were from trf/chr10_1.bed which I moved aside to # trf/chr10_1.bed.bad. When I reran the chr10_1 job from jobs.csh, # it produced a valid output file. So I did the rest of the steps and # reloaded. ssh kolossus cd /cluster/data/canFam2/bed/simpleRepeat mv trf/chr10_1.bed{,.bad} /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf /cluster/data/canFam2/10/chr10_1/chr10_1.fa /dev/null -bedAt=trf/chr10_1.bed -tempDir=/tmp mv simpleRepeat.bed simpleRepeat.bed.bad liftUp simpleRepeat.bed /cluster/data/canFam2/jkStuff/liftAll.lft warn trf/*.bed # on hgwdev: featureBits canFam2 simpleRepeat #52885863 bases of 2384996543 (2.217%) in intersection # CREATE MICROSAT TRACK (done 2006-7-5 JK) ssh hgwdev cd /cluster/data/canFam2/bed mkdir microsat cd microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed /cluster/bin/i386/hgLoadBed canFam2 microsat microsat.bed # PROCESS SIMPLE REPEATS INTO MASK (DONE 2005-07-11 braney) # 2005-12-02 angie: continuing redo of chr10_1, see below. 
# After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore01 cd /cluster/data/canFam2/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/canFam2 mkdir bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) set dir=`echo $c | sed "s/chr//" ` if (-e $dir/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' $dir/lift/ordered.lst > $dir/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$dir.bed jkStuff/liftAll.lft warn `cat $dir/lift/oTrf.lst` endif end # Here's the coverage for the filtered TRF: ssh hgwdev cat /cluster/data/canFam2/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed featureBits canFam2 /tmp/filtTrf.bed #23111877 bases of 2384996543 (0.969%) in intersection # canFam1 was #23017541 bases of 2359845093 (0.975%) in intersection featureBits canFam2 /tmp/filtTrf.bed \!rmsk # 1205611 bases of 2384996543 (0.051%) in intersection #canFam1 was #1275941 bases of 2359845093 (0.054%) in intersection # 12/2/2005 -- since I regenerated trf/chr10_1.bed above, continue here... ssh kolossus cd /cluster/data/canFam2/bed/simpleRepeat mv trfMask/chr10_1.bed{,.orig} awk '{if ($5 <= 12) print;}' trf/chr10_1.bed > trfMask/chr10_1.bed mv trfMaskChrom/chr10.bed{,.orig} cd ../.. liftUp bed/simpleRepeat/trfMaskChrom/chr10.bed jkStuff/liftAll.lft warn `cat 10/lift/oTrf.lst` ssh hgwdev cat /cluster/data/canFam2/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed featureBits canFam2 /tmp/filtTrf.bed #23133898 bases of 2384996543 (0.970%) in intersection featureBits canFam2 /tmp/filtTrf.bed \!rmsk #1206965 bases of 2384996543 (0.051%) in intersection # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 2005-07-11 braney) # REDONE 2005-12-02 angie (to pick up the chr10_1 simpleRepeat redo) ssh kkstore01 cd /cluster/data/canFam2 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # Tons of warnings like this, mostly for L1M*: #WARNING: negative rEnd: -189 chrX:117586389-117586475 L1M3e foreach c (`cat chrom.lst`) set c=`echo $c | sed "s/chr//" ` echo "repeat- and trf-masking contigs of chr$c" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end #- Rebuild the nib files, using the soft masking in the fa: foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end # Make one big 2bit file as well, and make a link to it in # /gbdb/canFam2/nib because hgBlat looks there: faToTwoBit */chr*.fa canFam2.2bit ssh hgwdev ln -s /cluster/data/canFam2/canFam2.2bit /gbdb/canFam2/nib/ # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 8/1/05 angie) ssh hgwdev mkdir /gbdb/canFam2/html # Write ~/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html # with a description of the assembly and some sample position queries. 
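# (A hedged sketch of what such a page typically contains -- the real
# description.html was written by hand, and the text below is placeholder
# content rather than the page that was actually checked in.  The sample
# position is the dbDb defaultPos entered above.)
cat << '_EOF_' > $HOME/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html
<P>
May 2005 draft assembly of the dog (<em>Canis familiaris</em>) genome,
Broad Institute v. 2.0, UCSC database canFam2.
</P>
<P>
Sample position queries: a chromosome range such as chr14:11072309-11078928,
or a gene symbol or mRNA accession.
</P>
'_EOF_'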
chmod a+r $HOME/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html # Check it in and copy (ideally using "make alpha" in trackDb) to # /gbdb/canFam2/html # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 8/1/05 angie) # pitakluster (blastz etc): ssh pk mkdir /san/sanvol1/scratch/canFam2 rsync -av /cluster/data/canFam2/nib /san/sanvol1/scratch/canFam2/ mkdir /san/sanvol1/scratch/canFam2/rmsk cp -p /cluster/data/canFam2/*/chr*.fa.out \ /san/sanvol1/scratch/canFam2/rmsk # small cluster (chaining etc): ssh kkr1u00 mkdir -p /iscratch/i/canFam2/nib rsync -av /cluster/data/canFam2/nib /iscratch/i/canFam2/ iSync # big cluster (genbank): ssh kkstore01 mkdir /cluster/bluearc/scratch/hg/canFam2 rsync -av /cluster/data/canFam2/nib /cluster/bluearc/scratch/hg/canFam2/ # ask cluster-admin to rsync that to all big cluster nodes' /scratch/... # MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 8/1/05 angie) ssh kolossus cd /san/sanvol1/scratch/canFam2/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. # Human in extra column 1, Mouse in extra column 2 foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query dog -comp human -comp mouse end # Now extract human (extra column 1), mouse (extra column). cd .. mkdir linSpecRep.notInHuman mkdir linSpecRep.notInMouse foreach f (rmsk/*.out_homo-sapiens_mus-musculus) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInHuman/$base.out.spec /cluster/bin/scripts/extractLinSpecReps 2 $f > \ linSpecRep.notInMouse/$base.out.spec end wc -l rmsk/*.out #4533630 total wc -l linSpecRep.notInHuman/* #1542788 total wc -l linSpecRep.notInMouse/* #1546408 total # Clean up. rm rmsk/*.out_h* # PRODUCING GENSCAN PREDICTIONS (DONE 8/11/05 angie) ssh hgwdev mkdir /cluster/data/canFam2/bed/genscan cd /cluster/data/canFam2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/canFam2/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/data/canFam2/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list #495 # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para make jobList #Completed: 493 of 495 jobs #Crashed: 2 jobs #Average job time: 918s 15.30m 0.25h 0.01d #Longest finished job: 30383s 506.38m 8.44h 0.35d #Submission to last job: 132236s 2203.93m 36.73h 1.53d # If there are crashes, diagnose with "para problems" and "para crashed". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". 
ssh kkr7u00 cd /cluster/data/canFam2/bed/genscan /cluster/bin/x86_64/gsBig /cluster/data/canFam2/1/chr1_23/chr1_23.fa.masked gtf/chr1_23.fa.gtf -trans=pep/chr1_23.fa.pep -subopt=subopt/chr1_23.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/x86_64/gsBig /cluster/data/canFam2/27/chr27_1/chr27_1.fa.masked gtf/chr27_1.fa.gtf -trans=pep/chr27_1.fa.pep -subopt=subopt/chr27_1.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 ls -1 gtf | wc -l # 495 endsInLf gtf/* # Convert these to chromosome level files as so: ssh kkstore01 cd /cluster/data/canFam2/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/canFam2/bed/genscan ldHgGene -gtf canFam2 genscan genscan.gtf hgPepPred canFam2 generic genscanPep genscan.pep hgLoadBed canFam2 genscanSubopt genscanSubopt.bed # MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 8/1/05 angie) ssh kolossus # numerator is canFam2 gapless bases as reported by featureBits, # denominator is hg17 gapless bases as reported by featureBits, # 1024 is threshold used for human -repMatch: calc \( 2384996543 / 2867328468 \) \* 1024 # ( 2384996543 / 2867328468 ) * 1024 = 851.746316 # ==> use -repMatch=852 according to size scaled down from 1024 for human. mkdir /cluster/bluearc/canFam2 mkdir /cluster/data/canFam2/bed/ooc cd /cluster/data/canFam2/bed/ooc ls -1 /cluster/data/canFam2/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/canFam2/11.ooc -repMatch=852 #Wrote 27388 overused 11-mers to /cluster/bluearc/canFam2/11.ooc # AUTO UPDATE GENBANK MRNA RUN (DONE 8/8/05 angie) # 2/22/06: added refGene ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . 
# Edit etc/genbank.conf and add these lines: # canFam2 (dog) canFam2.genome = /scratch/hg/canFam2/nib/chr*.nib canFam2.lift = /cluster/data/canFam2/jkStuff/liftAll.lft canFam2.refseq.mrna.native.load = no canFam2.refseq.mrna.xeno.load = yes canFam2.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005 canFam2.genbank.mrna.xeno.load = yes canFam2.genbank.est.xeno.load = no canFam2.downloadDir = canFam2 cvs ci etc/genbank.conf # Edit src/align/gbBlat to add /cluster/bluearc/canFam2/11.ooc cvs diff src/align/gbBlat make cvs ci src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, (xeno) refseq only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial canFam2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2 featureBits canFam2 xenoRefGene #41278562 bases of 2384996543 (1.731%) in intersection # Clean up: rm -rf work/initial.canFam2 ssh eieio cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial canFam2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2 featureBits canFam2 mrna #2061533 bases of 2384996543 (0.086%) in intersection featureBits canFam2 xenoMrna #63057460 bases of 2384996543 (2.644%) in intersection # Clean up: rm -rf work/initial.canFam2 ssh eieio # -initial for ESTs: nice bin/gbAlignStep -srcDb=genbank -type=est -initial canFam2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 canFam2 & featureBits canFam2 intronEst #16241242 bases of 2384996543 (0.681%) in intersection featureBits canFam2 est #41719045 bases of 2384996543 (1.749%) in intersection # Clean up: rm -rf work/initial.canFam2 # 2/22/06: add native refseq ssh hgwdev cd ~/kent/src/hg/makeDb/genbank # genbank.conf: canFam2.refseq.mrna.native.load = yes make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=refseq -type=mrna -orgCat=native -initial \ canFam2 & tail -f var/build/logs/2006.02.22-21:03:48.canFam2.initalign.log ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2 & tail -f var/dbload/hgwdev/logs/2006.02.22-21:19:53.dbload.log featureBits canFam2 refGene #1222436 bases of 2384996543 (0.051%) in intersection # SWAP CHAINS FROM HG17, BUILD NETS ETC. (DONE 8/3/05 angie - REDONE 12/13/05 - REDONE 2/8/06) # hg17-canFam2 was redone 12/12/05 (makeHg17.doc) so re-running this... # and again 2/8/06, sheesh... mkdir /cluster/data/canFam2/bed/blastz.hg17.swap cd /cluster/data/canFam2/bed/blastz.hg17.swap doBlastzChainNet.pl -swap /cluster/data/hg17/bed/blastz.canFam2/DEF \ >& do.log & tail -f do.log # Add {chain,net}Hg17 to trackDb.ra if necessary. # RE-RUN NETTOAXT, AXTTOMAF FOR HG17 (DONE 10/31/05 angie) # Obsoleted 12/13/05 by re-run of canFam2-hg17 above. # Kate fixed netToAxt to avoid duplicated blocks, which is important # for input to multiz. Regenerate maf using commands from sub-script # netChains.csh generated by doBlastzChainNet.pl above. ssh kolossus cd /cluster/data/canFam2/bed/blastz.hg17.swap/axtChain netSplit canFam2.hg17.net.gz net chainSplit chain canFam2.hg17.all.chain.gz cd .. 
mv axtNet axtNet.orig mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/canFam2/nib /iscratch/i/hg17/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.canFam2.hg17.net.axt.gz end rm -r mafNet mkdir mafNet foreach f (axtNet/*.canFam2.hg17.net.axt.gz) axtToMaf -tPrefix=canFam2. -qPrefix=hg17. $f \ /cluster/data/canFam2/chrom.sizes /cluster/data/hg17/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end rm -r axtChain/{chain,net}/ axtNet.orig # QUALITY SCORES (DONE 8/11/05 angie) ssh kkstore01 mkdir /cluster/data/canFam2/bed/quality cd /cluster/data/canFam2/bed/quality qaToQac ../../broad/contigs.quals.gz stdout \ | qacAgpLift ../../broad/UCSC_Dog2.0.agp stdin chrom.qac mkdir wigData # Build .wig, .wib files in current directory so that "wigData/" doesn't # appear in the .wig's: cd wigData foreach agp (../../../*/chr*.agp) set chr = $agp:t:r set abbrev = `echo $chr | sed -e 's/^chr//; s/_random/r/;'` echo $chr to qual_$abbrev wiggle qacToWig -fixed ../chrom.qac -name=$chr stdout \ | wigEncode stdin qual_$abbrev.{wig,wib} end # Verify size of .wib file = chrom length foreach f (*.wib) set abbrev = $f:t:r set chr = `echo $abbrev | sed -e 's/^qual_/chr/; s/r$/_random/;'` set wibLen = `ls -l $f | awk '{print $5;}'` set chromLen = `grep -w $chr ../../../chrom.sizes | awk '{print $2;}'` if ($wibLen != $chromLen) then echo "ERROR: $chr size is $chromLen but wib size is $wibLen" else echo $chr OK. endif end # /gbdb & load: ssh hgwdev cd /cluster/data/canFam2/bed/quality/wigData mkdir -p /gbdb/canFam2/wib ln -s `pwd`/*.wib /gbdb/canFam2/wib hgLoadWiggle canFam2 quality *.wig # GC 5 BASE WIGGLE TRACK (DONE 8/11/05 angie) ssh kki mkdir /cluster/data/canFam2/bed/gc5Base cd /cluster/data/canFam2/bed/gc5Base cat > doGc5Base.csh <<'_EOF_' #!/bin/csh -fe set chr = $1 set c = `echo $chr | sed -e 's/^chr//; s/_random/r/;'` /cluster/bin/x86_64/hgGcPercent \ -chr=${chr} -wigOut -doGaps -file=stdout -win=5 canFam2 \ /iscratch/i/canFam2/nib \ | /cluster/bin/x86_64/wigEncode stdin gc5Base_${c}{.wig,.wib} '_EOF_' # << this line makes emacs coloring happy chmod a+x doGc5Base.csh cp /dev/null spec foreach c (`cat ../../chrom.lst`) echo "./doGc5Base.csh $c" >> spec end para make spec para time #Completed: 41 of 41 jobs #Average job time: 23s 0.38m 0.01h 0.00d #Longest finished job: 44s 0.73m 0.01h 0.00d #Submission to last job: 259s 4.32m 0.07h 0.00d # /gbdb and load track on hgwdev ssh hgwdev cd /cluster/data/canFam2/bed/gc5Base mkdir -p /gbdb/canFam2/wib ln -s `pwd`/*.wib /gbdb/canFam2/wib hgLoadWiggle canFam2 gc5Base *.wig # EXTRACT LINEAGE-SPECIFIC REPEATS FOR RAT (DONE 8/12/05 angie) ssh kolossus cd /panasas/store/canFam2/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query dog -comp rat end # Now extract rat (extra column 1): cd .. mkdir linSpecRep.notInRat foreach f (rmsk/*.out_rattus) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractRepeats 1 $f > \ linSpecRep.notInRat/$base.out.spec end # Clean up. 
rm rmsk/*.out_rat*

# LOAD CPGISLANDS (DONE 8/12/05 angie)
ssh hgwdev
mkdir -p /cluster/data/canFam2/bed/cpgIsland
cd /cluster/data/canFam2/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/canFam2/bed/cpgIsland/
ssh kolossus
cd /cluster/data/canFam2/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
  set fout=$f:t:r:r.cpg
  echo running cpglh on $f to $fout
  ./cpglh.exe $f > $fout
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */
{
  $2 = $2 - 1;
  width = $3 - $2;
  printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load into database:
ssh hgwdev
cd /cluster/data/canFam2/bed/cpgIsland
hgLoadBed canFam2 cpgIslandExt -tab -noBin \
  -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
wc -l cpgIsland.bed
# 47804 cpgIsland.bed
featureBits canFam2 cpgIslandExt
#38974374 bases of 2384996543 (1.634%) in intersection

# ANDY LAW CPGISLANDS (DONE 8/12/05 angie)
# See notes in makeGalGal2.doc.
ssh kolossus
mkdir /cluster/data/canFam2/bed/cpgIslandGgfAndy
cd /cluster/data/canFam2/bed/cpgIslandGgfAndy
# Use masked sequence since this is a mammal...
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../*/chr*.fa.masked)
  set chr = $f:t:r:r
  echo preproc and run on masked $chr
  /cluster/home/angie/bin/i386/preProcGgfAndy $f \
  | /cluster/home/angie/ggf-andy-cpg-island.pl \
  | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
      $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
      $pGc = (100.0 * $gc / $n); \
      $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
           "$pCpG\t$pGc\t$oE\n";' \
  >> cpgIslandGgfAndyMasked.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/canFam2/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
  $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed canFam2 cpgIslandGgfAndyMasked -tab -noBin \
  -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
featureBits canFam2 cpgIslandExt
#38974374 bases of 2384996543 (1.634%) in intersection
featureBits canFam2 cpgIslandGgfAndyMasked
#99187178 bases of 2384996543 (4.159%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 47804 ../cpgIsland/cpgIsland.bed
# 138037 cpgIslandGgfAndyMasked.bed

# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN & BEYOND (DONE 8/15/05 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific for human-chicken blastz.  Do the same for dog.
# Scripts expect *.out.spec filenames, so set that up:
ssh kkstore01
cd /cluster/data/canFam2
mkdir /panasas/store/canFam2/linSpecRep.notInNonMammal
foreach f (/panasas/store/canFam2/rmsk/chr*.fa.out)
  ln $f /panasas/store/canFam2/linSpecRep.notInNonMammal/$f:t:r:r.out.spec
end

# SWAP CHAINS FROM MM6, BUILD NETS ETC.
(DONE 8/15/05 angie) # REDONE by hiram 12/7/05 -- tables loaded as {chain,net}Mm6u1 then # renamed to {chain,net}Mm6 12/13/05 angie mkdir /cluster/data/canFam2/bed/blastz.mm6.swap cd /cluster/data/canFam2/bed/blastz.mm6.swap doBlastzChainNet.pl -swap /cluster/data/mm6/bed/blastz.canFam2/DEF \ >& do.log echo "check /cluster/data/canFam2/bed/blastz.mm6.swap/do.log" \ | mail -s "check do.log" $USER # Add {chain,net}Mm6 to trackDb.ra if necessary. # RE-RUN NETTOAXT, AXTTOMAF FOR MM6 (DONE 11/1/05 angie) # Obsoleted by re-run of canFam2-mm6 above. # Kate fixed netToAxt to avoid duplicated blocks, which is important # for input to multiz. Regenerate maf using commands from sub-script # netChains.csh generated by doBlastzChainNet.pl above. ssh kolossus cd /cluster/data/canFam2/bed/blastz.mm6.swap/axtChain netSplit canFam2.mm6.net.gz net chainSplit chain canFam2.mm6.all.chain.gz cd .. mv axtNet axtNet.orig mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/canFam2/nib /cluster/data/mm6/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.canFam2.mm6.net.axt.gz end rm -r mafNet mkdir mafNet foreach f (axtNet/*.canFam2.mm6.net.axt.gz) axtToMaf -tPrefix=canFam2. -qPrefix=mm6. $f \ /cluster/data/canFam2/chrom.sizes /cluster/data/mm6/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end rm -r axtChain/{chain,net}/ axtNet.orig # SWAP CHAINS FROM RN3, BUILD NETS ETC. (DONE 8/16/05 angie) mkdir /cluster/data/canFam2/bed/blastz.rn3.swap cd /cluster/data/canFam2/bed/blastz.rn3.swap doBlastzChainNet.pl -swap /cluster/data/rn3/bed/blastz.canFam2/DEF \ >& do.log echo "check /cluster/data/canFam2/bed/blastz.rn3.swap/do.log" \ | mail -s "check do.log" $USER # Add {chain,net}Rn3 to trackDb.ra if necessary. # Found that downloads in rn3 was missing 2005-12-21 - remake - Hiram cd /cluster/data/rn3/bed/blastz.canFam2 /cluster/bin/scripts/doBlastzChainNet.pl -continue=download \ /cluster/data/rn3/bed/blastz.canFam2/DEF \ -chainMinScore=1000 -chainLinearGap=loose > downloads.out 2>&1 # RE-RUN NETTOAXT, AXTTOMAF FOR RN3 (DONE 11/2/05 angie) # Kate fixed netToAxt to avoid duplicated blocks, which is important # for input to multiz. Regenerate maf using commands from sub-script # netChains.csh generated by doBlastzChainNet.pl above. ssh kolossus cd /cluster/data/canFam2/bed/blastz.rn3.swap/axtChain netSplit canFam2.rn3.net.gz net chainSplit chain canFam2.rn3.all.chain.gz cd .. mv axtNet axtNet.orig mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/canFam2/nib /cluster/data/rn3/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.canFam2.rn3.net.axt.gz end rm -r mafNet mkdir mafNet foreach f (axtNet/*.canFam2.rn3.net.axt.gz) axtToMaf -tPrefix=canFam2. -qPrefix=rn3. 
$f \
  /cluster/data/canFam2/chrom.sizes /cluster/data/rn3/chrom.sizes \
  stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig

# MAKE THIS THE DEFAULT ASSEMBLY WHEN THERE ARE ENOUGH TRACKS (DONE 11/28/05 angie)
hgsql -h genome-testdb hgcentraltest \
  -e 'update defaultDb set name = "canFam2" where genome = "Dog";'

# MAKE Human Proteins track (DONE braney 2005-12-10)
ssh kkstore01
cd /cluster/data/canFam2
cat */chr*/*.lft > jkStuff/subChr.lft
mkdir blastDb
for i in */*/*_*_*.fa; do ln `pwd`/$i blastDb; done
cd blastDb
for i in *.fa; do /cluster/bluearc/blast229/formatdb -p F -i $i; rm $i; done
ssh pk
destDir=/san/sanvol1/scratch/canFam2/blastDb
mkdir -p $destDir
cd /cluster/data/canFam2/blastDb
for i in nin nsq nhr; do cp *.$i $destDir; done
mkdir -p /cluster/data/canFam2/bed/tblastn.hg17KG
cd /cluster/data/canFam2/bed/tblastn.hg17KG
find $destDir -name "*.nsq" | sed "s/\.nsq//" > target.lst
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(500000/`wc target.lst | awk "{print \\\$1}"`\)
# 37365/(500000/6149) = 459.514770
split -l 460 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
mkdir -p /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
  mv $f.8 $f.1
  break;
fi
done
if test -f $f.1
then
  if /cluster/bin/i386/blastToPsl $f.1 $f.2
  then
    liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
    liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/canFam2/jkStuff/subChr.lft carry $f.3
    liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/canFam2/jkStuff/liftAll.lft carry $f.4
    mv $3.tmp $3
    rm -f $f.1 $f.2
    exit 0
  fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
exit
chmod +x blastSome
gensub2 target.lst kg.lst blastGsub blastSpec
para create blastSpec
para push
# Completed: 504218 of 504218 jobs
# CPU time in finished jobs: 28863259s 481054.31m 8017.57h 334.07d 0.915 y
# IO & Wait Time: 4255045s 70917.42m 1181.96h 49.25d 0.135 y
# Average job time: 66s 1.09m 0.02h 0.00d
# Longest finished job: 443s 7.38m 0.12h 0.01d
# Submission to last job: 166648s 2777.47m 46.29h 1.93d

ssh kki
cd /cluster/data/canFam2/bed/tblastn.hg17KG
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.$2_*.psl | /cluster/home/braney/bin/i386/simpleChain -chrom=$2 -prot -outPsl -maxGap=250000 stdin ../c.$2.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst ../../chrom.lst chainGsub chainSpec
para create chainSpec
para push
# Completed: 3362 of 3362 jobs
# CPU time in finished jobs: 1078406s 17973.43m 299.56h 12.48d 0.034 y
# IO & Wait Time: 85905s 1431.75m 23.86h 0.99d 0.003 y
# Average job time: 346s 5.77m 0.10h 0.00d
# Longest finished job: 22556s 375.93m 6.27h 0.26d
# Submission to last job: 106376s 1772.93m 29.55h 1.23d

ssh kkstore01
cd /cluster/data/canFam2/bed/tblastn.hg17KG/blastOut
for i in kg??
do
  awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.*.$i.psl > c60.$i.psl
  sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
  awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
  echo $i
done
sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/canFam2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
wc blastHg17KG.psl
# 64343 1351203 10913814 blastHg17KG.psl
pslUniq blastHg17KG.psl stdout | wc
# 18827 395367 2992653
cat kgfa/*fa | grep ">" | wc
# 309494 309494 4810327
ssh hgwdev
cd /cluster/data/canFam2/bed/tblastn.hg17KG
hgLoadPsl canFam2 blastHg17KG.psl
featureBits canFam2 blastHg17KG
# 35781547 bases of 2384996543 (1.500%) in intersection
exit
# back to kkstore01
rm -rf blastOut
# End tblastn

# BLASTZ SELF (DONE braney 8-29-2005)
# For future reference -- doBlastzChainNet.pl should have been used
ssh pk
mkdir -p /cluster/data/canFam2/bed/blastz.canFam2
cd /cluster/data/canFam2/bed/blastz.canFam2
cat << '_EOF_' > DEF
# dog vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
# Dog
SEQ1_DIR=/scratch/hg/canFam2/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Dog
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/canFam2/bed/blastz.canFam2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
cp /cluster/data/canFam2/chrom.sizes S1.len
cp /cluster/data/canFam2/chrom.sizes S2.len
mkdir run
cd run
partitionSequence.pl 10000000 10000 /cluster/data/canFam2/nib ../S1.len \
  -xdir xdir.sh -rawDir ../lav \
  > canFam2.lst
csh -ef xdir.sh
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-run-ucsc $(path1) $(path2) ../DEF {check out line ../lav/$(file1)/$(file1)_$(file2).lav}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 canFam2.lst canFam2.lst gsub jobList
para create jobList
para try, check, push, check, ...
# Completed: 70756 of 70756 jobs
# CPU time in finished jobs: 5743034s 95717.24m 1595.29h 66.47d 0.182 y
# IO & Wait Time: 199800s 3330.00m 55.50h 2.31d 0.006 y
# Average job time: 84s 1.40m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2758s 45.97m 0.77h 0.03d
# Submission to last job: 17852s 297.53m 4.96h 0.21d

# convert lav files to psl
# First, combine all files per partition (too many files per chrom
# to do in one swoop!).  Then do another round to collect all files
# per chrom.
ssh kki
cd /cluster/data/canFam2/bed/blastz.canFam2
mkdir pslChrom pslParts run.lavToPsl
cd run.lavToPsl
ls -1d ../lav/* | sed -e 's@/$@@' > parts.lst
# For self alignments, we need lavToAxt's -dropSelf behavior,
# so go lav->axt->psl... could add -dropSelf to lavToPsl, but that
# might be somewhat invasive and perhaps not really much faster (?)
# because the sequence must be dug up in order to rescore...
cat << '_EOF_' > do.csh #!/bin/csh -ef cat $1/*.lav \ | lavToAxt -dropSelf stdin /iscratch/i/canFam2/nib \ /iscratch/i/canFam2/nib stdout \ | axtToPsl stdin ../S1.len ../S2.len stdout \ | sort -k 14,14 -k 16n,17n \ | gzip -c > $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh $(path1) {check out exists ../pslParts/$(file1).psl.gz } #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 parts.lst single gsub jobList para create jobList para try, check, push, check, ... # Completed: 266 of 275 jobs # Crashed: 9 jobs # CPU time in finished jobs: 1228s 20.47m 0.34h 0.01d 0.000 y # IO & Wait Time: 7117s 118.61m 1.98h 0.08d 0.000 y # Average job time: 31s 0.52m 0.01h 0.00d # Longest finished job: 74s 1.23m 0.02h 0.00d # Submission to last job: 641s 10.68m 0.18h 0.01d awk '{print $1;}' /cluster/data/canFam2/chrom.sizes > chroms.lst cat << '_EOF_' > doChrom.csh #!/bin/csh -ef zcat ../pslParts/$1.*.psl.gz \ | sort -k 14,14 -k 16n,17n \ | gzip -c > $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChrom.csh cat << '_EOF_' > gsub.chroms #LOOP ./doChrom.csh $(root1) {check out exists ../pslChrom/$(root1).psl.gz } #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 chroms.lst single gsub.chroms jobList para create jobList para try, check, push, check, ... # Completed: 40 of 40 jobs # CPU time in finished jobs: 463s 7.71m 0.13h 0.01d 0.000 y # IO & Wait Time: 178s 2.97m 0.05h 0.00d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest finished job: 38s 0.63m 0.01h 0.00d # Submission to last job: 61s 1.02m 0.02h 0.00d # CHAIN SELF BLASTZ # For future reference -- doBlastzChainNet.pl should have been used # REDONE partially, see below (from the filtering step onward, to get rid # of duplicates, 12/6/05 angie) # CHAINS REMERGED to reassign IDs so that they're unique genome-wide, and # nets deleted to avoid confusion, 12/14/05 angie. The IDs were not unique # genome-wide (only per-chrom) because the filtering was done directly on # run1/chain files which had not yet had their IDs reassigned by # chainMergeSort, and after filtering, chainMergeSort -saveId was used... doh. # It would have been OK if the filtering had been done post-merging, # or if -saveId had been omitted after filtering. caught by joinerCheck. # Also could have been avoided by just using doBlastzChainNet.pl the first time. # Run axtChain on little cluster ssh kki cd /cluster/data/canFam2/bed/blastz.canFam2 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S ../../pslChrom/*.psl.gz > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef setenv PATH /cluster/bin/x86_64:$PATH axtChain -verbose=0 -psl $1 /iscratch/i/canFam2/nib \ /iscratch/i/canFam2/nib stdout \ | chainAntiRepeat /iscratch/i/canFam2/nib /iscratch/i/canFam2/nib \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 40 of 41 jobs #Crashed: 1 jobs #Average job time: 185s 3.09m 0.05h 0.00d #Longest job: 490s 8.17m 0.14h 0.01d #Submission to last job: 74789s 1246.48m 20.77h 0.87d # now on the cluster server, sort chains ssh kolossus cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain # Had these steps actually been followed it would have saved some # trouble, but instead some custom filtering was performed directly # on run1/chain files and they never got their IDs reassigned: chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # trim to minScore=20000 to cut some of the fluff mkdir chainFilt foreach f (chain/*.chain) chainFilter -minScore=20000 $f > chainFilt/$f:t end gzip -c all.chain > all.chain.unfiltered.gz rm -r chain mv chainFilt chain chainMergeSort -saveId chain/*.chain > all.chain # Load chains into database ssh hgwdev cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain canFam2 ${c}_chainSelf $i end # 12/6/05 angie -- Jen found that all chains were duplicated -- # turns out that axtChain/run1/chain/ contained some chr*.filt.chain # in addition to chr*.psl.chain, so *.chain resulted in duplicates. # Redo the filtering and chain merge, then re-run # doBlastzChainNet.pl -continue net # to redo the subsequent steps. ssh kolossus cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain chainMergeSort run1/chain/chr*.psl.chain \ | gzip -c > all.chain.unfiltered.gz rm -r chain chainSplit chain all.chain.unfiltered.gz mkdir chainFilt foreach f (run1/chain/chr*.psl.chain) chainFilter -minScore=20000 $f > chainFilt/$f:t:r:r.chain end rm -r chain mv chainFilt chain chainMergeSort -saveId chain/*.chain \ | gzip -c > canFam2.canFam2.all.chain.gz ssh hgwdev cd /cluster/data/canFam2/bed/blastz.canFam2 rm -r axtNet mafNet rm -r /usr/local/apache/htdocs/goldenPath/canFam2/vsSelf/ doBlastzChainNet.pl -continue net DEF >& redo.log & tail -f redo.log # 12/14/05 angie -- Jen ran joinerCheck and it complained about # non-unique IDs in the chain tables. fixing... ssh kolossus cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain chainSplit chain canFam2.canFam2.all.chain.gz mv canFam2.canFam2.all.chain.gz canFam2.canFam2.all.chain.badIDs.gz chainMergeSort chain/*.chain \ | gzip -c > canFam2.canFam2.all.chain.gz rm -r chain chainSplit chain canFam2.canFam2.all.chain.gz # Load chains into database ssh hgwdev cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain canFam2 ${c}_chainSelf $i end cd .. rm -r chain # I removed the self net in order to avoid confusion, so no need to # regenerate it. 
# NET SELF BLASTZ # For future reference -- doBlastzChainNet.pl should have been used # REDONE 12/6/05 angie as part of doBlastzChainNet.pl command above ssh kolossus cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain chainPreNet all.chain.gz ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain netClass -noAr noClass.net canFam2 canFam2 self.net rm noClass.net # Load the nets into database netFilter -minGap=10 self.net | hgLoadNet canFam2 netSelf stdin # Add entries for chainSelf, netSelf to dog/canFam2 trackDb # ADD SELF DOWNLOADABLE FILES (DONE 11/21/05 angie) # For future reference -- doBlastzChainNet.pl should have been used for # all steps from blastz up to this point. # REDONE 12/6/05 angie as part of doBlastzChainNet.pl command above cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain mv self.net canFam2.canFam2.net gzip canFam2.canFam2.net cd .. doBlastzChainNet.pl -continue download -stop download DEF \ >& doDownload.log tail -f doDownload.log # MULTIZ.V10 4WAY (DOG/H/M/R) (DONE 11/7/05 angie - REDONE 12/13/05) # hg17 and mm6 alignments were redone -- re-run this to pick up improvements. # Tree: ((canFam2 hg17) (mm6 rn3)) ssh kkstore01 mkdir /cluster/data/canFam2/bed/multiz4way.2005-12-13 rm -f /cluster/data/canFam2/bed/multiz4way ln -s /cluster/data/canFam2/bed/multiz4way.2005-12-13 \ /cluster/data/canFam2/bed/multiz4way cd /cluster/data/canFam2/bed/multiz4way # Setup: Copy pairwise MAF to /san/sanvol1/scratch: mkdir /san/sanvol1/scratch/dogMultiz4way foreach db (hg17 mm6 rn3) echo $db cp -pR /cluster/data/canFam2/bed/blastz.$db/mafNet \ /san/sanvol1/scratch/dogMultiz4way/$db end ls -lLR /san/sanvol1/scratch/dogMultiz4way # Make output dir and script to use /cluster/bin/scripts/runMultizV10.csh: mkdir maf cat << '_EOF_' > doMultizAll.csh #!/bin/csh -fex set chr = $1 set tmpDir = /scratch/dogMultiz4way.$chr mkdir $tmpDir set mafScratch = /san/sanvol1/scratch/dogMultiz4way # Really should write a perl script to take a tree like this and generate # commands like the ones below: # ((canFam2 hg17) (mm6 rn3)) /cluster/bin/scripts/runMultizV10.csh \ $mafScratch/mm6/$chr.maf.gz \ $mafScratch/rn3/$chr.maf.gz \ 0 canFam2.$chr $tmpDir $tmpDir/$chr.MusRat.maf /cluster/bin/scripts/runMultizV10.csh \ $mafScratch/hg17/$chr.maf.gz \ $tmpDir/$chr.MusRat.maf \ 1 canFam2.$chr $tmpDir maf/$chr.maf rm -f $tmpDir/*.maf rmdir $tmpDir '_EOF_' # << for emacs chmod 775 doMultizAll.csh awk '{print "./doMultizAll.csh " $1;}' /cluster/data/canFam2/chrom.sizes \ > jobs.lst # Run on brawny cluster ssh pk cd /cluster/data/canFam2/bed/multiz4way para make jobs.lst para time #Completed: 41 of 41 jobs #Average job time: 1296s 21.61m 0.36h 0.02d #Longest finished job: 2879s 47.98m 0.80h 0.03d #Submission to last job: 2879s 47.98m 0.80h 0.03d ls -1 missing* #ls: No match. 
# make /gbdb/ links to 4way maf files: ssh hgwdev mkdir -p /gbdb/canFam2/multiz4way/maf/multiz4way ln -s /cluster/data/canFam2/bed/multiz4way/maf/chr*.maf \ /gbdb/canFam2/multiz4way/maf/multiz4way/ # load into database cd /tmp hgLoadMaf -warn canFam2 multiz4way \ -pathPrefix=/gbdb/canFam2/multiz4way/maf/multiz4way # load summary table to replace pairwise cat /cluster/data/canFam2/bed/multiz4way/maf/chr*.maf \ | nice hgLoadMafSummary canFam2 multiz4waySummary stdin # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql canFam2 -e "alter table multiz4waySummary drop index chrom_2" hgsql canFam2 -e "alter table multiz4waySummary drop index chrom_3" # put 4way MAF out for download ssh kolossus cd /cluster/data/canFam2/bed/multiz4way mkdir mafDownload foreach f (maf/*.maf) nice gzip -c $f > mafDownload/$f:t.gz end cd mafDownload md5sum *.maf.gz > md5sum.txt # make a README.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/canFam2/multiz4way ln -s /cluster/data/canFam2/bed/multiz4way/mafDownload/{*.maf.gz,*.txt} \ /usr/local/apache/htdocs/goldenPath/canFam2/multiz4way # Cleanup rm -rf /san/sanvol1/scratch/dogMultiz4way/ # PHASTCONS 4WAY WITH METHODS FROM PAPER (DONE 11/9/05 angie - REDONE 12/13/05) # ((canFam2,hg17),(mm6,rn3)) # redo of hg17, mm6 -> multiz -> phastCons. ssh kkstore01 mkdir -p /san/sanvol1/scratch/canFam2/chrom cp -p /cluster/data/canFam2/?{,?}/chr*.fa /san/sanvol1/scratch/canFam2/chrom/ # Split chrom fa into smaller windows for phastCons: ssh pk mkdir /cluster/data/canFam2/bed/multiz4way/phastCons mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.split cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.split set WINDOWS = /san/sanvol1/scratch/canFam2/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/san/sanvol1/scratch/canFam2/chrom set WINDOWS=/san/sanvol1/scratch/canFam2/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O canFam2,hg17,mm6,rn3 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/canFam2/bed/multiz4way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para make jobList para time #Completed: 41 of 41 jobs #Average job time: 132s 2.21m 0.04h 0.00d #Longest finished job: 232s 3.87m 0.06h 0.00d #Submission to last job: 232s 3.87m 0.06h 0.00d ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. 
# Use phyloFit to make an initial model: ssh kolossus cd /cluster/data/canFam2/bed/multiz4way/phastCons /cluster/bin/phast/msa_view ../maf/chr{2,20,37}.maf \ --aggregate canFam2,hg17,mm6,rn3 \ -i MAF -o SS > all.ss /cluster/bin/phast/phyloFit all.ss \ --tree "((canFam2,hg17),(mm6,rn3))" \ -i SS --out-root starting-tree cat starting-tree.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #TRAINING_LNL: -378601835.277927 #BACKGROUND: 0.285684 0.213599 0.213743 0.286974 #RATE_MAT: # -0.889874 0.169851 0.563150 0.156874 # 0.227172 -1.149929 0.168661 0.754097 # 0.752694 0.168547 -1.148977 0.227736 # 0.156169 0.561285 0.169622 -0.887076 #TREE: ((canFam2:0.161521,hg17:0.166335):0.114597,(mm6:0.071038,rn3:0.076403):0.114597); # also get GC content from model -- if similar enough, no need to extract it # separately above. awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod #0.427342 # OK, use .427 for --gc below. # Use values of --target-coverage and --expected-lengths sort of like # those in the latest run on human, 0.25 and 12, just as a starting point. # Multiply each subst rate on the TREE line by 3.75 which is roughly the # ratio of noncons to cons in # /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.estimate/ave.* /cluster/bin/phast/tree_doctor -s 3.75 starting-tree.mod \ > starting-tree.noncons.mod /cluster/bin/phast/consEntropy --NH 9.7834 0.25 12 \ starting-tree{,.noncons}.mod #( Solving for new omega: 12.000000 13.069907 13.002421 13.002166 ) #Transition parameters: gamma=0.250000, omega=12.000000, mu=0.083333, nu=0.027778 #Relative entropy: H=0.803151 bits/site #Expected min. length: L_min=11.957635 sites #Expected max. length: L_max=9.271001 sites #Phylogenetic information threshold: PIT=L_min*H=9.603785 bits #Recommended expected length: omega=13.002166 sites (for L_min*H=9.783400) # OK, use --expected-lengths 13. ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.18 13 ave.{cons,noncons}.mod #ERROR: too many iterations, not converging; try without --NH. # -- it gets that error unless I raise coverage to 0.34. Well, # stick with 13 for now... # THIRD ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.15 13 ave.{cons,noncons}.mod #ERROR: too many iterations, not converging; try without --NH. # -- it gets that error unless I raise coverage to 0.31. Well, # stick with 13 for now... # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. ssh pk mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate # REDO ONLY cp ../../../multiz4way.2005-11-07/phastCons/run.estimate/ave.*.mod . # REDO ONLY -- skip the first iteration stuff from here on out, and # do subsequent iteration stuff using params from 11-07 run. # FIRST ITERATION: Use ../starting-tree.mod: # REDO ONLY cp ../../../multiz4way.2005-11-07/phastCons/run.estimate/ave.*.mod . # REDO ONLY -- skip the First iteration stuff from here on out, and # do subsequent iteration stuff using params from 11-07 run. 
cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.427 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 13 --target-coverage 0.25 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.427 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 13 --target-coverage 0.15 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/san/sanvol1/scratch/canFam2/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end para make jobList para time #Completed: 2434 of 2434 jobs #Average job time: 29s 0.48m 0.01h 0.00d #Longest finished job: 59s 0.98m 0.02h 0.00d #Submission to last job: 214s 3.57m 0.06h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kolossus cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: ((canFam2:0.063241,hg17:0.067179):0.047808,(mm6:0.029756,rn3:0.031924):0.047808); #ave.noncons.mod:TREE: ((canFam2:0.201135,hg17:0.213229):0.151937,(mm6:0.094208,rn3:0.101249):0.151937); # SECOND ITERATION: #ave.cons.mod:TREE: ((canFam2:0.056147,hg17:0.059693):0.042487,(mm6:0.026610,rn3:0.028533):0.042487); #ave.noncons.mod:TREE: ((canFam2:0.192186,hg17:0.203958):0.145480,(mm6:0.090727,rn3:0.097451):0.145480); # THIRD ITERATION: #ave.cons.mod:TREE: ((canFam2:0.052533,hg17:0.055873):0.039758,(mm6:0.024967,rn3:0.026765):0.039758); #ave.noncons.mod:TREE: ((canFam2:0.188439,hg17:0.200074):0.142743,(mm6:0.089242,rn3:0.095829):0.142743); # REDO: #ave.cons.mod:TREE: ((canFam2:0.052883,hg17:0.056022):0.039955,(mm6:0.024815,rn3:0.026742):0.039955); #ave.noncons.mod:TREE: ((canFam2:0.188924,hg17:0.199837):0.142839,(mm6:0.088391,rn3:0.095457):0.142839); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # FIRST ITERATION ONLY: # Check the total entropy figure to see if we're way off. 
# This takes an hour for 4way (exponential in #species) and has never # produced a different answer from the input after the first iteration, # so do this for the first iteration only: /cluster/bin/phast/consEntropy --NH 9.7834 0.25 13 ave.{cons,noncons}.mod #ERROR: too many iterations, not converging; try without --NH # Dang. That is a new one. Oh well, I'll proceed with 13. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. The we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh pk mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.phast cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 13 --target-coverage 0.15 \ --quiet --seqname $chr --idpref $pref \ --viterbi /san/sanvol1/scratch/canFam2/phastCons/ELEMENTS/$chr/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS/$chr/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS} foreach chr (`awk '{print $1;}' /cluster/data/canFam2/chrom.sizes`) mkdir /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS}/$chr end rm -f jobList foreach f (/san/sanvol1/scratch/canFam2/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para make jobList para time #Completed: 2434 of 2434 jobs #Average job time: 11s 0.18m 0.00h 0.00d #Longest finished job: 39s 0.65m 0.01h 0.00d #Submission to last job: 74s 1.23m 0.02h 0.00d # back on kolossus: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/canFam2/bed/multiz4way/phastCons cp /dev/null all.bed foreach d (/san/sanvol1/scratch/canFam2/phastCons/ELEMENTS/*) echo $d:t awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ $d/*.bed \ | /cluster/bin/scripts/lodToBedScore >> all.bed end ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. # However we don't have dog gene annotations -- we just have xenoRefGene # which is of course biased towards conserved genes. So cover will be # artificially high. Shoot for "all.bed 5%" and make sure cover is # somewhat higher than 68.9%. cd /cluster/data/canFam2/bed/multiz4way/phastCons featureBits -enrichment canFam2 xenoRefGene:cds all.bed # FIRST ITERATION: too high, reduce target-coverage: #xenoRefGene:cds 1.268%, all.bed 6.812%, both 1.091%, cover 86.06%, enrich 12.63x # SECOND ITERATION: still too high. #xenoRefGene:cds 1.268%, all.bed 5.432%, both 1.048%, cover 82.67%, enrich 15.22x # THIRD ITERATION: close enough. #xenoRefGene:cds 1.268%, all.bed 4.960%, both 1.025%, cover 80.83%, enrich 16.29x # REDO: close enough. #xenoRefGene:cds 1.268%, all.bed 4.960%, both 1.025%, cover 80.85%, enrich 16.30x # Having met the CDS coverage target, load up the results. 
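# (Optional hedged cross-check, not part of the original run: the "all.bed N%"
#  figure from featureBits can be approximated straight from the BED plus
#  chrom.sizes.  featureBits divides by non-gap bases (2384996543), so this
#  rough version, which divides by total chrom.sizes bases, will read slightly
#  low.)
    awk '{bases += $3 - $2} END {print bases, "bases covered by elements"}' all.bed
    awk '{total += $2} END {print total, "bases in chrom.sizes"}' \
        /cluster/data/canFam2/chrom.sizes
    # covered/total should land near the ~5% target discussed above.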
hgLoadBed canFam2 phastConsElements4way all.bed # Create wiggle ssh pk mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.wib cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.wib rm -rf /san/sanvol1/scratch/canFam2/phastCons/wib mkdir -p /san/sanvol1/scratch/canFam2/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /san/sanvol1/scratch/canFam2/phastCons/wib zcat `ls -1 /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS/$chr/*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS \ | sed -e 's/\/$//'`) echo doWigEncode $chr >> jobList end para make jobList para time #Completed: 41 of 41 jobs #Average job time: 35s 0.59m 0.01h 0.00d #Longest finished job: 67s 1.12m 0.02h 0.00d #Submission to last job: 67s 1.12m 0.02h 0.00d # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from san/sanvol1 cd /cluster/data/canFam2/bed/multiz4way/phastCons rm -rf wib POSTPROBS rsync -av /san/sanvol1/scratch/canFam2/phastCons/wib . rsync -av /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/canFam2/multiz4way/wib cd /cluster/data/canFam2/bed/multiz4way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/canFam2/multiz4way/wib/ hgLoadWiggle canFam2 phastCons4way \ -pathPrefix=/gbdb/canFam2/multiz4way/wib wib/*.wig rm wiggle.tab # and clean up san/sanvol1. rm -r /san/sanvol1/scratch/canFam2/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /san/sanvol1/scratch/canFam2/chrom # Offer raw scores for download since fly folks are likely to be interested: # back on kolossus cd /cluster/data/canFam2/bed/multiz4way/phastCons/POSTPROBS mkdir ../postprobsDownload foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`) zcat `ls -1 $chr/$chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \ > ../postprobsDownload/$chr.pp.gz end cd ../postprobsDownload md5sum *.gz > md5sum.txt # Make a README.txt there too. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/canFam2/phastCons4way cd /usr/local/apache/htdocs/goldenPath/canFam2/phastCons4way ln -s /cluster/data/canFam2/bed/multiz4way/phastCons/postprobsDownload/* . # UNCERTIFIED ASSEMBLY REGIONS (DONE 11/9/05 angie) ssh hgwdev mkdir /cluster/data/canFam2/bed/uncertified cd /cluster/data/canFam2/bed/uncertified mv ../../broad/canFam2.0uncertifiedRegions.txt . tail +2 canFam2.0uncertifiedRegions.txt \ | awk -F"\t" '{print "chr" $1 "\t" $2 "\t" $3 "\t" $7 "\t" $6;}' \ | sed -e 's/Missing read partners/MRP/; s/Haplotype inconsistency/HI/; \ s/Negative gap/NG/; s/Linking inconsistency/LI/; \ s/Illogical links/IL/; s/; /+/g;' \ > uncertified.bed hgLoadBed -tab canFam2 uncertified uncertified.bed # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 11/21/05 angie) # REDONE parts: Sequence and simpleRepeat. 12/5/05 angie ssh kolossus cd /cluster/data/canFam2 #- Build the .tar.gz files -- no genbank for now. cat << '_EOF_' > jkStuff/zipAll.csh rm -rf bigZips mkdir bigZips tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked cd bed/simpleRepeat tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed cd ../.. 
'_EOF_' # << this line makes emacs coloring happy csh -efx ./jkStuff/zipAll.csh |& tee zipAll.log #- Look at zipAll.log to make sure all file lists look reasonable. cd bigZips md5sum *.gz > md5sum.txt # Make a README.txt cd .. mkdir chromGz foreach f ( ?{,?}/chr*.fa ) echo $f:t:r gzip -c $f > chromGz/$f:t.gz end cd chromGz md5sum *.gz > md5sum.txt # Make a README.txt #- Link the .gz and .txt files to hgwdev:/usr/local/apache/... ssh hgwdev set gp = /usr/local/apache/htdocs/goldenPath/canFam2 mkdir -p $gp/bigZips ln -s /cluster/data/canFam2/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips mkdir -p $gp/chromosomes ln -s /cluster/data/canFam2/chromGz/{chr*.gz,*.txt} $gp/chromosomes # Take a look at bigZips/* and chromosomes/* # Can't make refGene upstream sequence files - no refSeq for dog. mkdir database # Create README.txt files in database/ to explain the files. # CHORI BAC END PAIRS (DONE 11/22/05 angie) # Rachel downloaded and parsed BAC end sequences and pair info from # CHORI and NCBI -- seee makeCanFam1.doc "BAC END PAIRS". Use those # files and align to canFam2. # Do BLAT alignments of BAC ends to the genome on the pitakluster. # copy over masked contigs to the san ssh kkstore01 cd /cluster/data/canFam2 mkdir /san/sanvol1/scratch/canFam2/maskedContigs foreach d (?{,?}/chr*_?{,?}) echo $d:t cp -p $d/$d:t.fa /san/sanvol1/scratch/canFam2/maskedContigs/ end # copy over 11.ooc file to the san cp -p /cluster/bluearc/canFam2/11.ooc /san/sanvol1/scratch/canFam2/ # make output directory, run directory and bed/ directory mkdir -p /san/sanvol1/scratch/canFam2/bacendsRun/psl mkdir /cluster/data/canFam2/bed/bacends ssh pk cd /san/sanvol1/scratch/canFam2/bacendsRun echo '#LOOP\n/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/canFam2/11.ooc {check out line+ /san/sanvol1/scratch/canFam2/bacendsRun/psl/$(root1).$(root2).psl}\n#ENDLOOP' > gsub ls -1S /san/sanvol1/scratch/canFam2/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/canFam1/bacends/bacends*.fa > bacends.lst gensub2 contigs.lst bacends.lst gsub jobList para make jobList para time #Completed: 49005 of 49005 jobs #Average job time: 7s 0.12m 0.00h 0.00d #Longest finished job: 105s 1.75m 0.03h 0.00d #Submission to last job: 1884s 31.40m 0.52h 0.02d # back on kkstore01, retrieve pitakluster results cd /cluster/data/canFam2/bed/bacends rsync -av /san/sanvol1/scratch/canFam2/bacendsRun/{*.lst,batch*,g*,j*,p*} . 
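# (Illustration only, not captured from the run: gensub2 expands the #LOOP
#  template in gsub once per (contig, bacends file) pair, substituting
#  $(path1)/$(path2) with the full paths and $(root1)/$(root2) with the file
#  names minus directory and extension.  The file names below are hypothetical
#  examples, so one jobList line looks roughly like:)
# /cluster/bin/x86_64/blat /san/sanvol1/scratch/canFam2/maskedContigs/chr1_1.fa /san/sanvol1/scratch/canFam1/bacends/bacends1.fa -ooc=/san/sanvol1/scratch/canFam2/11.ooc {check out line+ /san/sanvol1/scratch/canFam2/bacendsRun/psl/chr1_1.bacends1.psl}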
# lift alignments ssh kolossus cd /cluster/data/canFam2/bed/bacends pslSort dirs raw.psl tmp psl # started 08:15, PID: 16561 pslCheck raw.psl >& check.log wc -l check.log #4379 check.log grep '< previous block' check.log | wc -l #2194 grep -v 'Error: invalid PSL' check.log | grep -v '< previous block' | wc -l #0 wc -l raw.psl #27352955 raw.psl -- 2194 / 27352955 = 0.000080 = 0.008% overlapping pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ raw.psl bacEnds.psl /dev/null # Processed 27147454 alignments wc -l bacEnds.psl #769149 bacEnds.psl liftUp bacEnds.lifted.psl /cluster/data/canFam2/jkStuff/liftAll.lft \ warn bacEnds.psl awk '{print $10}' bacEnds.lifted.psl | sort | uniq | wc -l #317042 faSize /san/sanvol1/scratch/canFam1/bacends/bacends*.fa #290482800 bases (10817785 N's 279665015 real 279665015 upper 0 lower) in 393408 sequences in 99 files calc 317042 / 393408 #317042 / 393408 = 0.805886 # Make BAC end pairs track: mkdir pairs cd pairs set ncbiDir = /cluster/data/ncbi/bacends/dog/bacends.1 /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose ../bacEnds.lifted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds wc -l * # 40 bacEnds.long # 460 bacEnds.mismatch # 44366 bacEnds.orphan # 133248 bacEnds.pairs # 478 bacEnds.short # 816 bacEnds.slop # 179408 total # Filter by score and sort by {chrom,chromStart}: awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed cat bacEnds.{slop,short,long,mismatch,orphan} \ | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed wc -l *.bed # 133241 bacEndPairs.bed # 46057 bacEndPairsBad.bed # 179298 total extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \ bacEndPairsBad.bed \ | sorttbl tname tstart | headchg -del \ > bacEnds.load.psl # load into database ssh hgwdev cd /cluster/data/canFam2/bed/bacends/pairs hgLoadBed canFam2 bacEndPairs bacEndPairs.bed -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql #Loaded 133241 elements of size 11 # note - this next track isn't pushed to RR, just used for assembly QA hgLoadBed canFam2 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql #Loaded 46057 elements of size 11 hgLoadPsl canFam2 -table=all_bacends bacEnds.load.psl #load of all_bacends did not go as planned: 748544 record(s), 0 row(s) skipped, 928 warning(s) loading psl.tab # Diagnose... echo select \* from all_bacends | hgsql -N canFam2 > /tmp/1 diff psl.tab /tmp/1 | less # Looks like some rows of psl.tab have negative numbers in the # qBaseInsert column! 
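# (Hedged follow-up sketch, not from the original session: count and eyeball
#  the offending rows.  This assumes psl.tab carries hgLoadPsl's leading bin
#  column, putting qBaseInsert in field 7 (qName in 11, tName in 15); if the
#  table was built without a bin column, shift each field down by one.)
    awk '$7 < 0' psl.tab | wc -l
    awk '$7 < 0 {print $11, $15, $7}' psl.tab | head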
# load BAC end sequences into seq table so alignments may be viewed
# symlink to FASTA sequence file in ncbi directory
    mkdir -p /gbdb/canFam2/bacends
    ln -s /cluster/data/ncbi/bacends/dog/bacends.1/canFamBacends.fa \
        /gbdb/canFam2/bacends/canFamBacends.fa
    hgLoadSeq canFam2 /gbdb/canFam2/bacends/canFamBacends.fa
    #393408 sequences
    # featureBits canFam1 all_bacends
    #211644790 bases of 2359845093 (8.969%) in intersection
    featureBits canFam2 all_bacends
    #218709219 bases of 2384996543 (9.170%) in intersection
    # featureBits canFam1 bacEndPairs
    #2334084046 bases of 2359845093 (98.908%) in intersection
    featureBits canFam2 bacEndPairs
    #2353239742 bases of 2384996543 (98.668%) in intersection
    # featureBits canFam1 bacEndPairsBad
    # 548130287 bases of 2359845093 (23.227%) in intersection
    featureBits canFam2 bacEndPairsBad
    #534195657 bases of 2384996543 (22.398%) in intersection
    # add trackDb entry and html
    # Clean up
    ssh pk
    rm -r /san/sanvol1/scratch/canFam2/bacendsRun

# ACCESSIONS FOR CONTIGS (DONE 2006-01-04 kate)
    # Found WGS project entry in Genbank for canFam2:
    # LOCUS AAEX02000000 35696 rc DNA linear MAM 13-DEC-2005
    # DEFINITION Canis familiaris whole genome shotgun sequencing project.
    # Project has a total of 35696 contigs, accessioned as
    # AAEX02000001-AAEX02035696, also named cont2.0--cont2.35695
    # Poked around after submitting "canis[ORGN] AND WGS[KYWD]" search
    # to Entrez Nucleotide
    cd /cluster/data/canFam2/broad
    grep contig Dog2.0.agp | wc -l
    # 35696
    grep contig Dog2.0.agp | head -1
    # chr01 3000001 3024025 2 W contig_30884 ...
    hgsql canFam2 -Ns -e "select frag from chr1_gold limit 1"
    # contig_30884
    grep contig_0 Dog2.0.agp
    # chrUn 57343783 57365839 7071 W contig_0
    grep contig_35695 Dog2.0.agp
    # chrUn 73893245 73902697 11515 W contig_35695
    grep contig_35696 Dog2.0.agp
    # nothing
    # (contig_* numbers in the AGP run 0-35695; the accessions are 1-based,
    #  ending at AAEX02035696 in Genbank entries)
    # Looked at summary file of contig entries from Genbank by
    # clicking the project entry (AAEX02000000), then clicking on
    # the WGS contig accession list at the end of the page
    # the AGP and our Gap tables frag names contig_* correspond to the
    # cont2.* names and AAEX02* names (-1) in the Genbank entries,
    # so we don't need to parse out the comments from the summary
    # entries.
    # create file mapping our contig names to Genbank accessions
    cd /cluster/data/canFam2/bed
    mkdir -p contigAcc
    cd contigAcc
    sed -n 's/.*contig_\([0-9][0-9]*\).*/\1/p' \
        /cluster/data/canFam2/broad/Dog2.0.agp | \
        sort -un | \
        awk '{printf ("contig_%d\tAAEX020%05d.1\n", $1, $1+1)}' > contigAcc.tab
    wc -l contigAcc.tab
    # 35696 contigAcc.tab
    # load into database
    hgsql canFam2 < $HOME/kent/src/hg/lib/contigAcc.sql
    echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
        hgsql canFam2
    hgsql canFam2 -Nse "SELECT COUNT(*) FROM contigAcc"
    # 35696

############################################################################
# QA NOTE (2007-08-16 brooke):
# rhesus nets on canFam2 were requested by Merck (see the following
# thread: http://www.soe.ucsc.edu/pipermail/genome/2007-August/014414.html).
# We may want to consider adding rhesus nets/chains to next dog release.
############################################################################
# BLASTZ BOSTAU2 (DONE - 2006-01-17 - 2006-01-18 - Hiram)
    ssh kk
    mkdir /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
    cd /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
BLASTZ_M=50
# dog vs.
cow # TARGET: Dog SEQ1_DIR=/iscratch/i/canFam2/nib SEQ1_LEN=/iscratch/i/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow (bosTau2) SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/cluster/data/canFam2/bed/blastzBosTau2.2006-01-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # running 2006-01-17 13:58 # done 2005-01-18 03:40 # real 823m27.169s ssh hgwdev cd /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17 time featureBits canFam2 chainBosTau2Link \ > fb.canFam2.chainBosTau2Link 2>&1 & # real 32m3.460s # 1376066425 bases of 2384996543 (57.697%) in intersection # SWAP CHAINS FROM RN4, BUILD NETS ETC. (DONE 2/25/06 angie) mkdir /cluster/data/canFam2/bed/blastz.rn4.swap cd /cluster/data/canFam2/bed/blastz.rn4.swap doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.canFam2/DEF \ >& do.log echo "check /cluster/data/canFam2/bed/blastz.rn4.swap/do.log" \ | mail -s "check do.log" $USER ln -s blastz.rn4.swap /cluster/data/canFam2/bed/blastz.rn4 ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastzCanFam2.2006-02-18 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits canFam2 chainMm8Link # 816262344 bases of 2384996543 (34.225%) in intersection ############################################################################ ## BLASTZ swap from panTro2 alignments (DONE 2006-05-04 markd) # FINISHED (2006-07-21 kate) ssh hgwdev mkdir /cluster/data/canFam2/bed/blastz.panTro2.swap ln -s blastz.panTro2.swap /cluster/data/canFam2/bed/blastz.panTro2 cd /cluster/data/canFam2/bed/blastz.panTro2.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/panTro2/bed/blastz.canFam2/DEF >& swap.out& # create the net files ssh hgwdev cd /cluster/data/canFam2/bed/blastz.panTro2.swap/axtChain nice netClass -verbose=0 -noAr noClass.net canFam2 panTro2 canFam2.panTro2.net nice gzip canFam2.panTro2.net # Mark stopped here time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -continue=load \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/panTro2/bed/blastz.canFam2/DEF >& swap.2.out& ########################################################################## # NSCAN track - (2006-06-01 markd) cd /cluster/data/canFam2/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv http://genes.cs.wustl.edu/jeltje/canFam2/canFam2.nscan.gtf gzip canFam2.nscan.gtf wget -nv -r -np http://genes.cs.wustl.edu/jeltje/canFam2/chr_ptx/ cat genes.cs.wustl.edu/jeltje/canFam2/chr_ptx/*.fa >canFam2.nscan.pep.fa gzip canFam2.nscan.pep.fa rm -rf genes.cs.wustl.edu chmod a-w *.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. 
ldHgGene -bin -gtf -genePredExt canFam2 nscanGene canFam2.nscan.gtf.gz # add .a suffix to match transcript id WRONG, see below: hgPepPred -suffix=.a canFam2 generic nscanPep canFam2.nscan.pep.fa.gz rm *.tab # update trackDb; need a canFam2-specific page to describe informants dog/canFam2/nscanGene.html dog/canFam2/trackDb.ra # 2006-07-24 markd: reloaded without -suffix hgPepPred canFam2 generic nscanPep canFam2.nscan.pep.fa.gz ########################################################################## # MAKE Human Proteins (hg18) track (DONE braney 2006-08-17) ssh pk destDir=/san/sanvol1/scratch/canFam2/blastDb mkdir -p /cluster/data/canFam2/bed/tblastn.hg18KG cd /cluster/data/canFam2/bed/tblastn.hg18KG find $destDir -name "*.nsq" | sed "s/\.nsq//" > target.lst mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(500000/`wc target.lst | awk "{print \\\$1}"`\) # 36727/(500000/6149) = 451.668646 split -l 452 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst rm -rf /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut mkdir -p /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/canFam2/jkStuff/subChr.lft carry $f.3 liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/canFam2/jkStuff/liftAll.lft carry $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' exit chmod +x blastSome gensub2 target.lst kg.lst blastGsub blastSpec para create blastSpec para push # Completed: 504218 of 504218 jobs # CPU time in finished jobs: 28268903s 471148.38m 7852.47h 327.19d 0.896 y # IO & Wait Time: 5565118s 92751.97m 1545.87h 64.41d 0.176 y # Average job time: 67s 1.12m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 710s 11.83m 0.20h 0.01d # Submission to last job: 91362s 1522.70m 25.38h 1.06d ssh kki cd /cluster/data/canFam2/bed/tblastn.hg18KG tcsh cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) $(path2) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.$2_*.psl | /cluster/home/braney/bin/i386/simpleChain -chrom=$2 -prot -outPsl -maxGap=250000 stdin ../c.$2.`bas ename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? 
> chain.lst gensub2 chain.lst ../../chrom.lst chainGsub chainSpec para create chainSpec para push # Completed: 3335 of 3335 jobs # CPU time in finished jobs: 1056945s 17615.76m 293.60h 12.23d 0.034 y # IO & Wait Time: 75548s 1259.13m 20.99h 0.87d 0.002 y # Average job time: 340s 5.66m 0.09h 0.00d # Longest finished job: 25716s 428.60m 7.14h 0.30d # Submission to last job: 61450s 1024.17m 17.07h 0.71d ssh kkstore04 cd /cluster/data/canFam2/bed/tblastn.hg18KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.*.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -k 14,14 -k 16,16n -k 18,18n u.*.psl m60* | uniq > /cluster/data/canFam2/bed/tblastn.hg18KG/blastHg18KG.psl cd .. wc blastHg18KG.psl # 62741 1317561 12491782 blastHg18KG.psl # hg17 64343 1351203 10913814 blastHg18KG.psl pslUniq blastHg18KG.psl stdout | wc # 36272 761712 8427975 # hg17 18827 395367 2992653 cat kgfa/*fa | grep ">" | wc # 303516 303516 4739313 # hg17 309494 309494 4810327 ssh hgwdev cd /cluster/data/canFam2/bed/tblastn.hg18KG hgLoadPsl canFam2 blastHg18KG.psl featureBits canFam2 blastHg18KG # 32565727 bases of 2384996543 (1.365%) in intersection # hg17 35781547 bases of 2384996543 (1.500%) in intersection exit # back to kksilo rm -rf blastOut # End tblastn ######################################################################### # BLASTZ/CHAIN/NET FELCAT3 (Done Nov 16 2006 heather) # working in /cluster/data/felCat3 because /cluster/data/canFam2 is 94% full mkdir /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 ln -s /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 /cluster/data/canFam2/bed/blastz.felCat3 cd /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/hg/canFam2/nib SEQ1_LEN=/scratch/hg/canFam2/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Cat felCat3 SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk -chainMinScore=3000 -chainLinearGap=medium -blastzOutRoot /cluster/bluearc/felCat3/blastz.canFam2 >& do.log & tail -f do.log nice featureBits -chrom=chr1 canFam2 chainFelCat3Link # 65223887 bases of 121613690 (53.632%) in intersection ######################################################################### # BLASTZ/CHAIN/NET DOG AND HORSE (DONE 2/19/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 cd /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 cat << '_EOF_' > DEF # Horse vs. Dog BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Dog canFam2 SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_LEN=/cluster/data/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do.log & tail -f do.log # 6 jobs failed. 
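# (General parasol recovery pattern, a hedged aside rather than the exact
#  commands used for those 6 jobs; run these from whichever run directory
#  holds the para batch file.  "para problems" summarizes the failures and
#  "para -retries=5 push" re-queues only what has not yet completed.)
    para problems
    para -retries=5 push
    para time
    # expect "Completed: N of N jobs" once the stragglers finish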
# Robert re-ran those 6 jobs. (2/22/07) doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -continue cat -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do2.log & tail -f do2.log ln -s blastz.canFam2.2007-02-18 /cluster/data/equCab1/bed/blastz.canFam2 ssh hgwdev nice featureBits equCab1 -chrom=chrI chainCanFam2Link # 128747357 bases of 177498097 (72.534%) in intersection bash time nice -n 19 featureBits equCab1 chainCanFam2Link \ > fb.equCab1.chainCanFam2Link.txt 2>&1 # 1717664968 bases of 2421923695 (70.922%) in intersection ssh kkstore04 mkdir /cluster/data/canFam2/bed/blastz.equCab1.swap cd /cluster/data/canFam2/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 & # real 115m55.880s cat *.txt # 1673177547 bases of 2384996543 (70.154%) in intersection ######################################################################### ######################################################################### # BLASTZ OPOSSUM monDom4 (DONE 2007-05-27 braney) ssh kolossus mkdir /cluster/data/canFam2/bed/blastz.monDom4 cd /cluster/data/canFam2/bed/blastz.monDom4 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Dog (canFam2) SEQ1_DIR=/san/sanvol1/scratch/canFam2/nib SEQ1_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 #SEQ2_DIR=/iscratch/i/monDom4/monDom4RMExtra.2bit SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit #SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes SEQ2_LEN=/cluster/data/monDom4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/canFam2/bed/blastz.monDom4 TMPDIR=/scratch/tmp '_EOF_' # << emacs doBlastzChainNet.pl DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ >& blastz.out 2>&1 & ######################################################################### # Blastz Marmoset calJac1 (DONE - 2007-11-28 - 30 - Hiram) ssh kkstore04 screen # use screen to control this job # managing disk space on full filesystem mkdir -p /cluster/store8/canFam2/bed/blastzCalJac1.2007-11-28 ln -s /cluster/store8/canFam2/bed/blastzCalJac1.2007-11-28 \ /cluster/data/canFam2/bed cd /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28 cat << '_EOF_' > DEF # Dog vs. 
Marmoset
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=2000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
        -chainLinearGap=medium -bigClusterHub=kk -verbose=2 \
        > do.log 2>&1 &
    # real 864m49.290s
    # Completed: 266999 of 267000 jobs
    # Crashed: 1 jobs
    # CPU time in finished jobs: 17058860s 284314.33m 4738.57h 197.44d 0.541 y
    # IO & Wait Time: 1031855s 17197.59m 286.63h 11.94d 0.033 y
    # Average job time: 68s 1.13m 0.02h 0.00d
    # Longest finished job: 1302s 21.70m 0.36h 0.02d
    # Submission to last job: 47998s 799.97m 13.33h 0.56d
    # one job ran out of memory on the kk nodes. It was finished on kolossus
    /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
        /scratch/data/canFam2/nib/chr21.nib:chr21:20000000-40010000 qParts/part238.lst \
        ../DEF \
        ../psl/chr21.nib:chr21:20000000-40010000/chr21.nib:chr21:20000000-40010000_part238.lst.psl
    # continuing
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
        -continue=cat -chainLinearGap=medium -bigClusterHub=kk -verbose=2 \
        > cat.log 2>&1 &
    # had a problem with files missing from /scratch/data/ on
    # the Iservers - rsync that directory /iscratch/data/ full on
    # kkr1u00 from /cluster/bluearc/scratch/data/ and push it to
    # the other Iservers
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
        -continue=chainMerge -chainLinearGap=medium -bigClusterHub=kk \
        -verbose=2 > chainMerge.log 2>&1 &
    # real 79m39.909s
    cat fb.canFam2.chainCalJac1Link.txt
    # 1369690756 bases of 2384996543 (57.429%) in intersection

    mkdir /cluster/data/calJac1/bed/blastz.canFam2.swap
    cd /cluster/data/calJac1/bed/blastz.canFam2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
        -chainMinScore=3000 -chainLinearGap=medium \
        -swap -bigClusterHub=kk > swap.log 2>&1 &
    # encountered difficulties with /scratch/data/ on kolossus
    # had to finish the netChains.csh script manually, then continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
        -continue=load -chainMinScore=3000 -chainLinearGap=medium \
        -swap -bigClusterHub=kk > load.log 2>&1 &
    # real 56m44.375s
    cat fb.calJac1.chainCanFam2Link.txt
    # 1451345669 bases of 2929139385 (49.549%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11 - Hiram)
    ssh kkstore04
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
    cd /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
    cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl
\ `pwd`/DEF -verbose=2 \ -bigClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 # the pk became free as this job had 3 days to go. So, chill out # the batch on memk, let it finish its running jobs, then finish # the batch manually on pk. Continuing: time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF -verbose=2 -smallClusterHub=memk \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat -syntenicNet > cat.log 2>&1 # real 175m41.674s cat fb.canFam2.chainBosTau4Link.txt # 1367017426 bases of 2384996543 (57.317%) in intersection mkdir /cluster/data/bosTau4/bed/blastz.canFam2.swap cd /cluster/data/bosTau4/bed/blastz.canFam2.swap time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11/DEF \ -verbose=2 -smallClusterHub=memk \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > swap.log 2>&1 # real 388m46.213s cat fb.bosTau4.chainCanFam2Link.txt # 1447088347 bases of 2731830700 (52.971%) in intersection ############################################################################# # BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-18 - larrym) ssh kkstore04 screen # use screen to control this multi-day job mkdir /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18 cd /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18 cat << '_EOF_' > DEF # Dog vs. Horse BLASTZ_M=50 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/data/canFam2/nib SEQ1_LEN=/cluster/data/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/cluster/data/equCab2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/canFam2/bed/blastz.equCab2.2008-04-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab2/blastz.canFam2 >& do.log & 0.117u 0.073s 9:20:00.22 0.0% 0+0k 0+0io 1pf+0w six parasol jobs failed; retried with: para -retries=5 push disable bad servers: 1 kkr10u31.kilokluster.ucsc.edu 1 kkr11u16.kilokluster.ucsc.edu 1 kkr12u29.kilokluster.ucsc.edu pushd /cluster/bluearc/equCab2/blastz.canFam2/psl find . -name '*.psl' | wc 64625 64625 6657995 This is the correct number of psl files, so continue with cat step: time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab2/blastz.canFam2 >>& do.log & 0.212u 0.173s 2:49:45.35 0.0% 0+0k 0+0io 62pf+0w ln -s blastz.equCab2.2008-04-18 /cluster/data/canFam2/bed/blastz.equCab2 ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. 
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# lastz Poodle canFamPoodle1 (DONE - 2009-06-06,22 - Hiram)
    mkdir /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
    cd /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
    cat << '_EOF_' > DEF
# Tasha boxer dog vs Shadow poodle
# parameters for very close alignment from human-chimp example
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Tasha, canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Shadow poodle, canFamPoodle1
SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LAP=0
SEQ2_LIMIT=800
BASE=/hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl \
        -verbose=2 \
        `pwd`/DEF \
        -noDbNameCheck -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=5000 -chainLinearGap=medium > do.log 2>&1
    # real 8250m12.188s
    # Completed: 374825 of 374825 jobs
    # CPU time in finished jobs: 187350504s 3122508.40m 52041.81h 2168.41d 5.941 y
    # IO & Wait Time: 4127960s 68799.33m 1146.66h 47.78d 0.131 y
    # Average job time: 511s 8.51m 0.14h 0.01d
    # Longest finished job: 2339s 38.98m 0.65h 0.03d
    # Submission to last job: 494836s 8247.27m 137.45h 5.73d
    # the lastz run thought it failed, but it didn't, continuing:
    time nice -n +19 doBlastzChainNet.pl \
        -verbose=2 \
        `pwd`/DEF \
        -continue=cat -noDbNameCheck -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=5000 -chainLinearGap=medium > cat.log 2>&1
    # real 4841m48.581s <- this is actually a fb time on cavPor3
    # this finish step was less than an hour
    fb.canFam2.chainCanFamPoodle1Link.txt
    # 1405528799 bases of 2384996543 (58.932%) in intersection

    mkdir /hive/data/genomes/canFamPoodle1/bed/blastz.canFam2.swap
    cd /hive/data/genomes/canFamPoodle1/bed/blastz.canFam2.swap
    time nice -n +19 doBlastzChainNet.pl \
        -verbose=2 \
        /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06/DEF \
        -swap -noDbNameCheck -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1
    # real 8585m58.789s
    cat fb.canFamPoodle1.chainCanFam2Link.txt
    # 1377478896 bases of 1517497798 (90.773%) in intersection

############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29 - Hiram)
    mkdir /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
    cd /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
    cat << '_EOF_' > DEF
# Dog vs.
Horse BLASTZ_M=50 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/data/canFam2/nib SEQ1_LEN=/scratch/data/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 338m57.973s cat fb.canFam2.chainEquCab2Link.txt # 1676663178 bases of 2384996543 (70.300%) in intersection # this is identical to what went down before ? mkdir /hive/data/genomes/equCab2/bed/blastz.canFam2.swap cd /hive/data/genomes/equCab2/bed/blastz.canFam2.swap time doBlastzChainNet.pl \ /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29/DEF \ -swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 286m51.658s fb.equCab2.chainCanFam2Link.txt # 1721407500 bases of 2428790173 (70.875%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ # BLASTZ FOR ZEBRAFISH (danRer5) (DONE, 2009-08-07, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS mkdir /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07 cd /hive/data/genomes/canFam2/bed ln -s blastz.danRer5.2009-08-07 blastz.danRer5 cd /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07 cat << '_EOF_' > DEF # dog (canFam2) vs zebrafish (danRer5) BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/data/canFam2/canFam2.2bit SEQ1_LEN=/hive/data/genomes/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - zebrafish (danRer5) SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit SEQ2_LEN=/hive/data/genomes/danRer5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs chmod +x DEF time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ >& doBlastz.log & # 0.806u 0.488s 3:58:52.14 0.0% 0+0k 0+0io 0pf+0w cat fb.canFam2.chainDanRer5Link.txt # 29234486 bases of 2384996543 (1.226%) in intersection # DO BLASTZ SWAP TO CREATE CANFAM2 CHAINS, NETS, AXNET, MAFNET AND # ALIGNMENT DOWNLOADS ON DANRER5 - documented in danRer5.txt ########################################################## # monDom5 chains/nets (2009-03-02 Andy) ssh hgwdev cd /hive/data/genomes/monDom5/bed/blastz.canFam2 doBlastzChainNet.pl -swap DEF ## (should have also put in a -noLoadChainSplit... 
#  see below)
##########################################################
# monDom5 chain table unsplit (2009-09-08 Andy)
    ssh hgwdev
    cd /hive/data/genomes/canFam2/bed/blastz.monDom5.swap/axtChain
    hgLoadChain -tIndex canFam2 chainMonDom5 canFam2.monDom5.all.chain.gz
    netFilter -minGap=10 canFam2.monDom5.net.gz | hgLoadNet -verbose=0 canFam2 netMonDom5 stdin
    echo "show tables like 'chr%chainMonDom5%'" | hgsql canFam2 | tail -n +2 \
      | while read -a line; do echo drop table $line | hgsql canFam2; done
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
# ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram)
    mkdir /hive/data/genomes/canFam2/bed/lastzAilMel1.2010-02-04
    cd /hive/data/genomes/canFam2/bed/lastzAilMel1.2010-02-04
    cat << '_EOF_' > DEF
# Dog vs. Panda
# parameters from the Panda paper supplemental where they describe
# their lastz parameters
BLASTZ_K=2200
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_H=2000
BLASTZ_C=2
BLASTZ_T=2
# our usual M
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Panda ailMel1
SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/canFam2/bed/lastzAilMel1.2010-02-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -bigClusterHub=swarm -smallClusterHub=memk -workhorse=hgwdev \
        -noLoadChainSplit \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    # real 360m31.044s
    cat fb.canFam2.chainAilMel1Link.txt
    # 1791212709 bases of 2384996543 (75.103%) in intersection

    mkdir /hive/data/genomes/ailMel1/bed/blastz.canFam2.swap
    cd /hive/data/genomes/ailMel1/bed/blastz.canFam2.swap
    time doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam2/bed/lastzAilMel1.2010-02-04/DEF \
        -swap -noLoadChainSplit -workhorse=hgwdev -bigClusterHub=pk \
        -smallClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \
        > swap.log 2>&1 &
    # real 128m41.005s
    cat fb.ailMel1.chainCanFam2Link.txt
    # 1788107935 bases of 2245312831 (79.637%) in intersection

############################################################################
# LASTZ/CHAIN/NET Marmoset calJac3 (DONE - 2010-02-12 - Hiram)
    # use a screen to control this job
    screen
    mkdir /hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12
    cd /hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12
    cat << '_EOF_' > DEF
# dog vs marmoset
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset (calJac3)
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_LIMIT=75
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
        -verbose=2 `pwd`/DEF \
        -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        > do.log 2>&1 &
    # real 1988m5.805s
    cat
fb.canFam2.chainCalJac3Link.txt # 1363307334 bases of 2384996543 (57.162%) in intersection mkdir /hive/data/genomes/calJac3/bed/blastz.canFam2.swap cd /hive/data/genomes/calJac3/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 129m56.144s cat fb.calJac3.chainHg19Link.txt # 1397333116 bases of 2752505800 (50.766%) in intersection ##################################################################### # LASTZ Dog Swap (DONE 2010-06-01 - Chin) cd /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01 cat fb.canFam2.chainFelCat4Link.txt mkdir /hive/data/genomes/felCat4/bed/blastz.canFam2.swap cd /hive/data/genomes/felCat4/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 327m42.053s cat fb.felCat4.chainCanFam2Link.txt # 1467506008 bases of 1990635005 (73.720%) in intersection ######################################################################### # LASTZ Cow BosTau6 (DONE - 2011-05-25 - Chin) screen mkdir /hive/data/genomes/canFam2/bed/lastzBosTau6.2011-05-25 cd /hive/data/genomes/canFam2/bed/lastzBosTau6.2011-05-25 cat << '_EOF_' > DEF # dog vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/data/canFam2/nib SEQ1_LEN=/scratch/data/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau6 SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam2/bed/lastzBosTau6.2011-05-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 261m49.574 cat fb.canFam2.chainBosTau6Link.txt # 1389712912 bases of 2384996543 (58.269%) in intersection # Create link cd /hive/data/genomes/canFam2/bed ln -s lastzBosTau6.2011-05-25 lastz.bosTau6 # and the swap mkdir /hive/data/genomes/bosTau6/bed/blastz.canFam2.swap cd /hive/data/genomes/bosTau6/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzBosTau6.2011-05-25/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 88m38.555s cat fb.bosTau6.chainCanFam2Link.txt # 1404315725 bases of 2649682029 (52.999%) in intersection cd /hive/data/genomes/bosTau6/bed ln -s blastz.canFam2.swap lastz.canFam2 ############################################################################ # SNP131 (DONE 6/13/11 angie) mkdir -p /hive/data/outside/dbSNP/131/dog cd /hive/data/outside/dbSNP/131/dog # Unfortunately, the contigs we have from Broad are not exactly the same # as the NCBI RefSeq assembly (distinct from the NCBI GenBank assembly) # that dbSNP used. So ignoreDbSnpContigs tosses out all the Un's and randoms's. 
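# (Hypothetical sketch, not part of the original run: with two contig name
#  lists in hand -- ours, e.g. from the contigAcc.tab built above, and
#  whatever list dbSNP's contig info provides -- comm shows what only one
#  side has.  dbSnpContigList.txt is a stand-in file name.)
    cut -f2 /cluster/data/canFam2/bed/contigAcc/contigAcc.tab | sort > ourContigs.txt
    sort -u dbSnpContigList.txt > dbSnpContigs.sorted.txt
    comm -23 ourContigs.txt dbSnpContigs.sorted.txt | wc -l   # only in ours
    comm -13 ourContigs.txt dbSnpContigs.sorted.txt | wc -l   # only in dbSNP's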
    cat > config.ra <& do.log &
    tail -f do.log

#########################################################################
# LASTZ Cow BosTau7 (DONE - 2012-01-24 - Chin)
    screen
    mkdir /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24
    cd /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24
    cat << '_EOF_' > DEF
# dog vs cow
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau7
SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit
SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    # real 229m55.587s
    cat fb.canFam2.chainBosTau7Link.txt
    # 1381869475 bases of 2384996543 (57.940%) in intersection
    # Create link
    cd /hive/data/genomes/canFam2/bed
    ln -s lastzBosTau7.2012-01-24 lastz.bosTau7

    # and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24/DEF \
        -swap -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    # real 86m11.258s
    cat fb.bosTau7.chainCanFam2Link.txt
    # 1458512906 bases of 2804673174 (52.003%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.canFam2.swap lastz.canFam2

#####################################################################
# POLYA-SEQ TRACK (from Adnan Derti, Merck) (DONE, Andy 2012-02-06)
# (see hg19.txt for multi-species build notes)
############################################################################
# construct liftOver to canFam3 (DONE - 2012-05-02 - Hiram)
    screen -S canFam2   # manage this longish running job in a screen
    mkdir /hive/data/genomes/canFam2/bed/blat.canFam3.2012-05-02
    cd /hive/data/genomes/canFam2/bed/blat.canFam3.2012-05-02
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
        -ooc=/scratch/data/canFam2/11.ooc \
        -debug -dbHost=hgwdev -workhorse=hgwdev canFam2 canFam3 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
        -ooc=/scratch/data/canFam2/11.ooc \
        -dbHost=hgwdev -workhorse=hgwdev canFam2 canFam3 > do.log 2>&1
    # something odd with three jobs that took forever to complete
    # real 3649m15.243s
    # Completed: 7392 of 7392 jobs
    # CPU time in finished jobs: 997168s 16619.46m 276.99h 11.54d 0.032 y
    # IO & Wait Time: 238352s 3972.54m 66.21h 2.76d 0.008 y
    # Average job time: 167s 2.79m 0.05h 0.00d
    # Longest finished job: 215892s 3598.20m 59.97h 2.50d
    # Submission to last job: 216676s 3611.27m 60.19h 2.51d

    # verify this file exists:
    ls -og -L /gbdb/canFam2/liftOver/canFam2ToCanFam3.over.chain.gz
    # -rw-rw-r-- 1 583607 May 5 02:16 /gbdb/canFam2/liftOver/canFam2ToCanFam3.over.chain.gz
    # and try out the conversion on genome-test from canFam2 to canFam3
############################################################################
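# (Hedged spot check, not from the original log: liftOver can also be run by
#  hand on a tiny BED to confirm the new chain behaves.  The interval below is
#  an arbitrary made-up example, not a verified region.)
    printf 'chr1\t1000000\t1001000\ttest1\n' > /tmp/cf2.test.bed
    liftOver /tmp/cf2.test.bed \
        /gbdb/canFam2/liftOver/canFam2ToCanFam3.over.chain.gz \
        /tmp/cf2ToCf3.bed /tmp/cf2ToCf3.unmapped
    cat /tmp/cf2ToCf3.bed /tmp/cf2ToCf3.unmapped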