# for emacs: -*- mode: sh; -*-


# Drosophila simulans -- 
# 
# WUSTL's 
# http://www.genome.wustl.edu/???
#

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  | geneid      | genePred geneidPep              |
#  | genscan     | genePred genscanPep             |
#  +-------------+---------------------------------+


# DOWNLOAD SEQUENCE (DONE 4/11/05 angie)
    ssh kolossus
    mkdir /cluster/store8/droSim1
    cd /cluster/data
    ln -s /cluster/store8/droSim1 droSim1
    cd /cluster/data/droSim1
    mkdir downloads
    cd downloads
    wget ftp://genome.wustl.edu/private/lhillier/old/simulans_050411.tar.gz
    tar xzvf simulans_050411.tar.gz
    foreach c (2L 2R 2h 3L 3R 3h 4 U X Xh Yh)
      mkdir /cluster/data/droSim1/$c
      mv chr$c{,_random}.{fa,agp} /cluster/data/droSim1/$c/
    end
    # Some warnings from mv about chroms that are _random-only or _random-less,
    # no problem.

    # download mitochondrion sequence
    mkdir /cluster/data/droSim1/M
    cd /cluster/data/droSim1/M
    # go to http://www.ncbi.nih.gov/ and search Genome for 
    # "simulans mitochondrion complete".  That shows the gi number:
    # 45332829
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=45332829&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the 
    # Drosophila simulans mitochondrion complete genome, and then replace the 
    # header line with just ">chrM".
    cd ..
    # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
    foreach c (?{,?})
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo checking consistency of $agp and $fa
          checkAgpAndFa $agp $fa | tail -1
        endif
      end
    end
    faSize */chr*.fa
#142420719 bases (15167212 N's 127253507 real 127253507 upper 0 lower) in 18 sequences in 18 files
#Total size: mean 7912262.2 sd 9722195.0 min 14972 (chrM) max 27517382 (chr3R) median 2996586
#N count: mean 842622.9 sd 861219.0
#U count: mean 7069639.3 sd 9068950.6
#L count: mean 0.0 sd 0.0
    # see what kind of gap types we have:
    awk '$5 == "N" {print $7;}' */chr*.agp | uniq | sort | uniq
#contig
#fragment
    # biggest gap size is 3000:
    awk '$5 == "N" {print $6;}' */chr*.agp | sort -nr | head -1
    # rough hist of gap sizes from AGP:
    awk '$5 == "N" {print $6;}' */chr*.agp | textHistogram stdin -binSize=100
    # fancy hist of gap size from FASTA:
    cat */chr*.fa | faGapSizes -niceSizes=10,50,100,200,300,700,1000,3000 stdin


# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 4/11/05 angie)
    ssh kolossus
    cd /cluster/data/droSim1
    foreach c (?{,?})
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end
    # splitFaIntoContigs makes new dirs for _randoms.  Move their contents 
    # back into the main chrom dirs and get rid of the _random dirs.
    foreach d (*_random)
      set base = `echo $d | sed -e 's/_random$//'`
      mkdir -p $base/lift
      mv $d/lift/oOut.lst $base/lift/rOut.lst
      mv $d/lift/ordered.lft $base/lift/random.lft
      mv $d/lift/ordered.lst $base/lift/random.lst
      rmdir $d/lift
      mv $d/* $base
      rmdir $d
    end
    # Make a "pseudo-contig" for processing chrM too:
    mkdir M/chrM_1
    sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
    mkdir M/lift
    echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
    echo "chrM_1" > M/lift/ordered.lst
    set msize = `faSize M/chrM.fa | awk '{print $1;}'`
    echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft


# MAKE JKSTUFF AND BED DIRECTORIES (DONE 4/11/05 angie)
    # This used to hold scripts -- better to keep them inline here so 
    # they're in CVS.  Now it should just hold lift file(s) and 
    # temporary scripts made by copy-paste from this file.  
    mkdir /cluster/data/droSim1/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/droSim1/bed


# CREATING DATABASE (DONE 4/11/05 angie)
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  881G  779G  54% /var/lib/mysql
    hgsql -e 'create database droSim1'


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/11/05 angie)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
      | hgsql droSim1


# REPEATMASKER (DONE 4/11/05 angie)
    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kolossus
    cd /cluster/data/droSim1
    foreach c (?{,?})
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
          -minGapSize=100
        cd ../..
      end
    end

    #- Make the run directory and job list:
    cd /cluster/data/droSim1
    cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/droSim1/$2
/bin/cp $2 /tmp/droSim1/$2/
cd /tmp/droSim1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
popd
/bin/cp /tmp/droSim1/$2/$2.out ./
if (-e /tmp/droSim1/$2/$2.align) /bin/cp /tmp/droSim1/$2/$2.align ./
if (-e /tmp/droSim1/$2/$2.tbl) /bin/cp /tmp/droSim1/$2/$2.tbl ./
if (-e /tmp/droSim1/$2/$2.cat) /bin/cp /tmp/droSim1/$2/$2.cat ./
/bin/rm -fr /tmp/droSim1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/droSim1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/droSim1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDrosophila
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (?{,?})
      foreach d ($c/chr${c}{,_random}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/droSim1/jkStuff/RMDrosophila \
                 /cluster/data/droSim1/$d $f \
               '{'check out line+ /cluster/data/droSim1/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/droSim1/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
#Completed: 320 of 320 jobs
#Average job time:                5548s      92.47m     1.54h    0.06d
#Longest finished job:            6926s     115.43m     1.92h    0.08d
#Submission to last job:          7331s     122.18m     2.04h    0.08d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kolossus
    cd /cluster/data/droSim1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (?{,?})
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
        > /dev/null
      endif
      if (-e lift/random.lft && ! -z lift/random.lft) then
        liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
        > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/droSim1
    hgLoadOut droSim1 */chr*.fa.out


# VERIFY REPEATMASKER RESULTS (DONE 4/11/05 angie)
    # Eyeball some repeat annotations in the browser, compare to lib seqs.
    # Run featureBits on droSim1 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits droSim1 rmsk
#15094026 bases of 127256433 (11.861%) in intersection
    # compare to dm2:
    featureBits dm2 rmsk
#16178239 bases of 131698467 (12.284%) in intersection
    # comparable...


# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 4/11/05 angie)
    # Make nib/, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits; 
    # can also load up other tables that don't depend on masking.  
    # NOTE: next time, should make .2bit -- smoother to replace later on.
    ssh kolossus
    cd /cluster/data/droSim1
    mkdir nib
    foreach c (?{,?})
      foreach f ($c/chr${c}{,_random}.fa)
        if (-e $f) then
          echo "nibbing $f"
          faToNib $f nib/$f:t:r.nib
        endif
      end
    end

    # Make symbolic links from /gbdb/droSim1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/droSim1/nib
    foreach f (/cluster/data/droSim1/nib/chr*.nib)
      ln -s $f /gbdb/droSim1/nib
    end
    # Load /gbdb/droSim1/nib paths into database and save size info.
    cd /cluster/data/droSim1
    hgsql droSim1  < $HOME/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib droSim1 /gbdb/droSim1/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N droSim1 > chrom.sizes
    # take a look at chrom.sizes, should be 18 lines
    wc chrom.sizes
#     18      36     314 chrom.sizes


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 4/11/05 angie)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("droSim1", "Apr. 2005", "/gbdb/droSim1/nib", "D. simulans", \
               "chr2L:827101-841600", 1, 52, "D. simulans", \
                "Drosophila simulans", "/gbdb/droSim1/html/description.html", \
                0, 0, "WUSTL version 1.0");'
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO defaultDb (genome, name) values ("D. simulans", "droSim1");'
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO genomeClade (genome, clade, priority) \
            values ("D. simulans", "insect", 15);'

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add droSim1 and do
    mkdir drosophila/droSim1
    make update DBS=droSim1

    # go public on genome-test
    mkdir /gbdb/droSim1/html
    cvs commit makefile
    cvs add drosophila/droSim1
    cvs commit drosophila/droSim1
    # make alpha in a clean tree's trackDb/

    # Also add a first-cut trackDb/drosophila/droSim1/description.html, 
    # check it in and "make alpha" in clean tree again.


# MAKE LIFTALL.LFT (DONE 4/11/05 angie)
    # Redone 7/13/04: chrM/lift/ordered.lft was regenerated w/correct size.
    ssh kolossus
    cd /cluster/data/droSim1
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft


# SIMPLE REPEATS (TRF) (DONE 4/11/05 angie)
    ssh kolossus
    mkdir /cluster/data/droSim1/bed/simpleRepeat
    cd /cluster/data/droSim1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/droSim1/?{,?}/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/$MACHTYPE/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
      end
    end
    csh -ef jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \
      trf/*.bed > /dev/null

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed droSim1 simpleRepeat \
      /cluster/data/droSim1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits droSim1 simpleRepeat
#2608862 bases of 127256433 (2.050%) in intersection
    featureBits dm2 simpleRepeat
#1597029 bases of 131698467 (1.213%) in intersection


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 4/11/05 angie)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kolossus
    cd /cluster/data/droSim1/bed/simpleRepeat
    mkdir trfMask
    foreach f (trf/*.bed)
      echo -n "filtering $f... "
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords:
    mkdir trfMaskChrom
    foreach f (../../?{,?}/chr*.fa)
      set c = $f:t:r
      echo $c
      liftUp trfMaskChrom/$c.bed ../../jkStuff/liftAll.lft warn \
        trfMask/${c}_[0-9]*.bed > /dev/null
    end
    # No simpleRepeats in chrM --> liftUp warning, but it's OK:
#chrM
#No lines lifted!


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 4/11/05 angie)
    ssh kolossus
    cd /cluster/data/droSim1
    # Soft-mask (lower-case) the contig and chr .fa's, 
    # then make hard-masked versions from the soft-masked.  
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    foreach c (?{,?})
      echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
    # Make one big masked 2bit file, and make a link to it from /gbdb/droSim1/
    faToTwoBit */chr*.fa droSim1.2bit
    twoBitInfo droSim1.2bit stdout \
    | awk '{printf "%s\t%s\t/gbdb/droSim1/droSim1.2bit\n", $1, $2}' \
      > chromInfo.tab
    ssh hgwdev
    cd /cluster/data/droSim1
    ln -s /cluster/data/droSim1/droSim1.2bit /gbdb/droSim1/
    # Clean up unmasked nib and update dbDb and chromInfo to point at .2bit:
    hgsql -h genome-testdb hgcentraltest \
      -e 'update dbDb set nibPath = "/gbdb/droSim1" where name = "droSim1"'
    hgsql droSim1 -e 'drop table chromInfo'
    hgsql droSim1 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql droSim1 \
      -e 'load data local infile "chromInfo.tab" into table chromInfo;'
    rm -r /gbdb/droSim1/nib
    rm -r /cluster/data/droSim1/nib/


# GOLD AND GAP TRACKS (DONE 4/11/05 angie)
    ssh hgwdev
    cd /cluster/data/droSim1
    cp /dev/null chrom.lst
    foreach f (?{,?}/chr*.agp chrM)
      echo $f:t:r >> chrom.lst
    end
    hgGoldGapGl -noGl -chromLst=chrom.lst droSim1 /cluster/data/droSim1 .
    # featureBits fails if there's no chrM_gap, so make one:
    # echo "create table chrM_gap like chr1_gap" | hgsql droSim1
    # oops, that won't work until v4.1, so do this for the time being:
    hgsql droSim1 -e 'create table chrM_gap select * from chr2L_gap where 0=1'


# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 4/13/05 angie)
    ssh kolossus
    cd /cluster/data/droSim1
    #- Build the .tar.gz files -- no genbank for now.
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip zipTmp
mkdir zip
mkdir zipTmp
cd zipTmp
ln -s ../?{,?}/chr*.agp .
tar chvzf ../zip/chromAgp.tar.gz *.agp
rm -f ./*; ln -s ../?{,?}/chr*.fa.out .
tar chvzf ../zip/chromOut.tar.gz *.fa.out
rm -f ./*; ln -s ../?{,?}/chr*.fa .
tar chvzf ../zip/chromFa.tar.gz *.fa
rm -f ./*; ln -s ../?{,?}/chr*.fa.masked .
tar chvzf ../zip/chromFaMasked.tar.gz *.fa.masked
cd ../bed/simpleRepeat
tar cvzf ../../zip/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
rm -rf zipTmp
'_EOF_'
    # << this line makes emacs coloring happy
    csh -efx ./jkStuff/zipAll.csh |& tee zipAll.log
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check .tar.gz file integrity:
    foreach f (zip/*.tar.gz)
      tar tvzf $f > $f.test
      tail -1 $f.test
    end
    wc -l zip/*.tar.gz.test

    #- Copy the compressed files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    set gp = /usr/local/apache/htdocs/goldenPath/droSim1
    mkdir -p $gp/bigZips
    ln -s /cluster/data/droSim1/zip/*.gz $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f ( /cluster/data/droSim1/*/chr*.fa )
      echo $f
      nice gzip -c $f > $gp/chromosomes/$f:t.gz
    end

    cd $gp/bigZips
    nice md5sum *.gz > md5sum.txt
    cd $gp/chromosomes
    nice md5sum *.gz > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's
    # Can't make refGene upstream sequence files - no refSeq for simulans.


# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 4/11/05 angie)
    ssh kkr1u00
    # .2bit of chroms that have been repeat- and trf-masked:
    mkdir -p /iscratch/i/droSim1
    cp -p /cluster/data/droSim1/droSim1.2bit /iscratch/i/droSim1/
    iSync


# PRODUCING GENSCAN PREDICTIONS (DONE 4/11/05 angie)
    # Run on small cluster -- genscan needs big mem.
    ssh hgwdev
    mkdir /cluster/data/droSim1/bed/genscan
    cd /cluster/data/droSim1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kolossus
    cd /cluster/data/droSim1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    cp /dev/null genome.list
    foreach f ( `ls -1S /cluster/data/droSim1/*/chr*_*/chr*_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ssh kki
    cd /cluster/data/droSim1/bed/genscan
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 41 of 41 jobs
#Average job time:                 103s       1.71m     0.03h    0.00d
#Longest finished job:             158s       2.63m     0.04h    0.00d
#Submission to last job:           710s      11.83m     0.20h    0.01d
    # If there were crashes, we would diagnose with "para problems".  
    # If a job had crashed due to genscan running out of memory, we would 
    # re-run it manually with "-window=1200000" instead of "-window=2400000".

    # Convert these to chromosome level files as so:
    ssh kolossus
    cd /cluster/data/droSim1/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/droSim1/bed/genscan
    ldHgGene -gtf droSim1 genscan genscan.gtf
    hgPepPred droSim1 generic genscanPep genscan.pep
    hgLoadBed droSim1 genscanSubopt genscanSubopt.bed


# MAKE 11.OOC FILE FOR BLAT (DONE 4/11/05 angie)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is ~4.4% of human judging by gapless dm1 genome size from 
    # featureBits -- we would use 45, but bump that up a bit to be more 
    # conservative).
    ssh kolossus
    mkdir /cluster/bluearc/droSim1
    blat /iscratch/i/droSim1/droSim1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/droSim1/11.ooc -repMatch=100
#Wrote 4617 overused 11-mers to /cluster/bluearc/droSim1/11.ooc
    ssh kkr1u00
    cp -p /cluster/bluearc/droSim1/*.ooc /iscratch/i/droSim1/
    iSync


# AUTO UPDATE GENBANK MRNA RUN  (DONE 4/13/05 angie)
    # get some idea of whether this species is represented in refSeq or 
    # has a significant number of native mRNAs.  version numbers will 
    # change:
    ssh eieio
    cd /cluster/data/genbank/data/processed/refseq.10/full
    awk '{print $4 " " $5}' mrna.gbidx | sort | uniq -c | sort -nr
    cd /cluster/data/genbank/data/processed/genbank.146.0/full
    awk '$4 == "Drosophila" {print $4 " " $5}' mrna.gbidx \
    | sort | uniq -c | sort -nr
#     27 Drosophila simulans
    # Wow... with that few native mRNAs, it's questionable whether we 
    # should even bother with a native mRNA track.  But we're doing an 
    # mrna run to get xenoMrna, so what the heck, 27 > limit for single 
    # hgBlat query.
    grep 'Drosophila simulans' est.*.gbidx | wc -l
#4105

    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .

    # Edit etc/genbank.conf and add these lines:
# droSim1 (D. simulans)
droSim1.genome = /iscratch/i/droSim1/nib/chr*.nib
droSim1.lift = /cluster/data/droSim1/jkStuff/liftAll.lft
droSim1.refseq.mrna.native.load = no
droSim1.refseq.mrna.xeno.load = yes
droSim1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
droSim1.genbank.mrna.xeno.load = yes
droSim1.genbank.est.xeno.load = no
droSim1.downloadDir = droSim1

    cvs ci etc/genbank.conf
    # Since D. simulans is a new species for us, edit src/lib/gbGenome.c.  
    # Pick some other browser species, & monkey-see monkey-do.  
    # Edit src/align/gbBlat to add /iscratch/i/droSim1/11.ooc
    cvs diff |& grep -v Diffing
    make
    cvs ci -m "Added droSim1 (D. simulans)."

    # Install to /cluster/data/genbank:
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, (xeno) RefSeq only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial droSim1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droSim1
    featureBits droSim1 xenoRefGene
#25294349 bases of 127256433 (19.877%) in intersection
    # Clean up:
    rm -rf work/initial.droSim1

    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial droSim1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droSim1
    featureBits droSim1 mrna
#17381 bases of 127256433 (0.014%) in intersection
    featureBits droSim1 xenoMrna
#22557993 bases of 127256433 (17.726%) in intersection
    # Clean up:
    rm -rf work/initial.droSim1

    ssh eieio
    # -initial for ESTs
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial droSim1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 droSim1 &
    featureBits droSim1 intronEst
#514738 bases of 127256433 (0.404%) in intersection
    featureBits droSim1 est
#1193700 bases of 127256433 (0.938%) in intersection
    # Clean up:
    rm -rf work/initial.droSim1


# MAKE GCPERCENT (DONE 4/11/05 angie)
    ssh hgwdev
    mkdir /cluster/data/droSim1/bed/gcPercent
    cd /cluster/data/droSim1/bed/gcPercent
    # create and load gcPercent table
    hgsql droSim1  < ~/src/hg/lib/gcPercent.sql
    hgGcPercent droSim1 ../../nib


# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 5/13/05 alisq)
    ssh hgwdev
    echo 'insert into blatServers values("droSim1", "blat9", "17780", 1, 0); \
          insert into blatServers values("droSim1", "blat9", "17781", 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest


# SWAP CHAINS FROM DM2, BUILD NETS ETC. (DONE 5/23/05 angie)
    mkdir /cluster/data/droSim1/bed/blastz.dm2.swap
    cd /cluster/data/droSim1/bed/blastz.dm2.swap
    doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droSim1/DEF \
      >& do.log &
    tail -f do.log
    # Add {chain,net}Dm2 to trackDb.ra if necessary.
    ln -s blastz.dm2.swap /cluster/data/droSim1/bed/blastz.dm2


# MAKE Drosophila Proteins track  (DONE braney 2005-04-20)
    ssh kolossus
    mkdir -p /cluster/data/droSim1/blastDb
    cd /cluster/data/droSim1/blastDb
    awk "{print \$2}" ../*/chr*/*.lft > subChr.lst
    for i in `cat subChr.lst`; do ln -s ../*/*/$i.fa; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log
    cd ..
    cat */chr*/*.lft > jkStuff/subChr.lft   

    ssh kkr1u00
    mkdir -p /iscratch/i/droSim1/blastDb
    cp /cluster/data/droSim1/blastDb/* /iscratch/i/droSim1/blastDb
    iSync > sync.out
    
    mkdir -p /cluster/data/droSim1/bed/tblastn.dm1FB
    cd /cluster/data/droSim1/bed/tblastn.dm1FB
    ls -1S /iscratch/i/droSim1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst
    exit

    # back to kolossus
    cd /cluster/data/droSim1/bed/tblastn.dm1FB
    mkdir fbfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\)
    # 18735/(150000/320) = 39.968000
    split -l 40 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir blastOut
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
	liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/dm1/bed/blat.dm1FB/protein.lft carry $f.2
	liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droSim1/jkStuff/subChr.lft carry $f.3
	liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/droSim1/jkStuff/liftAll.lft carry $f.4

        mv $3.tmp $3
        rm -f $f.1 $f.2 
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp 
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 bug.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/droSim1/bed/tblastn.dm1FB
    para create blastSpec
    para try, push
# Completed: 150080 of 150080 jobs
# CPU time in finished jobs:    2042525s   34042.08m   567.37h   23.64d  0.065 y
# IO & Wait Time:                631624s   10527.07m   175.45h    7.31d  0.020 y
# Average job time:                  18s       0.30m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             239s       3.98m     0.07h    0.00d
# Submission to last job:          3533s      58.88m     0.98h    0.04d

    ssh kki
    cd /cluster/data/droSim1/bed/tblastn.dm1FB
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    para create chainSpec

    para try, push...
# Completed: 469 of 469 jobs
# CPU time in finished jobs:       3454s      57.57m     0.96h    0.04d  0.000 y
# IO & Wait Time:                  3742s      62.37m     1.04h    0.04d  0.000 y
# Average job time:                  15s       0.26m     0.00h    0.00d
# Longest finished job:             329s       5.48m     0.09h    0.00d
# Submission to last job:           649s      10.82m     0.18h    0.01d

    exit
    # back to kolossus
    cd /cluster/data/droSim1/bed/tblastn.dm1FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/droSim1/bed/tblastn.dm1FB/blastDm1FB.psl

    ssh hgwdev
    cd /cluster/data/droSim1/bed/tblastn.dm1FB

    hgLoadPsl droSim1 blastDm1FB.psl

# End tblastn


# BLASTZ/CHAIN/NET DROYAK1 (REDONE 6/26/05 angie)
    # Originally run w/default human-mouse params 4/19/05.
    mkdir /cluster/data/droSim1/bed/blastz.droYak1.2006-06-26
    cd /cluster/data/droSim1/bed/blastz.droYak1.2006-06-26
    cat << '_EOF_' > DEF
# D.simulans vs. D.yakuba

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. simulans
SEQ1_DIR=/iscratch/i/droSim1/droSim1.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/droSim1/chrom.sizes

# QUERY - D. yakuba
SEQ2_DIR=/iscratch/i/droYak1/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak1/chrom.sizes

BASE=/cluster/data/droSim1/bed/blastz.droYak1.2006-06-26
'_EOF_'
    # << this line keeps emacs coloring happy
    ~/kent/src/utils/doBlastzChainNet.pl DEF \
      -blastzOutRoot /cluster/bluearc/droSim1droYak1 >& do.log &
    tail -f do.log
    ln -s blastz.droYak1.2006-06-26 /cluster/data/droSim1/bed/blastz.droYak1
    # Add {chain,net}DroYak1 to trackDb.ra if necessary.
    featureBits -chrom=chr2R droSim1 chainDroYak1Link
#16921386 bases of 18190314 (93.024%) in intersection
    featureBits -chrom=chr2R droSim1 chainDm2Link
#17421444 bases of 18190314 (95.773%) in intersection
    featureBits -chrom=chr4 droSim1 chainDroYak1Link
#634681 bases of 807953 (78.554%) in intersection
    featureBits -chrom=chr4 droSim1 chainDm2Link
#696892 bases of 807953 (86.254%) in intersection


# BLASTZ/CHAIN/NET DP2 (DONE 4/21/05 angie)
    mkdir /cluster/data/droSim1/bed/blastz.dp2.2005-04-20
    cd /cluster/data/droSim1/bed/blastz.dp2.2005-04-20
    cat << '_EOF_' > DEF
# D.simulans vs. D.pseudoobscura

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. simulans
SEQ1_DIR=/iscratch/i/droSim1/droSim1.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/droSim1/chrom.sizes

# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp2/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dp2/chrom.sizes

BASE=/cluster/data/droSim1/bed/blastz.dp2.2005-04-20
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF -bigClusterHub=kk9 \
      -blastzOutRoot /cluster/bluearc/droSim1dp2 >& do.log &
    tail -f do.log
    rmdir /cluster/bluearc/droSim1dp2
    ln -s blastz.dp2.2005-04-20 /cluster/data/droSim1/bed/blastz.dp2
    # Add {chain,net}Dp2 to trackDb.ra if necessary.
    featureBits -chrom=chr2R droSim1 chainDp2Link
#13268189 bases of 18190314 (72.941%) in intersection
    featureBits -chrom=chr4 droSim1 chainDp2Link
#300051 bases of 807953 (37.137%) in intersection


# BLASTZ/CHAIN/NET DP3 (DONE 4/21/05 angie)
    mkdir /cluster/data/droSim1/bed/blastz.dp3.2005-04-21
    cd /cluster/data/droSim1/bed/blastz.dp3.2005-04-21
    cat << '_EOF_' > DEF
# D.simulans vs. D.pseudoobscura

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. simulans
SEQ1_DIR=/iscratch/i/droSim1/droSim1.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/droSim1/chrom.sizes

# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp3/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dp3/chrom.sizes

BASE=/cluster/data/droSim1/bed/blastz.dp3.2005-04-21
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /cluster/bluearc/droSim1dp3 >& do.log &
    tail -f do.log
    rmdir /cluster/bluearc/droSim1dp3
    ln -s blastz.dp3.2005-04-21 /cluster/data/droSim1/bed/blastz.dp3
    # Add {chain,net}Dp3 to trackDb.ra if necessary.
    featureBits -chrom=chr2R droSim1 chainDp3Link
#13351459 bases of 18190314 (73.399%) in intersection
    featureBits -chrom=chr4 droSim1 chainDp3Link
#314925 bases of 807953 (38.978%) in intersection


# 4-FLY SIM-MEL-YAK-PSEUDO MULTIZ (DONE 4/22/05 angie)
    # Tree (4-way):
    # (((droSim1 dm2) droYak1) dp3)
    ssh kkstore01
    mkdir /cluster/data/droSim1/bed/multiz4way.2005-04-21
    ln -s /cluster/data/droSim1/bed/multiz4way.2005-04-21 \
      /cluster/data/droSim1/bed/multiz4way
    cd /cluster/data/droSim1/bed/multiz4way
    # Setup: Copy pairwise MAF to /cluster/bluearc:
    mkdir /cluster/bluearc/flyMultiz4way
    foreach db (dm2 droYak1 dp3)
      cp -pR /cluster/data/droSim1/bed/blastz.$db/mafNet \
        /cluster/bluearc/flyMultiz4way/$db
    end
    # Make output dir:
    mkdir maf
    # Create script to run multiz.v10 in the right order:
    cat << '_EOF_' > doMultiz.csh
#!/bin/csh -fe
set chr = $1
set tmp = /scratch/flyMultiz4way.$chr
mkdir $tmp

set REF = droSim1.$chr
set MEL = /cluster/bluearc/flyMultiz4way/dm2/$chr.maf.gz
set YAK = /cluster/bluearc/flyMultiz4way/droYak1/$chr.maf.gz
set PSE = /cluster/bluearc/flyMultiz4way/dp3/$chr.maf.gz
set DEST = /cluster/data/droSim1/bed/multiz4way/maf/$chr.maf

set MZ10 = /cluster/bin/penn/multiz.v10
set PROJECT = /cluster/bin/penn/maf_project

if ( -s $MEL && -s $YAK ) then 
    echo "Aligning $MEL $YAK..."
    zcat $MEL > $tmp/$chr.Mel.maf
    zcat $YAK > $tmp/$chr.Yak.maf
    $MZ10 $tmp/$chr.Mel.maf $tmp/$chr.Yak.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.MelYak.maf
else if ( -s $MEL ) then
    zcat $MEL > $tmp/$chr.MelYak.maf
else if ( -s $YAK ) then
    zcat $YAK > $tmp/$chr.MelYak.maf
endif

if ( -s $PSE && -s $tmp/$chr.MelYak.maf ) then
    echo "Adding $PSE..."
    zcat $PSE > $tmp/$chr.Pse.maf
    $MZ10 $tmp/$chr.MelYak.maf $tmp/$chr.Pse.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.MelYakPse.maf
    mv $tmp/$chr.MelYakPse.maf $DEST
else if ( -s $PSE ) then
    zcat $PSE > $DEST
else if ( -s $tmp/$chr.MelYak.maf ) then
    cp $tmp/$chr.MelYak.maf $DEST
endif 

rm $tmp/$chr.*.maf
rmdir $tmp
'_EOF_'
    # << keep emacs coloring happy    
    chmod 755 doMultiz.csh
    awk '{print "./doMultiz.csh " $1;}' /cluster/data/droSim1/chrom.sizes \
      > jobs.lst
    # Run on small cluster
    ssh kki
    cd /cluster/data/droSim1/bed/multiz4way
    para create jobs.lst
    para try, check, push, check, ...
#Completed: 18 of 18 jobs
#Average job time:                 380s       6.34m     0.11h    0.00d
#Longest finished job:            1735s      28.92m     0.48h    0.02d
#Submission to last job:          1735s      28.92m     0.48h    0.02d

    # make /gbdb/ links to 4way maf files:
    ssh hgwdev
    mkdir -p /gbdb/droSim1/multiz4way/maf/multiz4way
    ln -s /cluster/data/droSim1/bed/multiz4way/maf/chr*.maf \
      /gbdb/droSim1/multiz4way/maf/multiz4way/

    # load into database
    cd /tmp
    hgLoadMaf -warn droSim1 multiz4way \
      -pathPrefix=/gbdb/droSim1/multiz4way/maf/multiz4way

    # put 4way MAF out for download
    ssh kkstore01
    cd /cluster/data/droSim1/bed/multiz4way
    mkdir mafDownload
    foreach f (maf/*.maf)
      nice gzip -c $f > mafDownload/$f:t.gz
    end
    cd mafDownload
    nice md5sum *.maf.gz > md5sum.txt
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/droSim1/multiz4way
    ln -s /cluster/data/droSim1/bed/multiz4way/mafDownload/{*.maf.gz,md5sum.txt} \
      /usr/local/apache/htdocs/goldenPath/droSim1/multiz4way
    # make a README.txt

    # Cleanup
    rm -rf /cluster/bluearc/flyMultiz4way/


# PHASTCONS 4WAY WITH METHODS FROM PAPER (DONE 4/25/05 angie)
    # Same procedure as for latest dm1 4way -- using the param estimation 
    # methods described in Adam's Genome Research paper.
    ssh kkstore01
    mkdir -p /cluster/bluearc/droSim1/chrom
    cp -p /cluster/data/droSim1/?{,?}/chr*.fa /cluster/bluearc/droSim1/chrom/
    # Split chrom fa & maf into smaller windows for phastCons:
    ssh kki
    mkdir /cluster/data/droSim1/bed/multiz4way/phastCons
    mkdir /cluster/data/droSim1/bed/multiz4way/phastCons/run.split
    cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.split
    set WINDOWS = /cluster/bluearc/droSim1/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/droSim1/chrom
set WINDOWS=/cluster/bluearc/droSim1/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
   -O droSim1,dm2,droYak1,dp3 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/droSim1/bed/multiz4way/maf/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in exists+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 18 of 18 jobs
#Average job time:                  15s       0.26m     0.00h    0.00d
#Longest finished job:              49s       0.82m     0.01h    0.00d
#Submission to last job:            49s       0.82m     0.01h    0.00d

    # Get genome-wide all-species average GC content 
    # by running msa_view --aggregate --summary-only on the WINDOWS .SS files.
    # msa_view needs seekable single-ss file, so unpack each to /tmp and 
    # compute overall average:
    # Note: for larger organisms, this trick can be used to take a random
    # sample of windows:
    # foreach f (`ls -1 /cluster/bluearc/droSim1/phastCons/WINDOWS/* \
    #             | randomLines stdin 50 stdout`)
    ssh kolossus
    cd /cluster/data/droSim1/bed/multiz4way/phastCons/
    cp /dev/null gcs.txt
    foreach f (/cluster/bluearc/droSim1/phastCons/WINDOWS/*)
      zcat $f > /tmp/droSim1.sample.ss
      /cluster/bin/phast/msa_view \
        --aggregate droSim1,dm2,droYak1,dp3 \
        -i SS --summary-only /tmp/droSim1.sample.ss \
      | awk '$1 == "[aggregate]" {printf "%0.4f\n", $3 + $4}' \
      >> gcs.txt
    end
    ave gcs.txt
#average 0.428524
    # OK, so use 0.429 as the --gc parameter below.
    rm /tmp/droSim1.sample.ss

    ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
    # Use phyloFit to make an initial model:
    ssh kolossus
    cd /cluster/data/droSim1/bed/multiz4way/phastCons
    /cluster/bin/phast/msa_view ../maf/chr3R.maf \
        --aggregate droSim1,dm2,droYak1,dp3 -i MAF -o SS \
        > all.ss
    /cluster/bin/phast/phyloFit all.ss \
      --tree "(((droSim1,dm2),droYak1),dp3)" \
      -i SS --out-root starting-tree
    cat starting-tree.mod 
#ALPHABET: A C G T 
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -74270041.694256
#BACKGROUND: 0.280086 0.220054 0.219835 0.280025 
#RATE_MAT:
#  -0.934494    0.207662    0.469274    0.257559 
#   0.264314   -1.083783    0.220083    0.599387 
#   0.597889    0.220301   -1.081724    0.263534 
#   0.257614    0.471019    0.206888   -0.935522 
#TREE: (((droSim1:0.023894,dm2:0.030119):0.030350,droYak1:0.061782):0.147792,dp3:0.147792);

    # Use consEntropy --NH to make it suggest a --expected-lengths param 
    # that we should try next.  Adam ran this on hg17 to find out the 
    # total entropy of the hg17 model:
    # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
    # Our target is that NH result: 9.7834 bits.  
    # Use the values of --target-coverage and --expected-lengths from the 
    # last iteration of the dm2 8way run, 0.565 and 36.
    # phyloFit makes one .mod but consEntropy wants two...
    # Multiply each subst rate on the TREE line by 3.5 which is roughly the 
    # ratio of noncons to cons in 
    # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
    /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
      > starting-tree.noncons.mod
    /cluster/bin/phast/consEntropy --NH 9.7834 0.565 36 \
      starting-tree*.mod
#( Solving for new omega: 36.000000 37.779742 37.741115 )
#Transition parameters: gamma=0.565000, omega=36.000000, mu=0.027778, nu=0.036079
#Relative entropy: H=0.509558 bits/site
#Required length: N=18.908610 sites
#Total entropy: NH=9.635032 bits
#Recommended expected length: omega=37.741115 sites (for NH=9.783400)
    # OK, stick with 0.565 but use --expected-lengths 37.75.

    ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
    # We're here because the actual target coverage was not satisfactory,
    # so we're changing the --target-coverage param.  Given that we're 
    # changing that, take a guess at how we should change --expected-lengths
    # in order to also hit the total entropy target.
    cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.estimate
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.46 36.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=26.963371 sites (for NH=9.783400)
    # OK, --expected-lengths 27
    # THIRD ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.40 27 ave.{cons,noncons}.mod
#Recommended expected length: omega=21.874376 sites (for NH=9.783400)
    # ==> 21.9
    # FOURTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.35 21.9 ave.{cons,noncons}.mod
#Recommended expected length: omega=17.941014 sites (for NH=9.783400)
    # ==> 17.9
    # FIFTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.357 18 ave.{cons,noncons}.mod
#Recommended expected length: omega=18.561695 sites (for NH=9.783400)
    # ==> 18.6

    # Now set up cluster job to estimate model parameters given free params 
    # --target-coverage and --expected-lengths and the data.  
    ssh kk9
    mkdir /cluster/data/droSim1/bed/multiz4way/phastCons/run.estimate
    cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.estimate
    # FIRST ITERATION: Use ../starting-tree.mod:
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.noncons.mod --gc 0.429 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 37.75 --target-coverage 0.565 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.429 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 18.6 --target-coverage 0.357 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/cluster/bluearc/droSim1/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    para create jobList
    para try, check, push, check, ...#
#Completed: 153 of 153 jobs
#Average job time:                 105s       1.76m     0.03h    0.00d
#Longest finished job:             185s       3.08m     0.05h    0.00d
#Submission to last job:           259s       4.32m     0.07h    0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kkstore01
    cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: (((droSim1:0.013566,dm2:0.015700):0.013732,droYak1:0.031231):0.077792,dp3:0.077792);
#ave.noncons.mod:TREE: (((droSim1:0.061644,dm2:0.071348):0.064130,droYak1:0.145205):0.370123,dp3:0.370123);
    # SECOND ITERATION:
#ave.cons.mod:TREE: (((droSim1:0.012146,dm2:0.014020):0.012179,droYak1:0.027991):0.068610,dp3:0.068610);
#ave.noncons.mod:TREE: (((droSim1:0.058376,dm2:0.067404):0.060221,droYak1:0.137752):0.346037,dp3:0.346037);
    # THIRD ITERATION:
#ave.cons.mod:TREE: (((droSim1:0.011226,dm2:0.012932):0.011189,droYak1:0.025859):0.062687,dp3:0.062687);
#ave.noncons.mod:TREE: (((droSim1:0.056522,dm2:0.065153):0.058010,droYak1:0.133412):0.331867,dp3:0.331867);
    # FOURTH ITERATION:
#ave.cons.mod:TREE: (((droSim1:0.010304,dm2:0.011845):0.010208,droYak1:0.023706):0.056800,dp3:0.056800);
#ave.noncons.mod:TREE: (((droSim1:0.054861,dm2:0.063129):0.056035,droYak1:0.129452):0.318836,dp3:0.318836);
    # FIFTH ITERATION:
#ave.cons.mod:TREE: (((droSim1:0.010464,dm2:0.012033):0.010379,droYak1:0.024080):0.057819,dp3:0.057819);
#ave.noncons.mod:TREE: (((droSim1:0.055095,dm2:0.063418):0.056328,droYak1:0.130012):0.320744,dp3:0.320744);

    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Check the total entropy figure to see if we're way off.
    # We probably don't need to do this, since it has always been very close 
    # if not the same as what we used above, but it only takes a second.
    # FIRST ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.565 37.75 ave.{cons,noncons}.mod
#Recommended expected length: omega=37.562610 sites (for NH=9.783400)
    # OK, tweak --expected-lengths to 37.6 below (and for next iteration).
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.46 27 ave.{cons,noncons}.mod
#Recommended expected length: omega=26.967450 sites (for NH=9.783400)
    # ==> keep at 27.
    # THIRD ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.40 21.9 ave.{cons,noncons}.mod
#Recommended expected length: omega=21.904601 sites (for NH=9.783400)
    # ==> keep at 21.9.
    # FOURTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.35 17.9 ave.{cons,noncons}.mod
#Recommended expected length: omega=18.018268 sites (for NH=9.783400)
    # ==> tweak to 18.
    # FIFTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.357 18.6 ave.{cons,noncons}.mod
#Recommended expected length: omega=18.541147 sites (for NH=9.783400)
    # ==> tweak to 18.5.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  The we measure the 
    # conserved elements coverage, and if that's not satisfactory then we 
    # adjust parameters and repeat.  
    ssh kk9
    mkdir /cluster/data/droSim1/bed/multiz4way/phastCons/run.phast
    cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 18.5 --target-coverage 0.357 \
    --quiet --seqname $chr --idpref $pref \
    --viterbi /cluster/bluearc/droSim1/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/droSim1/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /cluster/bluearc/droSim1/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /cluster/bluearc/droSim1/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/cluster/bluearc/droSim1/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 153 of 153 jobs
#Average job time:                  22s       0.37m     0.01h    0.00d
#Longest finished job:              32s       0.53m     0.01h    0.00d
#Submission to last job:            56s       0.93m     0.02h    0.00d

    # back on kkstore01:
    # combine predictions and transform scores to be in 0-1000 interval
    cd /cluster/data/droSim1/bed/multiz4way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /cluster/bluearc/droSim1/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    # Now measure coverage of CDS by conserved elements. 
    # We want the "cover" figure to be close to 68.9%.
    # Make a table with just the D. mel. refGenes for measuring coverage
    # (turned out to be almost same as xenoRefGene anyway).
    hgsql droSim1 -e \
      'select id from organism where name like "Drosophila melanogaster%";'
#| 352 |
   hgsql droSim1 -e \
      'create table dmRefGene select xenoRefGene.* \
       from xenoRefGene,gbCdnaInfo \
       where xenoRefGene.name = gbCdnaInfo.acc and gbCdnaInfo.organism = 352;'
    cd /cluster/data/droSim1/bed/multiz4way/phastCons
    featureBits -enrichment droSim1 dmRefGene:cds all.bed
    # FIRST ITERATION: too high; decrease --target-coverage, re-estimate.
#xenoRefGene:cds 15.498%, all.bed 52.561%, both 12.532%, cover 80.86%, enrich 1.54x
#    # SECOND ITERATION: still too high:
#xenoRefGene:cds 15.498%, all.bed 47.696%, both 11.771%, cover 75.95%, enrich 1.59x
    # THIRD ITERATION: still!
#xenoRefGene:cds 15.498%, all.bed 44.505%, both 11.208%, cover 72.32%, enrich 1.62x
#dmRefGene:cds 15.482%, all.bed 44.505%, both 11.194%, cover 72.30%, enrich 1.62x
    # FOURTH ITERATION: OK, bump back up just a bit.
#dmRefGene:cds 15.482%, all.bed 41.363%, both 10.587%, cover 68.38%, enrich 1.65x
    # FIFTH ITERATION: Close enough.
#dmRefGene:cds 15.482%, all.bed 41.860%, both 10.691%, cover 69.05%, enrich 1.65x

    # Having met the CDS coverage target, load up the results.
    hgLoadBed droSim1 phastConsElements4way all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/droSim1/bed/multiz4way/phastCons/run.wib
    cd /cluster/data/droSim1/bed/multiz4way/phastCons/run.wib
    rm -rf /cluster/bluearc/droSim1/phastCons/wib
    mkdir -p /cluster/bluearc/droSim1/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /cluster/bluearc/droSim1/phastCons/wib
zcat `ls -1 /cluster/bluearc/droSim1/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /cluster/bluearc/droSim1/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 18 of 18 jobs
#Average job time:                   8s       0.12m     0.00h    0.00d
#Longest finished job:              23s       0.38m     0.01h    0.00d
#Submission to last job:            23s       0.38m     0.01h    0.00d

    # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/droSim1/bed/multiz4way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /cluster/bluearc/droSim1/phastCons/wib .
    rsync -av /cluster/bluearc/droSim1/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/droSim1/multiz4way/wib
    cd /cluster/data/droSim1/bed/multiz4way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/droSim1/multiz4way/wib/
    hgLoadWiggle droSim1 phastCons4way \
      -pathPrefix=/gbdb/droSim1/multiz4way/wib wib/*.wig
    rm wiggle.tab

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/ds-top5000-4way \
      "top 5000 conserved elements (4way)" droSim1

    # and clean up bluearc.
    rm -r /cluster/bluearc/droSim1/phastCons/{ELEMENTS,POSTPROBS,wib}
    rm -r /cluster/bluearc/droSim1/chrom


# RELOAD AGP (DONE 5/19/05 angie)
    # Many gap types that should have been labeled "contig" (type) were 
    # labeled "fragment" in the original AGP.  LaDeana sent corrected AGP 
    # in an email attachment -- saved to 
    # /cluster/data/droSim1/downloads/simulans_agp.tar.gz
    ssh kkstore01
    cd /cluster/data/droSim1/downloads
    tar xvzf simulans_agp.tar.gz 
    foreach f (*.agp)
      set chr = $f:r
      set c = `echo $chr | sed -e 's/^chr//; s/_random$//;'`
      mv ../$c/$f ../$c/$f.bak
      mv $f ../$c/$f
    end
    cd ..
    # check that line counts are the same, and that the files have no 
    # differences if you ignore the gap type and bridged columns 
    # (and extra whitespace @EOL.)
    foreach f (*/*.agp)
      set lc1 = `wc -l < $f`
      set lc2 = `wc -l < $f.bak`
      if ($lc1 != $lc2) then
        echo "LINE COUNT DIFFERENCE: $f $lc1   vs.   $f.bak $lc2"
      else
        echo $f:t:r ok, $lc1 == $lc2
      endif
      awk '$5 == "N" {$7 = $8 = "ignored";} {print;}' $f \
      | perl -wpe 's/\s+$/\n/' > /tmp/1
      awk '$5 == "N" {$7 = $8 = "ignored";} {print;}' $f.bak \
      | perl -wpe 's/\s+$/\n/' > /tmp/2
      set lcd = `diff /tmp/1 /tmp/2 | wc -l`
      if ($lcd) then
        echo "DIFFERENCE OTHER THAN GAP TYPE $f vs. $f.bak"
      else
        echo $f:t:r ok, no diff other than gap type
      endif
    end
    # run checkAgpAndFa
    foreach f (*/*.fa)
      echo -n "$f:t:r - "
      checkAgpAndFa $f:r.agp $f | tail -1
    end
    # eyeball to make sure all AGP's have at least some contig gaps
    foreach f (*/*.agp)
      echo -n "$f:t:r contig gaps: "
      awk '$5 == "N" && $7 == "contig" {print $6;}' $f | sort -n | uniq -c
    end
    # eyeball distribution of fragment gap sizes too:
    awk '$5 == "N" && $7 == "fragment" {print $6;}' */*.agp | sort -n | uniq -c
    # Hmmm, 0 fragment gaps with size 1000, fewer than would expect by chance:
#      8 997
#      1 998
#      4 999
#      3 1001
#      6 1002
#      5 1003
    # super-paranoid: look for contig gaps in the middle of a run of 
    # similarly-named contigs (implying same supercontig).
    foreach f (*/*.agp)
      echo $f
      perl -we ' \
        while (<>) { \
	  @prevprevwords = @prevwords if (defined @prevwords); \
	  @prevwords = @words if (defined @words); \
          chomp; @words = split; \
	  if (defined $prevwords[4] && $prevwords[4] eq "N" && \
              $prevwords[6] eq "contig") { \
	    $lastsuper = $prevprevwords[5]; $lastsuper =~ s/\.\d+$//; \
	    $thissuper = $words[5]; $thissuper =~ s/\.\d+$//; \
	    if ($lastsuper eq $thissuper) { \
              print "line $.: contig gap in $thissuper\n"; \
            } \
	  } \
        }' $f
    end
#2R/chr2R.agp
#line 3433: contig gap in W501_Contig9
#3L/chr3L.agp
#line 3473: contig gap in W501_Contig4
#X/chrX.agp
#line 4000: contig gap in W501_Contig112
#X/chrX_random.agp
#line 3379: contig gap in W501_Contig763
    # Got the go-ahead from LaDeana to edit those 4 cases to "fragment\tyes"
    # and she will do the same.  Re-ran checks, everything looks good now.

    # Reload gap (and gold) tables from agp:
    ssh hgwdev
    cd /cluster/data/droSim1
    hgGoldGapGl -noGl -chromLst=chrom.lst droSim1 /cluster/data/droSim1 .

# MAKE Drosophila Proteins track  (DONE 06-30-05 braney)
    ssh kk
    mkdir -p /cluster/data/droSim1/bed/tblastn.dm2FB
    cd /cluster/data/droSim1/bed/tblastn.dm2FB
    ls -1S /iscratch/i/droSim1/blastDb/*.nsq | sed "s/\.nsq//" > target.lst
    mkdir fbfa
    # calculate a reasonable number of jobs 
    calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc target.lst | awk "{print \\\$1}"`\)
#18929/(164630/320) = 36.793294

    split -l 50 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir -p /cluster/bluearc/droSim1/bed/tblastn.dm2FB/blastOut  
    ln -s /cluster/bluearc/droSim1/bed/tblastn.dm2FB/blastOut  
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/droSim1/jkStuff/subChr.lft carry $f.2
	liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droSim1/jkStuff/liftAll.lft carry $f.3
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3 $f.4
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/droSim1/bed/tblastn.dm2FB
    para create blastSpec
    para push

# Completed: 121280 of 121280 jobs
# CPU time in finished jobs:     879967s   14666.12m   244.44h   10.18d  0.028 y
# IO & Wait Time:                463525s    7725.41m   128.76h    5.36d  0.015 y
# Average job time:                  11s       0.18m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              84s       1.40m     0.02h    0.00d
# Submission to last job:         16896s     281.60m     4.69h    0.20d


    ssh kki
    cd /cluster/data/droSim1/bed/tblastn.dm2FB
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    para create chainSpec
    para push
 
# Completed: 379 of 379 jobs
# CPU time in finished jobs:       3593s      59.89m     1.00h    0.04d  0.000 y
# IO & Wait Time:                  9877s     164.61m     2.74h    0.11d  0.000 y
# Average job time:                  36s       0.59m     0.01h    0.00d
# Longest finished job:             610s      10.17m     0.17h    0.01d
# Submission to last job:          1585s      26.42m     0.44h    0.02d

    cd /cluster/data/droSim1/bed/tblastn.dm2FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/droSim1/bed/tblastn.dm2FB/blastDm2FB.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/droSim1/bed/tblastn.dm2FB
    hgLoadPsl droSim1 blastDm2FB.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn


# GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie)
    ssh hgwdev
    mkdir /cluster/data/droSim1/bed/geneid
    cd /cluster/data/droSim1/bed/geneid
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      wget http://genome.imim.es/genepredictions/D.simulans/golden_path_200504/geneidv1.2/$chr.gtf
    end
    ldHgGene -gtf -genePredExt droSim1 geneid *.gtf


# BLASTZ/CHAIN/NET DROYAK2 (DONE 7/28/05 angie)
    mkdir /cluster/data/droSim1/bed/blastz.droYak2.2006-07-24
    cd /cluster/data/droSim1/bed/blastz.droYak2.2006-07-24
    cat << '_EOF_' > DEF
# D.simulans vs. D.yakuba

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. simulans
SEQ1_DIR=/san/sanvol1/scratch/droSim1/droSim1.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/droSim1/chrom.sizes

# QUERY - D. yakuba
SEQ2_DIR=/san/sanvol1/scratch/droYak2/droYak2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak2/chrom.sizes

BASE=/cluster/data/droSim1/bed/blastz.droYak2.2006-07-24
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -bigClusterHub pk \
      -blastzOutRoot /san/sanvol1/scratch/droSim1droYak2 >& do.log &
    tail -f do.log
    # The chain job for droSim1.chrU kept running out of mem, so I 
    # made a separate .csh script that would chain chunks of chrU.
    # Some chains may be split because of this, but they're on chrU 
    # anyway...
    ssh kkr6u00
    cd /cluster/store8/droSim1/bed/blastz.droYak2.2006-07-24/axtChain/run
    para time > run.time
    cat run.time >>& ../../do.log 
    cat > chain_chrU.csh <<'_EOF_'
#!/bin/csh -efx
foreach f (../../pslParts/$1*.psl.gz)
  zcat $f \
  | axtChain -psl -verbose=0 -scoreScheme=/cluster/data/blastz/HoxD55.q   -linearGap=loose stdin \
      /san/sanvol1/scratch/droSim1/droSim1.2bit \
      /san/sanvol1/scratch/droYak2/droYak2.2bit \
      stdout \
  | chainAntiRepeat /san/sanvol1/scratch/droSim1/droSim1.2bit \
      /san/sanvol1/scratch/droYak2/droYak2.2bit \
      stdin $f:t:r:r.chain
end
chainMergeSort $1*.chain > $2
'_EOF_'
    # << emacs
    ./chain_chrU.csh droSim1.2bit:chrU: chain/droSim1.2bit:chrU:.chain \
      >& chain_chrU.log & tail -f chain_chrU.log
    cat chain_chrU.log >> ../../do.log
    cd ../..
    doBlastzChainNet.pl DEF -continue chainMerge -workhorse=kkr6u00 \
      >>& do.log & tail -f do.log
    ln -s blastz.droYak2.2006-07-24 /cluster/data/droSim1/bed/blastz.droYak2
    featureBits -chrom=chr2R droSim1 chainDroYak2Link
#16933107 bases of 18190314 (93.089%) in intersection
    featureBits -chrom=chr2R droSim1 chainDm2Link
#17421444 bases of 18190314 (95.773%) in intersection
    featureBits -chrom=chr4 droSim1 chainDroYak2Link
#635094 bases of 807953 (78.605%) in intersection
    featureBits -chrom=chr4 droSim1 chainDm2Link
#696892 bases of 807953 (86.254%) in intersection


# RECIPROCAL BEST DROSIM1-DROYAK2 (IN PROGRESS 8/1/06 angie)
    # Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago)
    ssh kolossus
    cd /cluster/data/droSim1/bed/blastz.droYak2/axtChain
    # Swap droSim1-best chains to be droYak2-referenced:
    chainStitchId droSim1.droYak2.over.chain.gz stdout \
    | chainSwap stdin stdout \
    | chainSort stdin droYak2.droSim1.tBest.chain
    # Net those on droYak2 to get droYak2-ref'd reciprocal best net:
    chainPreNet droYak2.droSim1.tBest.chain \
      /cluster/data/{droYak2,droSim1}/chrom.sizes stdout \
    | chainNet -minSpace=1 stdin /cluster/data/{droYak2,droSim1}/chrom.sizes \
        stdout /dev/null \
    | netSyntenic stdin droYak2.droSim1.rbest.net
    # Extract droYak2-ref'd reciprocal best chain:
    netChainSubset droYak2.droSim1.rbest.net droYak2.droSim1.tBest.chain stdout \
    | chainStitchId stdin droYak2.droSim1.rbest.chain
    # Swap to get droSim1-ref'd reciprocal best chain:
    chainSwap droYak2.droSim1.rbest.chain stdout \
    | chainSort stdin droSim1.droYak2.rbest.chain
    # Net those on droSim1 to get droSim1-ref'd reciprocal best net:
    chainPreNet droSim1.droYak2.rbest.chain \
      /cluster/data/{droSim1,droYak2}/chrom.sizes stdout \
    | chainNet -minSpace=1 -minScore=0 \
      stdin /cluster/data/{droSim1,droYak2}/chrom.sizes stdout /dev/null \
    | netSyntenic stdin droSim1.droYak2.rbest.net
    # Compress and clean up.
    rm droYak2.droSim1.tBest.chain
    gzip *.rbest.*
    md5sum *.rbest.*.gz > md5sum.rbest.txt

    # Test coverage of *.rbest.* -- should be equal.
    netToBed -maxGap=1 droYak2.droSim1.rbest.net.gz droYak2.droSim1.rbest.net.bed
    netToBed -maxGap=1 droSim1.droYak2.rbest.net.gz droSim1.droYak2.rbest.net.bed
    chainToPsl droYak2.droSim1.rbest.chain.gz \
      /cluster/data/{droYak2,droSim1}/chrom.sizes \
      /cluster/data/droYak2/droYak2.2bit /cluster/data/droSim1/droSim1.2bit \
      droYak2.droSim1.rbest.chain.psl
    chainToPsl droSim1.droYak2.rbest.chain.gz \
      /cluster/data/{droSim1,droYak2}/chrom.sizes \
      /cluster/data/droSim1/droSim1.2bit /cluster/data/droYak2/droYak2.2bit \
      droSim1.droYak2.rbest.chain.psl
    ssh hgwdev
    cd /cluster/data/droSim1/bed/blastz.droYak2/axtChain
    featureBits droYak2 droYak2.droSim1.rbest.net.bed
#101989543 bases of 162681153 (62.693%) in intersection
    featureBits droYak2 droYak2.droSim1.rbest.chain.psl
#101989543 bases of 162681153 (62.693%) in intersection
    featureBits droSim1     droSim1.droYak2.rbest.chain.psl
#101989543 bases of 127256433 (80.145%) in intersection
    featureBits droSim1     droSim1.droYak2.rbest.net.bed
#101989543 bases of 127256433 (80.145%) in intersection
    mkdir experiments
    mv *.bed *.psl experiments

    # Put out for download (hgwdev-only)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/droSim1/vsDroYak2/reciprocalBest
    cd /usr/local/apache/htdocs/goldenPath/droSim1/vsDroYak2/reciprocalBest
    ln -s /cluster/data/droSim1/bed/blastz.droYak2/axtChain/*.rbest.*.gz .
    ln -s /cluster/data/droSim1/bed/blastz.droYak2/axtChain/md5sum.rbest.txt md5sum.txt
    # Make a README.txt.


###########################################################################
# SWAP/CHAIN/NET DM3 (DONE 6/8/07 angie)
    ssh kkstore02
    mkdir /cluster/data/droSim1/bed/blastz.dm3.swap
    cd /cluster/data/droSim1/bed/blastz.dm3.swap
    doBlastzChainNet.pl -swap /cluster/data/dm3/bed/blastz.droSim1/DEF >& do.log &
    tail -f do.log
    ln -s blastz.dm3.swap /cluster/data/droSim1/bed/blastz.dm3