#######################################################################
# ENCODE Regulation track  (In progress March 2010, very rough instructions
# as am still defining how to build track...).  JK

      # make root dir and link in our bad.bed file for filtering out
      # blacklisted and multiple mapping regions.
      mkdir -p /cluster/data/hg19/bed/wgEncodeReg
      cd /cluster/data/hg19/bed/wgEncodeReg
      ln -s /hive/users/kent/regulate/companion/mapping/bad.bed badRegions.bed

      # Create the DNAse peak clusters subtrack.
      # Get all of the narrowPeak format files for the wgEncodeUwDnaseSeq
      # linked into directory /hive/users/kent/regulate/dnase/peaks
      mkdir dnase
      cd dnase
      mkdir peaks
      ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwDnase/*.narrowPeak.gz peaks

      # Process these into clusters in a bed file and load clusters into
      # table.
      /bin/ls -1 peaks/*.narrowPeak.gz > peak.lst
      regClusterMakeTableOfTables uw02 peak.lst peak.table
      regCluster peak.table /dev/null peak.bed
      awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClustered.bed
      hgLoadBed hg19 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.bed

      # Make wgEncodeRegDnaseClusteredInput table.  Start with mdbQuery, and
      # then do some massaging since not completely in sync with file list.
      mdbQuery out=tab "select obj,cell,treatment,replicate,lab,dateUnrestricted from hg19 where obj like 'wgEncodeUwDnase%' and view='Peaks'" | sed 's/n\/a/None/' > inputMdb.tab
      cut -f 1 peak.table | sed 's/\.narrowPeak\.gz//' | sed 's/peaks\///' > inputs.lst
      weedLines inputs.lst inputMdb.tab wgEncodeRegDnaseClusteredInputs.tab -invert 
      hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputs ~/kent/src/hg/lib/clusterInputDnase.sql \
	  wgEncodeRegDnaseClusteredInputs.tab

      # Make transcription factor binding site clusters.  Start with Anshul's
      # nice uniform peak calls, downloaded from 
      #   ftp://ftp-private.ebi.ac.uk/byDataType/peaks/jan2011/spp/optimal
      # using the user name and password in the ENCODE wiki at:
      #    http://genomewiki.ucsc.edu/EncodeDCC/index.php/Locations_of_ENCODE_Data
      # Put all of the ones that actually are based on two replicates into
      #    /cluster/data/hg19/bed/wgEncodeReg/tfbs/peaks
      # Once that is done do a little work towards making the cluster input.
      cd /cluster/data/hg19/bed/wgEncodeReg/tfbs
      /bin/ls peaks/*.narrowPeak.gz > peak.lst
      regClusterMakeTableOfTables ans01 peak.lst partial.table

      # Unfortunately at this point only about 90% of the peak files actually
      # have metaDb data associated with them in metaDb, so run C program that
      # takes it from metaDb if possible, otherwise tries to get it from file
      # names.
      regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table

      # One more complication - there are often several cases where there is
      # more than one data set on the same cell+treatment+antibody combination. 
      # For good reasons hgBedToBedExps only wants one.
      # This next bit filters them out based on a picks file that was hand
      # created - picksFromDupes.  This was based on the criteria that
      # the Broad CTCF were used,  and that Snyd was used in preference
      # to OpenChromatin, which in turn was used in preference over Haib.
      # The rational for this is the Broad CTCF set has been in use for a 
      # while with no complaints,  and when examing the Pol2 signals, which
      # were with CTCF the main component of the Dupes, the Haib were the
      # weakest and the Snyd the strongest.  
      cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab .
      wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table

      # Make three column file: fileName, cell+treatment, antibody
      awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab


      # Create config file out for clustering job, using a table assigning
      # cell lines+treatments to single letter codes edited by hand.  In
      # the future may be able to automate this by having fixed upper case codes for
      # tier1&2 cell lines, and then just use first letter lower case codes
      # for rest. That is the pattern cellLetter2.tab follows anyway.
      cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab .
      regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter2.tab

      # Do the actual clustering 
      hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps

      # Load database (reloaded 3/26/2012 to fix a relatively rare bug)
      hgLoadBed hg19 wgEncodeRegTfbsClustered peak.bed
      hgLoadSqlTab hg19 wgEncodeRegTfbsCells ~/kent/src/hg/lib/expRecord.sql peak.exps

      # Create inputTrackTable - six columns: <table> <source> <factor> <cell> <treatment> <lab>
      awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table | sed 's/AlnRep/PkRep/'> wgEncodeRegTfbsClusteredInputs.tab
      hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputs ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputs.tab


      # Handle the RNA seq, starting by pooling it and filtering out multiple
      # mapping and blacklisted regions.
      mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled
      cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqGm12878R2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqH1hescR2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHct116R2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHsmmR2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHuvecR2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqK562R2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqMcf7R2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqNhekR2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph
      bigWigMerge -adjust=-1 \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \
	  /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \
	  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph
      # Now have to normalize.  This takes two passes.  The first just creates
      # a tab-separated file with normalization values. A batch file is made
      # by running awk on the tab file, and this batch file executes the
      # second half, transforming the bedGraphs to normalized values. The
      # normalization standardizes all files as if the total signal integrated to 10
      # billion over the genome.
      rm -f ../norm.tab
      foreach i (*SigPooled.bedGraph)
	  echo processing $i
	  echo -n $i " " >> ../norm.tab
	  awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab
      end

      awk '{printf("colTransform 4 %s 0 %g ../normalized/%s\n", $1, $2, $1);}' ../norm.tab > ../norm.csh
      chmod a+x ../norm.csh
      mkdir ../normalized
      ../norm.csh

      # Make bigWigs
      cd ../normalized
      foreach i (*.bedGraph)
	  echo processing $i
          bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
      end
      
      # Link into /gbdb
      ln -s `pwd`/*.bw /gbdb/hg19/bbi

      # Tell database about it
      foreach i (*.bw)
          hgBbiDbLink hg19 $i:r /gbdb/hg19/bbi/$i
      end

      # Clean up
      cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn
      rm pooled/*.bedGraph normalized/*.bedGraph

      # Make metadata: (forgive the long line)
      # JK TODO - add Hepg2 and Helas3 from here on down

      mdbQuery out=ra "select * from hg19 where \
	  obj='wgEncodeCaltechRnaSeqNhlfR2x75Il200SigRep1V2' or \
	  obj='wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2' or \
	  obj='wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2' or \
	  obj='wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2' or \
	  obj='wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2' or \
	  obj='wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2' or \
	  obj='wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2' or \
	  obj='wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2' or \
	  obj='wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2'" \
      | sed 's/SigRep1V2/SigPooled/g' | sed 's/wgEncodeCal/wgEncodeRegTxnCal/' \
      | grep -v ^replicate > mdb.ra 

      # Now patch mdb.ra into ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha/wgEncodeReg.ra


      # At this point actually are done with this track, except for
      # all the trackDb work on the overlayed histone mod tracks.
      # The rest is for some coloring stuff that is not yet actually used.


      # Do work to figure out how to color cells.  This is based on
      # hierarchical clustering of wgEncodeUwAffyAllExon. 
      mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/cellColors
      cd /hive/data/genomes/hg19/bed/wgEncodeReg

      # Start with some alas time consuming data cleanup to get rid of
      # overlaps - losing about 10% of the data in the process which is
      # totally ok.
      mkdir allExon
      cd allExon
      mkdir inLinks noOverlap onlyCommon
      ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwAffyExonArray/*.gz inLinks
      cd inLinks
      foreach i (*.gz)
          bedRemoveOverlap $i ../noOverlap/$i:r
      end
      cd ..
      
      # Now make up a file of regions present in all inputs for further data
      # cleanup
      bedCommonRegions noOverlap/*.* > commonExons.bed
      cd noOverlap
      foreach i (*.broadPeak)
	  echo processing $i
          bedRestrictToPositions $i ../commonExons.bed ../onlyCommon/$i
      end
      rm -r noOverlap

      # Now pool replicates where possible
      mdbQuery "select fileName,cell,replicate from hg19 where composite='wgEncodeUwAffyExonArray' and view='SimpleSignal'" -out=ra \
         | sed 's/\.gz//g' > replicate.ra
      encodeMergeReplicatesBatch replicate.ra onlyCommon merge.sh merged.ra merged
      mkdir merged
      chmod a+x merge.sh
      merge.sh

      # Resort merged output - somehow gets slightly messed by
      # encodeMergeReplicates.
      cd merged
      foreach i (*.broadPeak)
          echo sorting $i
          sort -k 1,1 -k 2,2n $i > tmp
	  mv tmp $i
      end

      # Make file that converts fileName to cell
      raToTab merged.ra -cols=fileName,cell stdout \
         | sed 's/wgEncode/merged\/wgEncode/' > fileToCell.tab

      # Now build the tree. Took 2.5 minutes
      ls -1 merged/*.broadPeak > clusterInput.lst
      regClusterTreeCells rename=fileToCell.tab clusterInput.lst cluster.tree cluster.distances

#########################################################################
# 2012 ENCODE Regulation supertrack updates: #7526 (kate)

    #########################################################################
    # DNase Clusters: #9954  (2013-01-11)
    #
    # Create the DNAse peak clusters track using AWG uniformly processed peaks (Jan 2011 freeze, Sep 2012 analysis pubs).  
    # See doc/encodeDccHg19/awgUniform.txt
    # There are 125 cell type+treatments  represented -- UW, Duke, and Uw+Duke for 14 cell types
    # Previous track had 74
 
    cd /hive/data/genomes/hg19/bed/wgEncodeReg
    mkdir -p dnaseAwg
    cd dnaseAwg
    set dir = `pwd`

    cd /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeak
    ls *.narrowPeak > $dir/peak.lst

    # generate a normalization factor
    # output format:
    # <filename> 0 1 2 <scoreIx> <scoreFactor>
    # wgEncodeAwgDnaseDuke8988tPeak.bed       0       1       2       6       10.7234 .
    regClusterMakeTableOfTables awgDnase01 $dir/peak.lst $dir/peak.table -verbose=3 >&! $dir/regTable.out &

    # Make cluster table:  wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster)
    regCluster $dir/peak.table /dev/null $dir/peak.bed
    # Read 125 sources from peak.table
    # 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes

    # filter out low scoring, single peaks in cluster and load in database
    cd $dir
    awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed
    wc -l wgEncodeRegDnaseClusteredV2.bed
    # 1281988 wgEncodeRegDnaseClusteredV2.bed

    hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed

    #  NOTE: V1 had ~1M clusters, V2 has 1.2M
    # It would be nice if hgc sorted datasets by signal or cellType
    # Create metadata table:  wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting)
    #  was: objName, cellType, treatment, replicate, lab, dateUnrestricted
    # For V2, AWG is providing pooled replicates, and sometimes pooled labs.  Also,
    #   this data has all passed timestamp restriction, so don't need to show
    #      objName, cellType, treatment, lab(s)
    # DISPLAY: Consider item name change to % assays represented instead of count.
    #  Also, could color item name label to highlight ubiquitous vs. cell-type-specific
    #  (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?)

    # Make wgEncodeRegDnaseClusteredInputV2 table.
    mdbQuery out=tab "select obj,cell,treatment,lab,dccAccession from hg19 where obj like 'wgEncodeAwgDnase%UniPk' and view='Peaks'" > wgEncodeRegDnaseClusteredInputsV2.tab

    hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputsV2 \
        ~/kent/src/hg/lib/clusterInputAwgDnase.sql wgEncodeRegDnaseClusteredInputsV2.tab

    # Set up downloads
    cd $dir
    mkdir downloads
    gzip -c wgEncodeRegDnaseClusteredV2.bed > downloads/wgEncodeRegDnaseClusteredV2.bed.gz
    gzip -c wgEncodeRegDnaseClusteredInputsV2.tab > downloads/wgEncodeRegDnaseClusteredInputsV2.tab.gz

    # use (existing) ENCODE DCC downloads dir
    set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegDnaseClustered
    cd $dload

    # call existing stuff release1 (use ENCODE-style downloads scheme)
    mkdir release1
    ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz release1
    cp README.txt md5sum.txt files.txt release1 

    # new release dir
    mkdir release2
    ln -s release2 beta
    ln -s release2 releaseLatest
    cd $dir/downloads
    ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz $dload
    cd $dload
    ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz releaseLatest
    # include older files as well
    ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz releaseLatest
    cp README.txt releaseLatest
    cd releaseLatest
    md5sum *.gz > md5sum.txt
    # vim README.txt
    cp README.txt md5sum.txt ..

    # add minimal metaObject to metaDb to associate new table with (pseudo-)composite 
    # wgEncodeRegDnaseClustered -- this allows hgTrackUi to locate downloads
    cat > wgEncodeRegDnaseClustered.ra << 'EOF'
metaObject wgEncodeRegDnaseClusteredV2
objType table
composite wgEncodeRegDnaseClustered
fileName wgEncodeRegDnaseClusteredV2.bed.gz
tableName wgEncodeRegDnaseClusteredV2
project wgEncode
'EOF'


#########################################################################
# Preliminary/Experimental/Older work below
#########################################################################
# ENCODE Regulation track V2: Track #7526 (In progress 2012-06-20 kate)
# This version will have:
#       * Replace Tx with CSHL stranded data, pooled (JK advising, also AWG use)
#              http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformRNASignal.txt 
#              These are from DCC wgEncodeCshlLongRnaSeq downloads (Long polyA+, whole cell),
#               AWG did not pool reps or plus/minus for their hub (but we will)
#               AWG has filtered for bad regions so no longer necessary
#       * Possibly a 10th cell type (HMEC) for Tx and Histone
#               Note: single replicate for HMEC Tx (others have 2)
#       * Two additional cell lines for Histones (Hela, HepG2)
#               - Note Hela & HepG2 already added to Tx track (July2012 release)
#       * Histones from uniform processing: H3K4me1, H3K4me3, H3K27ac
#             (http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/) 
#                       wgEncodeBroadHistone{cell}{histone}StdAln_?Reps.norm5.rawsignal.bw
#       * Add latest TF's, and use AWG uniform peak calls

#       * Add latest DNase calls (possibly use AWG uniform)
#          (http://www.uwencode.org/public/rthurman/encode_integrated_open_chrom/data/dataHub)
#          AWG trackDb: http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
#               - can use subGroups line to determine cell type, but need to take care
#                       with treatments, which are in filename but not subGroups
#               - NOTE: AWG includes Duke data processed like UW.  Where both 
#                       labs had data, they are merged. View is .01FDR peaks (narrow, bigbed 6+)
#          Previous track:  74 cell types (2 reps), all UW.  This version:  125 cell types, pooled replicates, merged UW+Duke

#  Also considering adding new data types from AWG hub:
#               - Short RNA's
#               - AWG uniform RNA contigs
#          (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/rna_contigs/)
#               - Segmentations (chromHMM, segWay, or combined)
#          (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/segmentations)
#               - FAIRE (cell-specific, union over 25 cell types)
#          (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/faire/bigBed/)
# NOTE: following Jim's methods from initial version

    # make root dir and link in our bad.bed file for filtering out
    # blacklisted and multiple mapping regions.
    cd /hive/data/genomes/hg19/bed
    mkdir wgEncodeRegV2
    cd wgEncodeRegV2
    ln -s /hive/data/genomes/hg19/bed/wgEncodeReg/badRegions.bed .
    #(was /hive/users/kent/regulate/companion/mapping/bad.bed)

    ######
    # Small RNA's
    # For signal, pool strands and replicates
    ######
    cd /hive/data/genomes/hg19/bed/
    mkdir -p wgEncodeRegV2/shortRna/pooled
    cd wgEncodeRegV2/shortRna/pooled

cat > list.csh << 'EOF'
    foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hmec Hsmm Huvec K562 Nhek Nhlf)
      echo "processing ${cell}"
      ls /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotal*.bigWig
'EOF'
    csh list.csh >&! list.out &
    # There are 6 of overalpping with histones and long RNAs.  4 other tier2's.

cat > pool.csh << 'EOF'
    foreach cell (Gm12878 H1hesc Helas3 Hepg2 K562 Nhek A549 Cd20 Imr90 Mcf7)
      echo "processing ${cell}"
      bigWigMerge -adjust=-1 \
          /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep1.bigWig \
          /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep2.bigWig \
          /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep1.bigWig \
          /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep2.bigWig \
            wgEncodeRegTxnCshlShortRnaSeq${cell}CellTotalRawSigPooled.bedGraph
    end
'EOF'
    csh pool.csh >&! pool.out &

    # got 8 cells, two failed -- CD20 and IMR90, may be rescuable, but continuing 
    #  for now with others to see if its worth it.

cat > big.csh << 'EOF'
    foreach i (*.bedGraph)
        echo processing $i
        bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
    end
'EOF'
    csh big.csh >&! big.out &

    # add to database
    set dir = /gbdb/hg19/bbi/encodeRegV2
    mkdir $dir
    ln -s `pwd`/*.bw $dir
    foreach i (*.bw)
        hgBbiDbLink hg19 $i:r $dir/$i
    end

    ######
    # Handle the RNA seq, starting by pooling it and filtering out multiple
    # mapping and blacklisted regions.
    ######
    cd /hive/data/genomes/hg19/bed/
    mkdir -p wgEncodeRegV2/txn/pooled
    cd wgEncodeRegV2/txn/pooled

cat > poolTx.csh << 'EOF'
    foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf)
      echo "processing ${cell}"
      bigWigMerge -adjust=-1 \
          /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \
          /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep2.bigWig \
          /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \
          /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep2.bigWig \
            wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph
    end
'EOF'
    csh poolTx.csh >&! poolTx.out &

    #  ~8 minutes per cell type
    # NOTE:  25 chroms in Hepg2, H1hesc, Huvec, Nhlf (male or unknown sex)

    # Add in HMEC, which has single replicates

cat > poolHmec.csh << 'EOF'
    foreach cell (Hmec)
      echo "processing ${cell}"
      bigWigMerge -adjust=-1 \
          /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \
          /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \
              wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph
    end
'EOF'
    csh poolHmec.csh >&! poolHmec.out &

    # Normalize.  This takes two passes.  The first creates
    # a tab-separated file with normalization values. A batch file is made
    # by running awk on the tab file, and this batch file executes the
    # second half, transforming the bedGraphs to normalized values. The
    # normalization standardizes all files as if the total signal integrated to 10
    # billion over the genome.
    rm -f ../norm.tab

cat > makeNorm.csh << 'EOF'
    foreach i (*SigPooled.bedGraph)
        echo processing $i
        echo -n $i " " >> ../norm.tab
        awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab
    end
    awk '{printf("echo %s; colTransform 4 %s 0 %g ../normalized/%s\n", $1, $1, $2, $1);}' ../norm.tab > ../norm.csh
'EOF'
    csh makeNorm.csh >&! makeNorm.out &
# ~2 mins per cell type

    mkdir ../normalized
    csh ../norm.csh  >&! ../norm.out &
#### STOPPED HERE
    cd ../normalized

    # make bigWigs
cat > big.csh << 'EOF'
    foreach i (*.bedGraph)
        echo processing $i
        bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
    end
'EOF'
    csh big.csh >&! big.out &

    # add to database
    set dir = /gbdb/hg19/bbi/encodeRegV2
    mkdir $dir
    ln -s `pwd`/*.bw $dir
    foreach i (*.bw)
        hgBbiDbLink hg19 $i:r $dir/$i
    end

    ######
    # Possible new track:  FAIRE Peaks Union across 25 cells, from AWG
    ######
    cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
    mkdir -p faire/peaks
    cd faire/peaks
    wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformFaire.txt
    sed -n 's/bigDataUrl /wget /p' uniformFaire.txt | grep Union > wget.csh
    csh wget.csh >&! wget.out &
    mv *Union*.bb encodeAwgFairePeaks25CellUnion.bb
    bigBedToBed encodeAwgFairePeaks25CellUnion.bb encodeAwgFairePeaks25CellUnion.bed 
    # head -1 *.bed
     chr1    10033   10376
    # bed 3
    hgLoadBed hg19 encodeAwgFairePeaks25CellUnion encodeAwgFairePeaks25CellUnion.bed 
    # Read 2369615 elements of size 3 from encodeAwgFairePeaks25CellUnion.bed

    ######
    # Create the DNAse peak clusters subtrack.
    ######
    cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
    mkdir -p dnase/peaks
    cd dnase
    wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html
    wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
    sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh
    csh wget.csh >&! wget.out &
# 125 files (.bb), dated Nov 22, 2011
# Sample case had ~150K peaks (by bigBedInfo)
# These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats
#$ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head
chr1    10180   10330   .       0       .       26.000000       -1      -1      -1
# NOTE: Steve Wilder is planning to add scores and make formats of these files more uniform

    # rename to UCSC style and convert to bed: wgEncodeAwgDnase<lab><cell>Peak
    ls *.fdr01peaks.hg19.bb | \
        perl -wpe 's/(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)/bigBedToBed $1$2$3$4$5 $1Awg$3\u\L$2\E\u\L$4\EPeak.bed/' > unbig.csh
    # convert to bed and rename
    csh unbig.csh >&! unbig.out
    # udc couldn't read 4 bytes from wgEncodeUWDukeDnaseHepG2.fdr01peaks.hg19.bb, did read 0
    #  This dataset was truncated
    #  Repeated wget and bigBedToBed for HepG2 (8/23)
    ls -1 *AwgDnase*.bed > peak.lst

    # Datasets with treated cells (hints from long label)
    grep '(' uniformDnase.txt | grep longLabel > treated.txt
    wc -l treated.txt
# 7 datasets
    # Tamoxifen -> Tam10030 (Duke) or 40HAM20nM72hr (UW), Estradiol -> Est10nm30m, Ifna4h, Andro, Hypox
    # TODO: Edit to rename

    # generate a normalization factor to
    # convert signalValue to score (0-1000).  Also note score field; narrowPeak is 7 (default)
    # output format:
    # <filename> 0 1 2 <scoreIx> <scoreFactor>
    # wgEncodeAwgDnaseDuke8988tPeak.bed       0       1       2       6       10.7234 .
    regClusterMakeTableOfTables awgDnase01 peak.lst peak.table -verbose=3 >&! regTable.out &

    # Make cluster table:  wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster)
    regCluster peak.table /dev/null peak.bed
    #Read 125 sources from peak.table
    # 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes

    # filter out low scoring, single peaks in cluster and load in database
    awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed
    wc -l wgEncodeRegDnaseClusteredV2.bed

    hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed
    # Read 1281988 elements of size 5 from wgEncodeRegDnaseClusteredV2.bed

    # load individual peak files as a track (needed by hgc)
    # TODO: generate labels out of metadata

cat > loadDnase.csh << 'EOF'
    set b = wgEncodeAwgDnase
    set t = ${b}Peak
    foreach f (${b}*)
        set table = $f:r
        hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f
        set cell = `echo $f | perl -wpe 's/(wgEncodeAwgDnase[A-Z][a-z]+)(.*)Peak.bed/$2/'`
        echo "    track $table" >> $t.ra
        echo "    type narrowPeak" >> $t.ra
        echo "    parent $t" >> $t.ra
        echo "    shortLabel $cell DNase" >> $t.ra
        echo "    longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t.ra
        echo "    subGroups cellType=$cell treatment=None" >> $t.ra
        echo "" >> $t.ra
    end
'EOF'

    #  NOTE: V1 had ~1M clusters.
    # Other data tables: Need to load the peak files as they are used by hgc
    #   These should probably also be subtracks.
    # It would be nice if hgc sorted datasets by signal or cellType
    # Create metadata table:  wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting)
    #  was: objName, cellType, treatment, replicate, lab, dateUnrestricted
    # For V2, AWG is providing pooled replicates, and sometimes pooled labs.  Also,
    #   this data has all passed timestamp restriction, so don't need to show
    #      objName, cellType, treatment, lab(s)
    # DISPLAY: Consider item name change to % assays represented instead of count.
    #  Also, could color item name label to highlight ubiquitous vs. cell-type-specific
    #  (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?)


    ######
    # Uniform histone signal for overlay wiggles
    # Note: there are UW signal files as well -- omitting these for consistency,
    # Check if merging would be a better option
    ######
    cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
    mkdir -p histone/signal
    cd histone/signal

cat > getHistones.csh << 'EOF'
    foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf Hmec)
        foreach histone (H3k4me1 H3k4me3 H3k27ac)
            set var = "wgEncodeBroadHistone${cell}${histone}"
            echo $var
            curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/${var}StdAln_\[1-3\]Reps.norm5.rawsignal.bw
        end
    end
'EOF'
    csh getHistones.csh >&! getHistones.out &

    # save real files (>1k size) as curl creates info files when match used
    mkdir sav
    find . -name '*.bw' -a -size +1k -exec mv '{}' sav \;
    rm *.bw
    mv sav/* .
    rmdir sav

    # note: missing Hela and HepG2 H3K4me1 - these have different filename pattern (check with Anshul why)
    # use curl to retrieve by pattern match as we don't have read perms on directory.  Then remove non-useful files
    curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHelas3H3k4me1Std_\[1-3]Reps.norm5.rawsignal.bw
    # good one is '*Std_1Rep', remove 2Rep and 3Rep
    rm *Hela*Std_[2-3]*
    wget http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHepg2H3k4me1Std_1Reps.norm5.rawsignal.bw

    ######
    # TFBS from Jan2011 freeze
    ######
    # The TFBS track is a specialized type:  factorSource.  Requires 3 tables:
    #   track table: wgEncodeRegTfbsClustered (bed15)
    #   source table: wgEncodeRegTfbsCells
    #          Maps cell type (including treatment) to single letter
    #          Currently, there are fixed unique assignments for Tier1 and 2.
    #           There are all upper case.
    #           Tier 3 just use first char of cell name, lower-cased.  Not unique at all!
    # Looks like only rows 2 and 3 of the table are actually used/informative
    # Ex. row         0 G     GM12878 n/a     n/a     n/a     3       n/a,n/a/,GM12878,
    #   inputTrack (metadata) table: wgEncodeRegTfbsClusteredInputs
    # DISPLAY: Better with target protein instead of antibody ?
    # DISPLAY: Consider item showing letter for all experiments, with name colored (brown ?) if this peak
    #           found it the cell type
    # DISPLAY: fix ordering of cell letters -- they should be sorted 
    #   (in cellLetter.tab first, so Exps files has correct id's)

    cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
    mkdir -p tfbs/peaks
    cd tfbs/peaks

    wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformTfbs_nocontrols.txt 
    # this file has metadata
    # there are 3 'method' values:  SPP, PeakSeq, wiggler
    # there are 2 'quality' values:  Good, Caution
    # There are 144 antibodies (for 122 factors), 75 cell types, 95 cell types plus treatments, 457 datasets
    # Version 1 of this track had 148 antibodies, 67 cell types, 95 cell type+treatments, 486 datasets
    # -> basically the same data, but cleaned up

    sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep Tfbs > wget.csh
    sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep -v Tfbs > wget2.csh
    csh wget.csh >&! wget.out &
    # 416 files
    csh wget2.csh >&! wget2.out &
    # 41 (have Histone in filename)
    # sample filename: spp.optimal.wgEncodeUwTfbsHcfaaCtcfStdAlnRep1_VS_wgEncodeUwTfbsHcfaaInputStdAlnRep1.bb
    # from hub trackDb, metadata is in subGroups line:
    #  subGroups tier=3 cellType=HCFaa factor=CTCF antibody=CTCF lab=UW treatment=None protocol=Std quality=Good method=SPP
    # Note 'quality'.  Think about dropping those with quality='Caution'. Unfortunately some
    #  Tier1 , esp. Pol2 are only available with this quality
    # Q: What does Rep1 mean ?  Most are 'Rep0' (Answer from Anshul -- Rep0 are pooled, Rep1 were single-rep but adequate qual)
    # (Jim's methods exluded those not based on 2 replicates)
    # Q: Why are some datasets 'StdAln' and some are 'Aln' ? (Answer from Anshul -- names are from UCSC)
    # Q: How is quality assigned ?

    # rename to UCSC style and convert to bed: wgEncodeAwgTfbs<lab><cell>Peak
    cd ..
    ls peaks/spp.optimal.*.bb | \
        perl -wpe 's^peaks/(spp.optimal.)(wgEncode)(\S+)(Tfbs)(\S+)(StdAln|Aln)(Rep\d_VS\S+)^bigBedToBed peaks/$1$2$3$4$5$6$7 peaks/$2Awg$4$3$5Peak.bed^' > unbig.csh
    # convert to bed and rename
    csh unbig.csh >&! unbig.out &

    # NOTE: ~10% (49 of 457 total) are dupes from different labs
    # Factors are:  CFOS, CMYC, CTCF, E2F6, EBF1F FOXA1, GATA2, JUND, MAFK
    # MAX, POL2, RAD21, SRF, YY1

    ls -1 peaks/*.bed > peak.lst

    ######
    # TFBS from July 2012 freeze
    #  Uniform processing with slightly more relaxed threshold (2% FDR vs 1% based on
    #   input Anshul received from users.  And there are more data sets (708 vs. 495)
    #   Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering
    #   (as in previous track)
    ######
  
    cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
    mkdir -p tfbs2012/peaks
    cd tfbs2012/peaks
    ls -l filelist.txt
    # 708 filelist.txt
    # Downloaded from
    # set user = encode-box-01
    # set pwd = enc*deDOWN
    wget -r -nd -np --ftp-user=$user --ftp-password=$pwd ftp://ftp-private.ebi.ac.uk/byDataType/peaks/june2012/spp/optimal
    # using the user name and password in the ENCODE wiki at:
    # http://encodewiki.ucsc.edu/EncodeDCC/index.php/Notes_on_using_the_Aspera_client_(and_EBI_ftp)
    # These are also available at:
    # http://www.ebi.ac.uk/~anshul/public/encodeRawData/peaks_spp/mar2012/distinct/idrOptimalBlackListFilt

    # NOTE: quality metrics are available here:
    # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7
    # -> Check with Anshul for list of data sets to exclude from clustering based on quality.
    # For now, use all.
    # sample filename:
    #   spp.idrOptimal.bf.wgEncodeSydhTfbsHct116Pol2UcdAlnRep0.bam_VS_wgEncodeSydhTfbsHct116InputUcdAlnRep1.bam.narrowPeak.gz

    cd ..
    ls peaks/*.narrowPeak.gz > peak.lst

    # extract UCSC object name from peaks file name, and generate a per-file normalization factor to
    # convert signalValues to scores (0-1000).  
    # Also note score field position; for narrowPeak this is 6 (default)
    # output format:
    # <filename> 0 1 2 <scoreCol> <scoreNormFactor>
    # (fields 2, 3, 4 are unused)

    regClusterMakeTableOfTables ans02 peak.lst partial.table -verbose=3 >&! regTable.out &
    wc -l partial.table
    # 708
    head -1 partial.table
    # peaks/spp.idrOptimal.bf.wgEncodeBroadHistoneDnd41CtcfAlnRep0.bam_VS_wgEncodeBroadHistoneDnd41ControlStdAlnRep0.bam.narrowPeak.gz        0       1       2       6       4.52593 wgEncodeBroadHistoneDnd41CtcfAlnRep1 

    # add metadata from metaDb or as fallback, from parsing filename
    regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table -verbose=3

    # failed in 1 object: 
    # Can't get metadata for wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1 from metaDb, parsing filename 
    # UCSC metadata has Rep3 for pcr2x protocol, but V0415101 for others ?
    # Tweak this one as special case:
    # TODO: revisit this
    sed 's/wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1/wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep3/' partial.table > partial.fixed.table  

    # cluster by antibody, and by target
    regClusterAttachMetadataToTableOfTables hg19 partial.fixed.table withDupes.table -verbose=3
    regClusterAttachMetadataToTableOfTables -antibodyTarget hg19 partial.fixed.table withDupes.target.table -verbose=3

    # find dupes in target-based table (already done for antibody-based)

    # 91 cell types, 201 antibodies (177 targets), 28 treatments, 9 labs

    # limit to single cell/factor/treatment (remove cross-lab duplication).
    # Consider pooling instead ??  Or picking based on quality metrics ?
    # -> wait for Anshul's recommendations (he will summarize) -- now they are too
    #   hard to compare:
    # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7 
    # eg., there are GM12878-CTCF from 4 labs, and all have some error highlighting
    # Reusing Jim's priorities (Broad, Snyd, OpenChrom, Haib); I'm ranking UW just below OpenChrom (Kate)
    # Also, when multiple from a single lab prioritizing what appear to be newer controls & protocols:
    # HAIB: V<date>, PCR?x
    # SYDH: iggrab, iggmus, std
      # One more complication - there are often several cases where there is
      # more than one data set on the same cell+treatment+antibody combination. 
      # For good reasons hgBedToBedExps only wants one.
      # This next bit filters them out based on a picks file that was hand
      # created - picksFromDupes.  This was based on the criteria that
      # the Broad CTCF were used,  and that Snyd was used in preference
      # to OpenChromatin, which in turn was used in preference over Haib.
      # The rational for this is the Broad CTCF set has been in use for a 
      # while with no complaints,  and when examing the Pol2 signals, which
      # were with CTCF the main component of the Dupes, the Haib were the
      # weakest and the Snyd the strongest.  
      cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab .
      wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table

    # wc -l peak.table
    # 656
    # reduced by 52

    # Make three column file: fileName, cell+treatment, antibody
    awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab
    awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' withDupes.target.table > fileCellAb.target.tab

    # Create config file out for clustering job, using a table assigning
    # cell lines+treatments to single letter codes edited by hand.

    # Find cell types that aren't yet in cellLetter file:
    cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab cellLetter3.tab
    regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab -noLetter > & ! noLetter.out &
    sort noLetter.out | uniq | awk '{print $2}' > noLetter.txt
    wc -l noLetter.txt
    # 43 new cell type+treatment combos
    cat noLetter.txt >> cellLetter3.tab

    # hand-edit in codes, promoting Tier 2.5 to caps (A549, MCF-7, SK-N-SH, IMR90)

    regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab
    regClusterBedExpCfg -tabList fileCellAb.target.tab cluster.target.cfg -cellLetter=cellLetter3.tab -noLetter

    # Do the actual clustering 
    hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps >&! beds2exps.out
    # 17 errs (dupes detected) --> add these to picksFromDupes2 and hand edit
    # NOTE: 4 of these were from same lab, different protocols, etc.:
    # USF-1, A549+EtOH; NRSF, HepG2; NRSF, SK-N-SH; Pol2(phosphS2), K562
    # Leaving these out for now (hand-editing peak.table), saving in file dupesBad.txt
    cat picksFromDupes.tab beds2exps.out | sort >  picksFromDupes2.tab
    wc -l picksFromDupes2.tab
    # 52
    wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes2.tab peak.table

    # Prune above 4 dupes, and redo config & clustering
    wc -l peak.table
    # 635
    awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab
    regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab
    hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps > beds2exps2.out
    # Loaded 635 records from cluster.cfg
    # Got 131 sources
    # Got 201 factors

    # Load database
    hgLoadBed hg19 wgEncodeRegTfbsClusteredV3 peak.bed
    # Read 4735242 elements of size 15 from peak.bed
    # was 2750490 in previous version 
    hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV3 ~/kent/src/hg/lib/expRecord.sql peak.exps

    # Create inputTrackTable - six columns: <table> <source> <factor> <cell> <treatment> <lab>
    awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table > wgEncodeRegTfbsClusteredInputs3.tab
    hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV3 ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputsV3.tab
    #######################################################################

    # continue with target-based clustering
    hgBedsToBedExps -dupeLetterOk cluster.target.cfg peak.target.bed peak.target.exps >&! beds2exps.target.out
    wc -l beds2exps.target.out
    # 95 dupes

    # NOTE: this was too laborious by hand.  When redone, script to list pairs in date submitted/resubmitted or experiment ID order, to ease picking later experiment


    # EXPERIMENTAL:  TF motif tracks, from EMBL/Aus (MEME toolkit)
    # These are generated using FIMO tool:
    # Charles E. Grant, Timothy L. Bailey, and William Stafford Noble, "FIMO: Scanning for occurrences of a given motif", Bioinformatics 27(7):10171018, 2011
    # http://bioinformatics.oxfordjournals.org/content/early/2011/02/16/bioinformatics.btr064.full
    #  http://meme.ebi.edu.au/meme/cgi-bin/fimo.cgi

    mkdir /cluster/data/hg19/bed/tfbsFimo
    cd /cluster/data/hg19/bed/tfbsFimo

    hgLoadBed hg19 tfbsFimoJaspar TfMotifsFIMO_Jaspar.hg19.bed
    # Read 1386302 elements of size 6 from TfMotifsFIMO_Jaspar.hg19.bed

    hgLoadBed hg19 tfbsFimoTransfac TfMotifsFIMO_Transfac.hg19.bed
    # Read 3665198 elements of size 6 from TfMotifsFIMO_Transfac.hg19.bed
    #######################################################################