#######################################################################
# ENCODE Regulation track (In progress March 2010, very rough instructions
# as am still defining how to build track...). JK
# make root dir and link in our bad.bed file for filtering out
# blacklisted and multiple mapping regions.
mkdir -p /cluster/data/hg19/bed/wgEncodeReg
cd /cluster/data/hg19/bed/wgEncodeReg
ln -s /hive/users/kent/regulate/companion/mapping/bad.bed badRegions.bed
# Create the DNAse peak clusters subtrack.
# Get all of the narrowPeak format files for the wgEncodeUwDnaseSeq
# linked into directory /hive/users/kent/regulate/dnase/peaks
mkdir dnase
cd dnase
mkdir peaks
ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwDnase/*.narrowPeak.gz peaks
# Process these into clusters in a bed file and load clusters into
# table.
/bin/ls -1 peaks/*.narrowPeak.gz > peak.lst
regClusterMakeTableOfTables uw02 peak.lst peak.table
regCluster peak.table /dev/null peak.bed
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClustered.bed
hgLoadBed hg19 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.bed
# Make wgEncodeRegDnaseClusteredInput table. Start with mdbQuery, and
# then do some massaging since not completely in sync with file list.
mdbQuery out=tab "select obj,cell,treatment,replicate,lab,dateUnrestricted from hg19 where obj like 'wgEncodeUwDnase%' and view='Peaks'" | sed 's/n\/a/None/' > inputMdb.tab
cut -f 1 peak.table | sed 's/\.narrowPeak\.gz//' | sed 's/peaks\///' > inputs.lst
weedLines inputs.lst inputMdb.tab wgEncodeRegDnaseClusteredInputs.tab -invert
hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputs ~/kent/src/hg/lib/clusterInputDnase.sql \
wgEncodeRegDnaseClusteredInputs.tab
# Make transcription factor binding site clusters. Start with Anshul's
# nice uniform peak calls, downloaded from
# ftp://ftp-private.ebi.ac.uk/byDataType/peaks/jan2011/spp/optimal
# using the user name and password in the ENCODE wiki at:
# http://genomewiki.ucsc.edu/EncodeDCC/index.php/Locations_of_ENCODE_Data
# Put all of the ones that actually are based on two replicates into
# /cluster/data/hg19/bed/wgEncodeReg/tfbs/peaks
# Once that is done do a little work towards making the cluster input.
cd /cluster/data/hg19/bed/wgEncodeReg/tfbs
/bin/ls peaks/*.narrowPeak.gz > peak.lst
regClusterMakeTableOfTables ans01 peak.lst partial.table
# Unfortunately at this point only about 90% of the peak files actually
# have metaDb data associated with them in metaDb, so run C program that
# takes it from metaDb if possible, otherwise tries to get it from file
# names.
regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table
# One more complication - there are often several cases where there is
# more than one data set on the same cell+treatment+antibody combination.
# For good reasons hgBedToBedExps only wants one.
# This next bit filters them out based on a picks file that was hand
# created - picksFromDupes. This was based on the criteria that
# the Broad CTCF were used, and that Snyd was used in preference
# to OpenChromatin, which in turn was used in preference over Haib.
# The rational for this is the Broad CTCF set has been in use for a
# while with no complaints, and when examing the Pol2 signals, which
# were with CTCF the main component of the Dupes, the Haib were the
# weakest and the Snyd the strongest.
cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab .
wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table
# Make three column file: fileName, cell+treatment, antibody
awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab
# Create config file out for clustering job, using a table assigning
# cell lines+treatments to single letter codes edited by hand. In
# the future may be able to automate this by having fixed upper case codes for
# tier1&2 cell lines, and then just use first letter lower case codes
# for rest. That is the pattern cellLetter2.tab follows anyway.
cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab .
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter2.tab
# Do the actual clustering
hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps
# Load database (reloaded 3/26/2012 to fix a relatively rare bug)
hgLoadBed hg19 wgEncodeRegTfbsClustered peak.bed
hgLoadSqlTab hg19 wgEncodeRegTfbsCells ~/kent/src/hg/lib/expRecord.sql peak.exps
# Create inputTrackTable - six columns:
awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table | sed 's/AlnRep/PkRep/'> wgEncodeRegTfbsClusteredInputs.tab
hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputs ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputs.tab
# Handle the RNA seq, starting by pooling it and filtering out multiple
# mapping and blacklisted regions.
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled
cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqGm12878R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqH1hescR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHct116R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHsmmR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHuvecR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqK562R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqMcf7R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqNhekR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph
# Now have to normalize. This takes two passes. The first just creates
# a tab-separated file with normalization values. A batch file is made
# by running awk on the tab file, and this batch file executes the
# second half, transforming the bedGraphs to normalized values. The
# normalization standardizes all files as if the total signal integrated to 10
# billion over the genome.
rm -f ../norm.tab
foreach i (*SigPooled.bedGraph)
echo processing $i
echo -n $i " " >> ../norm.tab
awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab
end
awk '{printf("colTransform 4 %s 0 %g ../normalized/%s\n", $1, $2, $1);}' ../norm.tab > ../norm.csh
chmod a+x ../norm.csh
mkdir ../normalized
../norm.csh
# Make bigWigs
cd ../normalized
foreach i (*.bedGraph)
echo processing $i
bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end
# Link into /gbdb
ln -s `pwd`/*.bw /gbdb/hg19/bbi
# Tell database about it
foreach i (*.bw)
hgBbiDbLink hg19 $i:r /gbdb/hg19/bbi/$i
end
# Clean up
cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn
rm pooled/*.bedGraph normalized/*.bedGraph
# Make metadata: (forgive the long line)
# JK TODO - add Hepg2 and Helas3 from here on down
mdbQuery out=ra "select * from hg19 where \
obj='wgEncodeCaltechRnaSeqNhlfR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2'" \
| sed 's/SigRep1V2/SigPooled/g' | sed 's/wgEncodeCal/wgEncodeRegTxnCal/' \
| grep -v ^replicate > mdb.ra
# Now patch mdb.ra into ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha/wgEncodeReg.ra
# At this point actually are done with this track, except for
# all the trackDb work on the overlayed histone mod tracks.
# The rest is for some coloring stuff that is not yet actually used.
# Do work to figure out how to color cells. This is based on
# hierarchical clustering of wgEncodeUwAffyAllExon.
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/cellColors
cd /hive/data/genomes/hg19/bed/wgEncodeReg
# Start with some alas time consuming data cleanup to get rid of
# overlaps - losing about 10% of the data in the process which is
# totally ok.
mkdir allExon
cd allExon
mkdir inLinks noOverlap onlyCommon
ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwAffyExonArray/*.gz inLinks
cd inLinks
foreach i (*.gz)
bedRemoveOverlap $i ../noOverlap/$i:r
end
cd ..
# Now make up a file of regions present in all inputs for further data
# cleanup
bedCommonRegions noOverlap/*.* > commonExons.bed
cd noOverlap
foreach i (*.broadPeak)
echo processing $i
bedRestrictToPositions $i ../commonExons.bed ../onlyCommon/$i
end
rm -r noOverlap
# Now pool replicates where possible
mdbQuery "select fileName,cell,replicate from hg19 where composite='wgEncodeUwAffyExonArray' and view='SimpleSignal'" -out=ra \
| sed 's/\.gz//g' > replicate.ra
encodeMergeReplicatesBatch replicate.ra onlyCommon merge.sh merged.ra merged
mkdir merged
chmod a+x merge.sh
merge.sh
# Resort merged output - somehow gets slightly messed by
# encodeMergeReplicates.
cd merged
foreach i (*.broadPeak)
echo sorting $i
sort -k 1,1 -k 2,2n $i > tmp
mv tmp $i
end
# Make file that converts fileName to cell
raToTab merged.ra -cols=fileName,cell stdout \
| sed 's/wgEncode/merged\/wgEncode/' > fileToCell.tab
# Now build the tree. Took 2.5 minutes
ls -1 merged/*.broadPeak > clusterInput.lst
regClusterTreeCells rename=fileToCell.tab clusterInput.lst cluster.tree cluster.distances
#########################################################################
# 2012 ENCODE Regulation supertrack updates: #7526 (kate)
#########################################################################
# DNase Clusters: #9954 (2013-01-11)
#
# Create the DNAse peak clusters track using AWG uniformly processed peaks (Jan 2011 freeze, Sep 2012 analysis pubs).
# See doc/encodeDccHg19/awgUniform.txt
# There are 125 cell type+treatments represented -- UW, Duke, and Uw+Duke for 14 cell types
# Previous track had 74
cd /hive/data/genomes/hg19/bed/wgEncodeReg
mkdir -p dnaseAwg
cd dnaseAwg
set dir = `pwd`
cd /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeak
ls *.narrowPeak > $dir/peak.lst
# generate a normalization factor
# output format:
# 0 1 2
# wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 .
regClusterMakeTableOfTables awgDnase01 $dir/peak.lst $dir/peak.table -verbose=3 >&! $dir/regTable.out &
# Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster)
regCluster $dir/peak.table /dev/null $dir/peak.bed
# Read 125 sources from peak.table
# 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes
# filter out low scoring, single peaks in cluster and load in database
cd $dir
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed
wc -l wgEncodeRegDnaseClusteredV2.bed
# 1281988 wgEncodeRegDnaseClusteredV2.bed
hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed
# NOTE: V1 had ~1M clusters, V2 has 1.2M
# It would be nice if hgc sorted datasets by signal or cellType
# Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting)
# was: objName, cellType, treatment, replicate, lab, dateUnrestricted
# For V2, AWG is providing pooled replicates, and sometimes pooled labs. Also,
# this data has all passed timestamp restriction, so don't need to show
# objName, cellType, treatment, lab(s)
# DISPLAY: Consider item name change to % assays represented instead of count.
# Also, could color item name label to highlight ubiquitous vs. cell-type-specific
# (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?)
# Make wgEncodeRegDnaseClusteredInputV2 table.
mdbQuery out=tab "select obj,cell,treatment,lab,dccAccession from hg19 where obj like 'wgEncodeAwgDnase%UniPk' and view='Peaks'" > wgEncodeRegDnaseClusteredInputsV2.tab
hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputsV2 \
~/kent/src/hg/lib/clusterInputAwgDnase.sql wgEncodeRegDnaseClusteredInputsV2.tab
# Set up downloads
cd $dir
mkdir downloads
gzip -c wgEncodeRegDnaseClusteredV2.bed > downloads/wgEncodeRegDnaseClusteredV2.bed.gz
gzip -c wgEncodeRegDnaseClusteredInputsV2.tab > downloads/wgEncodeRegDnaseClusteredInputsV2.tab.gz
# use (existing) ENCODE DCC downloads dir
set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegDnaseClustered
cd $dload
# call existing stuff release1 (use ENCODE-style downloads scheme)
mkdir release1
ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz release1
cp README.txt md5sum.txt files.txt release1
# new release dir
mkdir release2
ln -s release2 beta
ln -s release2 releaseLatest
cd $dir/downloads
ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz $dload
cd $dload
ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz releaseLatest
# include older files as well
ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz releaseLatest
cp README.txt releaseLatest
cd releaseLatest
md5sum *.gz > md5sum.txt
# vim README.txt
cp README.txt md5sum.txt ..
# add minimal metaObject to metaDb to associate new table with (pseudo-)composite
# wgEncodeRegDnaseClustered -- this allows hgTrackUi to locate downloads
cat > wgEncodeRegDnaseClustered.ra << 'EOF'
metaObject wgEncodeRegDnaseClusteredV2
objType table
composite wgEncodeRegDnaseClustered
fileName wgEncodeRegDnaseClusteredV2.bed.gz
tableName wgEncodeRegDnaseClusteredV2
project wgEncode
'EOF'
#########################################################################
# Preliminary/Experimental/Older work below
#########################################################################
# ENCODE Regulation track V2: Track #7526 (In progress 2012-06-20 kate)
# This version will have:
# * Replace Tx with CSHL stranded data, pooled (JK advising, also AWG use)
# http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformRNASignal.txt
# These are from DCC wgEncodeCshlLongRnaSeq downloads (Long polyA+, whole cell),
# AWG did not pool reps or plus/minus for their hub (but we will)
# AWG has filtered for bad regions so no longer necessary
# * Possibly a 10th cell type (HMEC) for Tx and Histone
# Note: single replicate for HMEC Tx (others have 2)
# * Two additional cell lines for Histones (Hela, HepG2)
# - Note Hela & HepG2 already added to Tx track (July2012 release)
# * Histones from uniform processing: H3K4me1, H3K4me3, H3K27ac
# (http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/)
# wgEncodeBroadHistone{cell}{histone}StdAln_?Reps.norm5.rawsignal.bw
# * Add latest TF's, and use AWG uniform peak calls
# * Add latest DNase calls (possibly use AWG uniform)
# (http://www.uwencode.org/public/rthurman/encode_integrated_open_chrom/data/dataHub)
# AWG trackDb: http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
# - can use subGroups line to determine cell type, but need to take care
# with treatments, which are in filename but not subGroups
# - NOTE: AWG includes Duke data processed like UW. Where both
# labs had data, they are merged. View is .01FDR peaks (narrow, bigbed 6+)
# Previous track: 74 cell types (2 reps), all UW. This version: 125 cell types, pooled replicates, merged UW+Duke
# Also considering adding new data types from AWG hub:
# - Short RNA's
# - AWG uniform RNA contigs
# (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/rna_contigs/)
# - Segmentations (chromHMM, segWay, or combined)
# (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/segmentations)
# - FAIRE (cell-specific, union over 25 cell types)
# (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/faire/bigBed/)
# NOTE: following Jim's methods from initial version
# make root dir and link in our bad.bed file for filtering out
# blacklisted and multiple mapping regions.
cd /hive/data/genomes/hg19/bed
mkdir wgEncodeRegV2
cd wgEncodeRegV2
ln -s /hive/data/genomes/hg19/bed/wgEncodeReg/badRegions.bed .
#(was /hive/users/kent/regulate/companion/mapping/bad.bed)
######
# Small RNA's
# For signal, pool strands and replicates
######
cd /hive/data/genomes/hg19/bed/
mkdir -p wgEncodeRegV2/shortRna/pooled
cd wgEncodeRegV2/shortRna/pooled
cat > list.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hmec Hsmm Huvec K562 Nhek Nhlf)
echo "processing ${cell}"
ls /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotal*.bigWig
'EOF'
csh list.csh >&! list.out &
# There are 6 of overalpping with histones and long RNAs. 4 other tier2's.
cat > pool.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 K562 Nhek A549 Cd20 Imr90 Mcf7)
echo "processing ${cell}"
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep2.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep2.bigWig \
wgEncodeRegTxnCshlShortRnaSeq${cell}CellTotalRawSigPooled.bedGraph
end
'EOF'
csh pool.csh >&! pool.out &
# got 8 cells, two failed -- CD20 and IMR90, may be rescuable, but continuing
# for now with others to see if its worth it.
cat > big.csh << 'EOF'
foreach i (*.bedGraph)
echo processing $i
bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end
'EOF'
csh big.csh >&! big.out &
# add to database
set dir = /gbdb/hg19/bbi/encodeRegV2
mkdir $dir
ln -s `pwd`/*.bw $dir
foreach i (*.bw)
hgBbiDbLink hg19 $i:r $dir/$i
end
######
# Handle the RNA seq, starting by pooling it and filtering out multiple
# mapping and blacklisted regions.
######
cd /hive/data/genomes/hg19/bed/
mkdir -p wgEncodeRegV2/txn/pooled
cd wgEncodeRegV2/txn/pooled
cat > poolTx.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf)
echo "processing ${cell}"
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep2.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep2.bigWig \
wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph
end
'EOF'
csh poolTx.csh >&! poolTx.out &
# ~8 minutes per cell type
# NOTE: 25 chroms in Hepg2, H1hesc, Huvec, Nhlf (male or unknown sex)
# Add in HMEC, which has single replicates
cat > poolHmec.csh << 'EOF'
foreach cell (Hmec)
echo "processing ${cell}"
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \
wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph
end
'EOF'
csh poolHmec.csh >&! poolHmec.out &
# Normalize. This takes two passes. The first creates
# a tab-separated file with normalization values. A batch file is made
# by running awk on the tab file, and this batch file executes the
# second half, transforming the bedGraphs to normalized values. The
# normalization standardizes all files as if the total signal integrated to 10
# billion over the genome.
rm -f ../norm.tab
cat > makeNorm.csh << 'EOF'
foreach i (*SigPooled.bedGraph)
echo processing $i
echo -n $i " " >> ../norm.tab
awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab
end
awk '{printf("echo %s; colTransform 4 %s 0 %g ../normalized/%s\n", $1, $1, $2, $1);}' ../norm.tab > ../norm.csh
'EOF'
csh makeNorm.csh >&! makeNorm.out &
# ~2 mins per cell type
mkdir ../normalized
csh ../norm.csh >&! ../norm.out &
#### STOPPED HERE
cd ../normalized
# make bigWigs
cat > big.csh << 'EOF'
foreach i (*.bedGraph)
echo processing $i
bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end
'EOF'
csh big.csh >&! big.out &
# add to database
set dir = /gbdb/hg19/bbi/encodeRegV2
mkdir $dir
ln -s `pwd`/*.bw $dir
foreach i (*.bw)
hgBbiDbLink hg19 $i:r $dir/$i
end
######
# Possible new track: FAIRE Peaks Union across 25 cells, from AWG
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p faire/peaks
cd faire/peaks
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformFaire.txt
sed -n 's/bigDataUrl /wget /p' uniformFaire.txt | grep Union > wget.csh
csh wget.csh >&! wget.out &
mv *Union*.bb encodeAwgFairePeaks25CellUnion.bb
bigBedToBed encodeAwgFairePeaks25CellUnion.bb encodeAwgFairePeaks25CellUnion.bed
# head -1 *.bed
chr1 10033 10376
# bed 3
hgLoadBed hg19 encodeAwgFairePeaks25CellUnion encodeAwgFairePeaks25CellUnion.bed
# Read 2369615 elements of size 3 from encodeAwgFairePeaks25CellUnion.bed
######
# Create the DNAse peak clusters subtrack.
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p dnase/peaks
cd dnase
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh
csh wget.csh >&! wget.out &
# 125 files (.bb), dated Nov 22, 2011
# Sample case had ~150K peaks (by bigBedInfo)
# These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats
#$ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head
chr1 10180 10330 . 0 . 26.000000 -1 -1 -1
# NOTE: Steve Wilder is planning to add scores and make formats of these files more uniform
# rename to UCSC style and convert to bed: wgEncodeAwgDnasePeak
ls *.fdr01peaks.hg19.bb | \
perl -wpe 's/(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)/bigBedToBed $1$2$3$4$5 $1Awg$3\u\L$2\E\u\L$4\EPeak.bed/' > unbig.csh
# convert to bed and rename
csh unbig.csh >&! unbig.out
# udc couldn't read 4 bytes from wgEncodeUWDukeDnaseHepG2.fdr01peaks.hg19.bb, did read 0
# This dataset was truncated
# Repeated wget and bigBedToBed for HepG2 (8/23)
ls -1 *AwgDnase*.bed > peak.lst
# Datasets with treated cells (hints from long label)
grep '(' uniformDnase.txt | grep longLabel > treated.txt
wc -l treated.txt
# 7 datasets
# Tamoxifen -> Tam10030 (Duke) or 40HAM20nM72hr (UW), Estradiol -> Est10nm30m, Ifna4h, Andro, Hypox
# TODO: Edit to rename
# generate a normalization factor to
# convert signalValue to score (0-1000). Also note score field; narrowPeak is 7 (default)
# output format:
# 0 1 2
# wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 .
regClusterMakeTableOfTables awgDnase01 peak.lst peak.table -verbose=3 >&! regTable.out &
# Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster)
regCluster peak.table /dev/null peak.bed
#Read 125 sources from peak.table
# 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes
# filter out low scoring, single peaks in cluster and load in database
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed
wc -l wgEncodeRegDnaseClusteredV2.bed
hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed
# Read 1281988 elements of size 5 from wgEncodeRegDnaseClusteredV2.bed
# load individual peak files as a track (needed by hgc)
# TODO: generate labels out of metadata
cat > loadDnase.csh << 'EOF'
set b = wgEncodeAwgDnase
set t = ${b}Peak
foreach f (${b}*)
set table = $f:r
hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f
set cell = `echo $f | perl -wpe 's/(wgEncodeAwgDnase[A-Z][a-z]+)(.*)Peak.bed/$2/'`
echo " track $table" >> $t.ra
echo " type narrowPeak" >> $t.ra
echo " parent $t" >> $t.ra
echo " shortLabel $cell DNase" >> $t.ra
echo " longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t.ra
echo " subGroups cellType=$cell treatment=None" >> $t.ra
echo "" >> $t.ra
end
'EOF'
# NOTE: V1 had ~1M clusters.
# Other data tables: Need to load the peak files as they are used by hgc
# These should probably also be subtracks.
# It would be nice if hgc sorted datasets by signal or cellType
# Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting)
# was: objName, cellType, treatment, replicate, lab, dateUnrestricted
# For V2, AWG is providing pooled replicates, and sometimes pooled labs. Also,
# this data has all passed timestamp restriction, so don't need to show
# objName, cellType, treatment, lab(s)
# DISPLAY: Consider item name change to % assays represented instead of count.
# Also, could color item name label to highlight ubiquitous vs. cell-type-specific
# (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?)
######
# Uniform histone signal for overlay wiggles
# Note: there are UW signal files as well -- omitting these for consistency,
# Check if merging would be a better option
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p histone/signal
cd histone/signal
cat > getHistones.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf Hmec)
foreach histone (H3k4me1 H3k4me3 H3k27ac)
set var = "wgEncodeBroadHistone${cell}${histone}"
echo $var
curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/${var}StdAln_\[1-3\]Reps.norm5.rawsignal.bw
end
end
'EOF'
csh getHistones.csh >&! getHistones.out &
# save real files (>1k size) as curl creates info files when match used
mkdir sav
find . -name '*.bw' -a -size +1k -exec mv '{}' sav \;
rm *.bw
mv sav/* .
rmdir sav
# note: missing Hela and HepG2 H3K4me1 - these have different filename pattern (check with Anshul why)
# use curl to retrieve by pattern match as we don't have read perms on directory. Then remove non-useful files
curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHelas3H3k4me1Std_\[1-3]Reps.norm5.rawsignal.bw
# good one is '*Std_1Rep', remove 2Rep and 3Rep
rm *Hela*Std_[2-3]*
wget http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHepg2H3k4me1Std_1Reps.norm5.rawsignal.bw
######
# TFBS from Jan2011 freeze
######
# The TFBS track is a specialized type: factorSource. Requires 3 tables:
# track table: wgEncodeRegTfbsClustered (bed15)
# source table: wgEncodeRegTfbsCells
# Maps cell type (including treatment) to single letter
# Currently, there are fixed unique assignments for Tier1 and 2.
# There are all upper case.
# Tier 3 just use first char of cell name, lower-cased. Not unique at all!
# Looks like only rows 2 and 3 of the table are actually used/informative
# Ex. row 0 G GM12878 n/a n/a n/a 3 n/a,n/a/,GM12878,
# inputTrack (metadata) table: wgEncodeRegTfbsClusteredInputs
# DISPLAY: Better with target protein instead of antibody ?
# DISPLAY: Consider item showing letter for all experiments, with name colored (brown ?) if this peak
# found it the cell type
# DISPLAY: fix ordering of cell letters -- they should be sorted
# (in cellLetter.tab first, so Exps files has correct id's)
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p tfbs/peaks
cd tfbs/peaks
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformTfbs_nocontrols.txt
# this file has metadata
# there are 3 'method' values: SPP, PeakSeq, wiggler
# there are 2 'quality' values: Good, Caution
# There are 144 antibodies (for 122 factors), 75 cell types, 95 cell types plus treatments, 457 datasets
# Version 1 of this track had 148 antibodies, 67 cell types, 95 cell type+treatments, 486 datasets
# -> basically the same data, but cleaned up
sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep Tfbs > wget.csh
sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep -v Tfbs > wget2.csh
csh wget.csh >&! wget.out &
# 416 files
csh wget2.csh >&! wget2.out &
# 41 (have Histone in filename)
# sample filename: spp.optimal.wgEncodeUwTfbsHcfaaCtcfStdAlnRep1_VS_wgEncodeUwTfbsHcfaaInputStdAlnRep1.bb
# from hub trackDb, metadata is in subGroups line:
# subGroups tier=3 cellType=HCFaa factor=CTCF antibody=CTCF lab=UW treatment=None protocol=Std quality=Good method=SPP
# Note 'quality'. Think about dropping those with quality='Caution'. Unfortunately some
# Tier1 , esp. Pol2 are only available with this quality
# Q: What does Rep1 mean ? Most are 'Rep0' (Answer from Anshul -- Rep0 are pooled, Rep1 were single-rep but adequate qual)
# (Jim's methods exluded those not based on 2 replicates)
# Q: Why are some datasets 'StdAln' and some are 'Aln' ? (Answer from Anshul -- names are from UCSC)
# Q: How is quality assigned ?
# rename to UCSC style and convert to bed: wgEncodeAwgTfbsPeak
cd ..
ls peaks/spp.optimal.*.bb | \
perl -wpe 's^peaks/(spp.optimal.)(wgEncode)(\S+)(Tfbs)(\S+)(StdAln|Aln)(Rep\d_VS\S+)^bigBedToBed peaks/$1$2$3$4$5$6$7 peaks/$2Awg$4$3$5Peak.bed^' > unbig.csh
# convert to bed and rename
csh unbig.csh >&! unbig.out &
# NOTE: ~10% (49 of 457 total) are dupes from different labs
# Factors are: CFOS, CMYC, CTCF, E2F6, EBF1F FOXA1, GATA2, JUND, MAFK
# MAX, POL2, RAD21, SRF, YY1
ls -1 peaks/*.bed > peak.lst
######
# TFBS from July 2012 freeze
# Uniform processing with slightly more relaxed threshold (2% FDR vs 1% based on
# input Anshul received from users. And there are more data sets (708 vs. 495)
# Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering
# (as in previous track)
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p tfbs2012/peaks
cd tfbs2012/peaks
ls -l filelist.txt
# 708 filelist.txt
# Downloaded from
# set user = encode-box-01
# set pwd = enc*deDOWN
wget -r -nd -np --ftp-user=$user --ftp-password=$pwd ftp://ftp-private.ebi.ac.uk/byDataType/peaks/june2012/spp/optimal
# using the user name and password in the ENCODE wiki at:
# http://encodewiki.ucsc.edu/EncodeDCC/index.php/Notes_on_using_the_Aspera_client_(and_EBI_ftp)
# These are also available at:
# http://www.ebi.ac.uk/~anshul/public/encodeRawData/peaks_spp/mar2012/distinct/idrOptimalBlackListFilt
# NOTE: quality metrics are available here:
# https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7
# -> Check with Anshul for list of data sets to exclude from clustering based on quality.
# For now, use all.
# sample filename:
# spp.idrOptimal.bf.wgEncodeSydhTfbsHct116Pol2UcdAlnRep0.bam_VS_wgEncodeSydhTfbsHct116InputUcdAlnRep1.bam.narrowPeak.gz
cd ..
ls peaks/*.narrowPeak.gz > peak.lst
# extract UCSC object name from peaks file name, and generate a per-file normalization factor to
# convert signalValues to scores (0-1000).
# Also note score field position; for narrowPeak this is 6 (default)
# output format:
# 0 1 2
# (fields 2, 3, 4 are unused)
regClusterMakeTableOfTables ans02 peak.lst partial.table -verbose=3 >&! regTable.out &
wc -l partial.table
# 708
head -1 partial.table
# peaks/spp.idrOptimal.bf.wgEncodeBroadHistoneDnd41CtcfAlnRep0.bam_VS_wgEncodeBroadHistoneDnd41ControlStdAlnRep0.bam.narrowPeak.gz 0 1 2 6 4.52593 wgEncodeBroadHistoneDnd41CtcfAlnRep1
# add metadata from metaDb or as fallback, from parsing filename
regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table -verbose=3
# failed in 1 object:
# Can't get metadata for wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1 from metaDb, parsing filename
# UCSC metadata has Rep3 for pcr2x protocol, but V0415101 for others ?
# Tweak this one as special case:
# TODO: revisit this
sed 's/wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1/wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep3/' partial.table > partial.fixed.table
# cluster by antibody, and by target
regClusterAttachMetadataToTableOfTables hg19 partial.fixed.table withDupes.table -verbose=3
regClusterAttachMetadataToTableOfTables -antibodyTarget hg19 partial.fixed.table withDupes.target.table -verbose=3
# find dupes in target-based table (already done for antibody-based)
# 91 cell types, 201 antibodies (177 targets), 28 treatments, 9 labs
# limit to single cell/factor/treatment (remove cross-lab duplication).
# Consider pooling instead ?? Or picking based on quality metrics ?
# -> wait for Anshul's recommendations (he will summarize) -- now they are too
# hard to compare:
# https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7
# eg., there are GM12878-CTCF from 4 labs, and all have some error highlighting
# Reusing Jim's priorities (Broad, Snyd, OpenChrom, Haib); I'm ranking UW just below OpenChrom (Kate)
# Also, when multiple from a single lab prioritizing what appear to be newer controls & protocols:
# HAIB: V, PCR?x
# SYDH: iggrab, iggmus, std
# One more complication - there are often several cases where there is
# more than one data set on the same cell+treatment+antibody combination.
# For good reasons hgBedToBedExps only wants one.
# This next bit filters them out based on a picks file that was hand
# created - picksFromDupes. This was based on the criteria that
# the Broad CTCF were used, and that Snyd was used in preference
# to OpenChromatin, which in turn was used in preference over Haib.
# The rational for this is the Broad CTCF set has been in use for a
# while with no complaints, and when examing the Pol2 signals, which
# were with CTCF the main component of the Dupes, the Haib were the
# weakest and the Snyd the strongest.
cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab .
wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table
# wc -l peak.table
# 656
# reduced by 52
# Make three column file: fileName, cell+treatment, antibody
awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab
awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' withDupes.target.table > fileCellAb.target.tab
# Create config file out for clustering job, using a table assigning
# cell lines+treatments to single letter codes edited by hand.
# Find cell types that aren't yet in cellLetter file:
cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab cellLetter3.tab
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab -noLetter > & ! noLetter.out &
sort noLetter.out | uniq | awk '{print $2}' > noLetter.txt
wc -l noLetter.txt
# 43 new cell type+treatment combos
cat noLetter.txt >> cellLetter3.tab
# hand-edit in codes, promoting Tier 2.5 to caps (A549, MCF-7, SK-N-SH, IMR90)
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab
regClusterBedExpCfg -tabList fileCellAb.target.tab cluster.target.cfg -cellLetter=cellLetter3.tab -noLetter
# Do the actual clustering
hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps >&! beds2exps.out
# 17 errs (dupes detected) --> add these to picksFromDupes2 and hand edit
# NOTE: 4 of these were from same lab, different protocols, etc.:
# USF-1, A549+EtOH; NRSF, HepG2; NRSF, SK-N-SH; Pol2(phosphS2), K562
# Leaving these out for now (hand-editing peak.table), saving in file dupesBad.txt
cat picksFromDupes.tab beds2exps.out | sort > picksFromDupes2.tab
wc -l picksFromDupes2.tab
# 52
wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes2.tab peak.table
# Prune above 4 dupes, and redo config & clustering
wc -l peak.table
# 635
awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab
hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps > beds2exps2.out
# Loaded 635 records from cluster.cfg
# Got 131 sources
# Got 201 factors
# Load database
hgLoadBed hg19 wgEncodeRegTfbsClusteredV3 peak.bed
# Read 4735242 elements of size 15 from peak.bed
# was 2750490 in previous version
hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV3 ~/kent/src/hg/lib/expRecord.sql peak.exps
# Create inputTrackTable - six columns:
awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table > wgEncodeRegTfbsClusteredInputs3.tab
hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV3 ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputsV3.tab
#######################################################################
# continue with target-based clustering
hgBedsToBedExps -dupeLetterOk cluster.target.cfg peak.target.bed peak.target.exps >&! beds2exps.target.out
wc -l beds2exps.target.out
# 95 dupes
# NOTE: this was too laborious by hand. When redone, script to list pairs in date submitted/resubmitted or experiment ID order, to ease picking later experiment
# EXPERIMENTAL: TF motif tracks, from EMBL/Aus (MEME toolkit)
# These are generated using FIMO tool:
# Charles E. Grant, Timothy L. Bailey, and William Stafford Noble, "FIMO: Scanning for occurrences of a given motif", Bioinformatics 27(7):10171018, 2011
# http://bioinformatics.oxfordjournals.org/content/early/2011/02/16/bioinformatics.btr064.full
# http://meme.ebi.edu.au/meme/cgi-bin/fimo.cgi
mkdir /cluster/data/hg19/bed/tfbsFimo
cd /cluster/data/hg19/bed/tfbsFimo
hgLoadBed hg19 tfbsFimoJaspar TfMotifsFIMO_Jaspar.hg19.bed
# Read 1386302 elements of size 6 from TfMotifsFIMO_Jaspar.hg19.bed
hgLoadBed hg19 tfbsFimoTransfac TfMotifsFIMO_Transfac.hg19.bed
# Read 3665198 elements of size 6 from TfMotifsFIMO_Transfac.hg19.bed
#######################################################################