#########################################################################
# ENCODE Regulation track (In progress March 2010, very rough instructions
# as am still defining how to build track...).  JK

# make root dir and link in our bad.bed file for filtering out
# blacklisted and multiple mapping regions.
mkdir -p /cluster/data/hg19/bed/wgEncodeReg
cd /cluster/data/hg19/bed/wgEncodeReg
ln -s /hive/users/kent/regulate/companion/mapping/bad.bed badRegions.bed

# Create the DNAse peak clusters subtrack.
# Get all of the narrowPeak format files for the wgEncodeUwDnaseSeq
# linked into directory /hive/users/kent/regulate/dnase/peaks
mkdir dnase
cd dnase
mkdir peaks
ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwDnase/*.narrowPeak.gz peaks

# Process these into clusters in a bed file and load the clusters into a table.
/bin/ls -1 peaks/*.narrowPeak.gz > peak.lst
regClusterMakeTableOfTables uw02 peak.lst peak.table
regCluster peak.table /dev/null peak.bed
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClustered.bed
hgLoadBed hg19 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.bed

# Make the wgEncodeRegDnaseClusteredInputs table.  Start with mdbQuery, and
# then do some massaging since it is not completely in sync with the file list.
mdbQuery out=tab "select obj,cell,treatment,replicate,lab,dateUnrestricted from hg19 where obj like 'wgEncodeUwDnase%' and view='Peaks'" \
    | sed 's/n\/a/None/' > inputMdb.tab
cut -f 1 peak.table | sed 's/\.narrowPeak\.gz//' | sed 's/peaks\///' > inputs.lst
weedLines inputs.lst inputMdb.tab wgEncodeRegDnaseClusteredInputs.tab -invert
hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputs ~/kent/src/hg/lib/clusterInputDnase.sql \
    wgEncodeRegDnaseClusteredInputs.tab

# Make transcription factor binding site clusters.  Start with Anshul's
# nice uniform peak calls, downloaded from
#   ftp://ftp-private.ebi.ac.uk/byDataType/peaks/jan2011/spp/optimal
# using the user name and password in the ENCODE wiki at:
#   http://genomewiki.ucsc.edu/EncodeDCC/index.php/Locations_of_ENCODE_Data
# Put all of the ones that actually are based on two replicates into
#   /cluster/data/hg19/bed/wgEncodeReg/tfbs/peaks
# Once that is done do a little work towards making the cluster input.
cd /cluster/data/hg19/bed/wgEncodeReg/tfbs
/bin/ls peaks/*.narrowPeak.gz > peak.lst
regClusterMakeTableOfTables ans01 peak.lst partial.table

# Unfortunately at this point only about 90% of the peak files actually
# have metadata associated with them in metaDb, so run a C program that
# takes it from metaDb if possible, and otherwise tries to get it from the
# file names.
regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table

# One more complication - there are often several cases where there is
# more than one data set on the same cell+treatment+antibody combination.
# For good reasons hgBedsToBedExps only wants one.
# This next bit filters them out based on a picks file that was hand
# created - picksFromDupes.  This was based on the criteria that
# the Broad CTCF were used, and that Snyd was used in preference
# to OpenChromatin, which in turn was used in preference over Haib.
# The rationale for this is that the Broad CTCF set has been in use for a
# while with no complaints, and when examining the Pol2 signals, which
# were with CTCF the main component of the dupes, the Haib were the
# weakest and the Snyd the strongest.
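# (Hedged sketch, not part of the original build: a first draft of a picks
# file could be generated mechanically by ranking the dupes by the lab
# preference described above and keeping the best-ranked file per
# cell+treatment+antibody combination.  The column positions ($8=cell,
# $9=antibody, $10=treatment, $11=lab) are assumed to match the table-of-tables
# layout used by the awk commands below; the lab spellings in the rank list
# are guesses, and the output would still need hand review before use as
# picksFromDupes.tab.)
awk -F '\t' 'BEGIN {rank["Broad"]=1; rank["Stanford"]=2; rank["OpenChrom"]=3; rank["HudsonAlpha"]=4;}
    {key = $8 "|" $10 "|" $9;
     r = ($11 in rank ? rank[$11] : 9);
     if (!(key in best) || r < bestRank[key]) {best[key] = $1; bestRank[key] = r;}}
    END {for (key in best) print best[key];}' withDupes.table > draftPicks.lst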
cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab .
wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table

# Make three column file: fileName, cell+treatment, antibody
awk -F '\t' '{printf("%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9);}' peak.table > fileCellAb.tab

# Create config file for the clustering job, using a table assigning
# cell lines+treatments to single letter codes edited by hand.  In
# the future may be able to automate this by having fixed upper case codes for
# tier1&2 cell lines, and then just use first letter lower case codes
# for the rest.  That is the pattern cellLetter2.tab follows anyway.
# (A hedged sketch of such automation appears after the table loads below.)
cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab .
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter2.tab

# Do the actual clustering
hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps

# Load database (reloaded 3/26/2012 to fix a relatively rare bug)
hgLoadBed hg19 wgEncodeRegTfbsClustered peak.bed
hgLoadSqlTab hg19 wgEncodeRegTfbsCells ~/kent/src/hg/lib/expRecord.sql peak.exps

# Create inputTrackTable - six columns:
awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table \
    | sed 's/AlnRep/PkRep/' > wgEncodeRegTfbsClusteredInputs.tab
hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputs ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputs.tab
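# (Hedged sketch of the automation mentioned in the config step above -- not
# part of the original build.  Given a hand-maintained list of tier 1/2 cell
# lines (tier1and2.lst here is a hypothetical file), assign an upper case first
# letter to those and a lower case first letter to everything else, as a first
# draft of a cellLetter table.  Letter collisions would still need hand editing.)
cut -f 2 fileCellAb.tab | sort -u > cells.lst
awk 'NR==FNR {tier[$1]=1; next}
     {c = substr($1,1,1); print $1 "\t" (($1 in tier) ? toupper(c) : tolower(c));}' \
    tier1and2.lst cells.lst > cellLetterDraft.tab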
# Handle the RNA seq, starting by pooling it and filtering out multiple
# mapping and blacklisted regions.
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled
cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqGm12878R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqH1hescR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHct116R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHsmmR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHuvecR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqK562R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqMcf7R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqNhekR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \
    /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \
  | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph

# Now have to normalize.  This takes two passes.  The first just creates
# a tab-separated file with normalization values.  A batch file is made
# by running awk on the tab file, and this batch file executes the
# second half, transforming the bedGraphs to normalized values.  The
# normalization standardizes all files as if the total signal integrated to 10
# billion over the genome.  (A sanity-check sketch appears at the end of this
# section.)
rm -f ../norm.tab
foreach i (*SigPooled.bedGraph)
    echo processing $i
    echo -n $i " " >> ../norm.tab
    awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab
end
awk '{printf("colTransform 4 %s 0 %g ../normalized/%s\n", $1, $2, $1);}' ../norm.tab > ../norm.csh
chmod a+x ../norm.csh
mkdir ../normalized
../norm.csh

# Make bigWigs
cd ../normalized
foreach i (*.bedGraph)
    echo processing $i
    bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end

# Link into /gbdb
ln -s `pwd`/*.bw /gbdb/hg19/bbi

# Tell database about it
foreach i (*.bw)
    hgBbiDbLink hg19 $i:r /gbdb/hg19/bbi/$i
end

# Clean up
cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn
rm pooled/*.bedGraph normalized/*.bedGraph

# Make metadata: (forgive the long line)
# JK TODO - add Hepg2 and Helas3 from here on down
mdbQuery out=ra "select * from hg19 where \
    obj='wgEncodeCaltechRnaSeqNhlfR2x75Il200SigRep1V2' or \
    obj='wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2' or \
    obj='wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2' or \
    obj='wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2' or \
    obj='wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2' or \
    obj='wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2' or \
    obj='wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2' or \
    obj='wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2' or \
    obj='wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2'" \
    | sed 's/SigRep1V2/SigPooled/g' | sed 's/wgEncodeCal/wgEncodeRegTxnCal/' \
    | grep -v ^replicate > mdb.ra
# Now patch mdb.ra into ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha/wgEncodeReg.ra

# At this point actually are done with this track, except for
# all the trackDb work on the overlayed histone mod tracks.
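# (Optional sanity check on the normalization above -- a hedged sketch, not
# part of the original build.  bigWigInfo reports mean and basesCovered, and
# their product approximates the integrated signal, which should come out
# close to the 1e10 target for each normalized bigWig.)
foreach i (normalized/*.bw)
    echo -n "$i "
    bigWigInfo $i | awk '/basesCovered/ {gsub(",", "", $2); bases = $2} /^mean/ {mean = $2} END {print bases * mean}'
end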
# The rest is for some coloring stuff that is not yet actually used.
# Do work to figure out how to color cells.  This is based on
# hierarchical clustering of wgEncodeUwAffyAllExon.
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/cellColors
cd /hive/data/genomes/hg19/bed/wgEncodeReg

# Start with some, alas, time-consuming data cleanup to get rid of
# overlaps - losing about 10% of the data in the process, which is
# totally OK.
mkdir allExon
cd allExon
mkdir inLinks noOverlap onlyCommon
ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwAffyExonArray/*.gz inLinks
cd inLinks
foreach i (*.gz)
    bedRemoveOverlap $i ../noOverlap/$i:r
end
cd ..

# Now make up a file of regions present in all inputs for further data
# cleanup
bedCommonRegions noOverlap/*.* > commonExons.bed
cd noOverlap
foreach i (*.broadPeak)
    echo processing $i
    bedRestrictToPositions $i ../commonExons.bed ../onlyCommon/$i
end
cd ..
rm -r noOverlap

# Now pool replicates where possible
mdbQuery "select fileName,cell,replicate from hg19 where composite='wgEncodeUwAffyExonArray' and view='SimpleSignal'" -out=ra \
    | sed 's/\.gz//g' > replicate.ra
encodeMergeReplicatesBatch replicate.ra onlyCommon merge.sh merged.ra merged
mkdir merged
chmod a+x merge.sh
merge.sh

# Resort merged output - somehow gets slightly messed up by
# encodeMergeReplicates.
cd merged
foreach i (*.broadPeak)
    echo sorting $i
    sort -k 1,1 -k 2,2n $i > tmp
    mv tmp $i
end
cd ..

# Make file that converts fileName to cell
raToTab merged.ra -cols=fileName,cell stdout \
    | sed 's/wgEncode/merged\/wgEncode/' > fileToCell.tab

# Now build the tree.  Took 2.5 minutes
ls -1 merged/*.broadPeak > clusterInput.lst
regClusterTreeCells rename=fileToCell.tab clusterInput.lst cluster.tree cluster.distances

#########################################################################
# 2012 ENCODE Regulation supertrack updates: #7526 (kate)
#########################################################################

# DNase Clusters: #9954 (2013-01-11)
#
# Create the DNAse peak clusters track using AWG uniformly processed peaks
# (Jan 2011 freeze, Sep 2012 analysis pubs).
# See doc/encodeDccHg19/awgUniform.txt
# There are 125 cell type+treatments represented -- UW, Duke, and UW+Duke for 14 cell types
# Previous track had 74
cd /hive/data/genomes/hg19/bed/wgEncodeReg
mkdir -p dnaseAwg
cd dnaseAwg
set dir = `pwd`
cd /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeak
ls *.narrowPeak > $dir/peak.lst

# generate a normalization factor
# output format:
#   0 1 2
#   wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 .
regClusterMakeTableOfTables awgDnase01 $dir/peak.lst $dir/peak.table -verbose=3 >&! $dir/regTable.out &

# Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster)
regCluster $dir/peak.table /dev/null $dir/peak.bed
# Read 125 sources from peak.table
# 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes

# filter out low scoring, single peaks in cluster and load in database
cd $dir
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed
wc -l wgEncodeRegDnaseClusteredV2.bed
# 1281988 wgEncodeRegDnaseClusteredV2.bed
hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed
# NOTE: V1 had ~1M clusters, V2 has 1.2M
# It would be nice if hgc sorted datasets by signal or cellType

# Create metadata table: wgEncodeRegDnaseClusteredInputsV2 (inputTrackTable setting)
#   was: objName, cellType, treatment, replicate, lab, dateUnrestricted
#   For V2, AWG is providing pooled replicates, and sometimes pooled labs.  Also,
#   this data has all passed the timestamp restriction, so replicate and
#   dateUnrestricted don't need to be shown.
#   Now: objName, cellType, treatment, lab(s)
# DISPLAY: Consider item name change to % assays represented instead of count.
# Also, could color item name label to highlight ubiquitous vs. cell-type-specific
# (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?)
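# (Hedged sketch for the display idea above -- not part of the build.  Rename
# the clusters from dataset count to percent of the 125 assays; the output file
# name and the rounding are arbitrary.)
awk -v total=125 'BEGIN {OFS="\t"} {$4 = int($4 * 100 / total + 0.5) "%"; print}' \
    wgEncodeRegDnaseClusteredV2.bed > wgEncodeRegDnaseClusteredV2.pct.bed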
# Make wgEncodeRegDnaseClusteredInputsV2 table.
mdbQuery out=tab "select obj,cell,treatment,lab,dccAccession from hg19 where obj like 'wgEncodeAwgDnase%UniPk' and view='Peaks'" \
    > wgEncodeRegDnaseClusteredInputsV2.tab
hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputsV2 \
    ~/kent/src/hg/lib/clusterInputAwgDnase.sql wgEncodeRegDnaseClusteredInputsV2.tab

# Set up downloads
cd $dir
mkdir downloads
gzip -c wgEncodeRegDnaseClusteredV2.bed > downloads/wgEncodeRegDnaseClusteredV2.bed.gz
gzip -c wgEncodeRegDnaseClusteredInputsV2.tab > downloads/wgEncodeRegDnaseClusteredInputsV2.tab.gz

# use (existing) ENCODE DCC downloads dir
set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegDnaseClustered
cd $dload

# call existing stuff release1 (use ENCODE-style downloads scheme)
mkdir release1
ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz release1
cp README.txt md5sum.txt files.txt release1

# new release dir
mkdir release2
ln -s release2 beta
ln -s release2 releaseLatest
cd $dir/downloads
ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz $dload
cd $dload
ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz releaseLatest
# include older files as well
ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz releaseLatest
cp README.txt releaseLatest
cd releaseLatest
md5sum *.gz > md5sum.txt
# vim README.txt
cp README.txt md5sum.txt ..
# add minimal metaObject to metaDb to associate new table with (pseudo-)composite
# wgEncodeRegDnaseClustered -- this allows hgTrackUi to locate downloads
cat > wgEncodeRegDnaseClustered.ra << 'EOF'
metaObject wgEncodeRegDnaseClusteredV2
objType table
composite wgEncodeRegDnaseClustered
fileName wgEncodeRegDnaseClusteredV2.bed.gz
tableName wgEncodeRegDnaseClusteredV2
project wgEncode
'EOF'

#########################################################################
# Preliminary/Experimental/Older work below
#########################################################################

# ENCODE Regulation track V2:  Track #7526 (In progress 2012-06-20 kate)
# This version will have:
# * Replace Tx with CSHL stranded data, pooled (JK advising, also AWG use)
#       http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformRNASignal.txt
#   These are from DCC wgEncodeCshlLongRnaSeq downloads (Long polyA+, whole cell),
#   AWG did not pool reps or plus/minus for their hub (but we will)
#   AWG has filtered for bad regions, so that filtering is no longer necessary
# * Possibly a 10th cell type (HMEC) for Tx and Histone
#   Note: single replicate for HMEC Tx (others have 2)
# * Two additional cell lines for Histones (Hela, HepG2)
#   - Note Hela & HepG2 already added to Tx track (July 2012 release)
# * Histones from uniform processing: H3K4me1, H3K4me3, H3K27ac
#   (http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/)
#       wgEncodeBroadHistone{cell}{histone}StdAln_?Reps.norm5.rawsignal.bw
# * Add latest TF's, and use AWG uniform peak calls
# * Add latest DNase calls (possibly use AWG uniform)
#   (http://www.uwencode.org/public/rthurman/encode_integrated_open_chrom/data/dataHub)
#   AWG trackDb: http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
#   - can use subGroups line to determine cell type, but need to take care
#     with treatments, which are in the filename but not in subGroups
#   - NOTE: AWG includes Duke data processed like UW.  Where both
#     labs had data, they are merged.  View is .01FDR peaks (narrow, bigBed 6+)
#   Previous track: 74 cell types (2 reps), all UW.  This version: 125 cell types,
#     pooled replicates, merged UW+Duke
# Also considering adding new data types from the AWG hub:
#   - Short RNA's
#   - AWG uniform RNA contigs
#     (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/rna_contigs/)
#   - Segmentations (chromHMM, segWay, or combined)
#     (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/segmentations)
#   - FAIRE (cell-specific, union over 25 cell types)
#     (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/faire/bigBed/)

# NOTE: following Jim's methods from initial version

# make root dir and link in our bad.bed file for filtering out
# blacklisted and multiple mapping regions.
cd /hive/data/genomes/hg19/bed
mkdir wgEncodeRegV2
cd wgEncodeRegV2
ln -s /hive/data/genomes/hg19/bed/wgEncodeReg/badRegions.bed .
#(was /hive/users/kent/regulate/companion/mapping/bad.bed)

######
# Small RNA's
# For signal, pool strands and replicates
######
cd /hive/data/genomes/hg19/bed/
mkdir -p wgEncodeRegV2/shortRna/pooled
cd wgEncodeRegV2/shortRna/pooled
cat > list.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hmec Hsmm Huvec K562 Nhek Nhlf)
    echo "processing ${cell}"
    ls /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotal*.bigWig
end
'EOF'
csh list.csh >&! list.out &
# There are 6 cell types overlapping with the histones and long RNAs, plus 4 other tier 2's.
cat > pool.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 K562 Nhek A549 Cd20 Imr90 Mcf7)
    echo "processing ${cell}"
    bigWigMerge -adjust=-1 \
        /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep1.bigWig \
        /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep2.bigWig \
        /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep1.bigWig \
        /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep2.bigWig \
        wgEncodeRegTxnCshlShortRnaSeq${cell}CellTotalRawSigPooled.bedGraph
end
'EOF'
csh pool.csh >&! pool.out &
# got 8 cells, two failed -- CD20 and IMR90; may be rescuable, but continuing
# for now with the others to see if it's worth it.

cat > big.csh << 'EOF'
foreach i (*.bedGraph)
    echo processing $i
    bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end
'EOF'
csh big.csh >&! big.out &

# add to database
set dir = /gbdb/hg19/bbi/encodeRegV2
mkdir $dir
ln -s `pwd`/*.bw $dir
foreach i (*.bw)
    hgBbiDbLink hg19 $i:r $dir/$i
end

######
# Handle the RNA seq, starting by pooling it and filtering out multiple
# mapping and blacklisted regions.
######
cd /hive/data/genomes/hg19/bed/
mkdir -p wgEncodeRegV2/txn/pooled
cd wgEncodeRegV2/txn/pooled
cat > poolTx.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf)
    echo "processing ${cell}"
    bigWigMerge -adjust=-1 \
        /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \
        /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep2.bigWig \
        /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \
        /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep2.bigWig \
        wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph
end
'EOF'
csh poolTx.csh >&! poolTx.out &
# ~8 minutes per cell type
# NOTE: 25 chroms in Hepg2, H1hesc, Huvec, Nhlf (male or unknown sex)

# Add in HMEC, which has single replicates
cat > poolHmec.csh << 'EOF'
foreach cell (Hmec)
    echo "processing ${cell}"
    bigWigMerge -adjust=-1 \
        /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \
        /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \
        wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph
end
'EOF'
csh poolHmec.csh >&! poolHmec.out &

# Normalize.  This takes two passes.  The first creates
# a tab-separated file with normalization values.  A batch file is made
# by running awk on the tab file, and this batch file executes the
# second half, transforming the bedGraphs to normalized values.  The
# normalization standardizes all files as if the total signal integrated to 10
# billion over the genome.
rm -f ../norm.tab
cat > makeNorm.csh << 'EOF'
foreach i (*SigPooled.bedGraph)
    echo processing $i
    echo -n $i " " >> ../norm.tab
    awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab
end
awk '{printf("echo %s; colTransform 4 %s 0 %g ../normalized/%s\n", $1, $1, $2, $1);}' ../norm.tab > ../norm.csh
'EOF'
csh makeNorm.csh >&! makeNorm.out &
# ~2 mins per cell type
mkdir ../normalized
csh ../norm.csh >&! ../norm.out &
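# (Worked example of the factors above, not from the original build: a pooled
# bedGraph whose signal integrates to 2.5e9 gets the factor 1e10/2.5e9 = 4, so
# norm.csh runs "colTransform 4 <file> 0 4 ../normalized/<file>", i.e. column 4
# is multiplied by 4 with nothing added.)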
#### STOPPED HERE

cd ../normalized
# make bigWigs
cat > big.csh << 'EOF'
foreach i (*.bedGraph)
    echo processing $i
    bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end
'EOF'
csh big.csh >&! big.out &

# add to database
set dir = /gbdb/hg19/bbi/encodeRegV2
mkdir $dir
ln -s `pwd`/*.bw $dir
foreach i (*.bw)
    hgBbiDbLink hg19 $i:r $dir/$i
end

######
# Possible new track: FAIRE Peaks Union across 25 cells, from AWG
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p faire/peaks
cd faire/peaks
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformFaire.txt
sed -n 's/bigDataUrl /wget /p' uniformFaire.txt | grep Union > wget.csh
csh wget.csh >&! wget.out &
mv *Union*.bb encodeAwgFairePeaks25CellUnion.bb
bigBedToBed encodeAwgFairePeaks25CellUnion.bb encodeAwgFairePeaks25CellUnion.bed
# head -1 *.bed
#   chr1    10033   10376
# bed 3
hgLoadBed hg19 encodeAwgFairePeaks25CellUnion encodeAwgFairePeaks25CellUnion.bed
# Read 2369615 elements of size 3 from encodeAwgFairePeaks25CellUnion.bed

######
# Create the DNAse peak clusters subtrack.
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p dnase/peaks
cd dnase
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh
csh wget.csh >&! wget.out &
# 125 files (.bb), dated Nov 22, 2011
# Sample case had ~150K peaks (by bigBedInfo)
# These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats
#   $ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head
#   chr1    10180   10330   .       0       .       26.000000       -1      -1      -1
# NOTE: Steve Wilder is planning to add scores and make formats of these files more uniform

# rename to UCSC style and convert to bed: wgEncodeAwgDnase*Peak.bed
ls *.fdr01peaks.hg19.bb | \
    perl -wpe 's/(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)/bigBedToBed $1$2$3$4$5 $1Awg$3\u\L$2\E\u\L$4\EPeak.bed/' > unbig.csh
# convert to bed and rename
csh unbig.csh >&! unbig.out
# udc couldn't read 4 bytes from wgEncodeUWDukeDnaseHepG2.fdr01peaks.hg19.bb, did read 0
# This dataset was truncated
# Repeated wget and bigBedToBed for HepG2 (8/23)
ls -1 *AwgDnase*.bed > peak.lst

# Datasets with treated cells (hints from long label)
grep '(' uniformDnase.txt | grep longLabel > treated.txt
wc -l treated.txt
# 7 datasets
# Tamoxifen -> Tam10030 (Duke) or 40HAM20nM72hr (UW), Estradiol -> Est10nm30m, Ifna4h, Andro, Hypox
# TODO: Edit to rename
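# (Hedged illustration of the rename TODO above -- not run as part of the build.
# For each of the 7 treated datasets, the treatment code would be spliced into
# the UCSC-style name produced by unbig.csh; the file name below is a
# hypothetical example only, and the real pairings should be read off
# treated.txt first.)
mv wgEncodeAwgDnaseDukeMcf7Peak.bed wgEncodeAwgDnaseDukeMcf7Tam10030Peak.bed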
# generate a normalization factor to
# convert signalValue to score (0-1000).  Also note score field; narrowPeak is 7 (default)
# output format:
#   0 1 2
#   wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 .
regClusterMakeTableOfTables awgDnase01 peak.lst peak.table -verbose=3 >&! regTable.out &

# Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster)
regCluster peak.table /dev/null peak.bed
# Read 125 sources from peak.table
# 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes

# filter out low scoring, single peaks in cluster and load in database
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed
wc -l wgEncodeRegDnaseClusteredV2.bed
hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed
# Read 1281988 elements of size 5 from wgEncodeRegDnaseClusteredV2.bed

# load individual peak files as a track (needed by hgc)
# TODO: generate labels out of metadata (a hedged sketch follows this section)
cat > loadDnase.csh << 'EOF'
set b = wgEncodeAwgDnase
set t = ${b}Peak
foreach f (${b}*)
    set table = $f:r
    hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \
        -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f
    set cell = `echo $f | perl -wpe 's/(wgEncodeAwgDnase[A-Z][a-z]+)(.*)Peak.bed/$2/'`
    echo "    track $table" >> $t.ra
    echo "    type narrowPeak" >> $t.ra
    echo "    parent $t" >> $t.ra
    echo "    shortLabel $cell DNase" >> $t.ra
    echo "    longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t.ra
    echo "    subGroups cellType=$cell treatment=None" >> $t.ra
    echo "" >> $t.ra
end
'EOF'
# NOTE: V1 had ~1M clusters.
# Other data tables: Need to load the peak files as they are used by hgc
# These should probably also be subtracks.
# It would be nice if hgc sorted datasets by signal or cellType

# Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting)
#   was: objName, cellType, treatment, replicate, lab, dateUnrestricted
#   For V2, AWG is providing pooled replicates, and sometimes pooled labs.  Also,
#   this data has all passed the timestamp restriction, so replicate and
#   dateUnrestricted don't need to be shown.
#   Now: objName, cellType, treatment, lab(s)
# DISPLAY: Consider item name change to % assays represented instead of count.
# Also, could color item name label to highlight ubiquitous vs. cell-type-specific
# (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?)
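# (Hedged sketch for the "generate labels out of metadata" TODO above, not part
# of the original build: pull cell and treatment for the AWG uniform peaks from
# metaDb, in the same style as the inputs-table query, and emit
# shortLabel/longLabel text from those values instead of parsing the cell out
# of the file name.  Mapping the metaDb cell value back onto the converted
# file/table names is left out here.)
mdbQuery out=tab "select obj,cell,treatment from hg19 where obj like 'wgEncodeAwgDnase%UniPk' and view='Peaks'" \
    | awk -F '\t' '{
        label = ($3 == "None" || $3 == "n/a" || $3 == "") ? $2 : $2 " " $3;
        printf("    shortLabel %s DNase\n    longLabel %s DNaseI HS Uniform Peaks from ENCODE/Analysis\n", label, label);
      }'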
######
# Uniform histone signal for overlay wiggles
# Note: there are UW signal files as well -- omitting these for consistency,
# Check if merging would be a better option
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p histone/signal
cd histone/signal
cat > getHistones.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf Hmec)
    foreach histone (H3k4me1 H3k4me3 H3k27ac)
        set var = "wgEncodeBroadHistone${cell}${histone}"
        echo $var
        curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/${var}StdAln_\[1-3\]Reps.norm5.rawsignal.bw
    end
end
'EOF'
csh getHistones.csh >&! getHistones.out &

# save real files (>1k size) as curl creates info files when match used
mkdir sav
find . -name '*.bw' -a -size +1k -exec mv '{}' sav \;
rm *.bw
mv sav/* .
rmdir sav

# note: missing Hela and HepG2 H3K4me1 - these have a different filename pattern (check with Anshul why)
# use curl to retrieve by pattern match as we don't have read perms on the directory.
# Then remove non-useful files
curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHelas3H3k4me1Std_\[1-3]Reps.norm5.rawsignal.bw
# good one is '*Std_1Rep', remove 2Rep and 3Rep
rm *Hela*Std_[2-3]*
wget http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHepg2H3k4me1Std_1Reps.norm5.rawsignal.bw

######
# TFBS from Jan2011 freeze
######
# The TFBS track is a specialized type: factorSource.  Requires 3 tables:
#   track table: wgEncodeRegTfbsClustered (bed15)
#   source table: wgEncodeRegTfbsCells
#       Maps cell type (including treatment) to single letter
#       Currently, there are fixed unique assignments for Tier1 and 2.
#       These are all upper case.
#       Tier 3 just use first char of cell name, lower-cased.  Not unique at all!
#       Looks like only rows 2 and 3 of the table are actually used/informative
#       Ex. row:  0  G  GM12878  n/a  n/a  n/a  3  n/a,n/a,GM12878,
#   inputTrack (metadata) table: wgEncodeRegTfbsClusteredInputs
# DISPLAY: Better with target protein instead of antibody ?
# DISPLAY: Consider item showing letter for all experiments, with name colored (brown ?)
#   if this peak was found in the cell type
# DISPLAY: fix ordering of cell letters -- they should be sorted
#   (in cellLetter.tab first, so the Exps file has correct id's)
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p tfbs/peaks
cd tfbs/peaks
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformTfbs_nocontrols.txt
# this file has metadata
# there are 3 'method' values: SPP, PeakSeq, wiggler
# there are 2 'quality' values: Good, Caution
# There are 144 antibodies (for 122 factors), 75 cell types, 95 cell types plus treatments, 457 datasets
# Version 1 of this track had 148 antibodies, 67 cell types, 95 cell type+treatments, 486 datasets
#   -> basically the same data, but cleaned up
sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep Tfbs > wget.csh
sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep -v Tfbs > wget2.csh
csh wget.csh >&! wget.out &
# 416 files
csh wget2.csh >&! wget2.out &
# 41 (have Histone in filename)
# sample filename: spp.optimal.wgEncodeUwTfbsHcfaaCtcfStdAlnRep1_VS_wgEncodeUwTfbsHcfaaInputStdAlnRep1.bb

# from hub trackDb, metadata is in the subGroups line:
#   subGroups tier=3 cellType=HCFaa factor=CTCF antibody=CTCF lab=UW treatment=None protocol=Std quality=Good method=SPP
# Note 'quality'.  Think about dropping those with quality='Caution'.  Unfortunately some
#   Tier1, esp. Pol2, are only available with this quality
# Q: What does Rep1 mean ?  Most are 'Rep0'  (Answer from Anshul -- Rep0 are pooled, Rep1 were single-rep but adequate qual)
#   (Jim's methods excluded those not based on 2 replicates)
# Q: Why are some datasets 'StdAln' and some are 'Aln' ?  (Answer from Anshul -- names are from UCSC)
# Q: How is quality assigned ?

# rename to UCSC style and convert to bed: wgEncodeAwgTfbs*Peak.bed
cd ..
ls peaks/spp.optimal.*.bb | \
    perl -wpe 's^peaks/(spp.optimal.)(wgEncode)(\S+)(Tfbs)(\S+)(StdAln|Aln)(Rep\d_VS\S+)^bigBedToBed peaks/$1$2$3$4$5$6$7 peaks/$2Awg$4$3$5Peak.bed^' > unbig.csh
# convert to bed and rename
csh unbig.csh >&! unbig.out &
# NOTE: ~10% (49 of 457 total) are dupes from different labs
#   Factors are: CFOS, CMYC, CTCF, E2F6, EBF1, FOXA1, GATA2, JUND, MAFK,
#   MAX, POL2, RAD21, SRF, YY1
ls -1 peaks/*.bed > peak.lst
And there are more data sets (708 vs. 495) # Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering # (as in previous track) ###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p tfbs2012/peaks cd tfbs2012/peaks ls -l filelist.txt # 708 filelist.txt # Downloaded from # set user = encode-box-01 # set pwd = enc*deDOWN wget -r -nd -np --ftp-user=$user --ftp-password=$pwd ftp://ftp-private.ebi.ac.uk/byDataType/peaks/june2012/spp/optimal # using the user name and password in the ENCODE wiki at: # http://encodewiki.ucsc.edu/EncodeDCC/index.php/Notes_on_using_the_Aspera_client_(and_EBI_ftp) # These are also available at: # http://www.ebi.ac.uk/~anshul/public/encodeRawData/peaks_spp/mar2012/distinct/idrOptimalBlackListFilt # NOTE: quality metrics are available here: # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7 # -> Check with Anshul for list of data sets to exclude from clustering based on quality. # For now, use all. # sample filename: # spp.idrOptimal.bf.wgEncodeSydhTfbsHct116Pol2UcdAlnRep0.bam_VS_wgEncodeSydhTfbsHct116InputUcdAlnRep1.bam.narrowPeak.gz cd .. ls peaks/*.narrowPeak.gz > peak.lst # extract UCSC object name from peaks file name, and generate a per-file normalization factor to # convert signalValues to scores (0-1000). # Also note score field position; for narrowPeak this is 6 (default) # output format: # 0 1 2 # (fields 2, 3, 4 are unused) regClusterMakeTableOfTables ans02 peak.lst partial.table -verbose=3 >&! regTable.out & wc -l partial.table # 708 head -1 partial.table # peaks/spp.idrOptimal.bf.wgEncodeBroadHistoneDnd41CtcfAlnRep0.bam_VS_wgEncodeBroadHistoneDnd41ControlStdAlnRep0.bam.narrowPeak.gz 0 1 2 6 4.52593 wgEncodeBroadHistoneDnd41CtcfAlnRep1 # add metadata from metaDb or as fallback, from parsing filename regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table -verbose=3 # failed in 1 object: # Can't get metadata for wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1 from metaDb, parsing filename # UCSC metadata has Rep3 for pcr2x protocol, but V0415101 for others ? # Tweak this one as special case: # TODO: revisit this sed 's/wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1/wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep3/' partial.table > partial.fixed.table # cluster by antibody, and by target regClusterAttachMetadataToTableOfTables hg19 partial.fixed.table withDupes.table -verbose=3 regClusterAttachMetadataToTableOfTables -antibodyTarget hg19 partial.fixed.table withDupes.target.table -verbose=3 # find dupes in target-based table (already done for antibody-based) # 91 cell types, 201 antibodies (177 targets), 28 treatments, 9 labs # limit to single cell/factor/treatment (remove cross-lab duplication). # Consider pooling instead ?? Or picking based on quality metrics ? # -> wait for Anshul's recommendations (he will summarize) -- now they are too # hard to compare: # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7 # eg., there are GM12878-CTCF from 4 labs, and all have some error highlighting # Reusing Jim's priorities (Broad, Snyd, OpenChrom, Haib); I'm ranking UW just below OpenChrom (Kate) # Also, when multiple from a single lab prioritizing what appear to be newer controls & protocols: # HAIB: V, PCR?x # SYDH: iggrab, iggmus, std # One more complication - there are often several cases where there is # more than one data set on the same cell+treatment+antibody combination. # For good reasons hgBedToBedExps only wants one. 
# One more complication - there are often several cases where there is
# more than one data set on the same cell+treatment+antibody combination.
# For good reasons hgBedsToBedExps only wants one.
# This next bit filters them out based on a picks file that was hand
# created - picksFromDupes.  This was based on the criteria that
# the Broad CTCF were used, and that Snyd was used in preference
# to OpenChromatin, which in turn was used in preference over Haib.
# The rationale for this is that the Broad CTCF set has been in use for a
# while with no complaints, and when examining the Pol2 signals, which
# were with CTCF the main component of the dupes, the Haib were the
# weakest and the Snyd the strongest.
cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab .
wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table
wc -l peak.table
# 656
# reduced by 52

# Make three column file: fileName, cell+treatment, antibody
awk -F '\t' '{printf("%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9);}' peak.table > fileCellAb.tab
awk -F '\t' '{printf("%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9);}' withDupes.target.table > fileCellAb.target.tab

# Create config file for the clustering job, using a table assigning
# cell lines+treatments to single letter codes edited by hand.
# Find cell types that aren't yet in the cellLetter file:
cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab cellLetter3.tab
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab -noLetter >&! noLetter.out &
sort noLetter.out | uniq | awk '{print $2}' > noLetter.txt
wc -l noLetter.txt
# 43 new cell type+treatment combos
cat noLetter.txt >> cellLetter3.tab
# hand-edit in codes, promoting Tier 2.5 to caps (A549, MCF-7, SK-N-SH, IMR90)
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab
regClusterBedExpCfg -tabList fileCellAb.target.tab cluster.target.cfg -cellLetter=cellLetter3.tab -noLetter

# Do the actual clustering
hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps >&! beds2exps.out
# 17 errs (dupes detected) --> add these to picksFromDupes2 and hand edit
# NOTE: 4 of these were from the same lab, different protocols, etc.:
#   USF-1, A549+EtOH; NRSF, HepG2; NRSF, SK-N-SH; Pol2(phosphS2), K562
#   Leaving these out for now (hand-editing peak.table), saving in file dupesBad.txt
cat picksFromDupes.tab beds2exps.out | sort > picksFromDupes2.tab
wc -l picksFromDupes2.tab
# 52
wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes2.tab peak.table

# Prune above 4 dupes, and redo config & clustering
wc -l peak.table
# 635
awk -F '\t' '{printf("%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9);}' peak.table > fileCellAb.tab
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter3.tab
hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps > beds2exps2.out
# Loaded 635 records from cluster.cfg
# Got 131 sources
# Got 201 factors

# Load database
hgLoadBed hg19 wgEncodeRegTfbsClusteredV3 peak.bed
# Read 4735242 elements of size 15 from peak.bed
# was 2750490 in previous version
hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV3 ~/kent/src/hg/lib/expRecord.sql peak.exps
awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table > wgEncodeRegTfbsClusteredInputs3.tab hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV3 ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputsV3.tab ####################################################################### # continue with target-based clustering hgBedsToBedExps -dupeLetterOk cluster.target.cfg peak.target.bed peak.target.exps >&! beds2exps.target.out wc -l beds2exps.target.out # 95 dupes # NOTE: this was too laborious by hand. When redone, script to list pairs in date submitted/resubmitted or experiment ID order, to ease picking later experiment # EXPERIMENTAL: TF motif tracks, from EMBL/Aus (MEME toolkit) # These are generated using FIMO tool: # Charles E. Grant, Timothy L. Bailey, and William Stafford Noble, "FIMO: Scanning for occurrences of a given motif", Bioinformatics 27(7):10171018, 2011 # http://bioinformatics.oxfordjournals.org/content/early/2011/02/16/bioinformatics.btr064.full # http://meme.ebi.edu.au/meme/cgi-bin/fimo.cgi mkdir /cluster/data/hg19/bed/tfbsFimo cd /cluster/data/hg19/bed/tfbsFimo hgLoadBed hg19 tfbsFimoJaspar TfMotifsFIMO_Jaspar.hg19.bed # Read 1386302 elements of size 6 from TfMotifsFIMO_Jaspar.hg19.bed hgLoadBed hg19 tfbsFimoTransfac TfMotifsFIMO_Transfac.hg19.bed # Read 3665198 elements of size 6 from TfMotifsFIMO_Transfac.hg19.bed #######################################################################