# for emacs: -*- mode: sh; -*-

#############################################################################
# ENCODE Analysis Working Group (AWG) tracks on hg19
#
# Data in these tracks are generated by uniform processing pipelines that merge replicates
# and apply consistent analysis across different data production labs, so they are
# of greater interest to most users (and easier to use) than the ENCODE lab tracks.
#
# Wiki pages for AWG:

#############################################################################
# Uniform DNase peaks (2013-1-23 DONE kate)
#
# Note - this work started in conjunction with update of ENCODE Regulation super-track
# as it is needed for the DNase Clusters track.
#
# Bob Thurman at UW ENCODE (Stam lab) generated this data by running the UW HotSpot
# pipeline on UW and Duke datasets (aligned reads).
# It is currently hosted on the ENCODE Analysis Hub.
# Note: DNase signal tracks from AWG are also available on the hub -- these
# were generated from aligned reads using the wiggler pipeline,
# without any replicate or lab merging.

    cd /hive/data/genomes/hg19/bed
    mkdir -p wgEncodeAwgDnaseUniform
    # work inside the build dir (later steps refer to
    # /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeak)
    cd wgEncodeAwgDnaseUniform

# Track composite will be wgEncodeAwgDnaseUniform

# get track description, trackDb, and files from EBI site that hosts analysis hub
    wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html
    wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
    cp uniformDnase.html ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeAwgDnaseUniform.html

    # turn every bigDataUrl line of the hub trackDb into a wget command
    sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh
    mkdir bb narrowPeak
    cd bb
    csh ../wget.csh >&! ../wget.out &
    # 125 files (.bb), dated Nov 22, 2011
    # Sample case had ~150K peaks (by bigBedInfo)
    # These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats
    #$ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head
    # chr1 10180 10330 . 0 . 26.000000 -1 -1 -1

# rename to UCSC style and convert to bed: wgEncodeAwgDnaseUniPk.narrowPeak
    # the perl substitution emits one bigBedToBed command per .bb file; the output
    # name is built as wgEncodeAwgDnase<Lab><Cell>UniPk.narrowPeak
    ls *.fdr01peaks.hg19.bb | \
        perl -wpe 's^(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)^bigBedToBed $1$2$3$4$5 ../narrowPeak/$1Awg$3\u\L$2\E\u\L$4\EUniPk.narrowPeak^' > ../unbig.csh

# convert to bed and rename
    csh ../unbig.csh >&! ../unbig.out &
    # unbig.out is empty, all went OK

    # narrowPeak is a sibling of bb (both were created by the mkdir above)
    cd ../narrowPeak

# loadDnase.csh: for each narrowPeak file, load it as a table, gzip it, and append
# a trackDb stanza to ../wgEncodeAwgDnase.ra and a metaDb stanza to
# ../wgEncodeAwgDnase.metaDb.ra.
# NOTE(review): the metaDb stanza records "composite $b" (wgEncodeAwgDnase) while the
# trackDb parent is $v (wgEncodeAwgDnaseUniform) -- presumably corrected during the
# hand edits described below; confirm.
    cat > ../loadDnase.csh << 'EOF'
set b = wgEncodeAwgDnase
set v = ${b}Uniform
set t = ../$b.ra
set m = ../$b.metaDb.ra
rm -f $t $m
#gunzip *.gz
foreach f (${b}*.narrowPeak)
    set table = $f:r
    hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f
    gzip $f
    set md5sum = `md5sum $f.gz | awk '{print $1}'`
    set lab = `echo $table | perl -wpe 's/(wgEncodeAwgDnase)([A-Z][a-z]+)(.*)UniPk/$2/'`
    set cell = `echo $table | perl -wpe 's/(wgEncodeAwgDnase)([A-Z][a-z]+)(.*)UniPk/$3/'`
    echo " track $table" >> $t
    echo " type narrowPeak" >> $t
    echo " parent $v" >> $t
    echo " shortLabel $cell DNase" >> $t
    echo " longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t
    echo " subGroups cellType=$cell treatment=None lab=$lab view=Peaks" >> $t
    echo "" >> $t
    echo " metaObject $table" >> $m
    echo " objType table" >> $m
    echo " cell $cell" >> $m
    echo " treatment None" >> $m
    echo " composite $b" >> $m
    echo " dataType DnaseSeq" >> $m
    echo " dataVersion ENCODE Jan 2011 Freeze" >> $m
    echo " dccAccession X" >> $m
    echo " project wgEncode" >> $m
    echo " tableName $table" >> $m
    echo " fileName $f.gz" >> $m
    echo " md5sum $md5sum" >> $m
    echo " view Peaks" >> $m
    echo "" >> $m
end
'EOF'

    csh ../loadDnase.csh >&! ../loadDnase.out &

    # NOTE: error loading wgEncodeAwgDnaseDukeHuh7.5UniPk due to presence of '.'
    # I didn't catch this till much later... renaming to *Huh75*
    hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 wgEncodeAwgDnaseDukeHuh75UniPk wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak
    # Reading wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak
    # Read 182986 elements of size 10 from wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak
    # Sorted
    # Creating table definition for wgEncodeAwgDnaseDukeHuh75UniPk
    # Saving bed.tab
    # Loading hg19

    set trackDb = ~/kent/src/hg/makeDb/trackDb/human/hg19
    cp wgEncodeAwgDnaseUniform.ra $trackDb
    # merge in composite settings from analysis hub, edit labels for length, treatment
    # note: raToTab and google spreadsheet greatly help with 17-char shortLabel adjustment
    # We could use a tabToRa too...

    cp wgEncodeAwgDnase.metaDb.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.ra
    # $trackDb already ends in human/hg19, so metaDb/alpha hangs directly off it
    pushd $trackDb/metaDb/alpha
    # add composite settings (as in ENCODE UW Dnase track)
    # edit cells, treatments to match CV.
mdbUpdate hg19 -test wgEncodeAwgDnaseUniform.ra mdbUpdate hg19 wgEncodeAwgDnaseUniform.ra popd # obtain lab and DCC accession from metaDb rm -f accession.metaDb.ra mdbPrint hg19 -vars="lab=UW replicate=1 dataType=DnaseSeq view=Peaks" | egrep 'metaObject|objType|lab |cell|treatment|dccAccession|dataVersion|^$' | sed -e 's/wgEncodeUwDnase/wgEncodeAwgDnaseUw/' -e 's/PkRep1/UniPk/' > accession.metaDb.ra mdbPrint hg19 -vars="lab=Duke dataType=DnaseSeq view=Peaks" | egrep 'metaObject|objType|lab |cell|treatment|dccAccession|dataVersion|^$' | sed -e 's/wgEncodeOpenChromDnase/wgEncodeAwgDnaseDuke/' -e 's/Pk/UniPk/' >> accession.metaDb.ra raMerge accession.metaDb.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.ra > metaDb.new.ra mv metaDb.new.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.new.ra # note -- could have used above to get CV cell type as well (I did this manually) # Add object for composite (copy from UW DNase) # edit to remove orphan stanzas, create wgEncodeAwgDnaseUniform.ra # load and print with magic number pushd $trackDb; make; popd mdbPrint hg19 -vars="composite=wgEncodeAwgDnaseUniform" > wgEncodeAwgDnaseUniform.ra # check in to source control # set up file downloads in expected locale mkdir downloads set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19 # must use name of composite mkdir -p $dir/wgEncodeAwgDnaseUniform cd $dir/wgEncodeAwgDnaseUniform # setup downloads area in ENCODE 2 standard fashion (as on encodewiki 'Data Wrangler HOWTO') mkdir release1 ln -s release1 beta ln -s release1 releaseLatest cd /hive/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeaks cat << 'EOF' > link.csh foreach f (*.gz) echo $f ln $f /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgDnaseUniform/releaseLatest end 'EOF' csh link.csh cd $dir/wgEncodeAwgDnaseUniform/releaseLatest # add files.txt, md5sum.txt encodeMkFilesList hg19 ln md5sum.txt releaseLatest # add README sed 's/wgEncodeUwDnase/wgEncodeAwgDnaseUniform/' $dir/wgEncodeUwDnase/README.txt > 
$dir/wgEncodeAwgDnaseUniform cp $dir/wgEncodeUwDnase/README.txt $dir/wgEncodeAwgDnaseUniform/releaseLatest # get file list for pushQ cd ~/kent/src/hg/makeDb/doc/encodeDccHg19 /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgDnaseUniform 1 > wgEncodeAwgDnaseUniform.release1.notes # QA preview (vs. redmine issue) encodeQaInit hg19 wgEncodeAwgDnaseUniform 1 9850 -t ############################################################################# # Uniform TFBS (2013-1-24 IN PROGRESS kate) # # This track is based on data from the March 2012 ENCODE data freeze, processed by # Anshul Kundaje's pipeline (https://sites.google.com/site/anshulkundaje/projects/idr). # This round of uniform processing has a slightly more relaxed threshold (2% FDR vs 1% based on # input Anshul received from users than used for the Jan 2011 data freeze (Analysis pubs and # and UCSC Txn Factor ChIP-seq V2). And there are more data sets (708 vs. 495) # # Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering # (as in previous track). cd /hive/data/genomes/hg19/bed mkdir -p wgEncodeAwgTfbsUniform cd wgEncodeAwgTfbsUniform set build = `pwd` mkdir ebi; cd ebi # set user = encode-box-01 # set pwd = enc*deDOWN wget -r -nd -np --ftp-user=$user --ftp-password=$pwd ftp://ftp-private.ebi.ac.uk/byDataType/peaks/june2012/spp/optimal # using the user name and password in the ENCODE wiki at: # http://encodewiki.ucsc.edu/EncodeDCC/index.php/Notes_on_using_the_Aspera_client_(and_EBI_ftp) # These are also available at: # http://www.ebi.ac.uk/~anshul/public/encodeRawData/peaks_spp/mar2012/distinct/idrOptimalBlackListFilt # NOTE: quality metrics are available here: # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7 # -> Check with Anshul for list of data sets to exclude from clustering based on quality. # For now, use all. 
# sample filename: # spp.idrOptimal.bf.wgEncodeSydhTfbsHct116Pol2UcdAlnRep0.bam_VS_wgEncodeSydhTfbsHct116InputUcdAlnRep1.bam.narrowPeak.gz ls -l filelist.txt # 708 filelist.txt cd .. # extract UCSC object names (experiment and control) from peaks file name ls ebi/*.narrowPeak.gz > peak.lst regClusterMakeTableOfTables ans02 peak.lst partial.table -verbose=3 >&! regTable.out & # attach metadata, by antibody and by target #regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table -verbose=3 #regClusterAttachMetadataToTableOfTables -antibodyTarget hg19 partial.fixed.table \ #withDupes.target.table -verbose=3 # NOTE: partial.fixed.table corrected HAIB GM12878 Egr-1 to Rep3, but lacks controls # 91 cell types, 201 antibodies (177 targets), 28 treatments, 9 labs ###################### # unzip and rename to UCSC style : wgEncodeAwgTfbs[]UniPk.narrowPeak # load table # create trackDb and metaDb entries # zip and md5sum # save cellType and antibody targets for subGroups at the end cat > nameTable.csh << 'EOF' # create table name for an experiment object set base = $1 set exp = $2 set vars = `echo $exp | \ sed -e 's/Control.*AlnRep1//' -e 's/StdAlnRep1//' -e 's/AlnRep1//' -e 's/wgEncode//' \ -e 's/AlnRep3//' -e 's/Tfbs//' -e 's/BroadHistone/Broad/' -e 's/OpenChromChip/Uta/'` set table = ${base}${vars}UniPk echo $table 'EOF' cat > loadOne.csh << 'EOF' # load table set infile = $1 set exp = $2 set outdir = $3 set base = $4 set table = `csh nameTable.csh $base $exp` hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \ -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $infile # return value 'EOF' cat > makeTag.csh << 'EOF' echo $1 | perl -wpe 's/[^a-z0-9]+//ig; s/(.*)/\U$1/' 'EOF' cat > trackOne.csh << 'EOF' #create track set exp = $1 set composite = $2 set base = $3 set cellFile = $4 set factorFile = $5 # get essential metadata set meta = `mdbPrint hg19 -obj=$exp | \ raToTab 
stdin -cols=lab,cell,antibody,treatment stdout` if ($status != 0) then echo "ERROR: $exp" continue endif set lab = $meta[1] set cell = $meta[2] set tier = `encodeCvTerm cell $cell tier` # color subtracks with tier 1 and 2 cells if ($tier != "3") then set color = `encodeCvTerm cell $cell color` if ($status != 0) then unset color endif endif set cellTag = `csh makeTag.csh $cell` set antibody = $meta[3] set factor = `encodeCvTerm antibody $antibody target` if ($#meta == 4) then set treatment = $meta[4] endif echo "$exp $lab $cell $tier $antibody $treatment" set factorTag = `csh makeTag.csh $factor` echo "$factorTag=$factor" >> $factorFile if ($tier != "3") then set cellTag = "a${tier}${cellTag}" echo "$cellTag=$cell (Tier_${tier})" >> $cellFile else echo "$cellTag=$cell" >> $cellFile endif set table = `csh nameTable.csh $base $exp` set trackDb = ${composite}.ra echo " track $table" >> $trackDb # first cut at default tracks on if ($tier != "3" && \ ($factor == "CTCF" || $factor == "JUN" || $factor == "E2F4" || $factor == "MYC" )) then echo " parent $composite" >> $trackDb else echo " parent $composite off" >> $trackDb endif if ($?treatment && $treatment != "None") then set treatmentLabel = " ($treatment)" else set treatmentLabel = "" endif echo " shortLabel ${cell}${treatmentLabel} $factor" >> $trackDb echo " longLabel $cell${treatmentLabel} TFBS Uniform Peaks of ${antibody} from ENCODE/${lab}/Analysis" >> $trackDb echo " subGroups tier=a${tier}0 cellType=$cellTag factor=$factorTag lab=$lab">> $trackDb if ($?color) then echo " color $color" >> $trackDb endif echo "" >> $trackDb 'EOF' cat > metaOne.csh << 'EOF' # create metadata file set exp = $1 set composite = $2 set base = $3 set table = `csh nameTable.csh $base $exp` set metaDb = ${composite}.metaDb.ra # get essential metadata set meta = `mdbPrint hg19 -obj=$exp | \ raToTab stdin -cols=lab,cell,antibody,treatment stdout` if ($status != 0) then echo "ERROR: $exp" continue endif set lab = $meta[1] set cell = $meta[2] 
set antibody = $meta[3] set factor = `encodeCvTerm antibody $antibody target` if ($#meta == 4) then set treatment = $meta[4] else set treatment = None endif # now the control info (very inconsistent) set meta = `mdbPrint hg19 -obj=$exp | \ raToTab stdin -cols=controlId,control stdout` if ($status != 0) then echo "ERROR: $exp" continue endif set controlId = $meta[1] # the 'control' term is used In SYDH, but not HAIB if ($#meta == 2) then set control = $meta[2] endif # finally, the DCC accession set accession = "X" set meta = `mdbPrint hg19 -obj=$exp | \ raToTab stdin -cols=dccAccession stdout` if ($status == 0) then set accession = $meta endif echo "$table : $exp $lab $cell $antibody $treatment $controlId" echo "metaObject $table" >> $metaDb echo "objType table" >> $metaDb echo "cell $cell" >> $metaDb echo "target $factor" >> $metaDb echo "antibody $antibody" >> $metaDb if ($?control) then echo "control $control" >> $metaDb endif echo "controlId $controlId" >> $metaDb echo "lab $lab" >> $metaDb echo "treatment $treatment" >> $metaDb echo "composite $composite" >> $metaDb echo "dataType ChipSeq" >> $metaDb echo "dataVersion ENCODE Mar 2012 Freeze" >> $metaDb echo "dccAccession $accession" >> $metaDb echo "project wgEncode" >> $metaDb echo "tableName $table" >> $metaDb echo "fileName $table.narrowPeak.gz" >> $metaDb echo "view Peaks" >> $metaDb echo "" >> $metaDb 'EOF' cat > renameOne.csh << 'EOF' # rename files based on generated table names set exp = $1 set base = $2 set infile = $3 set outdir = $4 set table = `csh nameTable.csh $base $exp` cp -p $infile $outdir/$table.narrowPeak.gz 'EOF' cat > loadTfbs.csh << 'EOF' # parse filename and metaObjects for experiment and control from output # of regClusterMakeTableOfTables set fileList = $1 set base = wgEncodeAwgTfbs set composite = ${base}Uniform set outdir = downloads mkdir -p $outdir @ ct = 1 foreach x (`sed 's/\t/|/g' $fileList`) set obj = `echo $x | awk -F\| '{print $1, $7, $8}'` set infile = $obj[1] set exp = 
$obj[2] set ctrl = $obj[3] echo "${ct} Track $exp composite=$composite base=$base infile=$infile ctrl=$ctrl" @ ct++ # csh loadOne.csh $infile $exp $outdir $base # csh trackOne.csh $exp $composite $base cells.txt factors.txt # csh metaOne.csh $exp $composite $base csh renameOne.csh $exp $base $infile $outdir end 'EOF' #ERROR: wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1 # Seems there is only a Rep3 for this prefix. There exist Rep1 and Rep2 for a different protocol #V0416101 ( vs Pcr2x). And the two protocols have different dccAccessions... We will # consider the Rep3 was used as it was later data and Anshul's filename has Pcr2x # (and try to verify with Anshul). # -> handle this one manually csh loadTfbs.csh partial.table >&! loadTfbs.out & sort factors.txt | uniq | sed 's/$/ \\/' > factors.uniq.txt sort cells.txt | uniq | sed 's/$/ \\/' > cells.uniq.txt # insert above as subGroups in trackDb grep track wgEncodeAwgTfbsUniform.ra grep metaObject wgEncodeAwgTfbsUniform.metaDb.ra | wc -l # 707 wc -l partial.table # 708 grep metaObject *.metaDb.ra | grep -v Uchi | wc -l # 694 # NOTE: lost one here -- need to investigate. And remove UChicago from metadata. # The lost file (from below): wgEncodeAwgTfbsHaibGm12878Egr1Pcr2xUniPk # It is a single replicate, Rep3 (?!), so couldn't be located in metadata as Rep1 set trackDb = ~/kent/src/hg/makeDb/trackDb cp wgEncodeAwgTfbsUniform.ra human/hg19/$trackDb # add composite settings (as in ENCODE SYDH TFBS track) # merge in composite settings from analysis hub, edit labels for length, treatment # note: raToTab and google spreadsheet greatly help with 17-char shortLabel adjustment # We could use a tabToRa too... # INVALID cv lookup: control = 'n/a' not found. in obj: wgEncodeAwgTfbsUtaK562CtcfUniPk # 274 of these # INVALID cv lookup: antibody = 'Pol2(b)' DEPRECATED. 
in obj: wgEncodeAwgTfbsBroadHelas3Pol2bUniPk # 4 of these pushd $trackDb/human/hg19/metaDb/alpha mdbUpdate -delete hg19 -composite=wgEncodeAwgTfbsUniform.ra mdbUpdate hg19 wgEncodeAwgTfbsUniform.ra # 707 (note -- need to remove UChicago) mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.ra git add wgEncodeAwgTfbsUniform.ra # commit # grab track description from previous version on the Analysis hub # - edit for conformance to browser style # - update with new counts for cell types, etc. # - have Anshul update (done) mkdir html cd html wget -nd -np http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/uniformTfbs.html wget -nd -np http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/trackDb.txt cp uniformTfbs.html ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeAwgTfbsUniform.html cd .. # fix up labels # short (<=17) hgsql hg19 -e 'select shortLabel from trackDb_kate where tableName like "wgEncodeAwgTfbs%" and length(shortLabel) > 17' > badShorts.txt # use + # to distinguish same experiment diff labs, add single char lab id: # h=HudsonAlpha, s=Stanford, t=UTexas-Austin, w=UWashington, c=USC, v=Harvard, y=Yale, b=Broad # Add '2, e.g. 'h2, if 2 experiments from same lab (e.g. 
different control or antibody) hgsql hg19 -e 'select longLabel from trackDb_kate where tableName like "wgEncodeAwgTfbs%" and length(longLabel) > 80' > badLongs.txt # Shorten by abbreviating treatments, and excise 'from' # set up file downloads in expected locale set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19 # must use name of composite mkdir -p $dir/wgEncodeAwgTfbsUniform cd $dir/wgEncodeAwgTfbsUniform # setup downloads area in ENCODE 2 standard fashion (as on encodewiki 'Data Wrangler HOWTO') mkdir release1 ln -s release1 beta ln -s release1 releaseLatest cd /hive/data/genomes/hg19/bed/wgEncodeAwgTfbsUniform/downloads cat << 'EOF' > link.csh foreach f (*.gz) echo $f ln $f /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgTfbsUniform/releaseLatest end 'EOF' csh link.csh cd $dir/wgEncodeAwgTfbsUniform/releaseLatest # add files.txt, md5sum.txt encodeMkFilesList hg19 -md5 # wgEncodeAwgTfbsHaibGm12878Egr1Pcr2xUniPk.narrowPeak.gz is an orphaned file, there is no matching met adata # add in missing dataset cd $build grep Gm12878Egr1 partial.table > haibFix.table # edit to change to Rep3 csh loadTfbs.csh haibFix.table >&! 
loadHaib.out cp wgEncodeAwgTfbsUniform.metaDb.ra haibFix.metaDb.ra # edit out all but new entry mdbUpdate hg19 haibFix.metaDb.ra mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.metaDb.new.ra diff wgEncodeAwgTfbsUniform.metaDb.new.ra $trackDb/human/hg19/metaDb/alpha # just the HAIB fix pushd $trackDb/human/hg19/metaDb/alpha mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.ra cd $dir/wgEncodeAwgTfbsUniform/releaseLatest encodeMkFilesList hg19 -md5 # add composite to metaDb (use wgEncodeAwgDnaseUniform as example) # fix up main downloads dir to look like releaseLates # add README sed 's/wgEncodeHaibTfbs/wgEncodeAwgTfbsUniform/' $dir/wgEncodeHaibTfbs/README.txt > $dir/wgEncodeAwgTfbsUniform/README.txt # edit cp $dir/wgEncodeTfbsUniform/README.txt $dir/wgEncodeAwgTfbsUniform/releaseLatest # get file list for pushQ cd ~/kent/src/hg/makeDb/doc/encodeDccHg19 /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgTfbsUniform 1 > wgEncodeAwgTfbsUniform.release1.notes # check in notes file # QA preview (vs. redmine issue) encodeQaInit hg19 wgEncodeAwgTfbsUniform 1 10042 -t ############################################### # Master list of 2.9M DNase sites from Analysis (in progress kate) # (Chromatin landscape Nature paper, Sep 6 2012) # Filename is in Supplementary data PDF, p. 49, # Supplementary Methods section 1.2, "DHS Master List and its annotation" cd /hive/data/genomes/hg19/bed set track = wgEncodeAwgDnaseMaster cd $track mkdir ebi; cd ebi wget ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/byDataType/openchrom/jan2011/combined_peaks/multi-tissue.master.ntypes.simple.hg19.bed wc -l * # 2890742 multi-tissue.master.ntypes.simple.hg19.bed head -3 * chr1 10120 10270 MCV-37 39.232800 chr1 10440 10590 MCV-4 36.369500 chr1 16140 16290 MCV-5 18.768900 # Minimum value 4.215660 # Maximum value 5445.000000 # reformat to narrowPeak cd .. 
set file = wgEncodeAwgDnaseMaster.narrowPeak awk -v OFS="\t" '{print $1, $2, $3, $4, 0, ".", $5, -1, -1, -1}' ebi/*.simple*.bed > $file # Name field has count of cell types having DHS overlapping the master site hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $track $file Read 2890742 elements of size 10 from wgEncodeAwgDnaseMaster.narrowPeak