#!/bin/csh -f
exit

# This is the make doc for the hg17 ENCODE data.

# NOTE: many of these tracks were lifted from hg16 with
# semi-automated processing.  The liftOver leftovers were moved
# to the subdirectories "mapped" and "unmapped" of the main
# work area, /cluster/data/encode/convertHg17

# create work area
mkdir /cluster/data/encode/convertHg17
cd /cluster/data/encode/convertHg17
ln -s /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain hg16ToHg17.chain

# Inventory ENCODE tables on hg16 (hgwbeta)
ssh hgwbeta "echo select tableName from trackDb where tableName like \'encode%\' and settings not like \'%composite%\' order by tableName | hgsql hg16" > tables.txt
wc -l tables.txt
# 350 tables.txt
set encodeBin = /cluster/data/encode/bin/scripts
csh $encodeBin/listEncodeTables.csh hg16 > tableTypes.txt
grep bed tableTypes.txt > tables.bed.txt

##########################################################################
# DOWNLOADS

ssh hgwdev
cd /usr/local/apache/htdocs/hg17
mkdir -p encode
cd encode

# release terms
cp ../../hg16/encode/README.txt .

# annotation database
# request admin set up automated database dump
mkdir database

# auxiliary data files
mkdir datafiles

# sequences
mkdir regions
cp ../../hg16/encode/regions/README.txt regions
# edit README

cd /cluster/data/encode/convertHg17
hgsql hg17 -N -e \
    "SELECT name, chrom, chromStart, chromEnd FROM encodeRegions ORDER BY name" > regions.txt

ssh kolossus
cd /cluster/data/encode/convertHg17
mkdir regions
cd regions
/cluster/data/encode/bin/scripts/encodeSequences.pl -upper \
    ../regions.txt /iscratch/i/hg17/nib > hg17.fa
/cluster/data/encode/bin/scripts/encodeSequences.pl -masked \
    ../regions.txt /iscratch/i/hg17/nib > hg17.msk.fa
faSize detailed=on hg17.fa > hg17_count.txt
gzip *.fa
md5sum *.fa.gz > md5sum.txt

# copy regions/README.txt from hg16 and edit
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/encode
ln -s /cluster/data/encode/convertHg17/regions .

# October MSA freeze
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/encode
mkdir alignments
ln -s /cluster/data/encode/downloads/msa/SEP-2005 .
# terms of use
cp /usr/local/apache/htdocs/goldenPath/hg16/encode/alignments/README.txt .
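# The regions checksums written above can be spot-checked in place later
# with md5sum -c, e.g. (a minimal sketch):
#   cd /cluster/data/encode/convertHg17/regions
#   md5sum -c md5sum.txt
#   # each file should report OK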
###########################################################################
###########################################################################
# Tracks lifted from hg16

##########################################################################
# GIS PET (2005-08-23 kate)
# New genome-wide data (cMyc) submitted by Chialin (2006-10-25)

cd /cluster/data/encode/convertHg17

# use mysqldump to generate .sql w/ schema, and .txt with data
set t = encodeGisRnaPetHCT116
$encodeBin/dumpTable.csh hg16 $t
wc -l $t.txt
# 112782 encodeGisRnaPetHCT116.txt

# create table
hgsql hg17 < $t.sql

# convert data coordinates
~/bin/i386/liftOver $t.txt -hasBin -bedPlus=12 \
    hg16ToHg17.chain $t.tab $t.unmapped
wc -l $t.tab $t.unmapped
# 112701 encodeGisRnaPetHCT116.tab
# 162 encodeGisRnaPetHCT116.unmapped

# load into database
echo "LOAD DATA local INFILE '$t.tab' INTO TABLE $t" | hgsql hg17
hgsql hg17 -N -s -e "SELECT COUNT(*) FROM $t"
# 112701
checkTableCoords hg17 $t

# Now try scripted version (convertBedTable.csh; see sketch below)
csh $encodeBin/convertBedTable.csh hg16 hg17 encodeGisRnaPetMCF7 12
# encodeGisRnaPetMCF7 hg16 104304 hg17 104187
csh $encodeBin/convertBedTable.csh hg16 hg17 encodeGisChipPet 12
# encodeGisChipPet hg16 65513 hg17 65510

# 2006-10-25 cMyc data
cd /cluster/data/encode/GIS
mkdir -p cMyc/2006-10-25/lab
# copy files from ftp dir
cd cMyc/2006-10-25
# use Angie's methods from hg16 to generate score from cluster count
grep '^chr' lab/GIS_c-Myc_P493.bed | \
    perl -wpe 'chomp; @w = split; \
        if ($w[3] =~ /^\d+-(\d+)$/) { \
            $w[4] = ($1 >= 4 ? 1000 : ($1 >= 3 ? 800 : 333)); \
        } else { die "parse"; } \
        $_ = join("\t", @w) . "\n";' > myc.bed
hgLoadBed -strict hg17 encodeGisChipPetMycP493 myc.bed
# Loaded 276788 elements of size 12
checkTableCoords hg17 encodeGisChipPetMycP493

# Create a composite track and merge in P53 and STAT1 data
# as subtracks (see the trackDb sketch after the STAT1 section below)

##########################################################################
# KNOWN+PRED RNA (2005-08-29 kate)

cd /cluster/data/encode/convertHg17
grep encodeRna tables.bed.txt
# encodeRna encodeGenes bed 6 +
$encodeBin/convertBedTable.csh hg16 hg17 encodeRna 6

##########################################################################
# TBA23 Evofold (2005-08-23 kate)

cd /cluster/data/encode/convertHg17
csh $encodeBin/convertBedTable.csh hg16 hg17 encode_tba23EvoFold 6
# 739 encode_tba23EvoFold.txt
# Reading liftover chains
# Mapping coordinates
# 739 encode_tba23EvoFold.tab
# 0 encode_tba23EvoFold.unmapped
# 739 total
# encode_tba23EvoFold hg16 739 hg17 739

##########################################################################
# Transcription Levels Group

# BU FIRST EXON
grep encodeBu tables.bed.txt
# encodeBuFirstExonCerebrum encodeTxLevels bed 12 +
# encodeBuFirstExonColon encodeTxLevels bed 12 +
# encodeBuFirstExonHeart encodeTxLevels bed 12 +
# encodeBuFirstExonKidney encodeTxLevels bed 12 +
# encodeBuFirstExonLiver encodeTxLevels bed 12 +
# encodeBuFirstExonLung encodeTxLevels bed 12 +
# encodeBuFirstExonSkMuscle encodeTxLevels bed 12 +
# encodeBuFirstExonSpleen encodeTxLevels bed 12 +
# encodeBuFirstExonStomach encodeTxLevels bed 12 +
# encodeBuFirstExonTestis encodeTxLevels bed 12 +
set buTables = `echo "SHOW TABLES LIKE 'encodeBuFirstExon%'" | hgsql -N -s hg16`
foreach t ($buTables)
    csh $encodeBin/convertBedTable.csh hg16 hg17 $t 12
    checkTableCoords hg17 $t
end
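# convertBedTable.csh bundles the dump/liftOver/reload steps done
# manually for GIS PET above, and is used for most of the conversions
# below.  Roughly (a sketch reconstructed from its usage and log output
# here, not the actual script):
#   #!/bin/csh -f
#   # convertBedTable.csh <fromDb> <toDb> <table> <bedFieldCount>
#   set from = $1
#   set to = $2
#   set t = $3
#   set n = $4
#   echo "Creating $from $t.sql and $t.txt"
#   csh /cluster/data/encode/bin/scripts/dumpTable.csh $from $t
#   wc -l $t.txt
#   hgsql $to < $t.sql
#   # (-hasBin as appropriate for the table)
#   liftOver $t.txt -hasBin -bedPlus=$n hg16ToHg17.chain $t.tab $t.unmapped
#   wc -l $t.tab $t.unmapped
#   echo "LOAD DATA LOCAL INFILE '$t.tab' INTO TABLE $t" | hgsql $to
#   set old = `hgsql $from -N -s -e "SELECT COUNT(*) FROM $t"`
#   set new = `hgsql $to -N -s -e "SELECT COUNT(*) FROM $t"`
#   echo "$t $from $old $to $new"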
# RIKEN CAGE
grep encodeRikenCage tables.bed.txt
# encodeRikenCageMinus encodeTxLevels bedGraph 4
# encodeRikenCagePlus encodeTxLevels bedGraph 4
csh $encodeBin/convertBedTable.csh hg16 hg17 encodeRikenCageMinus 4
# Creating hg16 encodeRikenCageMinus.sql and encodeRikenCageMinus.txt
# 6156 encodeRikenCageMinus.txt
# Reading liftover chains
# Mapping coordinates
# 6153 encodeRikenCageMinus.tab
# 6 encodeRikenCageMinus.unmapped
# 6159 total
# encodeRikenCageMinus hg16 6156 hg17 6153
csh $encodeBin/convertBedTable.csh hg16 hg17 encodeRikenCagePlus 4
# Creating hg16 encodeRikenCagePlus.sql and encodeRikenCagePlus.txt
# 5688 encodeRikenCagePlus.txt
# Reading liftover chains
# Mapping coordinates
# 5639 encodeRikenCagePlus.tab
# 98 encodeRikenCagePlus.unmapped
# 5737 total
# encodeRikenCagePlus hg16 5688 hg17 5639

##########################################################################
# CHIP/CHIP GROUP
#
# STANFORD CHIP
# encodeStanfordChip* bedGraph 4 tracks
cat > doStan.csh << 'EOF'
set stanTables = \
    `echo "SHOW TABLES LIKE 'encodeStanfordChip%'" | hgsql -N -s hg16`
foreach t ($stanTables)
    csh /cluster/data/encode/bin/scripts/convertBedTable.csh \
        hg16 hg17 $t 4
end
'EOF'
csh doStan.csh >&! doStan.log
grep hg17 doStan.log | wc -l
# 12 tracks (6 smoothed)
# encodeStanfordChipHCT116Sp1 hg16 369633 hg17 369465
# encodeStanfordChipSmoothedHCT116Sp1 hg16 137439 hg17 137361

# UCD Ng
csh $encodeBin/convertBedTable.csh hg16 hg17 encodeUCDavisE2F1Median 4
# encodeUCDavisE2F1Median hg16 382884 hg17 382713

# UCSD/LI CHIP
# encodeUcsdChip* bedGraph 4 tracks (total 36)
cat > doUcsd.csh << 'EOF'
set ucsdTables = \
    `echo "SHOW TABLES LIKE 'encodeUcsdChip%'" | hgsql -N -s hg16`
foreach t ($ucsdTables)
    csh /cluster/data/encode/bin/scripts/convertBedTable.csh \
        hg16 hg17 $t 4
end
'EOF'
csh doUcsd.csh >&! doUcsd.log
grep hg17 doUcsd.log | wc -l
# 36 tracks
# encodeUcsdChipAch3Imr90 hg16 24348 hg17 24339
# encodeUcsdChipHeLaH3H4tmH3K4_p30 hg16 24537 hg17 24528

##########################################################################
# TRANSCRIPTION LEVELS TRACKS (2005-08-24 kate)

# grep encodeTxLevels in tables.bed.txt and edit out already
# completed tracks.  Prefix each table with a call to convertBedTable
# and suffix with bed field count.  (See the generation sketch after
# this section.)
# Tracks are: Stanford RTPCR, Yale TARS
csh doTx.csh >&! doTx.log
grep hg17 doTx.log | wc -l
# 9 tracks
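# doTx.csh (and the similar do*.csh scripts below) were built along
# these lines (a sketch; per the comment above, the generated list was
# then edited by hand):
#   grep encodeTxLevels tables.bed.txt \
#       | awk '{print "csh /cluster/data/encode/bin/scripts/convertBedTable.csh hg16 hg17", $1, $4}' \
#       > doTx.csh
#   # then delete the lines for tables already converted above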
##########################################################################
# CHROMATIN & CHROMOSOMES TRACKS (2005-08-24 kate)
# Regulome, NHGRI DNase, Stanford Meth, UVA

csh doChrom.csh >&! doChrom.log
# 37 tables

# do Stanford Meth Smoothed tables that weren't converted because
# hg16 tables had incorrect capitalization wrt trackDb
# and so weren't being displayed
csh doChrom2.csh >&! doChrom2.log

##########################################################################
# CHIP/CHIP TRACKS (2005-08-24 kate)
# Sanger, UCSD Nimblegen

doChip.csh >&! doChip.log

##########################################################################
# VARIATION TRACKS (2005-08-24 kate)
# HapMap, Reseq, Sanger Gene Expr

csh doVar.csh >&! doVar.log
grep hg17 doVar.log
# encodeReseqRegions hg16 10 hg17 10
# encodeSangerGenoExprAssociation hg16 13674 hg17 13674

csh doHap.csh >&! doHap.log
grep hg17 doHap.log
# encodeHapMapAlleleFreqCEU hg16 20772 hg17 20772
# encodeHapMapAlleleFreqCHB hg16 19629 hg17 19629
# encodeHapMapAlleleFreqJPT hg16 19629 hg17 19629
# encodeHapMapAlleleFreqYRI hg16 19520 hg17 19520

csh /cluster/data/encode/bin/scripts/convertBedTable.csh \
    hg16 hg17 encodeRecomb 4

##########################################################################
# AFFY CHIP/CHIP TRACKS (2005-08-24 kate)

csh doAffy.csh >&! doAffy.log
wc -l doAffy.csh
# 41 doAffy.csh
grep hg17 doAffy.log | wc -l
# 41

# do tracks missing from RR!
csh doAffy2.csh >&! doAffy2.log
wc -l doAffy2.csh
# 6 doAffy2.csh
grep hg17 doAffy2.log | wc -l
# 6

##########################################################################
# WIG TRACKS (2005-08-24 kate)

doWig.csh > doWig.log
# 75 tables

##########################################################################
# YALE TRACKS (2005-08-31 kate)

doYale.csh > doYale.log
wc -l doYale.csh
# 54 doYale.csh
grep hg17 doYale.log | wc -l
# 50

# redo the 4 that failed
doYale2.csh > doYale2.log
grep hg17 doYale2.log | wc -l
# 4 tracks

##########################################################################
##########################################################################
# Tracks submitted in hg17 coords

##########################################################################
# GENCODE Sanger Havana annotations (2005-08-18 kate)
# Used latest (6/7/05) data submission, which was submitted
# in hg17 coords and lifted to hg16.  This was described in makeEncodeHg16.doc

ssh hgwdev
cd /cluster/data/encode/Gencode
cd 2005-06-07
ldHgGene -gtf -genePredExt hg17 encodeGencodeGene gencode.vega.gtf
# 2888 gene predictions
checkTableCoords hg17 encodeGencodeGene

grep intron gencode.gtf | wc -l
# 15814
grep -v not_tested gencode.gtf | sed -e 's/-intron/-/g' | \
    ldGencodeIntron hg17 encodeGencodeIntron stdin
# 469 introns

# load gene class table
hgsql hg17 < ~/kent/src/hg/lib/gencodeGeneClass.sql
echo "LOAD DATA LOCAL INFILE 'gencodeGeneClass.tab' into table gencodeGeneClass" | hgsql hg17
wc -l gencodeGeneClass.tab
# 2888 gencodeGeneClass.tab

##########################################################################
# EGASP Partial (2005-08-18 kate)
# Gene tracks submitted for the EGASP competition were hg17-based
# by the Gencode group (Roderic Guigo, Julien Lagarde, IMIM)
# These were lifted to hg17, as described in makeEncodeHg16.doc
# NOTE: Problem with encodeEgaspPartAugustusAny table detected
# and fixed on 2006-01-09.  It was somehow loaded with Genemark full data...
cd /cluster/data/encode
cd EGASP/Partial
wc -l lab/*.gtf
# 1778 lab/ASPic.gtf
# 4215 lab/AceSCAN.gtf
# 2692 lab/Augustus_EST-Protein.gtf
# 2347 lab/Augustus_abinitio.gtf
# 2736 lab/Augustus_any.gtf
# 2567 lab/Augustus_dualgenome.gtf
# 3458 lab/GeneZilla.gtf
# 2194 lab/SAGA.gtf
# NOTE: exclude ASPic, which contains only intron records
# Filenames above, with _CHR_COORDS_hg17.gff appended, are chrom coordinate versions

# GeneZilla
ldHgGene hg17 encodeEgaspPartGenezilla lab/GeneZilla.*.gff
# 656 gene predictions
genePredCheck -db=hg17 encodeEgaspPartGenezilla

# SAGA
# Strip out trailing ## on lines where manual changes were made
# (see notes in .gtf file)
sed -e 's/ ##.*//' lab/SAGA.*.gff | \
    ldHgGene hg17 encodeEgaspPartSaga stdin
# 378 gene predictions
genePredCheck -db=hg17 encodeEgaspPartSaga

# Augustus
ln -s lab/Augustus_EST-Protein.gtf_CHR_COORDS_hg17.gff augustus.est.gff
ln -s lab/Augustus_abinitio.gtf_CHR_COORDS_hg17.gff augustus.abinitio.gff
ln -s lab/Augustus_any.gtf_CHR_COORDS_hg17.gff augustus.any.gff
ln -s lab/Augustus_dualgenome.gtf_CHR_COORDS_hg17.gff augustus.dual.gff
foreach f (augustus.*.gff)
    set t = `echo $f | sed -e 's/augustus.\(.*\).gff/encodeEgaspPartAugustus\u\1/'`
    ldHgGene -genePredExt hg17 $t $f
    checkTableCoords hg17 $t
end
# augustus.abinitio.gff 418 gene predictions
# augustus.any.gff 399 gene predictions
# augustus.dual.gff 413 gene predictions
# augustus.est.gff 381 gene predictions

# Reload .est predictions (2006-01-09 kate)
ldHgGene -genePredExt hg17 encodeEgaspPartAugustusEst augustus.est.gff
# augustus.est.gff 381 gene predictions
checkTableCoords hg17 encodeEgaspPartAugustusEst

# AceSCAN
# Split into two tracks -- conserved, and other, based on feature
# (derivation of the two .gp files is sketched just below)
ldHgGene -predTab hg17 encodeEgaspPartAceCons aceCons.gp
# 117 gene predictions
ldHgGene -predTab hg17 encodeEgaspPartAceOther aceOther.gp
# 727 gene predictions
genePredCheck -db=hg17 encodeEgaspPartAceCons encodeEgaspPartAceOther
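# The aceCons.gp/aceOther.gp split isn't recorded here; it amounted to
# partitioning the AceSCAN GFF on its conserved/other feature field and
# converting to genePred, along these lines (sketch only -- the feature
# match below is hypothetical):
#   grep -i cons lab/AceSCAN.gtf_CHR_COORDS_hg17.gff > aceCons.gtf
#   grep -vi cons lab/AceSCAN.gtf_CHR_COORDS_hg17.gff > aceOther.gtf
#   gtfToGenePred aceCons.gtf aceCons.gp
#   gtfToGenePred aceOther.gtf aceOther.gp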
##########################################################################
# EGASP Full (2005-06-27 kate)
# Gene tracks submitted for the EGASP competition were hg17-based
# by the Gencode group (Roderic Guigo, Julien Lagarde, IMIM)

cd /cluster/data/encode
cd EGASP/Full

# Process "standard" gff files
# NOTE: must dummy out scores -- float values
cat > doGene.hg17.csh << 'EOF'
ls *.gp | grep -v hg16 > gpList
foreach f (`cat gpList`)
    wc -l $f
    set b = $f:r
    set t = encodeEgaspFull$b
    ldHgGene -predTab hg17 $t $f
    genePredCheck -db=hg17 $t
end
'EOF'
csh doGene.hg17.csh >&! doGene.hg17.log

# process special files
cd custom
cat > doGene.hg17.csh << 'EOF'
foreach f (Jigsaw.gp Ensembl.gp EnsemblPseudo.gp Exonhunter.gp GeneId.gp Sgp2.gp Twinscan.gp)
    set b = $f:r
    set t = encodeEgaspFull$b
    ldHgGene -genePredExt -predTab hg17 $t $b.gp
    genePredCheck -db=hg17 $t
end
'EOF'
# << for emacs
csh doGene.hg17.csh >&! doGene.hg17.log
# NOTE: OK to have missing exonFrames
# Reading Ensembl.gp
# 735 gene predictions
# Reading EnsemblPseudo.gp
# 34 gene predictions
# Reading Exonhunter.gp
# 1435 gene predictions
# Reading GeneId.gp
# 476 gene predictions
# Reading Sgp2.gp
# 930 gene predictions
# Reading Twinscan.gp
# 954 gene predictions

# process others
set t = "encodeEgaspFullGenemark"
ldHgGene -predTab hg17 $t Genemark.gp
# 890 gene predictions
genePredCheck -db=hg17 $t

# create genepreds containing just exons flanking U12 introns
set t = encodeEgaspFullGeneIdU12
ldHgGene -predTab -genePredExt hg17 $t geneId.introns.gp
# 24 gene predictions
genePredCheck -db=hg17 $t
set t = encodeEgaspFullSgp2U12
ldHgGene -predTab -genePredExt hg17 $t sgp2.introns.gp
# 20 gene predictions
genePredCheck -db=hg17 $t

##########################################################################
# EGASP Update
# Submitted in hg17 coords

# Jigsaw
cd /cluster/data/encode
cd EGASP/Jigsaw/2005-06-01
ldHgGene -predTab -genePredExt hg17 encodeEgaspUpdJigsaw jigsaw.gp
# 454 gene predictions
genePredCheck -db=hg17 encodeEgaspUpdJigsaw

# Augustus
cd /cluster/data/encode
cd EGASP/Augustus/2005-06-22
foreach f (abinitio.gp any.gp dual.gp est.gp)
    genePredCheck $f
    set t = `echo $f | sed -e 's/\(.*\).gp/encodeEgaspUpdAugustus\u\1/'`
    ldHgGene -predTab -genePredExt hg17 $t $f
    checkTableCoords hg17 $t
end
# Reading abinitio.gp
# 622 gene predictions
# Reading any.gp
# 571 gene predictions
# Reading dual.gp
# 617 gene predictions
# Reading est.gp
# 543 gene predictions

# Exogean
cd /cluster/data/encode
cd EGASP/Exogean/2005-06-23
ldHgGene -predTab hg17 encodeEgaspUpdExogean exogean.gp
# 850 gene predictions
genePredCheck -db=hg17 encodeEgaspUpdExogean

# GeneIDU12 and SgpU12
cd /cluster/data/encode
cd EGASP/GeneIdU12/2005-06-10/
# create GTF files from submitted GFF's
awk -F\\t '/^chr/ {printf "%s\t%s\tCDS\t%s\t%s\t.\t%s\t%s\tgene_id \"%s\"; transcript_id \"%s\"; exon_type \"%s\";\n", $1, $2, $4, $5, $7, $8, $9, $9, $3}' < lab/UCSC-hg17-GeneID-U12-track.gff | grep -v intron > geneId.hg17.gtf
ldHgGene -genePredExt hg17 encodeEgaspUpdGeneId geneId.hg17.gtf
# 476 gene predictions
genePredCheck -db=hg17 encodeEgaspUpdGeneId
awk -F\\t '/^chr/ {printf "%s\t%s\tCDS\t%s\t%s\t.\t%s\t%s\tgene_id \"%s\"; transcript_id \"%s\"; exon_type \"%s\";\n", $1, $2, $4, $5, $7, $8, $9, $9, $3}' < lab/UCSC-hg17-SGP2-U12-track.gff | grep -v intron > sgp2.hg17.gtf
ldHgGene -genePredExt hg17 encodeEgaspUpdSgp2 sgp2.hg17.gtf
# 930 gene predictions
genePredCheck -db=hg17 encodeEgaspUpdSgp2

# create genepreds containing just exons flanking U12 introns
# use U12 annotation as gene name, so it appears on details page
grep U12 geneId.hg17.gtf | perl -wpe \
    's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \
    > geneId.introns.hg17.gtf
ldHgGene -genePredExt hg17 encodeEgaspUpdGeneIdU12 geneId.introns.hg17.gtf
# 24 gene predictions
grep U12 sgp2.hg17.gtf | perl -wpe \
    's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \
    > sgp2.introns.hg17.gtf
ldHgGene -genePredExt hg17 encodeEgaspUpdSgp2U12 sgp2.introns.hg17.gtf
# 20 gene predictions

# EGASP Yale Pseudogenes
# Update submitted by Deyou Zheng 8/18/05
cd /cluster/data/encode
cd EGASP/yale/latest
wc -l lab/*.submitted
# 184 lab/YalePgene-NCBI35.gtf.submitted
# NOTE: this is fewer than the previous submission -- I confirmed
# with Deyou that this is correct.

# munge to create CDS entries to display, and assign pseudogene
# name as transcript_id, and pseudogene type as gene_id so
# it displays on details page
sed -e 's/pseudogene\t/CDS\t/' -e 's/pgene_type/gene_id/' \
    -e 's/alt_name ENCODE_Yale/transcript_id /' \
    lab/YalePgene-NCBI35.gtf.submitted > yale.hg17.gtf
ldHgGene -genePredExt hg17 encodeEgaspUpdYalePseudo yale.hg17.gtf
# 184 gene predictions
genePredCheck -db=hg17 encodeEgaspUpdYalePseudo
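# Illustratively (made-up input line), the sed above turns
#   chr22  Yale  pseudogene  100  200  .  +  .  pgene_type "Processed"; alt_name ENCODE_Yale.1;
# into
#   chr22  Yale  CDS  100  200  .  +  .  gene_id "Processed"; transcript_id .1;
# so ldHgGene sees displayable CDS features with the pseudogene type as
# gene_id and the pseudogene name as transcript_id.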
# Fgenesh++
# Update submitted 9/30/05 by Victor Solovyev to Julien Lagarde at
# IMIM, to fix 4 regions (predictions originally on hg16, redone
# for hg17)
cd /cluster/data/encode/EGASP
mkdir -p Fgenesh/2005-09-30/lab
cd Fgenesh/2005-09-30/lab
wget ftp://genome.imim.es/pub/projects/gencode/data/egasp05/submitted_predictions/EGASP_Update/FGenesh++_corrected_update.gtf_CHR_COORDS_hg17.gff
wget ftp://genome.imim.es/pub/projects/gencode/data/egasp05/submitted_predictions/EGASP_partial/FGenesh++_corrected_partial.gtf_CHR_COORDS_hg17.gff
cd ..
cat *.gff | ldHgGene hg17 encodeEgaspUpdFgenesh stdin
genePredCheck -db=hg17 encodeEgaspUpdFgenesh
# 820 gene predictions

##########################################################################
# STANFORD PROMOTERS

cd /cluster/data/encode/StanfordPromoters
rm previous
mv latest previous
mkdir 2005-08-23
ln -s 2005-08-23 latest
mkdir latest/lab
# copy updated files from Sara Hartman's email.
# Both hg16 and hg17 versions were included:
#   hg16: StanfordPromoters__08.23.txt
#   hg17: StanfordPromoters_hg17__08.24.txt

# Use Angie's processing from hg16, slightly modified
cd latest
cat > doProm.csh << 'EOF'
foreach f (lab/StanfordPromoters_hg17*.txt)
    set cellType = `echo $f | perl -wpe 's^lab/StanfordPromoters_hg17_(.*)_.*^$1^'`
    echo $cellType
    if ($cellType == "Average") then
        tail +2 $f \
        | perl -wpe 'chomp; @w = split("\t"); $w[7] =~ s/^\"(.*)\"$/$1/; \
            $w[3] =~ tr/01/-+/; \
            $_ = join("\t", \
                $w[2], $w[4], $w[5], $w[0], $w[9], $w[3], $w[4], $w[5], 0, $w[1], $w[7], \
                $w[8]) . "\n";' \
        | makeColoredBed > encodeStanfordPromoters$cellType.hg17.bed
    else
        tail +2 $f \
        | grep -v "Bad Txfn" \
        | perl -wpe 'chomp; @w = split("\t"); $w[7] =~ s/^\"(.*)\"$/$1/; \
            $w[3] =~ tr/01/-+/; \
            $_ = join("\t", \
                $w[2], $w[4], $w[5], $w[0], $w[15], $w[3], $w[4], $w[5], 0, $w[1], $w[7], \
                $w[8], $w[9], $w[10], $w[11], $w[12], $w[13], $w[14]) . "\n";' \
        | makeColoredBed > encodeStanfordPromoters$cellType.hg17.bed
    endif
end
'EOF'
csh doProm.csh >&! doProm.log

cat > doLoad.csh << 'EOF'
foreach f (encode*.bed)
    set track = $f:r:r
    if ($track == "encodeStanfordPromotersAverage") then
        hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/$track.sql \
            hg17 $track $f
    else
        sed -e "s/encodeStanfordPromoters/$track/" \
            $HOME/kent/src/hg/lib/encodeStanfordPromoters.sql > /tmp/esp.sql
        hgLoadBed -tab -noBin -sqlTable=/tmp/esp.sql hg17 $track $f
    endif
end
'EOF'
csh doLoad.csh >&! doLoad.log

# Put the negative control data spreadsheet out for download.
ssh kkstore03
cd /cluster/data/encode/StanfordPromoters/latest/lab
nice gzip hg17_NegControlDataStanfordPromoters.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/encode/datafiles
mkdir -p stanfordPromoters
cd stanfordPromoters
cp -p \
    /cluster/data/encode/StanfordPromoters/latest/lab/hg17_NegControlDataStanfordPromoters.txt.gz \
    NegativeControlDataStanfordPromoters.txt.gz
# Added a README.txt (edited from Angie's hg16 version)

##########################################################################
# UVa Replication -- Segregation, Origins, and Origin Confidence tracks
# New data for Oct. freeze (but submitted in hg16 coords)
# All data are bed3
# Contact: Chris Taylor (cmt5n@cs.virginia.edu)
# 2007-04-14: Chris Taylor sent data for the ENm011 region for the
# Replication track - this data was lost due to a problem with one of
# Affy's mapping files (hartera).  The update adds missing data only for
# the Mid, Late and Pan-S subtracks of the UVa DNA Rep Seg track.  The
# Early subtrack does not have data in this region.  (DONE, hartera,
# 2007-04-16)
# New Origins data, by new method (bubble trapping) submitted 2007-05-04
# by Chris Taylor.  2 datasets (GM and HeLa cells) on Affy ENCODE arrays.
# New subtracks added for Ori-Bubble (HeLa) and Ori-NS (HeLa and GM06990)
# (hartera, 2007-05-07)

cd /cluster/data/encode/UVa
mkdir -p 2005-08-30
cd 2005-08-30
mkdir lab

# Segregation data - 4 subtracks (Early, Mid, Late, Pan-S)
# 4 custom tracks in a single file -- use Hiram's script to split
/cluster/data/encode/BU/orchid/2005-06-09/splitTracks.pl \
    lab/segchunks.hg16.qced.bed
# creates t0, t1, t2, t3
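# splitTracks.pl just splits a multi-track custom-track file into files
# t0, t1, ... at each "track" line; in spirit it is equivalent to
# (sketch, not the actual script):
#   awk '/^track/ {close(f); f = "t" n++} f != "" {print > f}' \
#       lab/segchunks.hg16.qced.bed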
awk < lab/segchunks.hg16.qced.bed '/track/ {print $2}'
#name=early
#name=mid
#name=late
#name=pans
grep -v "^track" t0 > encodeUvaDnaRepEarly.hg16.bed
grep -v "^track" t1 > encodeUvaDnaRepMid.hg16.bed
grep -v "^track" t2 > encodeUvaDnaRepLate.hg16.bed
grep -v "^track" t3 > encodeUvaDnaRepPanS.hg16.bed
rm t0 t1 t2 t3
foreach f (encodeUvaDnaRep*.hg16.bed)
    set d = $f:r:r
    echo $d
    liftOver $f /cluster/data/encode/convertHg17/hg16ToHg17.chain \
        $d.hg17.bed $d.unmapped
    hgLoadBed -noBin -strict hg17 $d $d.hg17.bed
end

# Redo with hg17 resubmitted data
cd /cluster/data/encode/UVa
cd 2005-10-15
/cluster/data/encode/bin/scripts/splitTracks.pl lab/segregation.hg17.bed
grep -v "^track" t0 > encodeUvaDnaRepEarly.bed
grep -v "^track" t1 > encodeUvaDnaRepMid.bed
grep -v "^track" t2 > encodeUvaDnaRepLate.bed
grep -v "^track" t3 > encodeUvaDnaRepPanS.bed
rm t0 t1 t2 t3
foreach f (encodeUvaDnaRep*.bed)
    set d = $f:r
    echo $d
    hgLoadBed -noBin -strict hg17 $d $d.bed
end

# Origin predictions -- fixed at 200bp
set t = encodeUvaDnaRepOriginsPred
ln -s lab/originspred.hg16.qced.bed $t.hg16.bed
liftOver $t.hg16.bed \
    /cluster/data/encode/convertHg17/hg16ToHg17.chain \
    $t.hg17.bed $t.unmapped
hgLoadBed -noBin -strict hg17 $t $t.hg17.bed
# Loaded 289 elements of size 3

# Origin confidence intervals -- varying length for averaged origins
set t = encodeUvaDnaRepOriginsConf
ln -s lab/originsconf.hg16.qced.bed $t.hg16.bed
liftOver $t.hg16.bed \
    /cluster/data/encode/convertHg17/hg16ToHg17.chain \
    $t.hg17.bed $t.unmapped
hgLoadBed -noBin -strict hg17 $t $t.hg17.bed
# Loaded 270 elements of size 3

# Smoothed TR50 data
# 500K 1bp float scores
# wiggle with span=1
set table = encodeUvaDnaRepTr50
grep -v '^track' lab/smoothedtr50.hg17.wig | \
    wigEncode stdin $table.wig $table.wib
# upper limit 6.36, lower limit 2.05
set dir = /gbdb/hg17/encode/UVa/2005-10-15
mkdir -p $dir
hgLoadWiggle -pathPrefix=$dir hg17 $table $table.wig
ln -s `pwd`/$table.wib $dir

# Update of tracks to add lost data for ENm011 region for the
# Replication track - UVa DNA Rep Seg (hartera).  This extra data is
# in hg17 coordinates.
cd /cluster/data/encode/UVa/
mkdir 2007-04-14/lab
cd 2007-04-14/lab
# copy data updates here - sent by e-mail:
# Early_ENm011.bed, Late_ENm011.bed, Mid_ENm011.bed, Pans_ENm011.bed
# Early_ENm011.bed is empty because there were no Early intervals in
# ENm011 so no need to update the encodeUvaDnaRepEarly subtrack table.
cd /cluster/data/encode/UVa/2007-04-14
grep -v "^track" ./lab/Mid_ENm011.bed > UvaDnaRepMid.bed
grep -v "^track" ./lab/Late_ENm011.bed > UvaDnaRepLate.bed
grep -v "^track" ./lab/Pans_ENm011.bed > UvaDnaRepPanS.bed
foreach s (Mid Late PanS)
    echo $s
    cat /cluster/data/encode/UVa/2005-10-15/encodeUvaDnaRep${s}.bed \
        UvaDnaRep${s}.bed | sort -k1 > encodeUvaDnaRep${s}.bed
end
# Reload these tables
foreach f (encodeUvaDnaRep*.bed)
    set d = $f:r
    echo $d
    hgLoadBed -noBin hg17 $d $d.bed
end

# New Origins data, by new method (bubble trapping) submitted 2007-05-04
# by Chris Taylor.  2 datasets (GM and HeLa cells) on Affy ENCODE arrays.
# This is for the UVa DNA Rep Ori track (University of Virginia DNA
# Replication Origins track).
cd /cluster/data/encode/UVa
mkdir -p 2007-05-04/lab
cd 2007-05-04/lab
cp /var/ftp/encode/* .
#-rw-r--r--  1 kate protein  5710 May  3 23:03 Ori-Bubble-HeLa.bed
#-rw-r--r--  1 kate protein  1880 May  3 23:01 Ori-Bubbledescription.txt
#-rw-r--r--  1 kate protein 18379 May  3 23:02 Ori-NS-GM.bed
#-rw-r--r--  1 kate protein 10524 May  3 23:02 Ori-NS-HeLa.bed
#-rw-r--r--  1 kate protein  2127 May  3 23:02 Ori-NSdescription.txt

# Load data into database (hartera, 2007-05-07)
# New methods used are Bubble and Nascent Strand (NS).
# Ori-Bubble-HeLa.bed - Bubble method, HeLa cells
# Ori-NS-HeLa.bed - Nascent strand method, HeLa cells
# Ori-NS-GM.bed - Nascent strand method, GM06990 cells
cd /cluster/data/encode/UVa/2007-05-04
grep -v "^track" ./lab/Ori-Bubble-HeLa.bed > UvaDnaRepOriginsBubbleHela.bed
grep -v "^track" ./lab/Ori-NS-HeLa.bed > UvaDnaRepOriginsNSHela.bed
grep -v "^track" ./lab/Ori-NS-GM.bed > UvaDnaRepOriginsNSGM.bed
foreach f (UvaDnaRepOrigins*.bed)
    set d = $f:r
    echo $d
    hgLoadBed -noBin hg17 encode${d} $d.bed >> load.log
end
# add trackDb.encode.ra entries for new subtracks.  Merge the new method
# descriptions with the encodeUvaDnaRepOrigins.html description.

# new data submitted 2007-05-11 to replace the original track (Heavy-light
# DNA method) and new description including methods for this new data:
# Ori-TR50.bed and Ori-description.html.
mkdir -p /cluster/data/encode/UVa/2007-05-11/lab/
cd /cluster/data/encode/UVa
ln -s 2007-05-11 latest
cd 2007-05-11
# prepare and load the Ori-TR50 data
grep -v "^track" ./lab/Ori-TR50.bed > UvaDnaRepOriginsTR50Hela.bed
foreach f (UvaDnaRepOrigins*.bed)
    set d = $f:r
    echo $d
    hgLoadBed -noBin hg17 encode${d} $d.bed >> load.log
end
# add trackDb.encode.ra entry for the new subtrack.  Use the new
# description to replace the old one.

# There was an extra column in the Ori-TR50.bed file with a confidence
# metric that should be removed (Chris Taylor suggested this when asked
# about this column).
# Remove the extra column and re-load table (2007-05-30, hartera)
cd /cluster/data/encode/UVa/2007-05-11
rm UvaDnaRepOriginsTR50Hela.bed
grep -v "^track" ./lab/Ori-TR50.bed \
    | awk 'BEGIN {OFS="\t"} {print $1,$2,$3;}' \
    > UvaDnaRepOriginsTR50Hela.bed
hgsql -e 'drop table encodeUvaDnaRepOriginsTR50Hela;' hg17
foreach f (UvaDnaRepOrigins*.bed)
    set d = $f:r
    echo $d
    hgLoadBed -noBin hg17 encode${d} $d.bed >> load.log
end

##########################################################################
# Indels from Jim Mullikin
# Heather, Sept. 2005
ssh hgwdev
cd /cluster/data/encode/NHGRI/mullikin/hg17
hgsql hg17 < encodeIndels.sql
split4.pl < hg17.ENCODE.DIPtrack.Q23.bed4+ > split4.out
# use a modified makeColoredBed
./makeColoredBed < split4.out > encodeIndels.bed
# don't use -strict because we have lots of simple insertions (where chromStart = chromEnd)
hgLoadBed hg17 encodeIndels -tab -sqlTable=encodeIndels.sql encodeIndels.bed

# check reference length
mysql> select chrom, chromStart, chromEnd, (chromEnd-chromStart) as size, traceName, reference, length(reference) as refsize from encodeIndels where (chromEnd-chromStart) != length(reference) and length(reference) > 1;
# Empty set (0.07 sec)

##########################################################################
# Boston University ORChID track - (2005-09-18 kate)
# data developer contact: Jay Greenbaum jj@bu.edu

ssh hgwdev
cd /cluster/data/encode/BU
mkdir -p orchid/2005-09-08/lab
cd orchid/2005-09-08/lab
wget --timestamping "http://dna.bu.edu/%7Ejj/cleavage_data_hg17/oh_cleavage_hg17.wig.gz"
cd ..
mkdir wib
# NOTE: continue reluctantly with non-standard table name
# as in hg16
wigEncode lab/oh_cleavage_hg17.wig.gz \
    encodeBu_ORChID1.wig wib/encodeBu_ORChID1.wib
# upper limit 1.58, lower limit -0.56
# load
set dir = /gbdb/hg17/encode/Bu/2005-09-08
mkdir -p $dir
hgLoadWiggle -pathPrefix=$dir hg17 encodeBu_ORChID1 encodeBu_ORChID1.wig
mkdir -p $dir/wib
ln -s `pwd`/wib/encodeBu_ORChID1.wib $dir/wib

##########################################################################
# Genome Institute of Singapore - ChIP/PET of STAT1 TFBS (2005-09-29 kate)
# Submitted 9/19 by Atif Shahab

cd /cluster/data/encode/GIS
mkdir chip
mkdir -p 2005-09-19/lab
ln -s 2005-09-19 latest
cd latest
# copy files from FTP dir to lab subdir
# files: 2 bed files (stim and nonstim) and 1 doc file
# use antiword to convert doc file to txt
ln -s lab/STAT1+stimulation.bed Gif.bed
ln -s lab/STAT1+w:o_stimulation.bed NoGif.bed

# Use cluster-count info, now embedded into the name, to make scored BED:
# (Angie's methods from hg16)
foreach f (Gif.bed NoGif.bed)
    set d = $f:r
    echo $d
    set table = encodeGisChipPetStat1$d
    perl -wpe 'chomp; @w = split; \
        if ($w[3] =~ /^\d+-(\d+)$/) { \
            $w[4] = ($1 >= 4 ? 1000 : ($1 >= 3 ? 800 : 333)); \
        } else { die "parse"; } \
        $_ = join("\t", @w) . "\n";' \
        $f > ${table}.tab
    hgLoadBed hg17 $table ${table}.tab
    checkTableCoords hg17 $table
end
# Reading encodeGisChipPetStat1Gif.tab
# Loaded 4007 elements of size 12
# Reading encodeGisChipPetStat1NoGif.tab
# Loaded 3180 elements of size 12
# NOTE: These counts correspond with the doc file they provided

# Unlike the previous GIS Chip/chip dataset, these are only
# in the ENCODE regions.  I requested the genome-wide
# data -- they will provide this later.
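# Per the composite-track note in the GIS PET cMyc section above, these
# STAT1 tables become subtracks of a composite.  The trackDb.encode.ra
# wiring is along these lines (a sketch with abbreviated labels, not the
# actual entry):
#   track encodeGisChipPet
#   shortLabel GIS ChIP PET
#   compositeTrack on
#   type bed 12
#
#   track encodeGisChipPetStat1Gif
#   subTrack encodeGisChipPet
#   shortLabel GIS ChIP PET STAT1 Stim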
##########################################################################
# Genome Institute of Singapore - PET RNA (2005-10-19 kate)
# Submitted 10/11 by Atif Shahab
# 3 datasets - 5FU treated HCT116 cells,
#              MCF7 untreated
#              Estrogen-treated MCF7 (new)
# Replace data in existing subtracks, and add new one

cd /cluster/data/encode/GIS
mkdir -p rna/2005-10-11/lab
# copy files from FTP dir
cd rna/2005-10-11
ln -s MCF7_estrogen_treated.bed lab/MCF7Estr-hg17.bed

# use Angie's loading process from hg16
cat > load.csh << 'EOF'
foreach f (lab/HCT116-hg17.bed lab/MCF7-hg17.bed lab/MCF7Estr-hg17.bed)
    set cellType = `echo $f:t:r | sed -e 's/-hg17//'`
    echo $cellType
    set table = encodeGisRnaPet$cellType
    grep '^chr' $f | \
        perl -wpe \
            'chomp; @w = split; \
            if ($w[3] =~ /\d+-(\d+)-(\d+)/) { \
                ($mc, $ac) = ($1, $2, $3); \
                if ($mc == 1) { $w[8] = ($ac > 1) ? "35,35,175" : "160,160,188"; } \
                elsif ($mc > 1) { $w[8] = ($ac > 1) ? "180,120,0" : "225,150,0"; } \
                else { die "mc $mc" } \
            } else { die "parse"; } \
            $_ = join(" ", @w) . "\n";' > $table.bed
    hgLoadBed hg17 $table $table.bed
end
'EOF'
csh load.csh >&! load.log
rm *.bed

##########################################################################
# UCSD/LI Nimblegen Hela
# Data submitted on hg17 for June freeze

cd /cluster/data/encode/UCSD/nimblegen/2005-06-01
foreach f (lab/Nim*/*.wig)
    set t = `echo $f:t:r | sed -e \
        's/rnap/encodeUcsdNgHeLaRnap/; s/tmh3k4/encodeUcsdNgHeLaH3K4me3/;'`
    echo $t
    grep "^chr" $f | hgLoadBed -onServer -bedGraph=4 hg17 $t stdin
    checkTableCoords hg17 $t
end
# Produces 4 tables, encodeUcsdNgHeLa{Rnap,H3K4me3}_p{0,30}
# Loaded 385149 elements of size 4

# UCSD/Ludwig Institute Nimblegen chip/chip (2005-10-07 KATE)
# New data submission
# New data 2006-12-04 by Keith Ching
cat > load.csh << 'EOF'
foreach f (`ls lab/*.wig`)
    set table = `echo $f:t:r | sed -e 's/\(.*\)/encodeUcsdNgHeLa\u\1/'`
    echo $table
    grep '^chr' $f | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
    checkTableCoords hg17 $table
end
'EOF'
csh load.csh >&! load.log
# Created hg17 composite track with all 16 datasets
# The hg16 composite only has the first 4 submitted

##########################################################################
# UCSD/LI Chip/Chip on Nimblegen and PCR platforms (2006-12-04)
# from Keith Ching

cat > load.ng.csh << 'EOF'
foreach f (t0 t1 t2 t3 t4 t5 t6 t7 t8)
    set ab = `sed -n '/track/s/.*name=\(.*\)_0 description.*/\1/p' $f`
    set d = HeLa$ab
    mv $f $d.wig
    set table = encodeUcsdLiNg$d
    echo $table
    grep '^chr' $d.wig | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
    checkTableCoords hg17 $table
end
'EOF'
csh load.ng.csh >&! load.ng.log &
#Loaded 385149 elements of size 4
#encodeUcsdLiNgHeLaH3
#encodeUcsdLiNgHeLaH3ac
#encodeUcsdLiNgHeLaH4ac
#encodeUcsdLiNgHeLaH3K4me1
#encodeUcsdLiNgHeLaH3K4me2
#encodeUcsdLiNgHeLaH3K4me3
#encodeUcsdLiNgHeLaTAF1
#encodeUcsdLiNgHeLaRNAPII
#encodeUcsdLiNgHeLap300
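# The sed pair in these load scripts pulls the antibody (and cell line)
# out of the custom-track "name=" setting; e.g. for a declaration like
# (made-up):
#   track name=H3ac_0 description="..."
# the first sed yields ab=H3ac, giving table encodeUcsdLiNgHeLaH3ac.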
/cluster/data/encode/bin/scripts/splitTracks.pl lab/encode.pcr.wig
# total lines read: 196240, track declarations: 8, data lines: 196232
cat > load.pcr.csh << 'EOF'
foreach f (t0 t1 t2 t3 t4 t5 t6 t7)
    set ab = `sed -n '/track/s/.*name=\([^_][^_]*\).*/\1/p' $f`
    set cell = `sed -n "/track/s/.*name=.*_\(.*\) description.*/\1/p" $f`
    set d = ${cell}${ab}
    mv $f $d.wig
    set table = encodeUcsdLiPcr$d
    echo $table
    grep '^chr' $d.wig | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
    checkTableCoords hg17 $table
end
'EOF'
csh load.pcr.csh >&! load.pcr.log &
# Loaded 24529 elements of size 4
# encodeUcsdLiPcrGM06990CTCF
# encodeUcsdLiPcrHeLaCTCF
# encodeUcsdLiPcrU937CTCF
# encodeUcsdLiPcrGM06990H3K4me1
# encodeUcsdLiPcrGM06990H3K4me3
# encodeUcsdLiPcrHeLaH3K4me3
# encodeUcsdLiPcrIMR90H3K4me3
# encodeUcsdLiPcrGM06990TAF1

# NOTE: GM/CTCF data was also submitted in May.
# Keith Ching advises dropping this version of the data and
# keeping May.
hgsql hg17 -e "drop table encodeUcsdLiPcrGM06990CTCF"

# New data submitted 2007-03-01
# Data includes various cell lines and antibodies using both PCR and
# Nimblegen and there are no gamma interferon treatments.
# Nimblegen data (11 tracks): 3 histones, Pol2, TAF2, CTCF in GM06990
#   4 histones, CTCF in HeLa
# PCR data (20 tracks): histones, CTCF, TAFII, p300 in GM, K562, HeLa, IMR90, Tonsil
# NOTE: PCR data submitted in hg16 coords
# NOTE: For PCR data there are 4 tracks with the same label (TAF2/p250)
# and 3 tracks with another (p300_C).  The name generator was confused by
# the underscore in the antibody name.
# This data was resubmitted on 8/28 (see below)
cd /cluster/data/encode/UCSD
mkdir -p 2007-03-01/lab
cd 2007-03-01/lab
# load data from FTP site
cd ..
/cluster/data/encode/bin/scripts/splitTracks.pl lab/nimblegen.wig
# total lines read: 4236650, track declarations: 11, data lines: 4236639
cat > load.ng.csh << 'EOF'
foreach f (t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10)
    set ab = `sed -n '/track/s/.*name=\([^_()][^_()]*\).*/\1/p' $f`
    set cell = `sed -n "/track/s/.*name=.*_\(.*\)_Chromatin description.*/\1/p" $f`
    set d = ${cell}${ab}
    mv $f $d.wig
    set table = encodeUcsdLiNg$d
    echo $table
    grep '^chr' $d.wig | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
    checkTableCoords hg17 $table
end
'EOF'
csh load.ng.csh >&! load.ng.log &
# Loaded 385149 elements of size 4
# encodeUcsdLiNgGM06990TAFII
# encodeUcsdLiNgGM06990H3K4me3
# encodeUcsdLiNgGM06990H3K27Ac
# encodeUcsdLiNgGM06990RNAPII
# encodeUcsdLiNgGM06990CTCF
# encodeUcsdLiNgGM06990H3K18Ac
# encodeUcsdLiNgHeLaCTCF
# encodeUcsdLiNgHeLaH3K18Ac
# encodeUcsdLiNgHeLaH3K27Ac
# encodeUcsdLiNgHeLaH3K9Ac
# encodeUcsdLiNgHeLaH3K27me3

# New data submitted 2007-05-23
# PCR data (7 tracks): Histones, TAF2, Pol2, CTCF in GM and K562
# NOTE: submitted in hg16 coords
# Lifted and reloaded 2007-09-05 (kate)
mkdir -p 2007-05-23/lab
cd 2007-05-23/lab
cp /var/ftp/encode/ucsc.zip .
unzip ucsc.zip
wc -l *.wig
# 24538 ave_H3K18Ac_GM.rst.wig
# 24538 ave_H3K27Ac_GM.rst.wig
# 24538 ave_H3K9Ac_GM.rst.wig
# 24538 ave_H3K9Ac_K562.rst.wig
# 24538 ave_RNAPII_GM.rst.wig
# 24538 ave_TAF250_GM.rst.wig
# 24538 ave_ctcf_gm.rst.wig
cd ..
cat > load.pcr.csh << 'EOF'
foreach f (lab/*.wig)
    set ab = `sed -n '/track/s/.*name=\([^_][^_]*\).*/\1/p' $f`
    set cell = `sed -n "/track/s/.*name=.*_\(.*\) description.*/\1/p" $f`
    set d = ${cell}${ab}
    set table = encodeUcsdLiPcr$d
    echo $table
    grep '^chr' $f | liftOver stdin \
        /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz \
        $d.hg17.bedGraph $d.unmapped
    hgLoadBed -onServer -bedGraph=4 hg17 $table $d.hg17.bedGraph
    checkTableCoords hg17 $table
end
'EOF'
csh load.pcr.csh >&! load.pcr.log &
# Loaded 24528 elements of size 4
# NOTE: dropped 9 elements on chrX
# encodeUcsdLiPcrGMH3K18Ac
# encodeUcsdLiPcrGMH3K27Ac
# encodeUcsdLiPcrGMH3K9Ac
# encodeUcsdLiPcrK562H3K9Ac
# encodeUcsdLiPcrGMRNAPII
# encodeUcsdLiPcrGMTAF250
# encodeUcsdLiPcrgmctcf
# rename for consistency
hgsql hg17 -e "alter table encodeUcsdLiPcrgmctcf rename to encodeUcsdLiPcrGMCTCF"

# New data submitted 2007-05-29
# Nimblegen data (10 tracks): TAF, histones in GM and K562
mkdir -p 2007-05-29/lab
cd 2007-05-29/lab
# load data from FTP site
cd ..
cat > load.ng.csh << 'EOF'
foreach f (lab/*.wig)
    set ab = `sed -n '/track/s/.*name=\([^_()][^_()]*\).*/\1/p' $f`
    set cell = `sed -n "/track/s/.*name=.*_\(.*\)_Chromatin description.*/\1/p" $f`
    set d = ${cell}${ab}
    mv $f $d.wig
    set table = encodeUcsdLiNg$d
    echo $table
    grep '^chr' $d.wig | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
    checkTableCoords hg17 $table
end
'EOF'
csh load.ng.csh >&! load.ng.log &
# Loaded 385149 elements of size 4
# encodeUcsdLiNgK562TAFII
# encodeUcsdLiNgK562H3K4me1
# encodeUcsdLiNgK562H3K27Ac
# encodeUcsdLiNgK562H3K18Ac
# encodeUcsdLiNgK562H3K4me3
# encodeUcsdLiNgK562H3K4me2
# encodeUcsdLiNgK562H3K9Ac
# encodeUcsdLiNgGM06990H3K4me1
# encodeUcsdLiNgGM06990H3K9Ac
# encodeUcsdLiNgGM06990H3K4me2

# Mar07 data resubmitted 2007-08-28
# 20 sets of PCR data, submitted in hg16 coords
# Loaded 2007-09-04 (kate)
mkdir -p 2007-08-28-Mar/lab
cd 2007-08-28-Mar/lab
mv /var/ftp/encode/UCSC200702.zip .
unzip UCSC200702.zip
wc -l *.wig
#24538 ave_CTCF_K562.rst.wig
#24538 ave_H2AZ_HeLa.rst.wig
#24538 ave_H3K18Ac_K562.rst.wig
#24538 ave_H3K27Ac_HeLa.rst.wig
#24538 ave_H3K27Ac_K562.rst.wig
#24538 ave_H3K4me1_IMR90.rst.wig
#24538 ave_H3K4me1_K562.rst.wig
#24538 ave_H3K4me2_GM06990.rst.wig
#24538 ave_H3K4me2_K562.rst.wig
#24538 ave_H3K4me3_K562.rst.wig
#24538 ave_H4K20me1_HeLa.rst.wig
#24538 ave_H4K20me2_HeLa.rst.wig
#24538 ave_H4K20me3_HeLa.rst.wig
#24538 ave_TAF250_GM06990.rst.wig
#24538 ave_TAF250_HeLa.rst.wig
#24538 ave_TAF250_K562.rst.wig
#24538 ave_TAF250_Tonsil.rst.wig
#24538 ave_p300_GM06990.rst.wig
#24538 ave_p300_IMR90.rst.wig
#24538 ave_p300_K562.rst.wig
cd ..
# confirmed that no datasets below will overwrite previously
# loaded tables (no resubmissions).  See oldtables.txt and tables.txt
cat > load.pcr.csh << 'EOF'
foreach f (lab/*.wig)
    set ab = `sed -n '/track/s/.*name=\([^_][^_]*\).*/\1/p' $f`
    set cell = `sed -n "/track/s/.*name=.*_\(.*\) description.*/\1/p" $f`
    set d = ${cell}${ab}
    set table = encodeUcsdLiPcr$d
    echo $table
    grep '^chr' $f | liftOver stdin \
        /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz \
        $d.hg17.bedGraph $d.unmapped
    hgLoadBed -onServer -bedGraph=4 hg17 $table $d.hg17.bedGraph
    checkTableCoords hg17 $table
end
'EOF'
csh load.pcr.csh >&! load.pcr.log &
# Loaded 24528 elements of size 4
# NOTE: 9 elements on chrX dropped by lift
# encodeUcsdLiPcrK562CTCF
# encodeUcsdLiPcrHeLaH2AZ
# encodeUcsdLiPcrK562H3K18Ac
# encodeUcsdLiPcrHeLaH3K27Ac
# encodeUcsdLiPcrK562H3K27Ac
# encodeUcsdLiPcrIMR90H3K4me1
# encodeUcsdLiPcrK562H3K4me1
# encodeUcsdLiPcrGM06990H3K4me2
# encodeUcsdLiPcrK562H3K4me2
# encodeUcsdLiPcrK562H3K4me3
# encodeUcsdLiPcrHeLaH4K20me1
# encodeUcsdLiPcrHeLaH4K20me2
# encodeUcsdLiPcrHeLaH4K20me3
# encodeUcsdLiPcrGM06990TAF250
# encodeUcsdLiPcrHeLaTAF250
# encodeUcsdLiPcrK562TAF250
# encodeUcsdLiPcrTonsilTAF250
# encodeUcsdLiPcrGM06990p300
# encodeUcsdLiPcrIMR90p300
# encodeUcsdLiPcrK562p300

# NOTE: GM/TAF data was also in the May submission.
# Keith Ching advises dropping the March submission and
# keeping the May
hgsql hg17 -e "drop table encodeUcsdLiPcrGM06990TAF250"
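# The "no overwrite" check mentioned above (oldtables.txt vs. tables.txt)
# amounts to comparing the existing table list against the names the load
# script would create, e.g. (a sketch, assuming one table name per line
# in tables.txt):
#   hgsql hg17 -N -e "SHOW TABLES LIKE 'encodeUcsdLi%'" | sort > oldtables.txt
#   # tables.txt: expected encodeUcsdLiPcr* names derived from the .wig files
#   comm -12 oldtables.txt tables.txt
#   # any output would be a collision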
# Aug07 data submitted 2007-08-28
# Loaded 2007-09-05 (kate)
# PCR data: 4 datasets (YY1 and p300 in HeLa, H4Ac in K562, CTCF in IMR90)
# Submitted in hg16 coordinates
mkdir -p 2007-08-28/lab
cd 2007-08-28/lab
mv /var/ftp/encode/encode200708.zip .
unzip encode200708.zip
# 24538 ave_CTCF_IMR90.rst.wig
# 24538 ave_H4Ac_K562.rst.wig
# 24538 ave_YY1_HeLa.rst.wig
# 24538 ave_p300_HeLa.rst.wig
#    61 encode_pcr_desc.html
cd ..
cat > load.pcr.csh << 'EOF'
foreach f (lab/*.wig)
    set ab = `sed -n '/track/s/.*name=\([^_][^_]*\).*/\1/p' $f`
    set cell = `sed -n "/track/s/.*name=.*_\(.*\) description.*/\1/p" $f`
    set d = ${cell}${ab}
    set table = encodeUcsdLiPcr$d
    echo $table
    grep '^chr' $f | liftOver stdin \
        /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz \
        $d.hg17.bedGraph $d.unmapped
    hgLoadBed -onServer -bedGraph=4 hg17 $table $d.hg17.bedGraph
    checkTableCoords hg17 $table
end
'EOF'
csh load.pcr.csh >&! load.pcr.log &
# Loaded 24528 elements of size 4
# encodeUcsdLiPcrIMR90CTCF
# encodeUcsdLiPcrK562H4Ac
# encodeUcsdLiPcrHeLaYY1
# encodeUcsdLiPcrHeLap300

# data distribution for 2007 PCR data
grep ^chr *12-04/lab/*pcr*wig *Mar/*.hg17.bedGraph *5-23/*.hg17.bedGraph *-08-28/*.hg17.bedGraph | awk '{print $4}' | sort -nr | head
# 102.5771
# 38.3034
# 27.3306
# 26.7377
# 24.0476
grep ^chr *12-04/lab/*pcr*wig *Mar/*.hg17.bedGraph *5-23/*.hg17.bedGraph *-08-28/*.hg17.bedGraph | awk '{print $4}' | sort -nr | grep -v ^102 | textHistogram stdin -real -binSize=.2 -maxBinCount=50
large values truncated: need 191 bins or larger binSize than 0.2
0.000000 ***************************** 153175
0.200000 ***** 27298
0.400000 **** 18398
0.600000 ************ 64313
0.800000 ************************************************************ 313021
1.000000 ************************************************* 257011
1.200000 ************ 63866
1.400000 **** 22781
1.600000 ** 9400
1.800000 * 6465
2.000000 * 3904
2.200000 * 4096
2.400000 2151
2.600000 1658
2.800000 1313
grep ^chr *12-04/lab/*nimb*wig *-03-01/*.wig *5-29/*.wig | awk '{print $4}' | sort -nr | head
#4.48
#4.440
#4.40
#4.319

# NOTE: multiple submissions for the same experiment were provided:
#   CTCF in GM: 12/06, 5/07
#   TAF1 in GM: 12/06, 3/07, 5/07
# Asked Keith Ching -- he says keep only the 5/07 versions
hgsql hg17 -e "drop table encodeUcsdLiPcrGM06990CTCF"
hgsql hg17 -e "drop table encodeUcsdLiPcrGM06990TAF1"
hgsql hg17 -e "drop table encodeUcsdLiPcrGM06990TAF250"

##########################################################################
# UT-Austin (Vishy Iyer lab) Chip/chip (2005-10-10 kate)

cd /cluster/data/encode
mkdir UTexas/2005-10-01/lab
cd UTexas/2005-10-01/lab
# copy file from FTP dir
# 8 .wig data files (4 experiments, with raw data, and "peaks"), plus description file
cat > load.csh << 'EOF'
foreach f (`ls lab/*.wig`)
    set table = `echo $f:t:r | sed -e 's/HeLa/HeLa_NoSerum/;s/NoSerum//;s/Serum4hr/Stim/;s/2091/2091fib/;s/\(.*\)_\(.*\)_\(.*\)_\(.*\)/encodeUtexChip\1\3\2\u\4/'`
    echo $table
    grep '^chr' $f | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
    checkTableCoords hg17 $table
end
'EOF'
csh load.csh >&! load.log
# Created composite track with 8 subtracks

##########################################################################
# Affy Chip/chip and RNA (kate)
# submitted by Hari_Tammana@affymetrix.com (Oct. 3)
# with clarifications as to display from Phil Kapranov at Affy
# HeLa data update submitted 12/15 by Hari Tammana

cd /cluster/data/encode/Affy
mkdir 2005-10-03/lab
cd 2005-10-03/lab
# copy file from FTP dir (500M)
#   affy_oct1.tar.gz
# two data dirs: CHIP, RNA
# 10 descriptions for CHIP dir, 3 for RNA dir
# RNA has 2 dirs (bed, wig) with each
# having 3 cell lines (GM06990, HeLa, HL60); the HL60
# data has 4 timepoints (0, 2, 8, 32)
# README's (and discussions with Phil) indicate the
# wig's are replacements for previous RNA Signal
# data, and bed's are replacement Transfrags
# The CHIP .wig files are similar to the previous
# Affy Pval data, but analyzed with stricter analysis
# criteria.  The .bed files are comparable to the Sites
# track.  2 factors are repeats from the previous track
# (HisH4 TetraAc, Pol2), and 3 are new (H3K9K14DiAc,
# p63_ActD (with Actinomycin D treatment),
# p63_mActD (without Actinomycin D treatment))
# The Pol2, HisH4, and H3* data are at 4 timepoints.
# These should be loaded in addition to previous tracks
# (not replacements).  Later the earlier ("lenient")
# analysis will be submitted for the 3 new factors,
# and these will be added to the previous Affy Chip/chip tracks
# on hg17.

# Transfrags (6 subtracks)
cd /cluster/data/encode/Affy/2005-10-03
tail +2 lab/RNA/bed/GM06990/EC_AS_GM06990_RCyP+_C01vsNULL.sig.gr.bed \
    | hgLoadBed -noBin hg17 encodeAffyRnaGm06990Sites stdin
# 4377 elements
tail +2 lab/RNA/bed/HeLa/EC_AS_HeLaS3_RCyP+_C01vsNULL.sig.gr.bed \
    | hgLoadBed -noBin hg17 encodeAffyRnaHeLaSites stdin
# 2037 elements
cat > loadSites.csh << 'EOF'
foreach f (lab/RNA/bed/HL60/??/*HL60*.bed)
    set track = `echo $f:t:r:r:r | perl -wpe \
        's/EC_AS_HL60_RWP\+_RA_(\d+)hr_C01vsNULL/encodeAffyRnaHl60SitesHr$1/;'`
    echo $track
    tail +2 $f \
        | hgLoadBed -noBin hg17 $track stdin
end
'EOF'
csh loadSites.csh >&! loadSites.log

# Update HeLa sites (12/15)
cd /cluster/data/encode/Affy/2005-11-22
tail +2 lab/Affy_HeLa/bed/EC_AS_HeLa_RCyP+_C01vsNULL.sig.gr.bed | \
    hgLoadBed -strict -noBin hg17 encodeAffyRnaHeLaSites stdin
# 7254 elements

# RNA Signal (6 subtracks)
set gbdbDir = /gbdb/hg17/encode/Affy/2005-10-03
mkdir -p $gbdbDir/wib
mkdir wib wig
set track = encodeAffyRnaGm06990Signal
cat lab/RNA/wig/GM06990/EC_AS_GM06990_RCyP+_C01vsNULL.sig.wig \
    | wigEncode stdin wig/$track.wig wib/$track.wib
ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
set track = encodeAffyRnaHeLaSignal
cat lab/RNA/wig/HeLa/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig \
    | wigEncode stdin wig/$track.wig wib/$track.wib
ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
cat > loadSig.csh << 'EOF'
set gbdbDir = /gbdb/hg17/encode/Affy/2005-10-03
foreach f (lab/RNA/wig/HL60/??/*HL60*C01vsNULL.sig.wig)
    set track = `echo $f:t:r:r:r | perl -wpe \
        's/EC_AS_HL60_RWP\+_RA_(\d+)hr_C01vsNULL/encodeAffyRnaHl60SignalHr$1/;'`
    echo $track
    cat $f \
        | wigEncode stdin wig/$track.wig wib/$track.wib
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
end
'EOF'
csh loadSig.csh >&! loadSig.log
# Create a single composite track for RNA and Transfrags

# Update HeLa signal (2005-12-15 kate)
cd /cluster/data/encode/Affy/2005-11-22
mkdir wib wig
set gbdbDir = /gbdb/hg17/encode/Affy/2005-10-03
set track = encodeAffyRnaHeLaSignal
rm $gbdbDir/wib/$track.wib
cat lab/Affy_HeLa/wig/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig | \
    wigEncode stdin wig/$track.wig wib/$track.wib
# Converted stdin, upper limit 1591.50, lower limit -779.75
ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir

# CHIP/Chip sites (2005-10-24 kate)
cd /cluster/data/encode/Affy/2005-10-03
# Load up 12 tables of ChIP/chip sites (3 factors, 4 timepoints)
# plus 2 more for ActD at 1 timepoint
cat > loadChipBed.csh << 'EOF'
foreach f (lab/CHIP/bed/*/??/*.bed)
    set factor = `echo $f:h:h:t | sed 's/Pol2/Rnap/; s/Hish4/H4Kac4/'`
    set hr = $f:h:t
    set table = encodeAffyChIpHl60SitesStrict${factor}Hr$hr
    echo $table
    grep "^chr" $f | hgLoadBed -noBin hg17 $table stdin
end
grep "^chr" lab/CHIP/bed/p63_ActD/*.bed | hgLoadBed -noBin hg17 \
    encodeAffyChIpHl60SitesStrictP63_ActD stdin
grep "^chr" lab/CHIP/bed/p63_mActD/*.bed | hgLoadBed -noBin hg17 \
    encodeAffyChIpHl60SitesStrictP63_mActD stdin
'EOF'
csh loadChipBed.csh >&! loadChipBed.log

# Chip/chip signal and pvalue
cat > loadChipWig.csh << 'EOF'
set gbdbDir = /gbdb/hg17/encode/Affy/2005-10-03
set prefix = encodeAffyChIpHl60
foreach d (lab/CHIP/wig/p63_ActD lab/CHIP/wig/p63_mActD)
    set factor = $d:t
    set track = ${prefix}SignalStrict$factor
    echo $track
    cat $d/*.sig.median.wig \
        | wigEncode stdin wig/$track.wig wib/$track.wib
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
    set track = ${prefix}PvalStrict$factor
    echo $track
    cat $d/*.pval.median.wig \
        | wigEncode stdin wig/$track.wig wib/$track.wib
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
end
foreach d (lab/CHIP/wig/*/??)
    set hr = $d:t
    set factor = $d:h:t
    set track = ${prefix}SignalStrict${factor}Hr$hr
    echo $track
    cat $d/*.sig.median.wig \
        | wigEncode stdin wig/$track.wig wib/$track.wib
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
    set track = ${prefix}PvalStrict${factor}Hr$hr
    echo $track
    cat $d/*.pval.median.wig \
        | wigEncode stdin wig/$track.wig wib/$track.wib
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
end
'EOF'
csh loadChipWig.csh >&! loadChipWig.log
# create 2 composite tracks:
#   Affy Strict ChIP (contains Oct freeze Sites and Pval subtracks)
#   Affy Strict Sig (contains Oct freeze Signal)
# and reformat June freeze tracks (2 composites w/ 10 factors each) as:
#   Affy Loose ChIP (contains Jun freeze Sites and Pval subtracks)

##########################################################################
# U North Carolina FAIRE (2005-10-24 kate)
# Peaks data updated (2006-04-13 kate and 2006-05-01, hartera)
# Added description for updated Peaks data - provided by
# Paul Giresi (paulg@email.unc.edu) (2006-06-13, hartera)
# Finished data update for ChIPOTle peaks track and updated the downloads
# and changed the original Signal and Peaks subtracks to a BED graph
# so that data in tables looks more like the raw data (on request of
# Paul Giresi) (DONE, 2006-08-16, hartera)

# submitted by Paul Giresi, from Jason Lieb's lab
# later, Paul submitted an "averages" file for the
# raw data (but doesn't include the "peaks")
# On 10/24, submitted peaks averages.
# The averages files are:
#   FAIREavg_data.gff (for Signal, averages of all four replicates)
#   FAIREavg_peaks.gff (for Peaks, data after running peak-finding software
#     on the Signal averages data above).
# Both of these files are in wiggle format.

cd /cluster/data/encode
mkdir UNC/2005-10-10/lab
cd UNC/2005-10-10/lab
# copy files from FTP dir
# 8 .gff data files plus description file
# NOTE: these are actually .bed and .wig files
# the .bed files are "peaks", and the .wig are "raw"
# NOTE: these files are basically replicates,
# we really want to show just the averages --
# Submitter says OK to just post for download
mkdir -p download
# convert to UNIX format
foreach f (lab/*norm*.gff)
    set t = $f:t:r
    echo $t
    dos2unix -n $f download/$t.bed
end
# slightly different format for "peaks" files
foreach f (lab/*fpr01*.gff)
    set t = $f:t:r
    echo $t
    dos2unix -l -n $f download/$t.bed
end
cd download
gzip *.bed
md5sum *.gz > md5sum.txt
# add README file with data terms
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg17/encode/datafiles
mkdir -p $dir
ln -s /cluster/data/encode/UNC/2005-10-10/download $dir/UncFaire

# averages
# Probes are 50 bp with 12 bp overlap and the start of each spot on
# the chromosomes was listed.  Changing the span to 38 bp (50 bp probe
# minus the 12 bp overlap) removed the overlap.  This should only have
# been done for the Signal and not Peaks data (from e-mail from Paul
# Giresi, 2006-08-08)
sed 's/span=50/span=38/' lab/FAIREavg_data.gff > Signal.wig
sed 's/span=50/span=38/' lab/FAIREavg_peaks.gff > Peaks.wig
# _data.gff: -2.61 to 3.63
# _peaks.gff: .47 to 3.63
# using viewLimits: .2 to 2.6
# wiggle0 with span=50
# around 380K records, so load it as wiggle, not bedGraph
cat > load.csh << 'EOF'
foreach f (Signal.wig Peaks.wig)
    set type = $f:r
    set table = encodeUncFaire$type
    wigEncode $f $table.wig $table.wib
    set dir = /gbdb/hg17/encode/UNC/2005-10-10
    mkdir -p $dir
    hgLoadWiggle -pathPrefix=$dir hg17 $table $table.wig
    ln -s `pwd`/$table.wib $dir
end
'EOF'
csh load.csh >&! load.log

# update peaks data (2006-04-13 kate)
cd /cluster/data/encode
mkdir UNC/2006-04-13/lab
cd UNC/2006-04-13/lab
# lab/OfficialChIPOTle_PEAKS.gff is a bedGraph format and this contains
# the new peaks data after a data reanalysis.
# lab/FAIREavg_OfficialPeaks.gff is a file in wiggle format with
# the Signal track data first (the same as for the original track)
# and then the new Peaks data.
# trim precision for the peaks data:
awk 'NR !=1 {printf("%s\t%d\t%d\t%.3f\n", $1, $2, $3, $4)}' \
    lab/OfficialChIPOTle_PEAKS.gff > peaks.bedGraph
# data range: 0 - 3.627

# load data as bedGraph (2006-05-01, hartera)
# edit file and remove line: track 0 0 0.000
# and then load
hgLoadBed -strict -bedGraph=4 hg17 \
    encodeUncFairePeaksApr2006 peaks.bedGraph
# added this as a new subtrack to human/trackDb.encode.ra to see what
# it looks like.
# In ~/kent/src/hg/makeDb/trackDb/human/trackDb.encode.ra
# add the following lines to the subtrack entry since it will inherit
# from the parent track otherwise, which is a wiggle type:
#   track encodeUncFairePeaksApr2006
#   subTrack encodeUncFaire
#   shortLabel UNC FAIRE Peaks Apr. 06
#   longLabel UNC FAIRE Peaks (Formaldehyde Assisted Isolation of Regulatory Elements) Apr. 2006 Update
#   noInherit on
#   type bedGraph 4
#   maxHeightPixels 128:16:16
#   autoScale off
#   windowingFunction mean
#   viewLimits .2:2.6
#   color 20,150,20
#   altColor 50,100,50
#   priority 3

# Description update added (2006-06-13, hartera).
# Add new description for new Peaks data subtrack from
# FAIRE_peaks_DESC.htm to trackDb/human/encodeUncFaire.html

# Data is not correct so reload the Peaks data sent by Paul Giresi
# FAIRE_peaks1e-025_feat_track.gff in lab directory
ssh hgwdev
mkdir -p /cluster/data/encode/UNC/2006-05/lab
cd /cluster/data/encode/UNC/2006-05
# remove first line
tail +2 lab/FAIRE_peaks1e-025_feat_track.gff > peaks.bedGraph
# and then load as bedGraph
hgLoadBed -strict -bedGraph=4 hg17 \
    encodeUncFairePeaksChipotle peaks.bedGraph
# edit human/trackDb.encode.ra entry above to use the May2006 table.
# edit these lines too:
#   track encodeUncFairePeaksChipotle
#   longLabel UNC FAIRE Peaks (Formaldehyde Assisted Isolation of Regulatory Elements) (ChIPOTle)
#   viewLimits 0.0:2.7
#   color 0,0,255

# Reload the original Peaks and Signal data as BED graph so that the
# table downloads look more like the original data and the number
# of lines in the table is the same as the number of peaks for the Peaks
# track.  Used original data with span=50.  This is the size of the probes.
cd /cluster/data/encode/UNC/2005-10-10
mkdir -p bedGraphFormat
cd bedGraphFormat
/cluster/bin/scripts/varStepToBedGraph.pl ../lab/FAIREavg_data.gff \
    > signal.bedGraph
# Processed 385194 lines input, 385149 data lines, 44 variable step
# declarations
/cluster/bin/scripts/varStepToBedGraph.pl ../lab/FAIREavg_peaks.gff \
    > peaksOriginal.bedGraph
# Processed 845 lines input, 800 data lines, 44 variable step declarations
# Reload the Signals and Peaks tables with this data.
hgsql -e "drop table encodeUncFaireSignal;" hg17
hgsql -e "drop table encodeUncFairePeaks;" hg17
hgLoadBed -strict -bedGraph=4 hg17 \
    encodeUncFaireSignal signal.bedGraph
hgLoadBed -strict -bedGraph=4 hg17 \
    encodeUncFairePeaks peaksOriginal.bedGraph
# update human/trackDb.encode.ra so that type is
#   type bedGraph 4
# for the parent track.

cd /cluster/data/encode/UNC/2005-10-10/lab
cp FAIREavg_data.gff ../download/FAIREavg_data.wig
cp FAIREavg_peaks.gff ../download/FAIREavg_peaks.wig
cd ../download
gzip *.wig
# change the Signal files to wig extension as these are wiggle format
foreach f (*CHR.bed.gz)
    set g = $f:r:r
    mv $f ${g}.wig.gz
end
# Add ChIPOTle data to downloads.
cp /cluster/data/encode/UNC/2006-05/lab/FAIRE_peaks1e-025_feat_track.gff \
    FAIRE_peaks1e-025_feat_track.bed
gzip FAIRE_peaks1e-025_feat_track.bed
# Add description of these files to README.txt and update the md5sum.txt
rm md5sum.txt
md5sum *.gz > md5sum.txt

# Look at data using histogram to help decide viewLimit:
cd /cluster/data/encode/UNC/2005-10-10/bedGraphFormat
textHistogram -binSize=0.1 -maxBinCount=65 -col=4 -minVal=-2.7 -real \
    signal.bedGraph > signal.hist
textHistogram -binSize=0.1 -maxBinCount=40 -col=4 -real \
    peaksOriginal.bedGraph > peaksOriginal.hist
textHistogram -binSize=0.1 -maxBinCount=40 -col=4 -real \
    ../../2006-05/peaks.bedGraph > peaksChipotle.hist
# set minLimit, maxLimit, viewLimits and increased default pixel size for
# the subtrack so that the y axis scale is shown, in
# human/trackDb.encode.ra:
#   maxHeightPixels 128:24:16
#   minLimit -2.61
#   maxLimit 3.63
#   viewLimits -0.6:0.7
# for the Peaks subtracks, the viewLimits were set as:
#   viewLimits 0.4:3.7

##########################################################################
# Gencode Genes (2005-10-10 kate)
# Files are on the Gencode/IMIM web site; our contact for this round is
# France Denoeud
# France requested 3 subtracks: genes, putatives, and pseudogenes
# NOTE: reloaded encodeGencodeKnown from updated _genes_ file 10/14 (kate)
# Update 2007-03-28 (Kate).  Received from Julien Lagarde (jlagarde@imim.es)
# Julien requested 5 subtracks: reference genes, putative, polymorphic,
# pseudogenes, polyA features.  Track update (DONE, 2007-04-14, hartera)
# Renamed the gencodeGeneClassOct05 table to encodeGencodeGeneClassOct05
# (DONE, 2007-09-09, hartera)

cd /cluster/data/encode
mkdir -p Gencode/2005-10-07/lab
cd Gencode/2005-10-07/lab
wget ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/44regions/README
wget ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/44regions/44regions_genes_CHR_coord.gtf
wget ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/44regions/44regions_putative_CHR_coord.gtf
wget ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/44regions/44regions_pseudogenes_CHR_coord.gtf
cd ..
ldHgGene -gtf -genePredExt hg17 encodeGencodeKnown \
    lab/44regions_genes_CHR_coord.gtf
# Read 2637 transcripts in 45565 lines in 1 files
# 2637 groups 21 seqs 13 sources 5 feature types
# 2608 gene predictions
genePredCheck -db=hg17 encodeGencodeKnown
ldHgGene -gtf -genePredExt hg17 encodeGencodePutative lab/44regions_putative_CHR_coord.gtf
# 156 gene predictions
genePredCheck -db=hg17 encodeGencodePutative
ldHgGene -gtf -genePredExt hg17 encodeGencodePseudo lab/44regions_pseudogenes_CHR_coord.gtf
# 197 gene predictions
genePredCheck -db=hg17 encodeGencodePseudo
# create composite track: "Gencode Oct Gene" with 3 subtracks

# Introns track
grep intron lab/*.gtf | wc -l
# 25421
# ignore "not tested" introns
grep intron lab/*.gtf | grep -v not_tested | wc -l
# 483
# NOTE: need version of loader with new status value added
cat lab/*.gtf | grep -v not_tested | sed -e 's/-intron/-/g' | \
    ~/bin/i386/ldGencodeIntron hg17 encodeGencodeIntronOct stdin
# 483 introns in 1 files

# create gene class table
sed 's/gencodeGeneClass/gencodeGeneClassOct/' \
    ~/kent/src/hg/lib/gencodeGeneClass.sql | hgsql hg17
cat lab/*.gtf | grep VEGA | \
    awk '{printf "%s\t%s\n", $10, $2}' | \
    sed -e 's/"//g' -e 's/;//' -e 's/VEGA_//' \
        -e 's/_val/_gencode_conf/' -e 's/Antisense/Novel_transcript/' | \
    sort | uniq > gencodeGeneClassOct.tab
wc -l gencodeGeneClassOct.tab
# 2961
echo "LOAD DATA LOCAL INFILE 'gencodeGeneClassOct.tab' into table gencodeGeneClassOct" | hgsql hg17
# Rename the gencodeGeneClassOct05 table so that it has the prefix
# "encode" in line with all other ENCODE tables. (2007-09-09, hartera)
hgsql -e \
    'alter table gencodeGeneClassOct05 rename encodeGencodeGeneClassOct05;' hg17
# Make the change in trackDb/human/trackDb.encode.ra so that the
# itemClassTbl is encodeGencodeGeneClassOct05 for Gencode Genes Oct05.

######################################################################
# Update 2007-03-28 from Julien Lagarde (Kate)
# Track update with five subtracks (in progress, 2007-04-10, hartera).
# Gencode reference genes: "Known" and "Novel_CDS"
# Gencode putative: "Novel_Transcript", "Putative", "TEC", "Artifact"
# Gencode polymorphic: "Polymorphic"
# Gencode pseudogenes: "Processed_pseudogene", "Unprocessed_pseudogene"
# Gencode polyA features: "polyA_signal", "polyA_site", "pseudo_polyA"
# cut -f2 *chr_coords_hg17.gff | sort | uniq shows all the types.
# New description sent 2007-04-12, ucsc_description2.html
# Update finished 2007-04-14, hartera.
# Renamed the gencodeGeneClassMar07 table to encodeGencodeGeneClassMar07
# (DONE, 2007-09-09, hartera)
cd /cluster/data/encode/Gencode
mkdir -p 2007-03-28/lab
ln -s 2007-03-28 latest
cd 2007-03-28/lab
wget -r -nv -nd -np ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/current/gff/EN\*hg17.gff
cd /cluster/data/encode/Gencode/2007-03-28
egrep -h 'Known|Novel_CDS' lab/*.gff > encodeGencodeGeneKnownMar07.gff
egrep -h 'Novel_Transcript|Putative|TEC|Artifact' lab/*.gff \
    > encodeGencodeGenePutativeMar07.gff
egrep -h 'Polymorphic' lab/*.gff > encodeGencodeGenePolymorphicMar07.gff
egrep -h 'pseudogene' lab/*.gff > encodeGencodeGenePseudoMar07.gff
egrep -h 'polyA' lab/*.gff | \
    awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1, $4, $5, $2, "0", $7;}' | \
    sed -e 's/VEGA_//' \
    > encodeGencodeGenePolyA.bed
wc -l encode*
# total is 33492, which is the same number of lines as in the lab/*.gff files.
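# Optional partition check (sketch): every lab/*.gff line should land in
# exactly one of the five files above, so diffing a sorted copy of the
# input against the sorted, concatenated parts should show nothing:
cat encodeGencodeGene{Known,Putative,Polymorphic,Pseudo}Mar07.gff > /tmp/parts.gff
egrep -h 'polyA' lab/*.gff >> /tmp/parts.gff
sort /tmp/parts.gff > /tmp/parts.sorted
sort lab/*.gff > /tmp/all.sorted
diff /tmp/all.sorted /tmp/parts.sorted | head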
# load these into the database:
foreach c (Known Putative Polymorphic Pseudo)
    set table = encodeGencodeGene${c}Mar07
    echo $table
    ldHgGene -genePredExt hg17 $table ${table}.gff
    genePredCheck -db=hg17 $table
end
# encodeGencodeGeneKnownMar07 - 2991 gene predictions
# encodeGencodeGenePutativeMar07 - 372 gene predictions
# encodeGencodeGenePolymorphicMar07 - 25 gene predictions
# encodeGencodeGenePseudoMar07 - 191 gene predictions
# everything looks fine with genePredCheck. A bug was fixed by Mark
# in ldHgGene since there was an error with loading initially. Tables
# were reloaded on 2007-04-12.

# The BED table can have an itemRgb column to specify coloring for items.
# polyA_signal: brown, polyA_site: orange, pseudo_polyA: pink
# Add the colours in r,g,b format in column 9:
awk 'BEGIN {OFS="\t"} {if ($4 ~ /signal/) print $0, "0", "0", "94,38,5"; \
    else if ($4 ~ /site/) print $0, "0", "0", "255,102,0"; \
    else if ($4 ~ /pseudo/) print $0, "0", "0", "255,153,255";}' \
    encodeGencodeGenePolyA.bed > encodeGencodeGenePolyAMar07.bed
# Load in the polyA features BED file:
hgLoadBed hg17 encodeGencodeGenePolyAMar07 encodeGencodeGenePolyAMar07.bed
# Loaded 1807 elements of size 6
# Create the gene class table:
# modify table name and add the Polymorphic class to the enum:
sed -e 's/gencodeGeneClass/gencodeGeneClassMar07/' \
    ~/kent/src/hg/lib/gencodeGeneClass.sql \
    > gencodeGeneClassMar07.sql
perl -pi.bak -e \
    "s/Unprocessed_pseudogene\'/Unprocessed_pseudogene\', \'Polymorphic\'/" \
    gencodeGeneClassMar07.sql
rm *.bak
cat lab/*.gff | grep VEGA | grep -v polyA | \
    awk '{printf "%s\t%s\n", $10, $2}' | \
    sed -e 's/"//g' -e 's/;//' -e 's/VEGA_//' | \
    sort | uniq > gencodeGeneClassMar07.tab
wc -l gencodeGeneClassMar07.tab
# 3579 gencodeGeneClassMar07.tab
# load into database:
hgLoadSqlTab hg17 gencodeGeneClassMar07 gencodeGeneClassMar07.sql \
    gencodeGeneClassMar07.tab
# Create a human/trackDb.encode.ra entry and the description page.
cp -p ucsc_description.html \
    ~/kent/src/hg/makeDb/trackDb/human/encodeGencodeGeneMar07.html
# E-mailed Julien Lagarde on 2007-04-13 to ask if a Gencode Introns
# track update is also required. E-mail from Julien on 2007-04-16 states
# that the old Gencode Introns track represented RT-PCR verification of
# individual exon junctions. This information has now been integrated into
# gene objects by the HAVANA annotators, so the Introns track is now
# obsolete - this is also stated in the track description.
# Rename the gencodeGeneClassMar07 table so that it has the prefix
# "encode" in line with all other ENCODE tables. (2007-09-09, hartera)
hgsql -e \
    'alter table gencodeGeneClassMar07 rename encodeGencodeGeneClassMar07;' hg17
# Make the change in trackDb/human/trackDb.encode.ra so that the
# itemClassTbl is encodeGencodeGeneClassMar07 for Gencode Genes Mar07.

##########################################################################
# NHGRI DNaseI HS (2005-10-24 kate)
# Submitter: Greg Crawford
# 2 datasets: CD4, GM06990 with different methodology from previous
# Additional (raw) data for both cell types submitted 12/6/05
# Additional data submitted 8/10/06: Raw & Pval for HelaS3 and GM cells
# Submitted new PVAL (3 cell lines) data on 8/11
# Additional data submission from Greg (at Duke now) 9/22/06 -- HepG2 cell line
# Additional data (DNase array) for IMR90, K562 and H9 cells
# (raw and pvalue) submitted 2/26/07
# Track update to add subtracks for IMR90, K562 and H9 cells (Raw and Pval for
# DNase-chip method).
# Old Method subtracks (for GM06990 and CD4+ T cells) were
# removed and any references to them in the description were also removed.
# (DONE, 2007-04-10, hartera)
cd /cluster/data/encode/NHGRI/crawford
mkdir -p 2005-10-11/lab
cd 2005-10-11/lab
# copy 2 data files from FTP site
# lift to hg17
ln -s lab/Crawford_DNase_chip_CD4_hg16.txt Cd4.hg16.bed
awk '{printf "%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$6}' \
    lab/Crawford_DNase_chip_GM06990_hg16.txt > Gm06990.hg16.bed
# oops - mistakenly deleted lab/Crawford*txt files
cat > load.csh << 'EOF'
foreach f (Cd4.hg16.bed Gm06990.hg16.bed)
    set cell = $f:r:r
    liftOver $cell.hg16.bed \
        /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
        $cell.hg17.bed $cell.unmapped
    hgLoadBed hg17 encodeNhgriDnaseHsChip$cell $cell.hg17.bed
end
'EOF'
csh load.csh >&! load.log
# Add these two tracks to the hg17 NHGRI DNase track
# Rename 2 data tables lifted from hg16
hgsql hg17 -e "ALTER TABLE encodeNhgriDnaseHsAct RENAME TO encodeNhgriDnaseHsMpssCd4Act"
hgsql hg17 -e "ALTER TABLE encodeNhgriDnaseHsNonAct RENAME TO encodeNhgriDnaseHsMpssCd4"

# Raw data
ln -s lab/NHGRI_DNase_chip_CD4_na_RAW.bed Cd4.raw.hg16.bed
ln -s lab/NHGRI_DNase_chip_GM_RAW.bed Gm06990.raw.hg16.bed
cat > loadRaw.csh << 'EOF'
foreach f (Cd4.raw.hg16.bed Gm06990.raw.hg16.bed)
    set cell = $f:r:r:r
    liftOver $f \
        /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
        $cell.raw.hg17.bed $cell.raw.unmapped
    hgLoadBed -strict -bedGraph=4 hg17 \
        encodeNhgriDnaseHsChipRaw$cell $cell.raw.hg17.bed
end
'EOF'
csh loadRaw.csh >&! loadRaw.log
# Loaded 382713 elements of size 4

# 8/10/06 Data submission
cd /cluster/data/encode/NHGRI/crawford
mkdir -p 2006-08-10/lab
cd 2006-08-10/lab
cp -p /var/ftp/encode/Crawford* .
ls
# Crawford_DNase-chip_GM06990_PVAL_hg17.bed
# Crawford_DNase-chip_GM06990_RAW_HG17.bed
# Crawford_DNase-chip_HeLaS3_PVAL_HG17.bed
# Crawford_DNase-chip_HeLaS3_RAW_hg17.bed
# NOTE: the GM Raw data is identical to that submitted on 10/11/05
set cell = Gm06990
awk '{print $1, $2, $3, $5}' lab/Crawford_DNase-chip_GM06990_RAW_HG17.bed \
    > $cell.raw.hg17.bed
hgLoadBed -strict -bedGraph=4 hg17 \
    encodeNhgriDnaseHsChipRaw$cell $cell.raw.hg17.bed
# Loaded 382713 elements of size 4
set cell = Hela
awk '{print $1, $2, $3, $5}' lab/Crawford_DNase-chip_HeLaS3_RAW_hg17.bed \
    > $cell.raw.hg17.bed
hgLoadBed -strict -bedGraph=4 hg17 \
    encodeNhgriDnaseHsChipRaw$cell $cell.raw.hg17.bed
# Loaded 385149 elements of size 4
# Note: different item count from the data for CD4 and GM

# Submitted new PVAL data on 8/11
# Crawford_DNase-chip_CD4_PVAL_hg17.bed10
# Crawford_DNase-chip_GM06990_PVAL_hg17.bed10
# Crawford_DNase-chip_HeLaS3_PVAL_hg17.bed12
ln -s lab/Crawford_DNase-chip_GM06990_PVAL_hg17.bed10 Gm06990.pval.hg17.bed
ln -s lab/Crawford_DNase-chip_HeLaS3_PVAL_hg17.bed12 Hela.pval.hg17.bed
ln -s lab/Crawford_DNase-chip_CD4_PVAL_hg17.bed10 Cd4.pval.hg17.bed
# load Pval data as bed5Pval (the schema used below), with pval mapped to
# an integer score (0-1000) for display purposes
# format: chr start end name score pVal
cat > loadPval.csh << 'EOF'
foreach cell (Cd4 Gm06990 Hela)
    set lcell = `echo $cell | sed 's/\(.*\)/\L\1/'`
    awk -v CELL=$lcell '/^chr/ {printf("%s\t%d\t%d\t%s_%d\t%d\t%.3f\n", $1, $2, $3, CELL, NR-1, $5 * 35 + 100, $5)}' $cell.pval.hg17.bed > $cell.pval.bed5+
    set table = encodeNhgriDnaseHsChipPval$cell
    sed "s/bed5Pval/$table/" ~/kent/src/hg/lib/bed5Pval.sql > \
        $table.sql
    hgsql hg17 -e "DROP TABLE IF EXISTS $table"
    hgsql hg17 < $table.sql
    hgLoadBed -strict -sqlTable=$table.sql hg17 \
        $table $cell.pval.bed5+
    checkTableCoords hg17 $table
end
'EOF'
csh loadPval.csh >&! loadPval.log
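# A quick check of the score mapping above (sketch): with pVals spanning
# roughly 3.1 to 24.5, score = pVal * 35 + 100 gives integer scores of
# about 210 to 958, safely inside the 0-1000 useScore shading range.
# To print the min and max mapped scores directly:
awk '/^chr/ {print $5 * 35 + 100}' Cd4.pval.hg17.bed | sort -n | sed -n '1p; $p'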
# min = 3.13 max = 24.513
# Reading Cd4.pval.bed5+
# Loaded 1262 elements of size 6
# Reading Gm06990.pval.bed5+
# Loaded 1098 elements of size 6
# Reading Hela.pval.bed5+
# Loaded 1042 elements of size 6

# data submission 9/22/06 -- HepG2 cell line
cd /cluster/data/encode/NHGRI/crawford
mkdir -p 2006-09-22/lab
cd 2006-09-22
ln -s lab/Crawford_DNase-chip_HepG2_PVAL_HG17.bed HepG2.pval.bed
ln -s lab/Crawford_DNase-chip_HepG2_RAW_hg17.bed HepG2.raw.bed
set cell = HepG2
# load RAW
awk '{print $1, $2, $3, $5}' $cell.raw.bed | \
    hgLoadBed -strict -bedGraph=4 hg17 \
        encodeNhgriDnaseHsChipRaw$cell stdin
# Loaded 385149 elements
# load PVAL
set lcell = hepg2
awk -v CELL=$lcell '/^chr/ {printf("%s\t%d\t%d\t%s_%d\t%d\t%.3f\n", $1, $2, $3, CELL, NR, $5 * 35 + 100, $5)}' $cell.pval.bed > $cell.pval.bed5+
set table = encodeNhgriDnaseHsChipPval$cell
sed "s/bed5Pval/$table/" ~/kent/src/hg/lib/bed5Pval.sql > \
    $table.sql
hgsql hg17 -e "DROP TABLE IF EXISTS $table"
hgsql hg17 < $table.sql
hgLoadBed -strict -sqlTable=$table.sql hg17 \
    $table $cell.pval.bed5+
checkTableCoords hg17 $table

# Additional data (DNase array) for IMR90, K562 and H9 cells
# (raw and pvalue) submitted 2/26/07
cd /cluster/data/encode
ln -s NHGRI/crawford Duke
cd Duke
mkdir -p 2007-02-26/lab
# copy files from FTP area
cd 2007-02-26
# (DONE, 2007-04-03, hartera)
ln -s lab/Crawford_DNase-chip_H9_pvalue.bed H9.pval.bed
ln -s lab/Crawford_DNase-chip_H9_RAW_HG17.bed H9.raw.bed
ln -s lab/Crawford_DNase-chip_IMR90_pvalue.bed Imr90.pval.bed
ln -s lab/Crawford_DNase-chip_IMR90_RAW_HG17.bed Imr90.raw.bed
ln -s lab/Crawford_DNase-chip_K562_pvalue.bed K562.pval.bed
ln -s lab/Crawford_DNase-chip_K562_RAW_HG17.bed K562.raw.bed
# load RAW (-strict is now the default for hgLoadBed)
foreach cell (H9 Imr90 K562)
    awk '{print $1, $2, $3, $5}' $cell.raw.bed | \
        hgLoadBed -bedGraph=4 hg17 \
            encodeNhgriDnaseHsChipRaw$cell stdin
end
# check table coordinates for raw data:
foreach cell (H9 Imr90 K562)
    set table = encodeNhgriDnaseHsChipRaw$cell
    echo $table
    checkTableCoords hg17 $table
end
# load PVAL
foreach cell (H9 Imr90 K562)
    set lcell = `echo $cell | sed 's/\(.*\)/\l\1/'`
    awk -v CELL=$lcell '/^chr/ {printf("%s\t%d\t%d\t%s_%d\t%d\t%.3f\n", $1, $2, $3, CELL, NR, $5 * 35 + 100, $5)}' $cell.pval.bed > $cell.pval.bed5+
    set table = encodeNhgriDnaseHsChipPval$cell
    sed "s/bed5Pval/$table/" ~/kent/src/hg/lib/bed5Pval.sql > \
        $table.sql
    hgsql hg17 -e "DROP TABLE IF EXISTS $table"
    hgsql hg17 < $table.sql
    hgLoadBed -sqlTable=$table.sql hg17 $table $cell.pval.bed5+
    echo "Checking coords in $table"
    checkTableCoords hg17 $table
end
# Added the new subtracks to the hg17/trackDb.encode.ra and updated the
# description. (2007-04-10, hartera)
# GM06990 and CD4+ T cell subtracks done by the old method were removed from
# trackDb/human/hg17/trackDb.encode.ra and also any references to them
# in the description. Tables for these subtracks are:
# encodeNhgriDnaseHsChipGm06990 and encodeNhgriDnaseHsChipCd4.

##########################################################################
# Sanger Chip/chip Hits and Centers (2005-10-24 kate)
# From Paul Flicek, at EBI
# 14 files (3 cells, most with 5 factors), each file having
# 3 tracks: chip/chip, HMM regions, HMM centers
# Christoph says to just display the HMM regions & centers tracks
# from Paul's files. These were generated from the June freeze
# chip/chip, plus newly submitted HeLa data from Christoph Koch (10/7).
cd /cluster/data/encode/sanger/chipchip
mkdir -p 2005-10-18/lab
cd 2005-10-18/lab

# HeLa chip/chip
cat > loadHela.csh << 'EOF'
foreach f (lab/*_HeLa-S3_1.wig.txt)
    set b = `echo $f:t:r:r | sed 's/-S3_1//; s/_//'`
    echo $b
    grep "^chr" $f | sort -k1,1 -k2,2n > chip.$b.wig
    hgLoadBed -bedGraph=4 hg17 encodeSangerChip$b chip.$b.wig
end
'EOF'
csh loadHela.csh >&! loadHela.log &

# split HMM tracks out of files
cat > load.csh << 'EOF'
foreach f (lab/*.split.wig.txt)
    set b = `echo $f:t:r:r:r | sed 's/-2//; s/-//g'`
    echo $b
    /cluster/data/encode/bin/scripts/splitTracks.pl $f
    rm t0
    grep '^chr' t1 | sort -k1,1 -k2,2n > $b.wig; rm t1
    hgLoadBed -bedGraph=4 hg17 encodeSangerChipHit$b $b.wig
    checkTableCoords hg17 encodeSangerChipHit$b
    grep '^chr' t2 | sed 's/ 1$//' > $b.bed; rm t2
    hgLoadBed -noBin hg17 encodeSangerChipCenter$b $b.bed
    checkTableCoords hg17 encodeSangerChipCenter$b
end
'EOF'
csh load.csh >&! load.log

#############################################################################
# Measuring TARs and TransFrags distances to SINEs and LINEs
#
# Using the table browser on genome.ucsc.edu on Hg17, select the
# Alu SINEs and L1,L2 LINEs by setting filter at:
#     repClass=LINE or SINE
#     repFamily=L1, L2 or Alu
# request fields: swScore, genoName, genoStart, genoEnd, repNames
# save to file L1_LINE_Hg17.txt.gz, L2_LINE_Hg17.txt.gz
# Alu_SINE_Hg17.txt.gz

##########################################################################
# UW/Regulome DnaseI HS (2005-10-28, 11-17 kate)
# NOTE: trimmed overlaps in baseline files, as per Scott Kuehn
cd /cluster/data/encode/Regulome
mkdir -p 2005-11-16
cd 2005-11-16
cat > load.csh << 'EOF'
foreach cell (CACO2 CD34 GM HeLa HepG2 Huh7 K562 SKNSH)
    echo $cell
    hgLoadBed -noBin -strict hg17 \
        encodeRegulomeQuality$cell lab/$cell.qc.bed
    hgLoadBed -noBin -strict hg17 \
        encodeRegulomeAmplOdd$cell lab/$cell.oddAmps.bed
    hgLoadBed -noBin -strict hg17 \
        encodeRegulomeAmplEven$cell lab/$cell.evenAmps.bed
    hgLoadBed -noBin -strict -bedGraph=5 hg17 \
        encodeRegulomeProb$cell lab/$cell.hs.bed
    sort -k1,1 -k2,2n lab/$cell.baseline.bed | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl | \
        hgLoadBed -noSort -noBin -strict -bedGraph=5 hg17 \
            encodeRegulomeBase$cell stdin
end
'EOF'
csh load.csh >&! load.log &
##########################################################################
# UC Davis Chip/chip (new C-Myc data) (2005-10-29 kate)
# Add as subtrack to existing track
# New datafiles for hits (c-Myc and E2F1) submitted 2006-10-24
# by Mark Bieda

# convert to bedGraph
cd /cluster/data/encode/UcDavis/2005-10-12
set table = encodeUCDavisChipMyc
awk '{printf "%s\t%s\t%s\t%s\n", $1,$4,$5,$6}' lab/myc_median.gff | \
    sort -k1,1 -k2,2n > $table.bed
hgLoadBed -strict -bedGraph=4 hg17 $table $table.bed
# Loaded 385149 elements

# hits data
# 2 files: E2F1_HelaFIGS_T02P0001S50G2CHR.gff myc_helafix_hg17_T02P0001S50G2CHR.gff
# NOTE: E2F1 data submitted in hg16 coords
# Load as bed 5 -- generating item names from chrom_start,
# at the recommendation of Mark Bieda
cd /cluster/data/encode/UcDavis
mkdir -p 2006-10-24/lab
# copy files from FTP dir
cd 2006-10-24
awk '{printf "%s\t%d\t%d\t%s_%s\t%d\n", $1,$4,$5,$1,$4,$6}' \
    lab/E2F1_HelaFIGS_T02P0001S50G2CHR.gff > e2f1.hg16.bed
liftOver e2f1.hg16.bed \
    /cluster/data/encode/convertHg17/hg16ToHg17.over.chain.gz \
    e2f1.hg17.bed e2f1.unmapped
# 1 unmapped (in ENm006)
# chrX 152137876 152138192 chrX_152137876 2
hgLoadBed -strict hg17 encodeUcDavisChipHitsE2F1 e2f1.hg17.bed
# Loaded 204 elements of size 5
checkTableCoords hg17 encodeUcDavisChipHitsE2F1
awk '{printf "%s\t%d\t%d\t%s_%s\t%d\n", $1,$4,$5,$1,$4,$6}' \
    lab/myc_helafix_hg17_T02P0001S50G2CHR.gff > myc.hg17.bed
hgLoadBed -strict hg17 encodeUcDavisChipHitsMyc myc.hg17.bed
# Loaded 172 elements of size 5
checkTableCoords hg17 encodeUcDavisChipHitsMyc
# NOTE: drop old tables after review

##########################################################################
# UC Davis Chip/chip (new data, PolII and Taf in GM and HelaS3 cells)
# (2007-05-02 ting)
# Add as subtrack to existing track
# New datafiles submitted 2007-3-23 by Mark Bieda

# convert to bedGraph
cd /cluster/data/encode/UcDavis/2007-03-23
awk '{printf "%s\t%d\t%d\t%f\n", $1,$4,$5,$6}' lab/GM_POLII_qmed_m3.gff \
    > encodeUCDavisPolII_GM.bed
awk '{printf "%s\t%d\t%d\t%f\n", $1,$4,$5,$6}' lab/GM_Taf_qmed_m3.gff \
    > encodeUCDavisTaf_GM.bed
awk '{printf "%s\t%d\t%d\t%f\n", $1,$4,$5,$6}' lab/HelaS3_POL_qmed_m3.gff \
    > encodeUCDavisPolII_HelaS3.bed
awk '{printf "%s\t%d\t%d\t%f\n", $1,$4,$5,$6}' lab/HelaS3_Taf_qmed_m3.gff \
    > encodeUCDavisTaf_HelaS3.bed
hgLoadBed -bedGraph=4 hg17 encodeUCDavisPolII_GM \
    encodeUCDavisPolII_GM.bed
# Loaded 385149 elements of size 4
# Sorted
# Creating table definition for encodeUCDavisPolII_GM
hgLoadBed -bedGraph=4 hg17 encodeUCDavisTaf_GM \
    encodeUCDavisTaf_GM.bed
# Loaded 385149 elements of size 4
# Sorted
# Creating table definition for encodeUCDavisTaf_GM
hgLoadBed -bedGraph=4 hg17 encodeUCDavisPolII_HelaS3 \
    encodeUCDavisPolII_HelaS3.bed
# Loaded 385149 elements of size 4
# Sorted
# Creating table definition for encodeUCDavisPolII_HelaS3
hgLoadBed -bedGraph=4 hg17 encodeUCDavisTaf_HelaS3 \
    encodeUCDavisTaf_HelaS3.bed
# Loaded 385149 elements of size 4
# Sorted
# Creating table definition for encodeUCDavisTaf_HelaS3
checkTableCoords hg17 encodeUCDavisPolII_GM
checkTableCoords hg17 encodeUCDavisPolII_HelaS3
checkTableCoords hg17 encodeUCDavisTaf_GM
checkTableCoords hg17 encodeUCDavisTaf_HelaS3
# Note: currently these 4 tables are subtracks under encodeUCDavisChip
# Change table names since we don't want "_" there.
# --ting, 062707
hgsql hg17 -e "ALTER TABLE encodeUCDavisPolII_GM RENAME TO encodeUCDavisPolIIGM;"
hgsql hg17 -e "ALTER TABLE encodeUCDavisPolII_HelaS3 RENAME TO encodeUCDavisPolIIHelaS3;"
hgsql hg17 -e "ALTER TABLE encodeUCDavisTaf_GM RENAME TO encodeUCDavisTafGM;"
hgsql hg17 -e "ALTER TABLE encodeUCDavisTaf_HelaS3 RENAME TO encodeUCDavisTafHelaS3;"
# Release note, --ting, 072407
# The encodeUCDavisChip track was replaced by encodeUcDavisChipHits before
# the Nature paper publication. With the new raw data submitted, it was
# decided to release the encodeUCDavisChip track with both the old data
# and the new data.
# Note: Mark and Peggy will provide new Hits data that go in parallel with
# the raw data.
# (2007-7-24) The following subtracks for encodeUCDavisChip were released: (Ann)
# encodeUCDavisPolIIGM
# encodeUCDavisPolIIHelaS3
# encodeUCDavisTafGM
# encodeUCDavisTafHelaS3
# encodeUCDavisE2F1Median
# encodeUCDavisChipMyc

##########################################################################
# Yale TAR and TransMap (2005-10-31 kate)
# Submitted: 10/14 by Joel Rozowsky
# 5 bed files (TARs) and 5 wig files (Signal)
# Replacements for June tracks (and drop the 10 individual Neu samples)
# Methods changed somewhat -- use new description from Joel's email
# NOTE: adjusted start coord -1 to correspond to their DART entries --
# verifying with Joel
# new data submitted 3/20/07 by guoneng.zhong@yale.edu (kate)
cd /cluster/data/encode/yale/rna/2005-10-14
cat > loadBed.csh << 'EOF'
foreach f (lab/*.bed)
    set table = `echo $f:t:r | sed 's/_//g; s/CTRL/Untr/; s/ncbi35//; s/Placenta/Plac/; s/Neutrophil/Neut/'`
    echo $table
    sed 's/http.*acc=//' $f | \
        awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $4}' | \
        hgLoadBed -strict hg17 $table stdin
end
'EOF'
csh loadBed.csh >&! loadBed.log

# NOTE: trim overlaps in regions resulting from array design
# Joel should have done this for us -- he will verify the files.
# Also, need to adjust coords +1 as per J. Rozowsky
cat > loadSig.csh << 'EOF'
mkdir -p wig wib
set gdir = /gbdb/hg17/encode/YaleRna/2005-10-14
mkdir -p $gdir/wib
foreach f (lab/*.wig)
    set table = `echo $f:t:r | sed 's/_//g; s/Transcript/Trans/; s/CTRL/Untr/; s/ncbi35//; s/Placenta/Plac/; s/Neutrophil/Neut/'`
    echo $table
    grep "^chr" $f | \
        awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' | \
        sort -k1,1 -k2,2n | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    wigEncode $table.trim wig/$table.wig wib/$table.wib
    hgLoadWiggle -pathPrefix=$gdir hg17 $table wig/$table.wig
    ln -s `pwd`/wib/$table.wib $gdir/wib
end
'EOF'
csh loadSig.csh >&! loadSig.log
rm -f *.trim

# restoring Neutrophil table which somehow got dropped from hgwdev
# 2006-01-04 kate
set table = encodeYaleAffyNeutRNATransMap
wigEncode $table.trim wig/$table.wig wib/$table.wib
# Converted encodeYaleAffyNeutRNATransMap.trim, upper limit 3275.25, lower limit -2658.03
set gdir = /gbdb/hg17/encode/YaleRna/2005-10-14
hgLoadWiggle -pathPrefix=$gdir hg17 $table wig/$table.wig

# post wig downloads
ssh kkstore03
cd /cluster/data/encode/yale/rna/latest
mkdir downloads
foreach f (*.trim)
    set table = ($f:r)
    echo $table
    gzip -c $f > downloads/$table.bedGraph.gz
end

# new data submitted 3/20/07 by guoneng.zhong@yale.edu (kate)
ssh kkstore03
cd /cluster/data/encode/yale/rna
mkdir -p 2007-03-20/lab
cd 2007-03-20/lab
mv /var/ftp/encode/joel_affy.tgz .
tar xvfz joel_affy.tgz

# Loaded into hg17 2007-06-11 - Hiram
cd /cluster/data/encode/yale/rna/2007-03-20
ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_PolyA_RNA_Tars_Relaxed_ncbi35.bed \
    ./encodeYaleAffyHELAS3PolyARNATarsRelaxed.bed
ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_PolyA_RNA_Tars_Stringent_ncbi35.bed \
    ./encodeYaleAffyHELAS3PolyARNATarsStringent.bed
ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_PolyA_RNA_Transcript_Map_ncbi35.wig \
    ./encodeYaleAffyHELAS3PolyARNATranscriptMap_lab.wig
ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_Total_RNA_Tars_Relaxed_ncbi35.bed \
    ./encodeYaleAffyHELAS3TotalRNATarsRelaxed.bed
ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_Total_RNA_Tars_Stringent_ncbi35.bed \
    ./encodeYaleAffyHELAS3TotalRNATarsStringent.bed
ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_Total_RNA_Transcript_Map_ncbi35.wig \
    ./encodeYaleAffyHELAS3TotalRNATranscriptMap_lab.wig
# It looks like the start coordinates in the .wig are now correctly
# 0-relative, but they still need to be trimmed so they do not overlap.
# Previously, both start and end were bumped by one; these ends
# have not been bumped by one.
# They also appear to have a bunch of extra blanks around the tab
# separators; clean them up with the sed.
for F in ./encodeYaleAffyHELAS3PolyARNATranscriptMap_lab.wig \
    ./encodeYaleAffyHELAS3TotalRNATranscriptMap_lab.wig
do
    T=`echo $F | sed -e "s/_lab//"`
    sed -e "s/ //g" ${F} | sort -k1,1 -k2,2n \
        | /cluster/data/encode/bin/scripts/trimOverlap.pl > ${T}.trim
done
wigEncode encodeYaleAffyHELAS3PolyARNATranscriptMap.wig.trim \
    encodeYaleAffyHELAS3PolyARNATranscriptMap.wig \
    encodeYaleAffyHELAS3PolyARNATranscriptMap.wib
wigEncode encodeYaleAffyHELAS3TotalRNATranscriptMap.wig.trim \
    encodeYaleAffyHELAS3TotalRNATranscriptMap.wig \
    encodeYaleAffyHELAS3TotalRNATranscriptMap.wib
hgLoadWiggle -pathPrefix=/gbdb/hg17/wib hg17 \
    encodeYaleAffyHELAS3PolyARNATranscriptMap \
    encodeYaleAffyHELAS3PolyARNATranscriptMap.wig
ln -s `pwd`/encodeYaleAffyHELAS3PolyARNATranscriptMap.wib \
    /gbdb/hg17/wib/encodeYaleAffyHELAS3PolyARNATranscriptMap.wib
hgLoadWiggle -pathPrefix=/gbdb/hg17/wib hg17 \
    encodeYaleAffyHELAS3TotalRNATranscriptMap \
    encodeYaleAffyHELAS3TotalRNATranscriptMap.wig
ln -s `pwd`/encodeYaleAffyHELAS3TotalRNATranscriptMap.wib \
    /gbdb/hg17/wib/encodeYaleAffyHELAS3TotalRNATranscriptMap.wib
rm -f encodeYaleAffyHELAS3PolyARNATarsRelaxed.bed
rm -f encodeYaleAffyHELAS3PolyARNATarsStringent.bed
rm -f encodeYaleAffyHELAS3TotalRNATarsRelaxed.bed
rm -f encodeYaleAffyHELAS3TotalRNATarsStringent.bed
awk '{printf "%s\t%s\t%s\tsite_%d\n", $1,$2,$3,NR}' \
    lab/joel_affy/encode_Yale_Affy_HELAS3_PolyA_RNA_Tars_Relaxed_ncbi35.bed \
    > encodeYaleAffyHELAS3PolyARNATarsRelaxed.bed
awk '{printf "%s\t%s\t%s\tsite_%d\n", $1,$2,$3,NR}' \
    lab/joel_affy/encode_Yale_Affy_HELAS3_PolyA_RNA_Tars_Stringent_ncbi35.bed \
    > encodeYaleAffyHELAS3PolyARNATarsStringent.bed
awk '{printf "%s\t%s\t%s\tsite_%d\n", $1,$2,$3,NR}' \
    lab/joel_affy/encode_Yale_Affy_HELAS3_Total_RNA_Tars_Relaxed_ncbi35.bed \
    > encodeYaleAffyHELAS3TotalRNATarsRelaxed.bed
awk '{printf "%s\t%s\t%s\tsite_%d\n", $1,$2,$3,NR}' \
    lab/joel_affy/encode_Yale_Affy_HELAS3_Total_RNA_Tars_Stringent_ncbi35.bed \
    > encodeYaleAffyHELAS3TotalRNATarsStringent.bed
for F in encodeYaleAffyHELAS3PolyARNATarsRelaxed \
    encodeYaleAffyHELAS3PolyARNATarsStringent \
    encodeYaleAffyHELAS3TotalRNATarsRelaxed \
    encodeYaleAffyHELAS3TotalRNATarsStringent
do
    hgLoadBed hg17 ${F} ${F}.bed
done

##########################################################################
# Yale Chip/chip (2005-10-31 kate)
# Final submission: 10/26 by Zhengdong Zhang
# signal, pval, and sites for 5 factors (50x38 array)
# Sites file has URL to Gerstein lab as 5th field.
# I'm extracting the accession from it and saving
# as the name field in a BED5. Score range: .602-3.23
# Scale *330 produces integer range 200-1000.
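# (Worked through as a check: 0.602 * 330 is about 199 and 3.23 * 330 is
# about 1066, so after awk's %d truncation the scores run from roughly 200
# up to a bit over 1000; the browser's useScore shading saturates at 1000,
# so the high end still renders at full intensity.)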
# NOTE: >50% of sites are < 1 data value, so use 200 as the low score
# New data submitted 2006-11-29 by guoneng.zhong@yale.edu
# 6 datasets -- Pol2 (2 antibodies) and H3K4ac in HeLa S3 and GM06990 cells
# Loaded P-Value and Signal track data into the database
# Loaded into database (DONE, 2006-12-19, kate)
# New data submitted 2007-01-30 by guoneng.zhong@yale.edu
# 7 datasets, 5 factors in HeLa S3 and K562 cells
# Loaded P-Value and Signal track data into the database
# (DONE, 2007-06-02 - 2007-06-03, hartera)
# Resubmitted hits data for Nov06 and Jan07 - this is for the Sites track,
# submitted 2007-03-20 by guoneng.zhong@yale.edu
# Loaded Sites data for Nov06 and Jan07 (DONE, 2007-06-23, hartera)
# New data submitted 2007-03-27 by guoneng.zhong@yale.edu
# 3 datasets, 3 factors in HeLa S3 cells
# New data submitted 2007-06-06 by guoneng.zhong@yale.edu
# This is the same as the data from 2007-03-27 (April 2007 batch) but it
# has only one *-signal.wig file for each directory and additionally, there
# is now P-value data.
# Prepared and loaded P-value and Signal data
# (DONE, 2007-06-06 - 2007-06-07, hartera)
# Loaded Sites (hits) data (DONE, hartera, 2007-06-23)
# New data submitted 2007-06-15 by guoneng.zhong@yale.edu
# 1 dataset, 1 factor in HeLa S3 cells
# Loaded P-value and Signal data (2007-06-18, hartera)
# Loaded Sites (hits) data (DONE, hartera, 2007-06-23)
awk '{print $4}' *_?/*-hits.bed | sort -n | textHistogram -real -binSize=.5 stdin
0.500000 ************************************************************ 628
1.000000 ************** 148
1.500000 ** 25
2.000000 ***************** 173
2.500000 ** 24
3.000000 1
# signal data dist:
0.000000 ************************************************************ 1782529
0.500000 *** 93246
1.000000 * 24171
1.500000 11657
2.000000 1764
2.500000 185
3.000000 13
# New data submitted 2006-11-29 by guoneng.zhong@yale.edu
# 6 datasets -- Pol2 (2 antibodies) and H3K4ac in HeLa and GM06990
cd /cluster/data/encode/yale/chip/2005-10-26
mkdir -p wig wib
cat > loadSites.csh << 'EOF'
set pfx = encodeYaleChip
foreach d (lab/{jun,fos,taf,baf155,baf170})
    set factor = $d:t
    set Factor = `echo $factor | sed 's/\(.*\)/\u\1/'`
    echo $Factor
    set p = $d/Encode_Yale_ChIpChip_${factor}_Hela_Maskless50merevery38bp
    # load sites
    set table = ${pfx}Sites$Factor
    echo $table
    set f = ${p}_Sites.bed
    dos2unix $f
    sed -e "s/bed5FloatScore/$table/" \
        $HOME/kent/src/hg/lib/bed5FloatScore.sql > $table.sql
    sed 's/=/ /' $f | \
        awk '{printf "%s\t%d\t%d\t%s\t%d\t%.3f\n",$1,$2-1,$3,$6,($4 * 330),$4}' |\
        hgLoadBed -strict -sqlTable=$table.sql hg17 $table stdin
end
'EOF'
csh loadSites.csh >&! loadSites.log

cat > loadSig.csh << 'EOF'
set pfx = encodeYaleChip
foreach d (lab/{jun,fos,taf,baf155,baf170})
    set factor = $d:t
    set Factor = `echo $factor | sed 's/\(.*\)/\u\1/'`
    echo $Factor
    set p = $d/Encode_Yale_ChIpChip_${factor}_Hela_Maskless50merevery38bp
    # load pval
    set table = ${pfx}Pval$Factor
    echo $table
    set f = ${p}_Pvalue.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -strict -bedGraph=4 hg17 $table $table.trim
    # load signal
    set table = ${pfx}Signal$Factor
    echo $table
    set f = ${p}_Signal.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -strict -bedGraph=4 hg17 $table $table.trim
end
'EOF'
csh loadSig.csh >&! loadSig.log &
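# Optional post-load check (sketch): run checkTableCoords over all the
# newly created Yale ChIP tables in one pass:
foreach t (`hgsql hg17 -N -e 'SHOW TABLES LIKE "encodeYaleChip%"'`)
    checkTableCoords hg17 $t
end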
rm -f *.trim *.sql

# description files
# use server with "antiword" available
foreach d (lab/{jun,fos,taf,baf155,baf170})
    antiword $d/*.doc > ${d:t}.txt
end

# New data submitted 2006-11-29 by guoneng.zhong@yale.edu
# 6 datasets -- Pol2 (2 antibodies) and H3K4ac in HeLa and GM06990
# Loaded into database (2006-12-19, kate)
cd /cluster/data/encode/yale/chip
mkdir 2006-11-29/lab
cd 2006-11-29/lab
tar xfz yale_batch_1.tgz
# files are in 6 dirs, 1 per experiment
# info.txt in experiment dir indicates antibody/cell line
# files to load are: miyoung_?/*-*-*-hits.bed, *signal.wig, *pvalue.wig
# pvalue and wig appear to be the same format as previous, with overlap,
# so use the same processing as for previous datasets (above)
# Score range in hits files: 0.255 - 3.788
# The low score is lower than in previous data sets (.6), so similar
# scaling for score (*330) would produce a near-invisible 84 score.
# Check with submitter. Sites data distribution is:
0.000000 ******************************* 1399
0.500000 ************************************************************ 2731
1.000000 ********************************* 1494
1.500000 ****** 288
2.000000 * 62
2.500000 18
3.000000 6
3.500000 2
# signal data dist
0.000000 ************************************************************ 2162644
0.500000 *** 111537
1.000000 * 24294
1.500000 3919
2.000000 786
2.500000 353
3.000000 26
3.500000 51
cd /cluster/data/encode/yale/chip/2006-11-29
ln -s lab/miyoung_1 pol2n_hela
ln -s lab/miyoung_2 pol2n_gm06990
ln -s lab/miyoung_3 pol2_hela
ln -s lab/miyoung_4 pol2_gm06990
ln -s lab/miyoung_7 h4kac4_hela
ln -s lab/miyoung_8 h4kac4_gm06990
cat > loadSig.csh << 'EOF'
#!/bin/csh -ef
set pfx = encodeYaleChip
foreach d (pol2n_hela pol2n_gm06990 pol2_hela pol2_gm06990 h4kac4_hela h4kac4_gm06990)
    set factor = `echo $d | sed 's/_.*//'`
    set cell = `echo $d | sed 's/.*_//'`
    set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
    set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
    echo $Factor $Cell
    # load pval
    set table = ${pfx}Pval$Factor$Cell
    echo $table
    set f = $d/*-*-*-pvalue.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -strict -bedGraph=4 hg17 $table $table.trim
    # load signal
    set table = ${pfx}Signal$Factor$Cell
    echo $table
    set f = $d/*-*-*-signal.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -strict -bedGraph=4 hg17 $table $table.trim
end
'EOF'
chmod +x loadSig.csh
loadSig.csh >&! loadSig.log &
egrep 'trim|Loaded' *.log
Reading encodeYaleChipPvalPol2nHela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalPol2nHela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalPol2nGm06990.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalPol2nGm06990.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipPvalPol2Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalPol2Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalPol2Gm06990.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalPol2Gm06990.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipPvalH4kac4Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalH4kac4Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalH4kac4Gm06990.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalH4kac4Gm06990.trim
Loaded 385149 elements of size 4
# Note size difference between Gm06990 and Hela -- inform submitter.
# E-mail from April 4, 2007 states the following:
# [The researcher] said that there were no real deliberate
# reasons for the slight discrepancy. The arrays were newer, so there were
# more features than on the HeLa arrays used in the past.
# Only P-Value and Signal data were loaded because the hits data was
# updated on 2007-03-20.

# New data submitted 2007-01-30 by guoneng.zhong@yale.edu
# 7 datasets
# 5 factors in HeLa and K562 cells
# Prepared data and loaded into database (2007-06-02 - 2007-06-03, hartera)
cd /cluster/data/encode/yale/chip/2007-01-30
mkdir lab
cd lab
tar xvfz jan07_batch.tgz
# files are in 7 dirs, 1 per experiment
# info.txt in experiment dir indicates antibody/cell line
# files to load are: miyoung_?/*-*-*-hits.bed, *signal.wig, *pvalue.wig
# and ghia_?/*-*-*-hits.bed, *signal.wig, *pvalue.wig
# pvalue and wig appear to be the same format as previous, with overlap,
# so use the same processing as for previous datasets (above).
# Signal scores range: 0-3.001
# P-values scores range: 0-20.0
# Hits scores range: 0.223-2.681
cd /cluster/data/encode/yale/chip/2007-01-30
# P-values data distribution:
0.000000 ************************************************************ 1845236
2.000000 *********** 331763
4.000000 ****** 195195
6.000000 **** 127591
8.000000 *** 84006
10.000000 ** 52971
12.000000 * 29541
14.000000 12279
16.000000 0
18.000000 0
20.000000 7749
# Signal data distribution:
0.000000 ************************************************************ 2455784
0.500000 ***** 187810
1.000000 * 33419
1.500000 8165
2.000000 1084
2.500000 68
3.000000 1
# For ghia_8, the target is BAF155 (NOT p65 as in the ghia_8/info.txt
# file). For miyoung_10, the antibody is sc-372, which is against the
# C-terminus of NFkB p65, and for miyoung_11, the antibody is sc-109, which
# is against the N-terminus of NFkB p65.
# Create links to data:
ln -s lab/jan07_batch/ghia_8 baf155_k562
ln -s lab/jan07_batch/ghia_10 baf170_k562
ln -s lab/jan07_batch/ghia_14 baf47_k562
ln -s lab/jan07_batch/ghia_15 baf47_hela
ln -s lab/jan07_batch/ghia_24 stat1_hela_ifna
ln -s lab/jan07_batch/miyoung_10 p65c_hela_tnfa
ln -s lab/jan07_batch/miyoung_11 p65n_hela_tnfa
# format is antibody_cell, except for the last 3, which are
# antibody_cell_treatment
# ifna = interferon alpha, tnfa = tumor necrosis factor (TNF) alpha
# Load the new subtracks for Signal and P-value data:
cat > loadSig.csh << 'EOF'
#!/bin/csh -ef
set pfx = encodeYaleChip
foreach d (baf155_k562 baf170_k562 baf47_k562 stat1_hela_ifna p65c_hela_tnfa p65n_hela_tnfa)
    set Stim=""
    if ($d =~ baf*) then
        set factor = `echo $d | sed 's/_.*//'`
        set cell = `echo $d | sed 's/.*_//'`
        set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
        set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
        echo $Factor $Cell
    else
        set factor = `echo $d | sed 's/_.*//'`
        set cell = `echo $d | perl -wpe 's/^.*_(.*)_.*$/$1/'`
        set stim = `echo $d | perl -wpe 's/.*_.*_//' `
        set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
        set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
        set Stim = `echo $stim | perl -wpe 's/(.*)/\u$1/'`
        echo $Factor $Cell $Stim
    endif
    # load pval
    set table = ${pfx}Pval$Factor$Cell$Stim
    echo $table
    set f = $d/*-*-*-pvalue.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -bedGraph=4 hg17 $table $table.trim
    # load signal
    set table = ${pfx}Signal$Factor$Cell$Stim
    echo $table
    set f = $d/*-*-*-signal.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -bedGraph=4 hg17 $table $table.trim
end
'EOF'
chmod +x loadSig.csh
loadSig.csh >&! loadSig.log &
egrep 'trim|Loaded' *.log
Reading encodeYaleChipPvalBaf155K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalBaf155K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalBaf170K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalBaf170K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalBaf47K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalBaf47K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalStat1HelaIfna.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalStat1HelaIfna.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipPvalP65cHelaTnfa.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalP65cHelaTnfa.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipPvalP65nHelaTnfa.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalP65nHelaTnfa.trim
Loaded 385149 elements of size 4
# Only P-Value and Signal data were loaded because the hits data was
# updated on March 20, 2007.
# Added trackDb entries for Nov06 and Jan07 P-value and Signal data.

# New data submitted 2007-03-27 by guoneng.zhong@yale.edu
# Started processing data (2007-06-06, hartera)
cd /cluster/data/encode/yale/chip/2007-03-27
mkdir lab
cd lab
unzip apr07_batch.zip
cd /cluster/data/encode/yale/chip/2007-03-27
# files are in 3 dirs, 1 per experiment
# info.txt in experiment dir indicates antibody/cell line
# files to load are: ghia_?/*hits_N.0.bed, *signal.wig, *pvalue.wig.
# For hits, N is the % false discovery rate (FDR). Load all of the
# 10% FDR files and note whether each hit appears first in the
# 1, 5 or 10% FDR dataset.
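# The tiering can be done with comm over the sorted hits files, which is
# what the loadSites.csh scripts further below do. In brief (a sketch with
# hypothetical file names, not the actual submitted names):
#   comm -13 hits_1.0.bed hits_5.0.bed  > newAt5pct.bed
#   comm -13 hits_5.0.bed hits_10.0.bed > newAt10pct.bed
# Each hit is then tagged with the lowest FDR tier in which it appears.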
# pvalue and wig appear to be the same format as previous, with overlap,
# so use the same processing as for previous datasets (above).
# Signal data range: 0 - 4.659
# Signal data distribution:
0.000000 ************************************************************ 1074500
0.500000 *** 62133
1.000000 7327
1.500000 1543
2.000000 1119
2.500000 529
3.000000 489
3.500000 408
4.000000 104
4.500000 11
# Create links to data:
ln -s lab/apr07_batch/ghia_12 smarca4_hela
ln -s lab/apr07_batch/ghia_17 smarca6_hela
ln -s lab/apr07_batch/ghia_25 nrsf_hela
# Only Signal data, so no P-Value data to load
# E-mailed track contributors to ask about the data and also for a SMARCA4
# description, as there is none in the readme.txt. Also asked why there is
# no P-value data and whether the hits file contains 0-based or 1-based
# start coordinates (2007-06-06, hartera)
# Contributors say that the composite file (xxxxx_yyyy_zzzzz-signal.wig)
# for each dataset is the average of the three replicates, which are in
# the other 3 files (xxxxx-signal.wig, yyyyy-signal.wig,
# zzzzz-signal.wig).

# New data submitted 2007-06-06 by guoneng.zhong@yale.edu
# This is the same as the data from 2007-03-27 (April 2007 batch) but it
# has only one *-signal.wig file for each directory and additionally, there
# is now P-value data.
# Prepared and loaded P-value and Signal data (2007-06-06 - 2007-06-07, hartera)
mkdir -p /cluster/data/encode/yale/chip/2007-06-06/lab
cd /cluster/data/encode/yale/chip/2007-06-06/lab
unzip yale-apr07_batch_b.zip
cd /cluster/data/encode/yale/chip/2007-06-06
# files are in 3 dirs, 1 per experiment
# info.txt in experiment dir indicates antibody/cell line
# files to load are: ghia_?/*-*-*-hits.bed, *signal.wig, *pvalue.wig
# pvalue and wig appear to be the same format as previous, with overlap,
# so use the same processing as for previous datasets (above).
# Signal data range: 0 - 4.659
# P-value data range: 0 - 14.574
# Hits for 10% FDR range: 0.340833 - 4.659167
# Signal data distribution:
0.000000 ************************************************************ 1074500
0.500000 *** 62133
1.000000 7327
1.500000 1543
2.000000 1119
2.500000 529
3.000000 489
3.500000 408
4.000000 104
4.500000 11
# P-value distribution:
0.000000 ************************************************************ 849713
2.000000 ********* 133584
4.000000 ***** 73052
6.000000 *** 43713
8.000000 ** 25797
10.000000 * 13724
12.000000 6421
14.000000 2159
# make links to directories:
ln -s lab/apr07_batch_b/ghia_12 smarca4_hela
ln -s lab/apr07_batch_b/ghia_17 smarca6_hela
ln -s lab/apr07_batch_b/ghia_25 nrsf_hela
cat > loadSig.csh << 'EOF'
#!/bin/csh -ef
set pfx = encodeYaleChip
foreach d (smarca4_hela smarca6_hela nrsf_hela)
    set factor = `echo $d | sed 's/_.*//'`
    set cell = `echo $d | sed 's/.*_//'`
    set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
    set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
    echo $Factor $Cell
    # load pval
    set table = ${pfx}Pval$Factor$Cell
    echo $table
    set f = $d/*_*_*-pvalue.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -bedGraph=4 hg17 $table $table.trim
    # load signal
    set table = ${pfx}Signal$Factor$Cell
    echo $table
    set f = $d/*_*_*-signal.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -bedGraph=4 hg17 $table $table.trim
end
'EOF'
chmod +x loadSig.csh
loadSig.csh >&! loadSig.log &
egrep 'trim|Loaded' *.log
Reading encodeYaleChipPvalSmarca4Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalSmarca4Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalSmarca6Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalSmarca6Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalNrsfHela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalNrsfHela.trim
Loaded 382721 elements of size 4
# Added trackDb entries for subtracks for the P-value and Signal Apr07 data.

# New data submitted 2007-06-15 by guoneng.zhong@yale.edu
# Loaded P-value and Signal data (2007-06-18, hartera)
cd /cluster/data/encode/yale/chip/2007-06-15
mkdir lab
cd lab
unzip yale-jun07_batch.zip
cd /cluster/data/encode/yale/chip/2007-06-15
# files are in 1 dir, 1 per experiment
# antibody is against me3K27 Histone H3, so this is H3K27me3
# info.txt in experiment dir indicates antibody/cell line
# files to load are: ghia_?/*_*_*-hits.bed, *signal.wig, *pvalue.wig
# pvalue and wig appear to be the same format as previous, with overlap,
# so use the same processing as for previous datasets (above).
# Signal data range: 0 - 2.704
# P-value data range: 0 - 14.574
# Hits for 10% FDR range: 0.426667 - 2.704167
# Signal data distribution:
0.000000 ************************************************************ 336894
0.500000 ***** 30711
1.000000 ** 11315
1.500000 * 3403
2.000000 375
2.500000 23
# P-value distribution:
0.000000 ************************************************************ 291121
2.000000 ***** 23592
4.000000 **** 19303
6.000000 *** 16880
8.000000 *** 14049
10.000000 ** 9892
12.000000 * 5884
14.000000 2000
# make link to directory:
ln -s lab/jun07_batch/ghia_49 h3k27me3_hela
cat > loadSig.csh << 'EOF'
#!/bin/csh -ef
set pfx = encodeYaleChip
foreach d (h3k27me3_hela)
    set factor = `echo $d | sed 's/_.*//'`
    set cell = `echo $d | sed 's/.*_//'`
    set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
    set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
    echo $Factor $Cell
    # load pval
    set table = ${pfx}Pval$Factor$Cell
    echo $table
    set f = $d/*_*_*-pvalue.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -bedGraph=4 hg17 $table $table.trim
    # load signal
    set table = ${pfx}Signal$Factor$Cell
    echo $table
    set f = $d/*_*_*-signal.wig
    sort -k1,1 -k2,2n $f | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
    hgLoadBed -bedGraph=4 hg17 $table $table.trim
end
'EOF'
chmod +x loadSig.csh
loadSig.csh >&! loadSig.log &
egrep 'trim|Loaded' *.log
Reading encodeYaleChipPvalH3k27me3Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalH3k27me3Hela.trim
Loaded 382721 elements of size 4
# added human/hg17/trackDb.encode.ra subtrack entries for the Pval and Sig
# tracks
# added information about the antibody and its target to the
# tracks' description pages.

# Resubmitted hits data for Nov06 and Jan07 - this is for the Sites track,
# submitted 2007-03-20 by guoneng.zhong@yale.edu
# E-mailed to ask whether start coordinates are 0-based or 1-based, 2007-06-06
# Received e-mail on 2007-06-18 confirming that the start coordinates
# for the hits data for the Sites track are 1-based.
# Started preparing data (2007-06-19 - 2007-06-20, hartera)
cd /cluster/data/encode/yale/chip
mkdir -p 2007-03-20/lab
cd 2007-03-20/lab
cp /var/ftp/encode/ucsc_resubmission.tgz .
tar xvfz *.tgz
# Loading resubmitted data (hartera, 2007-06-23)
# This is only the hits data, as only this has changed for the
# November 2006 and January 2007 submissions. Load as subtracks for the
# Yale ChIP Sites track.
cd /cluster/data/encode/yale/chip/2007-03-20
# These data sets relate to the 2006-11-29 data:
ln -s lab/miyoung_1 pol2n_hela
ln -s lab/miyoung_2 pol2n_gm06990
ln -s lab/miyoung_3 pol2_hela
ln -s lab/miyoung_4 pol2_gm06990
ln -s lab/miyoung_7 h4kac4_hela
ln -s lab/miyoung_8 h4kac4_gm06990
# format is antibody_cell
# These data sets relate to the 2007-01-30 data:
ln -s lab/ghia_8_2 baf155_k562
ln -s lab/ghia_10_1 baf170_k562
ln -s lab/ghia_14_1 baf47_k562
ln -s lab/ghia_15 baf47_hela
ln -s lab/ghia_24 stat1_hela_ifna
ln -s lab/miyoung_10 p65c_hela_tnfa
ln -s lab/miyoung_11 p65n_hela_tnfa
# format for the last 3 is antibody_cell_treatment
# ifna = interferon alpha, tnfa = tumor necrosis factor (TNF) alpha
# These are BED 4 files. Previously, we were sent BED 5 files with an ID
# to link to the DART database. Add a Sites name and add the False
# Discovery Rate as an extra column.
# Load the Sites track data:
# Create a bed5FloatScoreWithFdr.sql table with an extra integer data field
# for the False Discovery Rate (FDR), from the bed5FloatScore.sql definition
# in $HOME/kent/src/hg/lib. The False Discovery Rate is the lowest rate
# at which the item was included. Data for FDRs of 1, 5 and 10% were
# submitted for this dataset, so, for example, an item included at an FDR
# of 1% will also be included at FDRs of 5% and 10%; likewise, an item
# first included at 5% is also in the data generated with an FDR of 10%.
cat << 'EOF' > bed5FloatScoreWithFdr.sql
# bed5FloatScore.sql was originally generated by the autoSql program, which also
# generated bed5FloatScore.c and bed5FloatScore.h. This creates the database
# representation of an object which can be loaded and saved from RAM in a
# fairly automatic way.
# bed5FloatScore.sql was edited to create the bed5FloatScoreWithFdr.sql
# definition.
# BED 5 (with 0-1000 score), but also with floating-point score and false
# discovery rate (FDR).
CREATE TABLE bed5FloatScoreWithFdr (
    bin smallint not null,              # Index field
    chrom varchar(255) not null,        # Chromosome
    chromStart int unsigned not null,   # Start position in chromosome
    chromEnd int unsigned not null,     # End position in chromosome
    name varchar(255) not null,         # Name of item
    score int not null,                 # 0-1000 score for useScore shading
    floatScore float not null,          # Floating point score.
    fdr int not null,                   # False discovery rate
    # Indices
    INDEX(chrom(16),bin),
    INDEX(chrom(16),chromStart),
    INDEX(name(16))
);
'EOF'
# prepare data and load tables for the Sites track
# check first that the *hits*.bed files are all sorted, so that the comm
# command works correctly. If not, they should be sorted before running
# the loadSites.csh script.
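# One quick way to check (sketch; sort -c exits non-zero and complains
# about the first out-of-order line it finds):
foreach f (*_*/*hits*.bed)
    sort -c $f || echo "$f needs sorting"
end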
cat > loadSites.csh << 'EOF'
#!/bin/csh -ef
set pfx = encodeYaleChip
foreach d (pol2n_hela pol2n_gm06990 pol2_hela pol2_gm06990 h4kac4_hela h4kac4_gm06990 baf155_k562 baf170_k562 baf47_k562 stat1_hela_ifna p65c_hela_tnfa p65n_hela_tnfa)
    echo $d
    set Stim = ""
    if (($d =~ baf*) || ($d =~ pol2*) || ($d =~ h4kac4*)) then
        set factor = `echo $d | sed 's/_.*//'`
        set cell = `echo $d | sed 's/.*_//'`
        set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
        set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
        echo $Factor $Cell
    else
        set factor = `echo $d | sed 's/_.*//'`
        set cell = `echo $d | perl -wpe 's/^.*_(.*)_.*$/$1/'`
        set stim = `echo $d | perl -wpe 's/.*_.*_//' `
        set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
        set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
        set Stim = `echo $stim | perl -wpe 's/(.*)/\u$1/'`
        echo $Factor $Cell $Stim
    endif
    # first add FDR rate to Sites data files
    foreach f ($d/*hits*.bed)
        echo $f
        dos2unix $f
        if ($f =~ *hits_1.0.bed) then
            cp $f Sites1.txt
        else if ($f =~ *hits_5.0.bed) then
            cp $f hits5.txt
        else if ($f =~ *hits_10.0.bed) then
            cp $f hits10.txt
        endif
    end
    comm -13 Sites1.txt hits5.txt > Sites5.txt
    comm -13 hits5.txt hits10.txt > Sites10.txt
    awk 'BEGIN {OFS="\t";} {print $0, "1"}' Sites1.txt > SitesWithFdr1.txt
    awk 'BEGIN {OFS="\t";} {print $0, "5"}' Sites5.txt > SitesWithFdr5.txt
    awk 'BEGIN {OFS="\t";} {print $0, "10"}' Sites10.txt > SitesWithFdr10.txt
    cat SitesWithFdr1.txt SitesWithFdr5.txt SitesWithFdr10.txt > ${d}SitesFdr.txt
    rm hits*.txt Sites*.txt
    # load Sites tables
    set f = ${d}SitesFdr.txt
    set table = ${pfx}Sites$Factor$Cell$Stim
    echo $table
    sed -e "s/bed5FloatScoreWithFdr/$table/" \
        bed5FloatScoreWithFdr.sql > $table.sql
    sed 's/=/ /' $f | awk 'BEGIN {ct = 0} \
        {ct++; printf "%s\t%d\t%d\t%s\t%d\t%.3f\t%d\n",$1,$2-1,$3,"Site"ct,($4 * 330),$4,$5}' \
        | hgLoadBed -sqlTable=$table.sql hg17 $table stdin
end
'EOF'
chmod +x loadSites.csh
# load tables (hartera, 2007-06-23)
loadSites.csh >&! loadSites.log &
egrep 'encode|Loaded' *.log
encodeYaleChipSitesPol2nHela
Loaded 1000 elements of size 7
encodeYaleChipSitesPol2nGm06990
Loaded 1000 elements of size 7
encodeYaleChipSitesPol2Hela
Loaded 1000 elements of size 7
encodeYaleChipSitesPol2Gm06990
Loaded 1000 elements of size 7
encodeYaleChipSitesH4kac4Hela
Loaded 1000 elements of size 7
encodeYaleChipSitesH4kac4Gm06990
Loaded 1000 elements of size 7
encodeYaleChipSitesBaf155K562
Loaded 5 elements of size 7
encodeYaleChipSitesBaf170K562
Loaded 675 elements of size 7
encodeYaleChipSitesBaf47K562
Loaded 668 elements of size 7
encodeYaleChipSitesStat1HelaIfna
Loaded 1000 elements of size 7
encodeYaleChipSitesP65cHelaTnfa
Loaded 1000 elements of size 7
encodeYaleChipSitesP65nHelaTnfa
Loaded 1000 elements of size 7
# Added subtracks to trackDb/human/hg17/trackDb.encode.ra for
# the encodeYaleChipSites track. Release beta version created with just
# the existing tracks. Release alpha version includes the new subtracks.
# Added new antibodies and treatments to the description page. Created
# new description: encodeYaleChipSitesNew.html for release alpha in the
# trackDb entry.

# Load hits data for the Apr07 batch into the Sites track.
# The April data batch was originally submitted on 2007-03-27, but there
# was some missing data, so it was resubmitted on 2007-06-06 by
# guoneng.zhong@yale.edu
# (hartera, 2007-06-23)
cd /cluster/data/encode/yale/chip/2007-06-06
# symbolic links already made for the Pval and Signal data:
# smarca4_hela, smarca6_hela, nrsf_hela; format is antibody_cell
# copy over the MySQL table definition:
cp /cluster/data/encode/yale/chip/2007-03-20/bed5FloatScoreWithFdr.sql .
# check first that the *hits*.bed files are all sorted, so that the comm
# command works correctly. If not, they should be sorted before running
# the loadSites.csh script.
# create loading script:
cat > loadSites.csh << 'EOF'
#!/bin/csh -ef
set pfx = encodeYaleChip
foreach d (smarca4_hela smarca6_hela nrsf_hela)
    echo $d
    set factor = `echo $d | sed 's/_.*//'`
    set cell = `echo $d | sed 's/.*_//'`
    set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
    set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
    echo $Factor $Cell
    # first add FDR rate to Sites data files
    foreach f ($d/*hits*.bed)
        echo $f
        dos2unix $f
        if ($f =~ *hits_1.0.bed) then
            cp $f Sites1.txt
        else if ($f =~ *hits_5.0.bed) then
            cp $f hits5.txt
        else if ($f =~ *hits_10.0.bed) then
            cp $f hits10.txt
        endif
    end
    comm -13 Sites1.txt hits5.txt > Sites5.txt
    comm -13 hits5.txt hits10.txt > Sites10.txt
    awk 'BEGIN {OFS="\t";} {print $0, "1"}' Sites1.txt > SitesWithFdr1.txt
    awk 'BEGIN {OFS="\t";} {print $0, "5"}' Sites5.txt > SitesWithFdr5.txt
    awk 'BEGIN {OFS="\t";} {print $0, "10"}' Sites10.txt > SitesWithFdr10.txt
    cat SitesWithFdr1.txt SitesWithFdr5.txt SitesWithFdr10.txt > ${d}SitesFdr.txt
    rm hits*.txt Sites*.txt
    # load Sites tables
    set f = ${d}SitesFdr.txt
    set table = ${pfx}Sites$Factor$Cell
    echo $table
    sed -e "s/bed5FloatScoreWithFdr/$table/" \
        bed5FloatScoreWithFdr.sql > $table.sql
    sed 's/=/ /' $f | awk 'BEGIN {ct = 0} \
        {ct++; printf "%s\t%d\t%d\t%s\t%d\t%.3f\t%d\n",$1,$2-1,$3,"Site"ct,($4 * 330),$4,$5}' \
        | hgLoadBed -sqlTable=$table.sql hg17 $table stdin
end
'EOF'
chmod +x loadSites.csh
loadSites.csh >&! loadSites.log &
egrep 'encode|Loaded' loadSites.log
encodeYaleChipSitesSmarca4Hela
Loaded 470 elements of size 7
encodeYaleChipSitesSmarca6Hela
Loaded 1000 elements of size 7
encodeYaleChipSitesNrsfHela
Loaded 1639 elements of size 7
# Add new subtracks to the encodeYaleChipSites release alpha
# trackDb.encode.ra entry and update the description as for the Nov06 and
# Jan07 data - see above.

# Load hits data for the Jun07 batch into the Sites track.
# Data was submitted on 2007-06-15 by guoneng.zhong@yale.edu
# (hartera, 2007-06-23)
cd /cluster/data/encode/yale/chip/2007-06-15
# symbolic link already made for the Pval and Signal data:
# h3k27me3_hela; format is antibody_cell
# copy over the MySQL table definition:
cp /cluster/data/encode/yale/chip/2007-03-20/bed5FloatScoreWithFdr.sql .
# check first that the *hits*.bed files are all sorted, so that the comm
# command works correctly. If not, they should be sorted before running
# the loadSites.csh script.
# create loading script:
cat > loadSites.csh << 'EOF'
#!/bin/csh -ef
set pfx = encodeYaleChip
foreach d (h3k27me3_hela)
    echo $d
    set factor = `echo $d | sed 's/_.*//'`
    set cell = `echo $d | sed 's/.*_//'`
    set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
    set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
    echo $Factor $Cell
    # first add FDR rate to Sites data files
    foreach f ($d/*hits*.bed)
        echo $f
        dos2unix $f
        if ($f =~ *hits_1.0.bed) then
            cp $f Sites1.txt
        else if ($f =~ *hits_5.0.bed) then
            cp $f hits5.txt
        else if ($f =~ *hits_10.0.bed) then
            cp $f hits10.txt
        endif
    end
    comm -13 Sites1.txt hits5.txt > Sites5.txt
    comm -13 hits5.txt hits10.txt > Sites10.txt
    awk 'BEGIN {OFS="\t";} {print $0, "1"}' Sites1.txt > SitesWithFdr1.txt
    awk 'BEGIN {OFS="\t";} {print $0, "5"}' Sites5.txt > SitesWithFdr5.txt
    awk 'BEGIN {OFS="\t";} {print $0, "10"}' Sites10.txt > SitesWithFdr10.txt
    cat SitesWithFdr1.txt SitesWithFdr5.txt SitesWithFdr10.txt > ${d}SitesFdr.txt
    rm hits*.txt Sites*.txt
    # load Sites tables
    set f = ${d}SitesFdr.txt
    set table = ${pfx}Sites$Factor$Cell
    echo $table
    sed -e "s/bed5FloatScoreWithFdr/$table/" \
        bed5FloatScoreWithFdr.sql > $table.sql
    sed 's/=/ /' $f | awk 'BEGIN {ct = 0} \
        {ct++; printf "%s\t%d\t%d\t%s\t%d\t%.3f\t%d\n",$1,$2-1,$3,"Site"ct,($4 * 330),$4,$5}' \
        | hgLoadBed -sqlTable=$table.sql hg17 $table stdin
end
'EOF'
chmod +x loadSites.csh
loadSites.csh >&! loadSites.log &
egrep 'encode|Loaded' loadSites.log
encodeYaleChipSitesH3k27me3Hela
Loaded 2553 elements of size 7
# Add new subtracks to the encodeYaleChipSites release alpha
# trackDb.encode.ra entry and update the description as for the Nov06 and
# Jan07 data - see above.
# GEO accessions for each experiment were added to the descriptions for
# the Yale Chip Pval, Signal and Sites tracks, in the table of the
# antibody and antibody target descriptions.
# scratch notes for the FDR tagging and comm steps used above:
foreach b (1 5 10)
    awk -v FDR=$b 'BEGIN {OFS="\t"} {print $0, FDR;}' Sites${b}.bed
end
comm -13 70561_70573_70629-hits_1.0.bed 70561_70573_70629-hits_5.0.bed \
    > hits5.0.bed
comm -13 70561_70573_70629-hits_5.0.bed 70561_70573_70629-hits_10.0.bed \
    > hits10.0.bed
# Load hits data for Apr07 batch: done above.
# Load hits data for Jun07 batch: done above.

##########################################################################
# UTexas STAGE (2005-10-31, 11-17 kate)
# Submitted 10/15 by Akshay Bhinge
# Resubmitted 11/17
# 2 files - raw and peaks, for c-Myc in HeLa
# range .001 to 1.0. Peaks restricted to >.8
# Adjusted data in Tags file: set score=1 items to 300 so
# they'll be visible with the gray-scale tags requested by Akshay.
# (This is why it's loaded as blocked bed). Huh ??
#
# New data (raw tags for STAT1 in HeLa) submitted 2006-10-16

#cd /cluster/data/encode/UTexas/stage/2005-10-15
cd /cluster/data/encode/UTexas/stage/2005-11-17
grep '^chr' lab/myc.tag.prob.bed | \
    awk '{if ($5 == 1) $5 = 300; \
        printf("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t0\t1\t%d,\t0\n", \
            $1, $2, $3, $4, $5, $6, $2, $3, $3 - $2)}' | \
    hgLoadBed -noBin -strict hg17 encodeUtexStageMycHelaTags stdin
# 813 elements
grep '^chr' lab/myc.stage.peaks.bed | \
    hgLoadBed -noBin -strict hg17 \
        encodeUtexStageMycHelaPeaks stdin
# 26 elements
# Created composite track with 2 subtracks

# 2006-10-30
# Reload cMyc data in simple bed w/o score adjustment
# and load new data
cd /cluster/data/encode/UTexas/stage
mkdir -p 2006-10-16/lab
rm latest; ln -s 2006-10-16 latest
cd latest
grep '^chr' lab/stat1.tags.ucsc.bed | \
    awk '{printf("%s\t%d\t%d\t%s\t%d\n",$1,$2,$3,$4,$5)}' | \
    hgLoadBed -noBin -strict hg17 encodeUtexStageStat1HelaTags stdin
# Loaded 937 elements of size 6
checkTableCoords hg17 encodeUtexStageStat1HelaTags
cd ../2005-11-17
grep '^chr' lab/myc.tag.prob.bed | \
    awk '{printf("%s\t%d\t%d\t%s\t%d\n",$1,$2,$3,$4,$5)}' | \
    hgLoadBed -noBin -strict hg17 encodeUtexStageCMycHelaTags stdin

##########################################################################
# Univ. Uppsala, Sweden Chip/chip
# Submitted by Claes & Ola
# 4 files with chrom, start, end, integer score
# Sites file in GFF format
# NOTE: this was submitted in hg16, without notifying us.
# I'm reloading after lifting, 2005-12-05
cd /cluster/data/encode/Uppsala/2005-11-07
/cluster/data/encode/bin/scripts/splitTracks.pl lab/chip.wig
mv t0 Usf1.wig
mv t1 Hnf3b.wig
mv t2 Hnf4a.wig
mv t3 Ach3.wig
mv t4 Sites.gff
# load data for individual factors
# NOTE: rounded overly long float scores
cat > load.csh << 'EOF'
foreach factor (Usf1 Hnf3b Hnf4a Ach3)
    awk '/^chr/ {printf("%s\t%s\t%s\t%.3f\n", $1, $2, $3, $4)}' $factor.wig |\
        liftOver stdin /cluster/data/encode/convertHg17/hg16ToHg17.chain \
            $factor.hg17.bed $factor.unmapped
    hgLoadBed -strict -bedGraph=4 hg17 encodeUppsalaChip$factor $factor.hg17.bed
end
'EOF'
csh load.csh >&! load.log &
# sites (they refer to these as Tentative Binding Sites)
# NOTE: I added an item name, of the form "uutbs.#"
grep -v track Sites.gff | sort -k1,1 -k2,2n | \
    awk '{printf ("%s\t%d\t%d\tuutbs.%d\t%d\n", $1, $4, $5, NR, $6)}' | \
    liftOver stdin /cluster/data/encode/convertHg17/hg16ToHg17.chain \
        sites.hg17.bed sites.unmapped
hgLoadBed -noSort -noBin -strict hg17 encodeUppsalaChipSites sites.hg17.bed
# Loaded 327 elements of size 5

##########################################################################
# MSA tracks from Sept. 2005 freeze
# Use links from Wiki for data submission (as per Elliott Margulies)
# NOTE: mapping of sequence name to assembly is in column 7 of
# metadata.txt file in Elliott's MSA release
# Assemblies in this freeze are: canFam1 danRer2 fr1 galGal2 mm6
# monDom1 panTro1 rheMac1 rn3 tetNig1
# NOTE: reloaded phastCons scores (previously only manual regions
# were loaded) (2006-05-03 kate)
# Reloaded elements with updated files from Elliott (2006-06-22 kate)

# TBA alignments
cd /cluster/data/encode/TBA
mkdir -p SEP-05/lab
cd SEP-05/lab
wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/tba.v2.maf.tar
cd ..
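# (optional sanity check, not in the original doc): list the source species
# names in the downloaded MAFs, to confirm the sed renames below cover them all
gunzip -c lab/*/*.maf.gz | awk '$1 == "s" {split($2, a, "."); print a[1]}' | sort -u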
foreach f (lab/*/*.maf.gz)
    echo $f
    gunzip -c $f | \
        sed 's/^s human\./s hg17./; s/^s dog\./s canFam1./; \
            s/^s zebrafish\./s danRer2./; s/^s fugu\./s fr1./; \
            s/^s chicken\./s galGal2./; s/^s mouse\./s mm6./; \
            s/^s monodelphis\./s monDom1./; s/^s chimp\./s panTro1./; \
            s/^s macaque\./s rheMac1./; s/^s rat\./s rn3./; \
            s/^s tetraodon\./s tetNig1./;' \
        > $f:t:r:r:e.maf
end
set gdir = /gbdb/hg17/encode/TBA/maf
mkdir -p $gdir
rm -f $gdir/*.maf
ln -s /cluster/data/encode/TBA/SEP-05/*.maf $gdir
hgLoadMaf -pathPrefix=$gdir -WARN hg17 encodeTbaAlign >&! load.log
# lots of "score too small" messages -- these are OK.
cat *.maf | hgLoadMafSummary hg17 encodeTbaSummary stdin

# create tree image:
# edit tree.nh to create species.nh with common names
cd /cluster/data/encode/MSA/SEP-2005
mkdir phylo
cd phylo
wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/phylo/tree_4d.tba.v2.nh
/cluster/bin/phast/draw_tree -b -s tree_4d.tba.v2.nh > species28.ps
# photoshop to enhance, then save as gif/jpg
cp /cluster/data/encode/MSA/SEP-2005/phylo/species28.jpg \
    /usr/local/apache/htdocs/images/phylo/species28.jpg

# MLAGAN alignments
cd /cluster/data/encode/MLAGAN
mkdir -p SEP-05/lab
cd SEP-05/lab
wget http://ai.stanford.edu/~asimenos/ENCODE_Oct-2005_maf.tgz
cd ..
cat > project.csh << 'EOF'
mkdir -p tmp
set tmpDir = tmp
foreach d (lab/EN[mr]*)
    set r = $d:t
    echo $r
    set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \
        hgsql -N hg17`
    set start = \
        `echo "SELECT chromStart from encodeRegions WHERE name='$r'" | \
        hgsql -N hg17`
    set size = \
        `echo "SELECT size from chromInfo WHERE chrom='$c'" | \
        hgsql -N hg17`
    /cluster/data/encode/MLAGAN/mafCoord.pl < $d/$r.maf \
        human.1 hg17.$c $start $size | \
        sed 's/^a$/a score=0.0/' > $tmpDir/$r.db.maf
    echo "projecting $r"
    /cluster/bin/penn/maf_project $tmpDir/$r.db.maf hg17.$c > $r.maf
    echo "finished $r"
end
'EOF'
set gdir = /gbdb/hg17/encode/MLAGAN/SEP-05/maf
mkdir -p $gdir
rm -f $gdir/*.maf
ln -s /cluster/data/encode/MLAGAN/SEP-05/*.maf $gdir
hgLoadMaf -pathPrefix=$gdir -WARN hg17 encodeMlaganAlign >&! load.log
# lots of "score too small" messages -- these are OK.
cat *.maf | hgLoadMafSummary hg17 encodeMlaganSummary stdin

# MAVID alignments
cd /cluster/data/encode/MAVID
mkdir -p SEP-05/lab
cd SEP-05/lab
wget http://hanuman.math.berkeley.edu/~cdewey/encode/alignments/ENCODE_SEP-2005_MAVID_MAF_ABS.tar.gz
cd ..
cat > project.csh << 'EOF'
set tmpDir = tmp
mkdir $tmpDir
foreach f (lab/ABS/*.maf)
    set r = $f:t:r
    echo $r
    set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \
        hgsql -N hg17`
    sed 's/^a$/a score=0.0/; s/^s *human/s hg17/' $f > $tmpDir/$r.maf
    echo "projecting $r"
    /cluster/bin/penn/maf_project $tmpDir/$r.maf hg17.$c > $r.maf
    echo "finished $r"
end
'EOF'
set gdir = /gbdb/hg17/encode/MAVID/SEP-05/maf
mkdir -p $gdir
rm -f $gdir/*.maf
ln -s /cluster/data/encode/MAVID/SEP-05/*.maf $gdir
hgLoadMaf -pathPrefix=$gdir -WARN hg17 encodeMavidAlign >&! load.log
cat *.maf | hgLoadMafSummary hg17 encodeMavidSummary stdin

# conserved elements
# Scores: binCons are all 1000, gerp range is 6.75 - 4813.26
# phastCons is 10-18088
# Force gerp to integer for consistent table format, but don't
# bother scaling at this point (and don't use to score on display)
# For some reason, phastCons has + strand -- strip this out
# NOTE: coords are ENCODE-region based, so need to adjust
# by start of region (Elliott used custom tracks offset= to do this).
# NOTE: Updated GERP elements 2/1/06, with new data from Greg Cooper
# overwriting Elliott's elements.
This is doc'ed in the GERP section.
cd /cluster/data/encode/MSA
mkdir -p SEP-05/elements.2005-12-12/lab
cd SEP-05/elements.2005-12-12/lab
wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/cons/target.align.conservation.v1.tar.gz

# data update from Elliott, to fix off-by-one start coords
cd /cluster/data/encode/MSA
mkdir -p SEP-05/elements.2006-06-22/lab
cd SEP-05/elements.2006-06-22/lab
# copy in align_elements_tracks.tar.gz
# contains 9 tracks of elements (3 aligners * binCons, gerp, phastCons)
cd ..
cat > load.csh << 'EOF'
foreach f (lab/*.bed)
    set root = $f:t:r
    set align = `echo $root:e | perl -wpe 's/(.*)/\u$1/'`
    set cons = `echo $root:r | perl -wpe 's/(.*)/\u$1/'`
    set table = encode${align}${cons}El
    hgLoadBed -strict hg17 $table $f
end
'EOF'
csh load.csh >&! load.log &

# CONSENSUS ELEMENTS
cd /cluster/data/encode/MSA
mkdir -p SEP-05/consensus/lab
cd SEP-05/consensus/lab
wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/cons/consensus.conservation.v1.tar.gz
cd ..
ln -s lab/or.or.bed MsaElUnion.bed
ln -s lab/and.and.bed MsaElIntersect.bed
ln -s lab/two.two.bed MsaElModerate.bed
cat > load.csh << 'EOF'
foreach f (MsaEl*.bed)
    echo $f
    set b = $f:r
    set t = encode$b
    hgLoadBed -strict -noBin hg17 $t $f
end
'EOF'
csh load.csh >&! load.log
#Reading MsaElIntersect.bed
#Loaded 30645 elements of size 4
#Reading MsaElModerate.bed
#Loaded 36793 elements of size 4

# conservation
cd /cluster/data/encode/MSA
mkdir -p SEP-05/conservation/lab
cd SEP-05/conservation/lab
wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/cons/phastCons.wig.tar.gz
cd ..
cat > load.csh << 'EOF'
# TBA
gunzip -c lab/tba/*/phast/human.EN*.gz | \
    wigEncode stdin tbaPhastCons.wig tbaPhastCons.wib
set d = /gbdb/hg17/encode/TBA/SEP-05
ln -s `pwd`/tbaPhastCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeTbaPhastCons tbaPhastCons.wig
# MLAGAN
gunzip -c lab/mlagan/*/phast/human.EN*.gz | \
    wigEncode stdin mlaganPhastCons.wig mlaganPhastCons.wib
set d = /gbdb/hg17/encode/MLAGAN/SEP-05
ln -s `pwd`/mlaganPhastCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeMlaganPhastCons mlaganPhastCons.wig
# MAVID
gunzip -c lab/mavid/*/phast/human.EN*.gz | \
    wigEncode stdin mavidPhastCons.wig mavidPhastCons.wib
set d = /gbdb/hg17/encode/MAVID/SEP-05
ln -s `pwd`/mavidPhastCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeMavidPhastCons mavidPhastCons.wig
'EOF'
csh load.csh >&! load.log &

##########################################################################
# MSA GERP Conservation (2006-02-06 kate)
# Submitted 2/1/06 by Greg Cooper
cd /cluster/data/encode/MSA/Gerp
mkdir -p 2006-02-01/lab
cd 2006-02-01/lab
wget http://baumbox.stanford.edu/~coopergm/ENCODE/GERP_Cons_SepFreeze_Jan.zip
unzip GERP_Cons_SepFreeze_Jan.zip
cd ..
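# The three per-aligner blocks below repeat one pattern; an equivalent csh
# loop (a sketch of the same steps, not what was actually run):
#   foreach prog (tba mlagan mavid)
#       set Prog = `echo $prog | perl -wpe 's/(.*)/\u$1/'`
#       set PROG = `echo $prog | tr 'a-z' 'A-Z'`
#       cat lab/chr*_GERP_${PROG}_scores.wig | \
#           wigEncode stdin ${prog}GerpCons.wig ${prog}GerpCons.wib
#       set d = /gbdb/hg17/encode/$PROG/SEP-05
#       ln -s `pwd`/${prog}GerpCons.wib $d
#       hgLoadWiggle -pathPrefix=$d hg17 encode${Prog}GerpCons ${prog}GerpCons.wig
#   end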
# TBA
cat lab/chr*_GERP_TBA_scores.wig | \
    wigEncode stdin tbaGerpCons.wig tbaGerpCons.wib
# upper limit 4.48, lower limit -29.86
set d = /gbdb/hg17/encode/TBA/SEP-05
ln -s /cluster/data/encode/MSA/Gerp/2006-02-01/tbaGerpCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeTbaGerpCons tbaGerpCons.wig

# MLAGAN
cat lab/chr*_GERP_MLAGAN_scores.wig | \
    wigEncode stdin mlaganGerpCons.wig mlaganGerpCons.wib
# upper limit 4.48, lower limit -25.74
set d = /gbdb/hg17/encode/MLAGAN/SEP-05
ln -s /cluster/data/encode/MSA/Gerp/2006-02-01/mlaganGerpCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeMlaganGerpCons mlaganGerpCons.wig

# MAVID
cat lab/chr*_GERP_MAVID_scores.wig | \
    wigEncode stdin mavidGerpCons.wig mavidGerpCons.wib
# upper limit 4.48, lower limit -22.58
set d = /gbdb/hg17/encode/MAVID/SEP-05
ln -s /cluster/data/encode/MSA/Gerp/2006-02-01/mavidGerpCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeMavidGerpCons mavidGerpCons.wig

# Elements.  Note: scores run from 307-1000.  This data also
# includes a 6th field with an unscaled float score, which
# will be included in the table but not used for display.
# Adding item names (<region>.#) for consistency with other MSA elements
# subtracks:
#   lab/GERP_TBA_Cons.bed

# Post wiggles for downloads (2007-04-16 kate)
ssh kkstore03
cd /cluster/data/encode/MSA/Gerp
cd 2006-02-01
mkdir downloads
cat > makeDownloads.csh << 'EOF'
foreach prog (TBA MLAGAN MAVID)
    echo $prog
    ls lab/chr*_${prog}_*.wig | \
        sed 's/lab\/chr//' | sort -n | sed 's/^/lab\/chr/' | \
        xargs cat | \
        gzip -c -4 > downloads/GERP_${prog}.scores.wig.gz
end
'EOF'
# << happy emacs
csh makeDownloads.csh >&! makeDownloads.log &

##########################################################################
# MSA SCONE Conservation (2005-12-12 kate)
# From Harvard Med School, Saurabh Asthana
# Resubmitted 12/21/05
# Resubmitted 6/22/07 (kate)
cd /cluster/data/encode/MSA
mkdir -p SconeCons/2005-12-21/lab
ln -s SconeCons/2005-12-21 latest
cd latest/lab
mkdir bed; cd bed
wget http://genetics.bwh.harvard.edu/graft/bed/sconeRegions.NOV-2005.bed.tar.bz2
bunzip2 sconeRegions.NOV-2005.bed.tar.bz2
tar xvf sconeRegions.NOV-2005.bed.tar
cd ..
cd ..; mkdir wig; cd wig
wget http://ika.bwh.harvard.edu/graft/wig/scone.NOV-2005.wig.tar.bz2
bunzip2 scone.NOV-2005.wig.tar.bz2
tar xvf scone.NOV-2005.wig.tar
cd ../..
# elements
cat > load.csh << 'EOF'
set out = sconeRegions.bed
rm -f $out
foreach f (lab/bed/*.bed)
    set r = $f:t:r
    echo $r
    grep '^chr' $f | \
        awk '{printf("%s\t%d\t%d\t%s\t1000\n", \
            $1,$2,$3,$4)}' >> $out
end
hgLoadBed -strict hg17 encodeTbaSconeEl $out
'EOF'
csh load.csh >&! load.log &
# Loaded 18817 elements
featureBits -enrichment hg17 encodeRegions encodeTbaSconeEl
# encodeRegions 1.047%, encodeTbaSconeEl 0.083%, both 0.083%, cover 7.92%, enrich 95.55x
featureBits -enrichment hg17 encodeRegions encodeTbaPhastConsEl
# encodeRegions 1.047%, encodeTbaPhastConsEl 0.063%, both 0.063%, cover 6.04%, enrich 95.55x
featureBits -enrichment hg17 encodeRegions encodeTbaGerpEl
# encodeRegions 1.047%, encodeTbaGerpEl 0.057%, both 0.057%, cover 5.47%, enrich 95.55x
featureBits -enrichment hg17 encodeRegions encodeTbaBinConsEl
# encodeRegions 1.047%, encodeTbaBinConsEl 0.060%, both 0.060%, cover 5.71%, enrich 95.55x

# conservation
cat lab/wig/*.wig | \
    wigEncode stdin tbaScone.wig tbaScone.wib
set d = /gbdb/hg17/encode/TBA/SEP-05
ln -s `pwd`/tbaScone.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeTbaSconeCons tbaScone.wig

# Resubmitted 6/22/07 (kate)
ssh kkstore03
cd /cluster/data/encode/MSA/SconeCons
mkdir -p 2007-06-22/lab
rm latest
ln -s 2007-06-22 latest
cd latest/lab
wget -r -nd http://ika.bwh.harvard.edu/graft/tracks/pvalue.SCONE.wig.tar.bz2
bunzip2 pvalue.SCONE.wig.tar.bz2
tar xvf pvalue.SCONE.wig.tar
cd ..

# conservation
ssh hgwdev
cd /cluster/data/encode/MSA/SconeCons/latest
cat lab/*.wig | \
    wigEncode stdin tbaScone.wig tbaScone.wib
set d = /gbdb/hg17/encode/TBA/SEP-05/Update/Scone
mkdir -p $d
ln -s /cluster/data/encode/MSA/SconeCons/2007-06-22/tbaScone.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeTbaSconeConsUpdate tbaScone.wig

# elements
ssh kkstore03
cd /cluster/data/encode/MSA/SconeCons
cd 2007-06-22/lab
mkdir bed
cd bed
wget -r -nd http://ika.bwh.harvard.edu/graft/tracks/elements.SCONE.bed.tar.bz2
bunzip2 elements.SCONE.bed.tar.bz2
tar xvf elements.SCONE.bed.tar
cd ..
cat > load.csh << 'EOF'
set out = sconeRegions.bed
rm -f $out
foreach f (lab/bed/*.bed)
    set r = $f:t:r
    echo $r
    grep '^chr' $f | \
        awk '{printf("%s\t%d\t%d\t%s\t1000\n", \
            $1,$2,$3,$4)}' >> $out
end
'EOF'
csh load.csh >&! load.log &
ssh hgwdev
cd /cluster/data/encode/MSA/SconeCons/latest
hgLoadBed hg17 encodeTbaSconeElUpdate sconeRegions.bed
# Loaded 33293 elements of size 5
# previous: Loaded 18817 elements

##########################################################################
# MSA Conservation (2005-12-07 kate)
# Just phastCons and GERP for this freeze (x3 aligners)
cd /cluster/data/encode/MSA/SEP-05
mkdir -p conservation/lab
cd conservation/lab
wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/cons/phastCons.wig.tar.gz
tar xvfz phastCons.wig.tar.gz
cd ..
cat > load.csh << 'EOF'
# TBA
gunzip -c lab/tba/*/phast/human.ENm*.gz | \
    wigEncode stdin tbaPhastCons.wig tbaPhastCons.wib
set d = /gbdb/hg17/encode/TBA/SEP-05
ln -s `pwd`/tbaPhastCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeTbaPhastCons tbaPhastCons.wig
# MLAGAN
gunzip -c lab/mlagan/*/phast/human.ENm*.gz | \
    wigEncode stdin mlaganPhastCons.wig mlaganPhastCons.wib
set d = /gbdb/hg17/encode/MLAGAN/SEP-05
ln -s `pwd`/mlaganPhastCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeMlaganPhastCons mlaganPhastCons.wig
# MAVID
gunzip -c lab/mavid/*/phast/human.ENm*.gz | \
    wigEncode stdin mavidPhastCons.wig mavidPhastCons.wib
set d = /gbdb/hg17/encode/MAVID/SEP-05
ln -s `pwd`/mavidPhastCons.wib $d
hgLoadWiggle -pathPrefix=$d hg17 encodeMavidPhastCons mavidPhastCons.wig
'EOF'
csh load.csh >&! load.log
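# (suggested check, not in the original doc): confirm the three per-aligner
# phastCons wiggle tables are in place
hgsql hg17 -N -e "SHOW TABLES LIKE 'encode%PhastCons'"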
##########################################################################
# MSA alignment agreement
# From Ariel Schwartz, UC Berkeley
cd /cluster/data/encode/MSA
mkdir alignAgreement/2005-11-16/lab
cd alignAgreement/2005-11-16/lab
# data files deposited in lab
cd ..
touch Mean.wig MavidMlagan.wig MavidTba.wig MlaganTba.wig
touch MavidUngapped.wig MlaganUngapped.wig TbaUngapped.wig
cat > split.csh << 'EOF'
foreach f (lab/*.wig.gz)
    echo $f
    gunzip $f
    /cluster/data/encode/bin/scripts/splitTracks.pl $f:r
    cat t0 >> Mean.wig
    cat t1 >> MavidMlagan.wig
    cat t2 >> MavidTba.wig
    cat t3 >> MlaganTba.wig
    cat t4 >> MavidUngapped.wig
    cat t5 >> MlaganUngapped.wig
    cat t6 >> TbaUngapped.wig
    rm t?
    gzip $f:r
end
'EOF'
csh split.csh >&! split.log &
mkdir wig wib
cat > load.csh << 'EOF'
set dir = /gbdb/hg17/encode/MSA/alignAgree/2005-11-16
mkdir -p $dir
foreach f (*.wig)
    set table = encodeMsaAlign$f:r
    echo $table
    egrep -v "browser|track" $f | \
        wigEncode stdin wig/$table.wig wib/$table.wib
    hgLoadWiggle -pathPrefix=$dir hg17 $table wig/$table.wig
    ln -s `pwd`/wib/$table.wib $dir
end
'EOF'
csh load.csh >&! load.log &

##########################################################################
# Harvard TBA Conservation (2005-12-12 kate)
# From Saurabh Asthana
# Dept. of Medicine, Brigham & Women's Hospital, Harvard Medical School
cd /cluster/data/encode/MSA/SconeCons
mkdir -p 2005-12-01/lab
cd 2005-12-01/lab
wget http://ika.bwh.harvard.edu/graft/wig/scone.NOV-2005.wig.tar.bz2
wget http://genetics.bwh.harvard.edu/graft/bed/sconeRegions.NOV-2005.bed.tar.bz2
mkdir -p bed wig
# NOTE: files are actually gzipped
mv scone.NOV-2005.wig.tar.bz2 wig/scone.wig.tar.gz
mv sconeRegions.NOV-2005.bed.tar.bz2 bed/sconeRegions.bed.tar.gz
cd ..

# Conservation scores
cat lab/wig/*.wig | grep -v track | \
    wigEncode stdin tbaScone.wig tbaScone.wib

# Conserved Elements
# Add these to the TBA Elements track as a subtrack
# For table consistency, assign item names of the form <region>.#,
# and a score=1000
set bed = sconeRegions.bed
rm -f $bed
foreach f (lab/bed/*.bed)
    set r = $f:t:r
    echo $r
    grep '^chr' $f | \
        awk -v REGION=$r '{printf("%s\t%d\t%d\t%s.%d\t%d\n", \
            $1,$2,$3,REGION, NR,1000)}' >> $bed
end
hgLoadBed -strict hg17 encodeTbaSconeEl $bed
# Loaded 18784 elements of size 5

##########################################################################
# UW/Regulome Chromatin Accessibility Profiling (CAP) - RENAMED, see below
# Submitted 2006-1-17 by Scott Kuehn
# update of data received on 2006-05-04 (sent to Kate) by Scott Kuehn
# Update done 2006-05-19 - 2006-05-23 (hartera)
# Not called CAP anymore, now called DNase array for DNase I
# Track short label is now: UW DNase GM
# long label: ENCODE UW DNase/Array GM06990 - DNase I
# sensitivity/hypersensitivity in GM06990 Cells
# Data is for lymphoblastoid cells (GM06990).
# Updated long label in trackDb.ra to: UW Array DNase I
# sensitivity/hypersensitivity in GM06990 Cells (hartera, 2007-02-26)
cd /cluster/data/encode/Regulome
mkdir -p 2006-05-04/lab
cd 2006-05-04
awk '{printf("%s\t%s\t%s\t%.3f\n", $1, $2, $3, $5)}' \
    lab/Encode.DNase-Array-GM06990.Probes.hg17.bed | \
    sort -k1,1 -k2,2n | \
    /cluster/data/encode/bin/scripts/trimOverlap.pl > sens.bed
hgLoadBed -strict -bedGraph=4 hg17 encodeRegulomeDnaseGM06990Sens sens.bed
# the Encode.DNase-Array-GM06990.DHSs.hg17.bed file has a float score
# use the encodeRegulomeDnaseSitesSKNSH.sql renamed as sites.sql which
# has an int and a float score field
perl -pi.bak -e 's/SitesSKNSH/GM06990Sites/' sites.sql
rm *.bak
# scale scores to 0-1000. use linear transform.
awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,($5 * 105),$5}' \
    lab/Encode.DNase-Array-GM06990.DHSs.hg17.bed \
    | sort -k1,1 -k2,2n > linearScaledSites.bed
hgLoadBed -sqlTable=sites.sql hg17 encodeRegulomeDnaseGM06990Sites \
    linearScaledSites.bed
# authors provided track-description.html
# Add this to trackDb/human/hg17 as encodeRegulomeDnaseArray.html
# trackDb entry - track is renamed as encodeRegulomeDnaseArray
# Previously track was called encodeRegulomeCap.

##########################################################################
# SANGER CHIP/CHIP (2006-03-16 kate)
# Updated (2006-08-08 kate)
# 5 histone mods in HFL cells, to be added to existing track
# Submitted by Rob Andrews
# Data in two additional cell lines (MOLT4 and PTR8) submitted
# 8/8/06 by Rob -- 8 additional subtracks.
# Updated (DONE, 2007-01-09, hartera)
# 4 histone mods in GM06990 cells and data for CTCF antibody (CCCTC-binding
# factor (zinc finger protein)) also in GM06990 to be added to existing track.
# Total of 5 new subtracks submitted by Rob Andrews: rma@sanger.ac.uk
# Update of the above 5 new subtracks (DONE, 2007-01-18, hartera)
# The 2007-01-09 update had incorrect data. Corrected data was received
# on 2007-01-18 for the five new subtracks and the track was updated.
# New data was submitted by Christopher Koch: cmk@sanger.ac.uk
ssh hgwdev
cd /cluster/data/encode/sanger/chipchip
mkdir -p 2006-03-16/lab
cd 2006-03-16
cp /var/ftp/encode/*.wig.txt lab
grep "^chr" lab/H3K4me1_HFL-1_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me1HFL1.bed
grep "^chr" lab/H3K4me2_HFL-1_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me2HFL1.bed
grep "^chr" lab/H3K4me3_HFL-1_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me3HFL1.bed
grep "^chr" lab/H3ac_HFL-1_1.wig.txt | sort -k1,1 -k2,2n > \
    H3acHFL1.bed
grep "^chr" lab/H4ac_HFL-1_1.wig.txt | sort -k1,1 -k2,2n > \
    H4acHFL1.bed
cat > load.csh << 'EOF'
foreach f (*.bed)
    set t = $f:r
    echo $t
    hgLoadBed -bedGraph=4 hg17 encodeSangerChip$t $t.bed
end
'EOF'
csh load.csh >&! load.log &
# loaded 23996 elements for 5 tables

ssh hgwdev
cd /cluster/data/encode/sanger/chipchip
mkdir -p 2006-08-08/lab
cd 2006-08-08
cp /var/ftp/encode/*.wig.txt lab
grep "^chr" lab/H3K4me1_PTR8_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me1Ptr8.bed
grep "^chr" lab/H3K4me2_PTR8_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me2Ptr8.bed
grep "^chr" lab/H3K4me3_PTR8_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me3Ptr8.bed
grep "^chr" lab/H3K4me1_MOLT4_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me1Molt4.bed
grep "^chr" lab/H3K4me2_MOLT4_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me2Molt4.bed
grep "^chr" lab/H3K4me3_MOLT4_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K4me3Molt4.bed
grep "^chr" lab/H3ac_MOLT4_1.wig.txt | sort -k1,1 -k2,2n > \
    H3acMolt4.bed
grep "^chr" lab/H4ac_MOLT4_1.wig.txt | sort -k1,1 -k2,2n > \
    H4acMolt4.bed
cat > load.csh << 'EOF'
foreach f (*.bed)
    set t = $f:r
    echo $t
    hgLoadBed -bedGraph=4 hg17 encodeSangerChip$t $t.bed
end
'EOF'
csh load.csh >&! load.log &
# loaded 23983 elements for 8 tables
# update trackDb by adding these additional subtracks.
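# (suggested check, not in the original doc): list the Sanger chip/chip
# subtrack tables loaded so far, and spot-check a row count
hgsql hg17 -N -e "SHOW TABLES LIKE 'encodeSangerChip%'"
hgsql hg17 -N -e "SELECT COUNT(*) FROM encodeSangerChipH3acMolt4"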
# replaced the 5 new subtracks with corrected data (2007-01-18)
ssh hgwdev
cd /cluster/data/encode/sanger/chipchip
mkdir -p 2007-01-18/lab
rm latest
ln -s /cluster/data/encode/sanger/chipchip/2007-01-18 \
    /cluster/data/encode/sanger/chipchip/latest
cd latest
mv /var/ftp/encode/*.wig.txt lab
grep "^chr" lab/CTCF_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
    CTCF.bed
grep "^chr" lab/H3K27me3_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K27me3.bed
grep "^chr" lab/H3K36me3_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K36me3.bed
grep "^chr" lab/H3K79me3_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K79me3.bed
grep "^chr" lab/H3K9me3_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
    H3K9me3.bed
# load data into hg17 database
cat > load.csh << 'EOF'
foreach f (*.bed)
    set t = $f:r
    echo $t
    hgLoadBed -bedGraph=4 hg17 encodeSangerChip$t $t.bed
end
'EOF'
csh load.csh >&! load.log &
# update trackDb/human/hg17/trackDb.encode.ra to add these new subtracks
# to the encodeSangerChip track. update the encodeSangerChip.html to
# list the new antibodies used and links for them from the information on
# the Sanger data access page:
# http://www.sanger.ac.uk/PostGenomics/encode/data-access.shtml

#######################################################################
# ENCODE PSEUDOGENE TRACK (DONE, 2006-03-30, hartera)
# Yontao reloaded the encodePseudogeneUcsc2 table with shorter
# names for the pseudogenes as they were cut off in the browser so now
# NM_001017421|chr2|+|1 would be NM_001017421|1
# The class table needs to be reloaded. Yontao provided a file:
# encodePseudogeneUcsc2-forload.class
# get a dump of the current table without the ucsc2 entries.
ssh hgwdev
cd /cluster/data/encode/pseudogene/class
hgsql -N -e 'select * from encodePseudogeneClass where owner != "ucsc2";' \
    hg17 > encodePseudogeneClassNoUcsc2.txt
cat encodePseudogeneClassNoUcsc2.txt encodePseudogeneUcsc2-forload.class \
    > allPseudogenesClass.txt
sort -k3,3 allPseudogenesClass.txt > encodePseudogeneClass2.txt
# the consensus sequences have different names in the Class table, the
# names had been changed to Vega gene names. Get the Class from the gtf in
# /cluster/data/encode/pseudogene/consensus
awk 'BEGIN {OFS="\t"} {print $10, $2}' \
    ../consensus/consensus.jan6.hg17.gtf | sort | uniq \
    > pgConsensus.class
sed -e 's/VEGA_//' pgConsensus.class | sed -e 's/"//g' \
    | sed -e 's/;//' > pgConsensusClass.txt
wc -l pgConsensusClass.txt
# 201 pgConsensusClass.txt
awk 'BEGIN {OFS="\t"}{print $0,"consensus"}' pgConsensusClass.txt | sort \
    > pgConsensusClassSorted.txt
# reload the encodePseudogeneClass table
hgsql -N -e 'select * from encodePseudogeneClass where owner != "ucsc2" and owner != "consensus";' \
    hg17 > pseudoClassNoUcsc2OrConsensus.txt
cat pseudoClassNoUcsc2OrConsensus.txt encodePseudogeneUcsc2-forload.class \
    pgConsensusClassSorted.txt > allPseudogenesClass.txt
sort -k3,3 allPseudogenesClass.txt > encodePseudogeneClass2.txt
wc -l encodePseudogeneClass2.txt
# 995 encodePseudogeneClass2.txt
# only 830 load as there are duplicate names - 165 names are shared
# between the consensus and havana subtracks. These names
# need to be unique as they are the primary key. Checked that the
# class is the same for havana and consensus subtracks where the
# name is the same so reload table with one entry for these genes.
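# The intermediate files consOnly, havanaOnly2 and nameAndClass.ConsAndHavana
# used below are not derived anywhere in this doc.  A plausible reconstruction
# with comm (an assumption, not the recorded commands; assumes rows are
# name, class, owner and that comm inputs are sorted):
#   awk '$3 == "consensus" {print $1 "\t" $2}' allPseudogenesClass.txt | sort > cons.nameClass
#   awk '$3 == "havana" {print $1 "\t" $2}' allPseudogenesClass.txt | sort > havana.nameClass
#   comm -23 cons.nameClass havana.nameClass > consOnly
#   comm -13 cons.nameClass havana.nameClass > havanaOnly2
#   comm -12 cons.nameClass havana.nameClass > nameAndClass.ConsAndHavana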
# remove havana and consensus pseudogenes
grep -v havana allPseudogenesClass.txt | grep -v consensus \
    > pseudoNoHavananNoCons.txt
wc -l pseudoNoHavananNoCons.txt
# 616 pseudoNoHavananNoCons.txt
# prepare consensus set not in havana
awk 'BEGIN {OFS="\t"}{print $0,"consensus"}' consOnly > consOnlyWithOwner
# prepare havana set not in consensus
awk 'BEGIN {OFS="\t"}{print $0,"havana"}' havanaOnly2 > havanaOnlyWithOwner
# prepare set common to consensus and havana
awk 'BEGIN {OFS="\t"}{print $0,"havana or consensus"}' \
    nameAndClass.ConsAndHavana > havanaAndConsWithOwner
wc -l *Owner
# 36 consOnlyWithOwner
# 165 havanaAndConsWithOwner
# 2 havanaOnlyWithOwner
cat pseudoNoHavananNoCons.txt consOnlyWithOwner havanaAndConsWithOwner \
    havanaOnlyWithOwner > allPseudogenesClass2.txt
sort -k3,3 allPseudogenesClass2.txt > encodePseudogeneClass2.txt
wc -l encodePseudogeneClass2.txt
# 819 encodePseudogeneClass2.txt
# reload table
hgsql -e 'drop table encodePseudogeneClass;' hg17
hgsql hg17 < encodePseudogeneClass.sql
echo "load data local infile 'encodePseudogeneClass2.txt' into \
    table encodePseudogeneClass" | hgsql hg17

##########################################################################
# Univ. Uppsala, Sweden Chip/chip (butyrate-treated H3Ac, H4Ac)
# Submitted 5/29/06 by Adam Ameur
# 6 subtracks
# DONE - 2006-06-13 - Hiram
cd /cluster/data/encode/Uppsala
mkdir -p 2006-05-09/lab
cd 2006-05-09/lab
unzip H3H4ac_butyrate.zip
cat << '_EOF_' > splitTrack.pl
#!/usr/bin/env perl
use warnings;
use strict;
# NB: script body was garbled in this doc; reconstructed under the
# assumption that it splits H3H4ac_butyrate.tracks at "track" lines
my $trackCount = 1;
my $outFile = "";
open (FH,"<H3H4ac_butyrate.tracks") or die "can not open H3H4ac_butyrate.tracks: $!";
while (my $line = <FH>) {
    if ($line =~ m/^track/) {
        $outFile = sprintf("track_%d", $trackCount++);
        open (OUT,">$outFile") or die "can not open $outFile: $!";
    } else {
        print OUT $line;
    }
}
close(OUT);
close (FH)
'_EOF_'
# << emacs happy
chmod +x splitTrack.pl
./splitTrack.pl
# looking at the track definitions to get some reasonable names:
grep "^track" H3H4ac_butyrate.tracks
mv track_1 encodeUppsalaChipH3acBut0h.wig.txt
mv track_2 encodeUppsalaChipH3acBut12h.wig.txt
mv track_3 encodeUppsalaChipH4acBut0h.wig.txt
mv track_4 encodeUppsalaChipH4acBut12h.wig.txt
mv track_5 encodeUppsalaChipH3acBut0vs12.itemRgb.txt
mv track_6 encodeUppsalaChipH4acBut0vs12.itemRgb.txt
# encoding
wigEncode encodeUppsalaChipH3acBut0h.wig.txt \
    encodeUppsalaChipH3acBut0h.wig encodeUppsalaChipH3acBut0h.wib
# upper limit 15.68, lower limit 0.17
wigEncode encodeUppsalaChipH3acBut12h.wig.txt \
    encodeUppsalaChipH3acBut12h.wig encodeUppsalaChipH3acBut12h.wib
# upper limit 6.55, lower limit 0.22
wigEncode encodeUppsalaChipH4acBut0h.wig.txt \
    encodeUppsalaChipH4acBut0h.wig encodeUppsalaChipH4acBut0h.wib
# upper limit 14.47, lower limit 0.19
wigEncode encodeUppsalaChipH4acBut12h.wig.txt \
    encodeUppsalaChipH4acBut12h.wig encodeUppsalaChipH4acBut12h.wib
# upper limit 6.58, lower limit 0.05
mkdir /gbdb/hg17/encode/Uppsala
ln -s `pwd`/*.wib /gbdb/hg17/encode/Uppsala/
# using the -tmpDir will cause the temp file to be removed
hgLoadWiggle -tmpDir=/scratch/tmp hg17 encodeUppsalaChipH3acBut0h \
    -pathPrefix=/gbdb/hg17/encode/Uppsala encodeUppsalaChipH3acBut0h.wig
hgLoadWiggle -tmpDir=/scratch/tmp hg17 encodeUppsalaChipH3acBut12h \
    -pathPrefix=/gbdb/hg17/encode/Uppsala encodeUppsalaChipH3acBut12h.wig
hgLoadWiggle -tmpDir=/scratch/tmp hg17 encodeUppsalaChipH4acBut0h \
    -pathPrefix=/gbdb/hg17/encode/Uppsala encodeUppsalaChipH4acBut0h.wig
hgLoadWiggle -tmpDir=/scratch/tmp hg17 encodeUppsalaChipH4acBut12h \
    -pathPrefix=/gbdb/hg17/encode/Uppsala encodeUppsalaChipH4acBut12h.wig
# they don't have their score data normalized, find min, max, etc...
ave -col=5 encodeUppsalaChipH3acBut0vs12.itemRgb.txt
# min 0.404995
# max 7.091458
# -> max - min = 6.686463
echo "7.091458 - 0.404995" | bc
# 6.686463
# plugging in those numbers, normalize the score column
# (subtract 0.000001 from the min value to avoid -0 in the output)
awk '
{
    score = 1000.0*($5 - 0.404994)/6.686463
    for (i=1; i < 5; ++i) { printf "%s\t", $i }
    printf "%d\t", score
    for (i=6; i < 9; ++i) { printf "%s\t", $i }
    printf "%s\n", $9
}
' encodeUppsalaChipH3acBut0vs12.itemRgb.txt | \
    hgLoadBed -tmpDir=/scratch/tmp -strict hg17 \
        encodeUppsalaChipH3acBut0vs12 stdin
# using the -tmpDir will cause the temp file to be removed

# same deal for the other one
ave -col=5 encodeUppsalaChipH4acBut0vs12.itemRgb.txt
# min 0.347273
# max 2.833333
echo "2.833333 - 0.347273" | bc
# 2.486060
# plugging in those numbers, normalize the score column
# (subtract 0.000001 from the min value to avoid -0 in the output)
awk '
{
    score = 1000.0*($5 - 0.347272)/2.486060
    for (i=1; i < 5; ++i) { printf "%s\t", $i }
    printf "%d\t", score
    for (i=6; i < 9; ++i) { printf "%s\t", $i }
    printf "%s\n", $9
}
' encodeUppsalaChipH4acBut0vs12.itemRgb.txt | \
    hgLoadBed -tmpDir=/scratch/tmp -strict hg17 \
        encodeUppsalaChipH4acBut0vs12 stdin

# To see what would be reasonable view limits, look at these
# histograms and see where the majority of the data is
hgWiggle -doHistogram -hBinSize=0.16 -hBinCount=100 -hMinVal=0.0 \
    -db=hg17 encodeUppsalaChipH3acBut0h
# running each of the wiggle tracks, it looks like 95% of the data
# is in the region 0 to 2.0

##########################################################################
# UW/Regulome QCP data
# To replace existing tracks
# Submitted 5/19/06 by John Stam
# 1 zip file data: UW_may06_ENCODE_data, plus Description.doc
cd /cluster/data/encode/Regulome
mkdir -p 2006-05-19/lab
cd 2006-05-19/lab
# deposit data
mkdir data
cd data
unzip ../*.zip
cd ..
ls data
# CD4.baseline.hg17.bed       HMEC.baseline.hg17.bed   NHBE.baseline.hg17.bed
# CaCo2.baseline.hg17.bed     HRE.baseline.hg17.bed    PANC.baseline.hg17.bed
# CaLU3.baseline.hg17.bed     HeLa.baseline.hg17.bed   SAEC.baseline.hg17.bed
# EryAdult.baseline.hg17.bed  HepG2.baseline.hg17.bed  SKnSH.baseline.hg17.bed
# EryFetal.baseline.hg17.bed  Huh7.baseline.hg17.bed
# GM.baseline.hg17.bed        K562.baseline.hg17.bed
# loading bedGraph 5 data type:
for CELL in CD4 CaCo2 CaLU3 EryAdult EryFetal GM HMEC HRE HeLa HepG2 \
    Huh7 K562 NHBE PANC SAEC SKnSH
do
    sort -k1,1 -k2,2n data/$CELL.baseline.hg17.bed \
        | /cluster/data/encode/bin/scripts/trimOverlap.pl \
        | hgLoadBed -noSort -noBin -strict -bedGraph=5 hg17 \
            encodeUWRegulomeBase$CELL stdin
done
# gross statistics for the data
awk '{print $5}' data/*.baseline.hg17.bed | ave stdin
# Q1 -0.201029
# median 0.000000
# Q3 0.207335
# average 0.018950
# min -5.454980
# max 6.327273
# count 291642
# total 5526.507662
# standard deviation 0.489442
# a histogram of the data:
awk '{print $5}' data/* | textHistogram -verbose=2 -binSize=0.12 \
    -maxBinCount=100 -minVal=-5.5 -real -pValues stdin \
    > histogram.data
# looking at that, it appears that 95% of the data is within the
# range of -1.0 to 1.0
# The note that came with this data said to set view limits
# at 0.5 : 3.0
# Making the trackDb entries, taking colors from:
# http://genome-test.cse.ucsc.edu/~hiram/rgbItemExamples.html
rm -f trackDb.entries.txt
I=1
export I
for CELL in CD4 CaCo2 CaLU3 EryAdult EryFetal GM HMEC HRE HeLa HepG2 \
    Huh7 K562 NHBE PANC SAEC SKnSH
do
    echo " track encodeUWRegulomeBase${CELL}"
    echo " subTrack encodeUWRegulomeBase"
    echo " shortLabel ${CELL}"
    echo " longLabel ${CELL} DNaseI Sensitivity"
    case $I in
        1) echo " color 0,0,255";;
        2) echo " color 0,48,224";;
        3) echo " color 0,96,176";;
        4) echo " color 0,119,153";;
        5) echo " color 0,153,119";;
        6) echo " color 0,187,85";;
        7) echo " color 56,238,0";;
        8) echo " color 0,255,0";;
        9) echo " color 68,238,0";;
        10) echo " color 96,192,326";;
        11) echo " color 136,170,0";;
        12) echo " color 170,136,0";;
        13) echo " color 204,102,0";;
        14) echo " color 238,68,0";;
        15) echo " color 255,0,0";;
        16) echo " color 255,0,255";;
    esac
    echo " priority ${I}"
    echo
    I=`expr $I + 1`
done > trackDb.entries.txt

##########################################################################
# EBI PECAN Alignments (IN PROGRESS 2006-06-22 kate)
# From Ben Paten
cd /cluster/data/encode
mkdir -p PECAN/SEP-05/lab
cd PECAN/SEP-05/lab
wget http://www.ebi.ac.uk/~bjp/pecan/encode_sept_pecan_mafs.tar.bz2
bunzip2 encode_sept_pecan_mafs.tar.bz2
tar xvf encode_sept_pecan_mafs.tar
cd ..
cat > project.csh << 'EOF'
mkdir -p tmp
set tmpDir = tmp
foreach f (lab/*MAF/EN[mr]*)
    set r = $f:t:r
    echo $r
    set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \
        hgsql -N hg17`
    set start = \
        `echo "SELECT chromStart from encodeRegions WHERE name='$r'" | \
        hgsql -N hg17`
    set size = \
        `echo "SELECT size from chromInfo WHERE chrom='$c'" | \
        hgsql -N hg17`
    /cluster/data/encode/bin/scripts/mafCoord.pl < $f \
        human.0 hg17.$c $start $size | \
        sed 's/^a$/a score=0.0/' > $tmpDir/$r.db.maf
    echo "projecting $r"
    /cluster/bin/penn/maf_project $tmpDir/$r.db.maf hg17.$c > $r.maf
    echo "finished $r"
end
'EOF'
csh project.csh >&! project.log &
rm -fr tmp
set gdir = /gbdb/hg17/encode/PECAN/SEP-05/maf
mkdir -p $gdir
rm -f $gdir/*.maf
ln -s /cluster/data/encode/PECAN/SEP-05/*.maf $gdir
hgLoadMaf -pathPrefix=$gdir -WARN hg17 encodePecanAlign >&! load.log
# lots of "score too small" messages -- these are OK.
cat *.maf | hgLoadMafSummary hg17 encodePecanSummary stdin

##########################################################################
# UW/Regulome QCP data again 2006-07-05 - Hiram
ssh hgwdev
cd /cluster/data/encode/Regulome/2006-06-13/lab
for CELL in CD4 CaCo2 CaLU3 EryAdult EryFetal GM HMEC HRE HeLa HepG2 \
    Huh7 K562 NHBE PANC SAEC SKnSH
do
    ls -og ${CELL}.normalized_060206.hg17.bed
    sort -k1,1 -k2,2n ${CELL}.normalized_060206.hg17.bed \
        | /cluster/data/encode/bin/scripts/trimOverlap.pl \
        | hgLoadBed -noSort -noBin -strict -bedGraph=5 hg17 \
            encodeUWRegulomeBase${CELL} stdin
done
# gross statistics for the data
awk '{print $5}' *.normalized_060206.hg17.bed | ave stdin
# Q1 -0.494604
# median -0.000000
# Q3 0.510167
# average 0.046097
# min -13.409856
# max 15.554195
# count 297650
# total 13720.736560
# standard deviation 1.203400
# calculate histogram 100 bin size:
echo -13.5 15.6 | awk '{print ($2-$1)/100}'
# 0.291
# a histogram of the data:
awk '{print $5}' *.normalized_060206.hg17.bed | \
    textHistogram -verbose=2 -binSize=0.292 \
    -maxBinCount=100 -minVal=-13.5 -real -pValues stdin \
    > histogram.data
# looks like the majority of the data is within -1.0 to 1.0
# The trackDb entries made previously should be OK

############################################################
# DLESS acs 05/02/06
# sorry this is a bit sketchy. See me with questions
cd /cluster/home/acs/encode-dless/hg17

# make tree model
tree_doctor /cluster/home/acs/DLESS-CSHL/encode17.mod --rename "human->hg17" > tree.mod
tree_doctor --tree-only tree.mod > tree.nh

# make SS files, annotated with indels by Brian
# NOTE: the bodies of the helper scripts in this section were truncated
# in this doc; "..." below marks the lost text.
cat > prepAlignmentsIndels.sh << 'EOF'
... > /scratch/${TARGET}.sso
/cluster/bin/phast/msa_view /scratch/$TARGET.sso -i SS -o SS --seqs hg17,chimp,baboon,macaque,marmoset,galago,rat,mouse,rabbit,cow,dog,rfbat,armadillo,elephant,tenrec,monodelphis,platypus --gap-strip ALL | /cluster/home/acs/phast-opteron/bin/msa_view - -i SS -o SS --order hg17,chimp,baboon,macaque,marmoset,galago,rat,mouse,rabbit,cow,dog,rfbat,armadillo,elephant,tenrec,monodelphis,platypus > /cluster/bluearc/encode/TBA/SEP-05/ss-indels/${TARGET}.sso
# second call adds rows of missing data for missing species
rm /scratch/$TARGET.sso
EOF
chmod +x prepAlignmentsIndels.sh
mkdir -p /cluster/bluearc/encode/TBA/SEP-05/maf-indels /cluster/bluearc/encode/TBA/SEP-05/ss-indels
rsync -avz /cluster/store11/encodeMafAnno/TBA/APR-26/human.*.maf /cluster/bluearc/encode/TBA/SEP-05/maf-indels
# location of Brian's files
hgsql hg17 -e "select * from encodeRegions" --skip-column-names > regions.txt
awk '{printf "prepAlignmentsIndels.sh %s %s %s %s\n", $1, $2, $3, $4}' regions.txt > jobList7
# never mind numbering; there were some other experimental runs that I've omitted
# para create, para push, etc.

# get indel histories and estimate indel params
mkdir -p consElements
awk '{printf "select chrom, chromStart - %d + 1, chromEnd - %d + 1 from encodeTbaPhastConsEl where chrom = \"%s\" and chromStart >= %d and chromEnd <= %d\n", $3, $3, $2, $3, $4 > $1 ".sql"}' regions.txt
for file in *.sql ; do hgsql hg17 --skip-column-names < $file > consElements/`basename $file .sql`.bed ; done
rm *.sql
cat > indelHistoryParsBrian.sh << 'EOF'
... > /cluster/bluearc/encode/DLESS/IH-indels/$TARGET.pars.ih
EOF
chmod +x indelHistoryParsBrian.sh
rm -f jobList8
mkdir -p /cluster/bluearc/encode/DLESS/IH-indels
awk '{print $1}' regions.txt > targets
for t in `cat targets` ; do echo "indelHistoryParsBrian.sh $t" >> jobList8 ; done
# para create, para push, etc.
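# (sketch of the parasol steps abbreviated as "para create, para push, etc."
# throughout this section; the cluster head node name is an assumption)
#   ssh kk
#   para create jobList8
#   para try
#   para push
#   para time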
cat > indelModelsBrian.sh << 'EOF'
... > IM-indels/$TARGET.pars.im
EOF
chmod +x indelModelsBrian.sh
rm -f jobList9
mkdir -p IM-indels
for t in `cat targets` ; do echo "indelModelsBrian.sh $t" >> jobList9 ; done
# para create, para push, etc.

# average estimates across targets
sed 's/,//g' IM-indels/*.pars.im | awk '{if ($2 == 0) {nbg++; a_bg += $6; b_bg += $9; t_bg += $12} else if ($2 == 1) {nco++; a_co += $6; b_co += $9; t_co += $12}} END {printf "bg: alpha = %f, beta = %f, tau = %f\nco: alpha = %f, beta = %f, tau = %f\n", a_bg/nbg, b_bg/nbg, t_bg/nbg, a_co/nco, b_co/nco, t_co/nco}' > ave.pars.brian.im
#bg: alpha = 0.033417, beta = 0.053284, tau = 0.052852
#co: alpha = 0.011655, beta = 0.020610, tau = 0.065395

# now estimate DLESS params by ML
cat > doDlessEstimateParsBrian.sh << 'EOF'
... > /cluster/bluearc/encode/DLESS/ESTIMATE-indels/$TARGET.pars.gff 2> /cluster/bluearc/encode/DLESS/ESTIMATE-indels/$TARGET.pars.stderr
EOF
chmod +x doDlessEstimateParsBrian.sh
mkdir -p /cluster/bluearc/encode/DLESS/ESTIMATE-indels
awk '{printf "doDlessEstimateParsBrian.sh %s %s\n", $1, $2}' regions.txt > jobList10
# para create, para push, etc.

# average estimates across targets
rm -f estimates.pars.brian.txt
grep '^Done' -l /cluster/bluearc/encode/DLESS/ESTIMATE-indels/*.pars.stderr > tmp1
for file in `cat tmp1` ; do tail -9 $file | head -1 | awk '{print $2, $3}' >> estimates.pars.brian.txt ; done
awk '{x += $1; y += $2} END {print "Pars:", x/NR, y/NR}' estimates.pars.brian.txt
rm tmp1
# Pars: 0.0551889 0.261488

# predict elements
cat > doDlessPars.sh << 'EOF'
... > /cluster/bluearc/encode/DLESS/GFF/$TARGET.pars.gff 2> /cluster/bluearc/encode/DLESS/STDERR/$TARGET.pars.stderr
EOF
chmod +x doDlessPars.sh
mkdir -p /cluster/bluearc/encode/DLESS/GFF /cluster/bluearc/encode/DLESS/STDERR
awk '{printf "doDlessPars.sh %s %s\n", $1, $2}' regions.txt > jobList4
# para create, para push, etc.

# compute P-values with phyloP
cat > doGeneric.sh << 'EOF'
... $2 $1 ${*:3} > $2
EOF
chmod +x doGeneric.sh
mkdir -p /cluster/bluearc/encode/DLESS/DLESSP
rm -f jobList5
for t in `cat targets` ; do \
    echo "./doGeneric.sh /cluster/bin/phast/dlessP /cluster/bluearc/encode/DLESS/DLESSP/$t.pars.dlessP /cluster/bluearc/encode/TBA/SEP-05/ss-indels/$t.sso -i SS /cluster/home/acs/encode-dless/hg17/tree.mod /cluster/bluearc/encode/DLESS/GFF/$t.pars.gff" >> jobList5 ;\
done
# para create, para push, etc.

# load track
echo "drop table if exists encodeDless" | hgsql hg17
cat /cluster/bluearc/encode/DLESS/DLESSP/*.pars.dlessP | grep -v '^#' | sort -k1,1 -k2,2n | awk 'NF == 24' | sed 's/hg17/human/' > dless.dat
awk '{ if (($6 == "conserved" && $8 < 0.05) || ($6 == "gain" && $8 < 0.05 && $9 > 0.05 && $10 < 0.05) || ($6 == "loss" && $8 > 0.05 && $9 < 0.05 && $11 < 0.05)) print $0}' dless.dat > dless.filtered.dat
sed 's/dless/encodeDless/g' ~/kent/src/hg/lib/dless.sql | hgsql hg17
echo "load data local infile 'dless.filtered.dat' into table encodeDless" | hgsql hg17

#######################################################################
# YALE RFBR (Regulatory Factor Binding Regions) DATA
# (DONE, 2006-11-01-2006-11-13, hartera)
# Data provided by Mark Gerstein's lab at Yale.
# Contact: zhengdong.zhang@yale.edu
# E-mail from Zhengdong on Nov. 20, 2006 to say that currently there is a
# discussion among some ENCODE people if DHS sites should be included
# when the clusters and deserts are generated. Therefore, it should not be
# released to the public site yet.
# Approved for release to public site by Zhengdong Zhang on 2007-01-24.
ssh hgwdev
mkdir /cluster/data/encode/yale/rfbr
# move data from ftp site
mv /var/ftp/encode/encode-tf-clusters-deserts.zip \
    /cluster/data/encode/yale/rfbr
cd /cluster/data/encode/yale/rfbr
unzip encode-tf-clusters-deserts.zip
# 3 files:
# encode-tf-clusters-deserts-description.txt
# encode-tf-clusters.bed
# encode-tf-deserts.bed
# remove the URL from the name field and leave just the accession
sed -e \
    's/http\:\/\/dart\.gersteinlab\.org\/cgi\-bin\/ar\/lookup\.cgi?acc=//' \
    encode-tf-clusters.bed > clusters.bed
sed -e \
    's/http\:\/\/dart\.gersteinlab\.org\/cgi\-bin\/ar\/lookup\.cgi?acc=//' \
    encode-tf-deserts.bed > deserts.bed
# load data files as BED files
hgLoadBed hg17 encodeYaleChipRfbrClusters clusters.bed
hgLoadBed hg17 encodeYaleChipRfbrDeserts deserts.bed
# add description page
cp encode-tf-clusters-deserts-description.txt \
    ~/kent/src/hg/makeDb/trackDb/human/hg17/encodeYaleChipRfbr.html
# edit description to add extra method details requested from contributor
# Add trackDb entry to trackDb.encode.ra for hg17. see encodeYaleAffyRNATars
# and use the same url and urlLabel; dataVersion is Dec 2005 here -
# that is the contributor's data freeze, not an ENCODE one.

################################################################
# EvoFold ENCODE track, Jakob Skou Pedersen 12.03.2006
# This is an update of the existing TBA23 EvoFold track (table:
# encode_tba23EvoFold), but a new track is made because of naming,
# etc.
# These are one-time predictions accompanying the ENCODE paper, so we
# start by fetching the data from the web and modify the element
# scores and names.
ssh hgwdev
cd /cluster/data/encode/Evofold
wget http://www.soe.ucsc.edu/~jsp/encFolds/bed/nativeTop100perc.bed
cat nativeTop100perc.bed | awk 'BEGIN{OFS="\t"} {$5=int(100*$5); $4=$4 "_" $6 "_" $5; print}' > encodeEvoFold.bed
# encodeEvoFold.bed is a 9-column bed file: columns 1-6 contain standard
# information, column 7 is element length, column 8 is the RNA
# secondary structure in parentheses format, and column 9 is a
# comma-separated list of position specific confidence scores (floats).
cat /cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql | sed -e 's/evofold/encodeEvoFold/' > tmp.sql
hgLoadBed -notItemRgb -sqlTable=tmp.sql hg17 encodeEvoFold encodeEvoFold.bed

###########################################################################
# UVienna RNA structure data (DONE, 2007-01-03 - 2007-01-04, hartera)
# Submitted 2006-12-15 by Stefan Washietl at the University of Vienna
ssh hgwdev
mkdir -p /cluster/data/encode/UVienna/2006-12-15/lab
cd /cluster/data/encode/UVienna/
ln -s /cluster/data/encode/UVienna/2006-12-15 \
    /cluster/data/encode/UVienna/latest
# copy rnaz.bed and rnaz_track.html to /cluster/data/encode/UVienna/latest/lab
cd /cluster/data/encode/UVienna/latest/lab
# remove header line
tail +2 rnaz.bed > rnazNoHeader.bed
# load the data in rnaz.bed into the hg17 database
hgLoadBed -notItemRgb -strict hg17 encodeUViennaRnaz rnazNoHeader.bed
# Reading rnazNoHeader.bed
# Loaded 3707 elements of size 4
# Sorted
# then add trackDb.ra entry in trackDb/human/hg17/trackDb.encode.ra
# and add the description.
cp rnaz_track.html \
    ~/kent/src/hg/makeDb/trackDb/human/hg17/encodeUViennaRnaz.html
# edit description and reformat the publications

############################################################################
# Gencode Loci RACEfrags - 5' RACE-ARRAY experiments on Gencode loci
# Submitted on 2007-04-11 by France Denoeud: fdenoeud@genoscope.cns.fr
# of the Gencode group.
# Create a directory for the data and copy the data there. Data was
# sent by e-mail. (DONE, hartera, 2007-04-25)
mkdir -p /cluster/data/encode/GencodeRACEfrags/2007-04-11/lab
# Data is in file: RACEFRAGS_UCSC.gff
# Also add the e-mails from France as a README.txt - these explain how the
# track should be displayed. It was decided that the BAC end pairs display
# should be used where the arrows between blocks have no lines through them.
# README.txt is in 2007-04-11 directory.
cd /cluster/data/encode/GencodeRACEfrags/
ln -s 2007-04-11 latest
cd 2007-04-11
# 16 subtracks:
# 5' RACE Primers, plus 15 cell/tissue types whose subtracks are called
# "RACEfrags from <tissue>", where the cell/tissue is in column 2.
# Sometimes there are overlapping "exons".
awk '{print $2;}' lab/RACEFRAGS_UCSC.gff | sort | uniq
# list of primer and cell/tissue types:
# 5RACE_primer
# Brain
# Colon
# GM06990
# HL60
# HeLa
# Heart
# Kidney
# Liver
# Lung
# Muscle
# Placenta
# Small-Intest
# Spleen
# Stomach
# Testis
# change 5RACE_primer to just Primer:
awk '{print $2;}' lab/RACEFRAGS_UCSC.gff | sort | uniq > subtracks.txt
# 5' RACE was performed on about 400 ENCODE genes and the RACE products were
# pooled together (in 5 pools of about 80 genes each) for hybridization on the
# tiling arrays. There were a few pooling errors, so the product of a gene
# in tissue A is sometimes in pool 1 while the product of the same gene in
# tissue B is in pool 2, or there are RACEfrags for the same gene in
# the same tissue that came from different pools (France Denoeud, 2007-04-17)
# create GFF files for loading, change 3rd column to CDS so ldHgGene loads
# tables correctly.
foreach s (`cat subtracks.txt`)
    echo $s
    set t = $s
    if ($s == "5RACE_primer") then
        set t = "Primer"
    else if ($s == "HeLa") then
        set t = "Hela"
    else if ($s == "Small-Intest") then
        set t = "SmallIntest"
    endif
    grep $s lab/RACEFRAGS_UCSC.gff > encodeGencodeRaceFrags${t}.gff
    if ($t == "Primer") then
        perl -pi.bak -e 's/(\s+)primer/$1CDS/' encodeGencodeRaceFrags${t}.gff
    else
        perl -pi.bak -e 's/racefrag_pool[0-9]+/CDS/' encodeGencodeRaceFrags${t}.gff
    endif
end
# Load the gff files for the subtracks into the database:
cat << 'EOF' > load.csh
foreach f (*.gff)
    set table = $f:r
    echo $table
    ldHgGene hg17 $table $f
end
'EOF'
chmod +x load.csh
csh load.csh >&! load.log
rm *.bak load.log
# Loading program will merge overlapping RACEfrags to show one "exon".
# Added trackDb track and hgFindSpec search entries to trackDb.ra.
# Reordered tracks in trackDb/trackDb.encode.ra and edited description
# (2007-04-25, hartera).
# The encodeGencodeRaceFragsPrimer.gff contains a "." instead of a strand
# so add + for the strand. (2007-05-10, hartera)
hgsql -e 'update encodeGencodeRaceFragsPrimer set strand = "+";' hg17
# Added trackDb.encode.ra setting:
#    autoTranslate 0
# to remove protein translation on details page as it does not make sense
# to have it for this track. Also added code to hgTrackUi.c so that the
# genePred track configuration controls for selecting the item label and
# codon coloring are not drawn as they are not applicable either for this
# track.

# Download custom files from France made available May 15, 2007.
# (hartera, 2007-05-22)
cd /cluster/data/encode/GencodeRACEfrags/2007-04-11
mkdir custom
cd custom
wget --timestamping \
    http://genome.imim.es/~jlagarde/tmp/racefrags_customfiles.tgz
gunzip racefrags_customfiles.tgz
tar -xvf racefrags_customfiles.tar
# On 2007-05-29, France sent the table that contains the links to the custom
# track files above.
# We need to host this on our server too. A link on the
# track description page should lead to the table.
# Download table (Sequences_Description.html) from e-mail to
# /cluster/data/encode/GencodeRACEfrags/2007-04-11/custom/
# make directory for the custom tracks in the ENCODE datafiles directory
cd /usr/local/apache/htdocs/goldenPath/hg17/encode/datafiles/
mkdir -p GencodeRACEfrags
cd GencodeRACEfrags
ln -s /cluster/data/encode/GencodeRACEfrags/2007-04-11/custom 2007-04-11
cd /cluster/data/encode/GencodeRACEfrags/2007-04-11/custom
# change the links in the table in Sequences_Description.html to
# http://hgdownload.cse.ucsc.edu/goldenPath/hg17/encode/datafiles/GencodeRACEfrags/2007-04-11/custom_file_*
sed -e 's/genome\.imim\.es\/%7Efdenoeud/hgdownload\.cse\.ucsc\.edu\/goldenPath\/hg17\/encode\/datafiles\/GencodeRACEfrags\/2007\-04\-11/g' Sequences_Description.html \
    > raceFragSequencesTable.html
# Change link in the encodeGencodeRaceFrags.html description page to point
# to the table on the hgdownloads server.
# Table and custom tracks pushed to hgdownloads on 2007-06-01.

###########################################################################
# Affy EC chrom21/chrom22 RELOAD (Andy 2008-03-20)
ssh hgwdev
bash
cd /cluster/data/encode/Affy
mkdir -p 2008-03-20/{lab,processed}
cd 2008-03-20/lab/
cp /var/ftp/encode/encode_ext_RNA_hg17_chr21-22.tar .
tar xf encode_ext_RNA_hg17_chr21-22.tar
find . -name '*bz2' -exec bunzip2 '{}' \;
cd ..
mkdir -p processed/hg17/{download,bed,wigTable,wib}
for f in lab/BW0/bed/*; do
    tiss=`echo $f | sed 's/.*\///;s/\.bed//'`;
    newF=processed/hg17/bed/encodeAffyEc1${tiss}Sites.bed;
    tail +2 $f > $newF;
done
for f in lab/BW25/bed/*; do
    tiss=`echo $f | sed 's/.*\///;s/\.bed//'`;
    newF=processed/hg17/bed/encodeAffyEc51${tiss}Sites.bed;
    tail +2 $f > $newF;
done
cd processed/hg17/bed/
for bed in *.bed; do
    hgLoadBed hg17 ${bed%.bed} $bed;
done
cd ../../../
# beds loaded, now for the wiggles...
for f in lab/BW0/wig/*; do
    tiss=`echo $f | sed 's/.*\///;s/\.sig.wig//'`;
    table=encodeAffyEc1${tiss}Signal
    downDir=processed/hg17/download
    wig=processed/hg17/wigTable/${table}.tab
    wib=processed/hg17/wib/${table}.wib
    zip=${downDir}/${table}.wigVar.gz
    tail +2 $f | gzip -c > $zip
    wigEncode $zip $wig $wib 2>> processed/hg17/wigEncode.log
    pushd /gbdb/hg17/encode/wib
    ln -s `dirs +1`/$wib
    popd
    hgLoadWiggle -pathPrefix=/gbdb/hg17/encode/wib hg17 $table $wig
done
for f in lab/BW25/wig/*; do
    tiss=`echo $f | sed 's/.*\///;s/\.sig.wig//'`;
    table=encodeAffyEc51${tiss}Signal
    downDir=processed/hg17/download
    wig=processed/hg17/wigTable/${table}.tab
    wib=processed/hg17/wib/${table}.wib
    zip=${downDir}/${table}.wigVar.gz
    tail +2 $f | gzip -c > $zip
    wigEncode $zip $wig $wib 2>> processed/hg17/wigEncode.log
    pushd /gbdb/hg17/encode/wib
    ln -s `dirs +1`/$wib
    popd
    hgLoadWiggle -pathPrefix=/gbdb/hg17/encode/wib hg17 $table $wig
done
mkdir /data/apache/htdocs/goldenPath/hg17/encode/downloads
cd /data/apache/htdocs/goldenPath/hg17/encode/downloads
ln -s /cluster/data/encode/Affy/2008-03-20/processed/hg17/download/*.gz .
# found out these have bad headers. I'm going to reduce the span
# from 25 to 1.
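# (suggested check, not in the original doc; run from the downloads dir
# where the .wigVar.gz links were just made): confirm what span the
# variableStep headers actually declare
zcat *.wigVar.gz | grep '^variableStep' | sort -u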
cd /cluster/data/encode/Affy/2008-03-20
for f in lab/BW0/wig/*; do
    tiss=`echo $f | sed 's/.*\///;s/\.sig.wig//'`;
    table=encodeAffyEc1${tiss}Signal
    downDir=processed/hg17/download
    wig=processed/hg17/wigTable/${table}.tab
    wib=${table}.wib
    zip=${downDir}/${table}.wigVar.gz
    tail +2 $f | sed 's/span=25/span=1/' | gzip -c > $zip
    wigEncode $zip $wig $wib 2>> processed/hg17/wigEncode.log
    mv $wib processed/hg17/wib
    hgLoadWiggle -pathPrefix=/gbdb/hg17/encode/wib hg17 $table $wig
done
for f in lab/BW25/wig/*; do
    tiss=`echo $f | sed 's/.*\///;s/\.sig.wig//'`;
    table=encodeAffyEc51${tiss}Signal
    downDir=processed/hg17/download
    wig=processed/hg17/wigTable/${table}.tab
    wib=${table}.wib
    zip=${downDir}/${table}.wigVar.gz
    tail +2 $f | sed 's/span=25/span=1/' | gzip -c > $zip
    wigEncode $zip $wig $wib 2>> processed/hg17/wigEncode.log
    mv $wib processed/hg17/wib/
    hgLoadWiggle -pathPrefix=/gbdb/hg17/encode/wib hg17 $table $wig
done

# Renames (2008-04-14 Andy)
# first GM06690 -> GM06990
ssh hgwdev
cd /cluster/data/encode/Affy/2008-03-20/processed
for f in `find . -name '*GM0*'`; do
    newF=`echo $f | sed 's/6690/6990/'`;
    mv $f $newF;
done
for f in `find . -name '*GM06990*.tab'`; do
    sed 's/GM06690/GM06990/' $f > tmp;
    mv tmp $f;
done
rm /gbdb/hg1{7,8}/encode/wib/encodeAffyEc{5,}1GM06690Signal.wib
pushd /gbdb/hg17/encode/wib
ln -s `dirs +1`/hg17/wib/encodeAffyEc{5,}1GM06990Signal.wib .
popd
pushd /gbdb/hg18/encode/wib
ln -s `dirs +1`/hg18/wib/encodeAffyEc{5,}1GM06990Signal.wib .
popd
pushd /usr/local/apache/htdocs/goldenPath/hg17/encode/wig/
rm encodeAffyEc*GM0*
ln -s `dirs +1`/hg17/download/encodeAffyEc{5,}1GM06990Signal.wigVar.gz .
cd ../../../hg18/encode/wig
rm encodeAffyEc*GM0*
ln -s `dirs +1`/hg18/wigVar/encodeAffyEc{5,}1GM06990Signal.wigVar.gz .
for db in hg1{7,8}; do
    cd $db/wigTable
    for table in encodeAffyEc{5,}1GM06990Signal; do
        hgLoadWiggle -pathPrefix=/gbdb/${db}/encode/wib $db $table ${table}.tab
    done
    cd ../../
done
for db in hg1{7,8}; do
    for table in `echo show tables like \'encodeAffyEc%GM066%\' | hgsql $db | tail +2`; do
        echo drop table $table | hgsql $db;
    done
done
for db in hg1{7,8}; do
    for table in encodeAffyEc{5,}1GM06990Sites; do
        hgLoadBed $db $table ${db}/bed/${table}.bed
    done
done

# now Testes -> Testis
# Delete
rm /gbdb/hg1{7,8}/encode/wib/encodeAffyEc*Testes*.wib
rm /usr/local/apache/htdocs/goldenPath/hg1{7,8}/encode/wig/encodeAffyEc*Testes*.wigVar.gz
for db in hg1{7,8}; do
    for table in `echo show tables like \'encodeAffyEc%Testes%\' | \
        hgsql $db | tail +2`; do
        echo drop table $table | hgsql $db;
    done;
done
# Change
for f in `find . -name '*Testes*'`; do
    newF=`echo $f | sed 's/Testes/Testis/'`;
    mv $f $newF;
done
for f in `find . -name '*Testis*.tab'`; do
    sed 's/Testes/Testis/' $f > tmp;
    mv tmp $f;
done
# add links
pushd /gbdb/hg17/encode/wib
ln -s /cluster/data/encode/Affy/2008-03-20/processed/hg17/wib/encodeAffyEc*Testis*.wib .
cd ../../../hg18/encode/wib
ln -s /cluster/data/encode/Affy/2008-03-20/processed/hg18/wib/encodeAffyEc*Testis*.wib .
cd /usr/local/apache/htdocs/goldenPath/hg17/encode/wig
ln -s /cluster/data/encode/Affy/2008-03-20/processed/hg17/download/encodeAffyEc*Testis*.wigVar.gz .
cd ../../../hg18/encode/wig/
ln -s /cluster/data/encode/Affy/2008-03-20/processed/hg18/wigVar/encodeAffyEc*Testis*.wigVar.gz .
popd
# load beds
for db in hg1{7,8}; do
    pushd ${db}/bed
    for bed in *Testis*; do
        hgLoadBed $db ${bed%.bed} $bed
    done
    popd
done
# load wiggles
for db in hg1{7,8}; do
    pushd ${db}/wigTable
    for tab in *Testis*; do
        hgLoadWiggle -pathPrefix=/gbdb/${db}/encode/wib $db ${tab%.tab} $tab
    done
    popd
done
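# (suggested wrap-up check, not in the original doc; table names inferred
# from the rename loops above): verify coordinates of the renamed bed tables
for db in hg1{7,8}; do
    for table in encodeAffyEc{5,}1GM06990Sites encodeAffyEc{5,}1TestisSites; do
        checkTableCoords $db $table
    done
done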