#!/bin/csh -f
exit

# NHGRI DNASE I HYPERSENSITIVE SITES (2004-10-15 kate)
# Updated 2005-06-01 angie, see below
# Data from Greg Crawford at the Collins Lab

cd /cluster/data/encode/NHGRI
set data = lab/DNaseI_HS_NHGRI_ENCODE2.txt
sed '/^track/d' $data > DNAseHS.bed
hgLoadBed hg16 encode_NHGRI_DNAseHS DNAseHS.bed

# Updated 2005-06-01 angie
cd /cluster/data/encode/NHGRI/crawford/2005-06-01
# Jim asked to add scores for grayscale-coloring:
# clusters of 2 drawn in 50%, clusters of 3 drawn in 75%,
# and clusters of 4 or more drawn in 100% black.
# Shorten names: CD4_not_activated_500bp_13381_2 --> 13381_2
tail +2 CD4_Activated.txt \
  | perl -wpe 'if (/CD4_\S+500bp_(\d+)_(\d+)/) { \
      $id = $1 . "_" . $2; \
      $score = ($2 >= 4) ? 1000 : $2 * 250; \
      s/CD4\S+/$id\t$score/; } else { die "parse"; }' \
  | hgLoadBed hg16 encodeNhgriDnaseHsAct stdin
tail +2 CD4_Not_Activated.txt \
  | perl -wpe 'if (/CD4_\S+500bp_(\d+)_(\d+)/) { \
      $id = $1 . "_" . $2; \
      $score = ($2 >= 4) ? 1000 : $2 * 250; \
      s/CD4\S+/$id\t$score/; } else { die "parse"; }' \
  | hgLoadBed hg16 encodeNhgriDnaseHsNonAct stdin

# STANFORD PROMOTERS FROM RICK MYERS LAB (2004-10-17 kate)
# Updated with new data 2005-05-26 angie, see below
# Updated with new data 2005-09-15 kate, see below
/cluster/data/encode/StanfordPromoters
mkdir StanfordPromoters_20040825
cd StanfordPromoters_20040825
csh makePromoterFiles.csh > promoterExps.tab
wc -l promoterExps.exp
# 14
# currently hgTracks only looks in hgFixed for the experiment table
# (add a trackDb option)
hgExperiment hgFixed encode_Stanford_Promoters \
    promoterExps.tab promoterPos.bed promoterData.tab
hgLoadBed hg16 encode_Stanford_Promoters encode_Stanford_Promoters.tab

# data update (2005-04-12 kate)
/cluster/data/encode/StanfordPromoters
mkdir -p 2005-04-12; cd 2005-04-12
mkdir lab; cd lab
# save from email
ls
# ENCODE_DATA_SUBMIT_2005.04.12.txt UCSCREADME.txt
# NOTE: scores are outside of 1-1000. I've asked for
# a redo from Sara.
# UPDATED 2005-09-07 kate (using angie's procedure) cd /cluster/data/encode/StanfordPromoters mkdir 2005-08-23 cd 2005-08-23 rm latest previous ln -s 2005-05-25 previous ln -s 2005-08-23 latest mkdir latest/lab cd latest/lab # Saved email attachments from Sara Hartman. # Angie's notes... # Used ooffice to translate Description.doc to Description.html and # NegativeControlDataStanfordPromoters.xls to tab-sep'd ....txt. # Datafile column descriptions from Sara's email 5/25/2005, augmented # with the bed9+ fields that we will shuffle them into: # 1->4) Accession number of the mRNA used to predict the promoter # 2->10) Gene model ID. If a gene has more than one predicted promoter # they would have the same gene model ID. # 3->1) chromosome # 4->6) strand (0 for negative, 1 for positive) # 5->2) Promoter Start coordinate # 6->3) Promoter End Coordinate # 7->X) Promoter length # 8->11) Promoter Name [really a description -- some of these are quoted] # 9->12) Luciferase signal A # 10->13) Renilla Signal A # 11->14) Luciferase Signal B # 12->15) Renilla Signal B # 13->16) Average Luciferase/Renilla Ratio # 14->17) Normalized Luciferase/Renilla Ratio # 15->18) Normalized and log2 transformed Luciferase Renilla Ratio # 16->5) Score (from 0-1000) # -- The Average file is like that but missing columns 9 to 14. # Some cell-line data files have some rows with "Bad Txfn" in the last # 4 columns; Sara said we should ignore those rows. # Kate tweaked Angie's scripting for newer data, and wrapped in a file... cat > doProm.hg16.csh << 'EOF' foreach f (lab/StanfordPromoters_[A-Z]*.txt) set cellType = `echo $f | perl -wpe 's^lab/StanfordPromoters_(.*)_.*^$1^'` echo $cellType if ($cellType == "Average") then tail +2 $f \ | perl -wpe 'chomp; @w = split("\t"); $w[7] =~ s/^\"(.*)\"$/$1/; \ $w[3] =~ tr/01/-+/; \ $_ = join("\t", \ $w[2], $w[4], $w[5], $w[0], $w[9], $w[3], $w[4], $w[5], 0, $w[1], $w[7], \ $w[8]) . 
"\n";' \ | makeColoredBed > encodeStanfordPromoters$cellType.hg16.bed else tail +2 $f \ | grep -v "Bad Txfn" \ | perl -wpe 'chomp; @w = split("\t"); $w[7] =~ s/^\"(.*)\"$/$1/; \ $w[3] =~ tr/01/-+/; \ $_ = join("\t", \ $w[2], $w[4], $w[5], $w[0], $w[15], $w[3], $w[4], $w[5], 0, $w[1], $w[7], \ $w[8], $w[9], $w[10], $w[11], $w[12], $w[13], $w[14]) . "\n";' \ | makeColoredBed > encodeStanfordPromoters$cellType.hg16.bed endif end 'EOF' csh doProm.hg16.csh >&! doProm.hg16.log cat > doLoad.hg16.csh << 'EOF' foreach f (encode*.hg16.bed) set track = $f:r:r if ($track == "encodeStanfordPromotersAverage") then hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/$track.sql \ hg16 $track $f else sed -e "s/encodeStanfordPromoters/$track/" \ $HOME/kent/src/hg/lib/encodeStanfordPromoters.sql > /tmp/esp.sql hgLoadBed -tab -noBin -sqlTable=/tmp/esp.sql hg16 $track $f endif end 'EOF' csh doLoad.hg16.csh > &! doLoad.hg16.log #Angie did enrichment analysis on May data... featureBits hg16 -enrichment encode_Stanford_Promoters \ encodeStanfordPromotersSnu182 #encode_Stanford_Promoters 0.011%, encodeStanfordPromotersSnu182 0.011%, both 0.011%, cover 99.84%, enrich 9194.85x featureBits hg16 encode_Stanford_Promoters \!encodeStanfordPromotersSnu182 \ -bed=stdout #chr21 32827801 32828259 chr21.1 #chr21 33617743 33617751 chr21.2 #chr11 1749652 1749673 chr11.1 #chr7 126437698 126437700 chr7.1 #chr5 131669622 131669641 chr5.1 #508 bases of 2865248791 (0.000%) in intersection # -- of those, only the first is not in the new set at all. featureBits hg16 \!encode_Stanford_Promoters encodeStanfordPromotersSnu182 \ -bed=stdout #chr21 33617194 33617203 chr21.1 #chr18 32827801 32828259 chr18.1 #chr7 126438195 126438196 chr7.1 #chr2 235023631 235024097 chr2.1 #934 bases of 2865248791 (0.000%) in intersection # -- only the second and the last are not in the old set at all. # Put the negative control data spreadsheet out for download. 
mkdir -p \ /usr/local/apache/htdocs/goldenPath/hg16/encode/datafiles/stanfordPromoters nice gzip NegativeControlDataStanfordPromoters.txt cp -p NegativeControlDataStanfordPromoters.{xls,txt.gz} \ /usr/local/apache/htdocs/goldenPath/hg16/encode/datafiles/stanfordPromoters/ # Added a README.txt. # UVA DNA REPLICATION TEMPORAL PROFILING (2004-10-22 kate) # Updated with new data 2005-05-25 angie, see below # Dutta lab (Univ. Virginia) replication data # Submitted as custom track, coords on hg15 cd /cluster/data/encode/UVa mkdir 2004-10-22 ln -s 2004-10-22 latest cd latest mkdir -p lab cd lab # download data # data (was MS-WORD) dos2unix lab/Dutta_Rep_profile.txt # documentation (ditto) dos2unix lab/UCSC-_data_release.txt # split file of customtracks into multiple files ./splitCustomTracks.pl lab/Dutta_Rep_profile.txt repl wc -l *.?-?{,?} # 163 repl.0-2 # 301 repl.2-4 # 296 repl.4-6 # 523 repl.6-8 # 421 repl.8-10 # 1704 total wc -l lab/Dutta_Rep_profile.txt # 1704 lab/Dutta_Rep_profile.txt cat > checkEncodeTable.csh << 'EOF' hgsql hg16 -N -s -e "SELECT er.name, count(*) FROM encodeRegions er, $1 hs WHERE er.chrom=hs.chrom AND hs.chromStart between er.chromStart AND er.chromEnd group by er.name" 'EOF' # << for emacs # load w/o binning to maintain ordering by chromStart foreach f (repl.?-?{,?}) liftOver $f /cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain \ $f.hg16.bed $f.unmapped set time = `echo $f:e | sed 's/-/_/'` hgLoadBed -noBin hg16 encode_UVA_DNARep_$time $f.hg16.bed csh checkEncodeTable.csh encode_UVA_DNARep_$time > $time.ct end wc -l *.hg16.bed # 160 repl.0-2.hg16.bed # 298 repl.2-4.hg16.bed # 294 repl.4-6.hg16.bed # 520 repl.6-8.hg16.bed # 416 repl.8-10.hg16.bed # 1688 total wc -l *.unmapped # 0 repl.0-2.unmapped # 0 repl.2-4.unmapped # 0 repl.4-6.unmapped # 0 repl.6-8.unmapped # 0 repl.8-10.unmapped # 0 total cut -f 1 *.ct | sort | uniq > regions wc -l regions # 43 regions (1 missing) hgsql hg16 -N -s -e "SELECT name from encodeRegions" | sort | diff - 
regions # UPDATED 2005-05-25 WITH NEW DATA (angie) cd /cluster/data/encode/UVa mkdir 2005-05-25 rm latest ln -s 2005-05-25 latest ln -s 2004-10-22 previous cd latest mkdir lab cd lab # saved email attachments here. # symlink to nicer names: ln -s encode0to2hrAuD4030.bed.txt repl.0 ln -s encode2to4hrAuD4030.bed.txt repl.2 ln -s encode4to6hrAuD4030.bed.txt repl.4 ln -s encode6to8hrAuD4030.bed.txt repl.6 ln -s encode8to10hrAuD4030.bed.txt repl.8 wc -l repl.* # 249 repl.0 # 618 repl.2 # 305 repl.4 # 569 repl.6 # 429 repl.8 # 2170 total # Any 0-length items? foreach f (encode*) echo $f awk '$3 == $2 {print;}' $f end # Any >1Mbp items? (Previously there were some of these items spanning # multiple encode regions... cd lab foreach f (encode*) echo $f awk '$3 - $2 > 1000000 {print;}' $f end cd .. foreach F (lab/repl.?) set f = $F:t echo $f tail +2 $F \ | liftOver stdin /cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain \ $f.hg16.bed $f.unmapped set time = $f:e hgLoadBed -noBin hg16 encodeUvaDnaRep$time $f.hg16.bed csh /cluster/data/encode/UVa/2004-10-22/checkEncodeTable.csh \ encodeUvaDnaRep$time > $time.ct end wc -l *.bed # 248 repl.0.hg16.bed # 616 repl.2.hg16.bed # 304 repl.4.hg16.bed # 568 repl.6.hg16.bed # 428 repl.8.hg16.bed # 2164 total wc -l *.unmapped # 0 repl.0.unmapped # 2 repl.2.unmapped # 0 repl.4.unmapped # 0 repl.6.unmapped # 0 repl.8.unmapped # 2 total cat repl.2.unmapped ##Split in new #chr21 39652869 39703581 # The main chr21 over.chain is interrupted by a little inversion chain # 39661243 39662854... 
so try liftOvering the start and end: cat > hg15.chr21troublespot.bed < regions wc -l regions # 43 regions (1 missing) hgsql hg16 -N -s -e "SELECT name from encodeRegions" | sort \ | diff - regions # < ENm011 foreach t (0 2 4 6 8) set t2 = `expr $t + 2` featureBits hg16 -enrichment encode_UVA_DNARep_${t}_$t2 \ encodeUvaDnaRep$t end #encode_UVA_DNARep_0_2 0.154%, encodeUvaDnaRep0 0.154%, both 0.154%, cover 99.90%, enrich 648.98x #encode_UVA_DNARep_2_4 0.621%, encodeUvaDnaRep2 0.621%, both 0.620%, cover 99.93%, enrich 161.04x #encode_UVA_DNARep_4_6 0.181%, encodeUvaDnaRep4 0.181%, both 0.181%, cover 99.75%, enrich 551.92x #encode_UVA_DNARep_6_8 0.418%, encodeUvaDnaRep6 0.418%, both 0.418%, cover 99.90%, enrich 239.17x #encode_UVA_DNARep_8_10 0.178%, encodeUvaDnaRep8 0.177%, both 0.177%, cover 99.42%, enrich 562.02x # UCSD CHIP/CHIP (2004-09-30 kate) # Bing Ren's Lab # Provided by Chunxu Qu [mailto:qchunxu@ucsd.edu] on Oct. 1, 2004 # Unpack data and rename files ssh kksilo cd /cluster/data/encode mkdir -p UCSD/project1 cd UCSD/project1 gunzip project1.tar.gz foreach f (*.wig) mv $f $f.bed end # consists of 8 custom track wiggle BED files ssh hgwdev mkdir /gbdb/hg16/encode/UCSD_ChIP/wib cd /cluster/data/encode/UCSD/project1 mkdir wig cd wig cat > makeWig.csh << 'EOF' foreach f (../*.bed) # get base name set b = $f:t:r:r # strip non-informative parts of name set n = `echo $b | sed -e 's/ave_//' -e 's/_p//' | tr '[a-z]' '[A-Z]'` echo $n # create wiggle data files from BED # NOTE: wigBedToBinary has been replaced by wigEncode sed -e '/^browser/d' -e '/^track/d' $f | \ wigBedToBinary stdin $n.wig $n.wib ln -s `pwd`/$n.wib /gbdb/hg16/encode/UCSD_ChIP/wib # load into database hgLoadWiggle hg16 encode_UCSD_ChIP_$n $n.wig \ -pathPrefix=/gbdb/hg16/encode/UCSD_ChIP/wib end 'EOF' # << for emacs csh makeWig.csh >&! makeWig.log & # Converted stdin, upper limit 16.00, lower limit 0.00, etc. 
hgsql hg16 -s -e "select count(*) from encode_UCSD_ChIP_RNAP_HELA" # 19571 # create entries in trackDb, default height 16 pixels # Use Jim's suggestion to associate a color with a cell line # Pol2 Hela (black) # Pol2 THP1 (darkish blue) # Pol2 IMR90 (darkish red) # Pol2 HCT116 (darkish green) # TAF1 Hela (black) # TAF1 THP1 (darkish blue) # TAF1 IMR90 (darkish red) # TAF1 HCT116 (darkish green) # UCSD/Ludwig Institute Histone Methylation (2005-04-21 kate) # NOTE: this data was submitted 12/14/04, but somehow fell thru # the cracks (it was the week before CSHL). # Files received from email: ave_AcH3_imr90.wig ave_MeH3K4_imr90.wig # Provided by Chunxu Qu [mailto:qchunxu@ucsd.edu] cd /cluster/data/encode/UCSD cd project1 mv ave_AcH3_imr90.wig ave_AcH3_imr90_p.wig.bed mv ave_MeH3K4_imr90.wig ave_MeH3K4_imr90_p.wig.bed mkdir -p /gbdb/hg16/encode/UcsdChip/wib mkdir wig.2005-04-21 cd latest # edit makeWig.csh to use wigEncode and standard table naming cat > makeWig.csh << 'EOF' foreach f (../*.bed) # get base name set b = $f:t:r:r # convert to standard format (initial caps) "AntibodyCell" set n = `echo $b | perl -wpe 's/ave_(.*)_(.*)_p/\u\L$1\E\u\L$2\E/'` echo $n # create wiggle data files from BED sed -e '/^browser/d' -e '/^track/d' $f | \ wigEncode stdin $n.wig $n.wib ln -s `pwd`/$n.wib /gbdb/hg16/encode/UcsdChip/wib # load into database hgLoadWiggle hg16 encodeUcsdChip$n $n.wig \ -pathPrefix=/gbdb/hg16/encode/UcsdChip/wib end 'EOF' # << for emacs csh makeWig.csh >&! 
makeWig.log & # cleanup TODO: remove old /gbdb files, doc files, tables # on hgwdev, beta & RR # try as bedGraph type cd /cluster/data/encode/UCSD cd project1 mkdir bed.2005-04-21 ln -s bed.2005-04-21 latest cd latest cat > makeBed.csh << 'EOF' foreach f (../*.bed) # get base name set b = $f:t:r:r # convert to standard format (initial caps) "AntibodyCell" set n = `echo $b | perl -wpe 's/ave_(.*)_(.*)_p/\u\L$1\E\u\L$2\E/'` echo $n # create wiggle data files from BED sed -e '/^browser/d' -e '/^track/d' $f > $n.bed # load into database hgLoadBed hg16 encodeUcsdChip$n $n.bed end 'EOF' # << for emacs csh makeBed.csh >&! makeBed.log & # set up 4 composite tracks (per antibody), 4 cell lines each # using colors to highlight cell types: # Hela (black) # THP1 (darkish blue) # IMR90 (darkish red) # HCT116 (darkish green) # UCSD Histone Methylation Chip/chip on IFN-g induced Hela (2005-05-26) cd /cluster/data/encode/UCSD mkdir -p 2005-05-26 cd 2005-05-26 # copied files from ftp dir # TODO: load as bedGraph, as above # bed format; use vi to strip header lines cp tmH3K4_p0.wig tmH3K4_p0.bed cp tmH3K4_p30.wig tmH3K4_p30.bed # save originals mv tmH3K4_p0.wig tmH3K4_p0.wig.orig mv tmH3K4_p30.wig tmH3K4_p30.wig.orig # load wigEncode tmH3K4_p0.bed tmH3K4_p0.wig tmH3K4_p0.wib ln -s `pwd`/tmH3K4_p0.wib /gbdb/hg16/encode/UcsdChip/wib hgLoadWiggle hg16 encode_UCSD_ChIP_tmH3K4_p0 tmH3K4_p0.wig -pathPrefix=/gbdb/hg16/encode/UcsdChip/wib # STANFORD MLAGAN MSA OF FREEZE4 (2005-06-13 kate) # From George Asimenos, Batzoglou lab (asimenos@stanford.edu) cd /cluster/data/encode mkdir -p MLAGAN/freeze4/{cons,maf} cd MLAGAN/freeze4/maf # move maf submission in tar xvfz ENCODE_LAGAN_maf.tgz mkdir out tmp # human hg16 # mouse mm6 # rat rn3 # macaque rheMac1 rhesus # dog canFam1 # chimp panTro1 # monodelphis monDom1 opossum # xenopus xenTro1 x_tropicalis # fugu fr1 # zebrafish danRer2 # tetraodon tetNig1 cat > mafProject.csh << 'EOF' set tmpDir = /cluster/data/encode/MLAGAN/freeze4/maf/tmp set outDir 
= /cluster/data/encode/MLAGAN/freeze4/maf/out foreach r (EN[mr]*) echo $r set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \ hgsql -N hg16` set start = \ `echo "SELECT chromStart from encodeRegions WHERE name='$r'" | \ hgsql -N hg16` set size = \ `echo "SELECT size from chromInfo WHERE chrom='$c'" | \ hgsql -N hg16` /cluster/data/encode/MLAGAN/mafCoord.pl < $r/$r.maf \ human.1 hg16.$c $start $size | \ sed 's/^a$/a score=0.0/' > $tmpDir/$r.db.maf /cluster/bin/penn/maf_project $tmpDir/$r.db.maf hg16.$c \ > $outDir/$r.maf end 'EOF' # << for emacs csh mafProject.csh >&! mafProject.log # seg faults in ENm008 and ENr324 # move maf files for these regions to "bad" dir # Minmei created projected files for these regions. # (our installed maf_project must be outdated) # change species to match browser cd /cluster/data/encode/MLAGAN/freeze4/maf mkdir load foreach r (EN[mr]*) sed -e 's/monodelphis/opossum/' -e 's/macaque/rhesus/' \ -e 's/xenopus/x_tropicalis/' out/$r.maf > load/$r.maf end mkdir -p /gbdb/hg16/encode/MLAGAN/freeze4/mafs set d = /gbdb/hg16/encode/MLAGAN/freeze4/mafs rm $d/*.maf ln -s `pwd`/load/*.maf $d hgLoadMaf -WARN hg16 encodeMlaganAlign -pathPrefix=$d # lots of "score too small" errors. Allow this for now. # Submitters may want to fix this later. 
# create summary table for zoomed out views of pairwise cd /cluster/data/encode/MLAGAN/freeze4/maf/latest cat out/*.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 hg16 encodeMlaganSummary stdin # conservation wiggles cat lab/chr*.wig | \ wigEncode stdin GerpCons.wig GerpCons.wib # Converted stdin, upper limit 3.00, lower limit 0.00 # GERP (from Greg Cooper) mkdir -p /gbdb/hg16/encode/MLAGAN/freeze4 rm -f /gbdb/hg16/encode/MLAGAN/freeze4/GerpCons.wib ln -s `pwd`/GerpCons.wib /gbdb/hg16/encode/MLAGAN/freeze4 hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/MLAGAN/freeze4 hg16 \ encodeMlaganGerpCons GerpCons.wig # binCons and phastCons (from Elliott) # Reload on 6/30 (phastCons changed) cd /cluster/data/encode/MLAGAN/freeze4/cons/latest/lab/jun30 wget -r -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/encode_lagan cd ../.. gunzip -c lab/jun30/human.EN*.pC.*.wig.gz | \ wigEncode stdin phastCons.wig phastCons.wib # Converted stdin, upper limit 1.00, lower limit 0.00 gunzip -c lab/jun30/EN*.binCons.lagan.*.gz | \ wigEncode stdin binCons.wig binCons.wib # Converted stdin, upper limit 1.00, lower limit 0.00 set d = /gbdb/hg16/encode/MLAGAN/freeze4 rm -f $d/{phastCons,binCons}.wib ln -s `pwd`/phastCons.wib $d ln -s `pwd`/binCons.wib $d hgLoadWiggle -pathPrefix=$d hg16 encodeMlaganPhastCons phastCons.wig hgLoadWiggle -pathPrefix=$d hg16 encodeMlaganBinCons binCons.wig # conserved elements (from GERP) # NOTE: abbreviate item names and round float scores perl -we 'while (<>) { \ if (/^chr/) { \ ($chrom,$start,$end,$name,$score) = split; \ $name =~ s/GERP.score=/GERP./; \ printf("%s\t%d\t%d\t%s\t%.0f\n", $chrom,$start,$end,$name,$score); \ } }' lab/ENCODE_GERP_cons.bed > encodeMlaganGerpEl.tab hgLoadBed -noBin hg16 encodeMlaganGerpEl encodeMlaganGerpEl.tab # Loaded 48480 elements of size 5 # load conserved elements (6 tables) # scored elements # Updated 6/30 for Elliott cd /cluster/data/encode/MSA/latest hgLoadBed -noBin hg16 encodeMlaganBinConsEl 
lab/lagan.binCons.bed # loaded 18677 hgLoadBed -noBin hg16 encodeMlaganPhastConsEl lab/lagan.phastCons.bed # loaded 36105 elements # unscored - fix at score=500 ln -s lab/lagan.or.bed MlaganUnionEl.bed ln -s lab/lagan.or.nc.bed MlaganNcUnionEl.bed ln -s lab/lagan.and.bed MlaganIntersectEl.bed ln -s lab/lagan.and.nc.bed MlaganNcIntersectEl.bed foreach f (Mlagan*.bed) set b = $f:r set t = encode$b sed -e 's/$/\t500/' $f | hgLoadBed -noBin hg16 $t stdin checkTableCoords hg16 $t end # CONSENSUS ELEMENTS: MLAGAN AND TBA CONSERVED ELEMENTS (2005-06-29 kate) # From Elliott Margulies cd /cluster/data/encode mkdir -p MSA/freeze4 cd MSA/freeze4 mkdir jun30/lab cd jun30/lab wget -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/constrained_elements.tar.gz tar xvfz *.gz cd ../../ # load conserved elements (combined lagan and tba) (6 tables) cd /cluster/data/encode/MSA/freeze4 mkdir -p jun30/lab cd jun30/lab wget -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/constrained_elements.tar.gz cd ../.. ln -s lab/tl.or.bed AllUnionEl.bed ln -s lab/tl.or.nc.bed AllNcUnionEl.bed ln -s lab/tl.and.bed AllIntersectEl.bed ln -s lab/tl.and.nc.bed AllNcIntersectEl.bed foreach f (All*.bed) set b = $f:r set t = encode$b sed -e 's/$/\t500/' $f | hgLoadBed -noBin hg16 $t stdin checkTableCoords hg16 $t end # STANFORD MSA OF FREEZE2 (2004-11-17 kate) # From George Asimenos, Batzoglou lab (asimenos@stanford.edu) cd /cluster/data/encode cd MLAGAN/freeze2 mkdir lab out tmp set tmpDir = /cluster/data/encode/MLAGAN/freeze2/tmp set outDir = /cluster/data/encode/MLAGAN/freeze2/out cd lab wget -r http://ai.stanford.edu/~asimenos/beta_encode_Nov_2004/ mv ai.stanford.edu/*asimenos/beta* . 
rm -fr ai.stanford.edu cd beta_encode_Nov_2004/data cat > makeMaf.csh << 'EOF' set tmpDir = /cluster/data/encode/MLAGAN/freeze2/tmp set outDir = /cluster/data/encode/MLAGAN/freeze2/out foreach r (EN*) echo $r set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \ hgsql -N hg16` set start = \ `echo "SELECT chromStart from encodeRegions WHERE name='$r'" | \ hgsql -N hg16` set size = \ `echo "SELECT size from chromInfo WHERE chrom='$c'" | \ hgsql -N hg16` bunzip2 -c $r/$r.maf.bz2 | \ /cluster/data/encode/MLAGAN/mafCoord.pl human hg16.$c $start $size |\ sed 's/^a$/a score=0.0/' > $tmpDir/$r.db.maf /cluster/bin/penn/maf_project $tmpDir/$r.db.maf hg16.$c \ > $outDir/$r.human.maf end 'EOF' # << for emacs csh makeMaf.csh >&! makeMaf.log & mkdir /gbdb/hg16/encode_MSA2_MLAGAN ln -s $outDir/*.human.maf /gbdb/hg16/encode_MSA2_MLAGAN hgLoadMaf -WARN hg16 encode_MSA2_MLAGAN # create description page from lab/index.html # AFFY TRANSCRIPTION AND CHIP/CHIP TEMPORAL PROFILING (2004-12-08 kate) # (RELOADED TABLES 2005-05-31 as bedGraph types - Hiram) # UPDATED 2005-06-06 (ChIP/chip) and 2005-06-08 (RNA) angie - see below # HL60 cells stimulated with Retinoic Acid # Transcription and PolII ChIP/Chip at 4 timepoints (8 experiments) # wig & bed for each experiment # filenames: EC_HL60_RA_RNA_00hr_AS.hs.NCBIv34.{bed,wig} # EC_HL60_RA_RNApol2_00hr_AS_b.hs.NCBIv34.{bed,wig} # Use these to construct 16 tracks: # track encodeAffyRnaHl60Hr{00,02,08,32} # track encodeAffyChIpRnapHl60Hr{00,02,08,32} # RNA/Affy HL60 0hr # ChIP/Affy Pol2 0hr mkdir -p /gbdb/hg16/encode/Affy/2004-12-03/wib cd /cluster/data/encode mkdir -p Affy/Dec03_2004 cd Affy/Dec03_2004 wget http://transcriptome.affymetrix.com/download/ENCODE/ucsc_formats.tar.bz2 bunzip2 ucsc_formats.tar.bz2 # trailing garbage after EOF ignored tar xvf ucsc_formats.tar # NOTE: the script below creates $hrs as 0,2,8,32 # and tables were renamed to 00,02,08,32, so change script # next time it's loaded. 
cd ucsc_formats cat > load.csh << 'EOF' foreach f (bed/*.bed) set base = $f:t:r set info = `echo $f:t | sed 's/EC_HL60_RA_\([^_][^_]*\)_\([0-9][0-9]*\)hr.*/\1.\2/'` set hrs = `expr $info:e + 0` set exp = `echo $info:r | \ sed -e 's/RNApol2/ChIpRnap/' -e 's/RNA/Rna/'` if ($exp == "Rna") then set meas = "Sig" else set meas = "Pval" endif set track = encodeAffy${exp}Hl60SitesHr$hrs sed '1d' $f | \ hgLoadBed -noBin hg16 $track stdin set ct = `hgsql -N -s -e "select count(*) from $track" hg16` echo "track $track" echo "$f:t $ct" set track = encodeAffy${exp}Hl60${meas}Hr$hrs sed '/^track/d' wig/$base.wig | wigEncode stdin $track.wig $track.wib ln -s `pwd`/$track.wib /gbdb/hg16/encode/Affy/2004-12-03/wib # load into database hgLoadWiggle hg16 ${track} $track.wig \ -pathPrefix=/gbdb/hg16/encode/Affy/2004-12-03/wib end 'EOF' # << for emacs csh load.csh >&! load.log & grep EC_HL60 load.log | grep -v wig > loaded.txt # email loaded.txt to contributor awk '{print $1}' bed/*.bed | sort | uniq > chroms.txt # use in trackDb chromosomes setting grep encodeAffy load.log > trackDb.ra # use as basis of hg16 trackDb entries # (create 4 composite tracks) cd description # edit cp EC_HL60_RA_RNA_00hr_AS_hs.NCBIv34.bed.dsc encodeAffyRnaHl60Sites.html cp EC_HL60_RA_RNA_00hr_AS_hs.NCBIv34.wig.dsc encodeAffyRnaHl60Sig.html cp EC_HL60_RA_RNApol2_00hr_AS_hs.NCBIv34.bed.dsc encodeAffyChIpRnapHl60Sites.html cp EC_HL60_RA_RNApol2_00hr_AS_hs.NCBIv34.wig.dsc encodeAffyChIpRnapHl60Pval.html # UPDATED ChIP/chip 2005-06-06 angie cd /cluster/data/encode/Affy/2005-06-05 # Load up the 41 bed3 tables of ChIP/chip sites... 
foreach f (chipchip/bed/*.bed) set track = `echo $f:t:r:r:r | perl -wpe \ 's/EC_AS_HL60_DN_RA_//; \ s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \ s/Pol2/Rnap/; s/RARecA/Rara/; s/TFIIB-R/TFIIB/; \ s/(\S+)_(\d+)hr/encodeAffyChIpHl60Sites\u\L$1\EHr$2/; \ s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'` echo $track tail +2 $f \ | hgLoadBed -noBin hg16 $track stdin end # Loop over those bed files again to make trackDb.ra contents... set pri = 2 foreach f (chipchip/bed/*.bed) set track = `echo $f:t:r:r:r | perl -wpe \ 's/EC_AS_HL60_DN_RA_//; \ s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \ s/Pol2/Rnap/; s/RARecA/Rara/; s/TFIIB-R/TFIIB/; \ s/(\S+)_(\d+)hr/encodeAffyChIpHl60Sites\u\L$1\EHr$2/; \ s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'` set factor = `echo $f:t:r:r:r | perl -wpe \ 's/EC_AS_HL60_DN_RA_//; \ s/_\d+hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \ s/RARecA/RARA/; s/TFIIB-R/TFIIB/; \ s/H3K27T/H3K27me3/; s/HisH4/H4Kac4/;'` set hh = `echo $f:t:r:r:r | perl -wpe 's/.*_(\d+)hr_.*/$1/'` set hr = `expr $hh + 0` set fIndex = `echo $factor | sed -e \ 's/Brg1/0/; s/CEBPe/1/; s/CTCF/2/; s/H3K27me3/3/; s/H4Kac4/4/; \ s/P300/5/; s/PU1/6/; s/Pol2/7/; s/RARA/8/; s/SIRT1/9/; \ s/TFIIB/10/;'` if ($fIndex == 10) then set r = 0 set g = 0 set b = 200 else set r = `expr 25 \* \( 9 - $fIndex \)` set g = `expr 25 \* $fIndex` set b = 0 endif echo track $track echo subTrack encodeAffyChIpHl60SitesHr$hh echo shortLabel Affy $factor RA ${hr}h echo longLabel Affymetrix ChIP/Chip \($factor retinoic acid-treated HL-60, ${hr}hrs\) Sites echo color $r,$g,$b echo priority $pri echo "" set pri = `expr $pri + 2` end ssh kolossus cd /cluster/data/encode/Affy/2005-06-01 # chr7 has >150,000 rows, and >100,000 means trouble for bedGraph # performance; so load this as wiggle instead. # Fixup off-by-one error in these files: 2005-08-01 - Hiram # Add the -lift=1 to the wigEncode command # The data in the .wig files is actually 0-relative, it should # have been 1-relative. Add 1 to get it to be 1-relative. 
mkdir wig wib foreach f (chipchip/wig/*.median.wig.bz2) set track = `echo $f:t:r:r:r:r | perl -wpe \ 's/EC_AS_HL60_DN_RA_//; \ s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \ s/Pol2/Rnap/; s/RARecA/Rara/; s/TFIIB-R/TFIIB/; \ s/(\S+)_(\d+)hr/encodeAffyChIpHl60Pval\u\L$1\EHr$2/; \ s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'` echo $track bzcat $f | wigEncode -lift=1 stdin wig/$track.wig wib/$track.wib end # lower limit is 0 for all. Upper limit varies quite a bit. # *Hr00: 72.60 (encodeAffyChIpHl60PvalCtcfHr00) to 534.54 (H4Kac4) # *Hr02: 65.62 (encodeAffyChIpHl60PvalCtcfHr02) to 425.79 (H4Kac4) # *Hr08: 85.14 (encodeAffyChIpHl60PvalSirt1Hr08) to 505.86 (H4Kac4) # *Hr32: 50.18 (encodeAffyChIpHl60PvalCtcfHr32) to 490.03 (H4Kac4) # Use the top upper value as the limit for now... if the other # subtracks look too smushed, maybe we can adjust? # back on hgwdev: cd /cluster/data/encode/Affy/2005-06-01 set gbdbDir = /gbdb/hg16/encode/Affy/2005-06-01 mkdir -p $gbdbDir/wib foreach f (chipchip/wig/*.median.wig.bz2) set track = `echo $f:t:r:r:r:r | perl -wpe \ 's/EC_AS_HL60_DN_RA_//; \ s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \ s/Pol2/Rnap/; s/RARecA/Rara/; s/TFIIB-R/TFIIB/; \ s/(\S+)_(\d+)hr/encodeAffyChIpHl60Pval\u\L$1\EHr$2/; \ s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'` echo $track ln -s `pwd`/wib/$track.wib $gbdbDir/wib/ nice hgLoadWiggle hg16 $track wig/$track.wig -pathPrefix=$gbdbDir end # same trackDb-helper: set pri = 1 foreach f (chipchip/wig/*.median.wig.bz2) set track = `echo $f:t:r:r:r:r | perl -wpe \ 's/EC_AS_HL60_DN_RA_//; \ s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \ s/Pol2/Rnap/; s/RARecA/Rara/; s/TFIIB-R/TFIIB/; \ s/(\S+)_(\d+)hr/encodeAffyChIpHl60Pval\u\L$1\EHr$2/; \ s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'` set factor = `echo $f:t:r:r:r:r | perl -wpe \ 's/EC_AS_HL60_DN_RA_//; \ s/_\d+hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \ s/RARecA/RARA/; s/TFIIB-R/TFIIB/; \ s/H3K27T/H3K27me3/; s/HisH4/H4Kac4/;'` set hh = `echo $f:t:r:r:r | perl -wpe 
's/.*_(\d+)hr_.*/$1/'` set hr = `expr $hh + 0` set fIndex = `echo $factor | sed -e \ 's/Brg1/0/; s/CEBPe/1/; s/CTCF/2/; s/H3K27me3/3/; s/H4Kac4/4/; \ s/P300/5/; s/PU1/6/; s/Pol2/7/; s/RARA/8/; s/SIRT1/9/; \ s/TFIIB/10/;'` if ($fIndex == 10) then set r = 0 set g = 0 set b = 200 else set r = `expr 25 \* \( 9 - $fIndex \)` set g = `expr 25 \* $fIndex` set b = 0 endif echo track $track echo subTrack encodeAffyChIpHl60PvalHr$hh echo shortLabel Affy $factor RA ${hr}h echo longLabel Affymetrix ChIP/Chip \($factor retinoic acid-treated HL-60, ${hr}hrs\) P-Value echo color $r,$g,$b echo priority $pri echo "" set pri = `expr $pri + 2` end # UPDATED RNA 2005-06-08 angie cd /cluster/data/encode/Affy/2005-06-01 tail +2 rna/bed/EC_AS_GM06990_RCyP+_C01vsNULL.sig.gr.bed \ | hgLoadBed -noBin hg16 encodeAffyRnaGm06990Sites stdin tail +2 rna/bed/EC_AS_HeLa_RCyP+_C01vsNULL.sig.gr.bed \ | hgLoadBed -noBin hg16 encodeAffyRnaHeLaSites stdin foreach f (rna/bed/*HL60*.bed) set track = `echo $f:t:r:r:r | perl -wpe \ 's/EC_AS_HL60_RWP\+_RA_(\d+)hr_C01vsNULL/encodeAffyRnaHl60SitesHr$1/;'` echo $track tail +2 $f \ | hgLoadBed -noBin hg16 $track stdin end set gbdbDir = /gbdb/hg16/encode/Affy/2005-06-01 mkdir -p $gbdbDir/wib set track = encodeAffyRnaGm06990Signal # Fixup off-by-one error in these files: 2005-08-01 - Hiram bzcat rna/wig/EC_AS_GM06990_RCyP+_C01vsNULL.sig.wig.bz2 \ | wigEncode -lift=1 stdin wig/$track.wig wib/$track.wib # bzcat rna/wig/EC_AS_GM06990_RCyP+_C01vsNULL.sig.wig.bz2 \ # | wigEncode stdin wig/$track.wig wib/$track.wib #Converted stdin, upper limit 1686.50, lower limit -899.25 ln -s `pwd`/wib/$track.wib $gbdbDir/wib/ nice hgLoadWiggle hg16 $track wig/$track.wig -pathPrefix=$gbdbDir set track = encodeAffyRnaHeLaSignal # Fixup off-by-one error in these files: 2005-08-01 - Hiram bzcat rna/wig/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig.bz2 \ | wigEncode -lift=1 stdin wig/$track.wig wib/$track.wib # bzcat rna/wig/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig.bz2 \ # | wigEncode stdin 
wig/$track.wig wib/$track.wib #Converted stdin, upper limit 1250.00, lower limit -763.50 ln -s `pwd`/wib/$track.wib $gbdbDir/wib/ nice hgLoadWiggle hg16 $track wig/$track.wig -pathPrefix=$gbdbDir # Reloaded these four 6/29/05 after Jim and Hiram found that they'd # been loaded with the HeLa data due to a copy/paste error: # Fixup off-by-one error in these files: 2005-08-01 - Hiram # add -lift=1 to the wigEncode command foreach f (rna/wig/*HL60*C01vsNULL.sig.wig.bz2) set track = `echo $f:t:r:r:r | perl -wpe \ 's/EC_AS_HL60_RWP\+_RA_(\d+)hr_C01vsNULL/encodeAffyRnaHl60SignalHr$1/;'` echo $track bzcat $f \ | wigEncode -lift=1 stdin wig/$track.wig wib/$track.wib ln -s `pwd`/wib/$track.wib $gbdbDir/wib/ nice hgLoadWiggle hg16 $track wig/$track.wig -pathPrefix=$gbdbDir end #Converted stdin, upper limit 1405.00, lower limit -1052.50 #Converted stdin, upper limit 1238.50, lower limit -1019.75 #Converted stdin, upper limit 1587.00, lower limit -1168.00 #Converted stdin, upper limit 1290.00, lower limit -910.75 # BOSTON UNIVERSITY / ZHIPING WENG LAB FIRST EXON DATA (WORKING 2005-01-27 kate) ssh kksilo cd /cluster/data/encode mkdir -p BU cd BU #mkdir -p 2005-01-06 #ln -s 2005-01-06/latest #cd latest # copy file from email: BED.tar.gz #tar xvfz BED.tar.gz #mkdir -p 2005-04-29 #ln -s 2005-04-29/latest mkdir -p 2005-05-04 ln -s 2005-05-04 latest cd latest # save file from email tar xvfz BED.BUMay4.tar.gz cd BED cp rcPCREncodeDataDescription.html ~/kent/src/hg/makeDb/trackDb/human/hg16/encodeBuFirstExon.html # checkin description file and edit cd .. 
# adjust chromStart
cat > lowerStart.awk << 'EOF'
{printf ("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6, $7-1, $8, $9, $10, $11, $12, $13);}
'EOF'
# << for emacs
# rearrange data columns to be BED 13, add RGB values to "reserved" field
# based on score
makeColoredBed < BED/cerebralcortex.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonCerebrum.bed
makeColoredBed < BED/colon.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonColon.bed
makeColoredBed < BED/heart.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonHeart.bed
makeColoredBed < BED/kidney.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonKidney.bed
makeColoredBed < BED/liver.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonLiver.bed
makeColoredBed < BED/lung.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonLung.bed
makeColoredBed < BED/skeletalmuscle.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonSkMuscle.bed
makeColoredBed < BED/spleen.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonSpleen.bed
makeColoredBed < BED/stomach.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonStomach.bed
makeColoredBed < BED/testis.bed | \
    awk -f lowerStart.awk > encodeBuFirstExonTestis.bed

# take a look at data distribution (previous was skewed toward 0)
awk '{print $5}' *.bed | sort -n > scores.txt
textHistogram scores.txt -binSize=100 -maxBinCount=11
# 0 107.000000
# 100 8.000000
# 200 32.000000
# 300 53.000000
# 400 91.000000
# 500 130.000000
# 600 186.000000
# 700 118.000000
# 800 33.000000
# 900 1.000000
# 1000 1.000000
## cluster at 0, followed by a nice bell curve

cat > load.csh << 'EOF'
foreach f (Cerebrum Colon Heart Kidney Liver Lung SkMuscle Spleen Stomach Testis)
    hgLoadBed hg16 -noBin encodeBuFirstExon encodeBuFirstExon$f.bed -sqlTable=${HOME}/kent/src/hg/lib/encodeBuFirstExon.sql
    hgsql hg16 -e "DROP TABLE IF EXISTS encodeBuFirstExon$f"
    hgsql hg16 -e "ALTER TABLE encodeBuFirstExon RENAME TO encodeBuFirstExon$f"
    checkTableCoords hg16 encodeBuFirstExon$f
end
'EOF'
csh load.csh >&!
load.log # generate list of chromosomes with data for trackDb entry foreach f (Cerebrum Colon Heart Kidney Liver Lung SkMuscle Spleen Stomach Testis) /cluster/data/encode/scripts//showChromList.csh hg16 encodeBuFirstExon$f end #chr11,chr13,chr15,chr16,chr19,chr2,chr5,chr7,chr9,chrX # GENCODE Sanger Havana annotations (2005-06-07 kate) # previous versions are in other dirs # reloaded class table 6/13 based on request of Havana team ssh hgwdev mkdir -p /cluster/data/encode/Gencode cd /cluster/data/encode/Gencode mkdir 2005-06-07 cd 2005-06-07 wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=3 --no-parent \ --no-host-directories --cut-dirs=5 \ ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/CHR_coord_hg17/global_files/44regions_CHR_coord.gtf # Previous: Downloaded: 7,718,475 bytes in 2 files (note: 1/2 as many as previous) # Current: Downloaded: 7,753,057 bytes in 2 files mv current/CHR_coord_hg17/global_files/44regions_CHR_coord.gtf . rm -fr current wc -l 44regions_CHR_coord.gtf # CURRENT: 46418 44regions_CHR_coord.gtf # PREVIOUS: 46191 44regions_CHR_coord.gtf ln -s 44regions_CHR_coord.gtf gencode.gtf # converting to genePred before lifting to create gene track # due to problems in liftOver -gff found by Mark D. 
# Exclude non-Vega sources as per France Denoeud at IMIM grep VEGA gencode.gtf > gencode.vega.gtf ldHgGene -out=gencode.gp -gtf -genePredExt null null gencode.vega.gtf # CURRENT: Read 2888 transcripts in 46127 lines in 1 files # PREVIOUS: Read 2542 transcripts in 40133 lines in 1 files liftOver -genePred gencode.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ gencode.hg16.gp gencode.gp.unmapped wc -l gencode.hg16.gp # previous 2540 gencode.hg16.gp # 2851 gencode.hg16.gp wc -l gencode.gp.unmapped # PREVIOUS # 4 -> 2 problem transcripts # AC005538.4-001 and AC005538.4-006 # UPDATE # 37 unmapped, all on chrX except: # AC005538.4-001 and AC005538.4-006 on chr2 # Each transcript had 1 exon unmapped -- notifying submitter ldHgGene -predTab -genePredExt hg16 encodeGencodeGene gencode.hg16.gp # 2851 gene predictions checkTableCoords hg16 encodeGencodeGene # introns track liftOver -gff gencode.gtf \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ gencode.hg16.gtf gencode.gtf.unmapped wc -l gencode.hg16.gtf # CURRENT: 46239 gencode.hg16.gtf # PREVIOUS: 46014 gencode.hg16.gtf grep intron gencode.hg16.gtf | wc -l # 15695 grep intron gencode.hg16.gtf | grep -v not_tested | wc -l # CURRENT: 467 # PREVIOUS: 397 grep -v not_tested gencode.hg16.gtf | sed -e 's/-intron/-/g' | \ ldGencodeIntron hg16 encodeGencodeIntron stdin # CURRENT: 467 introns in 1 files # PREVIOUS: 397 introns in 1 files # OLDER: 425 introns in 1 files # create gene class table hgsql hg16 < ~/kent/src/hg/lib/gencodeGeneClass.sql awk '{printf "%s\t%s\n", $10, $2}' gencode.vega.gtf | \ sed -e 's/"//g' -e 's/;//' -e 's/VEGA_//' \ -e 's/_val/_gencode_conf/' -e 's/Antisense/Novel_transcript/' | \ sort | uniq > gencodeGeneClass.tab wc -l gencodeGeneClass.tab # 2888 gencodeGeneClass.tab echo "LOAD DATA LOCAL INFILE 'gencodeGeneClass.tab' into table gencodeGeneClass" | hgsql hg16 # create table of accessions in reference set # These are genes with classes: Known, Novel_CDS, # Novel_transcript_gencode_conf, 
Putative_gencode_conf egrep 'Known|Novel_CDS|Novel_transcript_gencode_conf|Putative_gencode_conf' \ gencodeGeneClass.tab > gencodeGeneRef.tab sed 's/gencodeGeneClass/gencodeGeneRef/' \ ~/kent/src/hg/lib/gencodeGeneClass.sql | \ hgsql hg16 wc -l encodeGencodeRef.tab ldHgGene -predTab -genePredExt hg16 encodeGencodeGene gencode.hg16.gp # add to all.joiner # create table of reference annotation (unfortunate redundant processing) # for data analysis fair. We may want to replace this with a composite # track with 2 subtracks: one for reference annotation, plus "other" # Very hokey -- I used TB to filter by the 4 categories # (Known, Novel_CDS, Novel_transcript_gencode_conf, Putative_gencode_conf) # inventory sources for genepreds (genes will be colored by these) hgsql -N hg16 \ -e "select distinct(class) from gencodeGeneClass" > classes.txt wc -l classes.txt # 10 # run Mark D's gene checker on VEGA_Known and email results to submitter echo " select encodeGencodeGene.* from encodeGencodeGene, gencodeGeneClass where encodeGencodeGene.name=gencodeGeneClass.name and gencodeGeneClass.class='Known'" | hgsql hg16 -N > gencode.known.gp /cluster/bin/i386/gene-check -incl-ok -nib-dir /cluster/data/hg16/nib -details-out encodeGencodeGene.chk-details gencode.known.gp encodeGencodeGene.chk # create a filtered table for use at data analysis workshop cd /cluster/data/encode/Gencode/2005-06-07 hgsql hg16 < encodeGencodeGeneFiltered.sql insert into encodeGencodeGeneFiltered select t1.* from encodeGencodeGene as t1, gencodeGeneClass as t2 where t1.name = t2.name and t2.class = "Known"; # 2291 rows insert into encodeGencodeGeneFiltered select t1.* from encodeGencodeGene as t1, gencodeGeneClass as t2 where t1.name = t2.name and t2.class = "Novel_CDS"; # 104 rows ########################################################################## # Sanger Chip/chip (DONE 2005-03-14 kate) ssh hgwdev cd /cluster/data/encode/sanger/ mkdir -p chipchip/2005-02-15 cd chipchip/2005-02-15 # obtain data files 
from ftp site cp /var/ftp/encode/H3K4me3_GM06990_1.description.txt . cp /var/ftp/encode/H3K4me3_GM06990_1.wig.txt . rm /var/ftp/encode/H3K*.txt # 4 columns: chrom, start, end, score (float) # data value range is .12 to 74.875 # format as BED file grep "^chr" H3K4me3_GM06990_1.wig.txt > H3K4me3_GM06990_1.bed # create table schema for BED 4 with float score cat > table.sql << 'EOF' CREATE TABLE tablename ( chrom varchar(255) not null, # Chromosome (this species) chromStart int unsigned not null, # Start position in chromosome (forward strand) chromEnd int unsigned not null, # End position in chromosome score float not null, # Overall score #Indices INDEX(chrom(13),chromStart), INDEX(chrom(13),chromEnd) ); 'EOF' # << for emacs # NOTE !*! hgLoadBed has been fixed 2005-06-01 to be able to load # these bedGraph types without this special sql business. set tablename = encodeSangerChipH3me3GM06990 sed "s/tablename/$tablename/" table.sql > $tablename.sql hgsql hg16 < $tablename.sql hgLoadBed -noBin -sqlTable=$tablename.sql hg16 $tablename H3K4me3_GM06990_1.bed # Loaded 17604 elements of size 4 # create trackDb entry: # type bedGraph 4 # shortLabel ChIP/Sanger # longLabel Sanger ChIP/Chip (H3K4me3 GM06990) in ENCODE Regions # minLimit 0 # maxLimit 75 ########################################################################## # Genome Institute of Singapore -ChIP/PET of p53 TFBS (2005-03-31 kate) # UPDATED (2005-06-13 angie) cd /cluster/data/encode mkdir -p GIS/2005-04-07 cd GIS/2005-04-07 # files: description.html, p53ChIP-PET.bed sed '1d' p53ChIP-PET.bed | hgLoadBed hg16 encodeGisChipPet stdin # Loaded 65513 elements of size 12 # UPDATE (2005-06-13 angie) # Use cluster-count info, now embedded into the name, to make scored BED: cd /cluster/data/encode/GIS/p53/2005-06-10 tail +2 p53ChIP-PET-hg16.bed \ | perl -wpe 'chomp; @w = split; \ if ($w[3] =~ /^\d+-(\d+)$/) { \ $w[4] = ($1 >= 4 ? 1000 : ($1 >= 3 ? 800 : 333)); \ } else { die "parse"; } \ $_ = join("\t", @w) . 
"\n";' \ | hgLoadBed hg16 encodeGisChipPet stdin # checkTableCoords cron job caught some overlapping blocks. # I would like to fix it, but Atif prefers to leave in the overlapping # blocks for consistency with what is on their site! :[ Oh well. ########################################################################## # TEST RE-LOADING AFFY TRANSCRIPTION AND CHIP/CHIP TEMPORAL PROFILING # - Hiram 2005-05-24 # as bedGraph type tracks instead of wiggle to provide accurate # data retrieval of the original data rather than have the wiggle # transformations interfering. An experiment to see if there is # a large difference in storage requirements for this data. ssh hgwdev mkdir /cluster/data/encode/Affy/Dec03_2004/bedGraph cd /cluster/data/encode/Affy/Dec03_2004/bedGraph # NOTE !*! hgLoadBed has been fixed 2005-06-01 to be able to load # these bedGraph types without this special sql business. for HR in 00 02 08 32 do rm -f encodeAffyRnaHl60SigHr${HR}.bed rm -f bedGraph.sql zcat ../ucsc_formats/wig/EC_HL60_RA_RNA_${HR}hr_AS.hs.NCBIv34.wig.gz | \ /cluster/bin/scripts/varStepToBedGraph.pl /dev/stdin | \ sort -k1,1 -k2,2n > encodeAffyRnaHl60SigHr${HR}.bed hgsql -e "drop table encodeAffyRnaHl60SigHr${HR}" hg16 2> /dev/stderr echo "CREATE TABLE encodeAffyRnaHl60SigHr${HR} (" > bedGraph.sql echo 'bin smallint unsigned not null,' >> bedGraph.sql echo 'chrom varchar(255) not null,' >> bedGraph.sql echo 'chromStart int unsigned not null,' >> bedGraph.sql echo 'chromEnd int unsigned not null,' >> bedGraph.sql echo 'dataValue float not null,' >> bedGraph.sql echo 'INDEX(chrom(6),bin)' >> bedGraph.sql echo ');' >> bedGraph.sql hgLoadBed -sqlTable=bedGraph.sql hg16 encodeAffyRnaHl60SigHr${HR} \ encodeAffyRnaHl60SigHr${HR}.bed done # # Measuring previous data requirements: ssh hgwdev cd /gbdb/hg16/encode/Affy/2004-12-03/wib du -hscL encodeAffyRna* # 19M encodeAffyRnaHl60SigHr00.wib # 19M encodeAffyRnaHl60SigHr02.wib # 19M encodeAffyRnaHl60SigHr08.wib # 19M 
encodeAffyRnaHl60SigHr32.wib # 76M total # examining the "show table status" on these tables shows: # index length: 277,504 * 4 = 1,110,016 = 1.06 Mb # data length: 3,291,136 * 4 = 13,181,472 = 12.57 Mb # therefore total size: 76 + 1 + 13 = 90 Mb # These new tables loaded are: # index length: 9,033,728 * 4 = 36,134,912 = 34.46 Mb # data length: 17,331,152 * 4 = 69,096,304 = 65.90 Mb # thus total length: 34 + 66 = 100 Mb # At this density of data these storage areas are certainly # comparable and practically equivalent. # NOTE !*! hgLoadBed has been fixed 2005-06-01 to be able to load # these bedGraph types without this special sql business. # do the other set of data too for HR in 00 02 08 32 do rm -f encodeAffyChIpRnapHl60PvalHr${HR}.bed rm -f bedGraph.sql zcat \ ../ucsc_formats/wig/EC_HL60_RA_RNApol2_${HR}hr_AS_b.hs.NCBIv34.wig.gz \ | /cluster/bin/scripts/varStepToBedGraph.pl /dev/stdin | \ sort -k1,1 -k2,2n > encodeAffyChIpRnapHl60PvalHr${HR}.bed hgsql -e "drop table encodeAffyChIpRnapHl60PvalHr${HR}" hg16 2> /dev/stderr echo "CREATE TABLE encodeAffyChIpRnapHl60PvalHr${HR} (" > bedGraph.sql echo 'bin smallint unsigned not null,' >> bedGraph.sql echo 'chrom varchar(255) not null,' >> bedGraph.sql echo 'chromStart int unsigned not null,' >> bedGraph.sql echo 'chromEnd int unsigned not null,' >> bedGraph.sql echo 'dataValue float not null,' >> bedGraph.sql echo 'INDEX(chrom(6),bin)' >> bedGraph.sql echo ');' >> bedGraph.sql hgLoadBed -sqlTable=bedGraph.sql hg16 \ encodeAffyChIpRnapHl60PvalHr${HR} \ encodeAffyChIpRnapHl60PvalHr${HR}.bed done # ######################################################################## # Sanger Chip/chip updated (DONE - 2005-05-25 - Hiram) # data supplier contacts: Christoph Koch # Ian Dunham Robert Andrews ssh hgwdev mkdir /cluster/data/encode/sanger/chipchip/2005-05-25 cd /cluster/data/encode/sanger/chipchip rm latest ln -s 2005-05-25 latest cd 2005-05-25 cp -p /var/ftp/encode/*GM06990* . 
# convert to simple bedGraph files: (stripping browser and track # line definitions from the custom track format), make sure they # are sorted properly. grep "^chr" H3K4me1_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \ H3K4me1_GM06990_1.bed grep "^chr" H3K4me3_GM06990_2.wig.txt | sort -k1,1 -k2,2n > \ H3K4me3_GM06990_2.bed grep "^chr" H4ac_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \ H4ac_GM06990_1.bed grep "^chr" H3K4me2_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \ H3K4me2_GM06990_1.bed grep "^chr" H3ac_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \ H3ac_GM06990_1.bed # find min and max in the data: awk '{print $4}' *.bed | sort -n | head -1 # 0.02 awk '{print $4}' *.bed | sort -n | tail -1 # 72.34 # This used to be a single track with only one data set # Place these new five data sets into a single composite track # Simplify the naming scheme here: ln -s H3K4me1_GM06990_1.bed H3K4me1.bed ln -s H3K4me2_GM06990_1.bed H3K4me2.bed ln -s H3K4me3_GM06990_2.bed H3K4me3.bed ln -s H3ac_GM06990_1.bed H3ac.bed ln -s H4ac_GM06990_1.bed H4ac.bed # NOTE !*! hgLoadBed has been fixed 2005-06-01 to be able to load # these bedGraph types without this special sql business. # so the names can now be used in a for loop: for T in H3K4me1 H3K4me2 H3K4me3 H3ac H4ac do hgsql -e "drop table encodeSangerChip${T}" hg16 2> /dev/stderr echo "CREATE TABLE encodeSangerChip${T} (" > bedGraph.sql echo 'bin smallint unsigned not null,' >> bedGraph.sql echo 'chrom varchar(255) not null,' >> bedGraph.sql echo 'chromStart int unsigned not null,' >> bedGraph.sql echo 'chromEnd int unsigned not null,' >> bedGraph.sql echo 'score float not null,' >> bedGraph.sql echo 'INDEX(chrom(6),bin)' >> bedGraph.sql echo ');' >> bedGraph.sql hgLoadBed -sqlTable=bedGraph.sql hg16 encodeSangerChip${T} ${T}.bed done # transform the five description pages into one single page. 
# see hg16/trackDb.ra for encodeSangerChipH3H4 composite track # definitions # clean up /var/ftp/encode rm -f /var/ftp/encode/Description_H3K4me1_GM06990_1.html rm -f /var/ftp/encode/Description_H3K4me2_GM06990_1.html rm -f /var/ftp/encode/Description_H3K4me3_GM06990_2.html rm -f /var/ftp/encode/Description_H3ac_GM06990_1.html rm -f /var/ftp/encode/Description_H4ac_GM06990_1.html rm -f /var/ftp/encode/H3K4me1_GM06990_1.wig.txt rm -f /var/ftp/encode/H3K4me2_GM06990_1.wig.txt rm -f /var/ftp/encode/H3K4me3_GM06990_2.wig.txt rm -f /var/ftp/encode/H3ac_GM06990_1.wig.txt rm -f /var/ftp/encode/H4ac_GM06990_1.wig.txt # Sanger confirmed they no longer need the old data, remove the # table from hgwdev: hgsql -e "drop table encodeSangerChip;" hg16 # Additional data submitted 2005-06-02 ssh hgwdev cd /cluster/data/encode/sanger/chipchip/2005-06-02 ln -s grep "^chr" H3K4me2_K562_1.wig.txt | sort -k1,1 -k2,2n > \ H3K4me2K562.bedGraph4 grep "^chr" H4ac_K562_1.wig.txt | sort -k1,1 -k2,2n > \ H4acK562.bedGraph4 grep "^chr" H3ac_K562_1.wig.txt | sort -k1,1 -k2,2n > \ H3acK562.bedGraph4 for T in H3K4me2K562 H4acK562 H3acK562 do hgLoadBed -bedGraph=4 hg16 encodeSangerChip${T} ${T}.bedGraph4 done # Additional data submitted 2005-06-27 (Heather) ssh hgwdev cd /cluster/data/encode/sanger/chipchip/2005-06-27 grep "^chr" H3K4me3_K562_1.wig.txt | sort -k1,1 -k2,2n > H3K4me3_K562.bedGraph4 hgLoadBed -bedGraph=4 hg16 encodeSangerChipH3K4me3K562 H3K4me3_K562.bedGraph4 ######################################################################## # DONE - 2005-06-06 - Hiram # Loading tables UCSD Ludwig Institute ChIP/Chip(H3K4me3 Hela and # H3K4Me2 Hela/IFN-g - two tables here replace the data we # received on 2005-05-26 -> tmH3K4_p0.wig and tmH3K4_p30.wig # The data in the two files is identical except for their track # label comments. 
# data supplier contact: Bing Ren biren@UCSD.EDU ssh hgwdev cd /cluster/data/encode/UCSD/2005-05-31 for T in acH3 acH4 dmH3K4 stat1 tmH3K4 do grep "^chr" ${T}_p0.wig | sort -k1,1 -k2,2n \ > encodeUcsdChipHeLaH3H4${T}_p0.bedGraph4 grep "^chr" ${T}_p30.wig | sort -k1,1 -k2,2n \ > encodeUcsdChipHeLaH3H4${T}_p30.bedGraph4 hgLoadBed -bedGraph=4 hg16 encodeUcsdChipHeLaH3H4${T}_p0 \ encodeUcsdChipHeLaH3H4${T}_p0.bedGraph4 hgLoadBed -bedGraph=4 hg16 encodeUcsdChipHeLaH3H4${T}_p30 \ encodeUcsdChipHeLaH3H4${T}_p30.bedGraph4 done # a couple of later additions, just under the wire cd /cluster/data/encode/UCSD/2005-06-06 # rename to track name ln -s RNAP/rnap_p0.wig encodeUcsdChipHeLaH3H4RNAP_p0.wig ln -s Taf250/taf250_p0.wig encodeUcsdChipHeLaH3H4TAF250_p0.wig ln -s RNAP/rnap_p30.wig encodeUcsdChipHeLaH3H4RNAP_p30.wig ln -s Taf250/taf250_p30.wig encodeUcsdChipHeLaH3H4TAF250_p30.wig for T in RNAP TAF250 do grep "^chr" encodeUcsdChipHeLaH3H4${T}_p0.wig | sort -k1,1 -k2,2n \ > encodeUcsdChipHeLaH3H4${T}_p0.bed grep "^chr" encodeUcsdChipHeLaH3H4${T}_p30.wig | sort -k1,1 -k2,2n \ > encodeUcsdChipHeLaH3H4${T}_p30.bed hgLoadBed -strict -bedGraph=4 hg16 encodeUcsdChipHeLaH3H4${T}_p0 \ encodeUcsdChipHeLaH3H4${T}_p0.bed hgLoadBed -strict -bedGraph=4 hg16 encodeUcsdChipHeLaH3H4${T}_p30 \ encodeUcsdChipHeLaH3H4${T}_p30.bed done # extensive edits were performed on the delivered description files # they were pretty sparse # GIS RNA PET (2005-06-03 angie) # Data updated (previous data was mistakenly in hg17 coords) (2006-10-19 kate) # Data saved to: #cd /cluster/data/encode/GIS/rna5fu/2005-05-31 #ln -s 2005-05-31 /cluster/data/encode/GIS/rna5fu/latest cd /cluster/data/encode/GIS/rna/2005-10-31 # Jim requested colored bed. Names are of the form $id-$mapCount-$atCount. 
# Coloring scheme (inspired by Chuck's transcriptome track): # ($mapCount == 1 && $atCount > 1): dark blue # ($mapCount == 1 && $atCount == 1): light blue # ($mapCount > 1 && $atCount > 1): medium brown # ($mapCount > 1 && $atCount == 1): light brown # Angie's process, slightly modified (kate) cat > load.hg16.csh << 'EOF' foreach f (lab/HCT116-hg16.bed lab/MCF7-hg16.bed) set cellType = `echo $f:t:r | sed -e 's/-hg16//'` echo $cellType set table = encodeGisRnaPet$cellType grep '^chr' $f | \ perl -wpe \ 'chomp; @w = split; \ if ($w[3] =~ /\d+-(\d+)-(\d+)/) { \ ($mc, $ac) = ($1, $2, $3); \ if ($mc == 1) { $w[8] = ($ac > 1) ? "35,35,175" : "160,160,188"; } \ elsif ($mc > 1) { $w[8] = ($ac > 1) ? "180,120,0" : "225,150,0"; } \ else { die "mc $mc" } \ } else { die "parse"; } \ $_ = join(" ", @w) . "\n";' > $table.bed hgLoadBed hg16 $table $table.bed end 'EOF' csh load.hg16.csh >&! load.hg16.log rm *.bed # UCSD/LI Nimblegen whole genome TAF1 & condensed validation (2005-06-02 angie) # Data saved to: cd /cluster/data/encode/UCSD/nimblegen/2005-05-31 # TAF1.bed contains two separate tracks: known and novel sites. # I manually split it into two files: TAF1_known.bed and TAF1_novel.bed. # Coordinates were off-by-1 so adjust before loading: awk '{print $1 "\t" $2-1 "\t" $3-1;}' TAF1_known.bed \ | hgLoadBed -noBin hg16 encodeUcsdNgChipKnownSites stdin awk '{print $1 "\t" $2-1 "\t" $3-1;}' TAF1_novel.bed \ | hgLoadBed -noBin hg16 encodeUcsdNgChipNovelSites stdin # There are >1M rows on the big chromosomes, while 100k seems to be # kind of an upper limit for bedGraph, performance-wise (>3sec load). 
# So load as wiggle; adjust coords here too: ssh kolossus cd /cluster/data/encode/UCSD/nimblegen/2005-05-31 set track = encodeUcsdNgChipSignal awk '$1 == "variableStep" {print;} \ $1 ~ /[0-9]+/ {print $1+1 "\t" $2;}' chr*.sub \ > $track.varStep wigEncode $track.varStep $track.wig $track.wib #Converted stdin, upper limit 6.47, lower limit -5.61 # back on hgwdev: set track = encodeUcsdNgChipSignal set gbdbDir = /gbdb/hg16/encode/UCSD/2005-05-31/wib mkdir -p $gbdbDir ln -s `pwd`/$track.wib $gbdbDir/ nice hgLoadWiggle hg16 $track $track.wig -pathPrefix=$gbdbDir # Load validation subtracks; standardize hist mod names and adjust coords: foreach f (*.wig) set cellType = `echo $f:r | perl -wpe 's/(\S+)_condensed_sorted/\u$1/; \ s/Ach3/H3ac/g; s/Meh3k4/H3K4me/g;'` set table = encodeUcsdNgValChip$cellType echo $f ' -> ' $table tail +5 $f | awk '{print $1 "\t" $2-1 "\t" $3-1 "\t" $4;}' \ | nice hgLoadBed -onServer -bedGraph=4 hg16 $table stdin end # back on kolossus, clean up: gzip encodeUcsdNgChipSignal.wig encodeUcsdNgChipSignal.varStep rm *.tab # UCSD/LI Nimblegen RnaP & H3K4me3 (2005-06-03 angie) ssh kolossus cd /cluster/data/encode/UCSD/nimblegen/2005-06-01 # Data are in hg17 coords, so must liftOver to hg16. foreach f (lab/Nim*/*.wig) set track = `echo $f:t:r | sed -e \ 's/rnap/encodeUcsdNgHeLaRnap/; s/tmh3k4/encodeUcsdNgHeLaH3K4me3/;'` echo $track tail +5 $f \ | liftOver stdin /cluster/data/hg17/bed/liftOver/hg17ToHg16.over.chain \ hg16.$track.bed hg16.$track.unmapped end wc -l hg16* # 384969 hg16.encodeUcsdNgHeLaH3K4me3_p0.bed # 360 hg16.encodeUcsdNgHeLaH3K4me3_p0.unmapped # 384969 hg16.encodeUcsdNgHeLaH3K4me3_p30.bed # 360 hg16.encodeUcsdNgHeLaH3K4me3_p30.unmapped # 384969 hg16.encodeUcsdNgHeLaRnap_p0.bed # 360 hg16.encodeUcsdNgHeLaRnap_p0.unmapped # 384969 hg16.encodeUcsdNgHeLaRnap_p30.bed # 360 hg16.encodeUcsdNgHeLaRnap_p30.unmapped # All but one of the unmapped's were from chrX, and most of those were # "Deleted in new" (i.e. in hg17 but not hg16). Seems OK. 
# There are <100,000 per chrom, so load as bedGraph. ssh hgwdev cd /cluster/data/encode/UCSD/nimblegen/2005-06-01 foreach f (hg16.*.bed) hgLoadBed -onServer -bedGraph=4 hg16 $f:t:r $f end ########################################################################### # TBA Alignments from Elliott Margulies (2005-06-06 kate) mkdir /cluster/data/encode/TBA cd /cluster/data/encode/TBA mkdir 2005-06-08 cd 2005-06-08 mkdir lab; cd lab wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=3 --no-parent \ --no-host-directories --cut-dirs=5 \ ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/encode_tba/MAY-2005/ # load mafs gunzip *.maf.gz mkdir -p /gbdb/hg16/encode/TBA/maf rm -f /gbdb/hg16/encode/TBA/maf/*.maf ln -s /cluster/data/encode/TBA/latest/lab/*.maf /gbdb/hg16/encode/TBA/maf cd .. hgLoadMaf -pathPrefix=/gbdb/hg16/encode/TBA/maf -WARN \ hg16 encodeTbaAlign >&! load.log # Loaded 229243 mafs in 44 files from /gbdb/hg16/encode/TBA/maf # NOTE: many warnings about "Score too small" # load phastCons conservation wiggle gunzip -c lab/wig/human.EN*.pC.*.wig.gz | \ wigEncode stdin phastCons.wig phastCons.wib rm -f /gbdb/hg16/encode/TBA/phastCons.wib ln -s `pwd`/phastCons.wib /gbdb/hg16/encode/TBA hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/TBA hg16 \ encodeTbaPhastCons phastCons.wig # Updated with 6/22 binCons data at Elliott's request (2005-06-29 kate) cd /cluster/data/encode/TBA/latest/lab/wig/ mkdir jun30 cd jun30 wget -r -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/encode_tba/MAY-2005/wig/ cd ../../ gunzip -c lab/wig/jun30/EN*.binCons.*.wig.gz | \ wigEncode stdin binCons.wig binCons.wib gunzip -c lab/wig/jun30/human*.pC.*.wig.gz | \ wigEncode stdin phastCons.wig phastCons.wib set d = /gbdb/hg16/encode/TBA rm -f $d/binCons.wib $d/phastCons.wib ln -s `pwd`/binCons.wib $d ln -s `pwd`/phastCons.wib $d hgLoadWiggle -pathPrefix=$d hg16 encodeTbaPhastCons phastCons.wig hgLoadWiggle -pathPrefix=$d hg16 encodeTbaPhastCons 
phastCons.wig # load MAF summary table cat lab/*.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 hg16 encodeTbaSummary stdin # load conserved elements (6 tables) # scored elements hgLoadBed -noBin hg16 encodeTbaBinConsEl lab/tba.binCons.bed # loaded 18677 hgLoadBed -noBin hg16 encodeTbaPhastConsEl lab/tba.phastCons.bed # loaded 36105 elements # unscored - fix at score=500 ln -s lab/tba.or.bed TbaUnionEl.bed ln -s lab/tba.or.nc.bed TbaNcUnionEl.bed ln -s lab/tba.and.bed TbaIntersectEl.bed ln -s lab/tba.and.nc.bed TbaNcIntersectEl.bed foreach f (Tba*.bed) set b = $f:r set t = encode$b sed -e 's/$/\t500/' $f | hgLoadBed -noBin hg16 $t stdin checkTableCoords hg16 $t end # load GERP conservation score and elements # conservation wiggle cat lab/GERP/chr*.wig | \ wigEncode stdin GerpCons.wig GerpCons.wib # Converted stdin, upper limit 3.00, lower limit 0.00 mkdir -p /gbdb/hg16/encode/TBA rm -f /gbdb/hg16/encode/TBA/GerpCons.wib ln -s `pwd`/GerpCons.wib /gbdb/hg16/encode/TBA hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/TBA hg16 \ encodeTbaGerpCons GerpCons.wig # conserved elements # NOTE: round float scores set t = encodeTbaGerpEl perl -we 'while (<>) { \ if (/^chr/) { \ ($chrom,$start,$end,$name,$score) = split; \ $name =~ s/GERP.score=/GERP./; \ printf("%s\t%d\t%d\t%s\t%.0f\n", $chrom,$start,$end,$name,$score); \ } }' lab/GERP/ENC_GERP_TBA_cons.bed > $t.tab hgLoadBed -noBin hg16 $t $t.tab # Loaded 40576 elements of size 5 ########################################################################### # Loading Yale Affy ChIP/Chip data (DONE - 2005-06-03 - Hiram) # Two files were found to have bad start,end coordinates, they # were resubmitted by Joel Rozowsky: # encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Pvalue_Map.wig # encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Signal_Map.wig # This prompted the addition of the -strict option to hgLoadBed # data supplier contact: Joel Rozowsky joel.rozowsky@yale.edu cd 
/cluster/data/encode/yale/affy/2005-06-01/CHIPCHIP_STAT1 # rename the data files to their resulting table names: ln -s encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Pvalue_Map.wig \ encodeYaleChIPSTAT1HeLaBingRenPval.wig ln -s encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Signal_Map.wig \ encodeYaleChIPSTAT1HeLaBingRenSig.wig ln -s encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Sites.bed \ encodeYaleChIPSTAT1HeLaBingRenSites.bed ln -s \ encode_Yale_Chipchip_STAT1_Hela_Maskless36merevery36bp_Pvalue_Map.wig \ encodeYaleChIPSTAT1HeLaMaskLess36mer36bpPval.wig ln -s \ encode_Yale_Chipchip_STAT1_Hela_Maskless36merevery36bp_Signal_Map.wig \ encodeYaleChIPSTAT1HeLaMaskLess36mer36bpSig.wig ln -s encode_Yale_Chipchip_STAT1_Hela_Maskless36merevery36bp_Sites.bed \ encodeYaleChIPSTAT1HeLaMaskLess36mer36bpSite.bed ln -s \ encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery38bp_Pvalue_Map.wig \ encodeYaleChIPSTAT1HeLaMaskLess50mer38bpPval.wig ln -s \ encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery38bp_Signal_Map.wig \ encodeYaleChIPSTAT1HeLaMaskLess50mer38bpSig.wig ln -s encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery38bp_Sites.bed \ encodeYaleChIPSTAT1HeLaMaskLess50mer38bpSite.bed ln -s \ encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery50bp_Pvalue_Map.wig \ encodeYaleChIPSTAT1HeLaMaskLess50mer50bpPval.wig ln -s \ encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery50bp_Signal_Map.wig \ encodeYaleChIPSTAT1HeLaMaskLess50mer50bpSig.wig ln -s encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery50bp_Sites.bed \ encodeYaleChIPSTAT1HeLaMaskLess50mer50bpSite.bed # now they can simply be loaded, the .wig files: for T in BingRenPval BingRenSig MaskLess36mer36bpPval MaskLess50mer38bpPval \ MaskLess50mer50bpPval MaskLess36mer36bpSig MaskLess50mer38bpSig \ MaskLess50mer50bpSig do TBL="encodeYaleChIPSTAT1HeLa${T}" grep "^chr" ${TBL}.wig | sort -k1,1 -k 2,2n | hgLoadBed -strict -bedGraph=4 hg16 ${TBL} stdin done # The bed files: for T in BingRenSites MaskLess36mer36bpSite MaskLess50mer38bpSite 
\ MaskLess50mer50bpSite do TBL="encodeYaleChIPSTAT1HeLa${T}" grep "^chr" ${TBL}.bed | sort -k1,1 -k 2,2n | hgLoadBed -strict hg16 ${TBL} stdin done # very good description html files were delivered with this data # set. # The four Signal data sets were packaged into the composite track: # encodeYaleChIPSTAT1Sig # The four Sites data sets were packaged into the composite track: # encodeYaleChIPSTAT1Sites # The four Pvalue data sets were packaged into the composite track: # encodeYaleChIPSTAT1Pval ####################################################################### # Yale Affy RNA data sets (DONE - 2005-06-07 - Hiram) # data supplier contact: Joel Rozowsky joel.rozowsky@yale.edu # NOTE 2005-08-01 ### *** It turns out something was off-by-one with the # coordinates on all these data points. At this time we are not yet # certain what happened. At any rate, this data is being reloaded to # fix these coordinate problems. cd /cluster/data/encode/yale/affy/2005-06-01/RNA_NEUTROPHIL # create symlinks to rename the files to their table names: ln -s encode_Yale_Affy_Neutrophil_RNA_Tars.html \ encodeYaleAffyNeutRNATars.html ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep1.bed \ encodeYaleAffyNeutRNATars01.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep2.bed \ encodeYaleAffyNeutRNATars02.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep3.bed \ encodeYaleAffyNeutRNATars03.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep4.bed \ encodeYaleAffyNeutRNATars04.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep5.bed \ encodeYaleAffyNeutRNATars05.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep6.bed \ encodeYaleAffyNeutRNATars06.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep7.bed \ encodeYaleAffyNeutRNATars07.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep8.bed \ encodeYaleAffyNeutRNATars08.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep9.bed \ encodeYaleAffyNeutRNATars09.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep10.bed \ 
encodeYaleAffyNeutRNATars10.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Tars.bed \ encodeYaleAffyNeutRNATarsAll.bed ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map.html \ encodeYaleAffyNeutRNATransMap.html ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep1.wig \ encodeYaleAffyNeutRNATransMap01.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep2.wig \ encodeYaleAffyNeutRNATransMap02.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep3.wig \ encodeYaleAffyNeutRNATransMap03.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep4.wig \ encodeYaleAffyNeutRNATransMap04.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep5.wig \ encodeYaleAffyNeutRNATransMap05.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep6.wig \ encodeYaleAffyNeutRNATransMap06.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep7.wig \ encodeYaleAffyNeutRNATransMap07.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep8.wig \ encodeYaleAffyNeutRNATransMap08.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep9.wig \ encodeYaleAffyNeutRNATransMap09.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep10.wig \ encodeYaleAffyNeutRNATransMap10.wig ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map.wig \ encodeYaleAffyNeutRNATransMapAll.wig # The TransMap tracks were too slow loaded as bedGraph, load them as # wiggle data. Something was fishy with trying to pipe wigEncode # stdout to hgLoadWiggle stdin. It kept complaining about some # line as if the stderr of the wigEncode was getting through the # pipe. 
Hence the temporary w.${TBL} file: # Re-loaded 2005-08-01 to add 1 to start,end coordinates mkdir -p /gbdb/hg16/encode/Yale/Affy/wib mkdir wib for T in All do TBL="encodeYaleAffyNeutRNATransMap${T}" rm -f w.${TBL} wib/${TBL}.wib t.wig echo -e "chr1\t148374651\t148374672\t0" > t.wig grep "^chr" ${TBL}.wig | \ awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' >> t.wig sort -k1,1 -k2,2n t.wig | \ wigEncode stdin w.${TBL} wib/${TBL}.wib hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Yale/Affy hg16 ${TBL} w.${TBL} rm -f /gbdb/hg16/encode/Yale/Affy/wib/${TBL}.wib ln -s `pwd`/wib/${TBL}.wib \ /gbdb/hg16/encode/Yale/Affy/wib rm -f w.${TBL} t.wig done for T in 01 02 03 04 05 06 07 08 09 10 do TBL="encodeYaleAffyNeutRNATransMap${T}" rm -f w.${TBL} wib/${TBL}.wib grep "^chr" ${TBL}.wig | \ awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' \ | sort -k1,1 -k2,2n | \ wigEncode stdin w.${TBL} wib/${TBL}.wib hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Yale/Affy hg16 ${TBL} w.${TBL} rm -f /gbdb/hg16/encode/Yale/Affy/wib/${TBL}.wib ln -s `pwd`/wib/${TBL}.wib \ /gbdb/hg16/encode/Yale/Affy/wib rm -f w.${TBL} done # Loading the Tars files: for T in All 01 02 03 04 05 06 07 08 09 10 do TBL="encodeYaleAffyNeutRNATars${T}" echo "${TBL}.bed" grep "^chr" ${TBL}.bed | \ awk '{printf "%s\t%d\t%d\n", $1, $2+1, $3+1}' \ | sort -k1,1 -k2,2n | \ hgLoadBed -strict hg16 ${TBL} stdin done # The file encode_Yale_Affy_Neutrophil_RNA_Transcript_Map.wig # had an unusual error in its chromEnd column, it was # manually fixed here and verified with Joel that this was the # correct fix: # 51506c51506 # < chr2 234999988 2.35e+08 -5.7625 # --- # > chr2 234999988 235000006 -5.7625 # 717784c717784 # < chrX 151999980 1.52e+08 0.6375 # --- # > chrX 151999980 152000005 0.6375 # This is a pattern of errors that will be seen in other files in # this data set. Some kind of PERL thing Joel suspects. 
# The 11 TransMap data sets are grouped into the composite track: # encodeYaleAffyRNATransMap # And the 11 TARs are grouped into: encodeYaleAffyRNATars # Same procedure for the PLACENTA tissue samples: ln -s encode_Yale_Affy_Placenta_RNA_Tars.bed \ encodeYaleAffyPlacRNATars.bed ln -s encode_Yale_Affy_Placenta_RNA_Tars.html \ encodeYaleAffyPlacRNATars.html ln -s encode_Yale_Affy_Placenta_RNA_Transcript_Map.html \ encodeYaleAffyPlacRNATransMap.html ln -s encode_Yale_Affy_Placenta_RNA_Transcript_Map.wig \ encodeYaleAffyPlacRNATransMap.wig # There is only one of each type here # Reloaded 2005-08-01, added 1 to all start,end coordinates and # added the missing first probe at chr1 148374651 148374672 mkdir wib rm -f t.wig echo -e "chr1\t148374651\t148374672\t0" > t.wig grep "^chr" encodeYaleAffyPlacRNATransMap.wig | \ awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' >> t.wig sort -k1,1 -k2,2n t.wig | \ wigEncode stdin tmp.wig wib/encodeYaleAffyPlacRNATransMap.wib hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Yale/Affy hg16 \ encodeYaleAffyPlacRNATransMap tmp.wig rm -f /gbdb/hg16/encode/Yale/Affy/wib/encodeYaleAffyPlacRNATransMap.wib ln -s `pwd`/wib/encodeYaleAffyPlacRNATransMap.wib \ /gbdb/hg16/encode/Yale/Affy/wib rm -f tmp.wig rm -f t.wig grep "^chr" encodeYaleAffyPlacRNATars.bed | \ awk '{printf "%s\t%d\t%d\n", $1, $2+1, $3+1}' \ | sort -k1,1 -k2,2n | \ hgLoadBed -strict hg16 encodeYaleAffyPlacRNATars stdin # These two were added to the previously mentioned composites: # encodeYaleAffyRNATransMap and encodeYaleAffyRNATars # Same procedure for the NB4 data: cd /cluster/data/encode/yale/affy/2005-06-01/RNA_NB4 ln -s encode_Yale_Affy_NB4_RA_RNA_Tars.bed \ encodeYaleAffyNB4RARNATars.bed ln -s encode_Yale_Affy_NB4_Untreated_RNA_Tars.bed \ encodeYaleAffyNB4UntrRNATars.bed ln -s encode_Yale_Affy_NB4_TPA_RNA_Tars.bed \ encodeYaleAffyNB4TPARNATars.bed ln -s encode_Yale_Affy_NB4_RA_RNA_Transcript_Map.wig \ encodeYaleAffyNB4RARNATransMap.wig ln -s 
encode_Yale_Affy_NB4_TPA_RNA_Transcript_Map.wig \ encodeYaleAffyNB4TPARNATransMap.wig ln -s encode_Yale_Affy_NB4_Untreated_RNA_Transcript_Map.wig \ encodeYaleAffyNB4UntrRNATransMap.wig for T in NB4RA NB4TPA NB4Untr do TNAME=encodeYaleAffy${T}RNATars grep "^chr" ${TNAME}.bed | \ awk '{printf "%s\t%d\t%d\n", $1, $2+1, $3+1}' \ | sort -k1,1 -k2,2n | \ hgLoadBed -strict hg16 ${TNAME} stdin done mkdir wib for T in NB4RA NB4TPA NB4Untr do TBL=encodeYaleAffy${T}RNATransMap rm -f w.${T} t.wig wib/${TBL}.wib echo -e "chr1\t148374651\t148374672\t0" > t.wig grep "^chr" ${TBL}.wig | \ awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' >> t.wig sort -k1,1 -k2,2n t.wig | \ wigEncode stdin w.${T} wib/${TBL}.wib hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Yale/Affy hg16 \ ${TBL} w.${T} rm -f /gbdb/hg16/encode/Yale/Affy/wib/${TBL}.wib ln -s `pwd`/wib/${TBL}.wib /gbdb/hg16/encode/Yale/Affy/wib rm -f w.${T} t.wig done # These six were added to the previously mentioned composites: # encodeYaleAffyRNATransMap and encodeYaleAffyRNATars # The primary .html file was enhanced to include the specifics of # each of these three data sets into the one document ######################################################################### # Farnham Lab/UC Davis Nimblegen track (DONE - 2005-06-07 - Hiram) # Data supplier contact: Kyle J. 
Munn - kmunn@nimblegen.com cd /cluster/data/encode/UcDavis/2005-06-01 awk '{printf "%s\t%s\t%s\t%s\n", $1,$4,$5,$6}' E2F1_Median.gff | \ sort -k1,1 -k2,2n > encodeUCDavisE2F1Median.bedGraph4 hgLoadBed -strict -bedGraph=4 hg16 encodeUCDavisE2F1Median \ encodeUCDavisE2F1Median.bedGraph4 ######################################################################### # Rearrangement of existing UCSD tracks, combining into a single # composite track: encodeLIChIP - (DONE - 2005-06-07 - Hiram) # In general, the data supplier contact for UCSD data is: # Bing Ren biren@UCSD.EDU # long label: Ludwig Institute/UCSD ChIP/Chip (Pol2, TAF1, AcH3, H3K4me, # H3K27me3 antibodies) # short label: LI ChIP Various ssh hgwdev # two new data tracks: cd /cluster/data/encode/UCSD/2005-06-01 grep "^chr" suz12.wig | sort -k1,1 -k2,2n > \ encodeUcsdChipH3K27me3Suz12.bedGraph grep "^chr" tmh3k27.wig | sort -k1,1 -k2,2n > \ encodeUcsdChipH3K27me3.bedGraph hgLoadBed -strict -bedGraph=4 hg16 encodeUcsdChipH3K27me3Suz12 \ encodeUcsdChipH3K27me3Suz12.bedGraph hgLoadBed -strict -bedGraph=4 hg16 encodeUcsdChipH3K27me3 \ encodeUcsdChipH3K27me3.bedGraph # Reloading existing tracks to change the data value column from a # string to a float: mkdir /cluster/data/encode/UCSD/fix_columns cd /cluster/data/encode/UCSD/fix_columns # fetch data from existing tables for T in encodeUcsdChipMeh3k4Imr90 encodeUcsdChipAch3Imr90 \ encodeUcsdChipTaf250Hct116 encodeUcsdChipTaf250Imr90 \ encodeUcsdChipTaf250Thp1 encodeUcsdChipTaf250Hela \ encodeUcsdChipRnapHct116 encodeUcsdChipRnapImr90 \ encodeUcsdChipRnapThp1 encodeUcsdChipRnapHela do hgsql -N -e "describe ${T}" hg16 > ${T}.desc hgsql -N -e \ "select chrom,chromStart,chromEnd,name from ${T};" hg16 | \ sort -k1,1 -k2,2n > ${T}.bed done # reload these tables as bed graph, adding a _f to the table names: for T in encodeUcsdChipMeh3k4Imr90 encodeUcsdChipAch3Imr90 \ encodeUcsdChipTaf250Hct116 encodeUcsdChipTaf250Imr90 \ encodeUcsdChipTaf250Thp1 encodeUcsdChipTaf250Hela \ 
encodeUcsdChipRnapHct116 encodeUcsdChipRnapImr90 \ encodeUcsdChipRnapThp1 encodeUcsdChipRnapHela do hgLoadBed -strict -bedGraph=4 hg16 ${T}_f ${T}.bed done # each individual track's short and long label: # LI Pol2 HeLa - Ludwig Institute ChIP/Chip (Pol2 ab, HeLa cells) # LI Pol2 THP1 - Ludwig Institute ChIP/Chip (Pol2 ab, THP1 cells) # LI Pol2 IMR90 - Ludwig Institute ChIP/Chip (Pol2 ab, IMR90 cells) # LI Pol2 HCT116 - Ludwig Institute ChIP/Chip (Pol2 ab, HCT116 cells) # LI TAF1 HeLa - Ludwig Institute ChIP/Chip (TAF1 ab, HeLa cells) # LI TAF1 THP1 - Ludwig Institute ChIP/Chip (TAF1 ab, THP1 cells) # LI TAF1 IMR90 - Ludwig Institute ChIP/Chip (TAF1 ab, IMR90 cells) # LI TAF1 HCT116 - Ludwig Institute ChIP/Chip (TAF1 ab, HCT116 cells) # LI AcH3 IMR90 - Ludwig Institute ChIP/Chip (AcH3 ab, IMR90 cells) # LI H3K4me IMR90 - Ludwig Institute ChIP/Chip (MeH3K4 ab, IMR90 cells) # LI SUZ12 HeLa - Ludwig Institute ChIP/Chip (SUZ12 protein ab, HeLa cells # unstimulated) # LI H3K27me3 HeLa - Ludwig Institute ChIP/Chip (H3K27me3 ab, HeLa cells # unstimulated) # Replaces the composite track: encodeUcsdChipRnap # LI ChIP Pol2 - Ludwig Institute ChIP/Chip (Pol2 initiation form antibody) # which had four subtracks. # And: encodeUcsdChipTaf250 # LI ChIP TAF1 - Ludwig Institute ChIP/Chip (TAF1 antibody) # which had four subtracks. # And the single track: encodeUcsdChipAch3Imr90 # LI AcH3 IMR90 - Ludwig Institute ChIP/Chip (AcH3 antibody, IMR90 cells) # And the single track: encodeUcsdChipMeh3k4Imr90 #LI MeH3K4 IMR90 - Ludwig Institute ChIP/Chip (MeH3K4 antibody, IMR90 cells) ########################################################################## # EGASP Partial (2005-06-22 kate) # Gene tracks submitted for the EGASP competition, submitted # by the Gencode group (Roderic Guigo, Julien Legarde, IMIM) cd /cluster/data/encode mkdir -p EGASP/Partial/lab cd EGASP/Partial/lab wget --timestamping --force-directories --directory-prefix=. 
\ --dont-remove-listing --recursive --level=3 --no-parent \ --no-host-directories --cut-dirs=5 -nd \ ftp://genome.imim.es/pub/projects/gencode/data/egasp05/submitted_predictions/EGASP_partial/ wc -l lab/*.gtf # 1778 lab/ASPic.gtf # 4215 lab/AceSCAN.gtf # 2692 lab/Augustus_EST-Protein.gtf # 2347 lab/Augustus_abinitio.gtf # 2736 lab/Augustus_any.gtf # 2567 lab/Augustus_dualgenome.gtf # 3458 lab/GeneZilla.gtf # 2194 lab/SAGA.gtf # NOTE: exclude ASPic, which contains only intron records # Filenames above, with _CHR_COORDS_hg17.gff appended, are chrom coordinate versions # GeneZilla ldHgGene -out=genezilla.gp null null lab/GeneZilla.*.gff # Read 454 transcripts in 3949 lines in 1 files # 454 groups 20 seqs 1 sources 1 feature types # 454 gene predictions liftOver -genePred genezilla.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ genezilla.hg16.gp genezilla.gp.unmapped wc -l genezilla.hg16.gp genezilla.gp.unmapped # 655 genezilla.hg16.gp # 2 genezilla.gp.unmapped # ENr131-19 ldHgGene -predTab hg16 encodeEgaspPartGenezilla genezilla.hg16.gp # 655 gene predictions genePredCheck -db=hg16 encodeEgaspPartGenezilla # SAGA # Strip out trailing ## on lines where manual changes were made # (see notes in .gtf file) sed -e 's/ ##.*//' lab/SAGA.*.gff | \ ldHgGene -out=saga.gp null null stdin # 378 gene predictions liftOver -genePred saga.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ saga.hg16.gp saga.gp.unmapped wc -l saga.hg16.gp saga.gp.unmapped # 377 saga.hg16.gp # 2 saga.gp.unmapped # ENr131.3.0 ldHgGene -predTab hg16 encodeEgaspPartSaga saga.hg16.gp # 377 gene predictions genePredCheck -db=hg16 encodeEgaspPartSaga # Augustus wc -l lab/Augustus*.gff # 2646 lab/Augustus_EST-Protein.gtf_CHR_COORDS_hg17.gff # 2301 lab/Augustus_abinitio.gtf_CHR_COORDS_hg17.gff # 2690 lab/Augustus_any.gtf_CHR_COORDS_hg17.gff # 2521 lab/Augustus_dualgenome.gtf_CHR_COORDS_hg17.gff ln -s lab/Augustus_EST-Protein.gtf_CHR_COORDS_hg17.gff augustus.est.gff ln -s 
lab/Augustus_abinitio.gtf_CHR_COORDS_hg17.gff augustus.abinitio.gff ln -s lab/Augustus_any.gtf_CHR_COORDS_hg17.gff augustus.any.gff ln -s lab/Augustus_dualgenome.gtf_CHR_COORDS_hg17.gff augustus.dual.gff # convert to genePred foreach f (augustus.*.gff) set b = $f:r ldHgGene -out=$b.gp -genePredExt null null $f end # Reading abinitio.gff # 418 gene predictions # Reading any.gff # 399 gene predictions # Reading dual.gff # 413 gene predictions # Reading est.gff # 381 gene predictions # lift to hg16 foreach f (augustus.*.gp) set b = $f:r liftOver -genePred $b.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ $b.hg16.gp $b.gp.unmapped end wc -l augustus.*.hg16.gp # 417 augustus.abinitio.hg16.gp # 398 augustus.any.hg16.gp # 412 augustus.dual.hg16.gp # 380 augustus.est.hg16.gp wc -l augustus.*.unmapped # 2 augustus.abinitio.gp.unmapped # 2 augustus.any.gp.unmapped # 2 augustus.dual.gp.unmapped # 2 augustus.est.gp.unmapped # load foreach f (augustus.*.hg16.gp) genePredCheck $f set t = `echo $f | sed -e 's/augustus.\(.*\).hg16.gp/encodeEgaspPartAugustus\u\1/'` ldHgGene -predTab -genePredExt hg16 $t $f checkTableCoords hg16 $t end # AceSCAN # Split into two tracks -- conserved, and other, based on feature wc -l lab/AceSCAN*.gff # 4163 lab/AceSCAN.gtf_CHR_COORDS_hg17.gff sed -n '/ACE\[+\]exon/s//exon/p' lab/AceSCAN*.gff > aceCons.scored.gff sed -n '/ACE\[-\]exon/s//exon/p' lab/AceSCAN*.gff > aceOther.scored.gff wc -l ace*.gff # 159 aceCons.gff # 4004 aceOther.gff cat > doGene.csh << 'EOF' foreach f (ace*.scored.gff) echo $f set b = $f:r:r awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' $f | \ ldHgGene -out=$b.gp null null stdin liftOver -genePred $b.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ $b.hg16.gp $b.gp.unmapped wc -l $b.hg16.gp $b.gp.unmapped set t = `echo $b | perl -wpe 's/(.*)/encodeEgaspPart\u$1/'` ldHgGene -predTab hg16 $t $b.hg16.gp genePredCheck -db=hg16 $t end 'EOF' csh doGene.csh >&! 
doGene.log # aceCons # 117 gene predictions # aceOther # 727 gene predictions (708 mapped) ########################################################################## # EGASP Full (2005-06-27 kate) # Gene tracks submitted for the EGASP competition, submitted # by the Gencode group (Roderic Guigo, Julien Legarde, IMIM) cd /cluster/data/encode mkdir -p EGASP/Full/lab cd EGASP/Full/lab wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=3 --no-parent \ --no-host-directories --cut-dirs=5 \ ftp://genome.imim.es/pub/projects/gencode/data/egasp05/submitted_predictions/EGASP_on_all_regions/ -nd wc -l *.gtf cd .. mkdir custom # 34059 Aceview.gtf ln -s lab/Aceview.*.gff Aceview.gff # 11968 CSTminer.gtf # SKIP THIS (it's sequence tags, not gene preds) # 3241 DOGFISH-C.gtf ln -s lab/DOGFISH-C.*.gff Dogfish.gff # 13106 EnsEMBL_predictions.gtf ln -s `pwd`/lab/EnsEMBL*gff custom/Ensembl.gff # Has frames # 19004 ExoGean.gtf ln -s lab/ExoGean.*.gff Exogean.gff # 9273 ExonHunter.gtf ln -s `pwd`/lab/ExonHunter.*.gff custom/Exonhunter.gff # NOTE: has frame and start_codon annotations # 11679 FGenesh++.gtf ln -s lab/FGene*.gff Fgenesh.gff # 688 FProm.gtf # SKIPPING (promoter, not genepred) # 4178 GeneID_U12.gtf ln -s `pwd`/lab/GeneID_U12.*.gff custom/GeneIdU12.gff # 12107 GeneMark_on_masked_seq.gtf ln -s `pwd`/lab/GeneMark*.gff custom/Genemark.gff # Has transcript_ID but not gene_id # Redo Jigsaw 2005-08-16 kate # 4004 JigSaw.gtf ln -s `pwd`/lab/JigSaw.*.gff custom/Jigsaw.gff # 740 McPromoter.gtf # SKIPPING (promoters) # 12816 N-SCAN-Pairagon_any_v1.gtf sed -e 's/.pmasked.100Kpad.fa//g' lab/N-SCAN-Pairagon_any*.gff > PairagonAny.gff # 12798 N-SCAN-Pairagon_mRNA_EST_v1.gtf sed -e 's/.pmasked.100Kpad.fa//g' lab/N-SCAN-Pairagon_mRNA*.gff > PairagonMrna.gff # 7676 N-SCAN-Pairagon_multiple_v2.gtf sed -e 's/.pmasked.100Kpad.fa//g' lab/N-SCAN-Pairagon_multiple*.gff > PairagonMultiple.gff # 7676 N-SCAN-Pairagon_novel_v2.gtf sed -e 
's/.pmasked.100Kpad.fa//g' lab/N-SCAN-Pairagon_novel*.gff > PairagonNovel.gff # 6107 SGP_U12.gtf ln -s `pwd`/lab/SGP_U12.*.gff custom/SGP2U12.gff # 10007 SPIDA.gtf ln -s lab/SPIDA.*.gff Spida.gff # 208 SoftBerry_pseudogenes.gtf sed -e 's/pseudogene\t/CDS\t/' lab/SoftBerry*.gff > SoftberryPseudo.gff # 11540 TwinScan-MARS.gtf sed -e 's/transcript_id "W-U-Mm/transcript_id "/' lab/Twin*.gff > custom/Twinscan.gff # Has start_codon annotation and frames # 252 Uncover.gtf # SKIPPING (skipped exons, not gene preds) # Process "standard" gff files # NOTE: must dummy out scores -- float values cat > doGene.csh << 'EOF' foreach f (*.gff) set b = $f:r echo $f awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' $f | \ ldHgGene -out=$b.gp null null stdin liftOver -genePred $b.gp /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ $b.hg16.gp $b.gp.unmapped wc -l $b.hg16.gp $b.gp.unmapped set t = encodeEgaspFull$b ldHgGene -predTab hg16 $t $b.hg16.gp genePredCheck -db=hg16 $t end 'EOF' csh doGene.csh >&! doGene.log # NOTE: warnings about missing frames are OK. We're using -genePredExt # just to pick up the gene name # process special files cd custom grep -v pseudogene Ensembl.gff | \ ldHgGene -genePredExt -out=Ensembl.gp null null stdin # 735 gene predictions grep pseudogene Ensembl.gff | \ ldHgGene -genePredExt -out=EnsemblPseudo.gp null null stdin # 34 ldHgGene -gtf -genePredExt -out=Exonhunter.gp null null Exonhunter.gff # 1435 gene predictions ldHgGene -gtf -genePredExt -out=Jigsaw.gp null null Jigsaw.gff awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' Twinscan.gff | \ ldHgGene -genePredExt -gtf -out=Twinscan.gp null null stdin # 954 gene predictions # GENEID and SGP2 with special intron tracks # NOTE: dummy out scores (wrong format -- float, and neg. 
values) awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' < GeneIdU12.gff > geneId.gff ldHgGene -genePredExt -out=GeneId.gp null null geneId.gff # 476 gene predictions awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' < SGP2U12.gff > sgp2.gff ldHgGene -genePredExt -out=Sgp2.gp null null sgp2.gff # 930 gene predictions # lift and load some customs cat > doGene.csh << 'EOF' foreach f (*.gp) set b = $f:r echo $f liftOver -genePred $b.gp /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ $b.hg16.gp $b.gp.unmapped wc -l $b.hg16.gp $b.gp.unmapped set t = encodeEgaspFull$b ldHgGene -genePredExt -predTab hg16 $t $b.hg16.gp genePredCheck -db=hg16 $t end 'EOF' # << for emacs csh doGene.csh >&! doGene.log # process others perl -wpe 's/(.*transcript_id) (\S+)$/$1 $2 gene_id $2/' Genemark.gff | \ ldHgGene -out=Genemark.gp null null stdin # 880 gene predictions liftOver -genePred \ Genemark.gp /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ Genemark.hg16.gp Genemark.gp.unmapped set t = "encodeEgaspFullGenemark" ldHgGene -predTab hg16 $t Genemark.hg16.gp # 872 gene predictions genePredCheck -db=hg16 $t # create genepreds containing just exons flanking U12 introns # use U12 annotation as gene name, so it appears on details page grep U12 geneId.gff | perl -wpe \ 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \ > geneId.introns.gff ldHgGene -genePredExt -out=geneId.introns.gp null null geneId.introns.gff # 24 gene predictions liftOver -genePred geneId.introns.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ geneId.introns.hg16.gp geneId.introns.gp.unmapped set t = encodeEgaspFullGeneIdU12 ldHgGene -predTab -genePredExt hg16 $t geneId.introns.hg16.gp genePredCheck -db=hg16 $t set b = sgp2 grep U12 $b.gff | perl -wpe \ 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \ > $b.introns.gff ldHgGene -genePredExt -out=$b.introns.gp 
null null $b.introns.gff # 20 gene predictions liftOver -genePred $b.introns.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ $b.introns.hg16.gp $b.introns.gp.unmapped set t = encodeEgaspFullSgp2U12 ldHgGene -predTab -genePredExt hg16 $t $b.introns.hg16.gp genePredCheck -db=hg16 $t ########################################################################## # TIGR Jigsaw Genes from EGASP (2005-06-20 kate) # Contrib: Jonathan Allen # NOTE: hg17 coords cd /cluster/data/encode mkdir -p EGASP/Jigsaw/2005-06-01/lab cd EGASP/Jigsaw/2005-06-01/lab # convert to genepred, in preparation for dropping to hg16 # NOTE: no start_codons, so omit -gtf option grep '^chr' lab/jigsaw_ucsc.gtf | \ ldHgGene -out=jigsaw.gp -genePredExt null null stdin # Read 454 transcripts in 3949 lines in 1 files # 454 groups 20 seqs 1 sources 1 feature types # 454 gene predictions liftOver -genePred jigsaw.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ jigsaw.hg16.gp jigsaw.gp.unmapped wc -l jigsaw.hg16.gp jigsaw.gp.unmapped # 450 jigsaw.hg16.gp # 8 jigsaw.gp.unmapped ldHgGene -predTab -genePredExt hg16 encodeEgaspUpdJigsaw jigsaw.hg16.gp # 450 gene predictions genePredCheck -db=hg16 encodeEgaspUpdJigsaw ########################################################################## # AUGUSTUS GENES FROM EGASP (2005-06-10 kate) # Contrib: Mario Stanke (mstanke@gwdg.de) # Department of Bioinformatics of the University of Gottingen in Germany # submitted in hg17 coords. # NOTE: these were dropped from Update track -- Mario says they # belong in the Partial. ssh hgwdev cd /cluster/data/encode mkdir -p EGASP/Augustus cd EGASP/Augustus mkdir 2005-06-22 cd 2005-06-22 mkdir lab cd lab wget --timestamping --force-directories --directory-prefix=. 
\ --dont-remove-listing --recursive --level=3 --no-parent \ --no-host-directories --cut-dirs=5 http://augustus.gobics.de/predictions/encode/chr_coords/ --accept=gtf # NOTE: remove non .gtf files wc -l *.gtf # 4075 44encode.augustus.ESTprotein.chrcoords.gtf # 3701 44encode.augustus.abinitio.chrcoords.gtf # 4162 44encode.augustus.anyInformation.chrcoords.gtf # 4026 44encode.augustus.dualgenome.chrcoords.gtf cd .. ln -s lab/44encode.augustus.ESTprotein.chrcoords.gtf est.gtf ln -s lab/44encode.augustus.abinitio.chrcoords.gtf abinitio.gtf ln -s lab/44encode.augustus.anyInformation.chrcoords.gtf any.gtf ln -s lab/44encode.augustus.dualGenome.chrcoords.gtf dual.gtf # convert to genePred foreach f (*.gtf) set b = $f:t:r ldHgGene -out=$b.gp -genePredExt null null $f end #Reading est.gtf #543 gene predictions #Reading abinitio.gtf #622 gene predictions #Reading any.gtf #571 gene predictions #Reading dual.gtf #617 gene predictions # lift to hg16 foreach f (*.gp) set b = $f:r liftOver -genePred $b.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ $b.hg16.gp $b.gp.unmapped end wc -l *.hg16.gp # 616 abinitio.hg16.gp # 568 any.hg16.gp # 612 dual.hg16.gp # 541 est.hg16.gp wc -l *.unmapped # 12 abinitio.gp.unmapped # Missing 1 or 2 exons: ENr131.g14.1, ENm006.g18.1, ENm006.g20.1, # ENm006.g23.1, ENm006.g28.1, ENm006.g49.1 # 6 any.gp.unmapped # ENr131.g14.1, ENm006.g21.1, ENm006.g41.1 # 10 dual.gp.unmapped # ENr131.g14.1, ENm006.g16.1, ENm006.g20.1, ENm006.g25.1, ENm006.g42.1 # 4 est.gp.unmapped # Missing 1 exon: ENr131.g12.1, ENm006.g21.1 # load foreach f (*.hg16.gp) genePredCheck $f set t = `echo $f | sed -e 's/\(.*\).hg16.gp/encodeEgaspUpdAugustus\u\1/'` ldHgGene -predTab -genePredExt hg16 $t $f checkTableCoords hg16 $t end ########################################################################## # EXOGEAN GENES FROM EGASP (2005-06-10 kate) # Contrib: Sarah Djebali, ENS-Dyogean (France) cd /cluster/data/encode mkdir -p EGASP/Exogean cd EGASP/Exogean mkdir 2005-06-23 cd 
2005-06-23 mkdir lab # retrieve files from FTP site wc -l lab/*.gtf # 18958 All44_EXOGEAN_new3_ucsc.gtf # reformat file with tab delimiters , add dummy frame so we # can pick up transcript ID # convert to genePred perl -wpe 's/(chr\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)/$1\t$2\t$3\t$4\t$5\t$6\t$7\t.\t$8/' lab/*exogean*.gtf > exogean.gtf ldHgGene -out=exogean.gp null null exogean.gtf # 850 gene predictions liftOver -genePred exogean.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \ exogean.hg16.gp exogean.gp.unmapped wc -l exogean.hg16.gp exogean.gp.unmapped # 834 exogean.hg16.gp # 32 exogean.gp.unmapped # 16 unmapped # ENr131.2.{13,14,15} # ENm006.2.{7,9,10,11,12,13,14} # ENm006.9.{1,2,21,22,23,24} ldHgGene -predTab hg16 encodeEgaspUpdExogean exogean.hg16.gp # 834 gene predictions genePredCheck -db=hg16 encodeEgaspUpdExogean ########################################################################## # GENEIDU12 and SGPU12 GENES FROM EGASP (2005-06-24 kate) # Contrib: Tyler Alioto, IMIM, Barcelona cd /cluster/data/encode mkdir -p EGASP/GeneIdU12 cd EGASP/GeneIdU12 mkdir -p 2005-06-10/lab cd 2005-06-10 wc -l lab/*.gtf # 3551 lab/GeneIDu12_hg16.gtf # 5020 lab/SGP2u12_hg16.gtf # load gene preds, replacing real-valued score with '.' 
awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' < lab/GeneIDu12_hg16.gtf > geneId.gtf awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' < lab/SGP2u12_hg16.gtf > sgp2.gtf ldHgGene -genePredExt hg16 encodeEgaspUpdGeneId geneId.gtf # 475 gene predictions genePredCheck -db=hg16 encodeEgaspUpdGeneId ldHgGene -genePredExt hg16 encodeEgaspUpdSgp2 sgp2.gtf # 929 gene predictions genePredCheck -db=hg16 encodeEgaspUpdSgp2 # create genepreds containing just exons flanking U12 introns # use U12 annotation as gene name, so it appears on details page grep U12 geneId.gtf | perl -wpe \ 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \ > geneId.introns.gtf ldHgGene -genePredExt hg16 encodeEgaspUpdGeneIdU12 geneId.introns.gtf # 27 gene predictions grep U12 sgp2.gtf | perl -wpe \ 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \ > sgp2.introns.gtf ldHgGene -genePredExt hg16 encodeEgaspUpdSgp2U12 sgp2.introns.gtf # 20 gene predictions ########################################################################## # EGASP Yale Pseudogenes (2005-06-25 kate) # Contact: Deyou Zheng cd /cluster/data/encode mkdir -p EGASP/yale/2005-06-20/lab cd EGASP/yale/2005-06-20 wc -l lab/*.submitted # 203 lab/YalePgene-NCBI34.gtf.submitted # munge to create CDS entries to display, and assign pseudogene # name as transcript_id, and pseudogene type as gene_id so # it displays on details page sed -e 's/pseudogene\t/CDS\t/' -e 's/pgene_type/gene_id/' \ -e 's/alt_name ENCODE_Yale/transcript_id /' \ lab/YalePgene-NCBI34.gtf.submitted > yale.gtf ldHgGene -genePredExt hg16 encodeEgaspUpdYalePseudo yale.gtf # 203 gene predictions ########################################################################## # Yale Nimblegen data (DONE - 2005-06-08 - Hiram) # Data supplier contact: Olof Emanuelsson olof.emanuelsson@yale.edu ssh hgwdev cd 
/cluster/data/encode/yale/nimblegen/2005-06-01 # symlink data files to the names as their table names will be: D="encode_Yale_MAS_pbclassicprot_01Jun2005" fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Tars-FWD_Maskless36merevery36bp.html > \ encodeYaleMASPlacRNATarsFwdMless36mer36bp.html fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Tars-REV_Maskless36merevery36bp.html > \ encodeYaleMASPlacRNATarsRevMless36mer36bp.html ln -s ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Tars_FWD.Maskless36merevery36bp.bed \ encodeYaleMASPlacRNATarsFwdMless36mer36bp.bed ln -s ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Tars_REV.Maskless36merevery36bp.bed \ encodeYaleMASPlacRNATarsRevMless36mer36bp.bed fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Transcript-Map-FWD_Maskless36merevery36bp.html > \ encodeYaleMASPlacRNATransMapFwdMless36mer36bp.html ln -s ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Transcript-Map-FWD_Maskless36merevery36bp.wig \ encodeYaleMASPlacRNATransMapFwdMless36mer36bp.wig fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Transcript-Map-REV_Maskless36merevery36bp.html > \ encodeYaleMASPlacRNATransMapRevMless36mer36bp.html ln -s ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Transcript-Map-REV_Maskless36merevery36bp.wig \ encodeYaleMASPlacRNATransMapRevMless36mer36bp.wig # load the tables for T in encodeYaleMASPlacRNATarsFwdMless36mer36bp \ encodeYaleMASPlacRNATarsRevMless36mer36bp do grep "^chr" ${T}.bed | sort -k1,1 -k2,2n | \ hgLoadBed -strict hg16 ${T} stdin done # Combine these two tables into the composite: encodeYaleMASPlacRNATars # Yale MAS TAR - Yale MAS array, RNA Transcriptionally Active Regions, # Placenta, Nimblegen Protocol for T in encodeYaleMASPlacRNATransMapFwdMless36mer36bp \ encodeYaleMASPlacRNATransMapRevMless36mer36bp do grep "^chr" ${T}.wig | sort -k1,1 -k2,2n | \ hgLoadBed -bedGraph=4 -strict hg16 ${T} stdin done # Combine these two tables into the composite: 
encodeYaleMASPlacRNATransMap # Yale MAS RNA - Yale MAS array, RNA Transcript Map, Placenta, # Nimblegen Protocol ####################################################################### # More Yale Nimblegen data - 2005-07-05 - Hiram cd /cluster/data/encode/yale/nimblegen/2005-05-31 tar xvzf encode_Yale_MAS_nimbleprot_31May2005.tar.gz D="encode_Yale_MAS_nimbleprot_31May2005" fold -s -w72 ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Tars-FWD_Maskless36merevery36bp.html > \ encodeYaleMASNB4RNANProtTarsFWDMless36mer36bp.html fold -s -w72 ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Tars-REV_Maskless36merevery36bp.html > \ encodeYaleMASNB4RNANProtTarsREVMless36mer36bp.html # fold -s -w72 ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Transcript-Map-FWD_Maskless36merevery36bp.html > \ encodeYaleMASNB4RNANprotTMFWDMless36mer36bp.html fold -s -w72 ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Transcript-Map-REV_Maskless36merevery36bp.html > \ encodeYaleMASNB4RNANprotTMREVMless36mer36bp.html # fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Tars-FWD_Maskless36merevery36bp.html > \ encodeYaleMASPlacRNANprotTarsFWDMless36mer36bp.html fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Tars-REV_Maskless36merevery36bp.html > \ encodeYaleMASPlacRNANprotTarsREVMless36mer36bp.html # fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Transcript-Map-FWD_Maskless36merevery36bp.html > \ encodeYaleMASPlacRNANprotTMFWDMless36mer36bp.html fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Transcript-Map-REV_Maskless36merevery36bp.html > \ encodeYaleMASPlacRNANprotTMREVMless36mer36bp.html #### ln -s \ ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Tars_FWD.Maskless36merevery36bp.bed \ encodeYaleMASNB4RNANProtTarsFWDMless36mer36bp.bed ln -s \ ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Tars_REV.Maskless36merevery36bp.bed \ encodeYaleMASNB4RNANProtTarsREVMless36mer36bp.bed # ln -s \ ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Transcript-Map-FWD_Maskless36merevery36bp.wig \ 
encodeYaleMASNB4RNANprotTMFWDMless36mer36bp.wig ln -s \ ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Transcript-Map-REV_Maskless36merevery36bp.wig \ encodeYaleMASNB4RNANprotTMREVMless36mer36bp.wig # ln -s ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Tars_FWD.Maskless36merevery36bp.bed \ encodeYaleMASPlacRNANprotTarsFWDMless36mer36bp.bed ln -s ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Tars_REV.Maskless36merevery36bp.bed \ encodeYaleMASPlacRNANprotTarsREVMless36mer36bp.bed # ln -s ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Transcript-Map-FWD_Maskless36merevery36bp.wig \ encodeYaleMASPlacRNANprotTMFWDMless36mer36bp.wig ln -s ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Transcript-Map-REV_Maskless36merevery36bp.wig \ encodeYaleMASPlacRNANprotTMREVMless36mer36bp.wig # load the tables for T in encodeYaleMASNB4RNANProtTarsFWDMless36mer36bp \ encodeYaleMASNB4RNANProtTarsREVMless36mer36bp \ encodeYaleMASPlacRNANprotTarsREVMless36mer36bp \ encodeYaleMASPlacRNANprotTarsFWDMless36mer36bp do grep "^chr" ${T}.bed | sort -k1,1 -k2,2n | \ hgLoadBed -strict hg16 ${T} stdin done # Combine these four tables into the existing composite: # encodeYaleMASPlacRNATars # Yale MAS TAR - Yale MAS array, RNA Transcriptionally Active Regions, # Placenta, Nimblegen Protocol for T in encodeYaleMASNB4RNANprotTMFWDMless36mer36bp \ encodeYaleMASNB4RNANprotTMREVMless36mer36bp \ encodeYaleMASPlacRNANprotTMFWDMless36mer36bp \ encodeYaleMASPlacRNANprotTMREVMless36mer36bp do grep "^chr" ${T}.wig | sort -k1,1 -k2,2n | \ hgLoadBed -bedGraph=4 -strict hg16 ${T} stdin done # Combine these four tables into the existing composite: # encodeYaleMASPlacRNATransMap # Yale MAS RNA - Yale MAS array, RNA Transcript Map, Placenta, # Nimblegen Protocol ####################################################################### # Boston University ORChID track - (DONE - 2005-06-09 - Hiram) # data developer contact: Jay Greenbaum jj@bu.edu ssh hgwdev cd /cluster/data/encode/BU/orchid/2005-06-09 wget --timestamping 
"http://orchid.bu.edu/encode/tracks/description.html" \ -O description.html wget --timestamping "http://orchid.bu.edu/encode/tracks/oh_cleavage.wig" \ -O oh_cleavage.wig # It turns out this oh_cleavage.wig file has data for three # tracks in it, and all the data is precisely exactly identically # the same. So, only one of the tracks is needed. # Use this perl script to split up this data into the three # data files: t0 t1 t2 ./splitTracks.pl oh_cleavage.wig # encode one of the data files: mkdir wib wigEncode t0 encodeBu_ORChID1.wig wib/encodeBu_ORChID1.wib # and load hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Bu/2005-06-09 hg16 \ encodeBu_ORChID1 encodeBu_ORChID1.wig mkdir -p /gbdb/hg16/encode/Bu/2005-06-09/wib ln -s `pwd`/wib/encodeBu_ORChID1.wib /gbdb/hg16/encode/Bu/2005-06-09/wib # trackDb entry is for: track encodeBu_ORChID1 # shortLabel BU ORChID # longLabel Boston University ORChID # type wig -0.56 1.58 # maxHeightPixels 128:36:16 # windowingFunction Mean # color 44,44,200 # altColor 200,44,44 # viewLimits 0.22:0.5 # autoScale Off # STANFORD DNA METHYLATION, RTPCR AND CHIP (2005-06-20 angie) # RTPCR: cd /cluster/data/encode/stanford/2005-05-31/ tail +2 lab/stanford_rtpcr \ | perl -wpe 'chomp; @w = split("\t"); \ $w[1] =~ s/\>/gt/; $w[1] =~ s/(EN\w\d+) /$1_/; \ $score = int(300 + ($w[6] / 10)); $score = 1000 if ($score > 1000); \ $_ = "$w[2]\t$w[3]\t$w[4]\t$w[1]\t$score\t$w[5]\t$w[6]\n";' \ | hgLoadBed hg16 \ -noBin -sqlTable=$HOME/kent/src/hg/lib/encodeStanfordRtPcr.sql \ encodeStanfordRtPcr stdin # Methylation: cd /cluster/data/encode/stanford/2005-05-31/ set scoreColumn = 5 foreach cellType (Be2C CRL1690 HCT116 HT1080 HepG2 JEG3 Snu182 U87) tail +2 lab/stanford_meth \ | awk '$'$scoreColumn' != "outlier" { \ print $2 " " $3-1 " " $4 " " $'$scoreColumn';}' \ | hgLoadBed -bedGraph=4 hg16 encodeStanfordMeth$cellType stdin set scoreColumn = `expr $scoreColumn + 1` end set pri = 1 foreach cellType (Be2C CRL1690 HCT116 HT1080 HepG2 JEG3 Snu182 U87) echo track 
encodeStanfordMeth$cellType echo subTrack encodeStanfordMeth echo shortLabel Stan Meth $cellType echo longLabel Stanford Methylation \($cellType cells\) echo priority $pri echo "" set pri = `expr $pri + 1` end ssh kolossus cd /cluster/data/encode/stanford/2005-05-31/ perl -we 'while (<>) { \ if (/^track name=(\S+)/) { \ $track = "encodeStanfordMethSmoothed$1"; \ close(OUT); open(OUT, ">$track.bed") || die; \ } else { \ @w = split; print OUT "$w[0]\t$w[1]\t$w[2]\t$w[4]\n"; \ } }' lab/stanford_meth.bed sort -k4n,4n encodeStanfordMethSmoothed*.bed | head sort -k4n,4n encodeStanfordMethSmoothed*.bed | tail # range of scores: 0 to 364088 # back on hgwdev: foreach f (encodeStanfordMethSmoothed*.bed) hgLoadBed -bedGraph=4 hg16 $f:r $f end # ChIP/chip: cd /cluster/data/encode/stanford/2005-05-31/ set scoreColumn = 5 foreach specifier (HCT116Sp1 HCT116Sp3 JurkatSp1 JurkatSp3 K562Sp1 K562Sp3) tail +2 lab/stanford_sp_ChIP \ | sed -e 's/chr0/chr/;' \ | awk '$'$scoreColumn' != "outlier" { \ print $2 " " $3-1 " " $4 " " $'$scoreColumn';}' \ | hgLoadBed -bedGraph=4 hg16 encodeStanfordChip$specifier stdin set scoreColumn = `expr $scoreColumn + 1` end set pri = 1 foreach specifier (HCT116Sp1 HCT116Sp3 JurkatSp1 JurkatSp3 K562Sp1 K562Sp3) set cellType = `echo $specifier | perl -wpe 's/(Sp1|Sp3)//'` set factor = `echo $specifier | perl -wpe 's/(HCT116|Jurkat|K562)//'` echo track encodeStanfordChip$specifier echo subTrack encodeStanfordChip echo shortLabel Stan $cellType $factor echo longLabel Stanford ChIP/chip \($cellType cells, $factor ChIP\) echo priority $pri echo "" set pri = `expr $pri + 1` end ssh kolossus cd /cluster/data/encode/stanford/2005-05-31/ perl -we 'while (<>) { \ if (/^track name=(\S+)/) { \ $track = "encodeStanfordChipSmoothed$1"; \ $track =~ s@_ChIP/total@@; $track =~ s@_@@; \ close(OUT); open(OUT, ">$track.bed") || die $!; \ } else { \ @w = split; print OUT "$w[0]\t$w[1]\t$w[2]\t$w[4]\n"; \ } }' lab/stanford_sp_ChIP.bed sort -k4n,4n 
encodeStanfordChipSmoothed*.bed | head sort -k4n,4n encodeStanfordChipSmoothed*.bed | tail # range of scores: 0 to 721474 # back on hgwdev: foreach f (encodeStanfordChipSmoothed*.bed) hgLoadBed -bedGraph=4 hg16 $f:r $f end rm bed.tab # SANGER GENOTYPE/EXPRESSION ASSOCIATION (angie 6/25/05) cd /cluster/data/encode/sanger/genotypeExpressionAssociation sed -e 's/bed5FloatScore/encodeSangerGenoExprAssociation/' \ $HOME/kent/src/hg/lib/bed5FloatScore.sql > /tmp/tmp.sql grep --no-filename -v encode_region sanger*.txt \ | awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", \ $1, $2, $3, $6, (250 + ($4 * 375)), $4}' \ | hgLoadBed hg16 encodeSangerGenoExprAssociation stdin \ -sqlTable=/tmp/tmp.sql rm bed.tab ########################################################################## # Indels from Jim Mullikin # Heather, June 2005 ssh hgwdev cd /cluster/data/encode/NHGRI/mullikin hgsql hg16 < encodeIndels.sql split4.pl < hg16.ENCODE.DIPtrack.Q23.bed4+ > split4.out # use a modified makeColoredBed ./makeColoredBed < split4.out > encodeIndels.bed # don't use -strict because we have lots of simple insertions (where chromStart = chromEnd) hgLoadBed hg16 encodeIndels -tab -sqlTable=encodeIndels.sql encodeIndels.bed # check reference length mysql> select chrom, chromStart, chromEnd, (chromEnd-chromStart) as size, traceName, reference, length(reference) as refsize from encodeIndels where (chromEnd-chromStart) != length(reference) and length(reference) > 1; # Empty set (0.07 sec) ########################################################################## # Recombination Rates from Gil McVean # Heather, June 2005 # 10 ENCODE regions that were resequenced: # ENr112 (chr2) # ENr131 (chr2) # ENr113 (chr4) # ENm010 (chr7) # ENm013 (chr7) # ENm014 (chr7) # ENr321 (chr8) # ENr232 (chr9) # ENr123 (chr12) # ENr213 (chr18) # units are cM/Mb # extra precision in input files ssh hgwdev cd /cluster/data/encode/oxford/mcvean # a few notes on the data: had to add "chr" and flip start/end fix.pl < 
ENCODE_release16.txt.2 > ENCODE_release16.bed hgLoadBed -strict -sqlTable=bedGraph.sql hg16 encodeRecomb ENCODE_release16.bed ########################################################################## # Recombination Rates from Gil McVean # Daryl, July 15, 2005 ## data from Chris Spencer and GilMcVean in Oxford cd /cluster/data/encode/oxford/mcvean tail +2 hotspot.txt | sed 's/p/\t/;s/q/\t/;s/ /\t/' | awk '{printf "chr%s\t%d\t%d\n", $1, $3-1, $3}' > hotspot.bed hgLoadBed hg16 encodeRecombHotspot hotspot.bed ########################################################################## # HapMap SNPs # Heather, June 2005 # 10 ENCODE regions that were resequenced: # ENr112 (chr2) # ENr131 (chr2) # ENr113 (chr4) # ENm010 (chr7) # ENm013 (chr7) # ENm014 (chr7) # ENr321 (chr8) # ENr232 (chr9) # ENr123 (chr12) # ENr213 (chr18) ssh hgwdev cd /cluster/data/encode/hapmap/16c.1/alleleFrequencies ./formatAll.csh #!/bin/csh foreach pop (CEU CHB JPT YRI) rm -f allele_freqs_$pop.bed zcat allele_freqs_EN*${pop}* | awk -f split.awk > allele_freqs_$pop.bed end !/chrom/ {printf "%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%d\n",tolower($2),$3-1,$3,$1,500+(1000*($12>$15?$15:$12)),$4,$6,$11,$14,$12,$15,($12>$15?$15:$12),$17;} hgLoadBed -strict -sqlTable=encodeHapMapAlleleFreqCEU.sql hg16 encodeHapMapAlleleFreqCEU allele_freqs_CEU.bed hgLoadBed -strict -sqlTable=encodeHapMapAlleleFreqCHB.sql hg16 encodeHapMapAlleleFreqCHB allele_freqs_CHB.bed hgLoadBed -strict -sqlTable=encodeHapMapAlleleFreqJPT.sql hg16 encodeHapMapAlleleFreqJPT allele_freqs_JPT.bed hgLoadBed -strict -sqlTable=encodeHapMapAlleleFreqYRI.sql hg16 encodeHapMapAlleleFreqYRI allele_freqs_YRI.bed ########################################################################## # HapMap resequencing coverage # Daryl, July 11, 2005 foreach pop (CEU CHB JPT YRI) grep $pop coverage.bed|awk '{printf "chr%s\t%d\t%d\t%s\t%d\n",$2,$3-1,$3,$1,$4}'|cut -f1-3,5>encodeHapMapCov$pop.bedGraph end grep CHB coverage.bed|awk '{printf 
"chr%s\t%d\t%d\t%s\t%d\n",$2,$3-1,$3,$1,$4}'|cut -f1-3,5>encodeHapMapCovHCB.bedGraph grep JAP coverage.bed|awk '{printf "chr%s\t%d\t%d\t%s\t%d\n",$2,$3-1,$3,$1,$4}'|cut -f1-3,5>encodeHapMapCovJPT.bedGraph foreach pop (CEU CHB JPT YRI) nice time hgLoadBed -bedGraph=4 -strict hg16 encodeHapMapCovBedGraph$pop encodeHapMapCov$pop.bedGraph > & load$pop.log end set wibPath = /gbdb/hg16/encode/sanger/wib foreach pop (CEU CHB JPT YRI) time wigEncode encodeHapMapCov$pop.bedGraph encodeHapMapCov$pop.wig encodeHapMapCov$pop.wib rm -f $wibPath/encodeHapMapCov$pop.wib ln -s `pwd`/encodeHapMapCov$pop.wib $wibPath chmod o+r . encodeHapMapCov$pop.wib time hgLoadWiggle -pathPrefix=$wibPath hg16 encodeHapMapCov$pop encodeHapMapCov$pop.wig end ########################################################################## # ENCODE Regulome # Heather, June 2005 ssh hgwdev # I am hand spliting these files on account of being rushed # Could have used /cluster/data/encode/BU/2005-06-09/splitTracks.pl cd /cluster/data/encode/Regulome/sens # waiting for more granular data from John Stam # this is wiggle data # using /gbdb/hg16/encode/Regulome/june2005 # John Stam told me amplicons are the same for all cell lines # this is bed 5 cd /cluster/data/encode/Regulome/2005-06-12/amplicons/odd grep -h "^chr" *CACO2.txt > odd.bed hgLoadBed -strict -sqlTable=odd.sql hg16 encodeRegulomeAmpliconOdd odd.bed cd /cluster/data/encode/Regulome/2005-06-12/amplicons/even grep -h "^chr" *CACO2.txt > even.bed hgLoadBed -strict -sqlTable=even.sql hg16 encodeRegulomeAmpliconEven even.bed cd /cluster/data/encode/Regulome/2005-06-12/prob # this is bedGraph grep -h "^chr" *CACO2.txt > CACO2.bed grep -h "^chr" *GM06900.txt > GM06900.bed grep -h "^chr" *SKNSH.txt > SKNSH.bed grep -h "^chr" *ERY.txt > ERY.bed grep -h "^chr" *K562.txt > K562.bed hgLoadBed -strict -sqlTable=encodeRegulomeProbCACO2.sql hg16 encodeRegulomeProbCACO2 CACO2.bed hgLoadBed -strict -sqlTable=encodeRegulomeProbGM06990.sql hg16 
encodeRegulomeProbGM06990 GM06990.bed hgLoadBed -strict -sqlTable=encodeRegulomeProbSKNSH.sql hg16 encodeRegulomeProbSKNSH SKNSH.bed hgLoadBed -strict -sqlTable=encodeRegulomeProbERY.sql hg16 encodeRegulomeProbERY ERY.bed hgLoadBed -strict -sqlTable=encodeRegulomeProbK562.sql hg16 encodeRegulomeProbK562 K562.bed cd /cluster/data/encode/Regulome/2005-06-12/quality # this is bed 5 grep -h "^chr" *CACO2.txt > CACO2.bed grep -h "^chr" *GM06900.txt > GM06900.bed grep -h "^chr" *SKNSH.txt > SKNSH.bed grep -h "^chr" *ERY.txt > ERY.bed grep -h "^chr" *K562.txt > K562.bed hgLoadBed -strict -sqlTable=encodeRegulomeQualityCACO2.sql hg16 encodeRegulomeQualityCACO2 CACO2.bed hgLoadBed -strict -sqlTable=encodeRegulomeQualityGM06990.sql hg16 encodeRegulomeQualityGM06990 GM06990.bed hgLoadBed -strict -sqlTable=encodeRegulomeQualitySKNSH.sql hg16 encodeRegulomeQualitySKNSH SKNSH.bed hgLoadBed -strict -sqlTable=encodeRegulomeQualityERY.sql hg16 encodeRegulomeQualityERY ERY.bed hgLoadBed -strict -sqlTable=encodeRegulomeQualityK562.sql hg16 encodeRegulomeQualityK562 K562.bed ## Last Minute data fixes and additions ## Daryl; July 10, 2005 cd /cluster/data/encode/Regulome/2005-07-08/old sed 's/XXX/HepG2/' ~/kent/src/hg/lib/encodeRegulomeProb.sql > encodeRegulomeProbHepG2.sql sed 's/XXX/HepG2/' ~/kent/src/hg/lib/encodeRegulomeQuality.sql > encodeRegulomeQualityHepG2.sql splitTracks.pl stdin < ENm003_QCP_HepG2.txt grep -vh track t0 | hgLoadBed -strict -sqlTable=encodeRegulomeProbHepG2.sql hg16 encodeRegulomeProbHepG2 stdin grep -vh track t3 | hgLoadBed -strict -sqlTable=encodeRegulomeQualityHepG2.sql hg16 encodeRegulomeQualityHepG2 stdin sed 's/XXX/Huh7/' ~/kent/src/hg/lib/encodeRegulomeProb.sql > encodeRegulomeProbHuh7.sql sed 's/XXX/Huh7/' ~/kent/src/hg/lib/encodeRegulomeQuality.sql > encodeRegulomeQualityHuh7.sql cat EN*_QCP_Huh7.txt | splitTracks.pl stdin grep -vh track t0 t5 t10 t15 | hgLoadBed -strict -sqlTable=encodeRegulomeProbHuh7.sql hg16 encodeRegulomeProbHuh7 stdin grep -vh 
track t3 t8 t13 t18 | hgLoadBed -strict -sqlTable=encodeRegulomeQualityHuh7.sql hg16 encodeRegulomeQualityHuh7 stdin cd /cluster/data/encode/Regulome/2005-07-08/new foreach cellLine (CACO2 GM06990 HepG2 Huh7 K562 SKNSH Adult_Erythroblast) set f = encodeRegulomeBase$cellLine cat EN*_QCP_${cellLine}_Baseline.wig | wigEncode stdin $f.wig $f.wib ln -s `pwd`/$f.wib /gbdb/hg16/encode/Regulome/wib hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Regulome/wib hg16 $f $f.wig end ########################################################################## # ENCODE RNA transcription compilation (Matt Weirauch and Todd Lowe) # Daryl and Matt Weirauch, July 8, 2005 (reloaded with new evoFold scores on July 10 and again on July12) cd /cluster/data/encode/ucsc/lowe hgLoadBed -sqlTable=/cluster/home/daryl/kent/src/hg/lib/encodeRna.sql hg16 encodeRna encodeRna.bed -strict ########################################################################## # TBA23 EVOFOLD - RNA secondary structure predictions based on TBA23 (Jakob Skou Pedersen) # Jakob Skou Pedersen, July 12, 2005 # # The evofold program is still in development and the procedures used # to generate the data are likely to change some over the next few # months. The documentation of the data generation is therefore # skipped and we start with a data file provided by me. ssh -C hg17 mkdir -p /cluster/data/encode/evofold cd /cluster/data/encode/evofold cp /cluster/home/jsp/data/rnass/genome-scan/vertebrate/encode/tba23/tba_folds.bed tbaFolds.bed # tbaFolds.bed is a 9-column bed file: column 1-6 provide standard # bed information, column 7 is element length, column 8 is the RNA # secondary structure in parentheses format, and column nine is a # commaseparated list of position specific confidence scores # (floats). 
# do a little table swapping to use existing sql file echo "RENAME TABLE evofold to jsp_tmp" | hgsql hg16 hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg16 evofold tbaFolds.bed echo "RENAME TABLE evofold to encodeTba23EvoFold" | hgsql hg16 echo "RENAME TABLE jsp_tmp to evofold" | hgsql hg16 ########################################################################## # ENCODE HapMap Primate Alleles # Daryl, July 1, 2005 ########################################################################## ## data acquisition, reformatting, cleaning, and checking ## run most of this on local disk ## get the data wget ftp://www.hapmap.org/frequencies/2005-06_16c.1_phaseI/ENCODE/non-redundant/allele_freqs_EN*.txt.gz.... ## reformat the data and remove misformatted lines formatAll.csh #!/bin/csh foreach pop (CEU CHB JPT YRI) rm -f allele_freqs_$pop.bed zcat allele_freqs/allele_freqs_EN*${pop}* | awk -f split.awk > allele_freqs_$pop.bed end rm -f alleles.bed awk '$0 !~ /\?/ {printf "%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\n",$1,$2,$3,$4,$6,$8,$9}' allele_freqs_*.bed | sort -u > alleles.bed rm -f hg16.bed removeErrors.pl < alleles.bed #!/usr/bin/perl -W %line=();%dups=(); ### collect the input data from the alleles.bed file while (<>) { chomp(); @f=split; $key="$f[0].$f[1].$f[2]"; $val="$f[3]\t$f[4]\t$f[5]\t$f[6]\t$f[7]"; if (defined $line{$key}) { $val .= "\t$line{$key}"; $dups{$key} = "$val"; } $line{$key}=$val; } ### write the output file of unique information for each position open(OUT,">alleles.out"); foreach $key (sort keys %line) { @f=split /\t/, $line{$key}; @k=split /\./, $key; printf OUT "$k[0]\t$k[1]\t$k[2]\t$f[0]\t$f[1]\t$f[2]\t$f[3]\t$f[4]\n"; } ### write a file of observed errors (duplications); "?" alleles have been removed before this step. 
open(ERR,">alleles.err"); foreach $key (sort keys %dups) { @k=split /\./, $key; printf ERR "$k[0]\t$k[1]\t$k[2]\t$dups{$key}\n"; } ## QA -- counts should be the same, strand should be "+ -"; 4 nucleotides should be shown wc -l alleles.* cut -f 1-3 alleles.out | sort -u | wc -l cut -f 1-4 alleles.out | sort -u | wc -l cut -f 4 alleles.out | sort -u | wc -l cut -f 6 alleles.out | sort -u | xargs echo "strand " cut -f 7 alleles.out | sort -u | xargs echo "ref " cut -f 8 alleles.out | sort -u | xargs echo "other " ########################################################################## ## Start the real work ... lift to the other assemblies ## requires local over.chain files or symlinks ln -s alleles.out hg16.bed liftOver hg16.bed hg16ToPanTro1.over.chain panTro1.bed hg16toPanTro1.unmapped grep -v "#" hg16toPanTro1.unmapped > hg16toPanTro1.unmapped.bed wc -l hg16.bed panTro1.bed hg16toPanTro1.unmapped.bed liftOver hg16.bed hg16ToHg17.over.chain hg17.bed hg16toHg17.unmapped grep -v "#" hg16toHg17.unmapped > hg16toHg17.unmapped.bed wc -l hg16.bed hg17.bed hg16toHg17.unmapped.bed liftOver hg17.bed hg17ToRheMac1.over.chain rheMac1.bed hg17toRheMac1.unmapped grep -v "#" hg17toRheMac1.unmapped > hg17toRheMac1.unmapped.bed wc -l hg17.bed rheMac1.bed hg17toRheMac1.unmapped.bed ########################################################################## ## Now that the positions are available, get the bases at those positions ## requires nib files on /scratch nice getPanTro1Base.pl < panTro1.bed > panTro1Base.bed #13.440u 29.590s 1:30.91 47.3% 0+0k 0+0io 2333377pf+0w #!/usr/bin/perl -W $nibPath="/scratch/panTro1/nib"; $foo1=$foo2=@bar=""; while (<>) { chomp(); $seq="#"; ($chrom, $chromStart, $chromEnd, $foo1, $foo2, $strand, @bar) = split /\t/; $cmd="/cluster/home/daryl/bin/i386/nibFrag $nibPath/$chrom.nib $chromStart $chromEnd $strand stdout"; open(RESULT, "$cmd |") or die "can't start nibFrag:\n$cmd\n"; <RESULT>; $seq=<RESULT>; #assumes a single line print "$_\t"; print "$seq"; 
close(RESULT); } ## requires 2bit file on /scratch nice getRheMac1Base.pl < rheMac1.bed > rheMac1Base.bed #5571.970u 501.900s 1:49:14.10 92.6% 0+0k 0+0io 2455781pf+0w #!/usr/bin/perl -W $twoBitPath="/cluster/data/rheMac1/rheMac1.2bit"; @bar=""; open(TMP,"<rheMac1.bed"); while ($line = <TMP>) { chomp($line); $seq="."; ($chrom, $chromStart, $chromEnd, @bar) = split /\t/, $line; $cmd="twoBitToFa ${twoBitPath}:${chrom}:${chromStart}-${chromEnd} stdout"; open(RESULT, "$cmd |") or die "can't start twoBitToFa:\n$cmd\n"; while (<RESULT>) { $seq = "$_"; } if ($seq =~ /^\#/) { $seq = ".\n"; } if ($line =~ /-/) { $seq =~ tr/acgtACGT/tgcaTGCA/; } print "$line\t$seq"; close(RESULT); } ########################################################################## ## Get the qual scores to go with the bases ## run on hgwdev -- requires the database nice getOneQual.pl panTro1 < panTro1Base.bed > panTro1BaseQual.bed #42.530u 95.540s 7:19.14 31.4% 0+0k 0+0io 4260405pf+0w #!/usr/bin/perl -W $db = shift; $foo=@bar=""; while ($bed = <>) { chomp($bed); $score="0"; ($chrom, $foo, $chromEnd, @bar) = split /\t/, $bed; $cmd="hgWiggle -db=$db -position=$chrom:$chromEnd-$chromEnd -rawDataOut quality"; open(RESULT, "$cmd |") or die "can't start '$cmd'\n"; while ($line = <RESULT>) { if ($line =~ /^\#/) { next; } $score=int($line+.5); } print "$bed\t$score\n"; } nice getOneQual.pl rheMac1 < rheMac1Base.bed > rheMac1BaseQual.bed #35.170u 69.300s 5:37.85 30.9% 0+0k 0+0io 4043288pf+0w ########################################################################## ## Put it all together ## run on local machine nice mergeResults.pl hg16.bed panTro1BaseQual.bed rheMac1BaseQual.bed hg16.panTro1.rheMac1.bed #!/usr/bin/perl -W if (scalar (@ARGV) != 4) {die "usage: mergeResults.pl hg16.bed panTro1BaseQual.bed rheMac1BaseQual.bed hg16.panTro1.rheMac1.bed"} $hf = shift; $cf = shift; $rf = shift; $of = shift; %c=(); %r=(); $sameC=$sameR=$sameB=$diffC=$diffR=$diffB=$missC=$missR=$missB=0; open(H,"<$hf") or die "can't open $hf"; open(C,"<$cf") or die "can't open $cf"; 
open(R,"<$rf") or die "can't open $rf"; open(O,">$of") or die "can't open $of"; print O "rs# humanChrom humanPos humanStrand refallele otherallele chimpChrom chimpPos chimpStrand chimpAllele chimpQual rhesusChrom rhesusPos rhesusStrand rhesusAllele rhesusQual\n"; while (<C>) { chomp(); if (/\s(rs\d+)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } elsif (/\s(BCM-HGSC\S*)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } elsif (/\s(JCM\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } else { print "Chimp ERR: $_\n"; } } while (<R>) { chomp(); if (/\s(rs\d+)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } elsif (/\s(BCM-HGSC\S*)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } elsif (/\s(JCM\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } else { print "Rhesus ERR: $_\n"; } } while (<H>) { chomp(); if (/\s(rs\d+)\s/) { $rsId=$1; $rsId =~ s/ //g; } elsif (/\s(BCM-HGSC\S*)\s/) { $rsId=$1; $rsId =~ s/ //g; } elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; } elsif (/\s(JCM\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; } else { print "Human ERR: $_\n"; } $Hchrom=$HchromStart=$HchromEnd=$Hname=$Hscore=$Hstrand=$Hnuc1=$Hnuc2="."; $Cchrom=$CchromStart=$CchromEnd=$Cname=$Cscore=$Cstrand=$Cnuc1=$Cnuc2=$Cnuc=$Cqual=$C="."; $Rchrom=$RchromStart=$RchromEnd=$Rname=$Rscore=$Rstrand=$Rnuc1=$Rnuc2=$Rnuc=$Rqual=$R="."; ($Hchrom, $HchromStart, $HchromEnd, $Hname, $Hscore, $Hstrand, $Hnuc1, $Hnuc2) = split /\t/, $_; if (defined $c{$rsId}) { $C = $c{$rsId}; ($Cchrom, $CchromStart, $CchromEnd, $Cname, $Cscore, $Cstrand, $Cnuc1, $Cnuc2, $Cnuc, $Cqual) = split /\t/, $C; if (!defined($Cqual)) { $Cqual=".";} if (!defined($Cnuc)) { $Cnuc=".";} } else { $Cchrom=$CchromStart=$CchromEnd=$Cname=$Cscore=$Cstrand=$Cnuc1=$Cnuc2=$Cnuc=$Cqual=$C="."; } if (defined $r{$rsId}) { $R = $r{$rsId}; ($Rchrom, $RchromStart, $RchromEnd, $Rname, $Rscore, 
$Rstrand, $Rnuc1, $Rnuc2, $Rnuc, $Rqual) = split /\t/, $R; if (!defined($Cqual)) { print "$_\n"; $Rqual=".";}} else { $Rchrom=$RchromStart=$RchromEnd=$Rname=$Rscore=$Rstrand=$Rnuc1=$Rnuc2=$Rnuc=$Rqual=$R="."; } print O "$Hname $Hchrom $HchromEnd $Hstrand $Hnuc1 $Hnuc2 "; print O "$Cchrom $CchromEnd $Cstrand $Cnuc $Cqual "; print O "$Rchrom $RchromEnd $Rstrand $Rnuc $Rqual\n"; if (!(defined $Cnuc) || $Cnuc eq ""|| $Cnuc eq ".") { $Cnuc="x"; $missC++; } if (!(defined $Rnuc) || $Rnuc eq ""|| $Rnuc eq ".") { $Rnuc="x"; $missR++; } if ( $Cnuc eq "x" && $Rnuc eq "x") { $missB++; } else { $Hnuc1 =~ tr/a-z/A-Z/; $Hnuc2 =~ tr/a-z/A-Z/; $Cnuc =~ tr/a-z/A-Z/; if ( ($Cnuc eq $Hnuc1) || ($Cnuc eq $Hnuc2) ) { $sameC++; } elsif ($Cnuc ne "x"){$diffC++;} if ( ($Rnuc eq $Hnuc1) || ($Rnuc eq $Hnuc2) ) { $sameR++; } elsif ($Rnuc ne "x"){$diffR++;} if ((($Cnuc eq $Hnuc1) || ($Cnuc eq $Hnuc2)) && (($Rnuc eq $Hnuc1) || ($Rnuc eq $Hnuc2))) { $sameB++; } if ((($Cnuc ne $Hnuc1) && ($Cnuc ne $Hnuc2)) && (($Rnuc ne $Hnuc1) || ($Rnuc ne $Hnuc2)) && ( $Cnuc ne "x" && $Rnuc ne "x")) { $diffB++; } } } ########################################################################## ## Get summary results countDiffs.pl hg16.bed panTro1Base.bed rheMac1Base.bed #!/usr/bin/perl -W if (scalar (@ARGV) != 3){die "usage: countDiffs.pl hg16.bed panTro1Base.bed rheMac1Base.bed"} $hf = shift; $cf = shift; $rf = shift; open(H,"<$hf") or die "can't open $hf"; open(C,"<$cf") or die "can't open $cf"; open(R,"<$rf") or die "can't open $rf"; %c=(); %r=(); $Hcount=0; while (<C>) { chomp(); if (/\s(rs\d+)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } elsif (/\s(BCM-HGSC\S*)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } elsif (/\s(JCM\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } else { print "Chimp ERR: $_\n"; } } while (<R>) { chomp(); if (/\s(rs\d+)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } elsif (/\s(BCM-HGSC\S*)\s/) { $rsId=$1; 
$rsId =~ s/ //g; $r{$rsId}=$_; } elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } elsif (/\s(JCM\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } else { print "Chimp ERR: $_\n"; } } $Csame=$Cdiff=$Cmiss=$Rsame=$Rdiff=$Rmiss=0; $Hchrom=$HchromStart=$HchromEnd=$Hname=$Hscore=$Hstrand=$Hnuc1=$Hnuc2="."; while (<H>) { $Hcount++; chomp(); $C=$R="."; if (/\s(rs\d+)\s/) { $rsId=$1; $rsId =~ s/ //g; } elsif (/\s(BCM-HGSC\S*)\s/) { $rsId=$1; $rsId =~ s/ //g; } elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; } elsif (/\s(JCM\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; } else { print "Human ERR: $_\n"; } if (defined $c{$rsId}) { ($Cchrom,$CchromStart,$CchromEnd,$Cname,$Cscore,$Cstrand,$Cnuc1,$Cnuc2,$Cnuc)=split /\t/,$c{$rsId};} else { $Cchrom=$CchromStart=$CchromEnd=$Cname=$Cscore=$Cstrand=$Cnuc1=$Cnuc2=$Cnuc="."; } if (defined $r{$rsId}) { ($Rchrom,$RchromStart,$RchromEnd,$Rname,$Rscore,$Rstrand,$Rnuc1,$Rnuc2,$Rnuc)=split /\t/,$r{$rsId};} else { $Rchrom=$RchromStart=$RchromEnd=$Rname=$Rscore=$Rstrand=$Rnuc1=$Rnuc2=$Rnuc="."; } ($Hchrom, $HchromStart, $HchromEnd, $Hname, $Hscore, $Hstrand, $Hnuc1, $Hnuc2) = split /\t/, $_; $Hnuc1 =~ tr/a-z/A-Z/; $Hnuc2 =~ tr/a-z/A-Z/; $Cnuc =~ tr/a-z/A-Z/; $Rnuc =~ tr/a-z/A-Z/; if ($Cnuc eq "" || $Cnuc eq ".") { $Cmiss++; } elsif (($Cnuc eq $Hnuc1) || ($Cnuc eq $Hnuc2)) { $Csame++; } else { $Cdiff++;}# print "C $Hnuc1 $Hnuc2 $Cnuc $Rnuc $rsId \n";} if ($Rnuc eq "" || $Rnuc eq ".") { $Rmiss++; } elsif (($Rnuc eq $Hnuc1) || ($Rnuc eq $Hnuc2)) { $Rsame++; } else { $Rdiff++;}# print "R $Hnuc1 $Hnuc2 $Cnuc $Rnuc $rsId \n";} } $Ctotl = $Csame+$Cdiff; $Cfrac = $Csame/$Ctotl; $Rtotl = $Rsame+$Rdiff; $Rfrac = $Rsame/$Rtotl; print "\tChimp\tRhesus\tHuman\n"; print "totl:\t$Ctotl\t$Rtotl\t$Hcount\n"; print "same:\t$Csame\t$Rsame\n"; print "diff:\t$Cdiff\t$Rdiff\n"; printf "frac:\t%.3f\t%.3f\n",$Cfrac,$Rfrac; print "miss:\t$Cmiss\t$Rmiss\n"; ########################################################################## ## 
Reformat once again and load the database awk -f makeTableBed.awk hg16.panTro1.rheMac1.bed > encodePrimateAlleles.bed $0 !~ /Chrom/ {printf "%s\t%d\t%d\t%s\t%d\t%c\t%c\t%c\t%s\t%d\t%c\t%s\t%d\t%s\t%d\t%c\t%c\t%d\n", $2,$3-1,$3,$1,0,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16} hgLoadBed hg16 encodePrimateAlleles encodePrimateAlleles.bed -sqlTable=encodePrimateAlleles.sql -tab -strict ####################################################################### # loading Riken CAGE data - 2005-06-10 - Hiram # completed 2005-07-06 - Hiram # data submitter contact: Albin Sandelin - sandelin@gsc.riken.go.jp # cd /cluster/data/encode/Riken/2005-06-08 tar xvzf CAGE_tracks.tar.gz D="CAGE_tracks" ln -s ${D}/CAGE_MINUS_STRAND_ENCODE.html encodeRikenCageMinus.html ln -s ${D}/CAGE_MINUS_STRAND_ENCODE.wig encodeRikenCageMinus.wig ln -s ${D}/CAGE_PLUS_STRAND_ENCODE.html encodeRikenCagePlus.html ln -s ${D}/CAGE_PLUS_STRAND_ENCODE.wig encodeRikenCagePlus.wig ln -s ${D}/CAGE_mapped_tags.gff encodeRikenCageMappedTags.gff ln -s ${D}/CAGE_mapped_tags.html encodeRikenCageMappedTags.html for T in encodeRikenCageMinus encodeRikenCagePlus do grep "^chr" ${T}.wig | sort -k1,1 -k2,2n | \ hgLoadBed -strict -bedGraph=4 hg16 ${T} stdin done # And some extra test data to help verify those tags are in the # correct location: ./gffToBed6.pl encodeRikenCageMappedTags.gff | \ sort -k1,1 -k2,2n > encodeRikenCageMappedTags.bed6 hgLoadBed -strict hg16 encodeRikenCageMappedTags \ encodeRikenCageMappedTags.bed6 grep "\-$" encodeRikenCageMappedTags.bed6 | sort -k1,1 -k2,2n | \ bedItemOverlapCount hg16 -verbose=2 stdin > negOverlaps.bedGraph4 grep "+$" encodeRikenCageMappedTags.bed6 | sort -k1,1 -k2,2n | \ bedItemOverlapCount hg16 -verbose=2 stdin > posOverlaps.bedGraph4 hgLoadBed -strict -bedGraph=4 hg16 \ encodeRikenCageMappedTagsPositive posOverlaps.bedGraph4 hgLoadBed -strict -bedGraph=4 hg16 \ encodeRikenCageMappedTagsNegative negOverlaps.bedGraph4 ##### - an off by one error was discovered in the 
Minus.wig data # Albin resubmitted data, but then the PLUS data was broken and the Minus # was not correct either. So, apply the following fixup script to the data: cd /cluster/data/encode/Riken/2005-07-04 cat << '_EOF_' > fixup.pl #!/usr/bin/env perl # use strict; use warnings; open (FH,"../2005-06-08/encodeRikenCageMinus.wig") or die "Can not open ../2005-06-08/encodeRikenCageMinus.wig"; while (my $line=<FH>) { chomp $line; if ($line =~ m/^chr/) { my ($chr, $start, $end, $score) = split('\s+', $line); printf "%s\t%d\t%d\t%d\n", $chr, $start-1, $end-1, $score; } else { print "$line\n"; } } close (FH); '_EOF_' # happy emacs chmod +x fixup.pl ./fixup.pl > encodeRikenCageMinus.wig # examine the region: chr4:118,973,382-118,973,413 # to verify that the plus and minus predicted gene start sites line # up properly with the raw Mapped Tags # reload this fixed data for T in encodeRikenCageMinus do grep "^chr" ${T}.wig | sort -k1,1 -k2,2n | \ hgLoadBed -strict -bedGraph=4 hg16 ${T} stdin done # The track's long label: "Riken CAGE - Predicted Gene Start Sites" # Short label: "Riken CAGE" # table names: encodeRikenCagePlus and encodeRikenCageMinus # in the composite track: encodeRikenCage # and the extra TEST TRACK ONLY tables: # composite: encodeRikenCageMappedTagsScore # short label: Riken CAGE MT # long: Riken CAGE Mapped Tags overlap count - TEST TRACK ONLY # subtracks: encodeRikenCageMappedTagsPositive # encodeRikenCageMappedTagsNegative # table: encodeRikenCageMappedTags # Riken CAGE Tags # Riken CAGE Mapped Tags - TEST TRACK ONLY ######################################################################### # Affy 25bp Probe locations (DONE - 2005-07-29 - Hiram) ssh hgwdev mkdir /cluster/data/encode/Affy/2005-07-18 wget --timestamping \ 'http://transcriptome.affymetrix.com/download/ENCODE/affy_june1/soft/RNA/EC_AS_GM06990_RCyP+_C01vsNULL.sig.soft.bz2' \ -O EC_AS_GM06990_RCyP+_C01vsNULL.sig.soft.bz2 bunzip2 EC_AS_GM06990_RCyP+_C01vsNULL.sig.soft.bz2 # After some 
experimentation, it was determined that the # coordinates in this EC* file are actually zero-relative # and thus should be taken at face value. No need to translate # anything. There remains a question of why there was # confusion about whether this was a 1 or 0 relative file. grep "^chr" EC_AS_GM06990_RCyP+_C01vsNULL.sig.soft \ | awk '{print $1}' | sed -e "s/_/ /" | awk ' BEGIN {count=1} { printf "%s\t%d\t%d\t%d\n", $1, $2, $2+25, count; ++count; } ' | sort -k1,1 -k2,2n | gzip > encodeAffyEncode25bpProbes.bed.gz hgLoadBed -strict hg16 encodeAffyEncode25bpProbes \ encodeAffyEncode25bpProbes.bed.gz # Create trackDb entry, was in human/hg16/trackDb.ra later moved # up to human/trackDb.ra as ENCODE moves to hg17: track encodeAffyEncode25bpProbes shortLabel Affy 25bp Probes longLabel Affymetrix 25 bp Probe Locations group encodeTxLevels visibility hide type bed 4 . priority 41 ######################################################################### # Gencode Unannotated Region Classification (DONE - 2005-08-02 - Hiram) # A number of table browser operations are going to be performed. 
# Saving the results into this directory: ssh hgwdev mkdir /cluster/data/encode/analysis/genes_transcripts # Using the table browser on genome-test # clade: Vertebrate, genome: Human, Assembly: July 2003 # track: Gencode Genes, group: encodeGencodeGene, # region: genome # no filter, no intersection, no correlation # output format: "chosen fields from selected and related tables" # output file: genCodeTxStartEnd.gtfLike # file type returned: gzip compressed # press the "get output" button and select the "Linked Tables" # of gencodeGeneClass, press the "Allow Selection from Checked Tables" # and then select fields: # name, chrom, strand, txStart, txEnd, gencodeGeneClass.class # the "get output" will now send you the file: # genCodeTxStartEnd.gtfLike.gz # convert that output to a bed file: zcat genCodeTxStartEnd.gtfLike.gz | awk ' { printf "%s\t%d\t%d\t%s_%s\t0\t%s\n", $2, $4, $5, $1, $6, $3 } ' | sort -k1,1 -k2,2n > genCode.bed # And filter that for the "reference set" of Gencode genes: egrep \ "Known|Novel_CDS|Novel_transcript_gencode_conf|Putative_gencode_conf" \ genCode.bed > referenceGeneExtents.bed # This becomes our filter to find the exons of interest, # Add a custom track line to this file: echo \ 'track name="referenceGencodeExtents" description="reference Gencode gene set, txStart to txEnd"' \ > referenceGeneExtents.bed.ct cat referenceGeneExtents.bed >> referenceGeneExtents.bed.ct # And then load up this file as a custom track on hg16 # Back in the table browser, select Gencode Genes, full genome, # custom track type of output, and ask for # "Exons plus 0 bases at each end" # get custom track in table browser # Now you have two custom tracks, # Intersect those two custom tracks, AND type of intersection, # to produce a bed file referenceGencodeExons.bed # Add a custom track header to this file: echo \ 'track name="referenceGencodeExons" description="Reference Gencode Exons" visibility=3' > referenceGencodeExons.bed.ct cat referenceGencodeExons.bed >> 
referenceGencodeExons.bed.ct # add the referenceGeneExtents.bed.ct to that: cat referenceGeneExtents.bed.ct > customTracks.bed.ct cat referenceGencodeExons.bed.ct >> customTracks.bed.ct # And load that file as a new set of two custom tracks. # Again in the table browser, do an intersection between the # geneExtents track and this Exon track, but invert the Exons # to give you a result that is NOT Exon which == Introns # Complement referenceGencodeExons before intersection/union # AND of referenceGencodeExtents and referenceGencodeExons # Select bed file output, file name: # referenceGencodeIntrons.bed # Create a special custom track that is simply a definition of all # the chrom extents in hg16 to be used for intersection purposes echo \ 'track name="chromDefinitions" description="hg16 chromosome extents, 0 to N"' > hg16.chroms.bed.ct hgsql -N -e "select chrom, size from chromInfo;" hg16 | grep -v _random \ | awk '{ printf "%s\t0\t%d\n", $1, $2}' >> hg16.chroms.bed.ct # Add this to the growing collection of tracks: cat hg16.chroms.bed.ct >> customTracks.bed.ct # And load it up for more intersections. 
# Again in the table browser, intersect the chromDefinitions with # the referenceGencodeExtents # AND of chromDefinitions and referenceGencodeExtents # Complement referenceGencodeExtents before intersection/union # A bed file result of this gives you the Intergenic regions # referenceIntergenicGencode.bed # Now you are ready to calculate the regions for T in referenceGencodeIntrons referenceIntergenicGencode do zcat ${T}.bed.ct.gz | grep -v track | sort -k1,1 -k2,2n | awk ' { if (($3 - $2) > 10000) { printf "%s\t%d\t%d\t%s\n", $1, $2+5000, $3-5000, $4 } } ' > ${T}.distal.bed4 zcat ${T}.bed.ct.gz | grep -v track | sort -k1,1 -k2,2n | awk ' { if (($3 - $2) > 10000) { printf "%s\t%d\t%d\t%s_5\n", $1, $2, $2+5000, $4 printf "%s\t%d\t%d\t%s_3\n", $1, $3-5000, $3, $4 } else { printf "%s\t%d\t%d\t%s_unshaved\n", $1, $2, $3, $4 } } ' > ${T}.proximal.bed4 done # The results of these are the files to load: -rw-rw-r-- 1 23830 Aug 2 16:21 referenceIntergenicGencode.proximal.bed4 -rw-rw-r-- 1 8343 Aug 2 16:21 referenceIntergenicGencode.distal.bed4 -rw-rw-r-- 1 194964 Aug 2 16:21 referenceGencodeIntrons.proximal.bed4 -rw-rw-r-- 1 12032 Aug 2 16:21 referenceGencodeIntrons.distal.bed4 # Load them into database and create trackDb entries: hgLoadBed -strict hg16 encodeGencodeIntronsDistal \ referenceGencodeIntrons.distal.bed4 hgLoadBed -strict hg16 encodeGencodeIntronsProximal \ referenceGencodeIntrons.proximal.bed4 hgLoadBed -strict hg16 encodeGencodeIntergenicDistal \ referenceIntergenicGencode.distal.bed4 hgLoadBed -strict hg16 encodeGencodeIntergenicProximal \ referenceIntergenicGencode.proximal.bed4 # composite track: encodeGencodeRegions # with those four subTracks # track encodeGencodeRegions # shortLabel Gencode Unann # longLabel Gencode Unannotated Region Classification ######################################################################### # Classifing 6 Affy transfrag tracks and 5 Yale TAR tracks into # four location catagories - (DONE - 2005-08-03 - Hiram) # With the 
Gencode Unannotated Region Classification completed above # And loaded into the database, this is partially a tables browser # operation and partially a command line operation. # In the table browser, create an intersection between one of # the four regions, for example IntronsDistal and one of these # eleven tracks, for example encodeAffyRnaGm06990Sites # Note the URL in your WEB browser for that intersected result, # then using your own hgTables binary, run the following loop with # the hgsid from the URL you see you your WEB browser. What's # happening here is that there are some variables that are coming # from your cart, and some being changed on this command line: ssh hgwdev cd /cluster/data/encode/analysis/genes_transcripts mkdir -p fetchedTracks hgTables=$HOME/kent/src/hg/hgTables/hgTables for G in encodeGencodeIntronsDistal encodeGencodeIntronsProximal \ encodeGencodeIntergenicDistal encodeGencodeIntergenicProximal do for Q in encodeAffyRnaGm06990Sites encodeAffyRnaHeLaSites \ encodeAffyRnaHl60SitesHr00 encodeAffyRnaHl60SitesHr02 \ encodeAffyRnaHl60SitesHr08 encodeAffyRnaHl60SitesHr32 do echo "Genome: ${G}" echo "Query: ${Q}" ${hgTables} \ "hgsid=1235494&hgta_printCustomTrackHeaders=on&boolshad.hgta_printCustomTrackHea ders=1&hgta_ctName=tb_${G}&hgta_ctDesc=table+browser+query+on+${G}&hgta_ctVis=pa ck&hgta_ctUrl=&fbQual=whole&fbUpBases=200&fbDownBases=200&hgta_doGetBed=get+BED" \ hgta_table=${G} \ hgta_intersectTable=${Q} | \ sed -e "/^Set-/d; /^Content-/d; /^$/d" > "fetchedTracks/${G}.${Q}.txt" done done # Then, loading those 44 result files: for R in IntergenicDistal IntergenicProximal \ IntronsDistal IntronsProximal do for T in encodeAffyRnaGm06990Sites \ encodeAffyRnaHeLaSites \ encodeAffyRnaHl60SitesHr00 \ encodeAffyRnaHl60SitesHr02 \ encodeAffyRnaHl60SitesHr08 \ encodeAffyRnaHl60SitesHr32 \ encodeYaleAffyNB4RARNATars \ encodeYaleAffyNB4TPARNATars \ encodeYaleAffyNB4UntrRNATars \ encodeYaleAffyNeutRNATarsAll \ encodeYaleAffyPlacRNATars do grep -v 
"^track" fetchedTracks/encodeGencode${R}.${T}.txt | \ sort -k1,1 -k2,2n | hgLoadBed -strict hg16 ${T}${R} stdin done done # Create the trackDb entry for these 44 tables with a single # composite container track: # track encodeNoncodingTransFrags # shortLabel Unann Transfrags # longLabel Yale and Affymetrix Unannotated TransFrags ######################################################################### # Collapse the 44 tables into four tables (DONE - 2005-08-03 - Hiram) # Using the table browser, select 11 of the tables from that track # to use in the subtrack merge, do this four times for each of the # four regions. Merging 11 tables at a time, saving four files: ssh hgwdev cd /cluster/data/encode/analysis/genes_transcripts -rw-rw-r-- 1 29102 Aug 3 15:34 allintergenicDistal.bed.gz -rw-rw-r-- 1 15294 Aug 3 15:38 allintergenicProximal.bed.gz -rw-rw-r-- 1 16319 Aug 3 15:43 allintronsDistal.bed.gz -rw-rw-r-- 1 79296 Aug 3 15:46 allintronsProximal.bed.gz # And loading those four tracks: zcat allintergenicDistal.bed.gz | hgLoadBed -strict hg16 \ encodeAllIntergenicDistal stdin zcat allintergenicProximal.bed.gz | hgLoadBed -strict hg16 \ encodeAllIntergenicProximal stdin zcat allintronsDistal.bed.gz | hgLoadBed -strict hg16 \ encodeAllIntronsDistal stdin zcat allintronsProximal.bed.gz | hgLoadBed -strict hg16 \ encodeAllIntronsProximal stdin # Create trackDb entry for a composite track containing those four: # track encodeWorkshopSelections # shortLabel Consens Unann TF # longLabel ENCODE Consensus Unannotated Transfrags