#!/bin/csh -f
# NOTE: the unconditional "exit" below is the standard make-doc guard --
# this file is a historical record of commands, never meant to be executed.
exit
#############################################################################
# This is the make doc for hg18 ENCODE
#############################################################################

# Changes to ENCODE groups (2007-07-31 kate)

# Change labels for Transcripts and Chrom
hgsql hg18 -e "UPDATE grp SET label='ENCODE Transcription' where name='encodeTxLevels'"
hgsql hg18 -e "UPDATE grp SET label='ENCODE Chromatin Structure' where name='encodeChrom'"

# Merge CompGeno and Var groups (few tracks)
hgsql hg18 -e "UPDATE grp SET label='ENCODE Comparative Genomics and Variation' where name='encodeCompGeno'"
hgsql hg18 -e "DELETE FROM grp where name='encodeVariation'"

# Retire obsolete group
hgsql hg18 -e "DELETE FROM grp where name='encode'"

#############################################################################
# Create encodeRegions table

ssh hgwdev
cd /cluster/data/encode
mkdir convertHg18
# symlinks so the per-assembly names point at the conversion dirs
ln -s convertHg18 hg18
ln -s convertHg17 hg17
cd hg18
# lift the hg17 region definitions to hg18 and load them
hgsql hg17 -N -e "SELECT * FROM encodeRegions ORDER BY name" | \
    liftOver stdin /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
        encodeRegions.bed encodeRegions.unmapped
hgLoadBed hg18 encodeRegions encodeRegions.bed
cp encodeRegions.bed ~/browser/ENCODE/build35_regions.bed
# cvs add, install in /usr/local/apache/htdocs/ENCODE

##########################################################################
# DOWNLOADS (2007-09-21 kate)

ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg18
mkdir -p encode
cd encode

# release terms
cp ../../hg17/encode/README.txt .
# annotation database # request admin set up automated database dump mkdir database # auxiliary data files mkdir datafiles # sequences cd /cluster/data/encode/convertHg18 hgsql hg18 -N -e \ "SELECT name, chrom, chromStart, chromEnd FROM encodeRegions ORDER BY name">regions.txt ssh kolossus cd /cluster/data/encode/convertHg18 mkdir regions cd regions /cluster/data/encode/bin/scripts/encodeSequences.pl -upper \ ../regions.txt /iscratch/i/hg18/nib > hg18.fa /cluster/data/encode/bin/scripts/encodeSequences.pl -masked \ ../regions.txt /iscratch/i/hg18/nib > hg18.msk.fa faSize -detailed hg18.fa > hg18_count.txt gzip *.fa md5sum *.fa.gz > md5sum.txt # copy regions/README.txt from hg17 and edit ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg18/encode ln -s /cluster/data/encode/convertHg18/regions . cp ../../hg17/encode/regions/README.txt regions # edit README ############################################################################## # Lifting rampage (Andy) ssh hgwdev bash cd /cluster/data/encode/convertHg18 /cluster/data/encode/bin/scripts/listEncodeTables.csh hg17 > hg17.tables wc -l hg17.tables #554 hg17.tables (dang) # start with easy beds i.e. the ones like "bed ." grep "bed.*\." hg17.tables > easyBeds.tables grep -v "bed.*\." hg17.tables > remaining.tables wc -l easyBeds.tables #127 easyBeds.tables mkdir easyBeds for fields in 3 4 5 6 9 12; do for table in `grep "bed $fields" easyBeds.tables | cut -f1`; do hgsql hg18 -e "drop table $table" /cluster/data/encode/bin/scripts/convertBedTable.csh hg17 hg18 \ $table $fields >> easyBeds.script.log mv $table.* easyBeds/ done done # still got 4 like "bed ." egrep -v "bed (3|4|5|6|9|12)" easyBeds.tables #encodeYaleChIPSTAT1HeLaBingRenSites encodeChip bed . #encodeYaleChIPSTAT1HeLaMaskLess36mer36bpSite encodeChip bed . #encodeYaleChIPSTAT1HeLaMaskLess50mer38bpSite encodeChip bed . #encodeYaleChIPSTAT1HeLaMaskLess50mer50bpSite encodeChip bed . 
# these are all bed 3 for table in `egrep -v "bed (3|4|5|6|9|12)" easyBeds.tables | cut -f1`; do /cluster/data/encode/bin/scripts/convertBedTable.csh hg17 hg18 \ $table 3 >> easyBeds.script.log mv $table.* easyBeds/ done # ok now there's ones like "bed +" wc -l remaining.tables #427 remaining.tables grep '\+' remaining.tables > plusBed.tables grep -v '\+' remaining.tables > tmp; mv tmp remaining.tables wc -l remaining.tables plusBed.tables # 383 remaining.tables # 44 plusBed.tables # 427 total mkdir plusBeds for fields in 4 5 6 9 12; do for table in `grep "bed $fields" plusBed.tables | cut -f1`; do /cluster/data/encode/bin/scripts/convertBedTable.csh hg17 hg18 \ $table $fields >> plusBeds.script.log mv $table.* plusBeds/ done done # how about bedGraph ones? grep bedGraph remaining.tables > bedGraph.tables grep -v bedGraph remaining.tables > tmp; mv tmp remaining.tables wc -l bedGraph.tables remaining.tables # 186 bedGraph.tables # 197 remaining.tables # 383 total mkdir bedGraph for table in `cut -f1 bedGraph.tables`; do /cluster/data/encode/bin/scripts/convertBedTable.csh hg17 hg18 \ $table 4 >> bedGraph.script.log mv $table.* bedGraph/ done ###################################### # Continue lifting rampage (ting, 06-07-2007) # # Examining remaining.tables # All GIS tables are bed 12, so lift these grep Gis remaining.tables > Gis.tables grep -v Gis remaining.tables > tmp; mv tmp remaining.tables wc -l Gis.tables remaining.tables # 7 Gis.tables # 190 remaining.tables # 197 total mkdir bedGis doGis.csh # 7 tables lifted. # 190 remaining. 
#################################################### # More lifting (Andy) ssh hgwdev bash cd /cluster/data/encode/convertHg18 # genePred tables grep genePred remaining.tables > genePred.tables grep -v genePred remaining.tables > tmp; mv tmp remaining.tables wc -l genePred.tables remaining.tables # 68 genePred.tables # 122 remaining.tables # 190 total mkdir genePred for table in `cut -f1 genePred.tables`; do /cluster/data/encode/bin/scripts/convertGenePredTable.csh hg17 hg18 $table >> genePred.scripts.log; mv $table.* genePred/ done # ERRORS, uh oh # fixed /cluster/data/encode/bin/scripts/convertGenePredTable.csh # binned hg17 tables weren't working right. grep error genePred.scripts.log | sed 's/^.*converting\ \(.*\)\.txt.*$/\1/' > genePredBins.tables for table in `cat genePredBins.tables`; do /cluster/data/encode/bin/scripts/convertGenePredTable.csh hg17 hg18 $table >> genePredBins.scripts.log; mv $table.* genePred/ done # missed bed tables. There's a few like "bed5FloatScore" and "bed 3", etc. # these can be treated as normal beds grep bed remaining.tables | cut -f1,3 | sed 's/bed5FloatScore/bed 5/' > \ bedOther.tables grep -v bed remaining.tables > tmp wc -l bedOther.tables tmp # 14 bedOther.tables # 108 tmp # 122 total mkdir bedOther for fields in 3 4 5; do for table in `grep "bed $fields" bedOther.tables | cut -f1`; do /cluster/data/encode/bin/scripts/convertBedTable.csh hg17 hg18 \ $table $fields >> bedOther.script.log mv $table.* bedOther/ done done # wiggle # first tally up which ones are in which DBs. The older ones can go hg16->hg18 instead # of hg17->hg18. Make three sets of tables and do set operations: hgsql hg16 -e 'show tables' > hg16.all.tables hgsql hg17 -e 'show tables' > hg17.all.tables grep -v wigMaf remaining.tables | cut -f1 > wig.tables grep wigMaf remaining.tables > tmp; mv tmp remaining.tables wc -l wig.tables remaining.tables # 119 wig.tables # 3 remaining.tables # 122 total # OOPS I forgot to subtract the 14 tables from last one. 
# How many of the wiggle tables are in hg17? I hope all 119 grep -Fw -f wig.tables hg17.all.tables | wc -l # 105 # good. Ok how about hg16? grep -Fw -f wig.tables hg16.all.tables | wc -l # 61 # I guess then hg17 should have 44 newer ones. grep -Fw -f wig.tables hg16.all.tables > hg16.wig.tables grep -Fwv -f hg16.wig.tables wig.tables > hg17.wig.tables wc -l *wig.tables # 61 hg16.wig.tables # 44 hg17.wig.tables # 105 wig.tables # Awesome. These two sets shouldn't intersect at all: grep -Fw -f hg16.wig.tables hg17.wig.tables | wc -l # 0 # Great. Now lets move on. Let's use hgWiggle on each of these tables to # fetch the old data. Then we'll convert that to bed 4, lift that, then # run wigEncode on the lifted data. mkdir ../hg18.wib mkdir -p /gbdb/hg18/encode/wib mkdir fromHg16.wig for table in `cat hg16.wig.tables`; do hgWiggle -db=hg16 $table \ | grep -v "^#" | awk -f varStepToBed.awk > $table.old.wig liftOver -bedPlus=3 -tab $table.old.wig /gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz \ $table.new.wig $table.unmapped sort -k1,1 -k2,2n $table.new.wig > tmp.wig; mv tmp.wig $table.new.wig wigEncode $table.new.wig $table.wig $table.wib 2>> wigFromHg16.log mv $table.wib ../hg18.wib/ ln -s /cluster/data/encode/hg18.wib/${table}.wib /gbdb/hg18/encode/wib/${table}.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/encode/wib hg18 $table $table.wig mv $table.*wig $table.unmapped fromHg16.wig/ done mkdir fromHg17.wig for table in `cat hg17.wig.tables`; do hgWiggle -db=hg17 $table \ | grep -v "^#" | awk -f varStepToBed.awk > $table.old.wig liftOver -bedPlus=3 -tab $table.old.wig /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ $table.new.wig $table.unmapped sort -k1,1 -k2,2n $table.new.wig > tmp.wig; mv tmp.wig $table.new.wig wigEncode $table.new.wig $table.wig $table.wib 2>> wigFromHg17.log mv $table.wib ../hg18.wib/ ln -s /cluster/data/encode/hg18.wib/${table}.wib /gbdb/hg18/encode/wib/${table}.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/encode/wib hg18 $table $table.wig mv $table.*wig 
$table.unmapped fromHg17.wig/ done ########################################################## # Wig lifting # we need to find all the old wiggle data and lift that. # Start with hg16 ssh hgwdev bash cd /cluster/data/encode/convertHg18 # find those hg16 tables cat > affyChipChip.hg16.wig.tables << "EOF" encodeAffyChIpHl60PvalBrg1Hr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_Brg1_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalBrg1Hr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_Brg1_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalBrg1Hr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_Brg1_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalBrg1Hr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_Brg1_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalCebpeHr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_CEBPe_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalCebpeHr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_CEBPe_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalCebpeHr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_CEBPe_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalCebpeHr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_CEBPe_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalCtcfHr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_CTCF_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalCtcfHr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_CTCF_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalCtcfHr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_CTCF_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalCtcfHr32 
Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_CTCF_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalH3K27me3Hr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_H3K27T_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalH3K27me3Hr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_H3K27T_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalH3K27me3Hr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_H3K27T_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalH3K27me3Hr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_H3K27T_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalH4Kac4Hr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_HisH4_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalH4Kac4Hr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_HisH4_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalH4Kac4Hr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_HisH4_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalH4Kac4Hr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_HisH4_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalP300Hr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_P300_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalP300Hr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_P300_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalP300Hr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_P300_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalP300Hr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_P300_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalPu1Hr00 
Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_PU1_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalPu1Hr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_PU1_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalPu1Hr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_PU1_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalPu1Hr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_PU1_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalRaraHr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_RARecA_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalRaraHr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_RARecA_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalRaraHr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_RARecA_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalRaraHr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_RARecA_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalRnapHr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_Pol2_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalRnapHr02 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_Pol2_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalRnapHr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_Pol2_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalRnapHr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_Pol2_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalSirt1Hr00 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_SIRT1_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalSirt1Hr02 
Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_SIRT1_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalSirt1Hr08 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_SIRT1_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalSirt1Hr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_SIRT1_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 encodeAffyChIpHl60PvalTfiibHr32 Affy/2005-06-01/chipchip/wig/EC_AS_HL60_DN_RA_TFIIB-R_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3.pval.median.wig.bz2 EOF mkdir -p wigs/hg16 cd wigs/hg16 cat ../../affyChipChip.hg16.wig.tables | while read -a line; do chain=/gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz table=${line[0]}; oldWig=$table.hg16.wig newWig=$table.hg18.wig bad=$table.hg18.unmapped wib=$table.wib wigTable=$table.tab file=/cluster/data/encode/${line[1]}; echo $table bzcat $file | tail +2 | awk -f ../../varStepToBed.awk | \ awk 'BEGIN{OFS="\t"}{print $1, $2+1, $3, $4;}' > $oldWig; liftOver -bedPlus=3 $oldWig $chain $newWig $bad bedSort $newWig tmp mv tmp $newWig wigEncode $newWig $wigTable $wib done # One more for BU Orchid awk -f ../../varStepToBed.awk ../../../BU/orchid/2005-06-09/t0 > encodeBu_ORChID1.hg16.wig liftOver -bedPlus=3 encodeBu_ORChID1.hg16.wig /gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz encodeBu_ORChID1.hg18.wig encodeBu_ORChID1.hg18.unmapped bedSort encodeBu_ORChID1.hg18.wig tmp; mv tmp encodeBu_ORChID1.hg18.wig wigEncode encodeBu_ORChID1.hg18.wig encodeBu_ORChID1.tab encodeBu_ORChID1.wib # NOTE: this track was replaced with newer data -- the lift was # never used. 
# Encode hapmap coverage for graph in ../../../sanger/coverage/encode*.bedGraph; do table=${graph%.bedGraph} table=${table#*coverage\/} liftOver -bedPlus=3 $graph /gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz \ $table.hg18.wig $table.hg18.unmapped bedSort $table.hg18.wig tmp; mv tmp $table.hg18.wig wigEncode $table.hg18.wig $table.tab $table.wib done # hg17 tables cd ../ mkdir hg17 cd hg17 cat | while read -a line; do table=${line[0]}; file=/cluster/data/encode/${line[1]}; chain=/gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz; awk -f ../../varStepToBed.awk $file > $table.hg17.wig; bedSort $table.hg17.wig tmp; mv tmp $table.hg17.wig liftOver -bedPlus=3 $table.hg17.wig $chain $table.hg18.wig $table.hg18.unmapped; bedSort $table.hg18.wig tmp; mv tmp $table.hg18.wig; wigEncode $table.hg18.wig $table.tab $table.wib done << "EOF" encodeAffyChIpHl60PvalStrictH3K9K14DHr00 Affy/2005-10-03/lab/CHIP/wig/H3K9K14D/00/EC_AS_HL60_DN_RA_H3K9K14D_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictH3K9K14DHr02 Affy/2005-10-03/lab/CHIP/wig/H3K9K14D/02/EC_AS_HL60_DN_RA_H3K9K14D_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictH3K9K14DHr08 Affy/2005-10-03/lab/CHIP/wig/H3K9K14D/08/EC_AS_HL60_DN_RA_H3K9K14D_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictH3K9K14DHr32 Affy/2005-10-03/lab/CHIP/wig/H3K9K14D/32/EC_AS_HL60_DN_RA_H3K9K14D_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictHisH4Hr00 Affy/2005-10-03/lab/CHIP/wig/HisH4/00/EC_AS_HL60_DN_RA_HisH4_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictHisH4Hr02 Affy/2005-10-03/lab/CHIP/wig/HisH4/02/EC_AS_HL60_DN_RA_HisH4_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictHisH4Hr08 Affy/2005-10-03/lab/CHIP/wig/HisH4/08/EC_AS_HL60_DN_RA_HisH4_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig 
encodeAffyChIpHl60PvalStrictHisH4Hr32 Affy/2005-10-03/lab/CHIP/wig/HisH4/32/EC_AS_HL60_DN_RA_HisH4_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictp63_ActD Affy/2005-10-03/lab/CHIP/wig/p63_ActD/EC_AS_ME180_ActD_p63_C01_EC_AS_ME180_CombInput_PlMinActD_B1_B2_B3.pval.median.wig encodeAffyChIpHl60PvalStrictp63_mActD Affy/2005-10-03/lab/CHIP/wig/p63_mActD/EC_AS_ME180_Ctrl_p63_C01_EC_AS_ME180_CombInput_PlMinActD_B1_B2_B3.pval.median.wig encodeAffyChIpHl60PvalStrictPol2Hr00 Affy/2005-10-03/lab/CHIP/wig/Pol2/00/EC_AS_HL60_DN_RA_Pol2_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictPol2Hr02 Affy/2005-10-03/lab/CHIP/wig/Pol2/02/EC_AS_HL60_DN_RA_Pol2_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictPol2Hr08 Affy/2005-10-03/lab/CHIP/wig/Pol2/08/EC_AS_HL60_DN_RA_Pol2_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60PvalStrictPol2Hr32 Affy/2005-10-03/lab/CHIP/wig/Pol2/32/EC_AS_HL60_DN_RA_Pol2_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.pval.median.wig encodeAffyChIpHl60SignalStrictH3K9K14DHr00 Affy/2005-10-03/lab/CHIP/wig/H3K9K14D/00/EC_AS_HL60_DN_RA_H3K9K14D_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictH3K9K14DHr02 Affy/2005-10-03/lab/CHIP/wig/H3K9K14D/02/EC_AS_HL60_DN_RA_H3K9K14D_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictH3K9K14DHr08 Affy/2005-10-03/lab/CHIP/wig/H3K9K14D/08/EC_AS_HL60_DN_RA_H3K9K14D_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictH3K9K14DHr32 Affy/2005-10-03/lab/CHIP/wig/H3K9K14D/32/EC_AS_HL60_DN_RA_H3K9K14D_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictHisH4Hr00 Affy/2005-10-03/lab/CHIP/wig/HisH4/00/EC_AS_HL60_DN_RA_HisH4_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictHisH4Hr02 
Affy/2005-10-03/lab/CHIP/wig/HisH4/02/EC_AS_HL60_DN_RA_HisH4_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictHisH4Hr08 Affy/2005-10-03/lab/CHIP/wig/HisH4/08/EC_AS_HL60_DN_RA_HisH4_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictHisH4Hr32 Affy/2005-10-03/lab/CHIP/wig/HisH4/32/EC_AS_HL60_DN_RA_HisH4_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictp63_ActD Affy/2005-10-03/lab/CHIP/wig/p63_ActD/EC_AS_ME180_ActD_p63_C01_EC_AS_ME180_CombInput_PlMinActD_B1_B2_B3.sig.median.wig encodeAffyChIpHl60SignalStrictp63_mActD Affy/2005-10-03/lab/CHIP/wig/p63_mActD/EC_AS_ME180_Ctrl_p63_C01_EC_AS_ME180_CombInput_PlMinActD_B1_B2_B3.sig.median.wig encodeAffyChIpHl60SignalStrictPol2Hr00 Affy/2005-10-03/lab/CHIP/wig/Pol2/00/EC_AS_HL60_DN_RA_Pol2_00hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictPol2Hr02 Affy/2005-10-03/lab/CHIP/wig/Pol2/02/EC_AS_HL60_DN_RA_Pol2_02hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictPol2Hr08 Affy/2005-10-03/lab/CHIP/wig/Pol2/08/EC_AS_HL60_DN_RA_Pol2_08hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyChIpHl60SignalStrictPol2Hr32 Affy/2005-10-03/lab/CHIP/wig/Pol2/32/EC_AS_HL60_DN_RA_Pol2_32hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B4_B5.sig.median.wig encodeAffyRnaGm06990Signal Affy/2005-10-03/lab/RNA/wig/GM06990/EC_AS_GM06990_RCyP+_C01vsNULL.sig.wig encodeAffyRnaHeLaSignal Affy/2005-11-22/lab/Affy_HeLa/wig/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig encodeAffyRnaHl60SignalHr00 Affy/2005-10-03/lab/RNA/wig/HL60/00/EC_AS_HL60_RWP+_RA_00hr_C01vsNULL.sig.wig encodeAffyRnaHl60SignalHr02 Affy/2005-10-03/lab/RNA/wig/HL60/02/EC_AS_HL60_RWP+_RA_02hr_C01vsNULL.sig.wig encodeAffyRnaHl60SignalHr08 Affy/2005-10-03/lab/RNA/wig/HL60/08/EC_AS_HL60_RWP+_RA_08hr_C01vsNULL.sig.wig encodeAffyRnaHl60SignalHr32 
Affy/2005-10-03/lab/RNA/wig/HL60/32/EC_AS_HL60_RWP+_RA_32hr_C01vsNULL.sig.wig encodeUvaDnaRepTr50 UVa/2005-10-15/lab/smoothedtr50.hg17.wig EOF # Uppsala hg17 is already in bed format cat | while read -a line; do table=${line[0]}; file=/cluster/data/encode/${line[1]}; chain=/gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz; liftOver -bedPlus=3 $file $chain $table.hg18.wig $table.hg18.unmapped; bedSort $table.hg18.wig tmp; mv tmp $table.hg18.wig; wigEncode $table.hg18.wig $table.tab $table.wib done << "EOF" encodeUppsalaChipH3acBut0h Uppsala/2006-05-29/lab/encodeUppsalaChipH3acBut0h.wig.txt encodeUppsalaChipH3acBut12h Uppsala/2006-05-29/lab/encodeUppsalaChipH3acBut12h.wig.txt encodeUppsalaChipH4acBut0h Uppsala/2006-05-29/lab/encodeUppsalaChipH4acBut0h.wig.txt encodeUppsalaChipH4acBut12h Uppsala/2006-05-29/lab/encodeUppsalaChipH4acBut12h.wig.txt EOF cat | while read -a line; do table=${line[0]}; file=/cluster/data/encode/${line[1]}; chain=/gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz; liftOver -bedPlus=3 $file $chain $table.hg18.wig $table.hg18.unmapped; bedSort $table.hg18.wig tmp; mv tmp $table.hg18.wig; wigEncode $table.hg18.wig $table.tab $table.wib done << "EOF" encodeYaleAffyNeutRNATransMap yale/rna/2005-10-14/encodeYaleAffyNeutRNATransMap.trim encodeYaleAffyNB4RARNATransMap yale/rna/2005-10-14/lab/encode_Yale_Affy_NB4_RA_RNA_Transcript_Map_ncbi35.wig encodeYaleAffyNB4TPARNATransMap yale/rna/2005-10-14/lab/encode_Yale_Affy_NB4_TPA_RNA_Transcript_Map_ncbi35.wig encodeYaleAffyNB4UntrRNATransMap yale/rna/2005-10-14/lab/encode_Yale_Affy_NB4_CTRL_RNA_Transcript_Map_ncbi35.wig encodeYaleAffyPlacRNATransMap yale/rna/2005-10-14/lab/encode_Yale_Affy_Placenta_RNA_Transcript_Map_ncbi35.wig EOF # ERRORS ... the first one worked, the others need trimming. 
# Re-do the failed Yale lifts, trimming overlapping wig intervals after the lift.
cat | while read -a line; do
    table=${line[0]};
    file=/cluster/data/encode/${line[1]};
    chain=/gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz;
    liftOver -bedPlus=3 $file $chain $table.hg18.wig $table.hg18.unmapped;
    bedSort $table.hg18.wig stdout | /cluster/data/encode/bin/scripts/trimOverlap.pl > tmp;
    mv tmp $table.hg18.wig;
    wigEncode $table.hg18.wig $table.tab $table.wib;
done << "EOF"
encodeYaleAffyNB4RARNATransMap yale/rna/2005-10-14/lab/encode_Yale_Affy_NB4_RA_RNA_Transcript_Map_ncbi35.wig
encodeYaleAffyNB4TPARNATransMap yale/rna/2005-10-14/lab/encode_Yale_Affy_NB4_TPA_RNA_Transcript_Map_ncbi35.wig
encodeYaleAffyNB4UntrRNATransMap yale/rna/2005-10-14/lab/encode_Yale_Affy_NB4_CTRL_RNA_Transcript_Map_ncbi35.wig
encodeYaleAffyPlacRNATransMap yale/rna/2005-10-14/lab/encode_Yale_Affy_Placenta_RNA_Transcript_Map_ncbi35.wig
EOF

# Forgot an hg16 one
table=encodeUcsdNgChipSignal
file=/cluster/data/encode/UCSD/nimblegen/2005-05-31/encodeUcsdNgChipSignal.varStep
chain=/gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz
awk -f ../../varStepToBed.awk $file > $table.hg16.wig
liftOver -bedPlus=3 $table.hg16.wig $chain $table.hg18.wig $table.hg18.unmapped
bedSort $table.hg18.wig stdout | /cluster/data/encode/bin/scripts/trimOverlap.pl > tmp
mv tmp $table.hg18.wig
wigEncode $table.hg18.wig $table.tab $table.wib

##########################################################################
# Boston University ORChID track - (2007-06-29 ting)
# data developer contact: Steve Parker parker@bu.edu
# This is a new dataset to replace the old one, for the same track.
# On hg17 the track name is encodeBu_ORChID1, was commented as "non-standard table name"
# I took this chance to rename it as encodeBUORChID on hg18.
ssh hgwdev
cd /cluster/data/encode/BU
mkdir -p orchid/2007-06-29/lab
# FIXED: was "cd -p orchid/2007-06-29/lab" -- cd takes no -p option
cd orchid/2007-06-29/lab
wget --timestamping "http://dna.bu.edu/parker/.data/orchid_hg18_encode.wig.gz"
cd ..
mkdir wib
# The file orchid_hg18_encode.wig.gz from data provider contains 0-based coordinates,
# thus wigEncode choked on it -- specifically, at chr16, position 0 (ENm008).
# I compared this new data to the old dataset (2005-09-08) and made sure that this
# is the case. I saved the original file to 'original.wig.gz', and added 1 to all
# positions in orchid_hg18_encode.wig.gz
wigEncode lab/orchid_hg18_encode.wig.gz encodeBUORChID.wig \
    wib/encodeBUORChID.wib
# Converted lab/orchid_hg18_encode.wig.gz, upper limit 1.64, lower limit -0.98

# load
set dir = /gbdb/hg18/encode/BU/2007-06-29
mkdir -p $dir
hgLoadWiggle -pathPrefix=$dir hg18 encodeBUORChID encodeBUORChID.wig
mkdir -p $dir/wib
ln -s `pwd`/wib/encodeBUORChID.wib $dir/wib
# create encodeBUORChID.html at trackDb/human/hg18/

#############################################################################
# Stanford NRSF ChIP-seq (DONE, Heather, July 2007)
ssh hgwdev
cd /cluster/data/encode/stanford/2007-03-14
liftOver fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed core.unmapped
liftOver control_fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.control.bed control.unmapped
hgLoadBed hg18 encodeStanfordNRSFEnriched hg18.bed -tab
hgLoadBed hg18 encodeStanfordNRSFControl hg18.control.bed -tab

############################################################################
# Yale ENCODE Lifting
ssh hgwdev
cd /cluster/data/encode/convertHg18
# collect table name + type for each Yale table listed in yale.lst
for table in `cat yale.lst`; do
    echo select tableName,type from trackDb where tableName=\"$table\" \
        | hgsql hg17 | tail +2 >> yale.tables
done
# normalize the specialized bed types down to plain bed N for conversion
sed -e 's/bed5FloatScoreWithFdr/bed 5/' \
    -e 's/bedGraph\ 4/bed 4/' -e 's/bed5FloatScore/bed 5/' \
    < yale.tables > tmp.tables
mv tmp.tables yale.tables
mkdir yale
for fields in 4 5; do
    for table in `grep "bed $fields" yale.tables | cut -f1`; do
        /cluster/data/encode/bin/scripts/convertBedTable.csh hg17 hg18 \
            $table $fields >> yale.script.log
        mv $table.* yale/
    done
done
###########################################################################
# Pseudogenes Class table copied from hg17 (2007-08-01 kate)
# (FIXED typo: date previously read "20087-08-01")
# This table is copied unchanged.

ssh hgwdev
cd /cluster/data/encode/convertHg18
mkdir pseudogene
cd pseudogene
# dump schema + data from hg17, recreate table in hg18, then bulk-load
hgsqldump --all --tab=. hg17 encodePseudogeneClass
hgsql hg18 < encodePseudogeneClass.sql
echo "LOAD DATA LOCAL INFILE 'encodePseudogeneClass.txt' \
        into table encodePseudogeneClass" | hgsql hg18

###########################################################################
# Affy EC chrom21/chrom22 (Andy DONE 2007-07-20)
ssh hgwdev
bash
cd /cluster/data/encode/Affy
mkdir -p 2007-07-12/lab
cd 2007-07-12/
mkdir -p processed/{bed,wigTable,wib,download}
cd lab/
cp /var/ftp/encode/encode_ext_RNA_hg18_chr21-22.tar.gz .
# FIXED: tarball was copied into the current dir above, so untar/remove it
# here (previously referenced as encode/encode_ext_RNA_hg18_chr21-22.tar.gz)
tar xfz encode_ext_RNA_hg18_chr21-22.tar.gz
rm encode_ext_RNA_hg18_chr21-22.tar.gz
cd ../
find lab -name '*.bed' > renamesBed.txt
find lab -name '*.wig' > renamesWig.txt
# Make 2nd column for table name
cat renamesBed.txt | while read -a line; do
    # strip the one-line track header, then load under the new table name
    tail +2 ${line[0]} > processed/bed/${line[1]}.bed
    hgLoadBed hg18 ${line[1]} processed/bed/${line[1]}.bed
done
rm bed.tab
cat renamesWig.txt | while read -a line; do
    table=${line[1]}
    origFile=${line[0]}
    tail +2 $origFile > processed/download/${table}.wig
    wigEncode processed/download/${table}.wig processed/wigTable/${table}.tab \
        processed/wib/${table}.wib 2>> processed/wigEncode.log
    pushd /gbdb/hg18/encode/wib
    ln -s /cluster/data/encode/Affy/2007-07-12/processed/wib/${table}.wib
    popd
    hgLoadWiggle -pathPrefix=/gbdb/hg18/encode/wib hg18 $table processed/wigTable/${table}.tab
    gzip processed/download/${table}.wig
done
# expose the gzipped wig downloads
cd /usr/local/apache/htdocs/goldenPath/hg18/encode
ln -s /cluster/data/encode/Affy/2007-07-12/processed/download/*.gz .
###########################################################################
# Yale Pol II Chip (Chip-seq) (DONE Andy 11-07-2007)
cd /cluster/data/encode/yale
mkdir -p 2007-07-17/lab
cd 2007-07-17/lab
unzip Yale_jul17_v2.zip
# NOTE(review): later commands reference lab/PolII/... so the working dir
# presumably moved back up to 2007-07-17/ -- confirm against shell history
set table = wgEncodeYaleChipSeqPol2HelaSites
hgLoadBed hg18 $table lab/PolII/PolII_hg18-sites.bed
#Reading PolII_hg18-sites.bed
#Loaded 87253 elements of size 4
#Sorted
#Creating table definition for encodeYalePolIISites
#Saving bed.tab
#start -142, end 1144 out of range in findBin (max is 512M)
# CONTACTED submitter to ask about negative coordinate.
# For now, leave out chrM
sed '/^chrM/d' lab/PolII/PolII_hg18-sites.bed | hgLoadBed hg18 $table stdin
# NOTE: max score=1779, min score=7
# data distribution
awk '{print $4}' pol2.bed | sort -n | textHistogram stdin -binSize=100
#    0 ************************************************************ 83113
#  100 ** 3300
#  200 545
#  300 144
#  400 58
#  500 37
#  600 20
#  700 6
#  800 6
#  900 0
# 1000 2
# 1100 2
# 1200 0
# 1300 0
# 1400 0
# 1500 0
# 1600 0
# 1700 1
bedSort PolII_hg18-signal.wig tmp.wig
# FIXED: the next command previously read "v tmp.wig ..." (truncated mv)
mv tmp.wig PolII_hg18-signal.wig
../../../bin/scripts/trimOverlap.pl < PolII_hg18-signal.wig > tmp.wig
mv tmp.wig PolII_hg18-signal.wig
mv encodeYalePolIISignal.wib /cluster/data/encode/hg18.wib/
ln -s /cluster/data/encode/hg18.wib/encodeYalePolIISignal.wib /gbdb/hg18/encode/wib/
hgLoadWiggle -pathPrefix=/gbdb/hg18/encode/wib hg18 encodeYalePolIISignal encodeYalePolIISignal.wig
#Connected to database hg18 for track encodeYalePolIISignal
#Creating wiggle table definition in hg18.encodeYalePolIISignal
#Saving wiggle.tab
#WARNING: Exceeded chr18_random size 4406 > 4262. dropping 145 data point(s)
# hmmm... that's not a good warning. I wonder if these guys got the genome wrong.
hgLoadBed hg18 encodeYalePolIISites PolII_hg18-sites.bed
#Reading PolII_hg18-sites.bed
#Loaded 87253 elements of size 4
#Sorted
#Creating table definition for encodeYalePolIISites
#Saving bed.tab
#start -142, end 1144 out of range in findBin (max is 512M)
# MORE ERRORS. Clearly this submission wasn't quite meant to be just yet.
# to be continued...

# continued... made a "resub" dir and copied the resubmitted zipfile there.
# FIXED: path previously read "/cluste/data/2007-07-17/resub"; the relative
# links below (../../../hg18.wib, ../yale/2007-07-17/...) pin it to this dir:
cd /cluster/data/encode/yale/2007-07-17/resub
unzip Yale_jul17_v2.zip
cd PolII/
trimObBedLines PolII_hg18-signal.wig > ../../processed/wgEncodeYalePolIISignal.wigBed
pushd ../../processed/
wigEncode wgEncodeYalePolIISignal.wigBed wgEncodeYalePolIISignal.wig wgEncodeYalePolIISignal.wib
gzip wgEncodeYalePolIISignal.wigBed
cd ../../../hg18.wib
ln -s ../yale/2007-07-17/processed/wgEncodeYalePolIISignal.wib
cd /gbdb/hg18/encode/wib
ln -s /cluster/data/encode/hg18.wib/wgEncodeYalePolIISignal.wib
cd /usr/local/apache/htdocs/goldenPath/hg18/encode/wig
ln -s /cluster/data/encode/yale/2007-07-17/processed/wgEncodeYalePolIISignal.wigBed.gz
popd
hgLoadWiggle -pathPrefix=/gbdb/hg18/encode/wib hg18 wgEncodeYalePolIISignal wgEncodeYalePolIISignal.wig

#########################################################################
# YALE STAT1 (more ChIP-seq) (DONE, Andy 2007-11-20)
cd /cluster/data/encode/yale
mkdir 2007-08-08
cd 2007-08-08/
cp /var/ftp/encode/Yale_aug8.zip .
unzip Yale_aug8.zip cd STAT1/ trimObBedLines hg18 STAT1_hg18-signal.wig wgEncodeYaleStat1Signal.wigBed trimObBedLines hg18 STAT1_hg18-sites.bed wgEncodeYaleStat1Sites.bed gzip wgEncodeYaleStat1Signal.wigBed wigEncode wgEncodeYaleStat1Signal.wigBed.gz wgEncodeYaleStat1Signal.{wig,wib} cd ../ mkdir lab processed mv readme_aug8.txt STAT1 lab/ rm Yale_aug8.zip mv lab/STAT1/wgEncodeYaleStat1Si* processed/ pushd ../../hg18.wib/ ln -s ../yale/2007-08-08/processed/wgEncodeYaleStat1Signal.wib cd /gbdb/hg18/encode/wib ln -s /cluster/data/encode/hg18.wib/wgEncodeYaleStat1Signal.wib popd cd processed/ hgLoadWiggle -pathPrefix=/gbdb/hg18/encode/wib hg18 wgEncodeYaleStat1Signal wgEncodeYaleStat1Signal.wig hgLoadBed hg18 wgEncodeYaleStat1Sites wgEncodeYaleStat1Sites.bed ########################################################################## # Genome Institute of Singapore PET data (2007-08-30 ting) # Submitted 8/22 by Atif Shahab and Chia-lin Wei # Three new PET datasets on human embryonic stem cell hES3. # One polyA-RNA dataset, and two ChIP-PET datasets of H3K4me3 and H3K27me3. # Build them as subtracks into existing GIS tracks: GIS-RNA-PET and GIS-CHIP-PET. ssh hgwdev cd /cluster/data/encode/GIS/ mkdir 2007-08-22 cd 2007-08-22 mkdir lab cd lab cp /var/ftp/encode/gis.tar.gz ./ gunzip gis.tar.gz tar -xvf gis.tar # obtained 3 data files: H3K27me3.bed H3K4me3.bed polyA.bed # These are mapped on hg17, first lift. cd /cluster/data/encode/GIS/2007-08-22 liftOver lab/polyA.bed ../../convertHg18/hg17ToHg18.over.chain.gz \ polyA-hg18.bed polyA-unmapped.bed # 426301 lifted, 34 unmapped liftOver lab/H3K4me3.bed ../../convertHg18/hg17ToHg18.over.chain.gz \ H3K4me3.bed H3K4me3-unmapped.bed # 679752 lifted, 13 unmapped liftOver lab/H3K27me3.bed ../../convertHg18/hg17ToHg18.over.chain.gz \ H3K27me3.bed H3K27me3-unmapped.bed # 992509 lifted, 25 unmapped # GIS data are not scored. Based on Angie and Kate's previous work, # scored BED can be made from item name. Use scoreGisBed.pl to do so. 
scoreGisBed.pl polyA-hg18.bed 2 encodeGisRnaPetHes3.bed scoreGisBed.pl H3K4me3-hg18.bed 1 encodeGisChipPetHes3H3K4me3.bed scoreGisBed.pl H3K27me3-hg18.bed 1 encodeGisChipPetHes3H3K27me3.bed # load on hg18 hgLoadBed hg18 encodeGisRnaPetHes3 encodeGisRnaPetHes3.bed # Loaded 426301 elements of size 12 hgLoadBed hg18 encodeGisChipPetHes3H3K4me3 encodeGisChipPetHes3H3K4me3.bed # Loaded 679752 elements of size 12 hgLoadBed hg18 encodeGisChipPetHes3H3K27me3 encodeGisChipPetHes3H3K27me3.bed # Loaded 992509 elements of size 12 # modified trackDb.encodeTxLevel.ra, trackDb.encodeChip.ra, # encodeGisChipPetAll.html, encodeGisRnaPet.html ########### # Promote UCSD genome-wide Chip tracks: # UCSD TAF1 IMR90 Chip/chip to Regulation group # (2007-09-14 kate) # See hg18.txt ###################################################### # Add strand information for encodeGencodeRace data - ting 09-27-2007 # ENCODE 5RACE data do not contain strand information. This # information is very important, and can be derived from # available GENCODE and 5RACE data. # There are two relatively simple strategies to derive strand # information. However, there are several exceptions to either # strategy. Therefore I will combine these two strategies in # this one script. # Strategy 1: a RACE primer should extend from 3' end of a transcript # towards 5' end. Therefore, if any RACE frag from # this primer extends towards the right of the primer # location, it means the gene goes from right to left, # i.e. on - strand. Therefore, the primer should be # on the + strand, and the corresponding RACEfrag should # be on the - strand (same as gene). By the same token, # if a RACEfrag extends toward left, it indicates that # the primer is on - strand, while the gene and RACEfrag # are on + strand. # The only case that such relationship can not be determined # is when the RACEfrag contains only one exon, and the # primer locates in that exon. It is not sure if the # RACEfrag extends to the right or left. 
# This strategy leaves 3 primers undetermined. # # Strategy 2: RACE primers should be designed based on GENCODE # exons. Therefore, the orientation of the primer can be # determined by its overlapping GENCODE exon. In this case, # the primer is on the opposite strand of the GENCODE exon, # and any RACEfrag from this primer should be on the opposite # strand of the primer. # There exist several exceptions, where the primer is # located outside of exons. It is probably ok to instead # look at the nearest exon if it doesn't overlap with any. # This strategy leaves 37 primers undetermined. # Combining 1 and 2 all primers are determined for their orientation. # # Instead of working on the original gff files, I decide to work on # data files after hg18 migration. These files are genePred formatted. # Working folder is # /cluster/store6/encode/GencodeRACEfrags/2007-04-11/strand ssh hgwdev cd /cluster/data/encode/GencodeRACEfrags/latest/ mkdir strand cd strand cp /cluster/data/encode/convertHg18/genePred/*Race*.tab ./ cp /cluster/data/encode/convertHg18/genePred/encodeGencodeGeneKnownMar07.tab ./ ./addRacePrimerStrand.pl encodeGencodeRaceFragsPrimer.tab encodeGencodeGeneKnownMar07.tab csh load.csh > & ! 
load.log # encodeGencodeRaceFragsBrain # Reading encodeGencodeRaceFragsBrain.tab # 269 gene predictions # encodeGencodeRaceFragsColon # Reading encodeGencodeRaceFragsColon.tab # 269 gene predictions # encodeGencodeRaceFragsGM06990 # Reading encodeGencodeRaceFragsGM06990.tab # 236 gene predictions # encodeGencodeRaceFragsHL60 # Reading encodeGencodeRaceFragsHL60.tab # 236 gene predictions # encodeGencodeRaceFragsHeart # Reading encodeGencodeRaceFragsHeart.tab # 261 gene predictions # encodeGencodeRaceFragsHela # Reading encodeGencodeRaceFragsHela.tab # 168 gene predictions # encodeGencodeRaceFragsKidney # Reading encodeGencodeRaceFragsKidney.tab # 293 gene predictions # encodeGencodeRaceFragsLiver # Reading encodeGencodeRaceFragsLiver.tab # 243 gene predictions # encodeGencodeRaceFragsLung # Reading encodeGencodeRaceFragsLung.tab # 290 gene predictions # encodeGencodeRaceFragsMuscle # Reading encodeGencodeRaceFragsMuscle.tab # 238 gene predictions # encodeGencodeRaceFragsPlacenta # Reading encodeGencodeRaceFragsPlacenta.tab # 275 gene predictions # encodeGencodeRaceFragsPrimer # Reading encodeGencodeRaceFragsPrimer.tab # 365 gene predictions # encodeGencodeRaceFragsSmallIntest # Reading encodeGencodeRaceFragsSmallIntest.tab # 277 gene predictions # encodeGencodeRaceFragsSpleen # Reading encodeGencodeRaceFragsSpleen.tab # 275 gene predictions # encodeGencodeRaceFragsStomach # Reading encodeGencodeRaceFragsStomach.tab # 300 gene predictions # encodeGencodeRaceFragsTestis # Reading encodeGencodeRaceFragsTestis.tab # 292 gene predictions # Strand information is added for primers and all RACEfrags. 
###################################################### # LIFT NHGRI DIPs from hg17 (2007-10-22 kate) ssh hgwdev cd /cluster/data/encode/NHGRI/mullikin/hg17 hgsql hg18 < encodeIndels.sql zcat encodeIndels.bed.gz | tail +2 | \ liftOver -bedPlus=8 stdin /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ encodeIndels.hg18.bed encodeIndels.hg18.unmapped # lost 670 items (of 11452 total) # This is high -- nearly 6%, and losses were in all regions, # not just chrX. hgLoadBed hg18 encodeIndels -tab -sqlTable=encodeIndels.sql \ encodeIndels.hg18.bed # change group name to merge in variation hgsql hg18 -e "update grp set name='encodeCompAndVar' where name='encodeCompGeno'" ######################################################### # 2007-11-08 (ASZ) # These wig files were shown to not match their corresponding database table # Dropped them from the hgdownload server: #/goldenPath/hg18/encode/wig/encodeUppsalaChipH3acBut0h.wigBed.gz #/goldenPath/hg18/encode/wig/encodeUppsalaChipH3acBut12h.wigBed.gz #/goldenPath/hg18/encode/wig/encodeUppsalaChipH4acBut0h.wigBed.gz #/goldenPath/hg18/encode/wig/encodeUppsalaChipH4acBut12h.wigBed.gz #/goldenPath/hg18/encode/wig/encodeYaleAffyNB4RARNATransMap.wigBed.gz #/goldenPath/hg18/encode/wig/encodeYaleAffyNB4TPARNATransMap.wigBed.gz #/goldenPath/hg18/encode/wig/encodeYaleAffyNB4UntrRNATransMap.wigBed.gz #/goldenPath/hg18/encode/wig/encodeYaleAffyNeutRNATransMap.wigBed.gz #/goldenPath/hg18/encode/wig/encodeYaleAffyPlacRNATransMap.wigBed.gz ######################################################### # Yale RACE (2007-11-15 galt) # cd /cluster/data/encode/yale/ mkdir race cd race mkdir 2007-11-15 ln -s 2007-11-15/ latest cd latest mkdir lab cd lab wget http://homes.gersteinlab.org/people/jiangdu/race_seq/race_desc.html wget http://homes.gersteinlab.org/people/jiangdu/race_seq/conserved_transcripts-til-20070402.bed tail +5 conserved_transcripts-til-20070402.bed | gawk '{print$1}' | sort -u | head chr11 chr21 chr22 cp race_desc.html 
${HOME}/kent/src/hg/makeDb/trackDb/human/hg18/encodeYaleRace.html cvs add ${HOME}/kent/src/hg/makeDb/trackDb/human/hg18/encodeYaleRace.html tail +5 conserved_transcripts-til-20070402.bed | hgLoadBed hg18 encodeYaleRace stdin vi trackDb.encodeTxLevels.ra --- track encodeYaleRace superTrack encodeYaleRnaSuper dense shortLabel Yale RACE longLabel Yale RACE 420 primarily novel TARs in ENCODE regions group encodeTxLevels priority 32.0 chromosomes chr11,chr21,chr22 visibility hide type bed 12 . dataVersion ENCODE Nov 2007 origAssembly hg18 --- vi ${HOME}/kent/src/hg/makeDb/trackDb/human/hg18/encodeYaleRnaSuper.html #edit to add the new RACE track to the Credits section ############################################################################# # TBA alignments from Margulies lab, NHGRI (2008-2-20 kate) # Submitted by Gayle McEwen (mceweng@mail.nih.gov), from their DEC-07 freeze # Requested doc update (README & track description) from Elliott on 2/20 # Conservation scores: (BinCons and ChaiCons provided 3/24/08 ssh kkstore03 cd /cluster/data/encode/TBA mkdir -p DEC-07/2008-01-10/lab cd DEC-07/2008-01-10/lab wget -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/encode/freeze/DEC-2007/tba-DEC-2007.tar.gz tar xvfz tba-DEC-2007.tar.gz cd .. mkdir maf cat > getMafs.csh << 'EOF' foreach f (lab/tba/*/*.maf.gz) set r = $f:t:r:r:e echo $r gunzip -c $f | \ sed -e 's/^s human\./s hg18./' \ -e 's/^s mouse\./s mm9./' \ -e 's/^s cow\./s bosTau3./' \ -e 's/^s dog\./s canFam2./' \ -e 's/^s chicken\./s galGal3./' \ -e 's/^s monodelphis\./s monDom4./' \ -e 's/^s chimp\./s panTro2./' \ -e 's/^s macaque\./s rheMac2./' \ -e 's/^s orangutan\./s ponAbe2./' \ -e 's/^s rat\./s rn4./' \ > maf/$r.maf end 'EOF' csh getMafs.csh >&! 
getMafs.log & # Score too small messages -- can be ignored (the score isn't meaningful) # Add gap annotation # prepare bed files with gap info ssh kkstore03 cd /cluster/data/encode/TBA cd DEC-07/2008-01-10 mkdir anno cd anno mkdir maf run cd run cat > species.lst << 'EOF' hg18 bosTau3 canFam2 galGal3 monDom4 panTro2 rheMac2 ponAbe2 mm9 rn4 'EOF' cat > doNBed.csh << 'EOF' foreach db (`cat species.lst`) echo -n "$db " set cdir = /cluster/data/$db if (! -e $cdir/$db.N.bed) then echo "creating N.bed" twoBitInfo -nBed $cdir/$db.2bit $cdir/$db.N.bed else echo "" endif end 'EOF' csh doNBed.csh >&! doNBed.log & rm -f nBeds foreach db (`grep -v hg18 species.lst`) echo "$db " ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds end cat > doAnno.csh << 'EOF' foreach f (../../maf/*.maf) set b = $f:t echo $f nice mafAddIRows -nBeds=nBeds $f \ /cluster/data/hg18/hg18.2bit ../maf/$b end 'EOF' #<< happy emacs csh doAnno.csh >&! doAnno.log & # Load MAF table with annotated mafs. Also load summary table. ssh hgwdev set mdir = /cluster/data/encode/TBA/DEC-07/2008-01-10/anno/maf cd $mdir set gdir = /gbdb/hg18/encode/TBA/DEC-07/2008-01-10/maf rm -f $gdir/*.maf mkdir -p $gdir ln -s $mdir/*.maf $gdir hgLoadMaf -pathPrefix=$gdir -WARN hg18 encodeTbaAlignDec07 >&! load.log & cat *.maf | hgLoadMafSummary hg18 encodeTbaSummaryDec07 stdin cd .. # Reannotate with newer mafAddIRows having distinctive rows for # tandem dups (by request of JK) # 2008-10-23 kate # again (another fix to mafAddIRows) 2008-10-27 kate ssh kolossus cd /cluster/data/encode/TBA cd DEC-07/2008-01-10 cd anno/run # edit doAnno.csh to use new version csh doAnno.csh >&! doAnno.log & ssh hgwdev set mdir = /cluster/data/encode/TBA/DEC-07/2008-01-10/anno/maf cd $mdir set gdir = /gbdb/hg18/encode/TBA/DEC-07/2008-01-10/maf hgLoadMaf -pathPrefix=$gdir -WARN hg18 encodeTbaAlignDec07 >&! 
load.log & # Ignore 'score too small' errors cat *.maf | hgLoadMafSummary hg18 encodeTbaSummaryDec07 stdin #Created 141213 summary blocks from 8144409 components and 389847 mafs from stdin #Loading into hg18 table encodeTbaSummaryDec07... # Gene frames ssh hgwdev cd /cluster/data/encode/TBA/DEC-07/2008-01-10 mkdir frames cd frames # Pick gene tables, according to the following criteria: # KG if present, else refGene if >10000 entries, else ensGene (unless dog), # else mgcGenes, else mrnas if > 10000 else none. In all cases # except none, add in refGene. # NOTE: shortcut by using sources from hg18 multiz framing # (added braney 2008-03-01) use geneCode for hg18, no # genes from ponAbe2 hg18: encodeGencodeGeneKnownMar07 bosTau3: mrna canFam2: mrna galGal3: mrna monDom4: ensGene panTro2: refGene rheMac2: ensGene rn4: knownGene mm9: knownGene # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using mrna table as genes cat > getGenes.csh << 'EOF' rm -fr genes mkdir -p genes set mrnaDbs = "bosTau3 canFam2 galGal3" foreach queryDb ($mrnaDbs) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $queryDb > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using encodeGencodeGeneKnownMar07 for hg18 # using knownGene for rn4 mm9 # using refGene for panTro2 # using ensGene for monDom4, rheMac2 # 
genePreds; (must keep only the first 10 columns for knownGene) #set geneDbs = "hg18 mm9 rn4 panTro2 monDom4 rheMac2 ponAbe2" # NOTE: next time include ponAbe2, using ensGene set geneDbs = "hg18 mm9 rn4 panTro2 monDom4 rheMac2" foreach queryDb ($geneDbs) if ($queryDb == "monDom4" || $queryDb == "rheMac2") then set geneTbl = ensGene else if ($queryDb == "panTro2") then set geneTbl = refGene else if ($queryDb == "rn4" || $queryDb == "mm9") then set geneTbl = knownGene else if ($queryDb == "hg18") then set geneTbl = encodeGencodeGeneMar07 endif hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz end 'EOF' csh getGenes.csh >&! getGenes.log & ssh kkstore03 cd /cluster/data/encode/TBA/DEC-07/2008-01-10/frames (cat ../maf/*.maf | nice genePredToMafFrames hg18 stdin stdout bosTau3 genes/bosTau3.gp.gz canFam2 genes/canFam2.gp.gz galGal3 genes/galGal3.gp.gz hg18 genes/hg18.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz monDom4 genes/monDom4.gp.gz | nice gzip > mafFrames.gz) >& frames.log & ssh hgwdev cd /cluster/data/encode/TBA/DEC-07/2008-01-10/frames nice hgLoadMafFrames hg18 encodeTbaFramesDec07 mafFrames.gz >& loadFrames.log & # Post downloads ssh kkstore03 cd /cluster/data/encode/TBA/DEC-07/2008-01-10/ mkdir downloads cd anno/maf # redo to include re-annotated mafs (with 'T' lines for tandem dups) # 2008-11-06 kate tar cvfz ../../downloads/encodeTba.maf.tgz *.maf # Obtain sequence freeze ssh kkstore03 cd /cluster/data/encode/MSA mkdir -p DEC-07/lab cd DEC-07/lab wget -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/encode/freeze/DEC-2007/DEC-2007.tar.gz # Received README.txt for sequence freeze # and encodeTbaAlign.html update from Gayle McEwen, 6/12/08 cp encodeTbaAlign_DEC-2007.html 
~/kent/src/hg/makeDb/trackDb/human/hg18/encodeTbaAlignDec07.html # checkin to CVS cd .. mkdir downloads cd downloads ln -s /cluster/data/encode/MSA/DEC-07/lab/DEC-2007.tar.gz . ln -s /cluster/data/encode/MSA/DEC-07/lab/seq sequences # Received species tree from Gayle 8/08 cp ../lab/conserved.mod tree_4d.tba.nh # edit to remove phastCons-specific header # edit tree to remove species not in this dataset: gorilla, lemur, black_lemur, sheep, # muntjak_indian, ajbat, cpbat, eehedgehog, wallaby, dunnart, torgoise, xenopus # tetraodon, fugu, zebrafish, pig tail +2 seq/metadata.txt | awk '{print $1}' | sort | uniq > species.txt echo `cat species.txt|sed 's/$/,/'` | sed 's/ //g' > speciesList.txt /cluster/bin/phast/tree_doctor --prune-all-but `cat speciesList.txt` ../lab/tree_Dec2007.nh | sed 's/:0.000000//g' > species36.nh # Create tree image with phyloGif -- use 700 height, preserve underscores # encode_36way.gif cp encode_36way.gif ~/browser/images/phylo/ # checkin to CVS cp ../lab/README_DEC-2007.txt README.txt # fix typo -- it's the Dec not Sep freeze # edit DIRECTORY structure section a bit to reflect this downloads organiatoin # post for download ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg18/encode mkdir -p MSA/DEC-2007 cd MSA/DEC-2007 ln -s /cluster/data/encode/MSA/DEC-07/downloads/{README.txt,DEC-2007.tar.gz} . ln -s /cluster/data/encode/MSA/DEC-07/downloads/{tree_4d.tba.nh,species36.nh} . cp ~/browser/images/phylo/encode_36way.gif . mkdir -p alignments/TBA/ cd alignments/TBA ln -s /cluster/data/encode/TBA/DEC-07/2008-01-10/downloads/encodeTba.maf.tgz encodeTbaDec07.maf.tgz # Conservation mkdir -p ChaiCons/2008-03-24/lab BinCons/2008-03-24/lab # copy files from Gayle McEwan email # binCons files are formatted start end name score, # where score is always 1000 # Lift these to hg18 coordinates, and remove score field. 
echo "select chromStart, name, chromEnd-chromStart, chrom from encodeRegions" | hgsql -N hg18 | sed 's/$/\t30000000/' > /cluster/data/encode/MSA/encodeRegions.lft liftUp ChaiCons.bed /cluster/data/encode/MSA/encodeRegions.lft warn lab/CHAI.bed wc -l ChaiCons.bed3 lab/CHAI.bed #208916 ChaiCons.bed #208916 lab/CHAI.bed awk '{printf "%s\t%d\t%d\tchai.%d\n", $1, $2, $3, NR}' ChaiCons.bed3 > ChaiCons.bed4 hgLoadBed hg18 encodeTbaChaiConsDec07 ChaiCons.bed4 # Loaded 208916 elements of size 3 cd ../../BinCons/2008-03-24 liftUp -type=.bed stdout /cluster/data/encode/MSA/encodeRegions.lft warn lab/BINCONS.bed |\ sed 's/1000$//' > BinCons.bed wc -l BinCons.bed lab/BINCONS.bed # 117793 BinCons.bed # 117836 lab/BINCONS.bed # difference due to blank lines in source file: grep '^$' lab/* | wc -l # 43 hgLoadBed hg18 encodeTbaBinConsDec07 BinCons.bed ############################################################################# # encodeGencodeGeneKnownMar07 (2010-04-07 markd) # Was discovered to be corrupted on hgwdev and all servers # cd /cluster/data/encode/convertHg18/genePred genePredCheck -db=hg18 encodeGencodeGeneKnownMar07.tab checked: 2991 failed: 0 hgLoadGenePred -genePredExt hg18 encodeGencodeGeneKnownMar07 encodeGencodeGeneKnownMar07.tab genePredCheck -db=hg18 encodeGencodeGeneKnownMar07