#!/bin/csh
exit;

# start with Mercator data from Colin Dewey, and copy it to working directory:
# /cluster/data/encode/ortho3/mercatorInput/encodeMaps20050304.tar.gz
#
#-----Original Message-----
#From: Colin Dewey [mailto:cdewey@eecs.berkeley.edu]
#Sent: Friday, March 04, 2005 1:09 PM
#To: Daryl Thomas
#Subject: Re: hg17 ENCODE region coordinates
#
#Hi Daryl,
#
#Thanks for the coordinates.  I should have realized that I could grab
#these from the table browser, sorry!  The table browser is very nice
#now... I was happily surprised that I could do table joins!
#
#Here are our new orthology mappings.  Everything is in the same form as
#last time, so hopefully there will be no problems.  But do let me know
#if you run into anything.
#
#Also, it seems that our group has yet to get alignments to you in MAF
#format.  So I've decided to take on this task.  When I'm done, perhaps
#I can send you our alignments from the last freeze just to make sure
#that I have the formats right?
#
#Thanks,
#
#Colin

# table definition: ${HOME}/kent/src/hg/lib/encodeRegionsMercator.sql
#create table encodeRegionsMercator (
#    bin        smallint(5) unsigned not null,
#    chrom      varchar(255) not null,
#    chromStart int(10) unsigned not null,
#    chromEnd   int(10) unsigned not null,
#    name       varchar(255) not null,
#    score      int(10) unsigned not null,
#    strand     char(1) not null
#)

### :::::::::::::: mercatorInput/makeBed5.pl ::::::::::::::
#!/usr/bin/perl -W
while (<>) {
    chomp();
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN\D\d\d\d)\.(\d+)_of.*/) {
        printf("$1\t$2\t$3\t$4\t$5\n");
    } else {
        die "Bad input line: $_";
    }
}
###

set dir = /cluster/data/encode/ortho3/mercatorInput
mkdir -p $dir
cd $dir
tar xfz encodeMaps20050304.tar.gz
chmod g+w $dir/*
foreach db (bosTau1 canFam1 danRer2 galGal2 mm5 monDom1 panTro1 rn3 tetNig1)
    echo ========= $db ==========
    rm -f $db.log $db.unmerged.bed5 $db.original.bed
    mv -f $db.bed $db.original.bed
    makeBed5.pl < $db.original.bed > $db.unmerged.bed5
    liftOverMerge -mergeGap=20000 -verbose=2 $db.unmerged.bed5 $db.bed5 >& $db.log
    hgLoadBed -tab -sqlTable=${HOME}/kent/src/hg/lib/encodeRegionsMercator.sql \
        $db encodeRegionsMercator $db.original.bed >>& $db.log
    hgLoadBed -tab $db encodeRegionsMercatorMerged $db.bed5 >>& $db.log
end
rm -f bed.tab
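# Spot-check of the name parsing in mercatorInput/makeBed5.pl above (a
# sketch, not from the original run; the input line is hypothetical --
# Mercator names look like "ENm002.3_of_5"):
echo "chr6 41405894 41444557 ENm002.3_of_5" | ./makeBed5.pl
# expected (tab-separated): chr6 41405894 41444557 ENm002 3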
########################################################################
#############  liftOver orthologous regions  ###########################
########################################################################

set dir = /cluster/data/encode/ortho3/liftOverRegions
mkdir -p $dir
cd $dir

hgsql hg17 -e "SELECT * FROM encodeRegions ORDER BY name" | tail +2 > hg17.bed
hgsql -h genome-testdb hgcentraltest \
    -e "select toDb, path from liftOverChain" \
    | grep /gbdb/hg17 | grep -v hg16 > hg17.over.chain.files

########################################################################
# CHIMP
# THESE NOTES ARE FROM ORTHO2
# used chains from reciprocal best net -- these were more
# chopped up, but regions in these don't span unbridged gaps
# minMatch .7 leaves ENr324 and ENm011 unmapped
# Lowered match threshold until region coverage looked good (to .4)
# (except m11, for some reason), with minimum of fragmentation
# Same at .3 and .2 -- at the point where regions look reasonable,
# lowering the threshold in this way doesn't change them.
# NOTE: reducing minSizeQ to 10000, due to fragmented panTro1 assembly
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=10000 -multiple -chainTable=hg17.chainPanTro1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain \
    panTro1.unmerged.bed5 panTro1.unmapped.bed -verbose=2 >& panTro1.log
liftOverMerge -mergeGap=20000 -verbose=2 panTro1.unmerged.bed5 panTro1.bed5 \
    >>& panTro1.log
wc -l panTro1.*bed*
#  81 panTro1.bed5
#   0 panTro1.unmapped.bed
# 107 panTro1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' panTro1.bed5 > panTro1.bed
hgLoadBed -noBin panTro1 encodeRegionsLiftOver panTro1.bed >>& panTro1.log

########################################################################
# RAT
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=20000 -multiple -chainTable=hg17.chainRn3 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain \
    rn3.unmerged.bed5 rn3.unmapped.bed -verbose=2 >& rn3.log
liftOverMerge -mergeGap=20000 -verbose=2 rn3.unmerged.bed5 rn3.bed5 >>& rn3.log
wc -l rn3.*bed*
#  53 rn3.bed5
#   0 rn3.unmapped.bed
#  65 rn3.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' rn3.bed5 > rn3.bed
hgLoadBed -noBin rn3 encodeRegionsLiftOver rn3.bed >>& rn3.log

########################################################################
# CHICKEN
# allow smaller chain sizes in query (only 1K, vs. 20K in close species)
liftOver -minMatch=.01 -minSizeT=4000 \
    -multiple -minSizeQ=1000 -chainTable=hg17.chainGalGal2 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain \
    galGal2.unmerged.bed5 galGal2.unmapped.bed -verbose=2 >& galGal2.log
liftOverMerge -mergeGap=20000 -verbose=2 galGal2.unmerged.bed5 galGal2.bed5 \
    >>& galGal2.log
wc -l galGal2.*bed*
#  94 galGal2.bed5
#   4 galGal2.unmapped.bed
# 131 galGal2.unmerged.bed5
## Partially deleted in new assembly:
#chr13  29418015   29918015   ENr111
#chr5  141880150  142380150   ENr212
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' galGal2.bed5 > galGal2.bed
hgLoadBed -noBin galGal2 encodeRegionsLiftOver galGal2.bed >>& galGal2.log

########################################################################
# MOUSE MM5
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain \
    mm5.unmerged.bed5 mm5.unmapped.bed -verbose=2 >& mm5.log
liftOverMerge -mergeGap=20000 -verbose=2 mm5.unmerged.bed5 mm5.bed5 >>& mm5.log
wc -l mm5.*bed*
#  54 mm5.bed5
#   0 mm5.unmapped.bed
#  60 mm5.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' mm5.bed5 > mm5.bed
hgLoadBed -noBin mm5 encodeRegionsLiftOver mm5.bed >>& mm5.log

########################################################################
# DOG
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=20000 -multiple -chainTable=hg17.chainCanFam1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain \
    canFam1.unmerged.bed5 canFam1.unmapped.bed -verbose=2 >& canFam1.log
liftOverMerge -mergeGap=20000 -verbose=2 canFam1.unmerged.bed5 canFam1.bed5 \
    >>& canFam1.log
wc -l canFam1.*bed*
#  51 canFam1.bed5
#   0 canFam1.unmapped.bed
#  58 canFam1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' canFam1.bed5 > canFam1.bed
hgLoadBed -noBin canFam1 encodeRegionsLiftOver canFam1.bed >>& canFam1.log
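# For any of the species above, per-region coverage can be eyeballed by
# summing lifted bases per region before merging (a sketch, not part of
# the original pipeline; canFam1 shown as an example):
awk '{len[$4] += $3 - $2} END {for (r in len) printf "%s\t%d\n", r, len[r]}' \
    canFam1.unmerged.bed5 | sort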
########################################################################
# OPOSSUM
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=5000 -multiple -chainTable=hg17.chainMonDom1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain \
    monDom1.unmerged.bed5 monDom1.unmapped.bed -verbose=2 >& monDom1.log
liftOverMerge -mergeGap=20000 -verbose=2 monDom1.unmerged.bed5 monDom1.bed5 \
    >>& monDom1.log
wc -l monDom1.*bed*
# 143 monDom1.bed5
#   0 monDom1.unmapped.bed
# 191 monDom1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' monDom1.bed5 > monDom1.bed
hgLoadBed -noBin monDom1 encodeRegionsLiftOver monDom1.bed >>& monDom1.log

########################################################################
# TETRA
rm -f tetNig1.log tetNig1.*bed*
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=1000 -multiple -chainTable=hg17.chainTetNig1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain \
    tetNig1.unmerged.bed5 tetNig1.unmapped.bed -verbose=2 >& tetNig1.log
liftOverMerge -mergeGap=20000 -verbose=2 tetNig1.unmerged.bed5 tetNig1.bed5 \
    >>& tetNig1.log
wc -l tetNig1.*bed*
# 197 tetNig1.bed5
#  12 tetNig1.unmapped.bed
# 225 tetNig1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' tetNig1.bed5 > tetNig1.bed
hgLoadBed -noBin tetNig1 encodeRegionsLiftOver tetNig1.bed >>& tetNig1.log

########################################################################
# ZEBRAFISH
rm -f danRer2.log danRer2.*bed*
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=1000 -multiple -chainTable=hg17.chainDanRer2 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain \
    danRer2.unmerged.bed5 danRer2.unmapped.bed -verbose=2 >& danRer2.log
liftOverMerge -mergeGap=20000 -verbose=2 danRer2.unmerged.bed5 danRer2.bed5 \
    >>& danRer2.log
wc -l danRer2.*bed*
# 238 danRer2.bed5
#   2 danRer2.unmapped.bed
# 278 danRer2.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' danRer2.bed5 > danRer2.bed
hgLoadBed -noBin danRer2 encodeRegionsLiftOver danRer2.bed >>& danRer2.log

########################################################################
# COW
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=20000 -multiple -chainTable=hg17.chainBosTau1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain \
    bosTau1.unmerged.bed5 bosTau1.unmapped.bed -verbose=2 >& bosTau1.log
liftOverMerge -mergeGap=20000 -verbose=2 bosTau1.unmerged.bed5 bosTau1.bed5 \
    >>& bosTau1.log
wc -l bosTau1.*bed*
#
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' bosTau1.bed5 > bosTau1.bed
hgLoadBed -noBin bosTau1 encodeRegionsLiftOver bosTau1.bed >>& bosTau1.log
rm -f bed.tab

########################################################################
#############  Create consensus regions          #######################
#############  (Union of liftOver and Mercator)  #######################
########################################################################

### :::::::::::::: consensus/makeBed5.pl ::::::::::::::
#!/usr/bin/perl -W
# assumes that input is sorted by name (region)
$region = "";
$regionCounter = 0;
while (<>) {
    chomp();
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d)/) {
        if ($region eq $4) {
            $regionCounter++;
        } else {
            $region = $4;
            $regionCounter = 1;
        }
        printf("$1\t$2\t$3\t$4\t$regionCounter\n");
    } elsif (/^(.*)\s+(\d+)\s+(\d+)\s+(MEN[mr]\d\d\d)/) {
        # regions with names that begin with 'M' have been manually marked
        # for removal and can be ignored
        next;
    } else {
        die "Bad input line: $_";
    }
}
###
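# Numbering spot-check for consensus/makeBed5.pl above (a sketch with
# hypothetical input; pieces of the same region get consecutive numbers,
# and the counter resets on each new region name):
printf "chr1 100 200 ENm001\nchr1 300 400 ENm001\nchr2 50 80 ENm002\n" | ./makeBed5.pl
# expected (tab-separated):
#   chr1 100 200 ENm001 1
#   chr1 300 400 ENm001 2
#   chr2 50 80 ENm002 1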
# create consensus regions and load them
set dir = /cluster/data/encode/ortho3/consensus
mkdir -p $dir
cd $dir
foreach db (panTro1 rn3 galGal2 mm5 canFam1 monDom1 tetNig1 danRer2)
    rm -f $db.log $db.*.bed* $db.encodeRegionsConsensus*
    # args: <db.table (liftOver)> <db.table (Mercator)> <out file> <err file>
    regionOrtho $db.encodeRegionsLiftOver $db.encodeRegionsMercatorMerged \
        $db.encodeRegionsConsensusUnmerged.bed $db.regionOrtho.err >& $db.log
    sort -k4 -k1.4n -k2n $db.encodeRegionsConsensusUnmerged.bed \
        | makeBed5.pl > $db.encodeRegionsConsensusUnmerged.bed5
    liftOverMerge -mergeGap=20000 -verbose=2 \
        $db.encodeRegionsConsensusUnmerged.bed5 \
        $db.encodeRegionsConsensus1.bed5 >>& $db.log
    awk '{printf "%s\t%d\t%d\t%s_%s\n", $1, $2, $3, $4, $5}' \
        $db.encodeRegionsConsensus1.bed5 > $db.encodeRegionsConsensus1.bed
    hgLoadBed $db encodeRegionsConsensus -tab \
        $db.encodeRegionsConsensus1.bed >>& $db.log
    # remove the regions that are smaller than 2 kb and update the local
    # file from the table
    hgsql $db -e "select * from encodeRegionsConsensus where chromEnd-chromStart<2000" > $db.smallRegions.bed
    hgsql $db -e "delete from encodeRegionsConsensus where chromEnd-chromStart<2000"
    hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" > $db.encodeRegionsConsensus2.bed
    sort -k4 -k1.4n -k2n $db.encodeRegionsConsensus2.bed \
        | makeBed5.pl > $db.encodeRegionsConsensus.bed5
    awk '{printf "%s\t%d\t%d\t%s_%s\n", $1, $2, $3, $4, $5}' \
        $db.encodeRegionsConsensus.bed5 > $db.encodeRegionsConsensus.bed
    hgLoadBed $db encodeRegionsConsensus -tab \
        $db.encodeRegionsConsensus.bed >>& $db.log
end

########################################################################
#############  Make html frames pages for analysis  ####################
########################################################################

set dir = /cluster/data/encode/ortho3/html
mkdir -p $dir
cd $dir
rm -f positionFile
hgsql -e "select chrom, chromStart+1, chromEnd, name from encodeRegions order by name" hg17 \
    | tail +2 | awk '{printf "%s:%d-%d\t%s\n",$1,$2,$3,$4}' > positionFile
cp -f ${HOME}/kent/src/hg/encode/regionOrtho/mkOrthologAllFrame.pl \
    /cluster/bin/scripts/mkOrthologAllFrame.pl
chmod ug+x /cluster/bin/scripts/mkOrthologAllFrame.pl
# 20-30 seconds each - lots of nib lookups
foreach db (panTro1 rn3 galGal2 mm5 canFam1 monDom1 tetNig1 danRer2)
    echo `date` $db
    rm -f $db.html
    mkOrthologAllFrame.pl descriptionFile positionFile headerFile hg17 \
        $db encodeRegionsConsensus encodeRegionsLiftOver encodeRegionsMercator \
        > $db.html
end
set outDir = /usr/local/apache/htdocs/encode/ortho3
cp -f *.html $outDir
chmod o+r $outDir/*.html

## prune excessively large mercator regions
set dir = /cluster/data/encode/ortho3/mercatorPrune
mkdir -p $dir
cd $dir
rm -f updateDatabase.sql
grep -P "\d" *Regions.bed \
    | awk '{printf "%s\t%d\t%d\t%s\t%d\n",$1,$2,$3,$4,$3-$2}' \
    | updateDatabase.pl > updateDatabase.sql
hgsql hg17 < updateDatabase.sql
## recompute consensus and remake the html files by copying and executing
## the code from the sections above
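# To see which merged Mercator regions are pruning candidates, a query like
# this can be run per assembly (a sketch; the 2 Mb cutoff is illustrative,
# not the threshold used above):
hgsql canFam1 -e "select chrom, chromStart, chromEnd, name, chromEnd-chromStart as len from encodeRegionsMercatorMerged where chromEnd-chromStart > 2000000 order by len desc"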
########################################################################
###############  Write AGP files from consensus regions  ###############
########################################################################

#!/bin/csh -f
# "Make doc" for creating AGP files for ENCODE ortholog regions based on
# coordinates in bed files in the "consensus" directory
## based on makeAgp2.doc

rm -f *.contig.tab */*.packing.list >& /dev/null

# TODO: contigAcc files should go in $outDir (neater this way)
set orthoDir = /cluster/data/encode/ortho3
set agpDir = $orthoDir/agp
set consensusDir = $orthoDir/consensus
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
set regionAgp = regionAgp
mkdir -p $agpDir $downloadDir
chmod o+rx $downloadDir
rm -rf $agpDir/* $downloadDir/*
cd $agpDir

### :::::::::::::: agp/makeBed5.pl ::::::::::::::
#!/usr/bin/perl -W
# assumes that input is already numbered (names like ENm001_1);
# splits the name into region and piece number, skipping the header line
while (<>) {
    chomp();
    if (/chromStart/) { next; }
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d).(\d+)/) {
        printf("$1\t$2\t$3\t$4\t$5\n");
    } else {
        die "Bad input line: $_";
    }
}
###

########################################################################
# create script to run AGP-maker on an assembly
rm -rf genomeAgps.csh >& /dev/null
cat > genomeAgps.csh << 'EOF'
# create AGP's for a genome assembly
# NOTE: should have freeze dir (ortho3) as an arg
# WARNING: this script uses $consensusDir/*.bed5, but instructions
# below create a separate version, from the encodeRegionsConsensus table
# Suggest next time using just the $consensusDir everywhere
if ($#argv != 3) then
    echo "usage: $0 <db> <org> <outDir>"
    exit 1
endif
echo
echo command line: $0 $1 $2 $3
echo working dir: `pwd`
set consensusDir = /cluster/data/encode/ortho3/consensus
set db = $1
set org = $2
set outDir = $3
set buildDir = /cluster/data/$db
set bedFile = $consensusDir/$db.encodeRegionsConsensus.bed5
set regionAgp = regionAgp
mkdir -p $outDir
if (-f $org.contig.tab) then
    set contigArg = "-contigFile=$org.contig.tab"
else
    set contigArg = ""
endif
set runRegionAgp = "$regionAgp $contigArg -namePrefix=${org}_ $bedFile stdin -dir $outDir"
if (-f $buildDir/chrom.lst) then
    cat $buildDir/chrom.lst | \
        xargs -iX cat $buildDir/X/chr{X,X_random}.agp | $runRegionAgp
else
    cat $buildDir/?{,?}/*.agp | $runRegionAgp
endif
'EOF'
# << this line makes emacs coloring happy

########################################################################
# create test scripts
mkdir -p /cluster/data/encode/ortho3/agp/tests
cd /cluster/data/encode/ortho3/agp/tests
cat > getSeqFromAcc.pl << 'EOF'
#!/usr/bin/perl -W
# Fetch contig sequences by accession (read from stdin) into <db>.contig.fa.
# NOTE: the front of this script (argument handling and the $URL setting)
# was garbled in this copy of the doc and is reconstructed minimally here;
# $URL should be a GenBank fasta-fetch URL taking an accession as its
# final parameter.
my $db = shift;
my $freeze = shift;     # accepted but unused here
my $dir = "$db.contig.fa";
mkdir $dir;
my $URL = "";           # lost in extraction -- set to the sequence-fetch CGI URL
while (<>) {
    chomp();
    system("wget -O $dir/$_ \"$URL$_\"");
}
'EOF'
# << this line makes emacs coloring happy
chmod ug+x getSeqFromAcc.pl

cd /cluster/data/encode/ortho3/agp/tests
cat > testFa.csh << 'EOF'
set db = $1
set org = $2
foreach f (../$db/*.agp)
    set seq = $f:t:r
    set region = `echo $seq | sed "s/${org}_//"`
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/ortho3/consensus/$db.bed`
    set chr = $coords:r:r
    nibFrag /cluster/data/$db/nib/${chr}.nib \
        $coords:r:e $coords:e + $db.nibTest/$db.$region.fa
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa
end
'EOF'
# << for emacs
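# Usage sketch for genomeAgps.csh above (mirrors the per-species sections
# below; by convention the output dir is just the db name):
csh genomeAgps.csh mm5 mouse mm5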
########################################################################
# MOUSE AGP'S  (DONE 2005-03-17 kate)
#!/bin/csh -fe
cd /cluster/data/encode/ortho3/agp
set db = mm5
set org = mouse

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" \
    | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 56

# Create AGP's
csh genomeAgps.csh $db $org $db

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir $db.test $db.nibTest
# note: tab required in grep (retype when pasting)
# watch out for hard-coded freeze names in scripts
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions

# download contigs -- not needed if already run
# NOTE: preferable to extract these from the build dir
./getSeqFromAcc.pl $db < $db.accessions >& $db.log
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' \
    $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ortho3 >&! testFa.$org.log &
head testFa.$org.log
# verify that all regions are the same
grep -v same testFa.$org.log
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Mus musculus" 10090 C57BL/6J MAY-2004 $db "NCBI Build 33" \
    > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 56

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
chmod o+r $downloadDir/$db.*
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 56

########################################################################
# RAT AGP'S  (DONE 2005-03-16 kate)
cd /cluster/data/encode/ortho3/agp
set db = rn3
set org = rat
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

# create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" \
    | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5

# Get contig to accession map
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
csh genomeAgps.csh $db $org $db

# Test
cd tests
mkdir -p $db.test $db.nibTest
# note: tab required in grep (retype when pasting)
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions

# download contig sequences to $db.contig.fa directory
# note: this doesn't handle errors well -- better to extract from
# build dir contig fasta files
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log

# verify all contigs were transferred
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
#./accHeader.pl $db.contig.fa $db >>& $db.log
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' \
    $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ortho3 >&! testFa.$org.log
grep -v same testFa.$org.log
cd ..
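# The literal-tab grep above is easy to get wrong when pasting; an
# equivalent awk (a sketch, run from the tests dir) keys on AGP column 5,
# where "N" marks gap lines and column 6 holds the accession:
cat ../$db/${org}_EN*.agp | awk '$5 != "N" {print $6}' > $db.accessions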
# Create packing list
$encodeRegionPackingList $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Rattus norvegicus" 10116 BN/SsNHsdMCW JUN-2003 $db "Baylor HGSC v3.1" \
    > $db/$db.packing.list

# Copy to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
chmod o+r $downloadDir/*
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l

########################################################################
# CHICKEN AGP'S  (DONE 2005-03-16 kate)
cd /cluster/data/encode/ortho3/agp
set db = galGal2
set org = chicken

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" \
    | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 92

# Get contig to accession mapping (documented in makeGalGal2.doc)
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
# errors here: missing /cluster/data/galGal2/chr*_random.agp and chrM.agp
# files, but there are no ENCODE orthologous sequences on these chicken
# chromosomes
csh genomeAgps.csh $db $org $db

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# note: tab required in grep (retype when pasting)
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' \
    $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ortho3 >&! testFa.$org.log &
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Gallus gallus" 9031 N/A FEB-2004 $db "CGSC Feb. 2004" \
    > $db/$db.packing.list
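# Cross-check (a sketch): the Region count reported below should equal the
# piece count of the bed5 the packing list was built from:
wc -l < $db/$db.encodeRegionsConsensus.bed5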
2004" > $db/$db.packing.list grep Region $db/$db.packing.list | wc -l # 92 # Copy to downloads area set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp cp -f $db/$db.packing.list $downloadDir tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp tar tvfz $downloadDir/$db.agp.tar.gz | wc -l # 92 ######################################################################## # DOG AGP'S (DONE 2006-03-16 kate) set db = canFam1 set org = dog cd /cluster/data/encode/ortho3/agp set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList # create consensus list mkdir $db hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5 hgsql $db -s -e "select * from contigAcc" > $org.contig.tab # Create AGP's csh genomeAgps.csh $db $org $db # ignore error messages for random chrom's and chrM (not in dog assembly) # Tests cd tests mkdir -p $db.test $db.nibTest # type the literal tab in cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions head $db.accessions ./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log & # verify download worked ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc wc -l $db.redo.acc # download if any left awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa cat ../$db/*.agp > $db.contig.agp agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa faSplit byname $db.fa $db.test/ csh testFa.csh $db $org ortho3 >&! testFa.$org.log head testFa.$org.log grep -v same testFa.$org.log # no output -> passed test cd .. # Create packing list /cluster/data/encode/bin/scripts/encodeRegionPackingList \ $db/$db.encodeRegionsConsensus.bed5 \ $db $org "Canis Familiaris" 9615 N/A JUL-2004 $db \ "Broad Institute v. 
1.0" > $db/$db.packing.list # Copy to downloads area cp -f $db/$db.packing.list $downloadDir tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp hgsql -e "select count(*) from $db.encodeRegionsConsensus" tar tvfz $downloadDir/$db.agp.tar.gz | wc -l grep Region $downloadDir/$db.packing.list | wc -l # 52 ######################################################################## # CHIMP AGP'S (DONE 2005-03-16 kate) set db = panTro1 set org = chimp cd /cluster/data/encode/ortho3/agp set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList # create consensus list mkdir $db hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5 hgsql $db -s -e "select * from contigAcc" > $org.contig.tab # Create AGP's # NOTE: next time, put chimpChromContigs.agp into $outDir cat /cluster/data/$db/?{,?}/*.agp | \ chimpChromContigAgp stdin /cluster/data/$db/assembly.agp chimpChromContigs.agp regionAgp -contigFile=$org.contig.tab -namePrefix=${org}_ \ $db/$db.encodeRegionsConsensus.bed5 chimpChromContigs.agp -dir $db # Create packing list /cluster/data/encode/bin/scripts/encodeRegionPackingList \ $db/$db.encodeRegionsConsensus.bed5 \ $db $org "Pan troglodytes" 9598 N/A NOV-2003 $db \ "NCBI Build 1 v1" > $db/$db.packing.list # Test cd tests mkdir -p $db.test $db.nibTest # type the literal tab in cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions head $db.accessions ./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log & # verify download worked ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc wc -l $db.redo.acc # download if any left awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa cat ../$db/*.agp > $db.contig.agp agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa faSplit byname $db.fa $db.test/ csh testFa.csh $db $org ortho3 >&! testFa.$org.log head testFa.$org.log grep -v same testFa.$org.log # no output -> passed test cd .. 
# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l
# 81

########################################################################
# OPOSSUM AGP'S  (DONE 2005-03-17 kate)
set db = monDom1
set org = opossum
cd /cluster/data/encode/ortho3/agp

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" \
    | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 146

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
gunzip -c /cluster/data/monDom1/broad.mit.edu/assembly.agp.gz > $db.assembly.agp
regionAgp -contigFile=$org.contig.tab -namePrefix=${org}_ \
    $db/$db.encodeRegionsConsensus.bed5 $db.assembly.agp -dir $db
rm $db.assembly.agp

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in the grep pattern
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log &

# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download again if any are left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' \
    $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
# monDom1 is scaffold-based, so test with twoBitToFa instead of nibFrag
cat > testScaffoldFa.csh << 'EOF'
set db = $1
set org = $2
set freeze = $3
foreach f (../$db/*.agp)
    set seq = $f:t:r
    set region = `echo $seq | sed "s/${org}_//"`
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/$freeze/html/$db.encodeRegionsConsensus.bed`
    set chr = $coords:r:r
    twoBitToFa /cluster/data/$db/monDom1.2bit \
        -seq=$chr -start=$coords:r:e -end=$coords:e $db.nibTest/$db.$region.fa
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa
end
'EOF'
# << make emacs happy
csh testScaffoldFa.csh $db $org ortho3 >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..
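# Refresher on the csh modifiers used in testFa.csh and testScaffoldFa.csh
# above (illustrative value, not from the run):
set coords = scaffold_123.4567.8910
echo $coords:r:r $coords:r:e $coords:e
# scaffold_123 4567 8910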
# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Monodelphis domestica" 13616 N/A \
    OCT-2004 $db "Broad Institute" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 146

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 146

########################################################################
# TETRA AGP'S  (DONE 2005-03-17 kate)
set db = tetNig1
set org = tetra
cd /cluster/data/encode/ortho3/agp

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" \
    | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 185

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
csh genomeAgps.csh $db $org $db

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in the grep pattern
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log &

# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download again if any are left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' \
    $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ortho3 >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..
# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Tetraodon nigroviridis" 99883 N/A \
    FEB-2004 $db "Genoscope V7" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 185

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l

########################################################################
# ZEBRAFISH AGP'S  (DONE 2005-03-17 kate)
set db = danRer2
set org = zebrafish
cd /cluster/data/encode/ortho3/agp

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" \
    | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 247

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
csh genomeAgps.csh $db $org $db

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in the grep pattern
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log &

# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download again if any are left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' \
    $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ortho3 >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Danio rerio" 7955 N/A \
    JUN-2004 $db "Sanger Zv4" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 247

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 247

########################################################################
# cleanup
chmod -R o+r $downloadDir/* >& /dev/null

exit

########################################################################
# print summary stats
cat > stats.csh << 'EOF'
hgsql hg16 -N -e "select name from encodeRegions" | sort > allRegions
foreach f ($1/*.list)
    set db = $f:t:r:r
    set agps = `grep AgpFile $f | wc -l`
    set len = `awk '/SeqLength/ {len += $2} END {print len}' $f`
    set missing = `grep AgpFile $f | sed 's/.*\(EN.*\)_.*/\1/' | sort | uniq | diff - allRegions`
    printf "%-8s %5s %10s %s\n" $db $agps $len $missing
end
rm allRegions
'EOF'
csh stats.csh $downloadDir
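# Final check (a sketch): every assembly processed above should have both
# a packing list and an AGP tarball in the downloads area:
foreach db (panTro1 rn3 galGal2 mm5 canFam1 monDom1 tetNig1 danRer2)
    ls -l $downloadDir/$db.packing.list $downloadDir/$db.agp.tar.gz
end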