#!/bin/csh
exit;

set ortho=ortho4
set mercatorFile=encodeMaps20050504.tar.gz

# start with Mercator data from Colin Dewey, and copy it to working directory:
#   /cluster/data/encode/$ortho/mercatorInput/${mercatorFile}

# ${HOME}/kent/src/hg/lib/encodeRegionsMercator.sql
#create table encodeRegionsMercator (
#    bin smallint(5) unsigned not null,
#    chrom varchar(255) not null,
#    chromStart int(10) unsigned not null,
#    chromEnd int(10) unsigned not null,
#    name varchar(255) not null,
#    score int(10) unsigned not null,
#    strand char(1) not null
#)

### :::::::::::::: mercatorInput/makeBed5.pl ::::::::::::::
#!/usr/bin/perl -W
# split Mercator region names like ENm001.3_of_12 into name and part number
while (<>) {
    chomp();
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN\D\d\d\d)\.(\d+)_of.*/) {
        printf("$1\t$2\t$3\t$4\t$5\n");
    } else {
        die "Bad input line: $_";
    }
}
###

set dir = /cluster/data/encode/${ortho}/mercatorInput
mkdir -p $dir
cd $dir
tar xvfz $mercatorFile
mv macMul1.bed rheMac1.bed
chmod g+w $dir/*

foreach db (`ls -1 *bed | sed 's/.bed//g' | xargs echo`)
    echo ========= $db ========== `date`
    rm -f $db.log $db.unmerged.bed5
    makeBed5.pl < $db.bed > $db.unmerged.bed5
    liftOverMerge -mergeGap=20000 -verbose=2 $db.unmerged.bed5 $db.bed5 >& $db.log
    hgLoadBed -tab -sqlTable=${HOME}/kent/src/hg/lib/encodeRegionsMercator.sql \
        $db encodeRegionsMercator $db.bed >>& $db.log
    hgLoadBed -tab $db encodeRegionsMercatorMerged $db.bed5 >>& $db.log
end
rm -f bed.tab
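# (sanity check, not part of the original run: per-assembly row counts of the
# loaded merged tables should match the merged bed5 files)
foreach db (`ls -1 *bed | sed 's/.bed//g' | xargs echo`)
    echo $db `hgsql -N $db -e "select count(*) from encodeRegionsMercatorMerged"` \
        `wc -l < $db.bed5`
end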
########################################################################
#############  liftOver orthologous regions  ###########################
########################################################################
set dir = /cluster/data/encode/${ortho}/liftOverRegions
mkdir -p $dir
cd $dir
hgsql hg17 -e "SELECT * FROM encodeRegions ORDER BY name" | tail +2 > hg17.bed
hgsql -h genome-testdb hgcentraltest \
    -e "select toDb, path from liftOverChain" | grep /gbdb/hg17 | grep -v hg16 > hg17.over.chain.files

########################################################################
# CHIMP
# THESE NOTES ARE FROM ORTHO2
# used chains from reciprocal best net -- these were more
# chopped up, but regions in these don't span unbridged gaps
# minMatch .7 leaves ENr324 and ENm011 unmapped
# Lowered match threshold until region coverage looked good (to .4)
# (except m11, for some reason), with minimum of fragmentation
# Same at .3 and .2 -- at the point where regions look reasonable,
# lowering the threshold in this way doesn't change them.

# NOTE: reducing minSizeQ to 10000, due to fragmented panTro1 assembly
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=10000 -multiple \
    -chainTable=hg17.chainPanTro1 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain \
    panTro1.unmerged.bed5 panTro1.unmapped.bed -verbose=2 >& panTro1.log
liftOverMerge -mergeGap=20000 -verbose=2 panTro1.unmerged.bed5 panTro1.bed5 >>& panTro1.log
wc -l panTro1.*bed*
#  81 panTro1.bed5
#   0 panTro1.unmapped.bed
# 107 panTro1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' panTro1.bed5 > panTro1.bed
hgLoadBed -noBin panTro1 encodeRegionsLiftOver panTro1.bed >>& panTro1.log

########################################################################
# MACAQUE
# NOTE: reducing minSizeQ to 10000, due to fragmented rheMac1 assembly
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=10000 -multiple \
    -chainTable=hg17.chainRheMac1 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17.rheMac1.over.chain \
    rheMac1.unmerged.bed5 rheMac1.unmapped.bed -verbose=2 >& rheMac1.log
liftOverMerge -mergeGap=20000 -verbose=2 rheMac1.unmerged.bed5 rheMac1.bed5 >>& rheMac1.log
wc -l rheMac1.*bed*
# 495 rheMac1.bed5
#   0 rheMac1.unmapped.bed
# 544 rheMac1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' rheMac1.bed5 > rheMac1.bed
hgLoadBed -noBin rheMac1 encodeRegionsLiftOver rheMac1.bed >>& rheMac1.log

########################################################################
# RAT
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple \
    -chainTable=hg17.chainRn3 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain \
    rn3.unmerged.bed5 rn3.unmapped.bed -verbose=2 >& rn3.log
liftOverMerge -mergeGap=20000 -verbose=2 rn3.unmerged.bed5 rn3.bed5 >>& rn3.log
wc -l rn3.*bed*
#  53 rn3.bed5
#   0 rn3.unmapped.bed
#  65 rn3.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' rn3.bed5 > rn3.bed
hgLoadBed -noBin rn3 encodeRegionsLiftOver rn3.bed >>& rn3.log
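# (optional check, not part of the original run: total lifted bases per region,
# for eyeballing against the original hg17 region sizes; the _N part suffix
# added by the awk step above is stripped first)
awk '{sub(/_[0-9]+$/, "", $4); c[$4] += $3-$2} END {for (r in c) printf "%s\t%d\n", r, c[r]}' panTro1.bed | sort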
########################################################################
# CHICKEN
# allow smaller chain sizes in query (only 1K, vs. 20K in close species)
liftOver -minMatch=.01 -minSizeT=4000 -multiple -minSizeQ=1000 \
    -chainTable=hg17.chainGalGal2 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain \
    galGal2.unmerged.bed5 galGal2.unmapped.bed -verbose=2 >& galGal2.log
liftOverMerge -mergeGap=20000 -verbose=2 galGal2.unmerged.bed5 galGal2.bed5 >>& galGal2.log
wc -l galGal2.*bed*
#  94 galGal2.bed5
#   4 galGal2.unmapped.bed
# 131 galGal2.unmerged.bed5
##Partially deleted in new
#chr13   29418015    29918015    ENr111
#chr5    141880150   142380150   ENr212
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' galGal2.bed5 > galGal2.bed
hgLoadBed -noBin galGal2 encodeRegionsLiftOver galGal2.bed >>& galGal2.log

########################################################################
# MOUSE MM6
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17.mm6.over.chain.gz \
    mm6.unmerged.bed5 mm6.unmapped.bed -verbose=2 >& mm6.log
liftOverMerge -mergeGap=20000 -verbose=2 mm6.unmerged.bed5 mm6.bed5 >>& mm6.log
wc -l mm6.*bed*
#  53 mm6.bed5
#   0 mm6.unmapped.bed
#  67 mm6.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' mm6.bed5 > mm6.bed
hgLoadBed -noBin mm6 encodeRegionsLiftOver mm6.bed >>& mm6.log

########################################################################
# DOG
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple \
    -chainTable=hg17.chainCanFam1 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain \
    canFam1.unmerged.bed5 canFam1.unmapped.bed -verbose=2 >& canFam1.log
liftOverMerge -mergeGap=20000 -verbose=2 canFam1.unmerged.bed5 canFam1.bed5 >>& canFam1.log
wc -l canFam1.*bed*
#  51 canFam1.bed5
#   0 canFam1.unmapped.bed
#  58 canFam1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' canFam1.bed5 > canFam1.bed
hgLoadBed -noBin canFam1 encodeRegionsLiftOver canFam1.bed >>& canFam1.log

########################################################################
# OPOSSUM
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=5000 -multiple \
    -chainTable=hg17.chainMonDom1 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain \
    monDom1.unmerged.bed5 monDom1.unmapped.bed -verbose=2 >& monDom1.log
liftOverMerge -mergeGap=20000 -verbose=2 monDom1.unmerged.bed5 monDom1.bed5 >>& monDom1.log
wc -l monDom1.*bed*
# 143 monDom1.bed5
#   0 monDom1.unmapped.bed
# 191 monDom1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' monDom1.bed5 > monDom1.bed
hgLoadBed -noBin monDom1 encodeRegionsLiftOver monDom1.bed >>& monDom1.log

########################################################################
# TETRA
rm -f tetNig1.log tetNig1.*bed*
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=1000 -multiple \
    -chainTable=hg17.chainTetNig1 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain \
    tetNig1.unmerged.bed5 tetNig1.unmapped.bed -verbose=2 >& tetNig1.log
liftOverMerge -mergeGap=20000 -verbose=2 tetNig1.unmerged.bed5 tetNig1.bed5 >>& tetNig1.log
wc -l tetNig1.*bed*
# 137 tetNig1.bed5
#  14 tetNig1.unmapped.bed
# 167 tetNig1.unmerged.bed5
#
#Partially deleted in new
#chr5    141880150   142380150   ENr212
#
#Deleted in new
#chr2    51570355    52070355    ENr112
#chr4    118604258   119104258   ENr113
#chr16   25780427    26280428    ENr211
#chr14   52947075    53447075    ENr311
#chr11   130604797   131104797   ENr312
#chr16   60833949    61333949    ENr313
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' tetNig1.bed5 > tetNig1.bed
hgLoadBed -noBin tetNig1 encodeRegionsLiftOver tetNig1.bed >>& tetNig1.log
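# (optional summary, not part of the original run: tally the hg17 regions that
# failed to lift in one or more assemblies so far; in the .unmapped.bed files,
# each failed region's BED line follows a #-comment giving the reason)
cat *.unmapped.bed | grep -v '^#' | awk '{print $4}' | sort | uniq -c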
########################################################################
# ZEBRAFISH
rm -f danRer2.log danRer2.*bed*
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=1000 -multiple \
    -chainTable=hg17.chainDanRer2 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain \
    danRer2.unmerged.bed5 danRer2.unmapped.bed -verbose=2 >& danRer2.log
liftOverMerge -mergeGap=20000 -verbose=2 danRer2.unmerged.bed5 danRer2.bed5 >>& danRer2.log
wc -l danRer2.*bed*
# 238 danRer2.bed5
#   2 danRer2.unmapped.bed
# 278 danRer2.unmerged.bed5
#Deleted in new
#chr11   130604797   131104797   ENr312
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' danRer2.bed5 > danRer2.bed
hgLoadBed -noBin danRer2 encodeRegionsLiftOver danRer2.bed >>& danRer2.log

########################################################################
# COW
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple \
    -chainTable=hg17.chainBosTau1 hg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain \
    bosTau1.unmerged.bed5 bosTau1.unmapped.bed -verbose=2 >& bosTau1.log
liftOverMerge -mergeGap=20000 -verbose=2 bosTau1.unmerged.bed5 bosTau1.bed5 >>& bosTau1.log
wc -l bosTau1.*bed*
# 241 bosTau1.bed5
#   2 bosTau1.unmapped.bed
# 243 bosTau1.unmerged.bed5
#Deleted in new
#chr13   112338064   112838064   ENr132
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' bosTau1.bed5 > bosTau1.bed
hgLoadBed -noBin bosTau1 encodeRegionsLiftOver bosTau1.bed >>& bosTau1.log
rm -f bed.tab

########################################################################
#############  Create consensus regions           #####################
#############  (Union of liftOver and Mercator)   #####################
########################################################################

### :::::::::::::: consensus/makeBed5.pl ::::::::::::::
#!/usr/bin/perl -W
# assumes that input is sorted by name (region)
$region="";
$regionCounter=0;
while (<>) {
    chomp();
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d)/) {
        if ($region eq $4) {
            $regionCounter++;
        } else {
            $region=$4;
            $regionCounter=1;
        }
        printf("$1\t$2\t$3\t$4\t$regionCounter\n");
    } elsif (/^(.*)\s+(\d+)\s+(\d+)\s+(MEN[mr]\d\d\d)/) {
        # regions with names that begin with 'M' have been manually marked for
        # removal and can be ignored.
        next;
    } elsif (/chromStart/) {
        # header line
        next;
    } else {
        die "Bad input line: $_";
    }
}
###

# create consensus regions and load them
set dir = /cluster/data/encode/${ortho}/consensus
mkdir -p $dir
cd $dir
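# (spot check with made-up input, not part of the original run: this
# makeBed5.pl numbers the parts within each region and silently drops rows
# whose name was manually M-prefixed for removal)
printf 'chr1\t100\t200\tENr111\nchr1\t300\t400\tENr111\nchr2\t10\t20\tMENr112\n' | ./makeBed5.pl
# expect two output rows: ENr111 part 1 and ENr111 part 2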
date
foreach db (`ls -1 /cluster/data/encode/${ortho}/mercatorInput/*bed | sed 's/.bed//g' | cut -f7 -d"/" | grep -v '\.' | xargs echo`)
    rm -f $db.log $db.*.bed* $db.encodeRegionsConsensus*
    regionOrtho $db.encodeRegionsLiftOver $db.encodeRegionsMercatorMerged \
        $db.encodeRegionsConsensusUnmerged.bed $db.regionOrtho.err >& $db.log
    sort -k4 -k1.4n -k2n $db.encodeRegionsConsensusUnmerged.bed | \
        makeBed5.pl > $db.encodeRegionsConsensusUnmerged.bed5
    liftOverMerge -mergeGap=20000 -verbose=2 \
        $db.encodeRegionsConsensusUnmerged.bed5 $db.encodeRegionsConsensus1.bed5 >>& $db.log
    awk '{printf "%s\t%d\t%d\t%s_%s\n", $1, $2, $3, $4, $5}' \
        $db.encodeRegionsConsensus1.bed5 > $db.encodeRegionsConsensus1.bed
    hgLoadBed $db encodeRegionsConsensus -tab $db.encodeRegionsConsensus1.bed >>& $db.log
    # drop tiny (<2kb) consensus regions, saving them aside, then renumber and reload
    hgsql $db -e "select * from encodeRegionsConsensus where chromEnd-chromStart<2000" > $db.smallRegions.bed
    hgsql $db -e "delete from encodeRegionsConsensus where chromEnd-chromStart<2000"
    hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" > $db.encodeRegionsConsensus2.bed
    sort -k4 -k1.4n -k2n $db.encodeRegionsConsensus2.bed | \
        makeBed5.pl > $db.encodeRegionsConsensus.bed5
    awk '{printf "%s\t%d\t%d\t%s_%s\n", $1, $2, $3, $4, $5}' \
        $db.encodeRegionsConsensus.bed5 > $db.encodeRegionsConsensus.bed
    hgLoadBed $db encodeRegionsConsensus -tab $db.encodeRegionsConsensus.bed >>& $db.log
end
date
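# (optional summary, not part of the original run: final consensus region
# counts per assembly, for comparison with the per-species counts noted in
# the AGP sections below)
foreach db (`ls -1 *.encodeRegionsConsensus.bed | sed 's/\..*//' | xargs echo`)
    echo $db `hgsql -N $db -e "select count(*) from encodeRegionsConsensus"`
end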
########################################################################
#############  Make html frames pages for analysis  ###################
########################################################################
set dir = /cluster/data/encode/${ortho}/html
mkdir -p $dir
cd $dir
rm -f positionFile
hgsql -e "select chrom, chromStart+1, chromEnd, name from encodeRegions order by name" hg17 | \
    tail +2 | awk '{printf "%s:%d-%d\t%s\n",$1,$2,$3,$4}' > positionFile
cp -f ${HOME}/kent/src/hg/encode/regionOrtho/mkOrthologAllFrame.pl /cluster/bin/scripts/mkOrthologAllFrame.pl
chmod ug+x /cluster/bin/scripts/mkOrthologAllFrame.pl
ln -s ../../ortho3/html/descriptionFile .
ln -s ../../ortho3/html/headerFile .

# 20-30 seconds each - lots of nib lookups
foreach db (`ls -1 /cluster/data/encode/${ortho}/mercatorInput/*bed | sed 's/.bed//g' | cut -f7 -d"/" | grep -v '\.' | xargs echo`)
    echo `date` $db
    rm -f $db.html
    mkOrthologAllFrame.pl descriptionFile positionFile headerFile hg17 $db \
        encodeRegionsConsensus encodeRegionsLiftOver encodeRegionsMercator > $db.html
end
#Wed May 4 14:19:44 PDT 2005 bosTau1
#Wed May 4 14:30:57 PDT 2005 canFam1
#Wed May 4 14:31:17 PDT 2005 danRer2
#Wed May 4 14:31:33 PDT 2005 fr1
#Wed May 4 14:31:39 PDT 2005 galGal2
#Wed May 4 14:31:50 PDT 2005 mm6
#Wed May 4 14:32:14 PDT 2005 monDom1
#Wed May 4 14:32:40 PDT 2005 panTro1
#Wed May 4 14:33:03 PDT 2005 rheMac1
#Wed May 4 14:38:44 PDT 2005 rn3
#Wed May 4 14:39:07 PDT 2005 tetNig1
#Wed May 4 14:39:18 PDT 2005 xenTro1
#Wed May 4 14:39:35 PDT 2005

set outDir = /usr/local/apache/htdocs/encode/${ortho}
mkdir -p $outDir
cp -f *.html $outDir
chmod o+r $outDir/*.html
chmod o+rx $outDir

## prune excessively large mercator regions
set dir = /cluster/data/encode/${ortho}/mercatorPrune
mkdir -p $dir
cd $dir
rm -f removeBigRegions.sql
grep -P "\d" ../html/*encodeRegionsMercator.bed | \
    awk '{printf "%s\t%d\t%d\t%s\t%d\n",$1,$2,$3,$4,$3-$2}' | \
    removeBigRegions.pl > removeBigRegions.sql
hgsql hg17 < removeBigRegions.sql

## recompute consensus regions and remake the html files by re-running the
## code from the two sections above
#Wed May 4 21:15:38 PDT 2005 bosTau1
#Wed May 4 21:25:57 PDT 2005 canFam1
#Wed May 4 21:26:15 PDT 2005 danRer2
#Wed May 4 21:26:29 PDT 2005 fr1
#Wed May 4 21:26:34 PDT 2005 galGal2
#Wed May 4 21:26:45 PDT 2005 mm6
#Wed May 4 21:27:06 PDT 2005 monDom1
#Wed May 4 21:27:29 PDT 2005 panTro1
#Wed May 4 21:27:51 PDT 2005 rheMac1
#Wed May 4 21:33:04 PDT 2005 rn3
#Wed May 4 21:33:25 PDT 2005 tetNig1
#Wed May 4 21:33:35 PDT 2005 xenTro1

########################################################################
###############  Write AGP files from consensus regions  ##############
########################################################################
# TODO: contigAcc files should go in $outDir (neater this way)

# ortho was set above; repeated here as a reminder
#set ortho=ortho4
set orthoDir = /cluster/data/encode/${ortho}
set agpDir = $orthoDir/agp
set consensusDir = $orthoDir/consensus
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
set regionAgp = regionAgp
mkdir -p $agpDir $downloadDir $agpDir/tests
chmod o+rx $downloadDir
rm -rf $agpDir/* $downloadDir/*
cd $agpDir
#rm -f *.contig.tab */*.packing.list >& /dev/null

### :::::::::::::: agp/makeBed5.pl ::::::::::::::
#!/usr/bin/perl -W
# assumes that input is already numbered (e.g. ENm001_3)
while (<>) {
    chomp();
    if (/chromStart/) {
        next;
    }
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d).(\d+)/) {
        printf("$1\t$2\t$3\t$4\t$5\n");
    } else {
        die "Bad input line: $_";
    }
}
###
chmod ug+x makeBed5.pl
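# (spot check with made-up input, not part of the original run: unlike the
# consensus version, this makeBed5.pl just splits an already-numbered name
# like ENm001_3 into separate name and part columns)
printf 'chr2\t100\t200\tENm001_3\n' | ./makeBed5.pl
# expect: chr2  100  200  ENm001  3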
########################################################################
# create script to run AGP-maker on an assembly
rm -rf genomeAgps.csh >& /dev/null
cat > genomeAgps.csh << 'EOF'
#!/bin/csh
# create AGPs for a genome assembly
# NOTE: should have freeze dir (${ortho}) as an arg
# WARNING: this script uses $consensusDir/*.bed5, but instructions
# below create a separate version, from the encodeRegionsConsensus table
# Suggest next time using just the $consensusDir everywhere

if ($#argv != 4) then
    echo "usage: $0 <db> <org> <outDir> <ortho>"
    exit 1
endif
echo
echo "command line:" $0 $1 $2 $3 $4
echo "working dir:" `pwd`
set db = $1
set org = $2
set outDir = $3
set ortho = $4
set consensusDir = /cluster/data/encode/${ortho}/consensus
set buildDir = /cluster/data/$db
set bedFile = $consensusDir/$db.encodeRegionsConsensus.bed5
set regionAgp = regionAgp
mkdir -p $outDir
if (-f $org.contig.tab) then
    set contigArg = "-contigFile=$org.contig.tab"
else
    set contigArg = ""
endif
set runRegionAgp = "$regionAgp $contigArg -namePrefix=${org}_ $bedFile stdin -dir $outDir"
#if (-f $buildDir/chrom.lst) then
#    cat $buildDir/chrom.lst | xargs -iX cat $buildDir/X/chr{X,X_random}.agp | $runRegionAgp
#else
    cat $buildDir/?{,?}/*.agp | $runRegionAgp
#endif
'EOF'
# << this line makes emacs coloring happy

########################################################################
# create test scripts
cd /cluster/data/encode/${ortho}/agp/tests
# getSeqFromAcc.pl fetches contig sequences from Genbank by accession into
# the $db.contig.fa directory
# NOTE: the Genbank fetch URL is not recorded here; set $URL before use
cat > getSeqFromAcc.pl << 'EOF'
#!/usr/bin/perl -W
$db = shift;
$ortho = shift;     # unused, but must be shifted so <> reads stdin
$dir = "$db.contig.fa";
mkdir($dir);
$URL = "";          # Genbank fasta fetch URL
while (<>) {
    chomp();
    if (-e "$dir/$_") { next; }
    system("wget -O $dir/$_ \"$URL$_\"");
}
'EOF'
# chmod ug+x getSeqFromAcc.pl

cd /cluster/data/encode/${ortho}/agp/tests
cat > testFa.csh << 'EOF'
#!/bin/csh
set db = $1
set org = $2
set ortho = $3
ln -s /cluster/data/encode/${ortho}/consensus/$db.encodeRegionsConsensus.bed \
    /cluster/data/encode/${ortho}/consensus/$db.bed
foreach f (../$db/*.agp)
    set seq = $f:t:r
    set region = `echo $seq | sed "s/${org}_//"`
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/${ortho}/consensus/$db.bed`
    set chr = $coords:r:r
    nibFrag /cluster/data/$db/nib/${chr}.nib $coords:r:e $coords:e + $db.nibTest/$db.$region.fa
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa
end
'EOF'
# << for emacs
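# (usage sketch, not part of the original run: genomeAgps.csh is invoked once
# per assembly below, e.g.
#   csh genomeAgps.csh mm6 mouse mm6 ortho4
# and regionAgp writes one AGP per region part into the output dir; testFa.csh
# then rebuilds each region's sequence from those AGPs plus the downloaded
# contigs and faCmp's it against sequence pulled straight from the assembly nibs)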
########################################################################
# MOUSE AGPs
cd /cluster/data/encode/${ortho}/agp
#set db = mm5
set db = mm6
set org = mouse

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus order by name" | \
    makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 57

# Create AGPs
csh genomeAgps.csh $db $org $db $ortho

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir $db.test $db.nibTest
# note: tab required in grep (retype when pasting) ***** READ THIS ************
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
# download contigs -- not needed if already run
# NOTE: preferable to extract these from build dir
./getSeqFromAcc.pl $db $ortho < $db.accessions >& $db.log
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ${ortho} >&! testFa.$org.log &
head testFa.$org.log
# verify that all regions are the same
grep -v same testFa.$org.log
cd ..

# Create packing list
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList
$encodeRegionPackingList $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Mus musculus" 10090 C57BL/6J MAR-2005 $db "NCBI Build 34" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 57

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
chmod o+r $downloadDir/$db.*
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 57
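# (optional cross-check, not part of the original run: the Region count in
# the packing list should equal the number of region parts in the bed5)
grep -c Region $db/$db.packing.list
wc -l < $db/$db.encodeRegionsConsensus.bed5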
########################################################################
# RAT AGPs
cd /cluster/data/encode/${ortho}/agp
set db = rn3
set org = rat
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

# create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus order by name" | \
    makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5

# Get contig to accession map
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
csh genomeAgps.csh $db $org $db $ortho

# Test
cd tests
mkdir -p $db.test $db.nibTest
# note: tab required in grep (retype when pasting) *********************************
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
# download contig sequences to $db.contig.fa directory
# note: this doesn't handle errors well -- better to extract from
# build dir contig fasta files
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log
# verify all contigs were transferred
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
#./accHeader.pl $db.contig.fa $db >>& $db.log
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ${ortho} >&! testFa.$org.log
grep -v same testFa.$org.log
cd ..

# Create packing list
$encodeRegionPackingList $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Rattus norvegicus" 10116 BN/SsNHsdMCW JUN-2003 $db "Baylor HGSC v3.1" > $db/$db.packing.list

# Copy to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
chmod o+r $downloadDir/*
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l
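# (sketch, not part of the original run, reusing the same fetch script: if
# $db.redo.acc came out non-empty above, re-fetch just the failed downloads
# before rebuilding the test fasta)
cd tests
if (! -z $db.redo.acc) then
    ./getSeqFromAcc.pl $db ${ortho} < $db.redo.acc >>& $db.log
endif
cd ..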
########################################################################
# CHICKEN AGPs
cd /cluster/data/encode/${ortho}/agp
set db = galGal2
set org = chicken

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | \
    makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 92

# Get contig to accession mapping (documented in makeGalGal2.doc)
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
# errors here: missing /cluster/data/galGal2/chr*_random.agp and chrM.agp files,
# but there are no ENCODE orthologous sequences on these chicken chromosomes
csh genomeAgps.csh $db $org $db $ortho

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ${ortho} >&! testFa.$org.log &
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Gallus gallus" 9031 N/A FEB-2004 $db "CGSC Feb. 2004" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 92

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 92
########################################################################
# DOG AGPs
set db = canFam1
set org = dog
cd /cluster/data/encode/${ortho}/agp
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

# create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | \
    makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
csh genomeAgps.csh $db $org $db $ortho
# ignore error messages for random chrom's and chrM (not in dog assembly)

# Tests
cd tests
mkdir -p $db.test $db.nibTest
# type the literal tab in the grep pattern
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 \
    $db $org "Canis familiaris" 9615 N/A JUL-2004 $db \
    "Broad Institute v. 1.0" > $db/$db.packing.list

# Copy to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l
# 52
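# (optional roll-up, not part of the original run: scan all per-assembly test
# logs at once; any faCmp line without "same" indicates a sequence mismatch)
grep -hv same tests/testFa.*.log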
########################################################################
# CHIMP AGPs
set db = panTro1
set org = chimp
cd /cluster/data/encode/${ortho}/agp
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

# create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | \
    makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
# NOTE: next time, put chimpChromContigs.agp into $outDir
cat /cluster/data/$db/?{,?}/*.agp | \
    chimpChromContigAgp stdin /cluster/data/$db/assembly.agp chimpChromContigs.agp
regionAgp -contigFile=$org.contig.tab -namePrefix=${org}_ \
    $db/$db.encodeRegionsConsensus.bed5 chimpChromContigs.agp -dir $db

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 \
    $db $org "Pan troglodytes" 9598 N/A NOV-2003 $db \
    "NCBI Build 1 v1" > $db/$db.packing.list

# Test
cd tests
mkdir -p $db.test $db.nibTest
# type the literal tab in the grep pattern
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Copy to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l
# 81
########################################################################
# OPOSSUM AGPs
set db = monDom1
set org = opossum
cd /cluster/data/encode/${ortho}/agp

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | \
    makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 146

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
gunzip -c /cluster/data/monDom1/broad.mit.edu/assembly.agp.gz > $db.assembly.agp
regionAgp -contigFile=$org.contig.tab -namePrefix=${org}_ \
    $db/$db.encodeRegionsConsensus.bed5 $db.assembly.agp -dir $db
rm $db.assembly.agp

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in the grep pattern
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/

cat > testScaffoldFa.csh << 'EOF'
set db = $1
set org = $2
set freeze = $3
foreach f (../$db/*.agp)
    set seq = $f:t:r
    set region = `echo $seq | sed "s/${org}_//"`
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/$freeze/html/$db.encodeRegionsConsensus.bed`
    set chr = $coords:r:r
    twoBitToFa /cluster/data/$db/monDom1.2bit \
        -seq=$chr -start=$coords:r:e -end=$coords:e $db.nibTest/$db.$region.fa
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa
end
'EOF'
# << make emacs happy

csh testScaffoldFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Monodelphis domestica" 13616 N/A \
    OCT-2004 $db "Broad Institute" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 146

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 146
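# (note: monDom1 is a scaffold assembly with no per-chromosome nibs, which is
# why testScaffoldFa.csh above pulls reference sequence from the .2bit with
# twoBitToFa instead of nibFrag. Optional pre-check, not part of the original
# run: confirm every scaffold named in the consensus bed exists in the 2bit)
twoBitInfo /cluster/data/monDom1/monDom1.2bit stdout | cut -f1 | sort > monDom1.seqs
cut -f1 ../consensus/monDom1.encodeRegionsConsensus.bed | sort -u | comm -23 - monDom1.seqs
# no output -> all scaffolds present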
########################################################################
# TETRA AGPs
set db = tetNig1
set org = tetra
cd /cluster/data/encode/${ortho}/agp

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | \
    makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 185

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
csh genomeAgps.csh $db $org $db $ortho

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in the grep pattern
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Tetraodon nigroviridis" 99883 N/A \
    FEB-2004 $db "Genoscope V7" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 185

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l

########################################################################
# ZEBRAFISH AGPs
set db = danRer2
set org = zebrafish
cd /cluster/data/encode/${ortho}/agp

# create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | \
    makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 247

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGPs
csh genomeAgps.csh $db $org $db $ortho

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in the grep pattern
cat ../$db/${org}_EN*.agp | grep -v " N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ${ortho} < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ${ortho} >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Danio rerio" 7955 N/A \
    JUN-2004 $db "Sanger Zv4" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 247

# Copy to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 247

########################################################################
# cleanup
chmod -R o+r $downloadDir/* >& /dev/null
exit

########################################################################
# print summary stats
cat > stats.csh << 'EOF'
hgsql hg16 -N -e "select name from encodeRegions" | sort > allRegions
foreach f ($1/*.list)
    set db = $f:t:r:r
    set agps = `grep AgpFile $f | wc -l`
    set len = `awk '/SeqLength/ {len += $2} END {print len}' $f`
    set missing = `grep AgpFile $f | sed 's/.*\(EN.*\)_.*/\1/' | sort | uniq | diff - allRegions`
    printf "%-8s %5s %10s %s\n" $db $agps $len $missing
end
rm allRegions
'EOF'
csh stats.csh $downloadDir