#!/bin/csh exit; # Preparations for January 2006 freeze # Mercator predictions have been skipped # canFam2 is the only assembly being updated # previously predicted fr1 sequences were filtered at 1000 bp and renumbered # mm7 was processed so that the regions could be sent to NCBI (Deanna Church) for incorporation of new sequences set db = canFam2 set org = dog set ortho = ortho5 mkdir -p /cluster/data/encode/${ortho}/liftOver cd /cluster/data/encode/${ortho}/liftOver rm -f hg17.bed >& /dev/null hgsql hg17 -e "select chrom, chromStart, chromEnd, name from encodeRegions" | grep -v chromStart > hg17.bed liftOver -minMatch=0.01 -multiple -chainTable=hg17.chain${db:u} hg17.bed /cluster/data/hg17/bed/liftOver/hg17To${db:u}.over.chain.gz $db.unmerged.bed $db.unmapped.bed -verbose=4 >& $db.log liftOverMerge -mergeGap=20000 -verbose=2 $db.unmerged.bed $db.unfiltered.bed >>& $db.log ## these next two steps are to remove small chunks and to renumber the remaining chunks awk ' $3-$2 > 30000 {print $0}' < $db.unfiltered.bed | grep -v chrM > $db.filtered.bed liftOverMerge -mergeGap=20000 -verbose=2 $db.filtered.bed $db.bed5 >>& $db.log awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' ${db}.bed5 > ${db}.bed hgLoadBed -noBin ${db} encodeRegionsLiftOver ${db}.bed >>& ${db}.log hgLoadBed -noBin ${db} encodeRegionsConsensus ${db}.bed >>& ${db}.log mkdir -p /cluster/data/encode/${ortho}/consensus cd /cluster/data/encode/${ortho}/consensus ln -s ../liftOver/${db}.bed canFam2.encodeRegionsConsensus.bed ln -s ../liftOver/${db}.bed5 canFam2.encodeRegionsConsensus.bed5 ######################################################################## # DOG AGPs mkdir -p /cluster/data/encode/${ortho}/agp cd /cluster/data/encode/${ortho}/agp ln -s /cluster/data/encode/ortho/scripts/* . set downloadDir = /usr/local/apache/htdocs/encode/${ortho}/agp set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList # create consensus list mkdir ${db} hgsql ${db} -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > ${db}/${db}.encodeRegionsConsensus.bed5 hgsql ${db} -s -e "select * from contigAcc" > ${org}.contig.tab # Create AGPs csh genomeAgps.csh ${db} ${org} ${db} ${ortho} # ignore error messages for random chrom's and chrM (not in dog assembly) # Tests mkdir tests cd tests ln -s ../../../ortho/scripts/getSeqFromAcc.pl . mkdir -p ${db}.test ${db}.nibTest cat ../${db}/${org}_EN*.agp | grep -v "\tN" | cut -f6 > ${db}.accessions head ${db}.accessions ./getSeqFromAcc.pl ${db} ${ortho} < ${db}.accessions >& ${db}.log & # verify download worked ls -l ${db}.contig.fa | grep ' 0 ' | awk '{print $9}' > ${db}.redo.acc wc -l ${db}.redo.acc # download if any left awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' ${db}.contig.fa/* > ${db}.contigfile.fa cat ../${db}/*.agp > ${db}.contig.agp agpAllToFaFile ${db}.contig.agp ${db}.contigfile.fa ${db}.fa faSplit byname ${db}.fa ${db}.test/ ln -s ../../../ortho/scripts/testFa.csh . csh testFa.csh ${db} ${org} ${ortho} >&! testFa.${org}.log head testFa.${org}.log grep -v same testFa.${org}.log # no output -> passed test cd .. # Create packing list ${encodeRegionPackingList} ${db}/${db}.encodeRegionsConsensus.bed5 ${db} ${org} "Canis Familiaris" 9615 N/A MAY-2005 ${db} "Broad Institute v. 2.0" > ${db}/${db}.packing.list # Copy to downloads area cp -f ${db}/${db}.packing.list ${downloadDir} tar cvfz ${downloadDir}/${db}.agp.tar.gz ${db}/*.agp hgsql -e "select count(*) from ${db}.encodeRegionsConsensus" tar tvfz ${downloadDir}/${db}.agp.tar.gz | wc -l grep Region ${downloadDir}/${db}.packing.list | wc -l # 69 chmod ugo+rx $downloadDir $downloadDir/.. chmod ugo+r $downloadDir/*.tar.gz $downloadDir/*.packing.list ### :::::::::::::: makeBed5.pl :::::::::::::: #!/usr/bin/perl -W #assumes that input is already numbered while (<>) { chomp(); if (/chromStart/) { next; } if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d).(\d+)/) { printf("$1\t$2\t$3\t$4\t$5\n"); } else { die "Bad input line: $_"; } } ### chmod ug+x makeBed5.pl ######################################################################## # create script to run AGP-maker on an assembly rm -rf genomeAgps.csh >& /dev/null ### :::::::::::::: genomeAgps.csh :::::::::::::: cat > genomeAgps.csh << 'EOF' #!/bin/csh # create AGPs for a genome assembly # NOTE: should have freeze dir (${ortho}) as an arg # WARNING: this script uses ${consensusDir}/*.bed5, but instructions # below create a separate version, from the encodeRegionsConsensus table # Suggest next time using just the ${consensusDir} everywhere if ($#argv != 4) then echo "usage: $0 db org outDir orthoVersion" exit 1 endif echo echo "command line:" $0 $1 $2 $3 $4 echo "working dir:" `pwd` set db = $1 set org = $2 set outDir = $3 set ortho = $4 set consensusDir = /cluster/data/encode/${ortho}/consensus set buildDir = /cluster/data/${db} set bedFile = ${consensusDir}/${db}.encodeRegionsConsensus.bed5 set regionAgp = regionAgp mkdir -p ${outDir} if (-f ${org}.contig.tab) then set contigArg = "-contigFile=${org}.contig.tab" else set contigArg = "" endif set runRegionAgp = "${regionAgp} ${contigArg} -namePrefix=${org}_ ${bedFile} stdin -dir ${outDir}" #if (-f ${buildDir}/chrom.lst) then # cat ${buildDir}/chrom.lst | xargs -iX cat ${buildDir}/X/chr{X,X_random}.agp | ${runRegionAgp} #else cat ${buildDir}/?{,?}/*.agp | ${runRegionAgp} #endif 'EOF' # << this line makes emacs coloring happy ######################################################################## # create test scripts ### :::::::::::::::: getSeqFromAcc.pl :::::::::::::::: cd /cluster/data/encode/${ortho}/agp/tests cat > getSeqFromAcc.pl <) { chomp(); if (-e "${dir}/$_") { next; } system("wget -O ${dir}/$_ \"${URL}$_\""); } EOF # chmod ug+x getSeqFromAcc.pl ### :::::::::::::: testFa.csh :::::::::::::: cd /cluster/data/encode/${ortho}/agp/tests cat > testFa.csh << 'EOF' #!/bin/csh set db = $1 set org = $2 set ortho = $3 ln -s /cluster/data/encode/${ortho}/consensus/${db}.encodeRegionsConsensus.bed /cluster/data/encode/${ortho}/consensus/${db}.bed foreach f (../${db}/*.agp) set seq = ${f}:t:r set region = `echo ${seq} | sed "s/${org}_//"` set coords = `awk -v REGION=${region} '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/${ortho}/consensus/${db}.bed` set chr = ${coords}:r:r nibFrag /cluster/data/${db}/nib/${chr}.nib ${coords}:r:e ${coords}:e + ${db}.nibTest/${db}.${region}.fa faCmp ${db}.nibTest/${db}.${region}.fa ${db}.test/${org}_${region}.fa end 'EOF' # << for emacs ############################ set db = fr1 set org = fugu set ortho = ortho5 set dir = /cluster/data/encode/${ortho}/${db} mkdir -p $dir cd $dir cp -p /cluster/data/encode/ortho/ortho4/mercatorInput/${db}.bed5 ./${db}.unmerged.bed touch ${db}.log liftOverMerge -mergeGap=2000 -verbose=2 $db.unmerged.bed $db.unfiltered.bed >>& $db.log ## these next two steps are to remove small chunks and to renumber the remaining chunks awk ' $3-$2 > 1000 {print $0}' < $db.unfiltered.bed | grep -v chrM > $db.filtered.bed liftOverMerge -mergeGap=20000 -verbose=2 $db.filtered.bed $db.bed5 >>& $db.log awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' ${db}.bed5 > ${db}.bed hgLoadBed -noBin ${db} encodeRegionsConsensus ${db}.bed >>& ${db}.log mkdir -p /cluster/data/encode/${ortho}/consensus cd /cluster/data/encode/${ortho}/consensus ln -s ../liftOver/${db}.bed ${db}.encodeRegionsConsensus.bed ln -s ../liftOver/${db}.bed5 ${db}.encodeRegionsConsensus.bed5 ####################### set db = mm7 set org = mouse set ortho = ortho5 mkdir -p /cluster/data/encode/${ortho}/liftOver cd /cluster/data/encode/${ortho}/liftOver #hgsql hg17 -e "select chrom, chromStart, chromEnd, name from encodeRegions" | grep -v chromStart >! hg17.bed time liftOver -minMatch=0.01 -multiple -chainTable=hg17.chain${db:u} hg17.bed /cluster/data/hg17/bed/liftOver/hg17To${db:u}.over.chain.gz $db.unmerged.bed $db.unmapped.bed -verbose=4 >& $db.log # 51.760u 10.900s 4:54.00 21.3% 0+0k 0+0io 20583pf+0w liftOverMerge -mergeGap=20000 -verbose=2 $db.unmerged.bed $db.unfiltered.bed >>& $db.log ## these next two steps are to remove small chunks and to renumber the remaining chunks awk ' $3-$2 > 30000 {print $0}' < $db.unfiltered.bed | grep -v chrM > $db.filtered.bed liftOverMerge -verbose=2 $db.filtered.bed $db.bed5 >>& $db.log awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' ${db}.bed5 > ${db}.bed hgLoadBed -noBin ${db} encodeRegionsLiftOver ${db}.bed >>& ${db}.log hgLoadBed -noBin ${db} encodeRegionsConsensus ${db}.bed >>& ${db}.log rm -f bed.tab mkdir -p /cluster/data/encode/${ortho}/consensus cd /cluster/data/encode/${ortho}/consensus ln -s ../liftOver/${db}.bed ${db}.encodeRegionsConsensus.bed ln -s ../liftOver/${db}.bed5 ${db}.encodeRegionsConsensus.bed5