# for emacs: -*- mode: sh; -*- # Drosophila pseudoobscura -- # # Baylor HGSC's Drosophila Genome Project Freeze 1 assembly (Aug. 2003) # http://www.hgsc.bcm.tmc.edu/projects/drosophila/ # # DOWNLOAD SEQUENCE (DONE 10/8/03 angie) ssh kksilo mkdir /cluster/store6/dp1 cd /cluster/data ln -s /cluster/store6/dp1 dp1 cd /cluster/data/dp1 mkdir downloads cd downloads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/conditions_for_use wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/freeze1_readme # Freeze 1 is 759 scaffolds, 271 of which are assigned to chromosomes # in the .txt file: wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/D_pseudo_freeze1_scaffolds_8_28_03.fa wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/D_pseudo_freeze1_scaffold_to_chromosome_assignments_8_28_03.txt # Unpack scaffolds into separate files in scaffolds/ dir: cd /cluster/data/dp1 faSplit byname downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa scaffolds/ # CREATE UNORDERED CHROM FROM SCAFFOLDS (DONE 10/10/03 angie) # Note: scaffolds are separated by 1000 Bp gaps ssh kksilo cd /cluster/data/dp1 # DOH! They omitted ">" at the beginning of 8 headers where they # were noting scaffolds that they split due to spanning multiple # chromosomes. # Edit downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa to restore # the ">" at the beginning of the mangled headers. scaffoldFaToAgp downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa mkdir Un mv downloads/D_pseudo_freeze1_scaffolds_8_28_03.agp Un/chrUn.agp mv downloads/D_pseudo_freeze1_scaffolds_8_28_03.gap Un/chrUn.gap mv downloads/D_pseudo_freeze1_scaffolds_8_28_03.lft Un/chrUn.lft mkdir jkStuff cp Un/chrUn.lft jkStuff/liftAll.lft # Create chromosome FA file from AGP and file of masked scaffolds agpToFa -simpleMultiMixed Un/chrUn.agp chrUn Un/chrUn.fa \ downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa # CREATING DATABASE (DONE 10/10/03 angie) # Create the database. ssh hgwdev echo 'create database dp1' | hgsql '' # Make a semi-permanent read-only alias: alias dp1 "mysql -u hguser -phguserstuff -A dp1" # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql # PARTITION SCAFFOLDS FOR PROCESSING (DONE 11/14/03 angie) # For all our runs, we need results in scaffold coordinates (to share # with collaborators) as well as chrUn (to display in the browser). # It was a mistake to chop chrUn into chunks for processing the # first time around -- that leaves us with 2 sets of useless coords # as far as others are concerned. # # Chop up scaffolds into ~500k chunks (mainly to limit size of # RepeatMasker input sequence). Cat together split .lft's into # jkStuff/scaffoldsSplit.lft which will be used to lift to scaffold # coords after processing. ssh kksilo cd /cluster/data/dp1 mkdir scaffoldsSplit foreach f (scaffolds/*.fa) faSplit size $f 500000 scaffoldsSplit/$f:t:r_ \ -lift=scaffoldsSplit/$f:t:r.lft end cat scaffoldsSplit/C*.lft > jkStuff/scaffoldsSplit.lft # Create a big fasta file containing all split scaffolds, ordered # by decreasing size. Then use faSplit again -- we end up with # big scaffold chunks in their own files, but small scaffolds # grouped together. mkdir chunks cat `ls -1S scaffoldsSplit/*.fa` \ | faSplit about stdin 300000 chunks/chunk_ # RUN REPEAT MASKER (DONE 10/13/03 angie) # Note: drosophila library ("drosophila.lib") is dated May 27 '03. # Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make # RepeatMasker runs manageable on the cluster ==> results need lifting. # make the run directory, output directory, and job list mkdir RMRun cp /dev/null RMRun/RMJobs foreach f ( chunks/*.fa ) set chunk = $f:t echo /cluster/bin/scripts/RMDrosophila \ /cluster/data/dp1/chunks $chunk /cluster/data/dp1/chunks \ '{'check out line+ /cluster/data/dp1/chunks/$chunk.out'}' \ >> RMRun/RMJobs end # do the run ssh kk cd /cluster/data/dp1/RMRun para create RMJobs para try para check para push para check,... #Completed: 304 of 304 jobs #Average job time: 5692s 94.87m 1.58h 0.07d #Longest job: 7392s 123.20m 2.05h 0.09d #Submission to last job: 7425s 123.75m 2.06h 0.09d # Lift up the split-scaffold .out's to scaffold .out's ssh kksilo cd /cluster/data/dp1 foreach f (chunks/*.fa.out) liftUp $f:r:r.scaf.out jkStuff/scaffoldsSplit.lft warn $f > /dev/null end # Make a consolidated scaffold .out file too: liftUp scaffolds/scaffolds.fa.out jkStuff/scaffoldsSplit.lft warn \ chunks/*.fa.out > /dev/null # Lift up scaffold .out to chrUn .out. liftUp Un/chrUn.fa.out jkStuff/liftAll.lft warn scaffolds/scaffolds.fa.out \ > /dev/null # soft-mask chunk .fa's with .out's (for running TRF next) foreach chunk (chunks/chunk*.fa) maskOutFa $chunk $chunk.out $chunk -soft end # Load the .out files into the database with: ssh hgwdev hgLoadOut dp1 /cluster/data/dp1/?{,?}/*.fa.out # SIMPLE REPEATS (TRF) (DONE 11/14/03 angie) ssh kksilo mkdir -p /cluster/data/dp1/bed/simpleRepeat cd /cluster/data/dp1/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach f (/cluster/data/dp1/chunks/chunk*.fa) set fout = $f:t:r.bed echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end tcsh jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l # Lift to scaffold coords, then chrUn coords for track-loading: liftUp scaffoldsTrf.bed ../../jkStuff/scaffoldsSplit.lft warn \ trf/*.bed > /dev/null liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \ scaffoldsTrf.bed > /dev/null # Load this into the database as so ssh hgwdev hgLoadBed dp1 simpleRepeat \ /cluster/data/dp1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 11/14/03 angie) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/dp1/bed/simpleRepeat mkdir trfMask foreach f (trf/*.bed) echo -n "filtering $f... " awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end awk '{if ($5 <= 12) print;}' scaffoldsTrf.bed > scaffoldsTrfMask.bed # Lift up filtered trf output to chrom coords as well: mkdir trfMaskChrom liftUp trfMaskChrom/chrUn.bed ../../jkStuff/liftAll.lft warn \ scaffoldsTrfMask.bed > /dev/null # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 11/14/03 angie) ssh kksilo cd /cluster/data/dp1 foreach c (?{,?}) echo repeat- and trf-masking chr$c.fa maskOutFa -soft $c/chr$c.fa $c/chr$c.fa.out $c/chr$c.fa maskOutFa -softAdd $c/chr$c.fa bed/simpleRepeat/trfMaskChrom/chr$c.bed \ $c/chr$c.fa end echo repeat- and trf-masking chunks foreach chunk (chunks/chunk*.fa) set trfMask=bed/simpleRepeat/trfMask/$chunk:t:r.bed maskOutFa -soft $chunk $chunk.out $chunk maskOutFa -softAdd $chunk $trfMask $chunk end echo repeat- and trf-masking scaffolds # maskOutFa dies if the lift file describes more seqs than are in # the fasta, so make a separate mask file for each scaffold. set rmMask=scaffolds/scaffolds.fa.out set trfMask=bed/simpleRepeat/scaffoldsTrfMask.bed # use head to grab the RepeatMasker output header, then grep contents. foreach f (scaffolds/*.fa) head -3 $rmMask > tmpMask.out grep $f:t:r $rmMask >> tmpMask.out maskOutFa -soft $f tmpMask.out $f grep $f:t:r $trfMask > tmpMask.bed maskOutFa -softAdd $f tmpMask.bed $f end rm tmpMask.{bed,out} # Put masked scaffolds and chunks on /cluster/bluearc mkdir /cluster/bluearc/drosophila/dp1/trfFa cp -p /cluster/data/dp1/scaffolds/*.fa /cluster/bluearc/drosophila/dp1/trfFa mkdir /cluster/bluearc/drosophila/dp1/chunks cp -p /cluster/data/dp1/chunks/*.fa /cluster/bluearc/drosophila/dp1/chunks # STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 10/13/03 angie) # Translate to nib ssh kksilo cd /cluster/data/dp1 mkdir nib # redone 11/14/03: faToNib -softMask Un/chrUn.fa nib/chrUn.nib # Make symbolic links from /gbdb/dp1/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/dp1/nib ln -s /cluster/data/dp1/nib/chrUn.nib /gbdb/dp1/nib # Load /gbdb/dp1/nib paths into database and save size info. ssh hgwdev hgsql dp1 < /cluster/data/src/hg/lib/chromInfo.sql cd /cluster/data/dp1 hgNibSeq -preMadeNib dp1 /gbdb/dp1/nib chrUn.nib echo "select chrom,size from chromInfo" | hgsql -N dp1 > chrom.sizes # load gap track ssh hgwdev hgLoadGap dp1 /cluster/data/dp1 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 10/10/03 angie) # Copy all the data from the table "grp" # in the existing database "rn1" to the new database ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \ | hgsql dp1 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 12/2/03 angie) # Warning: genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk) values \ ("dp1", "Aug. 2003", "/gbdb/dp1/nib", "D.pseud.", \ "chrUn:827700-845800", 1, 56, "D.pseud.", \ "Drosophila pseudoobscura", "/gbdb/dp1/html/description.html", \ 0);' \ | hgsql -h genome-testdb hgcentraltest echo 'INSERT INTO defaultDb (genome, name) values ("D.pseud.", "dp1");' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add dp1 in all the right places and do make update # go public on genome-test #make alpha cvs commit makefile # Add trackDb directories mkdir drosophila/dp1 cvs add drosophila/dp1 cvs commit drosophila/dp1 PRODUCING GENSCAN PREDICTIONS (TODO 11/14/03 angie) # Run on small cluster -- genscan needs big mem. ssh kkr1u00 mkdir /cluster/data/dp1/bed/genscan cd /cluster/data/dp1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Make hard-masked contigs mkdir /cluster/data/dp1/scaffoldsMasked foreach f (/cluster/data/dp1/scaffolds/*.fa) maskOutFa $f hard /cluster/data/dp1/scaffoldsMasked/$f:t.masked end # Generate a list file, contigs.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f contigs.list touch contigs.list foreach f ( `ls -1S /cluster/data/dp1/scaffoldsMasked/*.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> contigs.list end cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 contigs.list single gsub jobList para create jobList para try para check para push #Completed: 759 of 759 jobs #Average job time: 16s 0.27m 0.00h 0.00d #Longest job: 424s 7.07m 0.12h 0.00d #Submission to last job: 1538s 25.63m 0.43h 0.02d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # chr14_21, chr16_4 # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/dp1/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/dp1/bed/genscan ldHgGene dp1 genscan genscan.gtf hgPepPred dp1 generic genscanPep genscan.pep hgLoadBed dp1 genscanSubopt genscanSubopt.bed # MAKE DOWNLOADABLE FILES (DONE (no mrna.zip) 12/1/03 angie) ssh kksilo cd /cluster/data/dp1 mkdir zips zip -j zips/chromOut.zip ?{,?}/chr?{,?}.fa.out # use head to keep the RepeatMasker output header of the first file, # and tail to skip past it in all subsequent files. head -3 chunks/chunk_00.scaf.out > scaffolds.fa.out foreach f ( chunks/*_??.scaf.out chunks/*_???.scaf.out) tail +4 $f >> scaffolds.fa.out end zip zips/scaffoldOut.zip scaffolds.fa.out rm scaffolds.fa.out zip -j zips/chromFa.zip ?{,?}/chr?{,?}.fa foreach f (?{,?}/chr?{,?}.fa) maskOutFa $f hard $f.masked end zip -j zips/chromFaMasked.zip ?{,?}/chr?{,?}.fa.masked zip -j zips/scaffoldFa.zip scaffolds/*.fa zip -j zips/scaffoldFaMasked.zip scaffoldsMasked/*.fa.masked cd bed/simpleRepeat zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed zip ../../zips/scaffoldTrf.zip scaffoldsTrfMask.bed cd ../.. zip -j zips/liftAll.zip jkStuff/liftAll.lft foreach f (zips/*.zip) echo $f unzip -t $f | tail -1 end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dp1 cd /usr/local/apache/htdocs/goldenPath/dp1 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cp -p /cluster/data/dp1/zips/*.zip bigZips # SCAFFOLDS TRACK (DONE 12/1/03 angie) # make ctgPos (contig name, size, chrom, chromStart, chromEnd) from liftAll ssh hgwdev mkdir /cluster/data/dp1/bed/ctgPos cd /cluster/data/dp1/bed/ctgPos perl -we 'while (<>) { \ ($cStart, $ctg, $size, $chrom, undef) = split; \ if ($ctg =~ /^Contig/) { \ $cEnd = $cStart + $size; \ print "$ctg\t$size\t$chrom\t$cStart\t$cEnd\n"; \ } \ }' \ ../../jkStuff/liftAll.lft > ctgPos.tab hgsql dp1 < ~/kent/src/hg/lib/ctgPos.sql echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql dp1 # ALL-DROSOPHILA REFSEQ (DONE 12/2/03 angie) ssh kksilo mkdir /cluster/data/dp1/bed/drosRefSeq cd /cluster/data/dp1/bed/drosRefSeq grep "^Drosophila" /cluster/data/genbank/data/species.lst > dros.lst grep "Drosophila" \ /cluster/data/genbank/data/processed/refseq.138.0/full/mrna.gbidx \ | awk '{print $1;}' > refSeqAcc.lst perl -wpe 's/^>(\w+)\.\d+/>$1/' \ /cluster/data/genbank/data/processed/refseq.138.0/full/mrna.fa \ > allRefSeq.fa faSomeRecords allRefSeq.fa refSeqAcc.lst drosRefSeq.fa rm allRefSeq.fa faSplit sequence drosRefSeq.fa 4 drs ls -1S drs*.fa > refSeq.lst ls -1S /cluster/data/dp1/chunks/chunk_*.fa > chunks.lst mkdir psl # translated blat because these are xeno: cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 chunks.lst refSeq.lst gsub spec ssh kk cd /cluster/data/dp1/bed/drosRefSeq para create spec para try, check, push, check, ... #Completed: 1216 of 1216 jobs #Average job time: 85s 1.41m 0.02h 0.00d #Longest job: 308s 5.13m 0.09h 0.00d #Submission to last job: 439s 7.32m 0.12h 0.01d # back on kksilo: pslSort dirs raw.psl /cluster/store2/temp psl pslReps raw.psl cooked.psl /dev/null -minAli=0.25 -minCover=0.3 liftUp scaffold.psl ../../jkStuff/scaffoldsSplit.lft warn cooked.psl liftUp chrom.psl ../../jkStuff/liftAll.lft warn scaffold.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > drosRefSeq.psl rm -r chrom raw.psl cooked.psl chrom.psl # Load it up: ssh hgwdev cd /cluster/data/dp1/bed/drosRefSeq # hgc needs the track table name to start with xeno... ln -s drosRefSeq.psl xenoDrosRefSeq.psl hgLoadPsl dp1 xenoDrosRefSeq.psl mkdir /gbdb/dp1/drosMrna ln -s /cluster/data/dp1/bed/drosRefSeq/drosRefSeq.fa /gbdb/dp1/drosMrna/ hgLoadSeq dp1 /gbdb/dp1/drosMrna/drosRefSeq.fa # ALL-DROSOPHILA MRNA (DONE 12/2/03 angie) ssh kksilo mkdir /cluster/data/dp1/bed/drosMrna cd /cluster/data/dp1/bed/drosMrna grep "Drosophila" \ /cluster/data/genbank/data/processed/genbank.138.0/full/mrna.gbidx \ | awk '{print $1;}' > mrnaAcc.lst perl -wpe 's/^>(\w+)\.\d+/>$1/' \ /cluster/data/genbank/data/processed/genbank.138.0/full/mrna.fa \ > allMrna.fa faSomeRecords allMrna.fa mrnaAcc.lst drosMrna.fa rm allMrna.fa faSplit sequence drosMrna.fa 4 drs ls -1S drs*.fa > mrna.lst ls -1S /cluster/data/dp1/chunks/chunk_*.fa > chunks.lst mkdir psl # translated blat because these are xeno: cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 chunks.lst mrna.lst gsub spec ssh kk cd /cluster/data/dp1/bed/drosMrna para create spec para try, check, push, check, ... #Completed: 1216 of 1216 jobs #Average job time: 69s 1.15m 0.02h 0.00d #Longest job: 102s 1.70m 0.03h 0.00d #Submission to last job: 351s 5.85m 0.10h 0.00d # back on kksilo: pslSort dirs raw.psl /cluster/store2/temp psl pslReps raw.psl cooked.psl /dev/null -minAli=0.25 liftUp scaffold.psl ../../jkStuff/scaffoldsSplit.lft warn cooked.psl liftUp chrom.psl ../../jkStuff/liftAll.lft warn scaffold.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > drosMrna.psl rm -r chrom raw.psl cooked.psl chrom.psl # Load it up: ssh hgwdev cd /cluster/data/dp1/bed/drosMrna # hgc needs the track table name to start with xeno... ln -s drosMrna.psl xenoDrosMrna.psl hgLoadPsl dp1 xenoDrosMrna.psl ln -s /cluster/data/dp1/bed/drosMrna/drosMrna.fa /gbdb/dp1/drosMrna/ hgLoadSeq dp1 /gbdb/dp1/drosMrna/drosMrna.fa # ALL-DROSOPHILA EST (DONE 12/2/03 angie) ssh kksilo mkdir /cluster/data/dp1/bed/drosEst cd /cluster/data/dp1/bed/drosEst foreach idx (/cluster/data/genbank/data/processed/genbank.138.0/full/est.*.gbidx) grep "Drosophila" $idx \ | awk '{print $1;}' > $idx:t:r.lst perl -wpe 's/^>(\w+)\.\d+/>$1/' $idx:r.fa \ > $idx:t:r.all.fa faSomeRecords $idx:t:r.all.fa $idx:t:r.lst $idx:t:r.fa rm $idx:t:r.all.fa end cat est.*.lst > drosEst.lst cat est.*.fa > drosEst.fa faSplit sequence drosEst.fa 20 drs rm est.*.lst est.*.fa ls -1S drs*.fa > est.lst ls -1S /cluster/data/dp1/chunks/chunk_*.fa > chunks.lst mkdir psl # translated blat because these are xeno: cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=dnax -t=dnax -mask=lower {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 chunks.lst est.lst gsub spec ssh kk cd /cluster/data/dp1/bed/drosEst para create spec para try, check, push, check, ... #Completed: 6080 of 6080 jobs #Average job time: 63s 1.04m 0.02h 0.00d #Longest job: 109s 1.82m 0.03h 0.00d #Submission to last job: 485s 8.08m 0.13h 0.01d # back on kksilo: pslSort dirs raw.psl /cluster/store2/temp psl pslReps raw.psl cooked.psl /dev/null -minAli=0.10 liftUp scaffold.psl ../../jkStuff/scaffoldsSplit.lft warn cooked.psl liftUp chrom.psl ../../jkStuff/liftAll.lft warn scaffold.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > drosEst.psl rm -r chrom raw.psl cooked.psl chrom.psl # Load it up: ssh hgwdev cd /cluster/data/dp1/bed/drosEst # hgc needs the track table name to start with xeno... ln -s drosEst.psl xenoDrosEst.psl hgLoadPsl dp1 xenoDrosEst.psl ln -s /cluster/data/dp1/bed/drosEst/drosEst.fa /gbdb/dp1/drosMrna/ hgLoadSeq dp1 /gbdb/dp1/drosMrna/drosEst.fa # LIFTOVER BDGP GENES (IN PROGRESS 12/9/03 angie) ssh hgwdev mkdir /cluster/data/dp1/bed/bdgpAnnotations cd /cluster/data/dp1/bed/bdgpAnnotations echo 'select * from bdgpGene' | hgsql -N dm1 > dm1.bdgpGene.tab liftOver -minMatch=0.5 -minBlocks=0.5 -fudgeThick -genePred \ dm1.bdgpGene.tab \ /cluster/data/dm1/bed/blastz.dp1.2003-11-14/axtChainScaffoldQ/all.chain \ scaffold.bdgpLiftGene.tab scaffold.unmapped.tab liftOver -minMatch=0.5 -minBlocks=0.5 -fudgeThick -genePred \ dm1.bdgpGene.tab \ /cluster/data/dm1/bed/blastz.dp1.2003-11-14/axtChainChrom/all.chain \ chrUn.bdgpLiftGene.tab chrUn.unmapped.tab wc -l dm1.bdgpGene.tab scaffold.bdgpLiftGene.tab chrUn.bdgpLiftGene.tab grep ^# chrUn.unmapped.tab | awk '{print $1 " " $2 " " $3;}' \ | sort | uniq -c | sort -gr hgsql dp1 < input.lst echo '#LOOP' > gsub echo 'doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}' >> gsub echo '#ENDLOOP' >> gsub gensub2 input.lst single gsub spec echo '#\!/bin/csh' > doChain echo 'axtChain $1 /cluster/data/dp1/scaffoldsNib /cluster/data/dm1/nib $2 > $3' \ >> doChain chmod a+x doChain mkdir chain out para create spec para try, check, push, check, ... #Completed: 671 of 671 jobs #Average job time: 11s 0.19m 0.00h 0.00d #Longest job: 45s 0.75m 0.01h 0.00d #Submission to last job: 639s 10.65m 0.18h 0.01d ssh kksilo cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain mkdir -p ../axtChainChrom/chain liftUp ../axtChainChrom/all.chain \ /cluster/data/dp1/jkStuff/liftAll.lft warn all.chain cd ../axtChainChrom/chain chainSplit chain all.chain ssh hgwdev cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainChrom/chain foreach i (*.chain) set c = $i:r hgLoadChain dp1 ${c}_chainDm1 $i echo done $c end # Create the nets. You can do this while the database is loading ssh kksilo cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold # First do a crude filter that eliminates many chains so the # real chainer has less work to do. mkdir preNet cd chain foreach i (*.chain) echo preNetting $i chainPreNet $i /cluster/data/dp1/scaffold.sizes \ /cluster/data/dm1/chrom.sizes ../preNet/$i end cd .. # Run the main netter, putting the results in n1. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i chainNet $i -minSpace=1 /cluster/data/dp1/scaffold.sizes \ /cluster/data/dm1/chrom.sizes ../n1/$n /dev/null end cd .. # Classify parts of net as syntenic, nonsyntenic etc. cat n1/*.net | netSyntenic stdin noClass.net # The final step of net creation needs the database. # Best to wait for the database load to finish if it # hasn't already. ssh hgwdev cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold netClass -liftT=/cluster/data/dp1/jkStuff/liftAll.lft -noAr \ noClass.net dp1 dm1 fruitfly.net rm -r n1 noClass.net netFilter -minGap=10 fruitfly.net > fruitflyFilt.net liftUp ../axtChainChrom/fruitflyFiltChrom.net \ /cluster/data/dp1/jkStuff/liftAll.lft warn fruitflyFilt.net hgLoadNet dp1 netDm1 ../axtChainChrom/fruitflyFiltChrom.net # Move back to the file server to create axt files corresponding # to the net. ssh kksilo cd /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold mkdir ../axtNetScaffold netSplit fruitflyFilt.net fruitflyNet cd fruitflyNet foreach i (*.net) set c = $i:r netToAxt -maxGap=300 $i ../chain/$c.chain \ /cluster/data/dp1/scaffoldsNib /cluster/data/dm1/nib \ ../../axtNetScaffold/$c.axt echo done ../axtNetScaffold/$c.axt end cd .. rm -r fruitflyNet mkdir ../axtNetChrom cd ../axtNetChrom liftUp tmp.axt /cluster/data/dp1/jkStuff/liftAll.lft warn \ ../axtNetScaffold/*.axt axtSort tmp.axt chrUn.axt rm tmp.axt # Load up the axtNet (alignment score wiggle) track: ssh hgwdev mkdir /gbdb/dp1/axtNetDm1 foreach f (/cluster/data/dp1/bed/blastz.dm1.swap/axtNetChrom/chr*.axt) ln -s $f /gbdb/dp1/axtNetDm1 end hgLoadAxt dp1 axtNetDm1 # ADD BLASTZ/CHAIN/NET DOWNLOADABLE FILES (DONE 12/3/03 angie) ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dp1/vsDm1 cd /usr/local/apache/htdocs/goldenPath/dp1/vsDm1 gzip -c \ /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold/all.chain \ > chain.gz gzip -c \ /cluster/data/dp1/bed/blastz.dm1.swap/axtChainScaffold/fruitflyFilt.net \ > net.gz mkdir axtNet foreach f (/cluster/data/dp1/bed/blastz.dm1.swap/axtNetScaffold/*.axt) gzip -c $f > axtNet/$f:t.gz end # Make a README.txt which explains the files & formats.