# for emacs: -*- mode: sh; -*- # Drosophila pseudoobscura -- # # Baylor HGSC's Drosophila Genome Project Freeze 1 assembly (Aug. 2003) # http://www.hgsc.bcm.tmc.edu/projects/drosophila/ # # Note: this is the same assembly as dp1! But now the browser is scaffold- # based, not chrUn-based which everybody hated. # DOWNLOAD SEQUENCE (DONE 8/2/04 angie) ssh kksilo mkdir /cluster/store8/dp2 cd /cluster/data ln -s /cluster/store8/dp2 dp2 cd /cluster/data/dp2 mkdir jkStuff bed mkdir downloads cd downloads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/conditions_for_use wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/fasta/freeze1_readme # Freeze 1 is 759 scaffolds, 271 of which are assigned to chromosomes # in the .txt file: wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/fasta/D_pseudo_freeze1_scaffolds_8_28_03.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/fasta/D_pseudo_freeze1_scaffold_to_chromosome_assignments_8_28_03.txt # Unpack scaffolds into separate files in scaffolds/ dir: cd /cluster/data/dp2 mkdir scaffolds zcat downloads/D_pseudo_freeze1_scaffolds_8_28_03.fa.gz \ | faSplit byname stdin scaffolds/ # FIX UP SCAFFOLDS WITH FORMATTING PROBLEMS (DONE 8/2/04 angie) # ***ASSEMBLY-SPECIFIC*** SHOULD NOT BE NECESSARY IN THE FUTURE! # Some scaffolds ended up getting split by Baylor at the last minute, # and ".fa" was left appended to their fasta record names which caused # faSplit to put them in ".fa.fa" files: ls -1 scaffolds/*.fa.fa #scaffolds/Contig3286_Contig7811A.fa.fa #scaffolds/Contig3286_Contig7811B.fa.fa #scaffolds/Contig4971_Contig7717A.fa.fa #scaffolds/Contig4971_Contig7717B.fa.fa #scaffolds/Contig6811_Contig7852A.fa.fa #scaffolds/Contig6811_Contig7852B.fa.fa #scaffolds/Contig7891_Contig7492A.fa.fa #scaffolds/Contig7891_Contig7492B.fa.fa foreach f (scaffolds/*.fa.fa) perl -wpe 's/^(>Contig\d+_Contig\w+).fa/$1/' $f \ > $f:r head -2 $f:r faCmp $f $f:r end rm scaffolds/*.fa.fa # Since dp1 is the same assembly, double-check against it to make sure # we did the right thing in both places. foreach f (scaffolds/*.fa) faCmp $f /cluster/data/dp1/$f | grep -v ' are the same$' end # PARTITION SCAFFOLDS FOR PROCESSING (DONE 8/2/04 angie) # Chop up scaffolds into ~500k chunks (mainly to limit size of # RepeatMasker input sequence). Cat together split .lft's into # jkStuff/liftAll.lft which will be used to lift to scaffold # coords after processing. ssh kksilo cd /cluster/data/dp2 mkdir scaffoldsSplit foreach f (scaffolds/*.fa) faSplit size $f 500000 scaffoldsSplit/$f:t:r_ \ -lift=scaffoldsSplit/$f:t:r.lft end cat scaffoldsSplit/C*.lft > jkStuff/liftAll.lft # Create a big fasta file containing all split scaffolds, ordered # by decreasing size. Then use faSplit again -- we end up with # big scaffold chunks in their own files, but small scaffolds # grouped together. mkdir chunks cat `ls -1S scaffoldsSplit/*.fa` \ | faSplit about stdin 300000 chunks/chunk_ # CREATING DATABASE (DONE 8/2/04 angie) # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql #/dev/sdc1 1.8T 514G 1.2T 31% /var/lib/mysql hgsql '' -e 'create database dp2' # RUN REPEAT MASKER (DONE 8/2/04 angie) # January ("March") '04 version of RepeatMasker and libs. # make the run directory, output directory, and job list cat << '_EOF_' > jkStuff/RMDrosophila #!/bin/csh -fe cd $1 /bin/mkdir -p /tmp/dp2/$2 /bin/cp ../chunks/$2 /tmp/dp2/$2/ pushd /tmp/dp2/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2 popd /bin/cp /tmp/dp2/$2/$2.out ./ /bin/rm -fr /tmp/dp2/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/dp2/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/dp2 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMDrosophila mkdir RMRun RMOut cp /dev/null RMRun/RMJobs foreach f ( chunks/*.fa ) set chunk = $f:t echo ../jkStuff/RMDrosophila \ /cluster/data/dp2/RMOut $chunk \ '{'check in line+ /cluster/data/dp2/chunks/$chunk'}' \ '{'check out line+ /cluster/data/dp2/RMOut/$chunk.out'}' \ >> RMRun/RMJobs end # do the run ssh kk cd /cluster/data/dp2/RMRun para create RMJobs para try, check, push, check,... #Completed: 304 of 304 jobs #Average job time: 5720s 95.33m 1.59h 0.07d #Longest job: 7412s 123.53m 2.06h 0.09d #Submission to last job: 7423s 123.72m 2.06h 0.09d # Lift up the split-scaffold .out's to scaffold .out's ssh kksilo cd /cluster/data/dp2 foreach f (RMOut/*.fa.out) liftUp $f:r:r.scaf.out jkStuff/liftAll.lft warn $f > /dev/null end # Make a consolidated scaffold .out file too: head -3 RMOut/chunk_00.fa.out > RMOut/scaffolds.fa.out foreach f (RMOut/*.scaf.out) tail +4 $f >> RMOut/scaffolds.fa.out end # Load the .out files into the database with: ssh hgwdev hgLoadOut dp2 /cluster/data/dp2/RMOut/scaffolds.fa.out # hgLoadOut made a "scaffolds_rmsk" table even with -table=rmsk, # but we want a non-split with no prefix table: hgsql dp2 -e 'rename table scaffolds_rmsk to rmsk' # Fix up the indices too: hgsql dp2 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(9), bin); \ create index genoStart on rmsk (genoName(9), genoStart); \ create index genoEnd on rmsk (genoName(9), genoEnd);' # SIMPLE REPEATS (TRF) (DONE 8/3/04 angie) ssh kksilo mkdir -p /cluster/data/dp2/bed/simpleRepeat cd /cluster/data/dp2/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach f (/cluster/data/dp2/chunks/*.fa) set fout = $f:t:r.bed echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end csh -fe jobs.csh \ |& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \ > jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l # Lift chunk results to one file for all scaffolds: liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn trf/*.bed > /dev/null # Load this into the database as so ssh hgwdev hgLoadBed dp2 simpleRepeat \ /cluster/data/dp2/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 8/3/04 angie) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/dp2/bed/simpleRepeat mkdir trfMask foreach f (trf/*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end liftUp scaffoldsTrfMask.bed ../../jkStuff/liftAll.lft warn trfMask/*.bed # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 8/3/04 angie) ssh kksilo cd /cluster/data/dp2 echo repeat- and trf-masking chunks foreach chunk (chunks/chunk*.fa) set trfMask=bed/simpleRepeat/trfMask/$chunk:t:r.bed maskOutFa -soft $chunk RMOut/$chunk:t.out $chunk maskOutFa -softAdd $chunk $trfMask $chunk end echo repeat- and trf-masking scaffolds # maskOutFa dies if the lift file describes more seqs than are in # the fasta, so make a separate mask file for each scaffold. set rmMask=RMOut/scaffolds.fa.out set trfMask=bed/simpleRepeat/scaffoldsTrfMask.bed # use head to grab the RepeatMasker output header, then grep contents # for scaffold. foreach f (scaffolds/*.fa) head -3 $rmMask > tmpMask.out grep $f:t:r $rmMask >> tmpMask.out maskOutFa -soft $f tmpMask.out $f grep $f:t:r $trfMask > tmpMask.bed maskOutFa -softAdd $f tmpMask.bed $f end rm tmpMask.{bed,out} # STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 8/2/04 angie) # Translate to nib ssh kksilo cd /cluster/data/dp2 mkdir nib foreach f (scaffolds/*.fa) faToNib -softMask $f nib/$f:t:r.nib end # Make a .2bit file for blat: faToTwoBit scaffolds/*.fa dp2.2bit # Make symbolic links from /gbdb/dp2/nib to the real nibs and 2bit. ssh hgwdev mkdir -p /gbdb/dp2/nib ln -s /cluster/data/dp2/nib/*.nib /gbdb/dp2/nib ln -s /cluster/data/dp2/dp2.2bit /gbdb/dp2/nib # Load /gbdb/dp2/nib paths into database and save size info. # Make index on chromInfo.name non-unique (in its first N chars) # to accommodate 22-char scaffold names like # Contig7891_Contig7492A, Contig7891_Contig7492B # Also, index first 9 chars instead of first 16 -- the 6th to 9th # chars are almost unique (except for those A and B scaffolds). ssh hgwdev cd /cluster/data/dp2 hgNibSeq -preMadeNib -index='INDEX(chrom(9))' dp2 /gbdb/dp2/nib \ /gbdb/dp2/nib/*.nib #135845121 total bases hgsql dp2 -N -e 'select chrom,size from chromInfo' > chrom.sizes wc -l chrom.sizes # 759 chrom.sizes # CREATING GRP TABLE FOR TRACK GROUPING (DONE 8/3/04 angie) # Copy all the data from the table "grp" # in the existing database "rn1" to the new database ssh hgwdev hgsql dp2 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp' # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 8/3/04 angie) # Warning: genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("dp2", "Aug. 2003", "/gbdb/dp2/nib", "D. pseudoobscura", \ "Contig1006_Contig943:3363975-3378814", 1, 56, \ "D. pseudoobscura", \ "Drosophila pseudoobscura", "/gbdb/dp2/html/description.html", \ 0, 0, "Baylor HGSC Freeze 1");' \ | hgsql -h genome-testdb hgcentraltest echo 'update defaultDb set name = "dp2" where genome = "D. pseudoobscura"' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit trackDb/makefile to add dp2 to the DBS variable. mkdir drosophila/dp2 # Create a simple drosophila/dp2/description.html file. cvs add drosophila/dp2 cvs add drosophila/dp2/description.html make update # go public on genome-test cvs commit makefile cvs commit drosophila/dp2 mkdir /gbdb/dp2/html # in a clean, updated tree's kent/src/hg/makeDb/trackDb: make alpha # PRODUCING GENSCAN PREDICTIONS (DONE 8/3/04 angie) # Run on small cluster -- genscan needs big mem. ssh kksilo mkdir /cluster/data/dp2/bed/genscan cd /cluster/data/dp2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Make hard-masked scaffolds mkdir /cluster/data/dp2/scaffoldsMasked foreach f (/cluster/data/dp2/scaffolds/*.fa) maskOutFa $f hard /cluster/data/dp2/scaffoldsMasked/$f:t.masked end # Generate a list file, scaffolds.list, of all the hard-masked scaffolds # that *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null scaffolds.list foreach f ( `ls -1S /cluster/data/dp2/scaffoldsMasked/*.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> scaffolds.list end cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 scaffolds.list single gsub jobList ssh kki cd /cluster/data/dp2/bed/genscan para create jobList para try, check, push, check, ... #Completed: 759 of 759 jobs #Average job time: 7s 0.12m 0.00h 0.00d #Longest job: 127s 2.12m 0.04h 0.00d #Submission to last job: 666s 11.10m 0.18h 0.01d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # Concatenate scaffold-level results: ssh kksilo cd /cluster/data/dp2/bed/genscan cat gtf/*.gtf > genscan.gtf cat subopt/*.bed > genscanSubopt.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/dp2/bed/genscan # Reloaded without -genePredExt 1/6/05: ldHgGene -gtf dp2 genscan genscan.gtf hgPepPred dp2 generic genscanPep genscan.pep hgLoadBed dp2 genscanSubopt genscanSubopt.bed # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch dp2 genscanPep 0501071300.00 # PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 8/3/04 angie) ssh kkr1u00 mkdir /iscratch/i/dp2 cp -pR /cluster/data/dp2/nib /iscratch/i/dp2/ iSync # MAKE DOWNLOADABLE FILES (DONE 8/16/04 angie) ssh kksilo mkdir /cluster/data/dp2/zips cd /cluster/data/dp2 zip -j zips/scaffoldOut.zip RMOut/scaffolds.fa.out zip -j zips/scaffoldFa.zip scaffolds/*.fa zip -j zips/scaffoldFaMasked.zip scaffoldsMasked/*.fa.masked zip -j zips/scaffoldTrf.zip bed/simpleRepeat/scaffoldsTrfMask.bed foreach f (zips/*.zip) echo $f unzip -t $f | tail -1 end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dp2 cd /usr/local/apache/htdocs/goldenPath/dp2 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cd bigZips cp -p /cluster/data/dp2/zips/*.zip . md5sum *.zip > md5sum.txt # SWAP DM1-DP2 BLASTZ (DONE 8/3/04 angie) ssh kksilo mkdir -p /cluster/data/dp2/bed/blastz.dm1.swap.2004-08-03/axtScaffold ln -s blastz.dm1.swap.2004-08-03 /cluster/data/dp2/bed/blastz.dm1 cd /cluster/data/dp2/bed/blastz.dm1 set aliDir = /cluster/data/dm1/bed/blastz.dp2 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtScaffold cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtScaffold/$f:t end du -sh $aliDir/axtChrom unsorted axtScaffold #292M /cluster/data/dm1/bed/blastz.dp2/axtChrom #294M unsorted #294M axtScaffold rm -r unsorted # CHAIN MELANOGASTER BLASTZ (DONE 8/4/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dp2/bed/blastz.dm1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/dp2/bed/blastz.dm1/axtScaffold/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -verbose=0 $1 \ /iscratch/i/dp2/nib \ /iscratch/i/dm1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 671 of 671 jobs #Average job time: 4s 0.07m 0.00h 0.00d #Longest job: 10s 0.17m 0.00h 0.00d #Submission to last job: 669s 11.15m 0.19h 0.01d # now on the fileserver, sort chains ssh kksilo cd /cluster/data/dp2/bed/blastz.dm1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # Load chains into database ssh hgwdev cd /cluster/data/dp2/bed/blastz.dm1/axtChain/chain cat *.chain | hgLoadChain dp2 chainDm1 stdin # NET MELANOGASTER BLASTZ (DONE 8/4/04 angie) ssh kksilo cd /cluster/data/dp2/bed/blastz.dm1/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dp2/bed/blastz.dm1/axtChain netClass -noAr noClass.net dp2 dm1 melanogaster.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dp2/bed/blastz.dm1/axtChain rm noClass.net netFilter -syn melanogaster.net > melanogasterSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dp2/bed/blastz.dm1/axtChain netFilter -minGap=10 melanogaster.net | hgLoadNet dp2 netDm1 stdin netFilter -minGap=10 melanogasterSyn.net \ | hgLoadNet dp2 netSyntenyDm1 stdin # MAKE AXTNET (DONE 8/17/04 angie) ssh kksilo cd /cluster/data/dp2/bed/blastz.dm1/axtChain netSplit melanogaster.net net cd .. mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/dp2/nib \ /cluster/data/dm1/nib stdout \ | axtSort stdin axtNet/$chr.axt end # MAKE VSDM1 DOWNLOADABLES (DONE 8/17/04 angie) ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dp2/vsDm1 cd /usr/local/apache/htdocs/goldenPath/dp2/vsDm1 gzip -c \ /cluster/data/dp2/bed/blastz.dm1/axtChain/all.chain \ > melanogaster.chain.gz gzip -c \ /cluster/data/dp2/bed/blastz.dm1/axtChain/melanogaster.net \ > melanogaster.net.gz mkdir axtNet foreach f (/cluster/data/dp2/bed/blastz.dm1/axtNet/*.axt) gzip -c $f > axtNet/$f:t.gz end # Make a README.txt which explains the files & formats. md5sum *.gz */*.gz > md5sum.txt # LIFTOVER BDGP GENES (TODO 8/?/03 angie) ssh hgwdev mkdir /cluster/data/dp2/bed/bdgpAnnotations cd /cluster/data/dp2/bed/bdgpAnnotations echo 'select * from bdgpGene' | hgsql -N dm1 > dm1.bdgpGene.tab liftOver -minMatch=0.5 -minBlocks=0.5 -fudgeThick -genePred \ dm1.bdgpGene.tab \ /cluster/data/dm1/bed/blastz.dp2/axtChain/all.chain \ dp2.bdgpLiftGene.tab dp2.unmapped.tab wc -l dm1.bdgpGene.tab dp2.*.tab grep ^# dp2.unmapped.tab | awk '{print $1 " " $2 " " $3;}' \ | sort | uniq -c | sort -gr ldHgGene -predTab dp2 bdgpLiftGene dp2.bdgpLiftGene.tab hgsql dp2 -e 'create table bdgpLiftGenePep select * from dm1.bdgpGenePep' # MAKE 11.OOC FILE FOR BLAT (DONE 8/4/04 angie) # Use -repMatch=75 (based on size -- for human we use 1024, and # mosquito size is ~5% of human => 50 -- but I worry it doesn't scale # linearly so be conservative and require more matches to be "overused") ssh kkr1u00 mkdir /cluster/data/dp2/bed/ooc cd /cluster/data/dp2/bed/ooc ls -1 /cluster/data/dp2/nib/*.nib > nib.lst mkdir /cluster/bluearc/dp2 blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/dp2/11.ooc -repMatch=75 #Wrote 6348 overused 11-mers to /cluster/bluearc/dp2/11.ooc cp -p /cluster/bluearc/dp2/*.ooc /iscratch/i/dp2/ iSync # AUTO UPDATE GENBANK MRNA RUN (DONE 8/4/04 angie) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in # edits, check them in if necessary: diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf # Edit etc/genbank.conf and add these lines (note scaffold-browser settings): # dp2 (D. pseudoobscura) dp2.genome = /iscratch/i/dp2/nib/*.nib dp2.lift = no dp2.refseq.mrna.native.load = no dp2.refseq.mrna.xeno.load = yes dp2.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005 dp2.genbank.mrna.xeno.load = yes dp2.genbank.est.xeno.load = no dp2.downloadDir = dp2 dp2.perChromTables = no cvs ci etc/genbank.conf # Since D. pseudoobscura is a new species for us, edit src/lib/gbGenome.c. # Pick some other browser species, & monkey-see monkey-do. cvs diff src/lib/gbGenome.c make cvs ci src/lib/gbGenome.c # Edit src/align/gbBlat to add /iscratch/i/dp2/11.ooc cvs diff src/align/gbBlat make cvs ci src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, (xeno) RefSeq only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial dp2 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dp2 featureBits dp2 xenoRefGene #14616774 bases of 135845121 (10.760%) in intersection # Clean up: rm -rf work/initial.dp2 # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial dp2 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dp2 featureBits dp2 mrna #SELECT ,,,,,,,,,, FROM mrna WHERE < 3647374 AND > 0 # oops, try this instead: featureBits dp2 all_mrna #17931 bases of 135845121 (0.013%) in intersection # Wow. that is so paltry, perhaps we shouldn't even bother having a # mrna track?? featureBits dp2 xenoMrna #15763946 bases of 135845121 (11.604%) in intersection # Clean up: rm -rf work/initial.dp2 # -initial for ESTs nice bin/gbAlignStep -srcDb=genbank -type=est -initial dp2 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 dp2 & # Doh! wow, surprised that there are really no ESTs: #eieio 2004.08.04-19:18:53 dp2.initalign: nothing to align # Clean up: rm -rf work/initial.dp2 # MAKE GCPERCENT (DONE 8/4/04 angie) ssh hgwdev mkdir /cluster/data/dp2/bed/gcPercent cd /cluster/data/dp2/bed/gcPercent # create and load gcPercent table hgsql dp2 < ~/src/hg/lib/gcPercent.sql hgGcPercent dp2 ../../nib # EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/5/04 angie) ssh kksilo mkdir /cluster/data/dp2/bed/fakeAgp cd /cluster/data/dp2/bed/fakeAgp cat ../../scaffolds/*.fa \ | faGapSizes stdin \ -niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000 # More blocks of N's than we would expect are exactly 10bp long. That is # probably the min gap size for the assembly: cat ../../scaffolds/*.fa \ | hgFakeAgp -minContigGap=10 stdin fake.agp ssh hgwdev hgLoadGap -unsplit dp2 /cluster/data/dp2/bed/fakeAgp/fake.agp # TWINSCAN (FROM BAYLOR) (DONE 8/4/04 angie) ssh kksilo mkdir /cluster/data/dp2/bed/twinscan cd /cluster/data/dp2/bed/twinscan wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/gene_predictions/freeze01_twinscan.pep.fa wget -r ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/gene_predictions/twinscan cat ftp.hgsc.bcm.tmc.edu/pub/data/Dpseudoobscura/gene_predictions/twinscan/*/*.gff \ | sed -e 's/\.fon\.ace_linear//g' > twinscan.gtf # Doh, freeze01_twinscan.pep.fa has names that don't match with the names # in the GTF.... and the pep names have coords that don't quite jive with # the coords in the GTF. Looks like a version difference -- discard. rm freeze01_twinscan.pep.fa ssh hgwdev cd /cluster/data/dp2/bed/twinscan ldHgGene -gtf -genePredExt dp2 twinscan twinscan.gtf featureBits -enrichment dp2 xenoRefGene twinscan #xenoRefGene 10.760%, twinscan 15.319%, both 8.474%, cover 78.76%, enrich 5.14x # Compare with genscan: featureBits -enrichment dp2 xenoRefGene genscan #xenoRefGene 10.760%, genscan 18.141%, both 9.393%, cover 87.30%, enrich 4.81x #TODO : ask somebody at Baylor about AGP/gap info for the scaffolds, and the #twinscan coord differences. # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 8/26/04 angie) ssh hgwdev echo 'insert into blatServers values("dp2", "blat3", "17778", 1, 0); \ insert into blatServers values("dp2", "blat3", "17779", 0, 1);' \ | hgsql -h genome-testdb hgcentraltest # MAKE Drosophila Proteins track (DONE 2004-08-24 braney) ssh kksilo mkdir -p /cluster/data/dp2/blastDb cd /cluster/data/dp2/blastDb for i in ../scaffolds/*.fa; do ln -s $i .; done for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/dp2/blastDb cp /cluster/data/dp2/blastDb/* /iscratch/i/dp2/blastDb (~kent/bin/iSync) 2>&1 > sync.out mkdir -p /cluster/data/dp2/bed/tblastn.dm1FB cd /cluster/data/dp2/bed/tblastn.dm1FB ls -1S /iscratch/i/dp2/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst exit # back to kksilo cd /cluster/data/dp2/bed/tblastn.dm1FB mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\) # 18735/(150000/759) = 94.799100 split -l 95 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT f=/tmp/`basename $3` for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp exit 1 '_EOF_' gensub2 bug.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/dp2/bed/tblastn.dm1FB para create blastSpec para try, push # Completed: 150282 of 150282 jobs # CPU time in finished jobs: 2977330s 49622.16m 827.04h 34.46d 0.094 y # IO & Wait Time: 999926s 16665.44m 277.76h 11.57d 0.032 y # Average job time: 26s 0.44m 0.01h 0.00d # Longest job: 815s 13.58m 0.23h 0.01d # Submission to last job: 8894s 148.23m 2.47h 0.10d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.list gensub2 chain.lst single chainGsub chainSpec para create chainSpec # should run this on the mini-cluster or with my shove script # so you can limit the number of jobs starting to 3 or 4 para try, push... exit # back to kksilo cd /cluster/data/dp2/bed/tblastn.dm1FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../blastDm1FB.psl ssh hgwdev cd /cluster/data/dp2/bed/tblastn.dm1FB hgLoadPsl dp2 blastDm1FB.psl echo "drop index tName on blastDm1FB ;" | hgsql dp2 echo "drop index tName_2 on blastDm1FB ;" | hgsql dp2 echo "drop index tName_3 on blastDm1FB ;" | hgsql dp2 echo "create index tName on blastDm1FB (tName(9), bin) ;" | hgsql dp2 echo "create index tName_2 on blastDm1FB (tName(9), tStart);" | hgsql dp2 echo "create index tName_3 on blastDm1FB (tName(9), tEnd);" | hgsql dp2 # End tblastn # GENEMAPPER PREDICTIONS FROM UCB (DONE 1/24/06 angie) ssh hgwdev mkdir /cluster/data/dp2/bed/geneMapper cd /cluster/data/dp2/bed/geneMapper wget http://bio.math.berkeley.edu/genemapper/GFF/rel0.2/DroPse_1.gff # Don't use -genePredExt... there are no start/stop_codon items, so # all get marked "incmpl", and name2 always gets the same value as name. # Get rid of custom track header lines: egrep -v '^(track|browser) ' DroPse_1.gff \ | ldHgGene -gtf dp2 geneMapper stdin